diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 38ffbf481..87733a691 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -36,7 +36,7 @@ jobs: run: | make test - ubuntu-32bit-vendored-pcre2: + ubuntu-32bit-fetched-pcre2: runs-on: ubuntu-latest diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 755292dc6..4fbbbff8c 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -42,7 +42,7 @@ Other improvements For distributors ---------------- -- *Placeholder text* +- The vendored PCRE2 sources have been removed. It is recommended to declare PCRE2 as a dependency when packaging fish. If the CMake variable FISH_USE_SYSTEM_PCRE2 is false, fish will now download and build PCRE2 from the official repo (:issue:`8355`). Note this variable defaults to true if PCRE2 is found installed on the system. -------------- diff --git a/README.rst b/README.rst index 80a2a19b3..7892985d6 100644 --- a/README.rst +++ b/README.rst @@ -148,7 +148,7 @@ Compiling fish requires: - a C++11 compiler (g++ 4.8 or later, or clang 3.3 or later) - CMake (version 3.5 or later) - a curses implementation such as ncurses (headers and libraries) -- PCRE2 (headers and libraries) - a copy is included with fish +- PCRE2 (headers and libraries) - optional, this will be downloaded if missing - gettext (headers and libraries) - optional, for translation support Sphinx is also optionally required to build the documentation from a diff --git a/build_tools/make_pkg.sh b/build_tools/make_pkg.sh index d4fcd9934..4d8e9b54a 100755 --- a/build_tools/make_pkg.sh +++ b/build_tools/make_pkg.sh @@ -27,7 +27,10 @@ SRC_DIR=$PWD OUTPUT_PATH=${FISH_ARTEFACT_PATH:-~/fish_built} mkdir -p "$PKGDIR/build" "$PKGDIR/root" "$PKGDIR/intermediates" "$PKGDIR/dst" -{ cd "$PKGDIR/build" && cmake -DMAC_INJECT_GET_TASK_ALLOW=OFF -DCMAKE_BUILD_TYPE=RelWithDebInfo -DWITH_GETTEXT=OFF -DCMAKE_OSX_ARCHITECTURES='arm64;x86_64' -DMAC_CODESIGN_ID="${MAC_CODESIGN_ID}" "$SRC_DIR" && make VERBOSE=1 -j 12 && env 
DESTDIR="$PKGDIR/root/" make install; } + +# Pass FISH_USE_SYSTEM_PCRE2=OFF because a system PCRE2 on macOS will not be signed by fish, +# and will probably not be built universal, so the package will fail to validate/run on other systems. +{ cd "$PKGDIR/build" && cmake -DMAC_INJECT_GET_TASK_ALLOW=OFF -DCMAKE_BUILD_TYPE=RelWithDebInfo -DWITH_GETTEXT=OFF -DFISH_USE_SYSTEM_PCRE2=OFF -DCMAKE_OSX_ARCHITECTURES='arm64;x86_64' -DMAC_CODESIGN_ID="${MAC_CODESIGN_ID}" "$SRC_DIR" && make VERBOSE=1 -j 12 && env DESTDIR="$PKGDIR/root/" make install; } pkgbuild --scripts "$SRC_DIR/build_tools/osx_package_scripts" --root "$PKGDIR/root/" --identifier 'com.ridiculousfish.fish-shell-pkg' --version "$VERSION" "$PKGDIR/intermediates/fish.pkg" productbuild --package-path "$PKGDIR/intermediates" --distribution "$SRC_DIR/build_tools/osx_distribution.xml" --resources "$SRC_DIR/build_tools/osx_package_resources/" "$OUTPUT_PATH/fish-$VERSION.pkg" @@ -37,4 +40,4 @@ productsign --sign "${MAC_PRODUCTSIGN_ID}" "$OUTPUT_PATH/fish-$VERSION.pkg" "$OU # Make the app { cd "$PKGDIR/build" && make -j 12 signed_fish_macapp && zip -r "$OUTPUT_PATH/fish-$VERSION.app.zip" fish.app; } -rm -r "$PKGDIR" +rm -rf "$PKGDIR" diff --git a/cmake/PCRE2.cmake b/cmake/PCRE2.cmake index 724d22255..b9138fe2e 100644 --- a/cmake/PCRE2.cmake +++ b/cmake/PCRE2.cmake @@ -28,16 +28,37 @@ else() endif() set(FISH_USE_SYSTEM_PCRE2 ${USE_SYS_PCRE2_DEFAULT} CACHE BOOL - "Use PCRE2 from the system, instead of bundled with fish") + "Use PCRE2 from the system, instead of fetching and building it") if(FISH_USE_SYSTEM_PCRE2) set(PCRE2_LIB "${SYS_PCRE2_LIB}") set(PCRE2_INCLUDE_DIR "${SYS_PCRE2_INCLUDE_DIR}") message(STATUS "Using system PCRE2 library ${PCRE2_INCLUDE_DIR}") else() - message(STATUS "Using bundled PCRE2 library") - add_subdirectory(pcre2 EXCLUDE_FROM_ALL) - set(PCRE2_INCLUDE_DIR ${CMAKE_BINARY_DIR}/pcre2) + include(FetchContent OPTIONAL RESULT_VARIABLE HAVE_FetchContent) # OPTIONAL: a missing module sets the result to NOTFOUND instead of aborting here, so the FATAL_ERROR below is reachable + if (NOT HAVE_FetchContent) + 
message(FATAL_ERROR "Please install PCRE2 headers, or CMake >= 3.11 so I can download PCRE") + endif() + set(CMAKE_TLS_VERIFY true) + set(PCRE2_REPO "https://github.com/PhilipHazel/pcre2.git") + + message(STATUS "Fetching and configuring PCRE2 from ${PCRE2_REPO}") + set(FETCHCONTENT_QUIET FALSE) + FetchContent_Declare( + pcre2 + GIT_REPOSITORY ${PCRE2_REPO} + GIT_TAG "72669190cb947f0cac1d038a8bb1820da59ef447" # tag: pcre2-10.36 + GIT_PROGRESS TRUE + ) + # Don't try FetchContent_MakeAvailable, there's no way to add EXCLUDE_FROM_ALL + # so we end up installing all of PCRE2 including its headers, man pages, etc. + FetchContent_GetProperties(pcre2) + if (NOT pcre2_POPULATED) + FetchContent_Populate(pcre2) + add_subdirectory(${pcre2_SOURCE_DIR} ${pcre2_BINARY_DIR} EXCLUDE_FROM_ALL) + endif() + + set(PCRE2_INCLUDE_DIR ${pcre2_BINARY_DIR}) set(PCRE2_LIB pcre2-${PCRE2_WIDTH}) # Disable -Wunused-macros inside PCRE2, as it is noisy. diff --git a/pcre2/132html b/pcre2/132html deleted file mode 100755 index 1bd62ba24..000000000 --- a/pcre2/132html +++ /dev/null @@ -1,314 +0,0 @@ -#! /usr/bin/perl -w - -# Script to turn PCRE2 man pages into HTML - - -# Subroutine to handle font changes and other escapes - -sub do_line { -my($s) = $_[0]; - -$s =~ s/ -$s =~ s/>/>/g; -$s =~ s"\\fI(.*?)\\f[RP]"$1"g; -$s =~ s"\\fB(.*?)\\f[RP]"$1"g; -$s =~ s"\\e"\\"g; -$s =~ s/(?<=Copyright )\(c\)/©/g; -$s; -} - -# Subroutine to ensure not in a paragraph - -sub end_para { -if ($inpara) - { - print TEMP "\n" if ($inpre); - print TEMP "

\n"; - } -$inpara = $inpre = 0; -$wrotetext = 0; -} - -# Subroutine to start a new paragraph - -sub new_para { -&end_para(); -print TEMP "

\n"; -$inpara = 1; -} - - -# Main program - -$innf = 0; -$inpara = 0; -$inpre = 0; -$wrotetext = 0; -$toc = 0; -$ref = 1; - -while ($#ARGV >= 0 && $ARGV[0] =~ /^-/) - { - $toc = 1 if $ARGV[0] eq "-toc"; - shift; - } - -# Initial output to STDOUT - -print < - -$ARGV[0] specification - - -

$ARGV[0] man page

-

-Return to the PCRE2 index page. -

-

-This page is part of the PCRE2 HTML documentation. It was generated -automatically from the original man page. If there is any nonsense in it, -please consult the man page, in case the conversion went wrong. -
-End - -print "

\n" if ($toc); - -# Copy the remainder to the standard output - -close(TEMP); -open(TEMP, "/tmp/$$") || die "Can't open /tmp/$$ for input\n"; - -print while (); - -print < -Return to the PCRE2 index page. -

-End - -close(TEMP); -unlink("/tmp/$$"); - -# End diff --git a/pcre2/AUTHORS b/pcre2/AUTHORS deleted file mode 100644 index f001cb770..000000000 --- a/pcre2/AUTHORS +++ /dev/null @@ -1,36 +0,0 @@ -THE MAIN PCRE2 LIBRARY CODE ---------------------------- - -Written by: Philip Hazel -Email local part: Philip.Hazel -Email domain: gmail.com - -University of Cambridge Computing Service, -Cambridge, England. - -Copyright (c) 1997-2020 University of Cambridge -All rights reserved - - -PCRE2 JUST-IN-TIME COMPILATION SUPPORT --------------------------------------- - -Written by: Zoltan Herczeg -Email local part: hzmester -Emain domain: freemail.hu - -Copyright(c) 2010-2020 Zoltan Herczeg -All rights reserved. - - -STACK-LESS JUST-IN-TIME COMPILER --------------------------------- - -Written by: Zoltan Herczeg -Email local part: hzmester -Emain domain: freemail.hu - -Copyright(c) 2009-2020 Zoltan Herczeg -All rights reserved. - -#### diff --git a/pcre2/CMakeLists.txt b/pcre2/CMakeLists.txt deleted file mode 100644 index 9091c687d..000000000 --- a/pcre2/CMakeLists.txt +++ /dev/null @@ -1,1017 +0,0 @@ -# CMakeLists.txt -# -# This file enables PCRE2 to be built with the CMake configuration and build -# tool. Download CMake in source or binary form from http://www.cmake.org/ -# Converted to support PCRE2 from the original PCRE file, August 2014. -# -# Original listfile by Christian Ehrlicher -# Refined and expanded by Daniel Richard G. -# 2007-09-14 mod by Sheri so 7.4 supported configuration options can be entered -# 2007-09-19 Adjusted by PH to retain previous default settings -# 2007-12-26 (a) On UNIX, use names libpcre instead of just pcre -# (b) Ensure pcretest and pcregrep link with the local library, -# not a previously-installed one. -# (c) Add PCRE_SUPPORT_LIBREADLINE, PCRE_SUPPORT_LIBZ, and -# PCRE_SUPPORT_LIBBZ2. -# 2008-01-20 Brought up to date to include several new features by Christian -# Ehrlicher. 
-# 2008-01-22 Sheri added options for backward compatibility of library names -# when building with minGW: -# if "ON", NON_STANDARD_LIB_PREFIX causes shared libraries to -# be built without "lib" as prefix. (The libraries will be named -# pcre.dll, pcreposix.dll and pcrecpp.dll). -# if "ON", NON_STANDARD_LIB_SUFFIX causes shared libraries to -# be built with suffix of "-0.dll". (The libraries will be named -# libpcre-0.dll, libpcreposix-0.dll and libpcrecpp-0.dll - same names -# built by default with Configure and Make. -# 2008-01-23 PH removed the automatic build of pcredemo. -# 2008-04-22 PH modified READLINE support so it finds NCURSES when needed. -# 2008-07-03 PH updated for revised UCP property support (change of files) -# 2009-03-23 PH applied Steven Van Ingelgem's patch to change the name -# CMAKE_BINARY_DIR to PROJECT_BINARY_DIR so that it works when PCRE -# is included within another project. -# 2009-03-23 PH applied a modified version of Steven Van Ingelgem's patches to -# add options to stop the building of pcregrep and the tests, and -# to disable the final configuration report. -# 2009-04-11 PH applied Christian Ehrlicher's patch to show compiler flags that -# are set by specifying a release type. 
-# 2010-01-02 PH added test for stdint.h -# 2010-03-02 PH added test for inttypes.h -# 2011-08-01 PH added PCREGREP_BUFSIZE -# 2011-08-22 PH added PCRE_SUPPORT_JIT -# 2011-09-06 PH modified WIN32 ADD_TEST line as suggested by Sergey Cherepanov -# 2011-09-06 PH added PCRE_SUPPORT_PCREGREP_JIT -# 2011-10-04 Sheri added support for including coff data in windows shared libraries -# compiled with MINGW if pcre.rc and/or pcreposix.rc are placed in -# the source dir by the user prior to building -# 2011-10-04 Sheri changed various add_test's to use exes' location built instead -# of DEBUG location only (likely only matters in MSVC) -# 2011-10-04 Sheri added scripts to provide needed variables to RunTest and -# RunGrepTest (used for UNIX and Msys) -# 2011-10-04 Sheri added scripts to provide needed variables and to execute -# RunTest.bat in Win32 (for effortless testing with "make test") -# 2011-10-04 Sheri Increased minimum required cmake version -# 2012-01-06 PH removed pcre_info.c and added pcre_string_utils.c -# 2012-01-10 Zoltan Herczeg added libpcre16 support -# 2012-01-13 Stephen Kelly added out of source build support -# 2012-01-17 PH applied Stephen Kelly's patch to parse the version data out -# of the configure.ac file -# 2012-02-26 PH added support for libedit -# 2012-09-06 PH added support for PCRE_EBCDIC_NL25 -# 2012-09-08 ChPe added PCRE32 support -# 2012-10-23 PH added support for VALGRIND and GCOV -# 2012-12-08 PH added patch from Daniel Richard G to quash some MSVC warnings -# 2013-07-01 PH realized that the "support" for GCOV was a total nonsense and -# so it has been removed. -# 2013-10-08 PH got rid of the "source" command, which is a bash-ism (use ".") -# 2013-11-05 PH added support for PARENS_NEST_LIMIT -# 2014-08-29 PH converted the file for PCRE2 (which has no C++). 
-# 2015-04-24 PH added support for PCRE2_DEBUG -# 2015-07-16 PH updated for new pcre2_find_bracket source module -# 2015-08-24 PH correct C_FLAGS setting (patch from Roy Ivy III) -# 2015-10=16 PH added support for never-backslash-C -# 2016-03-01 PH applied Chris Wilson's patch for MSVC static -# 2016-06-24 PH applied Chris Wilson's second patch, putting the first under -# a new option instead of being unconditional. -# 2016-10-05 PH fixed a typo (PCRE should be PCRE2) in above patch -# fix by David Gaussmann -# 2016-10-07 PH added PCREGREP_MAX_BUFSIZE -# 2017-03-11 PH turned HEAP_MATCH_RECURSE into a NO-OP for 10.30 -# 2017-04-08 PH added HEAP_LIMIT -# 2017-06-15 ZH added SUPPORT_JIT_SEALLOC support -# 2018-06-19 PH added checks for stdint.h and inttypes.h (later removed) -# 2018-06-27 PH added Daniel's patch to increase the stack for MSVC -# 2018-11-14 PH removed unnecessary checks for stdint.h and inttypes.h -# 2018-11-16 PH added PCRE2GREP_SUPPORT_CALLOUT_FORK support and tidied -# 2019-02-16 PH hacked to avoid CMP0026 policy issue (see comments below) -# 2020-03-16 PH renamed dftables as pcre2_dftables (as elsewhere) -# 2020-03-24 PH changed CMAKE_MODULE_PATH definition to add, not replace -# 2020-04-08 Carlo added function check for secure_getenv, fixed strerror -# 2020-04-16 enh added check for __attribute__((uninitialized)) -# 2020-04-25 PH applied patches from Uwe Korn to support pkg-config and -# library versioning. -# 2020-04-25 Carlo added function check for mkostemp used in ProtExecAllocator -# 2020-04-28 PH added function check for memfd_create based on Carlo's patch -# 2020-05-25 PH added a check for Intel CET -# 2020-12-03 PH altered the definition of pcre2test as suggested by Daniel - -PROJECT(PCRE2 C) - -# Increased minimum to 2.8.5 to support GNUInstallDirs. -CMAKE_MINIMUM_REQUIRED(VERSION 2.8.5) - -# Set policy CMP0026 to avoid warnings for the use of LOCATION in -# GET_TARGET_PROPERTY. This should no longer be required. 
-# CMAKE_POLICY(SET CMP0026 OLD) - -# For FindReadline.cmake. This was changed to allow setting CMAKE_MODULE_PATH -# on the command line. -# SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) - -LIST(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) - -SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -I\"${PROJECT_SOURCE_DIR}/src\"") - -# external packages -FIND_PACKAGE( BZip2 QUIET ) -FIND_PACKAGE( ZLIB QUIET ) -FIND_PACKAGE( Readline QUIET ) -FIND_PACKAGE( Editline QUIET ) - -# Configuration checks - -INCLUDE(CheckCSourceCompiles) -INCLUDE(CheckFunctionExists) -INCLUDE(CheckSymbolExists) -INCLUDE(CheckIncludeFile) -INCLUDE(CheckTypeSize) -INCLUDE(GNUInstallDirs) # for CMAKE_INSTALL_LIBDIR - -CHECK_INCLUDE_FILE(dirent.h HAVE_DIRENT_H) -CHECK_INCLUDE_FILE(stdint.h HAVE_STDINT_H) -CHECK_INCLUDE_FILE(inttypes.h HAVE_INTTYPES_H) -CHECK_INCLUDE_FILE(sys/stat.h HAVE_SYS_STAT_H) -CHECK_INCLUDE_FILE(sys/types.h HAVE_SYS_TYPES_H) -CHECK_INCLUDE_FILE(unistd.h HAVE_UNISTD_H) -CHECK_INCLUDE_FILE(windows.h HAVE_WINDOWS_H) - -CHECK_SYMBOL_EXISTS(bcopy "strings.h" HAVE_BCOPY) -CHECK_SYMBOL_EXISTS(memfd_create "sys/mman.h" HAVE_MEMFD_CREATE) -CHECK_SYMBOL_EXISTS(memmove "string.h" HAVE_MEMMOVE) -CHECK_SYMBOL_EXISTS(secure_getenv "stdlib.h" HAVE_SECURE_GETENV) -CHECK_SYMBOL_EXISTS(strerror "string.h" HAVE_STRERROR) - -set(ORIG_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS}) -set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror") -CHECK_C_SOURCE_COMPILES( - "int main() { char buf[128] __attribute__((uninitialized)); (void)buf; return 0; }" - HAVE_ATTRIBUTE_UNINITIALIZED -) -set(CMAKE_REQUIRED_FLAGS ${ORIG_CMAKE_REQUIRED_FLAGS}) - -# Check whether Intel CET is enabled, and if so, adjust compiler flags. This -# code was written by PH, trying to imitate the logic from the autotools -# configuration. 
- -CHECK_C_SOURCE_COMPILES( - "#ifndef __CET__ - #error CET is not enabled - #endif - int main() { return 0; }" - INTEL_CET_ENABLED -) - -IF (INTEL_CET_ENABLED) - SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mshstk") -ENDIF(INTEL_CET_ENABLED) - - - -# User-configurable options -# -# Note: CMakeSetup displays these in alphabetical order, regardless of -# the order we use here. - -SET(BUILD_SHARED_LIBS OFF CACHE BOOL - "Build shared libraries instead of static ones.") - -OPTION(PCRE2_BUILD_PCRE2_8 "Build 8 bit PCRE2 library" ON) - -OPTION(PCRE2_BUILD_PCRE2_16 "Build 16 bit PCRE2 library" OFF) - -OPTION(PCRE2_BUILD_PCRE2_32 "Build 32 bit PCRE2 library" OFF) - -OPTION(PCRE2_DEBUG "Include debugging code" OFF) - -OPTION(PCRE2_DISABLE_PERCENT_ZT "Disable the use of %zu and %td (rarely needed)" OFF) - -SET(PCRE2_EBCDIC OFF CACHE BOOL - "Use EBCDIC coding instead of ASCII. (This is rarely used outside of mainframe systems.)") - -SET(PCRE2_EBCDIC_NL25 OFF CACHE BOOL - "Use 0x25 as EBCDIC NL character instead of 0x15; implies EBCDIC.") - -SET(PCRE2_LINK_SIZE "2" CACHE STRING - "Internal link size (2, 3 or 4 allowed). See LINK_SIZE in config.h.in for details.") - -SET(PCRE2_PARENS_NEST_LIMIT "250" CACHE STRING - "Default nested parentheses limit. See PARENS_NEST_LIMIT in config.h.in for details.") - -SET(PCRE2_HEAP_LIMIT "20000000" CACHE STRING - "Default limit on heap memory (kibibytes). See HEAP_LIMIT in config.h.in for details.") - -SET(PCRE2_MATCH_LIMIT "10000000" CACHE STRING - "Default limit on internal looping. See MATCH_LIMIT in config.h.in for details.") - -SET(PCRE2_MATCH_LIMIT_DEPTH "MATCH_LIMIT" CACHE STRING - "Default limit on internal depth of search. See MATCH_LIMIT_DEPTH in config.h.in for details.") - -SET(PCRE2GREP_BUFSIZE "20480" CACHE STRING - "Buffer starting size parameter for pcre2grep. See PCRE2GREP_BUFSIZE in config.h.in for details.") - -SET(PCRE2GREP_MAX_BUFSIZE "1048576" CACHE STRING - "Buffer maximum size parameter for pcre2grep. 
See PCRE2GREP_MAX_BUFSIZE in config.h.in for details.") - -SET(PCRE2_NEWLINE "LF" CACHE STRING - "What to recognize as a newline (one of CR, LF, CRLF, ANY, ANYCRLF, NUL).") - -SET(PCRE2_HEAP_MATCH_RECURSE OFF CACHE BOOL - "Obsolete option: do not use") - -SET(PCRE2_SUPPORT_JIT OFF CACHE BOOL - "Enable support for Just-in-time compiling.") - -IF(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD) - SET(PCRE2_SUPPORT_JIT_SEALLOC OFF CACHE BOOL - "Enable SELinux compatible execmem allocator in JIT (experimental).") -ELSE(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD) - SET(PCRE2_SUPPORT_JIT_SEALLOC IGNORE) -ENDIF(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD) - -SET(PCRE2GREP_SUPPORT_JIT ON CACHE BOOL - "Enable use of Just-in-time compiling in pcre2grep.") - -SET(PCRE2GREP_SUPPORT_CALLOUT ON CACHE BOOL - "Enable callout string support in pcre2grep.") - -SET(PCRE2GREP_SUPPORT_CALLOUT_FORK ON CACHE BOOL - "Enable callout string fork support in pcre2grep.") - -SET(PCRE2_SUPPORT_UNICODE ON CACHE BOOL - "Enable support for Unicode and UTF-8/UTF-16/UTF-32 encoding.") - -SET(PCRE2_SUPPORT_BSR_ANYCRLF OFF CACHE BOOL - "ON=Backslash-R matches only LF CR and CRLF, OFF=Backslash-R matches all Unicode Linebreaks") - -SET(PCRE2_NEVER_BACKSLASH_C OFF CACHE BOOL - "If ON, backslash-C (upper case C) is locked out.") - -SET(PCRE2_SUPPORT_VALGRIND OFF CACHE BOOL - "Enable Valgrind support.") - -OPTION(PCRE2_SHOW_REPORT "Show the final configuration report" ON) -OPTION(PCRE2_BUILD_PCRE2GREP "Build pcre2grep" ON) -OPTION(PCRE2_BUILD_TESTS "Build the tests" ON) - -IF (MINGW) - OPTION(NON_STANDARD_LIB_PREFIX - "ON=Shared libraries built in mingw will be named pcre2.dll, etc., instead of libpcre2.dll, etc." - OFF) - - OPTION(NON_STANDARD_LIB_SUFFIX - "ON=Shared libraries built in mingw will be named libpcre2-0.dll, etc., instead of libpcre2.dll, etc." - OFF) -ENDIF(MINGW) - -IF(MSVC) - OPTION(PCRE2_STATIC_RUNTIME - "ON=Compile against the static runtime (/MT)." 
- OFF) - OPTION(INSTALL_MSVC_PDB - "ON=Install .pdb files built by MSVC, if generated" - OFF) -ENDIF(MSVC) - -# bzip2 lib -IF(BZIP2_FOUND) - OPTION (PCRE2_SUPPORT_LIBBZ2 "Enable support for linking pcre2grep with libbz2." ON) -ENDIF(BZIP2_FOUND) -IF(PCRE2_SUPPORT_LIBBZ2) - INCLUDE_DIRECTORIES(${BZIP2_INCLUDE_DIR}) -ENDIF(PCRE2_SUPPORT_LIBBZ2) - -# zlib -IF(ZLIB_FOUND) - OPTION (PCRE2_SUPPORT_LIBZ "Enable support for linking pcre2grep with libz." ON) -ENDIF(ZLIB_FOUND) -IF(PCRE2_SUPPORT_LIBZ) - INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) -ENDIF(PCRE2_SUPPORT_LIBZ) - -# editline lib -IF(EDITLINE_FOUND) - OPTION (PCRE2_SUPPORT_LIBEDIT "Enable support for linking pcre2test with libedit." OFF) -ENDIF(EDITLINE_FOUND) -IF(PCRE2_SUPPORT_LIBEDIT) - INCLUDE_DIRECTORIES(${EDITLINE_INCLUDE_DIR}) -ENDIF(PCRE2_SUPPORT_LIBEDIT) - -# readline lib -IF(READLINE_FOUND) - OPTION (PCRE2_SUPPORT_LIBREADLINE "Enable support for linking pcre2test with libreadline." ON) -ENDIF(READLINE_FOUND) -IF(PCRE2_SUPPORT_LIBREADLINE) - INCLUDE_DIRECTORIES(${READLINE_INCLUDE_DIR}) -ENDIF(PCRE2_SUPPORT_LIBREADLINE) - -# Prepare build configuration - -IF(NOT BUILD_SHARED_LIBS) - SET(PCRE2_STATIC 1) -ENDIF(NOT BUILD_SHARED_LIBS) - -IF(NOT PCRE2_BUILD_PCRE2_8 AND NOT PCRE2_BUILD_PCRE2_16 AND NOT PCRE2_BUILD_PCRE2_32) - MESSAGE(FATAL_ERROR "At least one of PCRE2_BUILD_PCRE2_8, PCRE2_BUILD_PCRE2_16 or PCRE2_BUILD_PCRE2_32 must be enabled") -ENDIF(NOT PCRE2_BUILD_PCRE2_8 AND NOT PCRE2_BUILD_PCRE2_16 AND NOT PCRE2_BUILD_PCRE2_32) - -IF(PCRE2_BUILD_PCRE2_8) - SET(SUPPORT_PCRE2_8 1) -ENDIF(PCRE2_BUILD_PCRE2_8) - -IF(PCRE2_BUILD_PCRE2_16) - SET(SUPPORT_PCRE2_16 1) -ENDIF(PCRE2_BUILD_PCRE2_16) - -IF(PCRE2_BUILD_PCRE2_32) - SET(SUPPORT_PCRE2_32 1) -ENDIF(PCRE2_BUILD_PCRE2_32) - -IF(PCRE2_BUILD_PCRE2GREP AND NOT PCRE2_BUILD_PCRE2_8) - MESSAGE(STATUS "** PCRE2_BUILD_PCRE2_8 must be enabled for the pcre2grep program") - SET(PCRE2_BUILD_PCRE2GREP OFF) -ENDIF(PCRE2_BUILD_PCRE2GREP AND NOT PCRE2_BUILD_PCRE2_8) - 
-IF(PCRE2_SUPPORT_LIBREADLINE AND PCRE2_SUPPORT_LIBEDIT) - MESSAGE(FATAL_ERROR "Only one of libreadline or libeditline can be specified") -ENDIF(PCRE2_SUPPORT_LIBREADLINE AND PCRE2_SUPPORT_LIBEDIT) - -IF(PCRE2_SUPPORT_BSR_ANYCRLF) - SET(BSR_ANYCRLF 1) -ENDIF(PCRE2_SUPPORT_BSR_ANYCRLF) - -IF(PCRE2_NEVER_BACKSLASH_C) - SET(NEVER_BACKSLASH_C 1) -ENDIF(PCRE2_NEVER_BACKSLASH_C) - -IF(PCRE2_SUPPORT_UNICODE) - SET(SUPPORT_UNICODE 1) -ENDIF(PCRE2_SUPPORT_UNICODE) - -IF(PCRE2_SUPPORT_JIT) - SET(SUPPORT_JIT 1) -ENDIF(PCRE2_SUPPORT_JIT) - -IF(PCRE2_SUPPORT_JIT_SEALLOC) - SET(CMAKE_REQUIRED_DEFINITIONS -D_GNU_SOURCE) - CHECK_SYMBOL_EXISTS(mkostemp stdlib.h REQUIRED) - UNSET(CMAKE_REQUIRED_DEFINITIONS) - IF(${REQUIRED}) - IF(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD) - ADD_DEFINITIONS(-D_GNU_SOURCE) - SET(SLJIT_PROT_EXECUTABLE_ALLOCATOR 1) - ELSE(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD) - MESSAGE(FATAL_ERROR "Your configuration is not supported") - ENDIF(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD) - ELSE(${REQUIRED}) - SET(PCRE2_SUPPORT_JIT_SEALLOC OFF) - ENDIF(${REQUIRED}) -ENDIF(PCRE2_SUPPORT_JIT_SEALLOC) - -IF(PCRE2GREP_SUPPORT_JIT) - SET(SUPPORT_PCRE2GREP_JIT 1) -ENDIF(PCRE2GREP_SUPPORT_JIT) - -IF(PCRE2GREP_SUPPORT_CALLOUT) - SET(SUPPORT_PCRE2GREP_CALLOUT 1) - IF(PCRE2GREP_SUPPORT_CALLOUT_FORK) - SET(SUPPORT_PCRE2GREP_CALLOUT_FORK 1) - ENDIF(PCRE2GREP_SUPPORT_CALLOUT_FORK) -ENDIF(PCRE2GREP_SUPPORT_CALLOUT) - -IF(PCRE2_SUPPORT_VALGRIND) - SET(SUPPORT_VALGRIND 1) -ENDIF(PCRE2_SUPPORT_VALGRIND) - -IF(PCRE2_DISABLE_PERCENT_ZT) - SET(DISABLE_PERCENT_ZT 1) -ENDIF(PCRE2_DISABLE_PERCENT_ZT) - -# This next one used to reference ${READLINE_LIBRARY}) -# but I was advised to add the NCURSES test as well, along with -# some modifications to cmake/FindReadline.cmake which should -# make it possible to override the default if necessary. 
PH - -IF(PCRE2_SUPPORT_LIBREADLINE) - SET(SUPPORT_LIBREADLINE 1) - SET(PCRE2TEST_LIBS ${READLINE_LIBRARY} ${NCURSES_LIBRARY}) -ENDIF(PCRE2_SUPPORT_LIBREADLINE) - -# libedit is a plug-compatible alternative to libreadline - -IF(PCRE2_SUPPORT_LIBEDIT) - SET(SUPPORT_LIBEDIT 1) - SET(PCRE2TEST_LIBS ${EDITLINE_LIBRARY} ${NCURSES_LIBRARY}) -ENDIF(PCRE2_SUPPORT_LIBEDIT) - -IF(PCRE2_SUPPORT_LIBZ) - SET(SUPPORT_LIBZ 1) - SET(PCRE2GREP_LIBS ${PCRE2GREP_LIBS} ${ZLIB_LIBRARIES}) -ENDIF(PCRE2_SUPPORT_LIBZ) - -IF(PCRE2_SUPPORT_LIBBZ2) - SET(SUPPORT_LIBBZ2 1) - SET(PCRE2GREP_LIBS ${PCRE2GREP_LIBS} ${BZIP2_LIBRARIES}) -ENDIF(PCRE2_SUPPORT_LIBBZ2) - -SET(NEWLINE_DEFAULT "") - -IF(PCRE2_NEWLINE STREQUAL "CR") - SET(NEWLINE_DEFAULT "1") -ENDIF(PCRE2_NEWLINE STREQUAL "CR") -IF(PCRE2_NEWLINE STREQUAL "LF") - SET(NEWLINE_DEFAULT "2") -ENDIF(PCRE2_NEWLINE STREQUAL "LF") -IF(PCRE2_NEWLINE STREQUAL "CRLF") - SET(NEWLINE_DEFAULT "3") -ENDIF(PCRE2_NEWLINE STREQUAL "CRLF") -IF(PCRE2_NEWLINE STREQUAL "ANY") - SET(NEWLINE_DEFAULT "4") -ENDIF(PCRE2_NEWLINE STREQUAL "ANY") -IF(PCRE2_NEWLINE STREQUAL "ANYCRLF") - SET(NEWLINE_DEFAULT "5") -ENDIF(PCRE2_NEWLINE STREQUAL "ANYCRLF") -IF(PCRE2_NEWLINE STREQUAL "NUL") - SET(NEWLINE_DEFAULT "6") -ENDIF(PCRE2_NEWLINE STREQUAL "NUL") - -IF(NEWLINE_DEFAULT STREQUAL "") - MESSAGE(FATAL_ERROR "The PCRE2_NEWLINE variable must be set to one of the following values: \"LF\", \"CR\", \"CRLF\", \"ANY\", \"ANYCRLF\".") -ENDIF(NEWLINE_DEFAULT STREQUAL "") - -IF(PCRE2_EBCDIC) - SET(EBCDIC 1) -ENDIF(PCRE2_EBCDIC) - -IF(PCRE2_EBCDIC_NL25) - SET(EBCDIC 1) - SET(EBCDIC_NL25 1) -ENDIF(PCRE2_EBCDIC_NL25) - -# Output files - -CONFIGURE_FILE(config-cmake.h.in - ${PROJECT_BINARY_DIR}/config.h - @ONLY) - -# Parse version numbers and date out of configure.ac - -file(STRINGS ${PROJECT_SOURCE_DIR}/configure.ac - configure_lines - LIMIT_COUNT 50 # Read only the first 50 lines of the file -) - -set(SEARCHED_VARIABLES "pcre2_major" "pcre2_minor" "pcre2_prerelease" "pcre2_date" - 
"libpcre2_posix_version" "libpcre2_8_version" "libpcre2_16_version" "libpcre2_32_version") -foreach(configure_line ${configure_lines}) - foreach(_substitution_variable ${SEARCHED_VARIABLES}) - string(TOUPPER ${_substitution_variable} _substitution_variable_upper) - if (NOT ${_substitution_variable_upper}) - string(REGEX MATCH "m4_define\\(${_substitution_variable}, *\\[(.*)\\]" MATCHED_STRING ${configure_line}) - if (CMAKE_MATCH_1) - set(${_substitution_variable_upper} ${CMAKE_MATCH_1}) - endif() - endif() - endforeach() -endforeach() - -macro(PARSE_LIB_VERSION VARIABLE_PREFIX) - string(REPLACE ":" ";" ${VARIABLE_PREFIX}_VERSION_LIST ${${VARIABLE_PREFIX}_VERSION}) - list(GET ${VARIABLE_PREFIX}_VERSION_LIST 0 ${VARIABLE_PREFIX}_VERSION_CURRENT) - list(GET ${VARIABLE_PREFIX}_VERSION_LIST 1 ${VARIABLE_PREFIX}_VERSION_REVISION) - list(GET ${VARIABLE_PREFIX}_VERSION_LIST 2 ${VARIABLE_PREFIX}_VERSION_AGE) - - math(EXPR ${VARIABLE_PREFIX}_SOVERSION "${${VARIABLE_PREFIX}_VERSION_CURRENT} - ${${VARIABLE_PREFIX}_VERSION_AGE}") - math(EXPR ${VARIABLE_PREFIX}_MACHO_COMPATIBILITY_VERSION "${${VARIABLE_PREFIX}_VERSION_CURRENT} + 1") - math(EXPR ${VARIABLE_PREFIX}_MACHO_CURRENT_VERSION "${${VARIABLE_PREFIX}_VERSION_CURRENT} + 1") - set(${VARIABLE_PREFIX}_MACHO_CURRENT_VERSION "${${VARIABLE_PREFIX}_MACHO_CURRENT_VERSION}.${${VARIABLE_PREFIX}_VERSION_REVISION}}") - set(${VARIABLE_PREFIX}_VERSION "${${VARIABLE_PREFIX}_SOVERSION}.${${VARIABLE_PREFIX}_VERSION_AGE}.${${VARIABLE_PREFIX}_VERSION_REVISION}") -endmacro() - -PARSE_LIB_VERSION(LIBPCRE2_POSIX) -PARSE_LIB_VERSION(LIBPCRE2_8) -PARSE_LIB_VERSION(LIBPCRE2_16) -PARSE_LIB_VERSION(LIBPCRE2_32) - -CONFIGURE_FILE(src/pcre2.h.in - ${PROJECT_BINARY_DIR}/pcre2.h - @ONLY) - -# Make sure to not link debug libs -# against release libs and vice versa -IF(WIN32) - SET(CMAKE_DEBUG_POSTFIX "d") -ENDIF(WIN32) - -# Generate pkg-config files - -SET(PACKAGE_VERSION "${PCRE2_MAJOR}.${PCRE2_MINOR}") -SET(prefix ${CMAKE_INSTALL_PREFIX}) - 
-SET(exec_prefix "\${prefix}") -SET(libdir "\${exec_prefix}/${CMAKE_INSTALL_LIBDIR}") -SET(includedir "\${prefix}/include") -IF(WIN32 AND (CMAKE_BUILD_TYPE MATCHES Debug)) - SET(LIB_POSTFIX ${CMAKE_DEBUG_POSTFIX}) -ENDIF() -CONFIGURE_FILE(libpcre2-posix.pc.in libpcre2-posix.pc @ONLY) -SET(pkg_config_files ${pkg_config_files} "${CMAKE_CURRENT_BINARY_DIR}/libpcre2-posix.pc") - -IF(PCRE2_BUILD_PCRE2_8) - CONFIGURE_FILE(libpcre2-8.pc.in libpcre2-8.pc @ONLY) - SET(pkg_config_files ${pkg_config_files} "${CMAKE_CURRENT_BINARY_DIR}/libpcre2-8.pc") - SET(enable_pcre2_8 "yes") -ELSE() - SET(enable_pcre2_8 "no") -ENDIF() - -IF(PCRE2_BUILD_PCRE2_16) - CONFIGURE_FILE(libpcre2-16.pc.in libpcre2-16.pc @ONLY) - SET(pkg_config_files ${pkg_config_files} "${CMAKE_CURRENT_BINARY_DIR}/libpcre2-16.pc") - SET(enable_pcre2_16 "yes") -ELSE() - SET(enable_pcre2_16 "no") -ENDIF() - -IF(PCRE2_BUILD_PCRE2_32) - CONFIGURE_FILE(libpcre2-32.pc.in libpcre2-32.pc @ONLY) - SET(pkg_config_files ${pkg_config_files} "${CMAKE_CURRENT_BINARY_DIR}/libpcre2-32.pc") - SET(enable_pcre2_32 "yes") -ELSE() - SET(enable_pcre2_32 "no") -ENDIF() - -CONFIGURE_FILE(pcre2-config.in pcre2-config @ONLY) - -# Character table generation - -OPTION(PCRE2_REBUILD_CHARTABLES "Rebuild char tables" OFF) -IF(PCRE2_REBUILD_CHARTABLES) - ADD_EXECUTABLE(pcre2_dftables src/pcre2_dftables.c) - ADD_CUSTOM_COMMAND( - COMMENT "Generating character tables (pcre2_chartables.c) for current locale" - DEPENDS pcre2_dftables - COMMAND pcre2_dftables - ARGS ${PROJECT_BINARY_DIR}/pcre2_chartables.c - OUTPUT ${PROJECT_BINARY_DIR}/pcre2_chartables.c - ) -ELSE(PCRE2_REBUILD_CHARTABLES) - CONFIGURE_FILE(${PROJECT_SOURCE_DIR}/src/pcre2_chartables.c.dist - ${PROJECT_BINARY_DIR}/pcre2_chartables.c - COPYONLY) -ENDIF(PCRE2_REBUILD_CHARTABLES) - -# Source code - -SET(PCRE2_HEADERS ${PROJECT_BINARY_DIR}/pcre2.h) - -SET(PCRE2_SOURCES - src/pcre2_auto_possess.c - ${PROJECT_BINARY_DIR}/pcre2_chartables.c - src/pcre2_compile.c - src/pcre2_config.c - 
src/pcre2_context.c - src/pcre2_convert.c - src/pcre2_dfa_match.c - src/pcre2_error.c - src/pcre2_extuni.c - src/pcre2_find_bracket.c - src/pcre2_jit_compile.c - src/pcre2_maketables.c - src/pcre2_match.c - src/pcre2_match_data.c - src/pcre2_newline.c - src/pcre2_ord2utf.c - src/pcre2_pattern_info.c - src/pcre2_script_run.c - src/pcre2_serialize.c - src/pcre2_string_utils.c - src/pcre2_study.c - src/pcre2_substitute.c - src/pcre2_substring.c - src/pcre2_tables.c - src/pcre2_ucd.c - src/pcre2_valid_utf.c - src/pcre2_xclass.c -) - -SET(PCRE2POSIX_HEADERS src/pcre2posix.h) -SET(PCRE2POSIX_SOURCES src/pcre2posix.c) - -IF(MINGW AND NOT PCRE2_STATIC) -IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc) -ADD_CUSTOM_COMMAND(OUTPUT ${PROJECT_SOURCE_DIR}/pcre2.o -PRE-LINK -COMMAND windres ARGS pcre2.rc pcre2.o -WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} -COMMENT Using pcre2 coff info in mingw build) -SET(PCRE2_SOURCES - ${PCRE2_SOURCES} ${PROJECT_SOURCE_DIR}/pcre2.o -) -ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc) -IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc) -ADD_CUSTOM_COMMAND(OUTPUT ${PROJECT_SOURCE_DIR}/pcre2posix.o -PRE-LINK -COMMAND windres ARGS pcre2posix.rc pcre2posix.o -WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} -COMMENT Using pcre2posix coff info in mingw build) -SET(PCRE2POSIX_SOURCES - ${PCRE2POSIX_SOURCES} ${PROJECT_SOURCE_DIR}/pcre2posix.o -) -ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc) -ENDIF(MINGW AND NOT PCRE2_STATIC) - -IF(MSVC AND NOT PCRE2_STATIC) -IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc) -SET(PCRE2_SOURCES - ${PCRE2_SOURCES} pcre2.rc) -ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc) -IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc) -SET(PCRE2POSIX_SOURCES - ${PCRE2POSIX_SOURCES} pcre2posix.rc) -ENDIF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc) -ENDIF(MSVC AND NOT PCRE2_STATIC) - -# Fix static compilation with MSVC: https://bugs.exim.org/show_bug.cgi?id=1681 -# This code was taken from the CMake wiki, not from WebM. 
- -IF(MSVC AND PCRE2_STATIC_RUNTIME) - MESSAGE(STATUS "** MSVC and PCRE2_STATIC_RUNTIME: modifying compiler flags to use static runtime library") - foreach(flag_var - CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE - CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) - string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") - endforeach() -ENDIF(MSVC AND PCRE2_STATIC_RUNTIME) - -# Build setup - -ADD_DEFINITIONS(-DHAVE_CONFIG_H) - -IF(MSVC) - ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE -D_CRT_SECURE_NO_WARNINGS) -ENDIF(MSVC) - -SET(CMAKE_INCLUDE_CURRENT_DIR 1) - -SET(targets) - -# 8-bit library - -IF(PCRE2_BUILD_PCRE2_8) -ADD_LIBRARY(pcre2-8 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h) -SET_TARGET_PROPERTIES(pcre2-8 PROPERTIES - COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8 - MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_8_MACHO_COMPATIBILITY_VERSION}" - MACHO_CURRENT_VERSION "${LIBPCRE2_8_MACHO_CURRENT_VERSION}" - VERSION ${LIBPCRE2_8_VERSION} - SOVERSION ${LIBPCRE2_8_SOVERSION}) -SET(targets ${targets} pcre2-8) -ADD_LIBRARY(pcre2-posix ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES}) -SET_TARGET_PROPERTIES(pcre2-posix PROPERTIES - COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8 - MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_POSIX_MACHO_COMPATIBILITY_VERSION}" - MACHO_CURRENT_VERSION "${LIBPCRE2_POSIX_MACHO_CURRENT_VERSION}" - VERSION ${LIBPCRE2_POSIX_VERSION} - SOVERSION ${LIBPCRE2_POSIX_SOVERSION}) -SET(targets ${targets} pcre2-posix) -TARGET_LINK_LIBRARIES(pcre2-posix pcre2-8) - -IF(MINGW AND NOT PCRE2_STATIC) - IF(NON_STANDARD_LIB_PREFIX) - SET_TARGET_PROPERTIES(pcre2-8 pcre2-posix PROPERTIES PREFIX "") - ENDIF(NON_STANDARD_LIB_PREFIX) - IF(NON_STANDARD_LIB_SUFFIX) - SET_TARGET_PROPERTIES(pcre2-8 pcre2-posix PROPERTIES SUFFIX "-0.dll") - ENDIF(NON_STANDARD_LIB_SUFFIX) -ENDIF(MINGW AND NOT PCRE2_STATIC) -ENDIF(PCRE2_BUILD_PCRE2_8) - -# 16-bit library - -IF(PCRE2_BUILD_PCRE2_16) -ADD_LIBRARY(pcre2-16 ${PCRE2_HEADERS} ${PCRE2_SOURCES} 
${PROJECT_BINARY_DIR}/config.h) -SET_TARGET_PROPERTIES(pcre2-16 PROPERTIES - COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16 - MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}" - MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}" - VERSION ${LIBPCRE2_16_VERSION} - SOVERSION ${LIBPCRE2_16_SOVERSION}) -SET(targets ${targets} pcre2-16) - -IF(MINGW AND NOT PCRE2_STATIC) - IF(NON_STANDARD_LIB_PREFIX) - SET_TARGET_PROPERTIES(pcre2-16 PROPERTIES PREFIX "") - ENDIF(NON_STANDARD_LIB_PREFIX) - IF(NON_STANDARD_LIB_SUFFIX) - SET_TARGET_PROPERTIES(pcre2-16 PROPERTIES SUFFIX "-0.dll") - ENDIF(NON_STANDARD_LIB_SUFFIX) -ENDIF(MINGW AND NOT PCRE2_STATIC) -ENDIF(PCRE2_BUILD_PCRE2_16) - -# 32-bit library - -IF(PCRE2_BUILD_PCRE2_32) -ADD_LIBRARY(pcre2-32 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h) -SET_TARGET_PROPERTIES(pcre2-32 PROPERTIES - COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32 - MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}" - MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}" - VERSION ${LIBPCRE2_32_VERSION} - SOVERSION ${LIBPCRE2_32_SOVERSION}) -SET(targets ${targets} pcre2-32) - -IF(MINGW AND NOT PCRE2_STATIC) - IF(NON_STANDARD_LIB_PREFIX) - SET_TARGET_PROPERTIES(pcre2-32 PROPERTIES PREFIX "") - ENDIF(NON_STANDARD_LIB_PREFIX) - IF(NON_STANDARD_LIB_SUFFIX) - SET_TARGET_PROPERTIES(pcre2-32 PROPERTIES SUFFIX "-0.dll") - ENDIF(NON_STANDARD_LIB_SUFFIX) -ENDIF(MINGW AND NOT PCRE2_STATIC) -ENDIF(PCRE2_BUILD_PCRE2_32) - -# Executables - -IF(PCRE2_BUILD_PCRE2GREP) - ADD_EXECUTABLE(pcre2grep src/pcre2grep.c) - SET_PROPERTY(TARGET pcre2grep - PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8) - SET(targets ${targets} pcre2grep) - TARGET_LINK_LIBRARIES(pcre2grep pcre2-posix ${PCRE2GREP_LIBS}) -ENDIF(PCRE2_BUILD_PCRE2GREP) - -# Testing - -IF(PCRE2_BUILD_TESTS) - ENABLE_TESTING() - - SET(PCRE2TEST_SOURCES src/pcre2test.c) - - IF(MSVC) - # This is needed to avoid a stack overflow error in the 
standard tests. The - # flag should be indicated with a forward-slash instead of a hyphen, but - # then CMake treats it as a file path. - SET(PCRE2TEST_LINKER_FLAGS -STACK:2500000) - ENDIF(MSVC) - - ADD_EXECUTABLE(pcre2test ${PCRE2TEST_SOURCES}) - SET(targets ${targets} pcre2test) - IF(PCRE2_BUILD_PCRE2_8) - LIST(APPEND PCRE2TEST_LIBS pcre2-posix pcre2-8) - ENDIF(PCRE2_BUILD_PCRE2_8) - IF(PCRE2_BUILD_PCRE2_16) - LIST(APPEND PCRE2TEST_LIBS pcre2-16) - ENDIF(PCRE2_BUILD_PCRE2_16) - IF(PCRE2_BUILD_PCRE2_32) - LIST(APPEND PCRE2TEST_LIBS pcre2-32) - ENDIF(PCRE2_BUILD_PCRE2_32) - TARGET_LINK_LIBRARIES(pcre2test ${PCRE2TEST_LIBS} ${PCRE2TEST_LINKER_FLAGS}) - - IF(PCRE2_SUPPORT_JIT) - ADD_EXECUTABLE(pcre2_jit_test src/pcre2_jit_test.c) - SET(targets ${targets} pcre2_jit_test) - SET(PCRE2_JIT_TEST_LIBS ) - IF(PCRE2_BUILD_PCRE2_8) - LIST(APPEND PCRE2_JIT_TEST_LIBS pcre2-8) - ENDIF(PCRE2_BUILD_PCRE2_8) - IF(PCRE2_BUILD_PCRE2_16) - LIST(APPEND PCRE2_JIT_TEST_LIBS pcre2-16) - ENDIF(PCRE2_BUILD_PCRE2_16) - IF(PCRE2_BUILD_PCRE2_32) - LIST(APPEND PCRE2_JIT_TEST_LIBS pcre2-32) - ENDIF(PCRE2_BUILD_PCRE2_32) - TARGET_LINK_LIBRARIES(pcre2_jit_test ${PCRE2_JIT_TEST_LIBS}) - ENDIF(PCRE2_SUPPORT_JIT) - - # exes in Debug location tested by the RunTest and RunGrepTest shell scripts - # via "make test" - - # The commented out code below provokes a warning about future removal - # of the facility, and requires policy CMP0026 to be set to "OLD". I have - # got fed-up with the warnings, but my plea for help on the mailing list - # produced no response. So, I've hacked. The new code below seems to work on - # Linux. 
- -# IF(PCRE2_BUILD_PCRE2GREP) -# GET_TARGET_PROPERTY(PCRE2GREP_EXE pcre2grep DEBUG_LOCATION) -# ENDIF(PCRE2_BUILD_PCRE2GREP) -# -# GET_TARGET_PROPERTY(PCRE2TEST_EXE pcre2test DEBUG_LOCATION) - - IF(PCRE2_BUILD_PCRE2GREP) - SET(PCRE2GREP_EXE $<TARGET_FILE:pcre2grep>) - ENDIF(PCRE2_BUILD_PCRE2GREP) - - SET(PCRE2TEST_EXE $<TARGET_FILE:pcre2test>) - - -# ================================================= - # Write out a CTest configuration file - # - FILE(WRITE ${PROJECT_BINARY_DIR}/CTestCustom.ctest - "# This is a generated file. -MESSAGE(\"When testing is complete, review test output in the -\\\"${PROJECT_BINARY_DIR}/Testing/Temporary\\\" folder.\") -MESSAGE(\" \") -") - - FILE(WRITE ${PROJECT_BINARY_DIR}/pcre2_test.sh - "#! /bin/sh -# This is a generated file. -. ${PROJECT_SOURCE_DIR}/RunTest -if test \"$?\" != \"0\"; then exit 1; fi -# End -") - - IF(UNIX) - ADD_TEST(pcre2_test sh ${PROJECT_BINARY_DIR}/pcre2_test.sh) - ENDIF(UNIX) - - IF(PCRE2_BUILD_PCRE2GREP) - FILE(WRITE ${PROJECT_BINARY_DIR}/pcre2_grep_test.sh - "#! /bin/sh -# This is a generated file. -. ${PROJECT_SOURCE_DIR}/RunGrepTest -if test \"$?\" != \"0\"; then exit 1; fi -# End -") - - IF(UNIX) - ADD_TEST(pcre2_grep_test sh ${PROJECT_BINARY_DIR}/pcre2_grep_test.sh) - ENDIF(UNIX) - ENDIF(PCRE2_BUILD_PCRE2GREP) - - IF(WIN32) - # Provide environment for executing the bat file version of RunTest - FILE(TO_NATIVE_PATH ${PROJECT_SOURCE_DIR} winsrc) - FILE(TO_NATIVE_PATH ${PROJECT_BINARY_DIR} winbin) - FILE(TO_NATIVE_PATH ${PCRE2TEST_EXE} winexe) - - FILE(WRITE ${PROJECT_BINARY_DIR}/pcre2_test.bat - "\@REM This is a generated file. -\@echo off -setlocal -SET srcdir=\"${winsrc}\" -# The next line was replaced by the following one after a user comment.
-# SET pcre2test=\"${winexe}\" -SET pcre2test=\"${winbin}\\pcre2test.exe\" -if not [%CMAKE_CONFIG_TYPE%]==[] SET pcre2test=\"${winbin}\\%CMAKE_CONFIG_TYPE%\\pcre2test.exe\" -call %srcdir%\\RunTest.Bat -if errorlevel 1 exit /b 1 -echo RunTest.bat tests successfully completed -") - - ADD_TEST(NAME pcre2_test_bat - COMMAND pcre2_test.bat) - SET_TESTS_PROPERTIES(pcre2_test_bat PROPERTIES - PASS_REGULAR_EXPRESSION "RunTest\\.bat tests successfully completed") - - IF("$ENV{OSTYPE}" STREQUAL "msys") - # Both the sh and bat file versions of RunTest are run if make test is used - # in msys - ADD_TEST(pcre2_test_sh sh.exe ${PROJECT_BINARY_DIR}/pcre2_test.sh) - IF(PCRE2_BUILD_PCRE2GREP) - ADD_TEST(pcre2_grep_test sh.exe ${PROJECT_BINARY_DIR}/pcre2_grep_test.sh) - ENDIF(PCRE2_BUILD_PCRE2GREP) - ENDIF("$ENV{OSTYPE}" STREQUAL "msys") - ENDIF(WIN32) - - # Changed to accommodate testing whichever location was just built - - IF(PCRE2_SUPPORT_JIT) - ADD_TEST(pcre2_jit_test pcre2_jit_test) - ENDIF(PCRE2_SUPPORT_JIT) - -ENDIF(PCRE2_BUILD_TESTS) - -# Installation - -SET(CMAKE_INSTALL_ALWAYS 1) - -INSTALL(TARGETS ${targets} - RUNTIME DESTINATION bin - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) -INSTALL(FILES ${pkg_config_files} DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) -INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/pcre2-config" - DESTINATION bin - # Set 0755 permissions - PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE) - -INSTALL(FILES ${PCRE2_HEADERS} ${PCRE2POSIX_HEADERS} DESTINATION include) - -FILE(GLOB html ${PROJECT_SOURCE_DIR}/doc/html/*.html) -FILE(GLOB man1 ${PROJECT_SOURCE_DIR}/doc/*.1) -FILE(GLOB man3 ${PROJECT_SOURCE_DIR}/doc/*.3) - -FOREACH(man ${man3}) - GET_FILENAME_COMPONENT(man_tmp ${man} NAME) - SET(man3_new ${man3} ${man}) -ENDFOREACH(man ${man3}) -SET(man3 ${man3_new}) - -INSTALL(FILES ${man1} DESTINATION man/man1) -INSTALL(FILES ${man3} DESTINATION man/man3) 
-INSTALL(FILES ${html} DESTINATION share/doc/pcre2/html) - -IF(MSVC AND INSTALL_MSVC_PDB) - INSTALL(FILES ${PROJECT_BINARY_DIR}/pcre2.pdb - ${PROJECT_BINARY_DIR}/pcre2posix.pdb - DESTINATION bin - CONFIGURATIONS RelWithDebInfo) - INSTALL(FILES ${PROJECT_BINARY_DIR}/pcre2d.pdb - ${PROJECT_BINARY_DIR}/pcre2posixd.pdb - DESTINATION bin - CONFIGURATIONS Debug) -ENDIF(MSVC AND INSTALL_MSVC_PDB) - -# Help, only for nice output -IF(BUILD_SHARED_LIBS) - SET(BUILD_STATIC_LIBS OFF) -ELSE(BUILD_SHARED_LIBS) - SET(BUILD_STATIC_LIBS ON) -ENDIF(BUILD_SHARED_LIBS) - -IF(PCRE2_HEAP_MATCH_RECURSE) - MESSAGE(WARNING "HEAP_MATCH_RECURSE is obsolete and does nothing.") -ENDIF(PCRE2_HEAP_MATCH_RECURSE) - -IF(PCRE2_SHOW_REPORT) - STRING(TOUPPER "${CMAKE_BUILD_TYPE}" buildtype) - IF (CMAKE_C_FLAGS) - SET(cfsp " ") - ENDIF(CMAKE_C_FLAGS) - MESSAGE(STATUS "") - MESSAGE(STATUS "") - MESSAGE(STATUS "PCRE2-${PCRE2_MAJOR}.${PCRE2_MINOR} configuration summary:") - MESSAGE(STATUS "") - MESSAGE(STATUS " Install prefix .................. : ${CMAKE_INSTALL_PREFIX}") - MESSAGE(STATUS " C compiler ...................... : ${CMAKE_C_COMPILER}") - MESSAGE(STATUS " C compiler flags ................ : ${CMAKE_C_FLAGS}${cfsp}${CMAKE_C_FLAGS_${buildtype}}") - MESSAGE(STATUS "") - MESSAGE(STATUS " Build 8 bit PCRE2 library ....... : ${PCRE2_BUILD_PCRE2_8}") - MESSAGE(STATUS " Build 16 bit PCRE2 library ...... : ${PCRE2_BUILD_PCRE2_16}") - MESSAGE(STATUS " Build 32 bit PCRE2 library ...... : ${PCRE2_BUILD_PCRE2_32}") - MESSAGE(STATUS " Enable JIT compiling support .... : ${PCRE2_SUPPORT_JIT}") - MESSAGE(STATUS " Use SELinux allocator in JIT .... : ${PCRE2_SUPPORT_JIT_SEALLOC}") - MESSAGE(STATUS " Enable Unicode support .......... : ${PCRE2_SUPPORT_UNICODE}") - MESSAGE(STATUS " Newline char/sequence ........... : ${PCRE2_NEWLINE}") - MESSAGE(STATUS " \\R matches only ANYCRLF ......... : ${PCRE2_SUPPORT_BSR_ANYCRLF}") - MESSAGE(STATUS " \\C is disabled .................. 
: ${PCRE2_NEVER_BACKSLASH_C}") - MESSAGE(STATUS " EBCDIC coding ................... : ${PCRE2_EBCDIC}") - MESSAGE(STATUS " EBCDIC coding with NL=0x25 ...... : ${PCRE2_EBCDIC_NL25}") - MESSAGE(STATUS " Rebuild char tables ............. : ${PCRE2_REBUILD_CHARTABLES}") - MESSAGE(STATUS " Internal link size .............. : ${PCRE2_LINK_SIZE}") - MESSAGE(STATUS " Parentheses nest limit .......... : ${PCRE2_PARENS_NEST_LIMIT}") - MESSAGE(STATUS " Heap limit ...................... : ${PCRE2_HEAP_LIMIT}") - MESSAGE(STATUS " Match limit ..................... : ${PCRE2_MATCH_LIMIT}") - MESSAGE(STATUS " Match depth limit ............... : ${PCRE2_MATCH_LIMIT_DEPTH}") - MESSAGE(STATUS " Build shared libs ............... : ${BUILD_SHARED_LIBS}") - MESSAGE(STATUS " Build static libs ............... : ${BUILD_STATIC_LIBS}") - MESSAGE(STATUS " Build pcre2grep ................. : ${PCRE2_BUILD_PCRE2GREP}") - MESSAGE(STATUS " Enable JIT in pcre2grep ......... : ${PCRE2GREP_SUPPORT_JIT}") - MESSAGE(STATUS " Enable callouts in pcre2grep .... : ${PCRE2GREP_SUPPORT_CALLOUT}") - MESSAGE(STATUS " Enable callout fork in pcre2grep. : ${PCRE2GREP_SUPPORT_CALLOUT_FORK}") - MESSAGE(STATUS " Buffer size for pcre2grep ....... : ${PCRE2GREP_BUFSIZE}") - MESSAGE(STATUS " Build tests (implies pcre2test .. : ${PCRE2_BUILD_TESTS}") - MESSAGE(STATUS " and pcre2grep)") - IF(ZLIB_FOUND) - MESSAGE(STATUS " Link pcre2grep with libz ........ : ${PCRE2_SUPPORT_LIBZ}") - ELSE(ZLIB_FOUND) - MESSAGE(STATUS " Link pcre2grep with libz ........ : Library not found" ) - ENDIF(ZLIB_FOUND) - IF(BZIP2_FOUND) - MESSAGE(STATUS " Link pcre2grep with libbz2 ...... : ${PCRE2_SUPPORT_LIBBZ2}") - ELSE(BZIP2_FOUND) - MESSAGE(STATUS " Link pcre2grep with libbz2 ...... : Library not found" ) - ENDIF(BZIP2_FOUND) - IF(EDITLINE_FOUND) - MESSAGE(STATUS " Link pcre2test with libeditline . : ${PCRE2_SUPPORT_LIBEDIT}") - ELSE(EDITLINE_FOUND) - MESSAGE(STATUS " Link pcre2test with libeditline . 
: Library not found" ) - ENDIF(EDITLINE_FOUND) - IF(READLINE_FOUND) - MESSAGE(STATUS " Link pcre2test with libreadline . : ${PCRE2_SUPPORT_LIBREADLINE}") - ELSE(READLINE_FOUND) - MESSAGE(STATUS " Link pcre2test with libreadline . : Library not found" ) - ENDIF(READLINE_FOUND) - MESSAGE(STATUS " Support Valgrind .................: ${PCRE2_SUPPORT_VALGRIND}") - IF(PCRE2_DISABLE_PERCENT_ZT) - MESSAGE(STATUS " Use %zu and %td ..................: OFF" ) - ELSE(PCRE2_DISABLE_PERCENT_ZT) - MESSAGE(STATUS " Use %zu and %td ..................: AUTO" ) - ENDIF(PCRE2_DISABLE_PERCENT_ZT) - - IF(MINGW AND NOT PCRE2_STATIC) - MESSAGE(STATUS " Non-standard dll names (prefix) . : ${NON_STANDARD_LIB_PREFIX}") - MESSAGE(STATUS " Non-standard dll names (suffix) . : ${NON_STANDARD_LIB_SUFFIX}") - ENDIF(MINGW AND NOT PCRE2_STATIC) - - IF(MSVC) - MESSAGE(STATUS " Install MSVC .pdb files ..........: ${INSTALL_MSVC_PDB}") - ENDIF(MSVC) - - MESSAGE(STATUS "") -ENDIF(PCRE2_SHOW_REPORT) - -# end CMakeLists.txt diff --git a/pcre2/COPYING b/pcre2/COPYING deleted file mode 100644 index c233950f6..000000000 --- a/pcre2/COPYING +++ /dev/null @@ -1,5 +0,0 @@ -PCRE2 LICENCE - -Please see the file LICENCE in the PCRE2 distribution for licensing details. - -End diff --git a/pcre2/ChangeLog b/pcre2/ChangeLog deleted file mode 100644 index 2e20bdbb6..000000000 --- a/pcre2/ChangeLog +++ /dev/null @@ -1,2434 +0,0 @@ -Change Log for PCRE2 --------------------- - -Version 10.36-RC1 04-December-2020 ----------------------------------- - -1. Add CET_CFLAGS so that when Intel CET is enabled, pass -mshstk to -compiler. This fixes https://bugs.exim.org/show_bug.cgi?id=2578. Patch for -Makefile.am and configure.ac by H.J. Lu. Equivalent patch for CMakeLists.txt -invented by PH. - -2. Fix inifinite loop when a single byte newline is searched in JIT when -invalid utf8 mode is enabled. - -3. 
Updated CMakeLists.txt with patch from Wolfgang Stöggl (Bugzilla #2584): - - - Include GNUInstallDirs and use ${CMAKE_INSTALL_LIBDIR} instead of hardcoded - lib. This allows differentiation between lib and lib64. - CMAKE_INSTALL_LIBDIR is used for installation of libraries and also for - pkgconfig file generation. - - - Add the version of PCRE2 to the configuration summary like ./configure - does. - - - Fix typo: MACTHED_STRING->MATCHED_STRING - -4. Updated CMakeLists.txt with another patch from Wolfgang Stöggl (Bugzilla -#2588): - - - Add escaped double quotes around include directory in CMakeLists.txt to - allow spaces in directory names. - - - This fixes a cmake error, if the path of the pcre2 source contains a space. - -5. Updated CMakeLists.txt with a patch from B. Scott Michel: CMake's -documentation suggests using CHECK_SYMBOL_EXISTS over CHECK_FUNCTION_EXIST. -Moreover, these functions come from specific header files, which need to be -specified (and, thankfully, are the same on both the Linux and WinXX -platforms.) - -6. Added a (uint32_t) cast to prevent a compiler warning in pcre2_compile.c. - -7. Applied a patch from Wolfgang Stöggl (Bugzilla #2600) to fix postfix for -debug Windows builds using CMake. This also updated configure so that it -generates *.pc files and pcre2-config with the same content, as in the past. - -8. If a pattern ended with (?(VERSION=n.d where n is any number but d is just a -single digit, the code unit beyond d was being read (i.e. there was a read -buffer overflow). Fixes ClusterFuzz 23779. - -9. After the rework in r1235, certain character ranges were incorrectly -handled by an optimization in JIT. Furthermore a wrong offset was used to -read a value from a buffer which could lead to memory overread. - -10. Unnoticed for many years was the fact that delimiters other than / in the -testinput1 and testinput4 files could cause incorrect behaviour when these -files were processed by perltest.sh. 
There were several tests that used quotes -as delimiters, and it was just luck that they didn't go wrong with perltest.sh. -All the patterns in testinput1 and testinput4 now use / as their delimiter. -This fixes Bugzilla #2641. - -11. Perl has started to give an error for \K within lookarounds (though there -are cases where it doesn't). PCRE2 still allows this, so the tests that include -this case have been moved from test 1 to test 2. - -12. Further to 10 above, pcre2test has been updated to detect and grumble if a -delimiter other than / is used after #perltest. - -13. Fixed a bug with PCRE2_MATCH_INVALID_UTF in 8-bit mode when PCRE2_CASELESS -was set and PCRE2_NO_START_OPTIMIZE was not set. The optimization for finding -the start of a match was not resetting correctly after a failed match on the -first valid fragment of the subject, possibly causing incorrect "no match" -returns on subsequent fragments. For example, the pattern /A/ failed to match -the subject \xe5A. Fixes Bugzilla #2642. - -14. Fixed a bug in character set matching when JIT is enabled and both unicode -scripts and unicode classes are present at the same time. - -15. Added GNU grep's -m (aka --max-count) option to pcre2grep. - -16. Refactored substitution processing in pcre2grep strings, both for the -O -option and when dealing with callouts. There is now a single function that -handles $ expansion in all cases (instead of multiple copies of almost -identical code). This means that the same escape sequences are available -everywhere, which was not previously the case. At the same time, the escape -sequences $x{...} and $o{...} have been introduced, to allow for characters -whose code points are greater than 255 in Unicode mode. - -17. Applied the patch from Bugzilla #2628 to RunGrepTest. This does an explicit -test for a version of sed that can handle binary zero, instead of assuming that -any Linux version will work. Later: replaced $(...) by `...` because not all -shells recognize the former. 
- -18. Fixed a word boundary check bug in JIT when partial matching is enabled. - -19. Fix ARM64 compilation warning in JIT. Patch by Carlo. - -20. A bug in the RunTest script meant that if the first part of test 2 failed, -the failure was not reported. - -21. Test 2 was failing when run from a directory other than the source -directory. This failure was previously missed in RunTest because of 20 above. -Fixes added to both RunTest and RunTest.bat. - -22. Patch to CMakeLists.txt from Daniel to fix problem with testing under -Windows. - - -Version 10.35 09-May-2020 ---------------------------- - -1. Use PCRE2_MATCH_EMPTY flag to detect empty matches in JIT. - -2. Fix ARMv5 JIT improper handling of labels right after a constant pool. - -3. A JIT bug is fixed which allowed to read the fields of the compiled -pattern before its existence is checked. - -4. Back in the PCRE1 day, capturing groups that contained recursive back -references to themselves were made atomic (version 8.01, change 18) because -after the end a repeated group, the captured substrings had their values from -the final repetition, not from an earlier repetition that might be the -destination of a backtrack. This feature was documented, and was carried over -into PCRE2. However, it has now been realized that the major refactoring that -was done for 10.30 has made this atomicizing unnecessary, and it is confusing -when users are unaware of it, making some patterns appear not to be working as -expected. Capture values of recursive back references in repeated groups are -now correctly backtracked, so this unnecessary restriction has been removed. - -5. Added PCRE2_SUBSTITUTE_LITERAL. - -6. Avoid some VS compiler warnings. - -7. Added PCRE2_SUBSTITUTE_MATCHED. - -8. Added (?* and (?<* as synonms for (*napla: and (*naplb: to match another -regex engine. The Perl regex folks are aware of this usage and have made a note -about it. - -9. 
When an assertion is repeated, PCRE2 used to limit the maximum repetition to -1, believing that repeating an assertion is pointless. However, if a positive -assertion contains capturing groups, repetition can be useful. In any case, an -assertion could always be wrapped in a repeated group. The only restriction -that is now imposed is that an unlimited maximum is changed to one more than -the minimum. - -10. Fix *THEN verbs in lookahead assertions in JIT. - -11. Added PCRE2_SUBSTITUTE_REPLACEMENT_ONLY. - -12. The JIT stack should be freed when the low-level stack allocation fails. - -13. In pcre2grep, if the final line in a scanned file is output but does not -end with a newline sequence, add a newline according to the --newline setting. - -14. (?(DEFINE)...) groups were not being handled correctly when checking for -the fixed length of a lookbehind assertion. Such a group within a lookbehind -should be skipped, as it does not contribute to the length of the group. -Instead, the (DEFINE) group was being processed, and if at the end of the -lookbehind, that end was not correctly recognized. Errors such as "lookbehind -assertion is not fixed length" and also "internal error: bad code value in -parsed_skip()" could result. - -15. Put a limit of 1000 on recursive calls in pcre2_study() when searching -nested groups for starting code units, in order to avoid stack overflow issues. -If the limit is reached, it just gives up trying for this optimization. - -16. The control verb chain list must always be restored when exiting from a -recurse function in JIT. - -17. Fix a crash which occurs when the character type of an invalid UTF -character is decoded in JIT. - -18. Changes in many areas of the code so that when Unicode is supported and -PCRE2_UCP is set without PCRE2_UTF, Unicode character properties are used for -upper/lower case computations on characters whose code points are greater than -127. - -19. 
The function for checking UTF-16 validity was returning an incorrect offset -for the start of the error when a high surrogate was not followed by a valid -low surrogate. This caused incorrect behaviour, for example when -PCRE2_MATCH_INVALID_UTF was set and a match started immediately following the -invalid high surrogate, such as /aa/ matching "\x{d800}aa". - -20. If a DEFINE group immediately preceded a lookbehind assertion, the pattern -could be mis-compiled and therefore not match correctly. This is the example -that found this: /(?(DEFINE)(?bar))(? has been raised to -50, (b) the new --om-capture option changes the limit, (c) an error is raised -if -o asks for a group that is above the limit. - -12. The quantifier {1} was always being ignored, but this is incorrect when it -is made possessive and applied to an item in parentheses, because a -parenthesized item may contain multiple branches or other backtracking points, -for example /(a|ab){1}+c/ or /(a+){1}+a/. - -13. For partial matches, pcre2test was always showing the maximum lookbehind -characters, flagged with "<", which is misleading when the lookbehind didn't -actually look behind the start (because it was later in the pattern). Showing -all consulted preceding characters for partial matches is now controlled by the -existing "allusedtext" modifier and, as for complete matches, this facility is -available only for non-JIT matching, because JIT does not maintain the first -and last consulted characters. - -14. DFA matching (using pcre2_dfa_match()) was not recognising a partial match -if the end of the subject was encountered in a lookahead (conditional or -otherwise), an atomic group, or a recursion. - -15. Give error if pcre2test -t, -T, -tm or -TM is given an argument of zero. - -16. Check for integer overflow when computing lookbehind lengths. Fixes -Clusterfuzz issue 15636. - -17. Implemented non-atomic positive lookaround assertions. - -18. 
If a lookbehind contained a lookahead that contained another lookbehind -within it, the nested lookbehind was not correctly processed. For example, if -/(?<=(?=(?<=a)))b/ was matched to "ab" it gave no match instead of matching -"b". - -19. Implemented pcre2_get_match_data_size(). - -20. Two alterations to partial matching: - - (a) The definition of a partial match is slightly changed: if a pattern - contains any lookbehinds, an empty partial match may be given, because this - is another situation where adding characters to the current subject can - lead to a full match. Example: /c*+(?<=[bc])/ with subject "ab". - - (b) Similarly, if a pattern could match an empty string, an empty partial - match may be given. Example: /(?![ab]).*/ with subject "ab". This case - applies only to PCRE2_PARTIAL_HARD. - - (c) An empty string partial hard match can be returned for \z and \Z as it - is documented that they shouldn't match. - -21. A branch that started with (*ACCEPT) was not being recognized as one that -could match an empty string. - -22. Corrected pcre2_set_character_tables() tables data type: was const unsigned -char * instead of const uint8_t *, as generated by pcre2_maketables(). - -23. Upgraded to Unicode 12.1.0. - -24. Add -jitfast command line option to pcre2test (to make all the jit options -available directly). - -25. Make pcre2test -C show if libreadline or libedit is supported. - -26. If the length of one branch of a group exceeded 65535 (the maximum value -that is remembered as a minimum length), the whole group's length was -incorrectly recorded as 65535, leading to incorrect "no match" when start-up -optimizations were in force. - -27. The "rightmost consulted character" value was not always correct; in -particular, if a pattern ended with a negative lookahead, characters that were -inspected in that lookahead were not included. - -28. Add the pcre2_maketables_free() function. - -29. 
The start-up optimization that looks for a unique initial matching -code unit in the interpretive engines uses memchr() in 8-bit mode. When the -search is caseless, it was doing so inefficiently, which ended up slowing down -the match drastically when the subject was very long. The revised code (a) -remembers if one case is not found, so it never repeats the search for that -case after a bumpalong and (b) when one case has been found, it searches only -up to that position for an earlier occurrence of the other case. This fix -applies to both interpretive pcre2_match() and to pcre2_dfa_match(). - -30. While scanning to find the minimum length of a group, if any branch has -minimum length zero, there is no need to scan any subsequent branches (a small -compile-time performance improvement). - -31. Installed a .gitignore file on a user's suggestion. When using the svn -repository with git (through git svn) this helps keep it tidy. - -32. Add underflow check in JIT which may occur when the value of subject -string pointer is close to 0. - -33. Arrange for classes such as [Aa] which contain just the two cases of the -same character, to be treated as a single caseless character. This causes the -first and required code unit optimizations to kick in where relevant. - -34. Improve the bitmap of starting bytes for positive classes that include wide -characters, but no property types, in UTF-8 mode. Previously, on encountering -such a class, the bits for all bytes greater than \xc4 were set, thus -specifying any character with codepoint >= 0x100. Now the only bits that are -set are for the relevant bytes that start the wide characters. This can give a -noticeable performance improvement. - -35. If the bitmap of starting code units contains only 1 or 2 bits, replace it -with a single starting code unit (1 bit) or a caseless single starting code -unit if the two relevant characters are case-partners. 
This is particularly -relevant to the 8-bit library, though it applies to all. It can give a -performance boost for patterns such as [Ww]ord and (word|WORD). However, this -optimization doesn't happen if there is a "required" code unit of the same -value (because the search for a "required" code unit starts at the match start -for non-unique first code unit patterns, but after a unique first code unit, -and patterns such as a*a need the former action). - -36. Small patch to pcre2posix.c to set the erroroffset field to -1 immediately -after a successful compile, instead of at the start of matching to avoid a -sanitizer complaint (regexec is supposed to be thread safe). - -37. Add NEON vectorization to JIT to speed up matching of first character and -pairs of characters on ARM64 CPUs. - -38. If a non-ASCII character was the first in a starting assertion in a -caseless match, the "first code unit" optimization did not get the casing -right, and the assertion failed to match a character in the other case if it -did not start with the same code unit. - -39. Fixed the incorrect computation of jump sizes on x86 CPUs in JIT. A masking -operation was incorrectly removed in r1136. Reported by Ralf Junker. - - -Version 10.33 16-April-2019 ---------------------------- - -1. Added "allvector" to pcre2test to make it easy to check the part of the -ovector that shouldn't be changed, in particular after substitute and failed or -partial matches. - -2. Fix subject buffer overread in JIT when UTF is disabled and \X or \R has -a greater than 1 fixed quantifier. This issue was found by Yunho Kim. - -3. Added support for callouts from pcre2_substitute(). After 10.33-RC1, but -prior to release, fixed a bug that caused a crash if pcre2_substitute() was -called with a NULL match context. - -4. The POSIX functions are now all called pcre2_regcomp() etc., with wrapper -functions that use the standard POSIX names. However, in pcre2posix.h the POSIX -names are defined as macros. 
This should help avoid linking with the wrong -library in some environments while still exporting the POSIX names for -pre-existing programs that use them. (The Debian alternative names are also -defined as macros, but not documented.) - -5. Fix an xclass matching issue in JIT. - -6. Implement PCRE2_EXTRA_ESCAPED_CR_IS_LF (see Bugzilla 2315). - -7. Implement the Perl 5.28 experimental alphabetic names for atomic groups and -lookaround assertions, for example, (*pla:...) and (*atomic:...). These are -characterized by a lower case letter following (* and to simplify coding for -this, the character tables created by pcre2_maketables() were updated to add a -new "is lower case letter" bit. At the same time, the now unused "is -hexadecimal digit" bit was removed. The default tables in -src/pcre2_chartables.c.dist are updated. - -8. Implement the new Perl "script run" features (*script_run:...) and -(*atomic_script_run:...) aka (*sr:...) and (*asr:...). - -9. Fixed two typos in change 22 for 10.21, which added special handling for -ranges such as a-z in EBCDIC environments. The original code probably never -worked, though there were no bug reports. - -10. Implement PCRE2_COPY_MATCHED_SUBJECT for pcre2_match() (including JIT via -pcre2_match()) and pcre2_dfa_match(), but *not* the pcre2_jit_match() fast -path. Also, when a match fails, set the subject field in the match data to NULL -for tidiness - none of the substring extractors should reference this after -match failure. - -11. If a pattern started with a subroutine call that had a quantifier with a -minimum of zero, an incorrect "match must start with this character" could be -recorded. Example: /(?&xxx)*ABC(?<xxx>XYZ)/ would (incorrectly) expect 'A' to -be the first character of a match. - -12. The heap limit checking code in pcre2_dfa_match() could suffer from -overflow if the heap limit was set very large. This could cause incorrect "heap -limit exceeded" errors. - -13.
Add "kibibytes" to the heap limit output from pcre2test -C to make the -units clear. - -14. Add a call to pcre2_jit_free_unused_memory() in pcre2grep, for tidiness. - -15. Updated the VMS-specific code in pcre2test on the advice of a VMS user. - -16. Removed the unnecessary inclusion of stdint.h (or inttypes.h) from -pcre2_internal.h as it is now included by pcre2.h. Also, change 17 for 10.32 -below was unnecessarily complicated, as inttypes.h is a Standard C header, -which is defined to be a superset of stdint.h. Instead of conditionally -including stdint.h or inttypes.h, pcre2.h now unconditionally includes -inttypes.h. This supports environments that do not have stdint.h but do have -inttypes.h, which are known to exist. A note in the autotools documentation -says (November 2018) that there are none known that are the other way round. - -17. Added --disable-percent-zt to "configure" (and equivalent to CMake) to -forcibly disable the use of %zu and %td in formatting strings because there is -at least one version of VMS that claims to be C99 but does not support these -modifiers. - -18. Added --disable-pcre2grep-callout-fork, which restricts the callout support -in pcre2grep to the inbuilt echo facility. This may be useful in environments -that do not support fork(). - -19. Fix two instances of <= 0 being applied to unsigned integers (the VMS -compiler complains). - -20. Added "fork" support for VMS to pcre2grep, for running an external program -via a string callout. - -21. Improve MAP_JIT flag usage on MacOS. Patch by Rich Siegel. - -22. If a pattern started with (*MARK), (*COMMIT), (*PRUNE), (*SKIP), or (*THEN) -followed by ^ it was not recognized as anchored. - -23. The RunGrepTest script used to cut out the test of NUL characters for -Solaris and MacOS as printf and sed can't handle them. It seems that the *BSD -systems can't either. I've inverted the test so that only those OS that are -known to work (currently only Linux) try to run this test. - -24. 
Some tests in RunGrepTest appended to testtrygrep from two different file -descriptors instead of redirecting stderr to stdout. This worked on Linux, but -it was reported not to on other systems, causing the tests to fail. - -25. In the RunTest script, make the test for stack setting use the same value -for the stack as it needs for -bigstack. - -26. Insert a cast in pcre2_dfa_match.c to suppress a compiler warning. - -26. With PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL set, escape sequences such as \s -which are valid in character classes, but not as the end of ranges, were being -treated as literals. An example is [_-\s] (but not [\s-_] because that gave an -error at the *start* of a range). Now an "invalid range" error is given -independently of PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL. - -27. Related to 26 above, PCRE2_BAD_ESCAPE_IS_LITERAL was affecting known escape -sequences such as \eX when they appeared invalidly in a character class. Now -the option applies only to unrecognized or malformed escape sequences. - -28. Fix word boundary in JIT compiler. Patch by Mike Munday. - -29. The pcre2_dfa_match() function was incorrectly handling conditional version -tests such as (?(VERSION>=0)...) when the version test was true. Incorrect -processing or a crash could result. - -30. When PCRE2_UTF is set, allow non-ASCII letters and decimal digits in group -names, as Perl does. There was a small bug in this new code, found by -ClusterFuzz 12950, fixed before release. - -31. Implemented PCRE2_EXTRA_ALT_BSUX to support ECMAScript 6's \u{hhh} -construct. - -32. Compile \p{Any} to be the same as . in DOTALL mode, so that it benefits -from auto-anchoring if \p{Any}* starts a pattern. - -33. Compile invalid UTF check in JIT test when only pcre32 is enabled. - -34. For some time now, CMake has been warning about the setting of policy -CMP0026 to "OLD" in CmakeLists.txt, and hinting that the feature might be -removed in a future version. 
A request for CMake expertise on the list produced -no result, so I have now hacked CMakeLists.txt along the lines of some changes -I found on the Internet. The new code no longer needs the policy setting, and -it appears to work fine on Linux. - -35. Setting --enable-jit=auto for an out-of-tree build failed because the -source directory wasn't in the search path for AC_TRY_COMPILE always. Patch -from Ross Burton. - -36. Disable SSE2 JIT optimizations in x86 CPUs when SSE2 is not available. -Patch by Guillem Jover. - -37. Changed expressions such as 1<<10 to 1u<<10 in many places because compiler -warnings were reported. - -38. Using the clang compiler with sanitizing options causes runtime complaints -about truncation for statments such as x = ~x when x is an 8-bit value; it -seems to compute ~x as a 32-bit value. Changing such statements to x = 255 ^ x -gets rid of the warnings. There were also two missing casts in pcre2test. - - -Version 10.32 10-September-2018 -------------------------------- - -1. When matching using the the REG_STARTEND feature of the POSIX API with a -non-zero starting offset, unset capturing groups with lower numbers than a -group that did capture something were not being correctly returned as "unset" -(that is, with offset values of -1). - -2. When matching using the POSIX API, pcre2test used to omit listing unset -groups altogether. Now it shows those that come before any actual captures as -"", as happens for non-POSIX matching. - -3. Running "pcre2test -C" always stated "\R matches CR, LF, or CRLF only", -whatever the build configuration was. It now correctly says "\R matches all -Unicode newlines" in the default case when --enable-bsr-anycrlf has not been -specified. Similarly, running "pcre2test -C bsr" never produced the result -ANY. - -4. Matching the pattern /(*UTF)\C[^\v]+\x80/ against an 8-bit string containing -multi-code-unit characters caused bad behaviour and possibly a crash. 
This -issue was fixed for other kinds of repeat in release 10.20 by change 19, but -repeating character classes were overlooked. - -5. pcre2grep now supports the inclusion of binary zeros in patterns that are -read from files via the -f option. - -6. A small fix to pcre2grep to avoid compiler warnings for -Wformat-overflow=2. - -7. Added --enable-jit=auto support to configure.ac. - -8. Added some dummy variables to the heapframe structure in 16-bit and 32-bit -modes for the benefit of m68k, where pointers can be 16-bit aligned. The -dummies force 32-bit alignment and this ensures that the structure is a -multiple of PCRE2_SIZE, a requirement that is tested at compile time. In other -architectures, alignment requirements take care of this automatically. - -9. When returning an error from pcre2_pattern_convert(), ensure the error -offset is set zero for early errors. - -10. A number of patches for Windows support from Daniel Richard G: - - (a) List of error numbers in Runtest.bat corrected (it was not the same as in - Runtest). - - (b) pcre2grep snprintf() workaround as used elsewhere in the tree. - - (c) Support for non-C99 snprintf() that returns -1 in the overflow case. - -11. Minor tidy of pcre2_dfa_match() code. - -12. Refactored pcre2_dfa_match() so that the internal recursive calls no longer -use the stack for local workspace and local ovectors. Instead, an initial block -of stack is reserved, but if this is insufficient, heap memory is used. The -heap limit parameter now applies to pcre2_dfa_match(). - -13. If a "find limits" test of DFA matching in pcre2test resulted in too many -matches for the ovector, no matches were displayed. - -14. Removed an occurrence of ctrl/Z from test 6 because Windows treats it as -EOF. The test looks to have come from a fuzzer. - -15. If PCRE2 was built with a default match limit a lot greater than the -default default of 10 000 000, some JIT tests of the match limit no longer -failed. 
All such tests now set 10 000 000 as the upper limit. - -16. Another Windows related patch for pcregrep to ensure that WIN32 is -undefined under Cygwin. - -17. Test for the presence of stdint.h and inttypes.h in configure and CMake and -include whichever exists (stdint preferred) instead of unconditionally -including stdint. This makes life easier for old and non-standard systems. - -18. Further changes to improve portability, especially to old and or non- -standard systems: - - (a) Put all printf arguments in RunGrepTest into single, not double, quotes, - and use \0 not \x00 for binary zero. - - (b) Avoid the use of C++ (i.e. BCPL) // comments. - - (c) Parameterize the use of %zu in pcre2test to make it like %td. For both of - these now, if using MSVC or a standard C before C99, %lu is used with a - cast if necessary. - -19. Applied a contributed patch to CMakeLists.txt to increase the stack size -when linking pcre2test with MSVC. This gets rid of a stack overflow error in -the standard set of tests. - -20. Output a warning in pcre2test when ignoring the "altglobal" modifier when -it is given with the "replace" modifier. - -21. In both pcre2test and pcre2_substitute(), with global matching, a pattern -that matched an empty string, but never at the starting match offset, was not -handled in a Perl-compatible way. The pattern /(a(*:1))(?>b)(*SKIP:1)x|.*/ matched against "abc", where the *SKIP -shouldn't find a MARK (because is in an atomic group), but it did. - -26. Upgraded the perltest.sh script: (1) #pattern lines can now be used to set -a list of modifiers for all subsequent patterns - only those that the script -recognizes are meaningful; (2) #subject lines can be used to set or unset a -default "mark" modifier; (3) Unsupported #command lines give a warning when -they are ignored; (4) Mark data is output only if the "mark" modifier is -present. - -27. (*ACCEPT:ARG), (*FAIL:ARG), and (*COMMIT:ARG) are now supported. - -28. 
A (*MARK) name was not being passed back for positive assertions that were -terminated by (*ACCEPT). - -29. Add support for \N{U+dddd}, but only in Unicode mode. - -30. Add support for (?^) for unsetting all imnsx options. - -31. The PCRE2_EXTENDED (/x) option only ever discarded space characters whose -code point was less than 256 and that were recognized by the lookup table -generated by pcre2_maketables(), which uses isspace() to identify white space. -Now, when Unicode support is compiled, PCRE2_EXTENDED also discards U+0085, -U+200E, U+200F, U+2028, and U+2029, which are additional characters defined by -Unicode as "Pattern White Space". This makes PCRE2 compatible with Perl. - -32. In certain circumstances, option settings within patterns were not being -correctly processed. For example, the pattern /((?i)A)(?m)B/ incorrectly -matched "ab". (The (?m) setting lost the fact that (?i) should be reset at the -end of its group during the parse process, but without another setting such as -(?m) the compile phase got it right.) This bug was introduced by the -refactoring in release 10.23. - -33. PCRE2 uses bcopy() if available when memmove() is not, and it used just to -define memmove() as function call to bcopy(). This hasn't been tested for a -long time because in pcre2test the result of memmove() was being used, whereas -bcopy() doesn't return a result. This feature is now refactored always to call -an emulation function when there is no memmove(). The emulation makes use of -bcopy() when available. - -34. When serializing a pattern, set the memctl, executable_jit, and tables -fields (that is, all the fields that contain pointers) to zeros so that the -result of serializing is always the same. These fields are re-set when the -pattern is deserialized. - -35. 
In a pattern such as /[^\x{100}-\x{ffff}]*[\x80-\xff]/ which has a repeated -negative class with no characters less than 0x100 followed by a positive class -with only characters less than 0x100, the first class was incorrectly being -auto-possessified, causing incorrect match failures. - -36. Removed the character type bit ctype_meta, which dates from PCRE1 and is -not used in PCRE2. - -37. Tidied up unnecessarily complicated macros used in the escapes table. - -38. Since 10.21, the new testoutput8-16-4 file has accidentally been omitted -from distribution tarballs, owing to a typo in Makefile.am which had -testoutput8-16-3 twice. Now fixed. - -39. If the only branch in a conditional subpattern was anchored, the whole -subpattern was treated as anchored, when it should not have been, since the -assumed empty second branch cannot be anchored. Demonstrated by test patterns -such as /(?(1)^())b/ or /(?(?=^))b/. - -40. A repeated conditional subpattern that could match an empty string was -always assumed to be unanchored. Now it it checked just like any other -repeated conditional subpattern, and can be found to be anchored if the minimum -quantifier is one or more. I can't see much use for a repeated anchored -pattern, but the behaviour is now consistent. - -41. Minor addition to pcre2_jit_compile.c to avoid static analyzer complaint -(for an event that could never occur but you had to have external information -to know that). - -42. If before the first match in a file that was being searched by pcre2grep -there was a line that was sufficiently long to cause the input buffer to be -expanded, the variable holding the location of the end of the previous match -was being adjusted incorrectly, and could cause an overflow warning from a code -sanitizer. However, as the value is used only to print pending "after" lines -when the next match is reached (and there are no such lines in this case) this -bug could do no damage. 
- - -Version 10.31 12-February-2018 ------------------------------- - -1. Fix typo (missing ]) in VMS code in pcre2test.c. - -2. Replace the replicated code for matching extended Unicode grapheme sequences -(which got a lot more complicated by change 10.30/49) by a single subroutine -that is called by both pcre2_match() and pcre2_dfa_match(). - -3. Add idempotent guard to pcre2_internal.h. - -4. Add new pcre2_config() options: PCRE2_CONFIG_NEVER_BACKSLASH_C and -PCRE2_CONFIG_COMPILED_WIDTHS. - -5. Cut out \C tests in the JIT regression tests when NEVER_BACKSLASH_C is -defined (e.g. by --enable-never-backslash-C). - -6. Defined public names for all the pcre2_compile() error numbers, and used -the public names in pcre2_convert.c. - -7. Fixed a small memory leak in pcre2test (convert contexts). - -8. Added two casts to compile.c and one to match.c to avoid compiler warnings. - -9. Added code to pcre2grep when compiled under VMS to set the symbol -PCRE2GREP_RC to the exit status, because VMS does not distinguish between -exit(0) and exit(1). - -10. Added the -LM (list modifiers) option to pcre2test. Also made -C complain -about a bad option only if the following argument item does not start with a -hyphen. - -11. pcre2grep was truncating components of file names to 128 characters when -processing files with the -r option, and also (some very odd code) truncating -path names to 512 characters. There is now a check on the absolute length of -full path file names, which may be up to 2047 characters long. - -12. When an assertion contained (*ACCEPT) it caused all open capturing groups -to be closed (as for a non-assertion ACCEPT), which was wrong and could lead to -misbehaviour for subsequent references to groups that started outside the -assertion. ACCEPT in an assertion now closes only those groups that were -started within that assertion. Fixes oss-fuzz issues 3852 and 3891. - -13. 
Multiline matching in pcre2grep was misbehaving if the pattern matched -within a line, and then matched again at the end of the line and over into -subsequent lines. Behaviour was different with and without colouring, and -sometimes context lines were incorrectly printed and/or line endings were lost. -All these issues should now be fixed. - -14. If --line-buffered was specified for pcre2grep when input was from a -compressed file (.gz or .bz2) a segfault occurred. (Line buffering should be -ignored for compressed files.) - -15. Although pcre2_jit_match checks whether the pattern is compiled -in a given mode, it was also expected that at least one mode is available. -This is fixed and pcre2_jit_match returns with PCRE2_ERROR_JIT_BADOPTION -when the pattern is not optimized by JIT at all. - -16. The line number and related variables such as match counts in pcre2grep -were all int variables, causing overflow when files with more than 2147483647 -lines were processed (assuming 32-bit ints). They have all been changed to -unsigned long ints. - -17. If a backreference with a minimum repeat count of zero was first in a -pattern, apart from assertions, an incorrect first matching character could be -recorded. For example, for the pattern /(?=(a))\1?b/, "b" was incorrectly set -as the first character of a match. - -18. Characters in a leading positive assertion are considered for recording a -first character of a match when the rest of the pattern does not provide one. -However, a character in a non-assertive group within a leading assertion such -as in the pattern /(?=(a))\1?b/ caused this process to fail. This was an -infelicity rather than an outright bug, because it did not affect the result of -a match, just its speed. (In fact, in this case, the starting 'a' was -subsequently picked up in the study.) - -19. 
A minor tidy in pcre2_match(): making all PCRE2_ERROR_ returns use "return" -instead of "RRETURN" saves unwinding the backtracks in these cases (only one -didn't). - -20. Allocate a single callout block on the stack at the start of pcre2_match() -and set its never-changing fields once only. Do the same for pcre2_dfa_match(). - -21. Save the extra compile options (set in the compile context) with the -compiled pattern (they were not previously saved), add PCRE2_INFO_EXTRAOPTIONS -to retrieve them, and update pcre2test to show them. - -22. Added PCRE2_CALLOUT_STARTMATCH and PCRE2_CALLOUT_BACKTRACK bits to a new -field callout_flags in callout blocks. The bits are set by pcre2_match(), but -not by JIT or pcre2_dfa_match(). Their settings are shown in pcre2test callouts -if the callout_extra subject modifier is set. These bits are provided to help -with tracking how a backtracking match is proceeding. - -23. Updated the pcre2demo.c demonstration program, which was missing the extra -code for -g that handles the case when \K in an assertion causes the match to -end at the original start point. Also arranged for it to detect when \K causes -the end of a match to be before its start. - -24. Similar to 23 above, strange things (including loops) could happen in -pcre2grep when \K was used in an assertion when --colour was used or in -multiline mode. The "end at original start point" bug is fixed, and if the end -point is found to be before the start point, they are swapped. - -25. When PCRE2_FIRSTLINE without PCRE2_NO_START_OPTIMIZE was used in non-JIT -matching (both pcre2_match() and pcre2_dfa_match()) and the matched string -started with the first code unit of a newline sequence, matching failed because -it was not tried at the newline. - -26. 
Code for giving up a non-partial match after failing to find a starting -code unit anywhere in the subject was missing when searching for one of a -number of code units (the bitmap case) in both pcre2_match() and -pcre2_dfa_match(). This was a missing optimization rather than a bug. - -27. Tidied up the ACROSSCHAR macro to be like FORWARDCHAR and BACKCHAR, using a -pointer argument rather than a code unit value. This should not have affected -the generated code. - -28. The JIT compiler has been updated. - -29. Avoid pointer overflow for unset captures in pcre2_substring_list_get(). -This could not actually cause a crash because it was always used in a memcpy() -call with zero length. - -30. Some internal structures have a variable-length ovector[] as their last -element. Their actual memory is obtained dynamically, giving an ovector of -appropriate length. However, they are defined in the structure as -ovector[NUMBER], where NUMBER is large so that array bound checkers don't -grumble. The value of NUMBER was 10000, but a fuzzer exceeded 5000 capturing -groups, making the ovector larger than this. The number has been increased to -131072, which allows for the maximum number of captures (65535) plus the -overall match. This fixes oss-fuzz issue 5415. - -31. Auto-possessification at the end of a capturing group was dependent on what -follows the group (e.g. /(a+)b/ would auto-possessify the a+) but this caused -incorrect behaviour when the group was called recursively from elsewhere in the -pattern where something different might follow. This bug is an unforseen -consequence of change #1 for 10.30 - the implementation of backtracking into -recursions. Iterators at the ends of capturing groups are no longer considered -for auto-possessification if the pattern contains any recursions. Fixes -Bugzilla #2232. - - -Version 10.30 14-August-2017 ----------------------------- - -1. 
The main interpreter, pcre2_match(), has been refactored into a new version -that does not use recursive function calls (and therefore the stack) for -remembering backtracking positions. This makes --disable-stack-for-recursion a -NOOP. The new implementation allows backtracking into recursive group calls in -patterns, making it more compatible with Perl, and also fixes some other -hard-to-do issues such as #1887 in Bugzilla. The code is also cleaner because -the old code had a number of fudges to try to reduce stack usage. It seems to -run no slower than the old code. - -A number of bugs in the refactored code were subsequently fixed during testing -before release, but after the code was made available in the repository. These -bugs were never in fully released code, but are noted here for the record. - - (a) If a pattern had fewer capturing parentheses than the ovector supplied in - the match data block, a memory error (detectable by ASAN) occurred after - a match, because the external block was being set from non-existent - internal ovector fields. Fixes oss-fuzz issue 781. - - (b) A pattern with very many capturing parentheses (when the internal frame - size was greater than the initial frame vector on the stack) caused a - crash. A vector on the heap is now set up at the start of matching if the - vector on the stack is not big enough to handle at least 10 frames. - Fixes oss-fuzz issue 783. - - (c) Handling of (*VERB)s in recursions was wrong in some cases. - - (d) Captures in negative assertions that were used as conditions were not - happening if the assertion matched via (*ACCEPT). - - (e) Mark values were not being passed out of recursions. - - (f) Refactor some code in do_callout() to avoid picky compiler warnings about - negative indices. Fixes oss-fuzz issue 1454. - - (g) Similarly refactor the way the variable length ovector is addressed for - similar reasons. Fixes oss-fuzz issue 1465. - -2. 
Now that pcre2_match() no longer uses recursive function calls (see above), -the "match limit recursion" value seems misnamed. It still exists, and limits -the depth of tree that is searched. To avoid future confusion, it has been -renamed as "depth limit" in all relevant places (--with-depth-limit, -(*LIMIT_DEPTH), pcre2_set_depth_limit(), etc) but the old names are still -available for backwards compatibility. - -3. Hardened pcre2test so as to reduce the number of bugs reported by fuzzers: - - (a) Check for malloc failures when getting memory for the ovector (POSIX) or - the match data block (non-POSIX). - -4. In the 32-bit library in non-UTF mode, an attempt to find a Unicode property -for a character with a code point greater than 0x10ffff (the Unicode maximum) -caused a crash. - -5. If a lookbehind assertion that contained a back reference to a group -appearing later in the pattern was compiled with the PCRE2_ANCHORED option, -undefined actions (often a segmentation fault) could occur, depending on what -other options were set. An example assertion is (?" should be ">=" in opcode check in pcre2_auto_possess.c. - (b) Added some casts to avoid "suspicious implicit sign extension". - (c) Resource leaks in pcre2test in rare error cases. - (d) Avoid warning for never-use case OP_TABLE_LENGTH which is just a fudge - for checking at compile time that tables are the right size. - (e) Add missing "fall through" comment. - -29. Implemented PCRE2_EXTENDED_MORE and related /xx and (?xx) features. - -30. Implement (?n: for PCRE2_NO_AUTO_CAPTURE, because Perl now has this. - -31. If more than one of "push", "pushcopy", or "pushtablescopy" were set in -pcre2test, a crash could occur. - -32. Make -bigstack in RunTest allocate a 64MiB stack (instead of 16MiB) so -that all the tests can run with clang's sanitizing options. - -33. Implement extra compile options in the compile context and add the first -one: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES. - -34. 
Implement newline type PCRE2_NEWLINE_NUL. - -35. A lookbehind assertion that had a zero-length branch caused undefined -behaviour when processed by pcre2_dfa_match(). This is oss-fuzz issue 1859. - -36. The match limit value now also applies to pcre2_dfa_match() as there are -patterns that can use up a lot of resources without necessarily recursing very -deeply. (Compare item 10.23/36.) This should fix oss-fuzz #1761. - -37. Implement PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL. - -38. Fix returned offsets from regexec() when REG_STARTEND is used with a -starting offset greater than zero. - -39. Implement REG_PEND (GNU extension) for the POSIX wrapper. - -40. Implement the subject_literal modifier in pcre2test, and allow jitstack on -pattern lines. - -41. Implement PCRE2_LITERAL and use it to support REG_NOSPEC. - -42. Implement PCRE2_EXTRA_MATCH_LINE and PCRE2_EXTRA_MATCH_WORD for the benefit -of pcre2grep. - -43. Re-implement pcre2grep's -F, -w, and -x options using PCRE2_LITERAL, -PCRE2_EXTRA_MATCH_WORD, and PCRE2_EXTRA_MATCH_LINE. This fixes two bugs: - - (a) The -F option did not work for fixed strings containing \E. - (b) The -w option did not work for patterns with multiple branches. - -44. Added configuration options for the SELinux compatible execmem allocator in -JIT. - -45. Increased the limit for searching for a "must be present" code unit in -subjects from 1000 to 2000 for 8-bit searches, since they use memchr() and are -much faster. - -46. Arrange for anchored patterns to record and use "first code unit" data, -because this can give a fast "no match" without searching for a "required code -unit". Previously only non-anchored patterns did this. - -47. Upgraded the Unicode tables from Unicode 8.0.0 to Unicode 10.0.0. - -48. Add the callout_no_where modifier to pcre2test. - -49. Update extended grapheme breaking rules to the latest set that are in -Unicode Standard Annex #29. - -50. 
Added experimental foreign pattern conversion facilities -(pcre2_pattern_convert() and friends). - -51. Change the macro FWRITE, used in pcre2grep, to FWRITE_IGNORE because FWRITE -is defined in a system header in cygwin. Also modified some of the #ifdefs in -pcre2grep related to Windows and Cygwin support. - -52. Change 3(g) for 10.23 was a bit too zealous. If a hyphen that follows a -character class is the last character in the class, Perl does not give a -warning. PCRE2 now also treats this as a literal. - -53. Related to 52, though PCRE2 was throwing an error for [[:digit:]-X] it was -not doing so for [\d-X] (and similar escapes), as is documented. - -54. Fixed a MIPS issue in the JIT compiler reported by Joshua Kinard. - -55. Fixed a "maybe uninitialized" warning for class_uchardata in \p handling in -pcre2_compile() which could never actually trigger (code should have been cut -out when Unicode support is disabled). - - -Version 10.23 14-February-2017 ------------------------------- - -1. Extended pcre2test with the utf8_input modifier so that it is able to -generate all possible 16-bit and 32-bit code unit values in non-UTF modes. - -2. In any wide-character mode (8-bit UTF or any 16-bit or 32-bit mode), without -PCRE2_UCP set, a negative character type such as \D in a positive class should -cause all characters greater than 255 to match, whatever else is in the class. -There was a bug that caused this not to happen if a Unicode property item was -added to such a class, for example [\D\P{Nd}] or [\W\pL]. - -3. There has been a major re-factoring of the pcre2_compile.c file. Most syntax -checking is now done in the pre-pass that identifies capturing groups. This has -reduced the amount of duplication and made the code tidier. While doing this, -some minor bugs and Perl incompatibilities were fixed, including: - - (a) \Q\E in the middle of a quantifier such as A+\Q\E+ is now ignored instead - of giving an invalid quantifier error. 
- - (b) {0} can now be used after a group in a lookbehind assertion; previously - this caused an "assertion is not fixed length" error. - - (c) Perl always treats (?(DEFINE) as a "define" group, even if a group with - the name "DEFINE" exists. PCRE2 now does likewise. - - (d) A recursion condition test such as (?(R2)...) must now refer to an - existing subpattern. - - (e) A conditional recursion test such as (?(R)...) misbehaved if there was a - group whose name began with "R". - - (f) When testing zero-terminated patterns under valgrind, the terminating - zero is now marked "no access". This catches bugs that would otherwise - show up only with non-zero-terminated patterns. - - (g) A hyphen appearing immediately after a POSIX character class (for example - /[[:ascii:]-z]/) now generates an error. Perl does accept this as a - literal, but gives a warning, so it seems best to fail it in PCRE. - - (h) An empty \Q\E sequence may appear after a callout that precedes an - assertion condition (it is, of course, ignored). - -One effect of the refactoring is that some error numbers and messages have -changed, and the pattern offset given for compiling errors is not always the -right-most character that has been read. In particular, for a variable-length -lookbehind assertion it now points to the start of the assertion. Another -change is that when a callout appears before a group, the "length of next -pattern item" that is passed now just gives the length of the opening -parenthesis item, not the length of the whole group. A length of zero is now -given only for a callout at the end of the pattern. Automatic callouts are no -longer inserted before and after explicit callouts in the pattern. - -A number of bugs in the refactored code were subsequently fixed during testing -before release, but after the code was made available in the repository. Many -of the bugs were discovered by fuzzing testing. 
Several of them were related to -the change from assuming a zero-terminated pattern (which previously had -required non-zero terminated strings to be copied). These bugs were never in -fully released code, but are noted here for the record. - - (a) An overall recursion such as (?0) inside a lookbehind assertion was not - being diagnosed as an error. - - (b) In utf mode, the length of a *MARK (or other verb) name was being checked - in characters instead of code units, which could lead to bad code being - compiled, leading to unpredictable behaviour. - - (c) In extended /x mode, characters whose code was greater than 255 caused - a lookup outside one of the global tables. A similar bug existed for wide - characters in *VERB names. - - (d) The amount of memory needed for a compiled pattern was miscalculated if a - lookbehind contained more than one toplevel branch and the first branch - was of length zero. - - (e) In UTF-8 or UTF-16 modes with PCRE2_EXTENDED (/x) set and a non-zero- - terminated pattern, if a # comment ran on to the end of the pattern, one - or more code units past the end were being read. - - (f) An unterminated repeat at the end of a non-zero-terminated pattern (e.g. - "{2,2") could cause reading beyond the pattern. - - (g) When reading a callout string, if the end delimiter was at the end of the - pattern one further code unit was read. - - (h) An unterminated number after \g' could cause reading beyond the pattern. - - (i) An insufficient memory size was being computed for compiling with - PCRE2_AUTO_CALLOUT. - - (j) A conditional group with an assertion condition used more memory than was - allowed for it during parsing, so too many of them could therefore - overrun a buffer. - - (k) If parsing a pattern exactly filled the buffer, the internal test for - overrun did not check when the final META_END item was added. 
- - (l) If a lookbehind contained a subroutine call, and the called group - contained an option setting such as (?s), and the PCRE2_ANCHORED option - was set, unpredictable behaviour could occur. The underlying bug was - incorrect code and insufficient checking while searching for the end of - the called subroutine in the parsed pattern. - - (m) Quantifiers following (*VERB)s were not being diagnosed as errors. - - (n) The use of \Q...\E in a (*VERB) name when PCRE2_ALT_VERBNAMES and - PCRE2_AUTO_CALLOUT were both specified caused undetermined behaviour. - - (o) If \Q was preceded by a quantified item, and the following \E was - followed by '?' or '+', and there was at least one literal character - between them, an internal error "unexpected repeat" occurred (example: - /.+\QX\E+/). - - (p) A buffer overflow could occur while sorting the names in the group name - list (depending on the order in which the names were seen). - - (q) A conditional group that started with a callout was not doing the right - check for a following assertion, leading to compiling bad code. Example: - /(?(C'XX))?!XX/ - - (r) If a character whose code point was greater than 0xffff appeared within - a lookbehind that was within another lookbehind, the calculation of the - lookbehind length went wrong and could provoke an internal error. - - (t) The sequence \E- or \Q\E- after a POSIX class in a character class caused - an internal error. Now the hyphen is treated as a literal. - -4. Back references are now permitted in lookbehind assertions when there are -no duplicated group numbers (that is, (?| has not been used), and, if the -reference is by name, there is only one group of that name. The referenced -group must, of course be of fixed length. - -5. pcre2test has been upgraded so that, when run under valgrind with valgrind -support enabled, reading past the end of the pattern is detected, both when -compiling and during callout processing. - -6. \g{+} (e.g. \g{+2} ) is now supported. 
It is a "forward back -reference" and can be useful in repetitions (compare \g{-} ). Perl does -not recognize this syntax. - -7. Automatic callouts are no longer generated before and after callouts in the -pattern. - -8. When pcre2test was outputting information from a callout, the caret indicator -for the current position in the subject line was incorrect if it was after an -escape sequence for a character whose code point was greater than \x{ff}. - -9. Change 19 for 10.22 had a typo (PCRE_STATIC_RUNTIME should be -PCRE2_STATIC_RUNTIME). Fix from David Gaussmann. - -10. Added --max-buffer-size to pcre2grep, to allow for automatic buffer -expansion when long lines are encountered. Original patch by Dmitry -Cherniachenko. - -11. If pcre2grep was compiled with JIT support, but the library was compiled -without it (something that neither ./configure nor CMake allow, but it can be -done by editing config.h), pcre2grep was giving a JIT error. Now it detects -this situation and does not try to use JIT. - -12. Added some "const" qualifiers to variables in pcre2grep. - -13. Added Dmitry Cherniachenko's patch for colouring output in Windows -(untested by me). Also, look for GREP_COLOUR or GREP_COLOR if the environment -variables PCRE2GREP_COLOUR and PCRE2GREP_COLOR are not found. - -14. Add the -t (grand total) option to pcre2grep. - -15. A number of bugs have been mended relating to match start-up optimizations -when the first thing in a pattern is a positive lookahead. These all applied -only when PCRE2_NO_START_OPTIMIZE was *not* set: - - (a) A pattern such as (?=.*X)X$ was incorrectly optimized as if it needed - both an initial 'X' and a following 'X'. - (b) Some patterns starting with an assertion that started with .* were - incorrectly optimized as having to match at the start of the subject or - after a newline. There are cases where this is not true, for example, - (?=.*[A-Z])(?=.{8,16})(?!.*[\s]) matches after the start in lines that - start with spaces.
Starting .* in an assertion is no longer taken as an - indication of matching at the start (or after a newline). - -16. The "offset" modifier in pcre2test was not being ignored (as documented) -when the POSIX API was in use. - -17. Added --enable-fuzz-support to "configure", causing a non-installed -library containing a test function that can be called by fuzzers to be -compiled. A non-installed binary to run the test function locally, called -pcre2fuzzcheck, is also compiled. - -18. A pattern with PCRE2_DOTALL (/s) set but not PCRE2_NO_DOTSTAR_ANCHOR, and -which started with .* inside a positive lookahead was incorrectly being -compiled as implicitly anchored. - -19. Removed all instances of "register" declarations, as they are considered -obsolete these days and in any case had become very haphazard. - -20. Add strerror() to pcre2test for failed file opening. - -21. Make pcre2test -C list valgrind support when it is enabled. - -22. Add the use_length modifier to pcre2test. - -23. Fix an off-by-one bug in pcre2test for the list of names for 'get' and -'copy' modifiers. - -24. Add PCRE2_CALL_CONVENTION into the prototype declarations in pcre2.h as it -is apparently needed there as well as in the function definitions. (Why did -nobody ask for this in PCRE1?) - -25. Change the _PCRE2_H and _PCRE2_UCP_H guard macros in the header files to -PCRE2_H_IDEMPOTENT_GUARD and PCRE2_UCP_H_IDEMPOTENT_GUARD to be more standard -compliant and unique. - -26. pcre2-config --libs-posix was listing -lpcre2posix instead of --lpcre2-posix. Also, the CMake build process was building the library with the -wrong name. - -27. In pcre2test, give some offset information for errors in hex patterns. -This uses the C99 formatting sequence %td, except for MSVC which doesn't -support it - %lu is used instead. - -28. Implemented pcre2_code_copy_with_tables(), and added pushtablescopy to -pcre2test for testing it. - -29. Fix small memory leak in pcre2test. - -30.
Fix out-of-bounds read for partial matching of /./ against an empty string -when the newline type is CRLF. - -31. Fix a bug in pcre2test that caused a crash when a locale was set either in -the current pattern or a previous one and a wide character was matched. - -32. The appearance of \p, \P, or \X in a substitution string when -PCRE2_SUBSTITUTE_EXTENDED was set caused a segmentation fault (NULL -dereference). - -33. If the starting offset was specified as greater than the subject length in -a call to pcre2_substitute() an out-of-bounds memory reference could occur. - -34. When PCRE2 was compiled to use the heap instead of the stack for recursive -calls to match(), a repeated minimizing caseless back reference, or a -maximizing one where the two cases had different numbers of code units, -followed by a caseful back reference, could lose the caselessness of the first -repeated back reference (example: /(Z)(a)\2{1,2}?(?-i)\1X/i should match ZaAAZX -but didn't). - -35. When a pattern is too complicated, PCRE2 gives up trying to find a minimum -matching length and just records zero. Typically this happens when there are -too many nested or recursive back references. If the limit was reached in -certain recursive cases it failed to be triggered and an internal error could -be the result. - -36. The pcre2_dfa_match() function now takes note of the recursion limit for -the internal recursive calls that are used for lookrounds and recursions within -the pattern. - -37. More refactoring has got rid of the internal could_be_empty_branch() -function (around 400 lines of code, including comments) by keeping track of -could-be-emptiness as the pattern is compiled instead of scanning compiled -groups. (This would have been much harder before the refactoring of #3 above.) -This lifts a restriction on the number of branches in a group (more than about -1100 would give "pattern is too complicated"). - -38. 
Add the "-ac" command line option to pcre2test as a synonym for "-pattern -auto_callout". - -39. In a library with Unicode support, incorrect data was compiled for a -pattern with PCRE2_UCP set without PCRE2_UTF if a class required all wide -characters to match (for example, /[\s[:^ascii:]]/). - -40. The callout_error modifier has been added to pcre2test to make it possible -to return PCRE2_ERROR_CALLOUT from a callout. - -41. A minor change to pcre2grep: colour reset is now "[0m" instead of -"[00m". - -42. The limit in the auto-possessification code that was intended to catch -overly-complicated patterns and not spend too much time auto-possessifying was -being reset too often, resulting in very long compile times for some patterns. -Now such patterns are no longer completely auto-possessified. - -43. Applied Jason Hood's revised patch for RunTest.bat. - -44. Added a new Windows script RunGrepTest.bat, courtesy of Jason Hood. - -45. Minor cosmetic fix to pcre2test: move a variable that is not used under -Windows into the "not Windows" code. - -46. Applied Jason Hood's patches to upgrade pcre2grep under Windows and tidy -some of the code: - - * normalised the Windows condition by ensuring WIN32 is defined; - * enables the callout feature under Windows; - * adds globbing (Microsoft's implementation expands quoted args), - using a tweaked opendirectory; - * implements the is_*_tty functions for Windows; - * --color=always will write the ANSI sequences to file; - * add sequences 4 (underline works on Win10) and 5 (blink as bright - background, relatively standard on DOS/Win); - * remove the (char *) casts for the now-const strings; - * remove GREP_COLOUR (grep's command line allowed the 'u', but not - the environment), parsing GREP_COLORS instead; - * uses the current colour if not set, rather than black; - * add print_match for the undefined case; - * fixes a typo. 
- -In addition, colour settings containing anything other than digits and -semicolon are ignored, and the colour controls are no longer output for empty -strings. - -47. Detecting patterns that are too large inside the length-measuring loop -saves processing ridiculously long patterns to their end. - -48. Ignore PCRE2_CASELESS when processing \h, \H, \v, and \V in classes as it -just wastes time. In the UTF case it can also produce redundant entries in -XCLASS lists caused by characters with multiple other cases and pairs of -characters in the same "not-x" sublists. - -49. A pattern such as /(?=(a\K))/ can report the end of the match being before -its start; pcre2test was not handling this correctly when using the POSIX -interface (it was OK with the native interface). - -50. In pcre2grep, ignore all JIT compile errors. This means that pcre2grep will -continue to work, falling back to interpretation if anything goes wrong with -JIT. - -51. Applied patches from Christian Persch to configure.ac to make use of the -AC_USE_SYSTEM_EXTENSIONS macro and to test for functions used by the JIT -modules. - -52. Minor fixes to pcre2grep from Jason Hood: - * fixed some spacing; - * Windows doesn't usually use single quotes, so I've added a define - to use appropriate quotes [in an example]; - * LC_ALL was displayed as "LCC_ALL"; - * numbers 11, 12 & 13 should end in "th"; - * use double quotes in usage message. - -53. When autopossessifying, skip empty branches without recursion, to reduce -stack usage for the benefit of clang with -fsanitize-address, which uses huge -stack frames. Example pattern: /X?(R||){3335}/. Fixes oss-fuzz issue 553. - -54. A pattern with very many explicit back references to a group that is a long -way from the start of the pattern could take a long time to compile because -searching for the referenced group in order to find the minimum length was -being done repeatedly. 
Now up to 128 group minimum lengths are cached and the -attempt to find a minimum length is abandoned if there is a back reference to a -group whose number is greater than 128. (In that case, the pattern is so -complicated that this optimization probably isn't worth it.) This fixes -oss-fuzz issue 557. - -55. Issue 32 for 10.22 below was not correctly fixed. If pcre2grep in multiline -mode with --only-matching matched several lines, it restarted scanning at the -next line instead of moving on to the end of the matched string, which can be -several lines after the start. - -56. Applied Jason Hood's new patch for RunGrepTest.bat that updates it in line -with updates to the non-Windows version. - - - -Version 10.22 29-July-2016 --------------------------- - -1. Applied Jason Hood's patches to RunTest.bat and testdata/wintestoutput3 -to fix problems with running the tests under Windows. - -2. Implemented a facility for quoting literal characters within hexadecimal -patterns in pcre2test, to make it easier to create patterns with just a few -non-printing characters. - -3. Binary zeros are not supported in pcre2test input files. It now detects them -and gives an error. - -4. Updated the valgrind parameters in RunTest: (a) changed smc-check=all to -smc-check=all-non-file; (b) changed obj:* in the suppression file to obj:??? so -that it matches only unknown objects. - -5. Updated the maintenance script maint/ManyConfigTests to make it easier to -select individual groups of tests. - -6. When the POSIX wrapper function regcomp() is called, the REG_NOSUB option -used to set PCRE2_NO_AUTO_CAPTURE when calling pcre2_compile(). However, this -disables the use of back references (and subroutine calls), which are supported -by other implementations of regcomp() with RE_NOSUB. Therefore, REG_NOSUB no -longer causes PCRE2_NO_AUTO_CAPTURE to be set, though it still ignores nmatch -and pmatch when regexec() is called. - -7. 
Because of 6 above, pcre2test has been modified with a new modifier called -posix_nosub, to call regcomp() with REG_NOSUB. Previously the no_auto_capture -modifier had this effect. That option is now ignored when the POSIX API is in -use. - -8. Minor tidies to the pcre2demo.c sample program, including more comments -about its 8-bit-ness. - -9. Detect unmatched closing parentheses and give the error in the pre-scan -instead of later. Previously the pre-scan carried on and could give a -misleading incorrect error message. For example, /(?J)(?'a'))(?'a')/ gave a -message about invalid duplicate group names. - -10. It has happened that pcre2test was accidentally linked with another POSIX -regex library instead of libpcre2-posix. In this situation, a call to regcomp() -(in the other library) may succeed, returning zero, but of course putting its -own data into the regex_t block. In one example the re_pcre2_code field was -left as NULL, which made pcre2test think it had not got a compiled POSIX regex, -so it treated the next line as another pattern line, resulting in a confusing -error message. A check has been added to pcre2test to see if the data returned -from a successful call of regcomp() are valid for PCRE2's regcomp(). If they -are not, an error message is output and the pcre2test run is abandoned. The -message points out the possibility of a mis-linking. Hopefully this will avoid -some head-scratching the next time this happens. - -11. A pattern such as /(?<=((?C)0))/, which has a callout inside a lookbehind -assertion, caused pcre2test to output a very large number of spaces when the -callout was taken, making the program appear to loop. - -12. A pattern that included (*ACCEPT) in the middle of a sufficiently deeply -nested set of parentheses of sufficient size caused an overflow of the -compiling workspace (which was diagnosed, but of course is not desirable). - -13. Detect missing closing parentheses during the pre-pass for group -identification. - -14.
Changed some integer variable types and put in a number of casts, following -a report of compiler warnings from Visual Studio 2013 and a few tests with -gcc's -Wconversion (which still throws up a lot). - -15. Implemented pcre2_code_copy(), and added pushcopy and #popcopy to pcre2test -for testing it. - -16. Change 66 for 10.21 introduced the use of snprintf() in PCRE2's version of -regerror(). When the error buffer is too small, my version of snprintf() puts a -binary zero in the final byte. Bug #1801 seems to show that other versions do -not do this, leading to bad output from pcre2test when it was checking for -buffer overflow. It no longer assumes a binary zero at the end of a too-small -regerror() buffer. - -17. Fixed typo ("&&" for "&") in pcre2_study(). Fortunately, this could not -actually affect anything, by sheer luck. - -18. Two minor fixes for MSVC compilation: (a) removal of apparently incorrect -"const" qualifiers in pcre2test and (b) defining snprintf as _snprintf for -older MSVC compilers. This has been done both in src/pcre2_internal.h for most -of the library, and also in src/pcre2posix.c, which no longer includes -pcre2_internal.h (see 24 below). - -19. Applied Chris Wilson's patch (Bugzilla #1681) to CMakeLists.txt for MSVC -static compilation. Subsequently applied Chris Wilson's second patch, putting -the first patch under a new option instead of being unconditional when -PCRE_STATIC is set. - -20. Updated pcre2grep to set stdout as binary when run under Windows, so as not -to convert \r\n at the ends of reflected lines into \r\r\n. This required -ensuring that other output that is written to stdout (e.g. file names) uses the -appropriate line terminator: \r\n for Windows, \n otherwise. - -21. When a line is too long for pcre2grep's internal buffer, show the maximum -length in the error message. - -22. Added support for string callouts to pcre2grep (Zoltan's patch with PH -additions). - -23. 
RunTest.bat was missing a "set type" line for test 22. - -24. The pcre2posix.c file was including pcre2_internal.h, and using some -"private" knowledge of the data structures. This is unnecessary; the code has -been re-factored and no longer includes pcre2_internal.h. - -25. A racing condition is fixed in JIT reported by Mozilla. - -26. Minor code refactor to avoid "array subscript is below array bounds" -compiler warning. - -27. Minor code refactor to avoid "left shift of negative number" warning. - -28. Add a bit more sanity checking to pcre2_serialize_decode() and document -that it expects trusted data. - -29. Fix typo in pcre2_jit_test.c - -30. Due to an oversight, pcre2grep was not making use of JIT when available. -This is now fixed. - -31. The RunGrepTest script is updated to use the valgrind suppressions file -when testing with JIT under valgrind (compare 10.21/51 below). The suppressions -file is updated so that it is now the same as for PCRE1: it suppresses the -Memcheck warnings Addr16 and Cond in unknown objects (that is, JIT-compiled -code). Also changed smc-check=all to smc-check=all-non-file as was done for -RunTest (see 4 above). - -32. Implemented the PCRE2_NO_JIT option for pcre2_match(). - -33. Fix typo that gave a compiler error when JIT not supported. - -34. Fix comment describing the returns from find_fixedlength(). - -35. Fix potential negative index in pcre2test. - -36. Calls to pcre2_get_error_message() with error numbers that are never -returned by PCRE2 functions were returning empty strings. Now the error code -PCRE2_ERROR_BADDATA is returned. A facility has been added to pcre2test to -show the texts for given error numbers (i.e. to call pcre2_get_error_message() -and display what it returns) and a few representative error codes are now -checked in RunTest. - -37.
Added "&& !defined(__INTEL_COMPILER)" to the test for __GNUC__ in -pcre2_match.c, in anticipation that this is needed for the same reason it was -recently added to pcrecpp.cc in PCRE1. - -38. Using -o with -M in pcre2grep could cause unnecessary repeated output when -the match extended over a line boundary, as it tried to find more matches "on -the same line" - but it was already over the end. - -39. Allow \C in lookbehinds and DFA matching in UTF-32 mode (by converting it -to the same code as '.' when PCRE2_DOTALL is set). - -40. Fix two clang compiler warnings in pcre2test when only one code unit width -is supported. - -41. Upgrade RunTest to automatically re-run test 2 with a large (64MiB) stack -if it fails when running the interpreter with a 16MiB stack (and if changing -the stack size via pcre2test is possible). This avoids having to manually set a -large stack size when testing with clang. - -42. Fix register overwrite in JIT when SSE2 acceleration is enabled. - -43. Detect integer overflow in pcre2test pattern and data repetition counts. - -44. In pcre2test, ignore "allcaptures" after DFA matching. - -45. Fix unaligned accesses on x86. Patch by Marc Mutz. - -46. Fix some more clang compiler warnings. - - -Version 10.21 12-January-2016 ------------------------------ - -1. Improve matching speed of patterns starting with + or * in JIT. - -2. Use memchr() to find the first character in an unanchored match in 8-bit -mode in the interpreter. This gives a significant speed improvement. - -3. Removed a redundant copy of the opcode_possessify table in the -pcre2_auto_possessify.c source. - -4. Fix typos in dftables.c for z/OS. - -5. Change 36 for 10.20 broke the handling of [[:>:]] and [[:<:]] in that -processing them could involve a buffer overflow if the following character was -an opening parenthesis. - -6. Change 36 for 10.20 also introduced a bug in processing this pattern: -/((?x)(*:0))#(?'/.
Specifically: if a setting of (?x) was followed by a (*MARK) -setting (which (*:0) is), then (?x) did not get unset at the end of its group -during the scan for named groups, and hence the external # was incorrectly -treated as a comment and the invalid (?' at the end of the pattern was not -diagnosed. This caused a buffer overflow during the real compile. This bug was -discovered by Karl Skomski with the LLVM fuzzer. - -7. Moved the pcre2_find_bracket() function from src/pcre2_compile.c into its -own source module to avoid a circular dependency between src/pcre2_compile.c -and src/pcre2_study.c - -8. A callout with a string argument containing an opening square bracket, for -example /(?C$[$)(?<]/, was incorrectly processed and could provoke a buffer -overflow. This bug was discovered by Karl Skomski with the LLVM fuzzer. - -9. The handling of callouts during the pre-pass for named group identification -has been tightened up. - -10. The quantifier {1} can be ignored, whether greedy, non-greedy, or -possessive. This is a very minor optimization. - -11. A possessively repeated conditional group that could match an empty string, -for example, /(?(R))*+/, was incorrectly compiled. - -12. The Unicode tables have been updated to Unicode 8.0.0 (thanks to Christian -Persch). - -13. An empty comment (?#) in a pattern was incorrectly processed and could -provoke a buffer overflow. This bug was discovered by Karl Skomski with the -LLVM fuzzer. - -14. Fix infinite recursion in the JIT compiler when certain patterns such as -/(?:|a|){100}x/ are analysed. - -15. Some patterns with character classes involving [: and \\ were incorrectly -compiled and could cause reading from uninitialized memory or an incorrect -error diagnosis. Examples are: /[[:\\](?<[::]/ and /[[:\\](?'abc')[a:]. The -first of these bugs was discovered by Karl Skomski with the LLVM fuzzer. - -16. Pathological patterns containing many nested occurrences of [: caused -pcre2_compile() to run for a very long time. 
This bug was found by the LLVM -fuzzer. - -17. A missing closing parenthesis for a callout with a string argument was not -being diagnosed, possibly leading to a buffer overflow. This bug was found by -the LLVM fuzzer. - -18. A conditional group with only one branch has an implicit empty alternative -branch and must therefore be treated as potentially matching an empty string. - -19. If (?R was followed by - or + incorrect behaviour happened instead of a -diagnostic. This bug was discovered by Karl Skomski with the LLVM fuzzer. - -20. Another bug that was introduced by change 36 for 10.20: conditional groups -whose condition was an assertion preceded by an explicit callout with a string -argument might be incorrectly processed, especially if the string contained \Q. -This bug was discovered by Karl Skomski with the LLVM fuzzer. - -21. Compiling PCRE2 with the sanitize options of clang showed up a number of -very pedantic coding infelicities and a buffer overflow while checking a UTF-8 -string if the final multi-byte UTF-8 character was truncated. - -22. For Perl compatibility in EBCDIC environments, ranges such as a-z in a -class, where both values are literal letters in the same case, omit the -non-letter EBCDIC code points within the range. - -23. Finding the minimum matching length of complex patterns with back -references and/or recursions can take a long time. There is now a cut-off that -gives up trying to find a minimum length when things get too complex. - -24. An optimization has been added that speeds up finding the minimum matching -length for patterns containing repeated capturing groups or recursions. - -25. If a pattern contained a back reference to a group whose number was -duplicated as a result of appearing in a (?|...) group, the computation of the -minimum matching length gave a wrong result, which could cause incorrect "no -match" errors. For such patterns, a minimum matching length cannot at present -be computed. - -26. 
Added a check for integer overflow in conditions (?() and -(?(R). This omission was discovered by Karl Skomski with the LLVM -fuzzer. - -27. Fixed an issue when \p{Any} inside an xclass did not read the current -character. - -28. If pcre2grep was given the -q option with -c or -l, or when handling a -binary file, it incorrectly wrote output to stdout. - -29. The JIT compiler did not restore the control verb head in case of *THEN -control verbs. This issue was found by Karl Skomski with a custom LLVM fuzzer. - -30. The way recursive references such as (?3) are compiled has been re-written -because the old way was the cause of many issues. Now, conversion of the group -number into a pattern offset does not happen until the pattern has been -completely compiled. This does mean that detection of all infinitely looping -recursions is postponed till match time. In the past, some easy ones were -detected at compile time. This re-writing was done in response to yet another -bug found by the LLVM fuzzer. - -31. A test for a back reference to a non-existent group was missing for items -such as \987. This caused incorrect code to be compiled. This issue was found -by Karl Skomski with a custom LLVM fuzzer. - -32. Error messages for syntax errors following \g and \k were giving inaccurate -offsets in the pattern. - -33. Improve the performance of starting single character repetitions in JIT. - -34. (*LIMIT_MATCH=) now gives an error instead of setting the value to 0. - -35. Error messages for syntax errors in *LIMIT_MATCH and *LIMIT_RECURSION now -give the right offset instead of zero. - -36. The JIT compiler should not check repeats after a {0,1} repeat byte code. -This issue was found by Karl Skomski with a custom LLVM fuzzer. - -37. The JIT compiler should restore the control chain for empty possessive -repeats. This issue was found by Karl Skomski with a custom LLVM fuzzer. - -38. A bug which was introduced by the single character repetition optimization -was fixed. - -39. 
Match limit check added to recursion. This issue was found by Karl Skomski -with a custom LLVM fuzzer. - -40. Arrange for the UTF check in pcre2_match() and pcre2_dfa_match() to look -only at the part of the subject that is relevant when the starting offset is -non-zero. - -41. Improve first character match in JIT with SSE2 on x86. - -42. Fix two assertion fails in JIT. These issues were found by Karl Skomski -with a custom LLVM fuzzer. - -43. Correct the setting of CMAKE_C_FLAGS in CMakeLists.txt (patch from Roy Ivy -III). - -44. Fix bug in RunTest.bat for new test 14, and adjust the script for the added -test (there are now 20 in total). - -45. Fixed a corner case of range optimization in JIT. - -46. Add the ${*MARK} facility to pcre2_substitute(). - -47. Modifier lists in pcre2test were splitting at spaces without the required -commas. - -48. Implemented PCRE2_ALT_VERBNAMES. - -49. Fixed two issues in JIT. These were found by Karl Skomski with a custom -LLVM fuzzer. - -50. The pcre2test program has been extended by adding the #newline_default -command. This has made it possible to run the standard tests when PCRE2 is -compiled with either CR or CRLF as the default newline convention. As part of -this work, the new command was added to several test files and the testing -scripts were modified. The pcre2grep tests can now also be run when there is no -LF in the default newline convention. - -51. The RunTest script has been modified so that, when JIT is used and valgrind -is specified, a valgrind suppressions file is set up to ignore "Invalid read of -size 16" errors because these are false positives when the hardware supports -the SSE2 instruction set. - -52. It is now possible to have comment lines amid the subject strings in -pcre2test (and perltest.sh) input. - -53. Implemented PCRE2_USE_OFFSET_LIMIT and pcre2_set_offset_limit(). - -54. 
Add the null_context modifier to pcre2test so that calling pcre2_compile() -and the matching functions with NULL contexts can be tested. - -55. Implemented PCRE2_SUBSTITUTE_EXTENDED. - -56. In a character class such as [\W\p{Any}] where both a negative-type escape -("not a word character") and a property escape were present, the property -escape was being ignored. - -57. Fixed integer overflow for patterns whose minimum matching length is very, -very large. - -58. Implemented --never-backslash-C. - -59. Change 55 above introduced a bug by which certain patterns provoked the -erroneous error "\ at end of pattern". - -60. The special sequences [[:<:]] and [[:>:]] gave rise to incorrect compiling -errors or other strange effects if compiled in UCP mode. Found with libFuzzer -and AddressSanitizer. - -61. Whitespace at the end of a pcre2test pattern line caused a spurious error -message if there were only single-character modifiers. It should be ignored. - -62. The use of PCRE2_NO_AUTO_CAPTURE could cause incorrect compilation results -or segmentation errors for some patterns. Found with libFuzzer and -AddressSanitizer. - -63. Very long names in (*MARK) or (*THEN) etc. items could provoke a buffer -overflow. - -64. Improve error message for overly-complicated patterns. - -65. Implemented an optional replication feature for patterns in pcre2test, to -make it easier to test long repetitive patterns. The tests for 63 above are -converted to use the new feature. - -66. In the POSIX wrapper, if regerror() was given too small a buffer, it could -misbehave. - -67. In pcre2_substitute() in UTF mode, the UTF validity check on the -replacement string was happening before the length setting when the replacement -string was zero-terminated. - -68. In pcre2_substitute() in UTF mode, PCRE2_NO_UTF_CHECK can be set for the -second and subsequent calls to pcre2_match(). - -69. There was no check for integer overflow for a replacement group number in -pcre2_substitute(). 
An added check for a number greater than the largest group -number in the pattern means this is not now needed. - -70. The PCRE2-specific VERSION condition didn't work correctly if only one -digit was given after the decimal point, or if more than two digits were given. -It now works with one or two digits, and gives a compile time error if more are -given. - -71. In pcre2_substitute() there was the possibility of reading one code unit -beyond the end of the replacement string. - -72. The code for checking a subject's UTF-32 validity for a pattern with a -lookbehind involved an out-of-bounds pointer, which could potentially cause -trouble in some environments. - -73. The maximum lookbehind length was incorrectly calculated for patterns such -as /(?<=(a)(?-1))x/ which have a recursion within a backreference. - -74. Give an error if a lookbehind assertion is longer than 65535 code units. - -75. Give an error in pcre2_substitute() if a match ends before it starts (as a -result of the use of \K). - -76. Check the length of subpattern names and the names in (*MARK:xx) etc. -dynamically to avoid the possibility of integer overflow. - -77. Implement pcre2_set_max_pattern_length() so that programs can restrict the -size of patterns that they are prepared to handle. - -78. (*NO_AUTO_POSSESS) was not working. - -79. Adding group information caching improves the speed of compiling when -checking whether a group has a fixed length and/or could match an empty string, -especially when recursion or subroutine calls are involved. However, this -cannot be used when (?| is present in the pattern because the same number may -be used for groups of different sizes. To catch runaway patterns in this -situation, counts have been introduced to the functions that scan for empty -branches or compute fixed lengths. - -80. Allow for the possibility of the size of the nest_save structure not being -a factor of the size of the compiling workspace (it currently is). - -81. 
Check for integer overflow in minimum length calculation and cap it at -65535. - -82. Small optimizations in code for finding the minimum matching length. - -83. Lock out configuring for EBCDIC with non-8-bit libraries. - -84. Test for error code <= 0 in regerror(). - -85. Check for too many replacements (more than INT_MAX) in pcre2_substitute(). - -86. Avoid the possibility of computing with an out-of-bounds pointer (though -not dereferencing it) while handling lookbehind assertions. - -87. Failure to get memory for the match data in regcomp() is now given as a -regcomp() error instead of waiting for regexec() to pick it up. - -88. In pcre2_substitute(), ensure that CRLF is not split when it is a valid -newline sequence. - -89. Paranoid check in regcomp() for bad error code from pcre2_compile(). - -90. Run test 8 (internal offsets and code sizes) for link sizes 3 and 4 as well -as for link size 2. - -91. Document that JIT has a limit on pattern size, and give more information -about JIT compile failures in pcre2test. - -92. Implement PCRE2_INFO_HASBACKSLASHC. - -93. Re-arrange valgrind support code in pcre2test to avoid spurious reports -with JIT (possibly caused by SSE2?). - -94. Support offset_limit in JIT. - -95. A sequence such as [[:punct:]b] that is, a POSIX character class followed -by a single ASCII character in a class item, was incorrectly compiled in UCP -mode. The POSIX class got lost, but only if the single character followed it. - -96. [:punct:] in UCP mode was matching some characters in the range 128-255 -that should not have been matched. - -97. If [:^ascii:] or [:^xdigit:] are present in a non-negated class, all -characters with code points greater than 255 are in the class. When a Unicode -property was also in the class (if PCRE2_UCP is set, escapes such as \w are -turned into Unicode properties), wide characters were not correctly handled, -and could fail to match. - -98. 
In pcre2test, make the "startoffset" modifier a synonym of "offset", -because it sets the "startoffset" parameter for pcre2_match(). - -99. If PCRE2_AUTO_CALLOUT was set on a pattern that had a (?# comment between -an item and its qualifier (for example, A(?#comment)?B) pcre2_compile() -misbehaved. This bug was found by the LLVM fuzzer. - -100. The error for an invalid UTF pattern string always gave the code unit -offset as zero instead of where the invalidity was found. - -101. Further to 97 above, negated classes such as [^[:^ascii:]\d] were also not -working correctly in UCP mode. - -102. Similar to 99 above, if an isolated \E was present between an item and its -qualifier when PCRE2_AUTO_CALLOUT was set, pcre2_compile() misbehaved. This bug -was found by the LLVM fuzzer. - -103. The POSIX wrapper function regexec() crashed if the option REG_STARTEND -was set when the pmatch argument was NULL. It now returns REG_INVARG. - -104. Allow for up to 32-bit numbers in the ordin() function in pcre2grep. - -105. An empty \Q\E sequence between an item and its qualifier caused -pcre2_compile() to misbehave when auto callouts were enabled. This bug -was found by the LLVM fuzzer. - -106. If both PCRE2_ALT_VERBNAMES and PCRE2_EXTENDED were set, and a (*MARK) or -other verb "name" ended with whitespace immediately before the closing -parenthesis, pcre2_compile() misbehaved. Example: /(*:abc )/, but only when -both those options were set. - -107. In a number of places pcre2_compile() was not handling NULL characters -correctly, and pcre2test with the "bincode" modifier was not always correctly -displaying fields containing NULLS: - - (a) Within /x extended #-comments - (b) Within the "name" part of (*MARK) and other *verbs - (c) Within the text argument of a callout - -108. 
If a pattern that was compiled with PCRE2_EXTENDED started with white -space or a #-type comment that was followed by (?-x), which turns off -PCRE2_EXTENDED, and there was no subsequent (?x) to turn it on again, -pcre2_compile() assumed that (?-x) applied to the whole pattern and -consequently mis-compiled it. This bug was found by the LLVM fuzzer. The fix -for this bug means that a setting of any of the (?imsxJU) options at the start -of a pattern is no longer transferred to the options that are returned by -PCRE2_INFO_ALLOPTIONS. In fact, this was an anachronism that should have -changed when the effects of those options were all moved to compile time. - -109. An escaped closing parenthesis in the "name" part of a (*verb) when -PCRE2_ALT_VERBNAMES was set caused pcre2_compile() to malfunction. This bug -was found by the LLVM fuzzer. - -110. Implemented PCRE2_SUBSTITUTE_UNSET_EMPTY, and updated pcre2test to make it -possible to test it. - -111. "Harden" pcre2test against ridiculously large values in modifiers and -command line arguments. - -112. Implemented PCRE2_SUBSTITUTE_UNKNOWN_UNSET and PCRE2_SUBSTITUTE_OVERFLOW_ -LENGTH. - -113. Fix printing of *MARK names that contain binary zeroes in pcre2test. - - -Version 10.20 30-June-2015 --------------------------- - -1. Callouts with string arguments have been added. - -2. Assertion code generator in JIT has been optimized. - -3. The invalid pattern (?(?C) has a missing assertion condition at the end. The -pcre2_compile() function read past the end of the input before diagnosing an -error. This bug was discovered by the LLVM fuzzer. - -4. Implemented pcre2_callout_enumerate(). - -5. Fix JIT compilation of conditional blocks whose assertion is converted to -(*FAIL). E.g: /(?(?!))/. - -6. The pattern /(?(?!)^)/ caused references to random memory. This bug was -discovered by the LLVM fuzzer. - -7. The assertion (?!) is optimized to (*FAIL). 
This was not handled correctly -when this assertion was used as a condition, for example (?(?!)a|b). In -pcre2_match() it worked by luck; in pcre2_dfa_match() it gave an incorrect -error about an unsupported item. - -8. For some types of pattern, for example /Z*(|d*){216}/, the auto- -possessification code could take exponential time to complete. A recursion -depth limit of 1000 has been imposed to limit the resources used by this -optimization. This infelicity was discovered by the LLVM fuzzer. - -9. A pattern such as /(*UTF)[\S\V\H]/, which contains a negated special class -such as \S in non-UCP mode, explicit wide characters (> 255) can be ignored -because \S ensures they are all in the class. The code for doing this was -interacting badly with the code for computing the amount of space needed to -compile the pattern, leading to a buffer overflow. This bug was discovered by -the LLVM fuzzer. - -10. A pattern such as /((?2)+)((?1))/ which has mutual recursion nested inside -other kinds of group caused stack overflow at compile time. This bug was -discovered by the LLVM fuzzer. - -11. A pattern such as /(?1)(?#?'){8}(a)/ which had a parenthesized comment -between a subroutine call and its quantifier was incorrectly compiled, leading -to buffer overflow or other errors. This bug was discovered by the LLVM fuzzer. - -12. The illegal pattern /(?(?.*!.*)?)/ was not being diagnosed as missing an -assertion after (?(. The code was failing to check the character after (?(?< -for the ! or = that would indicate a lookbehind assertion. This bug was -discovered by the LLVM fuzzer. - -13. A pattern such as /X((?2)()*+){2}+/ which has a possessive quantifier with -a fixed maximum following a group that contains a subroutine reference was -incorrectly compiled and could trigger buffer overflow. This bug was discovered -by the LLVM fuzzer. - -14. 
Negative relative recursive references such as (?-7) to non-existent -subpatterns were not being diagnosed and could lead to unpredictable behaviour. -This bug was discovered by the LLVM fuzzer. - -15. The bug fixed in 14 was due to an integer variable that was unsigned when -it should have been signed. Some other "int" variables, having been checked, -have either been changed to uint32_t or commented as "must be signed". - -16. A mutual recursion within a lookbehind assertion such as (?<=((?2))((?1))) -caused a stack overflow instead of the diagnosis of a non-fixed length -lookbehind assertion. This bug was discovered by the LLVM fuzzer. - -17. The use of \K in a positive lookbehind assertion in a non-anchored pattern -(e.g. /(?<=\Ka)/) could make pcre2grep loop. - -18. There was a similar problem to 17 in pcre2test for global matches, though -the code there did catch the loop. - -19. If a greedy quantified \X was preceded by \C in UTF mode (e.g. \C\X*), -and a subsequent item in the pattern caused a non-match, backtracking over the -repeated \X did not stop, but carried on past the start of the subject, causing -reference to random memory and/or a segfault. There were also some other cases -where backtracking after \C could crash. This set of bugs was discovered by the -LLVM fuzzer. - -20. The function for finding the minimum length of a matching string could take -a very long time if mutual recursion was present many times in a pattern, for -example, /((?2){73}(?2))((?1))/. A better mutual recursion detection method has -been implemented. This infelicity was discovered by the LLVM fuzzer. - -21. Implemented PCRE2_NEVER_BACKSLASH_C. - -22. The feature for string replication in pcre2test could read from freed -memory if the replication required a buffer to be extended, and it was not -working properly in 16-bit and 32-bit modes. This issue was discovered by a -fuzzer: see http://lcamtuf.coredump.cx/afl/. - -23. Added the PCRE2_ALT_CIRCUMFLEX option. - -24. 
Adjust the treatment of \8 and \9 to be the same as the current Perl -behaviour. - -25. Static linking against the PCRE2 library using the pkg-config module was -failing on missing pthread symbols. - -26. If a group that contained a recursive back reference also contained a -forward reference subroutine call followed by a non-forward-reference -subroutine call, for example /.((?2)(?R)\1)()/, pcre2_compile() failed to -compile correct code, leading to undefined behaviour or an internally detected -error. This bug was discovered by the LLVM fuzzer. - -27. Quantification of certain items (e.g. atomic back references) could cause -incorrect code to be compiled when recursive forward references were involved. -For example, in this pattern: /(?1)()((((((\1++))\x85)+)|))/. This bug was -discovered by the LLVM fuzzer. - -28. A repeated conditional group whose condition was a reference by name caused -a buffer overflow if there was more than one group with the given name. This -bug was discovered by the LLVM fuzzer. - -29. A recursive back reference by name within a group that had the same name as -another group caused a buffer overflow. For example: /(?J)(?'d'(?'d'\g{d}))/. -This bug was discovered by the LLVM fuzzer. - -30. A forward reference by name to a group whose number is the same as the -current group, for example in this pattern: /(?|(\k'Pm')|(?'Pm'))/, caused a -buffer overflow at compile time. This bug was discovered by the LLVM fuzzer. - -31. Fix -fsanitize=undefined warnings for left shifts of 1 by 31 (it treats 1 -as an int; fixed by writing it as 1u). - -32. Fix pcre2grep compile when -std=c99 is used with gcc, though it still gives -a warning for "fileno" unless -std=gnu99 is used. - -33. A lookbehind assertion within a set of mutually recursive subpatterns could -provoke a buffer overflow. This bug was discovered by the LLVM fuzzer. - -34. Give an error for an empty subpattern name such as (?''). - -35. 
Make pcre2test give an error if a pattern that follows #forbid_utf contains -\P, \p, or \X. - -36. The way named subpatterns are handled has been refactored. There is now a -pre-pass over the regex which does nothing other than identify named -subpatterns and count the total captures. This means that information about -named patterns is known before the rest of the compile. In particular, it means -that forward references can be checked as they are encountered. Previously, the -code for handling forward references was contorted and led to several errors in -computing the memory requirements for some patterns, leading to buffer -overflows. - -37. There was no check for integer overflow in subroutine calls such as (?123). - -38. The table entry for \l in EBCDIC environments was incorrect, leading to its -being treated as a literal 'l' instead of causing an error. - -39. If a non-capturing group containing a conditional group that could match -an empty string was repeated, it was not identified as matching an empty string -itself. For example: /^(?:(?(1)x|)+)+$()/. - -40. In an EBCDIC environment, pcretest was mishandling the escape sequences -\a and \e in test subject lines. - -41. In an EBCDIC environment, \a in a pattern was converted to the ASCII -instead of the EBCDIC value. - -42. The handling of \c in an EBCDIC environment has been revised so that it is -now compatible with the specification in Perl's perlebcdic page. - -43. Single character repetition in JIT has been improved. 20-30% speedup -was achieved on certain patterns. - -44. The EBCDIC character 0x41 is a non-breaking space, equivalent to 0xa0 in -ASCII/Unicode. This has now been added to the list of characters that are -recognized as white space in EBCDIC. - -45. When PCRE2 was compiled without Unicode support, the use of \p and \P gave -an error (correctly) when used outside a class, but did not give an error -within a class. - -46. \h within a class was incorrectly compiled in EBCDIC environments. 
- -47. JIT should return with error when the compiled pattern requires -more stack space than the maximum. - -48. Fixed a memory leak in pcre2grep when a locale is set. - - -Version 10.10 06-March-2015 ---------------------------- - -1. When a pattern is compiled, it remembers the highest back reference so that -when matching, if the ovector is too small, extra memory can be obtained to -use instead. A conditional subpattern whose condition is a check on a capture -having happened, such as, for example in the pattern /^(?:(a)|b)(?(1)A|B)/, is -another kind of back reference, but it was not setting the highest -backreference number. This mattered only if pcre2_match() was called with an -ovector that was too small to hold the capture, and there was no other kind of -back reference (a situation which is probably quite rare). The effect of the -bug was that the condition was always treated as FALSE when the capture could -not be consulted, leading to incorrect behaviour by pcre2_match(). This bug -has been fixed. - -2. Functions for serialization and deserialization of sets of compiled patterns -have been added. - -3. The value that is returned by PCRE2_INFO_SIZE has been corrected to remove -excess code units at the end of the data block that may occasionally occur if -the code for calculating the size over-estimates. This change stops the -serialization code copying uninitialized data, to which valgrind objects. The -documentation of PCRE2_INFO_SIZE was incorrect in stating that the size did not -include the general overhead. This has been corrected. - -4. All code units in every slot in the table of group names are now set, again -in order to avoid accessing uninitialized data when serializing. - -5. The (*NO_JIT) feature is implemented. - -6. If a bug that caused pcre2_compile() to use more memory than allocated was -triggered when using valgrind, the code in (3) above passed a stupidly large -value to valgrind. 
This caused a crash instead of an "internal error" return. - -7. A reference to a duplicated named group (either a back reference or a test -for being set in a conditional) that occurred in a part of the pattern where -PCRE2_DUPNAMES was not set caused the amount of memory needed for the pattern -to be incorrectly calculated, leading to overwriting. - -8. A mutually recursive set of back references such as (\2)(\1) caused a -segfault at compile time (while trying to find the minimum matching length). -The infinite loop is now broken (with the minimum length unset, that is, zero). - -9. If an assertion that was used as a condition was quantified with a minimum -of zero, matching went wrong. In particular, if the whole group had unlimited -repetition and could match an empty string, a segfault was likely. The pattern -(?(?=0)?)+ is an example that caused this. Perl allows assertions to be -quantified, but not if they are being used as conditions, so the above pattern -is faulted by Perl. PCRE2 has now been changed so that it also rejects such -patterns. - -10. The error message for an invalid quantifier has been changed from "nothing -to repeat" to "quantifier does not follow a repeatable item". - -11. If a bad UTF string is compiled with NO_UTF_CHECK, it may succeed, but -scanning the compiled pattern in subsequent auto-possessification can get out -of step and lead to an unknown opcode. Previously this could have caused an -infinite loop. Now it generates an "internal error" error. This is a tidyup, -not a bug fix; passing bad UTF with NO_UTF_CHECK is documented as having an -undefined outcome. - -12. A UTF pattern containing a "not" match of a non-ASCII character and a -subroutine reference could loop at compile time. Example: /[^\xff]((?1))/. - -13. The locale test (RunTest 3) has been upgraded. It now checks that a locale -that is found in the output of "locale -a" can actually be set by pcre2test -before it is accepted. 
Previously, in an environment where a locale was listed -but would not set (an example does exist), the test would "pass" without -actually doing anything. Also the fr_CA locale has been added to the list of -locales that can be used. - -14. Fixed a bug in pcre2_substitute(). If a replacement string ended in a -capturing group number without parentheses, the last character was incorrectly -literally included at the end of the replacement string. - -15. A possessive capturing group such as (a)*+ with a minimum repeat of zero -failed to allow the zero-repeat case if pcre2_match() was called with an -ovector too small to capture the group. - -16. Improved error message in pcre2test when setting the stack size (-S) fails. - -17. Fixed two bugs in CMakeLists.txt: (1) Some lines had got lost in the -transfer from PCRE1, meaning that CMake configuration failed if "build tests" -was selected. (2) The file src/pcre2_serialize.c had not been added to the list -of PCRE2 sources, which caused a failure to build pcre2test. - -18. Fixed typo in pcre2_serialize.c (DECL instead of DEFN) that causes problems -only on Windows. - -19. Use binary input when reading back saved serialized patterns in pcre2test. - -20. Added RunTest.bat for running the tests under Windows. - -21. "make distclean" was not removing config.h, a file that may be created for -use with CMake. - -22. A pattern such as "((?2){0,1999}())?", which has a group containing a -forward reference repeated a large (but limited) number of times within a -repeated outer group that has a zero minimum quantifier, caused incorrect code -to be compiled, leading to the error "internal error: previously-checked -referenced subpattern not found" when an incorrect memory address was read. -This bug was reported as "heap overflow", discovered by Kai Lu of Fortinet's -FortiGuard Labs. (Added 24-March-2015: CVE-2015-2325 was given to this.) - -23. 
A pattern such as "((?+1)(\1))/" containing a forward reference subroutine -call within a group that also contained a recursive back reference caused -incorrect code to be compiled. This bug was reported as "heap overflow", -discovered by Kai Lu of Fortinet's FortiGuard Labs. (Added 24-March-2015: -CVE-2015-2326 was given to this.) - -24. Computing the size of the JIT read-only data in advance has been a source -of various issues, and new ones still appear, unfortunately. To fix -existing and future issues, size computation is eliminated from the code, -and replaced by on-demand memory allocation. - -25. A pattern such as /(?i)[A-`]/, where characters in the other case are -adjacent to the end of the range, and the range contained characters with more -than one other case, caused incorrect behaviour when compiled in UTF mode. In -that example, the range a-j was left out of the class. - - -Version 10.00 05-January-2015 ------------------------------ - -Version 10.00 is the first release of PCRE2, a revised API for the PCRE -library. Changes prior to 10.00 are logged in the ChangeLog file for the old -API, up to item 20 for release 8.36. - -The code of the library was heavily revised as part of the new API -implementation. Details of each and every modification were not individually -logged. In addition to the API changes, the following changes were made. They -are either new functionality, or bug fixes and other noticeable changes of -behaviour that were implemented after the code had been forked. - -1. Including Unicode support at build time is now enabled by default, but it -can optionally be disabled. It is not enabled by default at run time (no -change). - -2. The test program, now called pcre2test, was re-specified and almost -completely re-written. Its input is not compatible with input for pcretest. - -3. 
Patterns may start with (*NOTEMPTY) or (*NOTEMPTY_ATSTART) to set the -PCRE2_NOTEMPTY or PCRE2_NOTEMPTY_ATSTART options for every subject line that is -matched by that pattern. - -4. For the benefit of those who use PCRE2 via some other application, that is, -not writing the function calls themselves, it is possible to check the PCRE2 -version by matching a pattern such as /(?(VERSION>=10)yes|no)/ against a -string such as "yesno". - -5. There are case-equivalent Unicode characters whose encodings use different -numbers of code units in UTF-8. U+023A and U+2C65 are one example. (It is -theoretically possible for this to happen in UTF-16 too.) If a backreference to -a group containing one of these characters was greedily repeated, and during -the match a backtrack occurred, the subject might be backtracked by the wrong -number of code units. For example, if /^(\x{23a})\1*(.)/ is matched caselessly -(and in UTF-8 mode) against "\x{23a}\x{2c65}\x{2c65}\x{2c65}", group 2 should -capture the final character, which is the three bytes E2, B1, and A5 in UTF-8. -Incorrect backtracking meant that group 2 captured only the last two bytes. -This bug has been fixed; the new code is slower, but it is used only when the -strings matched by the repetition are not all the same length. - -6. A pattern such as /()a/ was not setting the "first character must be 'a'" -information. This applied to any pattern with a group that matched no -characters, for example: /(?:(?=.)|(? 
0) - { - $line = 0; - $file = shift @ARGV; - - open (IN, $file) || die "Failed to open $file\n"; - - while () - { - $count = 0; - $line++; - if (/^\s*$/) - { - printf "Empty line $line of $file\n"; - $yield = 1; - } - elsif (/^\./) - { - if (!/^\.\s*$| - ^\.B\s+\S| - ^\.TH\s\S| - ^\.SH\s\S| - ^\.SS\s\S| - ^\.TP(?:\s?\d+)?\s*$| - ^\.SM\s*$| - ^\.br\s*$| - ^\.rs\s*$| - ^\.sp\s*$| - ^\.nf\s*$| - ^\.fi\s*$| - ^\.P\s*$| - ^\.PP\s*$| - ^\.\\"(?:\ HREF)?\s*$| - ^\.\\"\sHTML\s\s*$| - ^\.\\"\sHTML\s<\/a>\s*$| - ^\.\\"\s<\/a>\s*$| - ^\.\\"\sJOINSH\s*$| - ^\.\\"\sJOIN\s*$/x - ) - { - printf "Bad control line $line of $file\n"; - $yield = 1; - } - } - elsif (/\\[^ef]|\\f[^IBP]/) - { - printf "Bad backslash in line $line of $file\n"; - $yield = 1; - } - while (/\\f[BI]/g) - { - $count++; - } - while (/\\fP/g) - { - $count--; - } - if ($count != 0) - { - printf "Mismatching formatting in line $line of $file\n"; - $yield = 1; - } - } - - close(IN); - } - -exit $yield; -# End diff --git a/pcre2/CleanTxt b/pcre2/CleanTxt deleted file mode 100755 index 1f42519c8..000000000 --- a/pcre2/CleanTxt +++ /dev/null @@ -1,113 +0,0 @@ -#! /usr/bin/perl -w - -# Script to take the output of nroff -man and remove all the backspacing and -# the page footers and the screen commands etc so that it is more usefully -# readable online. In fact, in the latest nroff, intermediate footers don't -# seem to be generated any more. - -$blankcount = 0; -$lastwascut = 0; -$firstheader = 1; - -# Input on STDIN; output to STDOUT. - -while () - { - s/\x1b\[\d+m//g; # Remove screen controls "ESC [ number m" - s/.\x8//g; # Remove "char, backspace" - - # Handle header lines. Retain only the first one we encounter, but remove - # the blank line that follows. Any others (e.g. at end of document) and the - # following blank line are dropped. 
- - if (/^PCRE(\w*)\(([13])\)\s+PCRE\1\(\2\)$/) - { - if ($firstheader) - { - $firstheader = 0; - print; - $lastprinted = $_; - $lastwascut = 0; - } - $_=; # Remove a blank that follows - next; - } - - # Count runs of empty lines - - if (/^\s*$/) - { - $blankcount++; - $lastwascut = 0; - next; - } - - # If a chunk of lines has been cut out (page footer) and the next line - # has a different indentation, put back one blank line. - - if ($lastwascut && $blankcount < 1 && defined($lastprinted)) - { - ($a) = $lastprinted =~ /^(\s*)/; - ($b) = $_ =~ /^(\s*)/; - $blankcount++ if ($a ne $b); - } - - # We get here only when we have a non-blank line in hand. If it was preceded - # by 3 or more blank lines, read the next 3 lines and see if they are blank. - # If so, remove all 7 lines, and remember that we have just done a cut. - - if ($blankcount >= 3) - { - for ($i = 0; $i < 3; $i++) - { - $next[$i] = ; - $next[$i] = "" if !defined $next[$i]; - $next[$i] =~ s/\x1b\[\d+m//g; # Remove screen controls "ESC [ number m" - $next[$i] =~ s/.\x8//g; # Remove "char, backspace" - } - - # Cut out chunks of the form <3 blanks><3 blanks> - - if ($next[0] =~ /^\s*$/ && - $next[1] =~ /^\s*$/ && - $next[2] =~ /^\s*$/) - { - $blankcount -= 3; - $lastwascut = 1; - } - - # Otherwise output the saved blanks, the current, and the next three - # lines. Remember the last printed line. - - else - { - for ($i = 0; $i < $blankcount; $i++) { print "\n"; } - print; - for ($i = 0; $i < 3; $i++) - { - $next[$i] =~ s/.\x8//g; - print $next[$i]; - $lastprinted = $_; - } - $lastwascut = 0; - $blankcount = 0; - } - } - - # This non-blank line is not preceded by 3 or more blank lines. Output - # any blanks there are, and the line. Remember it. Force two blank lines - # before headings. 
- - else - { - $blankcount = 2 if /^\S/ && !/^Last updated/ && !/^Copyright/ && - defined($lastprinted); - for ($i = 0; $i < $blankcount; $i++) { print "\n"; } - print; - $lastprinted = $_; - $lastwascut = 0; - $blankcount = 0; - } - } - -# End diff --git a/pcre2/Detrail b/pcre2/Detrail deleted file mode 100755 index 1c5c7e9ca..000000000 --- a/pcre2/Detrail +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/perl - -# This is a script for removing trailing whitespace from lines in files that -# are listed on the command line. - -# This subroutine does the work for one file. - -sub detrail { -my($file) = $_[0]; -my($changed) = 0; -open(IN, "$file") || die "Can't open $file for input"; -@lines = ; -close(IN); -foreach (@lines) - { - if (/\s+\n$/) - { - s/\s+\n$/\n/; - $changed = 1; - } - } -if ($changed) - { - open(OUT, ">$file") || die "Can't open $file for output"; - print OUT @lines; - close(OUT); - } -} - -# This is the main program - -$, = ""; # Output field separator -for ($i = 0; $i < @ARGV; $i++) { &detrail($ARGV[$i]); } - -# End diff --git a/pcre2/HACKING b/pcre2/HACKING deleted file mode 100644 index 20faf8f47..000000000 --- a/pcre2/HACKING +++ /dev/null @@ -1,830 +0,0 @@ -Technical Notes about PCRE2 ---------------------------- - -These are very rough technical notes that record potentially useful information -about PCRE2 internals. PCRE2 is a library based on the original PCRE library, -but with a revised (and incompatible) API. To avoid confusion, the original -library is referred to as PCRE1 below. For information about testing PCRE2, see -the pcre2test documentation and the comment at the head of the RunTest file. - -PCRE1 releases were up to 8.3x when PCRE2 was developed, and later bug fix -releases remain in the 8.xx series. PCRE2 releases started at 10.00 to avoid -confusion with PCRE1. - - -Historical note 1 ------------------ - -Many years ago I implemented some regular expression functions to an algorithm -suggested by Martin Richards. 
The rather simple patterns were not Unix-like in -form, and were quite restricted in what they could do by comparison with Perl. -The interesting part about the algorithm was that the amount of space required -to hold the compiled form of an expression was known in advance. The code to -apply an expression did not operate by backtracking, as the original Henry -Spencer code and current PCRE2 and Perl code does, but instead checked all -possibilities simultaneously by keeping a list of current states and checking -all of them as it advanced through the subject string. In the terminology of -Jeffrey Friedl's book, it was a "DFA algorithm", though it was not a -traditional Finite State Machine (FSM). When the pattern was all used up, all -remaining states were possible matches, and the one matching the longest subset -of the subject string was chosen. This did not necessarily maximize the -individual wild portions of the pattern, as is expected in Unix and Perl-style -regular expressions. - - -Historical note 2 ------------------ - -By contrast, the code originally written by Henry Spencer (which was -subsequently heavily modified for Perl) compiles the expression twice: once in -a dummy mode in order to find out how much store will be needed, and then for -real. (The Perl version probably doesn't do this any more; I'm talking about -the original library.) The execution function operates by backtracking and -maximizing (or, optionally, minimizing, in Perl) the amount of the subject that -matches individual wild portions of the pattern. This is an "NFA algorithm" in -Friedl's terminology. - - -OK, here's the real stuff -------------------------- - -For the set of functions that formed the original PCRE1 library in 1997 (which -are unrelated to those mentioned above), I tried at first to invent an -algorithm that used an amount of store bounded by a multiple of the number of -characters in the pattern, to save on compiling time. 
However, because of the -greater complexity in Perl regular expressions, I couldn't do this, even though -the then current Perl 5.004 patterns were much simpler than those supported -nowadays. In any case, a first pass through the pattern is helpful for other -reasons. - - -Support for 16-bit and 32-bit data strings -------------------------------------------- - -The PCRE2 library can be compiled in any combination of 8-bit, 16-bit or 32-bit -modes, creating up to three different libraries. In the description that -follows, the word "short" is used for a 16-bit data quantity, and the phrase -"code unit" is used for a quantity that is a byte in 8-bit mode, a short in -16-bit mode and a 32-bit word in 32-bit mode. The names of PCRE2 functions are -given in generic form, without the _8, _16, or _32 suffix. - - -Computing the memory requirement: how it was --------------------------------------------- - -Up to and including release 6.7, PCRE1 worked by running a very degenerate -first pass to calculate a maximum memory requirement, and then a second pass to -do the real compile - which might use a bit less than the predicted amount of -memory. The idea was that this would turn out faster than the Henry Spencer -code because the first pass is degenerate and the second pass can just store -stuff straight into memory, which it knows is big enough. - - -Computing the memory requirement: how it is -------------------------------------------- - -By the time I was working on a potential 6.8 release, the degenerate first pass -had become very complicated and hard to maintain. Indeed one of the early -things I did for 6.8 was to fix Yet Another Bug in the memory computation. Then -I had a flash of inspiration as to how I could run the real compile function in -a "fake" mode that enables it to compute how much memory it would need, while -in most cases only ever using a small amount of working memory, and without too -many tests of the mode that might slow it down. 
So I refactored the compiling -functions to work this way. This got rid of about 600 lines of source and made -further maintenance and development easier. As this was such a major change, I -never released 6.8, instead upping the number to 7.0 (other quite major changes -were also present in the 7.0 release). - -A side effect of this work was that the previous limit of 200 on the nesting -depth of parentheses was removed. However, there was a downside: compiling ran -more slowly than before (30% or more, depending on the pattern) because it now -did a full analysis of the pattern. My hope was that this would not be a big -issue, and in the event, nobody has commented on it. - -At release 8.34, a limit on the nesting depth of parentheses was re-introduced -(default 250, settable at build time) so as to put a limit on the amount of -system stack used by the compile function, which uses recursive function calls -for nested parenthesized groups. This is a safety feature for environments with -small stacks where the patterns are provided by users. - - -Yet another pattern scan ------------------------- - -History repeated itself for PCRE2 release 10.20. A number of bugs relating to -named subpatterns had been discovered by fuzzers. Most of these were related to -the handling of forward references when it was not known if the named group was -unique. (References to non-unique names use a different opcode and more -memory.) The use of duplicate group numbers (the (?| facility) also caused -issues. - -To get around these problems I adopted a new approach by adding a third pass -over the pattern (really a "pre-pass"), which did nothing other than identify -all the named subpatterns and their corresponding group numbers. This means -that the actual compile (both the memory-computing dummy run and the real -compile) has full knowledge of group names and numbers throughout. Several -dozen lines of messy code were eliminated, though the new pre-pass was not -short. 
In particular, parsing and skipping over [] classes is complicated. - -While working on 10.22 I realized that I could simplify yet again by moving -more of the parsing into the pre-pass, thus avoiding doing it in two places, so -after 10.22 was released, the code underwent yet another big refactoring. This -is how it is from 10.23 onwards: - -The function called parse_regex() scans the pattern characters, parsing them -into literal data and meta characters. It converts escapes such as \x{123} -into literals, handles \Q...\E, and skips over comments and non-significant -white space. The result of the scanning is put into a vector of 32-bit unsigned -integers. Values less than 0x80000000 are literal data. Higher values represent -meta-characters. The top 16-bits of such values identify the meta-character, -and these are given names such as META_CAPTURE. The lower 16-bits are available -for data, for example, the capturing group number. The only situation in which -literal data values greater than 0x7fffffff can appear is when the 32-bit -library is running in non-UTF mode. This is handled by having a special -meta-character that is followed by the 32-bit data value. - -The size of the parsed pattern vector, when auto-callouts are not enabled, is -bounded by the length of the pattern (with one exception). The code is written -so that each item in the pattern uses no more vector elements than the number -of code units in the item itself. The exception is the aforementioned large -32-bit number handling. For this reason, 32-bit non-UTF patterns are scanned in -advance to check for such values. When auto-callouts are enabled, the generous -assumption is made that there will be a callout for each pattern code unit -(which of course is only actually true if all code units are literals) plus one -at the end. There is a default parsed pattern vector on the system stack, but -if this is not big enough, heap memory is used. 
- -As before, the actual compiling function is run twice, the first time to -determine the amount of memory needed for the final compiled pattern. It -now processes the parsed pattern vector, not the pattern itself, although some -of the parsed items refer to strings in the pattern - for example, group -names. As escapes and comments have already been processed, the code is a bit -simpler than before. - -Most errors can be diagnosed during the parsing scan. For those that cannot -(for example, "lookbehind assertion is not fixed length"), the parsed code -contains offsets into the pattern so that the actual compiling code can -report where errors are. - - -The elements of the parsed pattern vector ------------------------------------------ - -The word "offset" below means a code unit offset into the pattern. When -PCRE2_SIZE (which is usually size_t) is no bigger than uint32_t, an offset is -stored in a single parsed pattern element. Otherwise (typically on 64-bit -systems) it occupies two elements. The following meta items occupy just one -element, with no data: - -META_ACCEPT (*ACCEPT) -META_ASTERISK * -META_ASTERISK_PLUS *+ -META_ASTERISK_QUERY *? -META_ATOMIC (?> start of atomic group -META_CIRCUMFLEX ^ metacharacter -META_CLASS [ start of non-empty class -META_CLASS_EMPTY [] empty class - only with PCRE2_ALLOW_EMPTY_CLASS -META_CLASS_EMPTY_NOT [^] negative empty class - ditto -META_CLASS_END ] end of non-empty class -META_CLASS_NOT [^ start non-empty negative class -META_COMMIT (*COMMIT) -META_COND_ASSERT (?(?assertion) -META_DOLLAR $ metacharacter -META_DOT . metacharacter -META_END End of pattern (this value is 0x80000000) -META_FAIL (*FAIL) -META_KET ) closing parenthesis -META_LOOKAHEAD (?= start of lookahead -META_LOOKAHEAD_NA (*napla: start of non-atomic lookahead -META_LOOKAHEADNOT (?! start of negative lookahead -META_NOCAPTURE (?: no capture parens -META_PLUS + -META_PLUS_PLUS ++ -META_PLUS_QUERY +? -META_PRUNE (*PRUNE) - no argument -META_QUERY ? 
-META_QUERY_PLUS ?+ -META_QUERY_QUERY ?? -META_RANGE_ESCAPED hyphen in class range with at least one escape -META_RANGE_LITERAL hyphen in class range defined literally -META_SKIP (*SKIP) - no argument -META_THEN (*THEN) - no argument - -The two RANGE values occur only in character classes. They are positioned -between two literals that define the start and end of the range. In an EBCDIC -environment it is necessary to know whether either of the range values was -specified as an escape. In an ASCII/Unicode environment the distinction is not -relevant. - -The following have data in the lower 16 bits, and may be followed by other data -elements: - -META_ALT | alternation -META_BACKREF back reference -META_CAPTURE start of capturing group -META_ESCAPE non-literal escape sequence -META_RECURSE recursion call - -If the data for META_ALT is non-zero, it is inside a lookbehind, and the data -is the length of its branch, for which OP_REVERSE must be generated. - -META_BACKREF, META_CAPTURE, and META_RECURSE have the capture group number as -their data in the lower 16 bits of the element. - -META_BACKREF is followed by an offset if the back reference group number is 10 -or more. The offsets of the first occurrences of references to groups whose -numbers are less than 10 are put in cb->small_ref_offset[] (only the first -occurrence is useful). On 64-bit systems this avoids using more than two parsed -pattern elements for items such as \3. The offset is used when an error occurs -because the reference is to a non-existent group. - -META_RECURSE is always followed by an offset, for use in error messages. - -META_ESCAPE has an ESC_xxx value as its data. For ESC_P and ESC_p, the next -element contains the 16-bit type and data property values, packed together. -ESC_g and ESC_k are used only for named references - numerical ones are turned -into META_RECURSE or META_BACKREF as appropriate. ESC_g and ESC_k are followed -by a length and an offset into the pattern to specify the name.
- -The following have one data item that follows in the next vector element: - -META_BIGVALUE Next is a literal >= META_END -META_OPTIONS (?i) and friends (data is new option bits) -META_POSIX POSIX class item (data identifies the class) -META_POSIX_NEG negative POSIX class item (ditto) - -The following are followed by a length element, then a number of character code -values (which should match with the length): - -META_MARK (*MARK:xxxx) -META_COMMIT_ARG (*COMMIT:xxxx) -META_PRUNE_ARG (*PRUNE:xxx) -META_SKIP_ARG (*SKIP:xxxx) -META_THEN_ARG (*THEN:xxxx) - -The following are followed by a length element, then an offset in the pattern -that identifies the name: - -META_COND_NAME (?(<name>) or (?('name') or (?(name) -META_COND_RNAME (?(R&name) -META_COND_RNUMBER (?(Rdigits) -META_RECURSE_BYNAME (?&name) -META_BACKREF_BYNAME \k'name' - -META_COND_RNUMBER is used for names that start with R and continue with digits, -because this is an ambiguous case. It could be a back reference to a group with -that name, or it could be a recursion test on a numbered group. - -This one is followed by an offset, for use in error messages, then a number: - -META_COND_NUMBER (?([+-]digits) - -The following is followed just by an offset, for use in error messages: - -META_COND_DEFINE (?(DEFINE) - -The following are also followed just by an offset, but also the lower 16 bits -of the main word contain the length of the first branch of the lookbehind -group; this is used when generating OP_REVERSE for that branch. - -META_LOOKBEHIND (?<= start of lookbehind -META_LOOKBEHIND_NA (*naplb: start of non-atomic lookbehind -META_LOOKBEHINDNOT (?<! start of negative lookbehind - -This one is followed by three elements. The first is 0 for '>' and 1 for '>='; -the next two are the major and minor numbers: - -META_COND_VERSION (?(VERSION<op>x.y) - -Callouts are converted into one of two items: - -META_CALLOUT_NUMBER (?C with numerical argument -META_CALLOUT_STRING (?C with string argument - -In both cases, the next two elements contain the offset and length of the next -item in the pattern.
Then there is either one callout number, or a length and -an offset for the string argument. The length includes both delimiters. - - -Traditional matching function ------------------------------ - -The "traditional", and original, matching function is called pcre2_match(), and -it implements an NFA algorithm, similar to the original Henry Spencer algorithm -and the way that Perl works. This is not surprising, since it is intended to be -as compatible with Perl as possible. This is the function most users of PCRE2 -will use most of the time. If PCRE2 is compiled with just-in-time (JIT) -support, and studying a compiled pattern with JIT is successful, the JIT code -is run instead of the normal pcre2_match() code, but the result is the same. - - -Supplementary matching function -------------------------------- - -There is also a supplementary matching function called pcre2_dfa_match(). This -implements a DFA matching algorithm that searches simultaneously for all -possible matches that start at one point in the subject string. (Going back to -my roots: see Historical Note 1 above.) This function interprets the same -compiled pattern data as pcre2_match(); however, not all the facilities are -available, and those that are do not always work in quite the same way. See the -user documentation for details. - -The algorithm that is used for pcre2_dfa_match() is not a traditional FSM, -because it may have a number of states active at one time. More work would be -needed at compile time to produce a traditional FSM where only one state is -ever active at once. I believe some other regex matchers work this way. JIT -support is not available for this kind of matching. - - -Changeable options ------------------- - -The /i, /m, or /s options (PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL, and -others) may be changed in the middle of patterns by items such as (?i). Their -processing is handled entirely at compile time by generating different opcodes -for the different settings.
The runtime functions do not need to keep track of -an option's state. - -PCRE2_DUPNAMES, PCRE2_EXTENDED, PCRE2_EXTENDED_MORE, and PCRE2_NO_AUTO_CAPTURE -are tracked and processed during the parsing pre-pass. The others are handled -from META_OPTIONS items during the main compile phase. - - -Format of compiled patterns ---------------------------- - -The compiled form of a pattern is a vector of unsigned code units (bytes in -8-bit mode, shorts in 16-bit mode, 32-bit words in 32-bit mode), containing -items of variable length. The first code unit in an item contains an opcode, -and the length of the item is either implicit in the opcode or contained in the -data that follows it. - -In many cases listed below, LINK_SIZE data values are specified for offsets -within the compiled pattern. LINK_SIZE always specifies a number of bytes. The -default value for LINK_SIZE is 2, except for the 32-bit library, where it can -only be 4. The 8-bit library can be compiled to use 3-byte or 4-byte values, -and the 16-bit library can be compiled to use 4-byte values, though this -impairs performance. Specifying a LINK_SIZE larger than 2 for these libraries is -necessary only when patterns whose compiled length is greater than 65535 code -units are going to be processed. When a LINK_SIZE value uses more than one code -unit, the most significant unit is first. - -In this description, we assume the "normal" compilation options. Data values -that are counts (e.g. quantifiers) are always two bytes long in 8-bit mode -(most significant byte first), and one code unit in 16-bit and 32-bit modes.
- - -Opcodes with no following data ------------------------------- - -These items are all just one unit long: - - OP_END end of pattern - OP_ANY match any one character other than newline - OP_ALLANY match any one character, including newline - OP_ANYBYTE match any single code unit, even in UTF-8/16 mode - OP_SOD match start of data: \A - OP_SOM, start of match (subject + offset): \G - OP_SET_SOM, set start of match (\K) - OP_CIRC ^ (start of data) - OP_CIRCM ^ multiline mode (start of data or after newline) - OP_NOT_WORD_BOUNDARY \B - OP_WORD_BOUNDARY \b - OP_NOT_DIGIT \D - OP_DIGIT \d - OP_NOT_HSPACE \H - OP_HSPACE \h - OP_NOT_WHITESPACE \S - OP_WHITESPACE \s - OP_NOT_VSPACE \V - OP_VSPACE \v - OP_NOT_WORDCHAR \W - OP_WORDCHAR \w - OP_EODN match end of data or newline at end: \Z - OP_EOD match end of data: \z - OP_DOLL $ (end of data, or before final newline) - OP_DOLLM $ multiline mode (end of data or before newline) - OP_EXTUNI match an extended Unicode grapheme cluster - OP_ANYNL match any Unicode newline sequence - - OP_ASSERT_ACCEPT ) - OP_ACCEPT ) These are Perl 5.10's "backtracking control - OP_COMMIT ) verbs". If OP_ACCEPT is inside capturing - OP_FAIL ) parentheses, it may be preceded by one or more - OP_PRUNE ) OP_CLOSE, each followed by a number that - OP_SKIP ) indicates which parentheses must be closed. - OP_THEN ) - -OP_ASSERT_ACCEPT is used when (*ACCEPT) is encountered within an assertion. -This ends the assertion, not the entire pattern match. The assertion (?!) is -always optimized to OP_FAIL. - -OP_ALLANY is used for '.' when PCRE2_DOTALL is set. It is also used for \C in -non-UTF modes and in UTF-32 mode (since one code unit still equals one -character). Another use is for [^] when empty classes are permitted -(PCRE2_ALLOW_EMPTY_CLASS is set). - - -Backtracking control verbs --------------------------- - -Verbs with no arguments generate opcodes with no following data (as listed -in the section above).
- -(*MARK:NAME) generates OP_MARK followed by the mark name, preceded by a -length in one code unit, and followed by a binary zero. The name length is -limited by the size of the code unit. - -(*ACCEPT:NAME) and (*FAIL:NAME) are compiled as (*MARK:NAME)(*ACCEPT) and -(*MARK:NAME)(*FAIL) respectively. - -For (*COMMIT:NAME), (*PRUNE:NAME), (*SKIP:NAME), and (*THEN:NAME), the opcodes -OP_COMMIT_ARG, OP_PRUNE_ARG, OP_SKIP_ARG, and OP_THEN_ARG are used, with the -name following in the same format as for OP_MARK. - - -Matching literal characters ---------------------------- - -The OP_CHAR opcode is followed by a single character that is to be matched -casefully. For caseless matching of characters that have at most two -case-equivalent code points, OP_CHARI is used. In UTF-8 or UTF-16 modes, the -character may be more than one code unit long. In UTF-32 mode, characters are -always exactly one code unit long. - -If there is only one character in a character class, OP_CHAR or OP_CHARI is -used for a positive class, and OP_NOT or OP_NOTI for a negative one (that is, -for something like [^a]). - -Caseless matching (positive or negative) of characters that have more than two -case-equivalent code points (which is possible only in UTF mode) is handled by -compiling a Unicode property item (see below), with the pseudo-property -PT_CLIST. The value of this property is an offset in a vector called -"ucd_caseless_sets" which identifies the start of a short list of equivalent -characters, terminated by the value NOTACHAR (0xffffffff). 
- - -Repeating single characters ---------------------------- - -The common repeats (*, +, ?), when applied to a single character, use the -following opcodes, which come in caseful and caseless versions: - - Caseful Caseless - OP_STAR OP_STARI - OP_MINSTAR OP_MINSTARI - OP_POSSTAR OP_POSSTARI - OP_PLUS OP_PLUSI - OP_MINPLUS OP_MINPLUSI - OP_POSPLUS OP_POSPLUSI - OP_QUERY OP_QUERYI - OP_MINQUERY OP_MINQUERYI - OP_POSQUERY OP_POSQUERYI - -Each opcode is followed by the character that is to be repeated. In ASCII or -UTF-32 modes, these are two-code-unit items; in UTF-8 or UTF-16 modes, the -length is variable. Those with "MIN" in their names are the minimizing -versions. Those with "POS" in their names are possessive versions. Other kinds -of repeat make use of these opcodes: - - Caseful Caseless - OP_UPTO OP_UPTOI - OP_MINUPTO OP_MINUPTOI - OP_POSUPTO OP_POSUPTOI - OP_EXACT OP_EXACTI - -Each of these is followed by a count and then the repeated character. The count -is two bytes long in 8-bit mode (most significant byte first), or one code unit -in 16-bit and 32-bit modes. - -OP_UPTO matches from 0 to the given number. A repeat with a non-zero minimum -and a fixed maximum is coded as an OP_EXACT followed by an OP_UPTO (or -OP_MINUPTO or OP_POSUPTO). - -Another set of matching repeating opcodes (called OP_NOTSTAR, OP_NOTSTARI, -etc.) are used for repeated, negated, single-character classes such as [^a]*. -The normal single-character opcodes (OP_STAR, etc.) are used for repeated -positive single-character classes. - - -Repeating character types -------------------------- - -Repeats of things like \d are done exactly as for single characters, except -that instead of a character, the opcode for the type (e.g. OP_DIGIT) is stored -in the next code unit.
The opcodes are: - - OP_TYPESTAR - OP_TYPEMINSTAR - OP_TYPEPOSSTAR - OP_TYPEPLUS - OP_TYPEMINPLUS - OP_TYPEPOSPLUS - OP_TYPEQUERY - OP_TYPEMINQUERY - OP_TYPEPOSQUERY - OP_TYPEUPTO - OP_TYPEMINUPTO - OP_TYPEPOSUPTO - OP_TYPEEXACT - - -Match by Unicode property -------------------------- - -OP_PROP and OP_NOTPROP are used for positive and negative matches of a -character by testing its Unicode property (the \p and \P escape sequences). -Each is followed by two code units that encode the desired property as a type -and a value. The types are a set of #defines of the form PT_xxx, and the values -are enumerations of the form ucp_xx, defined in the pcre2_ucp.h source file. -The value is relevant only for PT_GC (General Category), PT_PC (Particular -Category), PT_SC (Script), and the pseudo-property PT_CLIST, which is used to -identify a list of case-equivalent characters when there are three or more. - -Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by -three code units: OP_PROP or OP_NOTPROP, and then the desired property type and -value. - - -Character classes ------------------ - -If there is only one character in a class, OP_CHAR or OP_CHARI is used for a -positive class, and OP_NOT or OP_NOTI for a negative one (that is, for -something like [^a]), except when caselessly matching a character that has more -than two case-equivalent code points (which can happen only in UTF mode). In -this case a Unicode property item is used, as described above in "Matching -literal characters". - -A set of repeating opcodes (called OP_NOTSTAR etc.) are used for repeated, -negated, single-character classes. The normal single-character opcodes -(OP_STAR, etc.) are used for repeated positive single-character classes. - -When there is more than one character in a class, and all the code points are -less than 256, OP_CLASS is used for a positive class, and OP_NCLASS for a -negative one. 
In either case, the opcode is followed by a 32-byte (16-short, -8-word) bit map containing a 1 bit for every character that is acceptable. The -bits are counted from the least significant end of each unit. In caseless mode, -bits for both cases are set. - -The reason for having both OP_CLASS and OP_NCLASS is so that, in UTF-8 and -16-bit and 32-bit modes, subject characters with values greater than 255 can be -handled correctly. For OP_CLASS they do not match, whereas for OP_NCLASS they -do. - -For classes containing characters with values greater than 255 or that contain -\p or \P, OP_XCLASS is used. It optionally uses a bit map if any acceptable -code points are less than 256, followed by a list of pairs (for a range) and/or -single characters and/or properties. In caseless mode, all equivalent -characters are explicitly listed. - -OP_XCLASS is followed by a LINK_SIZE value containing the total length of the -opcode and its data. This is followed by a code unit containing flag bits: -XCL_NOT indicates that this is a negative class, and XCL_MAP indicates that a -bit map is present. There follows the bit map, if XCL_MAP is set, and then a -sequence of items coded as follows: - - XCL_END marks the end of the list - XCL_SINGLE one character follows - XCL_RANGE two characters follow - XCL_PROP a Unicode property (type, value) follows - XCL_NOTPROP a Unicode property (type, value) follows - -If a range starts with a code point less than 256 and ends with one greater -than 255, it is split into two ranges, with characters less than 256 being -indicated in the bit map, and the rest with XCL_RANGE. - -When XCL_NOT is set, the bit map, if present, contains bits for characters that -are allowed (exactly as for OP_NCLASS), but the list of items that follow it -specifies characters and properties that are not allowed. 
- - -Back references ---------------- - -OP_REF (caseful) or OP_REFI (caseless) is followed by a count containing the -reference number when the reference is to a unique capturing group (either by -number or by name). When named groups are used, there may be more than one -group with the same name. In this case, a reference to such a group by name -generates OP_DNREF or OP_DNREFI. These are followed by two counts: the index -(not the byte offset) in the group name table of the first entry for the -required name, followed by the number of groups with the same name. The -matching code can then search for the first one that is set. - - -Repeating character classes and back references ------------------------------------------------ - -Single-character classes are handled specially (see above). This section -applies to other classes and also to back references. In both cases, the repeat -information follows the base item. The matching code looks at the following -opcode to see if it is one of these: - - OP_CRSTAR - OP_CRMINSTAR - OP_CRPOSSTAR - OP_CRPLUS - OP_CRMINPLUS - OP_CRPOSPLUS - OP_CRQUERY - OP_CRMINQUERY - OP_CRPOSQUERY - OP_CRRANGE - OP_CRMINRANGE - OP_CRPOSRANGE - -All but the last three are single-code-unit items, with no data. The range -opcodes are followed by the minimum and maximum repeat counts. - - -Brackets and alternation ------------------------- - -A pair of non-capturing round brackets is wrapped round each expression at -compile time, so alternation always happens in the context of brackets. - -[Note for North Americans: "bracket" to some English speakers, including -myself, can be round, square, curly, or pointy. Hence this usage rather than -"parentheses".] - -Non-capturing brackets use the opcode OP_BRA, capturing brackets use OP_CBRA. A -bracket opcode is followed by a LINK_SIZE value which gives the offset to the -next alternative OP_ALT or, if there aren't any branches, to the terminating -opcode. 
Each OP_ALT is followed by a LINK_SIZE value giving the offset to the -next one, or to the final opcode. For capturing brackets, the bracket number is -a count that immediately follows the offset. - -There are several opcodes that mark the end of a subpattern group. OP_KET is -used for subpatterns that do not repeat indefinitely, OP_KETRMIN and -OP_KETRMAX are used for indefinite repetitions, minimally or maximally -respectively, and OP_KETRPOS for possessive repetitions (see below for more -details). All four are followed by a LINK_SIZE value giving (as a positive -number) the offset back to the matching bracket opcode. - -If a subpattern is quantified such that it is permitted to match zero times, it -is preceded by one of OP_BRAZERO, OP_BRAMINZERO, or OP_SKIPZERO. These are -single-unit opcodes that tell the matcher that skipping the following -subpattern entirely is a valid match. In the case of the first two, not -skipping the pattern is also valid (greedy and non-greedy). The third is used -when a pattern has the quantifier {0,0}. It cannot be entirely discarded, -because it may be called as a subroutine from elsewhere in the pattern. - -A subpattern with an indefinite maximum repetition is replicated in the -compiled data its minimum number of times (or once with OP_BRAZERO if the -minimum is zero), with the final copy terminating with OP_KETRMIN or OP_KETRMAX -as appropriate. - -A subpattern with a bounded maximum repetition is replicated in a nested -fashion up to the maximum number of times, with OP_BRAZERO or OP_BRAMINZERO -before each replication after the minimum, so that, for example, (abc){2,5} is -compiled as (abc)(abc)((abc)((abc)(abc)?)?)?, except that each bracketed group -has the same number. - -When a repeated subpattern has an unbounded upper limit, it is checked to see -whether it could match an empty string. If this is the case, the opcode in the -final replication is changed to OP_SBRA or OP_SCBRA. 
This tells the matcher -that it needs to check for matching an empty string when it hits OP_KETRMIN or -OP_KETRMAX, and if so, to break the loop. - - -Possessive brackets -------------------- - -When a repeated group (capturing or non-capturing) is marked as possessive by -the "+" notation, e.g. (abc)++, different opcodes are used. Their names all -have POS on the end, e.g. OP_BRAPOS instead of OP_BRA and OP_SCBRAPOS instead -of OP_SCBRA. The end of such a group is marked by OP_KETRPOS. If the minimum -repetition is zero, the group is preceded by OP_BRAPOSZERO. - - -Once-only (atomic) groups -------------------------- - -These are just like other subpatterns, but they start with the opcode OP_ONCE. -The check for matching an empty string in an unbounded repeat is handled -entirely at runtime, so there is just this one opcode for atomic groups. - - -Assertions ----------- - -Forward assertions are also just like other subpatterns, but starting with one -of the opcodes OP_ASSERT, OP_ASSERT_NA (non-atomic assertion), or -OP_ASSERT_NOT. Backward assertions use the opcodes OP_ASSERTBACK, -OP_ASSERTBACK_NA, and OP_ASSERTBACK_NOT, and the first opcode inside the -assertion is OP_REVERSE, followed by a count of the number of characters to -move back the pointer in the subject string. In ASCII or UTF-32 mode, the count -is also the number of code units, but in UTF-8/16 mode each character may -occupy more than one code unit. A separate count is present in each alternative -of a lookbehind assertion, allowing each branch to have a different (but fixed) -length. - - -Conditional subpatterns ------------------------ - -These are like other subpatterns, but they start with the opcode OP_COND, or -OP_SCOND for one that might match an empty string in an unbounded repeat. 
- -If the condition is a back reference, this is stored at the start of the -subpattern using the opcode OP_CREF followed by a count containing the -reference number, provided that the reference is to a unique capturing group. -If the reference was by name and there is more than one group with that name, -OP_DNCREF is used instead. It is followed by two counts: the index in the group -names table, and the number of groups with the same name. This allows the -matcher to check if any group with the given name is set. - -If the condition is "in recursion" (coded as "(?(R)"), or "in recursion of -group x" (coded as "(?(Rx)"), the group number is stored at the start of the -subpattern using the opcode OP_RREF (with a value of RREF_ANY (0xffff) for "the -whole pattern") or OP_DNRREF (with data as for OP_DNCREF). - -For a DEFINE condition, OP_FALSE is used (with no associated data). During -compilation, however, a DEFINE condition is coded as OP_DEFINE so that, when -the conditional group is complete, there can be a check to ensure that it -contains only one top-level branch. Once this has happened, the opcode is -changed to OP_FALSE, so the matcher never sees OP_DEFINE. - -There is a special PCRE2-specific condition of the form (VERSION[>]=x.y), which -tests the PCRE2 version number. This compiles into one of the opcodes OP_TRUE -or OP_FALSE. - -If a condition is not a back reference, recursion test, DEFINE, or VERSION, it -must start with a parenthesized atomic assertion, whose opcode normally -immediately follows OP_COND or OP_SCOND. However, if automatic callouts are -enabled, a callout is inserted immediately before the assertion. It is also -possible to insert a manual callout at this point. Only assertion conditions -may have callouts preceding the condition. - -A condition that is the negative assertion (?!) is optimized to OP_FAIL in all -parts of the pattern, so this is another opcode that may appear as a condition. -It is treated the same as OP_FALSE.
- - -Recursion ---------- - -Recursion either matches the current pattern, or some subexpression. The opcode -OP_RECURSE is followed by a LINK_SIZE value that is the offset to the starting -bracket from the start of the whole pattern. OP_RECURSE is also used for -"subroutine" calls, even though they are not strictly a recursion. Up till -release 10.30 recursions were treated as atomic groups, making them -incompatible with Perl (but PCRE had them well before Perl did). From 10.30, -backtracking into recursions is supported. - -Repeated recursions used to be wrapped inside OP_ONCE brackets, which not only -forced no backtracking, but also allowed repetition to be handled as for other -bracketed groups. From 10.30 onwards, repeated recursions are duplicated for -their minimum repetitions, and then wrapped in non-capturing brackets for the -remainder. For example, (?1){3} is treated as (?1)(?1)(?1), and (?1){2,4} is -treated as (?1)(?1)(?:(?1)){0,2}. - - -Callouts --------- - -A callout may have either a numerical argument or a string argument. These use -OP_CALLOUT or OP_CALLOUT_STR, respectively. In each case these are followed by -two LINK_SIZE values giving the offset in the pattern string to the start of -the following item, and another count giving the length of this item. These -values make it possible for pcre2test to output useful tracing information -using callouts. - -In the case of a numeric callout, after these two values there is a single code -unit containing the callout number, in the range 0-255, with 255 being used for -callouts that are automatically inserted as a result of the PCRE2_AUTO_CALLOUT -option. 
Thus, this opcode item is of fixed length: - - [OP_CALLOUT] [PATTERN_OFFSET] [PATTERN_LENGTH] [NUMBER] - -For callouts with string arguments, OP_CALLOUT_STR has three more data items: -a LINK_SIZE value giving the complete length of the entire opcode item, a -LINK_SIZE item containing the offset within the pattern string to the start of -the string argument, and the string itself, preceded by its starting delimiter -and followed by a binary zero. When a callout function is called, a pointer to -the actual string is passed, but the delimiter can be accessed as string[-1] if -the application needs it. In the 8-bit library, the callout in /X(?C'abc')Y/ is -compiled as the following bytes (decimal numbers represent binary values): - - [OP_CALLOUT_STR] [0] [10] [0] [1] [0] [14] [0] [5] ['] [a] [b] [c] [0] - -------- ------- -------- ------- - | | | | - ------- LINK_SIZE items ------ - -Opcode table checking ---------------------- - -The last opcode that is defined in pcre2_internal.h is OP_TABLE_LENGTH. This is -not a real opcode, but is used to check at compile time that tables indexed by -opcode are the correct length, in order to catch updating errors. - -Philip Hazel -12 July 2019 diff --git a/pcre2/INSTALL b/pcre2/INSTALL deleted file mode 100644 index 8865734f8..000000000 --- a/pcre2/INSTALL +++ /dev/null @@ -1,368 +0,0 @@ -Installation Instructions -************************* - - Copyright (C) 1994-1996, 1999-2002, 2004-2016 Free Software -Foundation, Inc. - - Copying and distribution of this file, with or without modification, -are permitted in any medium without royalty provided the copyright -notice and this notice are preserved. This file is offered as-is, -without warranty of any kind. - -Basic Installation -================== - - Briefly, the shell command './configure && make && make install' -should configure, build, and install this package. 
The following -more-detailed instructions are generic; see the 'README' file for -instructions specific to this package. Some packages provide this -'INSTALL' file but do not implement all of the features documented -below. The lack of an optional feature in a given package is not -necessarily a bug. More recommendations for GNU packages can be found -in *note Makefile Conventions: (standards)Makefile Conventions. - - The 'configure' shell script attempts to guess correct values for -various system-dependent variables used during compilation. It uses -those values to create a 'Makefile' in each directory of the package. -It may also create one or more '.h' files containing system-dependent -definitions. Finally, it creates a shell script 'config.status' that -you can run in the future to recreate the current configuration, and a -file 'config.log' containing compiler output (useful mainly for -debugging 'configure'). - - It can also use an optional file (typically called 'config.cache' and -enabled with '--cache-file=config.cache' or simply '-C') that saves the -results of its tests to speed up reconfiguring. Caching is disabled by -default to prevent problems with accidental use of stale cache files. - - If you need to do unusual things to compile the package, please try -to figure out how 'configure' could check whether to do them, and mail -diffs or instructions to the address given in the 'README' so they can -be considered for the next release. If you are using the cache, and at -some point 'config.cache' contains results you don't want to keep, you -may remove or edit it. - - The file 'configure.ac' (or 'configure.in') is used to create -'configure' by a program called 'autoconf'. You need 'configure.ac' if -you want to change it or regenerate 'configure' using a newer version of -'autoconf'. - - The simplest way to compile this package is: - - 1. 
'cd' to the directory containing the package's source code and type - './configure' to configure the package for your system. - - Running 'configure' might take a while. While running, it prints - some messages telling which features it is checking for. - - 2. Type 'make' to compile the package. - - 3. Optionally, type 'make check' to run any self-tests that come with - the package, generally using the just-built uninstalled binaries. - - 4. Type 'make install' to install the programs and any data files and - documentation. When installing into a prefix owned by root, it is - recommended that the package be configured and built as a regular - user, and only the 'make install' phase executed with root - privileges. - - 5. Optionally, type 'make installcheck' to repeat any self-tests, but - this time using the binaries in their final installed location. - This target does not install anything. Running this target as a - regular user, particularly if the prior 'make install' required - root privileges, verifies that the installation completed - correctly. - - 6. You can remove the program binaries and object files from the - source code directory by typing 'make clean'. To also remove the - files that 'configure' created (so you can compile the package for - a different kind of computer), type 'make distclean'. There is - also a 'make maintainer-clean' target, but that is intended mainly - for the package's developers. If you use it, you may have to get - all sorts of other programs in order to regenerate files that came - with the distribution. - - 7. Often, you can also type 'make uninstall' to remove the installed - files again. In practice, not all packages have tested that - uninstallation works correctly, even though it is required by the - GNU Coding Standards. - - 8. 
Some packages, particularly those that use Automake, provide 'make - distcheck', which can be used by developers to test that all other - targets like 'make install' and 'make uninstall' work correctly. - This target is generally not run by end users. - -Compilers and Options -===================== - - Some systems require unusual options for compilation or linking that -the 'configure' script does not know about. Run './configure --help' -for details on some of the pertinent environment variables. - - You can give 'configure' initial values for configuration parameters -by setting variables in the command line or in the environment. Here is -an example: - - ./configure CC=c99 CFLAGS=-g LIBS=-lposix - - *Note Defining Variables::, for more details. - -Compiling For Multiple Architectures -==================================== - - You can compile the package for more than one kind of computer at the -same time, by placing the object files for each architecture in their -own directory. To do this, you can use GNU 'make'. 'cd' to the -directory where you want the object files and executables to go and run -the 'configure' script. 'configure' automatically checks for the source -code in the directory that 'configure' is in and in '..'. This is known -as a "VPATH" build. - - With a non-GNU 'make', it is safer to compile the package for one -architecture at a time in the source code directory. After you have -installed the package for one architecture, use 'make distclean' before -reconfiguring for another architecture. - - On MacOS X 10.5 and later systems, you can create libraries and -executables that work on multiple system types--known as "fat" or -"universal" binaries--by specifying multiple '-arch' options to the -compiler but only a single '-arch' option to the preprocessor. 
Like -this: - - ./configure CC="gcc -arch i386 -arch x86_64 -arch ppc -arch ppc64" \ - CXX="g++ -arch i386 -arch x86_64 -arch ppc -arch ppc64" \ - CPP="gcc -E" CXXCPP="g++ -E" - - This is not guaranteed to produce working output in all cases, you -may have to build one architecture at a time and combine the results -using the 'lipo' tool if you have problems. - -Installation Names -================== - - By default, 'make install' installs the package's commands under -'/usr/local/bin', include files under '/usr/local/include', etc. You -can specify an installation prefix other than '/usr/local' by giving -'configure' the option '--prefix=PREFIX', where PREFIX must be an -absolute file name. - - You can specify separate installation prefixes for -architecture-specific files and architecture-independent files. If you -pass the option '--exec-prefix=PREFIX' to 'configure', the package uses -PREFIX as the prefix for installing programs and libraries. -Documentation and other data files still use the regular prefix. - - In addition, if you use an unusual directory layout you can give -options like '--bindir=DIR' to specify different values for particular -kinds of files. Run 'configure --help' for a list of the directories -you can set and what kinds of files go in them. In general, the default -for these options is expressed in terms of '${prefix}', so that -specifying just '--prefix' will affect all of the other directory -specifications that were not explicitly provided. - - The most portable way to affect installation locations is to pass the -correct locations to 'configure'; however, many packages provide one or -both of the following shortcuts of passing variable assignments to the -'make install' command line to change installation locations without -having to reconfigure or recompile. - - The first method involves providing an override variable for each -affected directory. 
For example, 'make install -prefix=/alternate/directory' will choose an alternate location for all -directory configuration variables that were expressed in terms of -'${prefix}'. Any directories that were specified during 'configure', -but not in terms of '${prefix}', must each be overridden at install time -for the entire installation to be relocated. The approach of makefile -variable overrides for each directory variable is required by the GNU -Coding Standards, and ideally causes no recompilation. However, some -platforms have known limitations with the semantics of shared libraries -that end up requiring recompilation when using this method, particularly -noticeable in packages that use GNU Libtool. - - The second method involves providing the 'DESTDIR' variable. For -example, 'make install DESTDIR=/alternate/directory' will prepend -'/alternate/directory' before all installation names. The approach of -'DESTDIR' overrides is not required by the GNU Coding Standards, and -does not work on platforms that have drive letters. On the other hand, -it does better at avoiding recompilation issues, and works well even -when some directory options were not specified in terms of '${prefix}' -at 'configure' time. - -Optional Features -================= - - If the package supports it, you can cause programs to be installed -with an extra prefix or suffix on their names by giving 'configure' the -option '--program-prefix=PREFIX' or '--program-suffix=SUFFIX'. - - Some packages pay attention to '--enable-FEATURE' options to -'configure', where FEATURE indicates an optional part of the package. -They may also pay attention to '--with-PACKAGE' options, where PACKAGE -is something like 'gnu-as' or 'x' (for the X Window System). The -'README' should mention any '--enable-' and '--with-' options that the -package recognizes. 
- - For packages that use the X Window System, 'configure' can usually -find the X include and library files automatically, but if it doesn't, -you can use the 'configure' options '--x-includes=DIR' and -'--x-libraries=DIR' to specify their locations. - - Some packages offer the ability to configure how verbose the -execution of 'make' will be. For these packages, running './configure ---enable-silent-rules' sets the default to minimal output, which can be -overridden with 'make V=1'; while running './configure ---disable-silent-rules' sets the default to verbose, which can be -overridden with 'make V=0'. - -Particular systems -================== - - On HP-UX, the default C compiler is not ANSI C compatible. If GNU CC -is not installed, it is recommended to use the following options in -order to use an ANSI C compiler: - - ./configure CC="cc -Ae -D_XOPEN_SOURCE=500" - -and if that doesn't work, install pre-built binaries of GCC for HP-UX. - - HP-UX 'make' updates targets which have the same time stamps as their -prerequisites, which makes it generally unusable when shipped generated -files such as 'configure' are involved. Use GNU 'make' instead. - - On OSF/1 a.k.a. Tru64, some versions of the default C compiler cannot -parse its '<wchar.h>' header file. The option '-nodtk' can be used as a -workaround. If GNU CC is not installed, it is therefore recommended to -try - - ./configure CC="cc" - -and if that doesn't work, try - - ./configure CC="cc -nodtk" - - On Solaris, don't put '/usr/ucb' early in your 'PATH'. This -directory contains several dysfunctional programs; working variants of -these programs are available in '/usr/bin'. So, if you need '/usr/ucb' -in your 'PATH', put it _after_ '/usr/bin'. - - On Haiku, software installed for all users goes in '/boot/common', -not '/usr/local'. 
It is recommended to use the following options: - - ./configure --prefix=/boot/common - -Specifying the System Type -========================== - - There may be some features 'configure' cannot figure out -automatically, but needs to determine by the type of machine the package -will run on. Usually, assuming the package is built to be run on the -_same_ architectures, 'configure' can figure that out, but if it prints -a message saying it cannot guess the machine type, give it the -'--build=TYPE' option. TYPE can either be a short name for the system -type, such as 'sun4', or a canonical name which has the form: - - CPU-COMPANY-SYSTEM - -where SYSTEM can have one of these forms: - - OS - KERNEL-OS - - See the file 'config.sub' for the possible values of each field. If -'config.sub' isn't included in this package, then this package doesn't -need to know the machine type. - - If you are _building_ compiler tools for cross-compiling, you should -use the option '--target=TYPE' to select the type of system they will -produce code for. - - If you want to _use_ a cross compiler, that generates code for a -platform different from the build platform, you should specify the -"host" platform (i.e., that on which the generated programs will -eventually be run) with '--host=TYPE'. - -Sharing Defaults -================ - - If you want to set default values for 'configure' scripts to share, -you can create a site shell script called 'config.site' that gives -default values for variables like 'CC', 'cache_file', and 'prefix'. -'configure' looks for 'PREFIX/share/config.site' if it exists, then -'PREFIX/etc/config.site' if it exists. Or, you can set the -'CONFIG_SITE' environment variable to the location of the site script. -A warning: not all 'configure' scripts look for a site script. - -Defining Variables -================== - - Variables not defined in a site shell script can be set in the -environment passed to 'configure'. 
However, some packages may run -configure again during the build, and the customized values of these -variables may be lost. In order to avoid this problem, you should set -them in the 'configure' command line, using 'VAR=value'. For example: - - ./configure CC=/usr/local2/bin/gcc - -causes the specified 'gcc' to be used as the C compiler (unless it is -overridden in the site shell script). - -Unfortunately, this technique does not work for 'CONFIG_SHELL' due to an -Autoconf limitation. Until the limitation is lifted, you can use this -workaround: - - CONFIG_SHELL=/bin/bash ./configure CONFIG_SHELL=/bin/bash - -'configure' Invocation -====================== - - 'configure' recognizes the following options to control how it -operates. - -'--help' -'-h' - Print a summary of all of the options to 'configure', and exit. - -'--help=short' -'--help=recursive' - Print a summary of the options unique to this package's - 'configure', and exit. The 'short' variant lists options used only - in the top level, while the 'recursive' variant lists options also - present in any nested packages. - -'--version' -'-V' - Print the version of Autoconf used to generate the 'configure' - script, and exit. - -'--cache-file=FILE' - Enable the cache: use and save the results of the tests in FILE, - traditionally 'config.cache'. FILE defaults to '/dev/null' to - disable caching. - -'--config-cache' -'-C' - Alias for '--cache-file=config.cache'. - -'--quiet' -'--silent' -'-q' - Do not print messages saying which checks are being made. To - suppress all normal output, redirect it to '/dev/null' (any error - messages will still be shown). - -'--srcdir=DIR' - Look for the package's source code in directory DIR. Usually - 'configure' can determine that directory automatically. - -'--prefix=DIR' - Use DIR as the installation prefix. *note Installation Names:: for - more details, including other options available for fine-tuning the - installation locations. 
- -'--no-create' -'-n' - Run the configure checks, but stop before creating any output - files. - -'configure' also accepts some other, not widely useful, options. Run -'configure --help' for more details. diff --git a/pcre2/LICENCE b/pcre2/LICENCE deleted file mode 100644 index 155d07312..000000000 --- a/pcre2/LICENCE +++ /dev/null @@ -1,94 +0,0 @@ -PCRE2 LICENCE -------------- - -PCRE2 is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. - -Releases 10.00 and above of PCRE2 are distributed under the terms of the "BSD" -licence, as specified below, with one exemption for certain binary -redistributions. The documentation for PCRE2, supplied in the "doc" directory, -is distributed under the same terms as the software itself. The data in the -testdata directory is not copyrighted and is in the public domain. - -The basic library functions are written in C and are freestanding. Also -included in the distribution is a just-in-time compiler that can be used to -optimize pattern matching. This is an optional feature that can be omitted when -the library is built. - - -THE BASIC LIBRARY FUNCTIONS ---------------------------- - -Written by: Philip Hazel -Email local part: Philip.Hazel -Email domain: gmail.com - -University of Cambridge Computing Service, -Cambridge, England. - -Copyright (c) 1997-2020 University of Cambridge -All rights reserved. - - -PCRE2 JUST-IN-TIME COMPILATION SUPPORT --------------------------------------- - -Written by: Zoltan Herczeg -Email local part: hzmester -Email domain: freemail.hu - -Copyright(c) 2010-2020 Zoltan Herczeg -All rights reserved. - - -STACK-LESS JUST-IN-TIME COMPILER --------------------------------- - -Written by: Zoltan Herczeg -Email local part: hzmester -Email domain: freemail.hu - -Copyright(c) 2009-2020 Zoltan Herczeg -All rights reserved. 
- - -THE "BSD" LICENCE ------------------ - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notices, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notices, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of any - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - - -EXEMPTION FOR BINARY LIBRARY-LIKE PACKAGES ------------------------------------------- - -The second condition in the BSD licence (covering binary redistributions) does -not apply all the way down a chain of software. If binary package A includes -PCRE2, it must respect the condition, but if package B is software that -includes package A, the condition is not imposed on package B unless it uses -PCRE2 independently. 
- -End diff --git a/pcre2/NEWS b/pcre2/NEWS deleted file mode 100644 index de797e7b5..000000000 --- a/pcre2/NEWS +++ /dev/null @@ -1,347 +0,0 @@ -News about PCRE2 releases -------------------------- - - -Version 10.36 04-December-2020 ------------------------------- - -Again, mainly bug fixes and tidies. The only enhancements are the addition of -GNU grep's -m (aka --max-count) option to pcre2grep, and also unifying the -handling of substitution strings for both -O and callouts in pcre2grep, with -the addition of $x{...} and $o{...} to allow for characters whose code points -are greater than 255 in Unicode mode. - -NOTE: there is an outstanding issue with JIT support for MacOS on arm64 -hardware. For details, please see Bugzilla issue #2618. - - -Version 10.35 15-April-2020 ---------------------------- - -Bugfixes, tidies, and a few new enhancements. - -1. Capturing groups that contain recursive backreferences to themselves are no -longer automatically atomic, because the restriction is no longer necessary -as a result of the 10.30 restructuring. - -2. Several new options for pcre2_substitute(). - -3. When Unicode is supported and PCRE2_UCP is set without PCRE2_UTF, Unicode -character properties are used for upper/lower case computations on characters -whose code points are greater than 127. - -4. The character tables (for low-valued characters) can now more easily be -saved and restored in binary. - -5. Updated to Unicode 13.0.0. - - -Version 10.34 21-November-2019 ------------------------------- - -Another release with a few enhancements as well as bugfixes and tidies. The -main new features are: - -1. There is now some support for matching in invalid UTF strings. - -2. Non-atomic positive lookarounds are implemented in the pcre2_match() -interpreter, but not in JIT. - -3. Added two new functions: pcre2_get_match_data_size() and -pcre2_maketables_free(). - -4. Upgraded to Unicode 12.1.0. 
- - -Version 10.33 16-April-2019 ---------------------------- - -Yet more bugfixes, tidies, and a few enhancements, summarized here (see -ChangeLog for the full list): - -1. Callouts from pcre2_substitute() are now available. - -2. The POSIX functions are now all called pcre2_regcomp() etc., with wrapper -functions that use the standard POSIX names. However, in pcre2posix.h the POSIX -names are defined as macros. This should help avoid linking with the wrong -library in some environments, while still exporting the POSIX names for -pre-existing programs that use them. - -3. Some new options: - - (a) PCRE2_EXTRA_ESCAPED_CR_IS_LF makes \r behave as \n. - - (b) PCRE2_EXTRA_ALT_BSUX enables support for ECMAScript 6's \u{hh...} - construct. - - (c) PCRE2_COPY_MATCHED_SUBJECT causes a copy of a matched subject to be - made, instead of just remembering a pointer. - -4. Some new Perl features: - - (a) Perl 5.28's experimental alphabetic names for atomic groups and - lookaround assertions, for example, (*pla:...) and (*atomic:...). - - (b) The new Perl "script run" features (*script_run:...) and - (*atomic_script_run:...) aka (*sr:...) and (*asr:...). - - (c) When PCRE2_UTF is set, allow non-ASCII letters and decimal digits in - capture group names. - -5. --disable-percent-zt disables the use of %zu and %td in formatting strings -in pcre2test. They were already automatically disabled for VC and older C -compilers. - -6. Some changes related to callouts in pcre2grep: - - (a) Support for running an external program under VMS has been added, in - addition to Windows and fork() support. - - (b) --disable-pcre2grep-callout-fork restricts the callout support in - pcre2grep to the inbuilt echo facility. - - -Version 10.32 10-September-2018 -------------------------------- - -This is another mainly bugfix and tidying release with a few minor -enhancements. These are the main ones: - -1. 
pcre2grep now supports the inclusion of binary zeros in patterns that are -read from files via the -f option. - -2. ./configure now supports --enable-jit=auto, which automatically enables JIT -if the hardware supports it. - -3. In pcre2_dfa_match(), internal recursive calls no longer use the stack for -local workspace and local ovectors. Instead, an initial block of stack is -reserved, but if this is insufficient, heap memory is used. The heap limit -parameter now applies to pcre2_dfa_match(). - -4. Updated to Unicode version 11.0.0. - -5. (*ACCEPT:ARG), (*FAIL:ARG), and (*COMMIT:ARG) are now supported. - -6. Added support for \N{U+dddd}, but only in Unicode mode. - -7. Added support for (?^) to unset all imnsx options. - - -Version 10.31 12-February-2018 ------------------------------- - -This is mainly a bugfix and tidying release (see ChangeLog for full details). -However, there are some minor enhancements. - -1. New pcre2_config() options: PCRE2_CONFIG_NEVER_BACKSLASH_C and -PCRE2_CONFIG_COMPILED_WIDTHS. - -2. New pcre2_pattern_info() option PCRE2_INFO_EXTRAOPTIONS to retrieve the -extra compile time options. - -3. There are now public names for all the pcre2_compile() error numbers. - -4. Added PCRE2_CALLOUT_STARTMATCH and PCRE2_CALLOUT_BACKTRACK bits to a new -field callout_flags in callout blocks. - - -Version 10.30 14-August-2017 ----------------------------- - -The full list of changes that includes bugfixes and tidies is, as always, in -ChangeLog. These are the most important new features: - -1. The main interpreter, pcre2_match(), has been refactored into a new version -that does not use recursive function calls (and therefore the system stack) for -remembering backtracking positions. This makes --disable-stack-for-recursion a -NOOP. The new implementation allows backtracking into recursive group calls in -patterns, making it more compatible with Perl, and also fixes some other -previously hard-to-do issues. 
For patterns that have a lot of backtracking, the -heap is now used, and there is an explicit limit on the amount, settable by -pcre2_set_heap_limit() or (*LIMIT_HEAP=xxx). The "recursion limit" is retained, -but is renamed as "depth limit" (though the old names remain for -compatibility). - -There is also a change in the way callouts from pcre2_match() are handled. The -offset_vector field in the callout block is no longer a pointer to the -actual ovector that was passed to the matching function in the match data -block. Instead it points to an internal ovector of a size large enough to hold -all possible captured substrings in the pattern. - -2. The new option PCRE2_ENDANCHORED insists that a pattern match must end at -the end of the subject. - -3. The new option PCRE2_EXTENDED_MORE implements Perl's /xx feature, and -pcre2test is upgraded to support it. Setting within the pattern by (?xx) is -also supported. - -4. (?n) can be used to set PCRE2_NO_AUTO_CAPTURE, because Perl now has this. - -5. Additional compile options in the compile context are now available, and the -first two are: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES and -PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL. - -6. The newline type PCRE2_NEWLINE_NUL is now available. - -7. The match limit value now also applies to pcre2_dfa_match() as there are -patterns that can use up a lot of resources without necessarily recursing very -deeply. - -8. The option REG_PEND (a GNU extension) is now available for the POSIX -wrapper. Also there is a new option PCRE2_LITERAL which is used to support -REG_NOSPEC. - -9. PCRE2_EXTRA_MATCH_LINE and PCRE2_EXTRA_MATCH_WORD are implemented for the -benefit of pcre2grep, and pcre2grep's -F, -w, and -x options are re-implemented -using PCRE2_LITERAL, PCRE2_EXTRA_MATCH_WORD, and PCRE2_EXTRA_MATCH_LINE. This -is tidier and also fixes some bugs. - -10. The Unicode tables are upgraded from Unicode 8.0.0 to Unicode 10.0.0. - -11. 
There are some experimental functions for converting foreign patterns -(globs and POSIX patterns) into PCRE2 patterns. - - -Version 10.23 14-February-2017 ------------------------------- - -1. ChangeLog has the details of a lot of bug fixes and tidies. - -2. There has been a major re-factoring of the pcre2_compile.c file. Most syntax -checking is now done in the pre-pass that identifies capturing groups. This has -reduced the amount of duplication and made the code tidier. While doing this, -some minor bugs and Perl incompatibilities were fixed (see ChangeLog for -details.) - -3. Back references are now permitted in lookbehind assertions when there are -no duplicated group numbers (that is, (?| has not been used), and, if the -reference is by name, there is only one group of that name. The referenced -group must, of course, be of fixed length. - -4. \g{+<number>} (e.g. \g{+2} ) is now supported. It is a "forward back -reference" and can be useful in repetitions (compare \g{-<number>} ). Perl does -not recognize this syntax. - -5. pcre2grep now automatically expands its buffer up to a maximum set by ---max-buffer-size. - -6. The -t option (grand total) has been added to pcre2grep. - -7. A new function called pcre2_code_copy_with_tables() exists to copy a -compiled pattern along with a private copy of the character tables that it -uses. - -8. A user supplied a number of patches to upgrade pcre2grep under Windows and -tidy the code. - -9. Several updates have been made to pcre2test and test scripts (see -ChangeLog). - - -Version 10.22 29-July-2016 --------------------------- - -1. ChangeLog has the details of a number of bug fixes. - -2. The POSIX wrapper function regcomp() previously did not support back references -and subroutine calls if called with the REG_NOSUB option. It now does. - -3. A new function, pcre2_code_copy(), is added, to make a copy of a compiled -pattern. - -4. Support for string callouts is added to pcre2grep. - -5. Added the PCRE2_NO_JIT option to pcre2_match(). 
- -6. The pcre2_get_error_message() function now returns with a negative error -code if the error number it is given is unknown. - -7. Several updates have been made to pcre2test and test scripts (see -ChangeLog). - - -Version 10.21 12-January-2016 ------------------------------ - -1. Many bugs have been fixed. A large number of them were provoked only by very -strange pattern input, and were discovered by fuzzers. Some others were -discovered by code auditing. See ChangeLog for details. - -2. The Unicode tables have been updated to Unicode version 8.0.0. - -3. For Perl compatibility in EBCDIC environments, ranges such as a-z in a -class, where both values are literal letters in the same case, omit the -non-letter EBCDIC code points within the range. - -4. There have been a number of enhancements to the pcre2_substitute() function, -giving more flexibility to replacement facilities. It is now also possible to -cause the function to return the needed buffer size if the one given is too -small. - -5. The PCRE2_ALT_VERBNAMES option causes the "name" parts of special verbs such -as (*THEN:name) to be processed for backslashes and to take note of -PCRE2_EXTENDED. - -6. PCRE2_INFO_HASBACKSLASHC makes it possible for a client to find out if a -pattern uses \C, and --never-backslash-C makes it possible to compile a version -of PCRE2 in which the use of \C is always forbidden. - -7. A limit to the length of pattern that can be handled can now be set by -calling pcre2_set_max_pattern_length(). - -8. When matching an unanchored pattern, a match can be required to begin within -a given number of code units after the start of the subject by calling -pcre2_set_offset_limit(). - -9. The pcre2test program has been extended to test new facilities, and it can -now run the tests when LF on its own is not a valid newline sequence. - -10. The RunTest script has also been updated to enable more tests to be run. - -11. There have been some minor performance enhancements. 
- - -Version 10.20 30-June-2015 --------------------------- - -1. Callouts with string arguments and the pcre2_callout_enumerate() function -have been implemented. - -2. The PCRE2_NEVER_BACKSLASH_C option, which locks out the use of \C, is added. - -3. The PCRE2_ALT_CIRCUMFLEX option lets ^ match after a newline at the end of a -subject in multiline mode. - -4. The way named subpatterns are handled has been refactored. The previous -approach had several bugs. - -5. The handling of \c in EBCDIC environments has been changed to conform to the -perlebcdic document. This is an incompatible change. - -6. Bugs have been mended, many of them discovered by fuzzers. - - -Version 10.10 06-March-2015 ---------------------------- - -1. Serialization and de-serialization functions have been added to the API, -making it possible to save and restore sets of compiled patterns, though -restoration must be done in the same environment that was used for compilation. - -2. The (*NO_JIT) feature has been added; this makes it possible for a pattern -creator to specify that JIT is not to be used. - -3. A number of bugs have been fixed. In particular, bugs that caused building -on Windows using CMake to fail have been mended. - - -Version 10.00 05-January-2015 ------------------------------ - -Version 10.00 is the first release of PCRE2, a revised API for the PCRE -library. Changes prior to 10.00 are logged in the ChangeLog file for the old -API, up to item 20 for release 8.36. New programs are recommended to use the -new library. Programs that use the original (PCRE1) API will need changing -before linking with the new library. 
- -**** diff --git a/pcre2/NON-AUTOTOOLS-BUILD b/pcre2/NON-AUTOTOOLS-BUILD deleted file mode 100644 index a73c058bb..000000000 --- a/pcre2/NON-AUTOTOOLS-BUILD +++ /dev/null @@ -1,406 +0,0 @@ -Building PCRE2 without using autotools --------------------------------------- - -This document contains the following sections: - - General - Generic instructions for the PCRE2 C library - Stack size in Windows environments - Linking programs in Windows environments - Calling conventions in Windows environments - Comments about Win32 builds - Building PCRE2 on Windows with CMake - Building PCRE2 on Windows with Visual Studio - Testing with RunTest.bat - Building PCRE2 on native z/OS and z/VM - - -GENERAL - -The basic PCRE2 library consists entirely of code written in Standard C, and so -should compile successfully on any system that has a Standard C compiler and -library. - -The PCRE2 distribution includes a "configure" file for use by the -configure/make (autotools) build system, as found in many Unix-like -environments. The README file contains information about the options for -"configure". - -There is also support for CMake, which some users prefer, especially in Windows -environments, though it can also be run in Unix-like environments. See the -section entitled "Building PCRE2 on Windows with CMake" below. - -Versions of src/config.h and src/pcre2.h are distributed in the PCRE2 tarballs -under the names src/config.h.generic and src/pcre2.h.generic. These are -provided for those who build PCRE2 without using "configure" or CMake. If you -use "configure" or CMake, the .generic versions are not used. - - -GENERIC INSTRUCTIONS FOR THE PCRE2 C LIBRARY - -The following are generic instructions for building the PCRE2 C library "by -hand". If you are going to use CMake, this section does not apply to you; you -can skip ahead to the CMake section. 
- - (1) Copy or rename the file src/config.h.generic as src/config.h, and edit the - macro settings that it contains to whatever is appropriate for your - environment. In particular, you can alter the definition of the NEWLINE - macro to specify what character(s) you want to be interpreted as line - terminators by default. - - When you subsequently compile any of the PCRE2 modules, you must specify - -DHAVE_CONFIG_H to your compiler so that src/config.h is included in the - sources. - - An alternative approach is not to edit src/config.h, but to use -D on the - compiler command line to make any changes that you need to the - configuration options. In this case -DHAVE_CONFIG_H must not be set. - - NOTE: There have been occasions when the way in which certain parameters - in src/config.h are used has changed between releases. (In the - configure/make world, this is handled automatically.) When upgrading to a - new release, you are strongly advised to review src/config.h.generic - before re-using what you had previously. - - Note also that the src/config.h.generic file is created from a config.h - that was generated by Autotools, which automatically includes settings of - a number of macros that are not actually used by PCRE2 (for example, - HAVE_MEMORY_H). - - (2) Copy or rename the file src/pcre2.h.generic as src/pcre2.h. - - (3) EITHER: - Copy or rename file src/pcre2_chartables.c.dist as - src/pcre2_chartables.c. - - OR: - Compile src/pcre2_dftables.c as a stand-alone program (using - -DHAVE_CONFIG_H if you have set up src/config.h), and then run it with - the single argument "src/pcre2_chartables.c". This generates a set of - standard character tables and writes them to that file. The tables are - generated using the default C locale for your system. If you want to use - a locale that is specified by LC_xxx environment variables, add the -L - option to the pcre2_dftables command. You must use this method if you - are building on a system that uses EBCDIC code. 
- - The tables in src/pcre2_chartables.c are defaults. The caller of PCRE2 can - specify alternative tables at run time. - - (4) For an 8-bit library, compile the following source files from the src - directory, setting -DPCRE2_CODE_UNIT_WIDTH=8 as a compiler option. Also - set -DHAVE_CONFIG_H if you have set up src/config.h with your - configuration, or else use other -D settings to change the configuration - as required. - - pcre2_auto_possess.c - pcre2_chartables.c - pcre2_compile.c - pcre2_config.c - pcre2_context.c - pcre2_convert.c - pcre2_dfa_match.c - pcre2_error.c - pcre2_extuni.c - pcre2_find_bracket.c - pcre2_jit_compile.c - pcre2_maketables.c - pcre2_match.c - pcre2_match_data.c - pcre2_newline.c - pcre2_ord2utf.c - pcre2_pattern_info.c - pcre2_script_run.c - pcre2_serialize.c - pcre2_string_utils.c - pcre2_study.c - pcre2_substitute.c - pcre2_substring.c - pcre2_tables.c - pcre2_ucd.c - pcre2_valid_utf.c - pcre2_xclass.c - - Make sure that you include -I. in the compiler command (or equivalent for - an unusual compiler) so that all included PCRE2 header files are first - sought in the src directory under the current directory. Otherwise you run - the risk of picking up a previously-installed file from somewhere else. - - Note that you must compile pcre2_jit_compile.c, even if you have not - defined SUPPORT_JIT in src/config.h, because when JIT support is not - configured, dummy functions are compiled. When JIT support IS configured, - pcre2_jit_compile.c #includes other files from the sljit subdirectory, - all of whose names begin with "sljit". It also #includes - src/pcre2_jit_match.c and src/pcre2_jit_misc.c, so you should not compile - these yourself. - - Note also that the pcre2_fuzzsupport.c file contains special code that is - useful to those who want to run fuzzing tests on the PCRE2 library. Unless - you are doing that, you can ignore it. 
- - (5) Now link all the compiled code into an object library in whichever form - your system keeps such libraries. This is the basic PCRE2 C 8-bit library. - If your system has static and shared libraries, you may have to do this - once for each type. - - (6) If you want to build a 16-bit library or 32-bit library (as well as, or - instead of the 8-bit library) just supply 16 or 32 as the value of - -DPCRE2_CODE_UNIT_WIDTH when you are compiling. - - (7) If you want to build the POSIX wrapper functions (which apply only to the - 8-bit library), ensure that you have the src/pcre2posix.h file and then - compile src/pcre2posix.c. Link the result (on its own) as the pcre2posix - library. - - (8) The pcre2test program can be linked with any combination of the 8-bit, - 16-bit and 32-bit libraries (depending on what you selected in - src/config.h). Compile src/pcre2test.c; don't forget -DHAVE_CONFIG_H if - necessary, but do NOT define PCRE2_CODE_UNIT_WIDTH. Then link with the - appropriate library/ies. If you compiled an 8-bit library, pcre2test also - needs the pcre2posix wrapper library. - - (9) Run pcre2test on the testinput files in the testdata directory, and check - that the output matches the corresponding testoutput files. There are - comments about what each test does in the section entitled "Testing PCRE2" - in the README file. If you compiled more than one of the 8-bit, 16-bit and - 32-bit libraries, you need to run pcre2test with the -16 option to do - 16-bit tests and with the -32 option to do 32-bit tests. - - Some tests are relevant only when certain build-time options are selected. - For example, test 4 is for Unicode support, and will not run if you have - built PCRE2 without it. See the comments at the start of each testinput - file. If you have a suitable Unix-like shell, the RunTest script will run - the appropriate tests for you. The command "RunTest list" will output a - list of all the tests. 
- - Note that the supplied files are in Unix format, with just LF characters - as line terminators. You may need to edit them to change this if your - system uses a different convention. - -(10) If you have built PCRE2 with SUPPORT_JIT, the JIT features can be tested - by running pcre2test with the -jit option. This is done automatically by - the RunTest script. You might also like to build and run the freestanding - JIT test program, src/pcre2_jit_test.c. - -(11) If you want to use the pcre2grep command, compile and link - src/pcre2grep.c; it uses only the basic 8-bit PCRE2 library (it does not - need the pcre2posix library). If you have built the PCRE2 library with JIT - support by defining SUPPORT_JIT in src/config.h, you can also define - SUPPORT_PCRE2GREP_JIT, which causes pcre2grep to make use of JIT (unless - it is run with --no-jit). If you define SUPPORT_PCRE2GREP_JIT without - defining SUPPORT_JIT, pcre2grep does not try to make use of JIT. - - -STACK SIZE IN WINDOWS ENVIRONMENTS - -Prior to release 10.30 the default system stack size of 1MiB in some Windows -environments caused issues with some tests. This should no longer be the case -for 10.30 and later releases. - - -LINKING PROGRAMS IN WINDOWS ENVIRONMENTS - -If you want to statically link a program against a PCRE2 library in the form of -a non-dll .a file, you must define PCRE2_STATIC before including src/pcre2.h. - - -CALLING CONVENTIONS IN WINDOWS ENVIRONMENTS - -It is possible to compile programs to use different calling conventions using -MSVC. Search the web for "calling conventions" for more information. To make it -easier to change the calling convention for the exported functions in the -PCRE2 library, the macro PCRE2_CALL_CONVENTION is present in all the external -definitions. It can be set externally when compiling (e.g. in CFLAGS). If it is -not set, it defaults to empty; the default calling convention is then used -(which is what is wanted most of the time). 
- - -COMMENTS ABOUT WIN32 BUILDS (see also "BUILDING PCRE2 ON WINDOWS WITH CMAKE") - -There are two ways of building PCRE2 using the "configure, make, make install" -paradigm on Windows systems: using MinGW or using Cygwin. These are not at all -the same thing; they are completely different from each other. There is also -support for building using CMake, which some users find a more straightforward -way of building PCRE2 under Windows. - -The MinGW home page (http://www.mingw.org/) says this: - - MinGW: A collection of freely available and freely distributable Windows - specific header files and import libraries combined with GNU toolsets that - allow one to produce native Windows programs that do not rely on any - 3rd-party C runtime DLLs. - -The Cygwin home page (http://www.cygwin.com/) says this: - - Cygwin is a Linux-like environment for Windows. It consists of two parts: - - . A DLL (cygwin1.dll) which acts as a Linux API emulation layer providing - substantial Linux API functionality - - . A collection of tools which provide Linux look and feel. - -On both MinGW and Cygwin, PCRE2 should build correctly using: - - ./configure && make && make install - -This should create two libraries called libpcre2-8 and libpcre2-posix. These -are independent libraries: when you link with libpcre2-posix you must also link -with libpcre2-8, which contains the basic functions. - -Using Cygwin's compiler generates libraries and executables that depend on -cygwin1.dll. If a library that is generated this way is distributed, -cygwin1.dll has to be distributed as well. Since cygwin1.dll is under the GPL -licence, this forces not only PCRE2 to be under the GPL, but also the entire -application. A distributor who wants to keep their own code proprietary must -purchase an appropriate Cygwin licence. - -MinGW has no such restrictions. The MinGW compiler generates a library or -executable that can run standalone on Windows without any third party dll or -licensing issues. 
- -But there is more complication: - -If a Cygwin user uses the -mno-cygwin Cygwin gcc flag, what that really does is -to tell Cygwin's gcc to use the MinGW gcc. Cygwin's gcc is only acting as a -front end to MinGW's gcc (if you install Cygwin's gcc, you get both Cygwin's -gcc and MinGW's gcc). So, a user can: - -. Build native binaries by using MinGW or by getting Cygwin and using - -mno-cygwin. - -. Build binaries that depend on cygwin1.dll by using Cygwin with the normal - compiler flags. - -The test files that are supplied with PCRE2 are in UNIX format, with LF -characters as line terminators. Unless your PCRE2 library uses a default -newline option that includes LF as a valid newline, it may be necessary to -change the line terminators in the test files to get some of the tests to work. - - -BUILDING PCRE2 ON WINDOWS WITH CMAKE - -CMake is an alternative configuration facility that can be used instead of -"configure". CMake creates project files (make files, solution files, etc.) -tailored to numerous development environments, including Visual Studio, -Borland, Msys, MinGW, NMake, and Unix. If possible, use short paths with no -spaces in the names for your CMake installation and your PCRE2 source and build -directories. - -The following instructions were contributed by a PCRE1 user, but they should -also work for PCRE2. If they are not followed exactly, errors may occur. In the -event that errors do occur, it is recommended that you delete the CMake cache -before attempting to repeat the CMake build process. In the CMake GUI, the -cache can be deleted by selecting "File > Delete Cache". - -1. Install the latest CMake version available from http://www.cmake.org/, and - ensure that cmake\bin is on your path. - -2. Unzip (retaining folder structure) the PCRE2 source tree into a source - directory such as C:\pcre2. You should ensure your local date and time - is not earlier than the file dates in your source dir if the release is - very new. - -3. 
Create a new, empty build directory, preferably a subdirectory of the - source dir. For example, C:\pcre2\pcre2-xx\build. - -4. Run cmake-gui from the Shell environment of your build tool, for example, - Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. Do not try - to start CMake from the Windows Start menu, as this can lead to errors. - -5. Enter C:\pcre2\pcre2-xx and C:\pcre2\pcre2-xx\build for the source and - build directories, respectively. - -6. Hit the "Configure" button. - -7. Select the particular IDE / build tool that you are using (Visual - Studio, MSYS makefiles, MinGW makefiles, etc.) - -8. The GUI will then list several configuration options. This is where - you can disable Unicode support or select other PCRE2 optional features. - -9. Hit "Configure" again. The adjacent "Generate" button should now be - active. - -10. Hit "Generate". - -11. The build directory should now contain a usable build system, be it a - solution file for Visual Studio, makefiles for MinGW, etc. Exit from - cmake-gui and use the generated build system with your compiler or IDE. - E.g., for MinGW you can run "make", or for Visual Studio, open the PCRE2 - solution, select the desired configuration (Debug, or Release, etc.) and - build the ALL_BUILD project. - -12. If during configuration with cmake-gui you've elected to build the test - programs, you can execute them by building the test project. E.g., for - MinGW: "make test"; for Visual Studio build the RUN_TESTS project. The - most recent build configuration is targeted by the tests. A summary of - test results is presented. Complete test output is subsequently - available for review in Testing\Temporary under your build dir. - - -BUILDING PCRE2 ON WINDOWS WITH VISUAL STUDIO - -The code currently cannot be compiled without a stdint.h header, which is -available only in relatively recent versions of Visual Studio.
However, this -portable and permissively-licensed implementation of the header worked without -issue: - - http://www.azillionmonkeys.com/qed/pstdint.h - -Just rename it and drop it into the top level of the build tree. - - -TESTING WITH RUNTEST.BAT - -If configured with CMake, building the test project ("make test" or building -ALL_TESTS in Visual Studio) creates (and runs) pcre2_test.bat (and depending -on your configuration options, possibly other test programs) in the build -directory. The pcre2_test.bat script runs RunTest.bat with correct source and -exe paths. - -For manual testing with RunTest.bat, provided the build dir is a subdirectory -of the source directory: Open a command shell window. Chdir to the location -of your pcre2test.exe and pcre2grep.exe programs. Call RunTest.bat with -"..\RunTest.Bat" or "..\..\RunTest.bat" as appropriate. - -To run only a particular test with RunTest.Bat provide a test number argument. - -Otherwise: - -1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe - have been created. - -2. Edit RunTest.bat to identify the full or relative location of - the pcre2 source (in which the testdata folder resides), e.g.: - - set srcdir=C:\pcre2\pcre2-10.00 - -3. In a Windows command environment, chdir to the location of your bat and - exe programs. - -4. Run RunTest.bat. Test outputs will automatically be compared to expected - results, and discrepancies will be identified in the console output. - -To independently test the just-in-time compiler, run pcre2_jit_test.exe. - - -BUILDING PCRE2 ON NATIVE Z/OS AND Z/VM - -z/OS and z/VM are operating systems for mainframe computers, produced by IBM. -The character code used is EBCDIC, not ASCII or Unicode.
In z/OS, UNIX APIs and -applications can be supported through UNIX System Services, and in such an -environment it should be possible to build PCRE2 in the same way as in other -systems, with the EBCDIC related configuration settings, but it is not known if -anybody has tried this. - -In native z/OS (without UNIX System Services) and in z/VM, special ports are -required. For details, please see file 939 on this web site: - - http://www.cbttape.org - -Everything in that location, source and executable, is in EBCDIC and native -z/OS file formats. The port provides an API for LE languages such as COBOL and -for the z/OS and z/VM versions of the Rexx languages. - -============================== -Last Updated: 14 November 2018 -============================== diff --git a/pcre2/PrepareRelease b/pcre2/PrepareRelease deleted file mode 100755 index e7cf8db8b..000000000 --- a/pcre2/PrepareRelease +++ /dev/null @@ -1,236 +0,0 @@ -#/bin/sh - -# Script to prepare the files for building a PCRE2 release. It does some -# processing of the documentation, detrails files, and creates pcre2.h.generic -# and config.h.generic (for use by builders who can't run ./configure). - -# You must run this script before running "make dist". If its first argument -# is "doc", it stops after preparing the documentation. There are no other -# arguments. The script makes use of the following files: - -# 132html A Perl script that converts a .1 or .3 man page into HTML. It -# "knows" the relevant troff constructs that are used in the PCRE2 -# man pages. - -# CheckMan A Perl script that checks man pages for typos in the mark up. - -# CleanTxt A Perl script that cleans up the output of "nroff -man" by -# removing backspaces and other redundant text so as to produce -# a readable .txt file. - -# Detrail A Perl script that removes trailing spaces from files. - -# doc/index.html.src -# A file that is copied as index.html into the doc/html directory -# when the HTML documentation is built.
It works like this so that -# doc/html can be deleted and re-created from scratch. - -# README & NON-AUTOTOOLS-BUILD -# These files are copied into the doc/html directory, with .txt -# extensions so that they can be hyperlinked from the HTML -# documentation, because some people just go to the HTML without -# looking for text files. - - -# First, sort out the documentation. Remove pcre2demo.3 first because it won't -# pass the markup check (it is created below, using markup that none of the -# other pages use). - -cd doc -echo Processing documentation - -/bin/rm -f pcre2demo.3 - -# Check the remaining man pages - -perl ../CheckMan *.1 *.3 -if [ $? != 0 ] ; then exit 1; fi - -# Make Text form of the documentation. It needs some mangling to make it -# tidy for online reading. Concatenate all the .3 stuff, but omit the -# individual function pages. - -cat <pcre2.txt ------------------------------------------------------------------------------ -This file contains a concatenation of the PCRE2 man pages, converted to plain -text format for ease of searching with a text editor, or for use on systems -that do not have a man page processor. The small individual files that give -synopses of each function in the library have not been included. Neither has -the pcre2demo program. There are separate text files for the pcre2grep and -pcre2test commands.
------------------------------------------------------------------------------ - - -End - -echo "Making pcre2.txt" -for file in pcre2 pcre2api pcre2build pcre2callout pcre2compat pcre2jit \ - pcre2limits pcre2matching pcre2partial pcre2pattern pcre2perform \ - pcre2posix pcre2sample pcre2serialize pcre2syntax \ - pcre2unicode ; do - echo " Processing $file.3" - nroff -c -man $file.3 >$file.rawtxt - perl ../CleanTxt <$file.rawtxt >>pcre2.txt - /bin/rm $file.rawtxt - echo "------------------------------------------------------------------------------" >>pcre2.txt - if [ "$file" != "pcre2sample" ] ; then - echo " " >>pcre2.txt - echo " " >>pcre2.txt - fi -done - -# The three commands -for file in pcre2test pcre2grep pcre2-config ; do - echo Making $file.txt - nroff -c -man $file.1 >$file.rawtxt - perl ../CleanTxt <$file.rawtxt >$file.txt - /bin/rm $file.rawtxt -done - - -# Make pcre2demo.3 from the pcre2demo.c source file - -echo "Making pcre2demo.3" -perl <<"END" >pcre2demo.3 - open(IN, "../src/pcre2demo.c") || die "Failed to open src/pcre2demo.c\n"; - open(OUT, ">pcre2demo.3") || die "Failed to open pcre2demo.3\n"; - print OUT ".\\\" Start example.\n" . - ".de EX\n" . - ". nr mE \\\\n(.f\n" . - ". nf\n" . - ". nh\n" . - ". ft CW\n" . - "..\n" . - ".\n" . - ".\n" . - ".\\\" End example.\n" . - ".de EE\n" . - ". ft \\\\n(mE\n" . - ". fi\n" . - ". hy \\\\n(HY\n" . - "..\n" . - ".\n" . - ".EX\n" ; - while () - { - s/\\/\\e/g; - print OUT; - } - print OUT ".EE\n"; - close(IN); - close(OUT); -END -if [ $? != 0 ] ; then exit 1; fi - - -# Make HTML form of the documentation. - -echo "Making HTML documentation" -/bin/rm html/* -cp index.html.src html/index.html -cp ../README html/README.txt -cp ../NON-AUTOTOOLS-BUILD html/NON-AUTOTOOLS-BUILD.txt - -for file in *.1 ; do - base=`basename $file .1` - echo " Making $base.html" - perl ../132html -toc $base <$file >html/$base.html -done - -# Exclude table of contents for function summaries. 
It seems that expr -# forces an anchored regex. Also exclude them for small pages that have -# only one section. - -for file in *.3 ; do - base=`basename $file .3` - toc=-toc - if [ `expr $base : '.*_'` -ne 0 ] ; then toc="" ; fi - if [ "$base" = "pcre2sample" ] || \ - [ "$base" = "pcre2compat" ] || \ - [ "$base" = "pcre2limits" ] || \ - [ "$base" = "pcre2unicode" ] ; then - toc="" - fi - echo " Making $base.html" - perl ../132html $toc $base <$file >html/$base.html - if [ $? != 0 ] ; then exit 1; fi -done - -# End of documentation processing; stop if only documentation required. - -cd .. -echo Documentation done -if [ "$1" = "doc" ] ; then exit; fi - -# These files are detrailed; do not detrail the test data because there may be -# significant trailing spaces. Do not detrail RunTest.bat, because it has CRLF -# line endings and the detrail script removes all trailing white space. The -# configure files are also omitted from the detrailing. - -files="\ - Makefile.am \ - configure.ac \ - README \ - LICENCE \ - COPYING \ - AUTHORS \ - NEWS \ - NON-AUTOTOOLS-BUILD \ - INSTALL \ - 132html \ - CleanTxt \ - Detrail \ - ChangeLog \ - CMakeLists.txt \ - RunGrepTest \ - RunTest \ - pcre2-config.in \ - perltest.sh \ - libpcre2-8.pc.in \ - libpcre2-16.pc.in \ - libpcre2-32.pc.in \ - libpcre2-posix.pc.in \ - src/pcre2_dftables.c \ - src/pcre2.h.in \ - src/pcre2_auto_possess.c \ - src/pcre2_compile.c \ - src/pcre2_config.c \ - src/pcre2_context.c \ - src/pcre2_convert.c \ - src/pcre2_dfa_match.c \ - src/pcre2_error.c \ - src/pcre2_extuni.c \ - src/pcre2_find_bracket.c \ - src/pcre2_internal.h \ - src/pcre2_intmodedep.h \ - src/pcre2_jit_compile.c \ - src/pcre2_jit_match.c \ - src/pcre2_jit_misc.c \ - src/pcre2_jit_test.c \ - src/pcre2_maketables.c \ - src/pcre2_match.c \ - src/pcre2_match_data.c \ - src/pcre2_newline.c \ - src/pcre2_ord2utf.c \ - src/pcre2_pattern_info.c \ - src/pcre2_printint.c \ - src/pcre2_string_utils.c \ - src/pcre2_study.c \ - src/pcre2_substring.c \ - 
src/pcre2_tables.c \ - src/pcre2_ucd.c \ - src/pcre2_ucp.h \ - src/pcre2_valid_utf.c \ - src/pcre2_xclass.c \ - src/pcre2demo.c \ - src/pcre2grep.c \ - src/pcre2posix.c \ - src/pcre2posix.h \ - src/pcre2test.c" - -echo Detrailing -perl ./Detrail $files doc/p* doc/html/* - -echo Done - -#End diff --git a/pcre2/README b/pcre2/README deleted file mode 100644 index 1d6df8f2c..000000000 --- a/pcre2/README +++ /dev/null @@ -1,906 +0,0 @@ -README file for PCRE2 (Perl-compatible regular expression library) ------------------------------------------------------------------- - -PCRE2 is a re-working of the original PCRE1 library to provide an entirely new -API. Since its initial release in 2015, there has been further development of -the code and it now differs from PCRE1 in more than just the API. There are new -features and the internals have been improved. The latest release of PCRE2 is -available in three alternative formats from: - -https://ftp.pcre.org/pub/pcre/pcre2-10.xx.tar.gz -https://ftp.pcre.org/pub/pcre/pcre2-10.xx.tar.bz2 -https://ftp.pcre.org/pub/pcre/pcre2-10.xx.tar.zip - -There is a mailing list for discussion about the development of PCRE (both the -original and new APIs) at pcre-dev@exim.org. You can access the archives and -subscribe or manage your subscription here: - - https://lists.exim.org/mailman/listinfo/pcre-dev - -Please read the NEWS file if you are upgrading from a previous release. The -contents of this README file are: - - The PCRE2 APIs - Documentation for PCRE2 - Contributions by users of PCRE2 - Building PCRE2 on non-Unix-like systems - Building PCRE2 without using autotools - Building PCRE2 using autotools - Retrieving configuration information - Shared libraries - Cross-compiling using autotools - Making new tarballs - Testing PCRE2 - Character tables - File manifest - - -The PCRE2 APIs --------------- - -PCRE2 is written in C, and it has its own API. 
There are three sets of -functions, one for the 8-bit library, which processes strings of bytes, one for -the 16-bit library, which processes strings of 16-bit values, and one for the -32-bit library, which processes strings of 32-bit values. Unlike PCRE1, there -are no C++ wrappers. - -The distribution does contain a set of C wrapper functions for the 8-bit -library that are based on the POSIX regular expression API (see the pcre2posix -man page). These are built into a library called libpcre2-posix. Note that this -just provides a POSIX calling interface to PCRE2; the regular expressions -themselves still follow Perl syntax and semantics. The POSIX API is restricted, -and does not give full access to all of PCRE2's facilities. - -The header file for the POSIX-style functions is called pcre2posix.h. The -official POSIX name is regex.h, but I did not want to risk possible problems -with existing files of that name by distributing it that way. To use PCRE2 with -an existing program that uses the POSIX API, pcre2posix.h will have to be -renamed or pointed at by a link (or the program modified, of course). See the -pcre2posix documentation for more details. - - -Documentation for PCRE2 ------------------------ - -If you install PCRE2 in the normal way on a Unix-like system, you will end up -with a set of man pages whose names all start with "pcre2". The one that is -just called "pcre2" lists all the others. In addition to these man pages, the -PCRE2 documentation is supplied in two other forms: - - 1. There are files called doc/pcre2.txt, doc/pcre2grep.txt, and - doc/pcre2test.txt in the source distribution. The first of these is a - concatenation of the text forms of all the section 3 man pages except the - listing of pcre2demo.c and those that summarize individual functions. The - other two are the text forms of the section 1 man pages for the pcre2grep - and pcre2test commands. These text forms are provided for ease of scanning - with text editors or similar tools. 
They are installed in - <prefix>/share/doc/pcre2, where <prefix> is the installation prefix - (defaulting to /usr/local). - - 2. A set of files containing all the documentation in HTML form, hyperlinked - in various ways, and rooted in a file called index.html, is distributed in - doc/html and installed in <prefix>/share/doc/pcre2/html. - - -Building PCRE2 on non-Unix-like systems ---------------------------------------- - -For a non-Unix-like system, please read the file NON-AUTOTOOLS-BUILD, though if -your system supports the use of "configure" and "make" you may be able to build -PCRE2 using autotools in the same way as for many Unix-like systems. - -PCRE2 can also be configured using CMake, which can be run in various ways -(command line, GUI, etc). This creates Makefiles, solution files, etc. The file -NON-AUTOTOOLS-BUILD has information about CMake. - -PCRE2 has been compiled on many different operating systems. It should be -straightforward to build PCRE2 on any system that has a Standard C compiler and -library, because it uses only Standard C functions. - - -Building PCRE2 without using autotools --------------------------------------- - -The use of autotools (in particular, libtool) is problematic in some -environments, even some that are Unix or Unix-like. See the NON-AUTOTOOLS-BUILD -file for ways of building PCRE2 without using autotools. - - -Building PCRE2 using autotools ------------------------------- - -The following instructions assume the use of the widely used "configure; make; -make install" (autotools) process. - -To build PCRE2 on a system that supports autotools, first run the "configure" -command from the PCRE2 distribution directory, with your current directory set -to the directory where you want the files to be created. This command is a -standard GNU "autoconf" configuration script, for which generic instructions -are supplied in the file INSTALL.
- -Most commonly, people build PCRE2 within its own distribution directory, and in -this case, on many systems, just running "./configure" is sufficient. However, -the usual methods of changing standard defaults are available. For example: - -CFLAGS='-O2 -Wall' ./configure --prefix=/opt/local - -This command specifies that the C compiler should be run with the flags '-O2 --Wall' instead of the default, and that "make install" should install PCRE2 -under /opt/local instead of the default /usr/local. - -If you want to build in a different directory, just run "configure" with that -directory as current. For example, suppose you have unpacked the PCRE2 source -into /source/pcre2/pcre2-xxx, but you want to build it in -/build/pcre2/pcre2-xxx: - -cd /build/pcre2/pcre2-xxx -/source/pcre2/pcre2-xxx/configure - -PCRE2 is written in C and is normally compiled as a C library. However, it is -possible to build it as a C++ library, though the provided building apparatus -does not have any features to support this. - -There are some optional features that can be included or omitted from the PCRE2 -library. They are also documented in the pcre2build man page. - -. By default, both shared and static libraries are built. You can change this - by adding one of these options to the "configure" command: - - --disable-shared - --disable-static - - (See also "Shared libraries on Unix-like systems" below.) - -. By default, only the 8-bit library is built. If you add --enable-pcre2-16 to - the "configure" command, the 16-bit library is also built. If you add - --enable-pcre2-32 to the "configure" command, the 32-bit library is also - built. If you want only the 16-bit or 32-bit library, use --disable-pcre2-8 - to disable building the 8-bit library. - -. If you want to include support for just-in-time (JIT) compiling, which can - give large performance improvements on certain platforms, add --enable-jit to - the "configure" command. 
This support is available only for certain hardware - architectures. If you try to enable it on an unsupported architecture, there - will be a compile time error. If in doubt, use --enable-jit=auto, which - enables JIT only if the current hardware is supported. - -. If you are enabling JIT under SELinux environment you may also want to add - --enable-jit-sealloc, which enables the use of an executable memory allocator - that is compatible with SELinux. Warning: this allocator is experimental! - It does not support fork() operation and may crash when no disk space is - available. This option has no effect if JIT is disabled. - -. If you do not want to make use of the default support for UTF-8 Unicode - character strings in the 8-bit library, UTF-16 Unicode character strings in - the 16-bit library, or UTF-32 Unicode character strings in the 32-bit - library, you can add --disable-unicode to the "configure" command. This - reduces the size of the libraries. It is not possible to configure one - library with Unicode support, and another without, in the same configuration. - It is also not possible to use --enable-ebcdic (see below) with Unicode - support, so if this option is set, you must also use --disable-unicode. - - When Unicode support is available, the use of a UTF encoding still has to be - enabled by setting the PCRE2_UTF option at run time or starting a pattern - with (*UTF). When PCRE2 is compiled with Unicode support, its input can only - either be ASCII or UTF-8/16/32, even when running on EBCDIC platforms. - - As well as supporting UTF strings, Unicode support includes support for the - \P, \p, and \X sequences that recognize Unicode character properties. - However, only the basic two-letter properties such as Lu are supported. - Escape sequences such as \d and \w in patterns do not by default make use of - Unicode properties, but can be made to do so by setting the PCRE2_UCP option - or starting a pattern with (*UCP). - -. 
You can build PCRE2 to recognize either CR or LF or the sequence CRLF, or any - of the preceding, or any of the Unicode newline sequences, or the NUL (zero) - character as indicating the end of a line. Whatever you specify at build time - is the default; the caller of PCRE2 can change the selection at run time. The - default newline indicator is a single LF character (the Unix standard). You - can specify the default newline indicator by adding --enable-newline-is-cr, - --enable-newline-is-lf, --enable-newline-is-crlf, - --enable-newline-is-anycrlf, --enable-newline-is-any, or - --enable-newline-is-nul to the "configure" command, respectively. - -. By default, the sequence \R in a pattern matches any Unicode line ending - sequence. This is independent of the option specifying what PCRE2 considers - to be the end of a line (see above). However, the caller of PCRE2 can - restrict \R to match only CR, LF, or CRLF. You can make this the default by - adding --enable-bsr-anycrlf to the "configure" command (bsr = "backslash R"). - -. In a pattern, the escape sequence \C matches a single code unit, even in a - UTF mode. This can be dangerous because it breaks up multi-code-unit - characters. You can build PCRE2 with the use of \C permanently locked out by - adding --enable-never-backslash-C (note the upper case C) to the "configure" - command. When \C is allowed by the library, individual applications can lock - it out by calling pcre2_compile() with the PCRE2_NEVER_BACKSLASH_C option. - -. PCRE2 has a counter that limits the depth of nesting of parentheses in a - pattern. This limits the amount of system stack that a pattern uses when it - is compiled. The default is 250, but you can change it by setting, for - example, - - --with-parens-nest-limit=500 - -. PCRE2 has a counter that can be set to limit the amount of computing resource - it uses when matching a pattern. If the limit is exceeded during a match, the - match fails. The default is ten million. 
You can change the default by - setting, for example, - - --with-match-limit=500000 - - on the "configure" command. This is just the default; individual calls to - pcre2_match() or pcre2_dfa_match() can supply their own value. There is more - discussion in the pcre2api man page (search for pcre2_set_match_limit). - -. There is a separate counter that limits the depth of nested backtracking - (pcre2_match()) or nested function calls (pcre2_dfa_match()) during a - matching process, which indirectly limits the amount of heap memory that is - used, and in the case of pcre2_dfa_match() the amount of stack as well. This - counter also has a default of ten million, which is essentially "unlimited". - You can change the default by setting, for example, - - --with-match-limit-depth=5000 - - There is more discussion in the pcre2api man page (search for - pcre2_set_depth_limit). - -. You can also set an explicit limit on the amount of heap memory used by - the pcre2_match() and pcre2_dfa_match() interpreters: - - --with-heap-limit=500 - - The units are kibibytes (units of 1024 bytes). This limit does not apply when - the JIT optimization (which has its own memory control features) is used. - There is more discussion on the pcre2api man page (search for - pcre2_set_heap_limit). - -. In the 8-bit library, the default maximum compiled pattern size is around - 64 kibibytes. You can increase this by adding --with-link-size=3 to the - "configure" command. PCRE2 then uses three bytes instead of two for offsets - to different parts of the compiled pattern. In the 16-bit library, - --with-link-size=3 is the same as --with-link-size=4, which (in both - libraries) uses four-byte offsets. Increasing the internal link size reduces - performance in the 8-bit and 16-bit libraries. In the 32-bit library, the - link size setting is ignored, as 4-byte offsets are always used. - -. 
For speed, PCRE2 uses four tables for manipulating and identifying characters - whose code point values are less than 256. By default, it uses a set of - tables for ASCII encoding that is part of the distribution. If you specify - - --enable-rebuild-chartables - - a program called pcre2_dftables is compiled and run in the default C locale - when you obey "make". It builds a source file called pcre2_chartables.c. If - you do not specify this option, pcre2_chartables.c is created as a copy of - pcre2_chartables.c.dist. See "Character tables" below for further - information. - -. It is possible to compile PCRE2 for use on systems that use EBCDIC as their - character code (as opposed to ASCII/Unicode) by specifying - - --enable-ebcdic --disable-unicode - - This automatically implies --enable-rebuild-chartables (see above). However, - when PCRE2 is built this way, it always operates in EBCDIC. It cannot support - both EBCDIC and UTF-8/16/32. There is a second option, --enable-ebcdic-nl25, - which specifies that the code value for the EBCDIC NL character is 0x25 - instead of the default 0x15. - -. If you specify --enable-debug, additional debugging code is included in the - build. This option is intended for use by the PCRE2 maintainers. - -. In environments where valgrind is installed, if you specify - - --enable-valgrind - - PCRE2 will use valgrind annotations to mark certain memory regions as - unaddressable. This allows it to detect invalid memory accesses, and is - mostly useful for debugging PCRE2 itself. - -. In environments where the gcc compiler is used and lcov is installed, if you - specify - - --enable-coverage - - the build process implements a code coverage report for the test suite. The - report is generated by running "make coverage". If ccache is installed on - your system, it must be disabled when building PCRE2 for coverage reporting. - You can do this by setting the environment variable CCACHE_DISABLE=1 before - running "make" to build PCRE2. 
There is more information about coverage - reporting in the "pcre2build" documentation. - -. When JIT support is enabled, pcre2grep automatically makes use of it, unless - you add --disable-pcre2grep-jit to the "configure" command. - -. There is support for calling external programs during matching in the - pcre2grep command, using PCRE2's callout facility with string arguments. This - support can be disabled by adding --disable-pcre2grep-callout to the - "configure" command. There are two kinds of callout: one that generates - output from inbuilt code, and another that calls an external program. The - latter has special support for Windows and VMS; otherwise it assumes the - existence of the fork() function. This facility can be disabled by adding - --disable-pcre2grep-callout-fork to the "configure" command. - -. The pcre2grep program currently supports only 8-bit data files, and so - requires the 8-bit PCRE2 library. It is possible to compile pcre2grep to use - libz and/or libbz2, in order to read .gz and .bz2 files (respectively), by - specifying one or both of - - --enable-pcre2grep-libz - --enable-pcre2grep-libbz2 - - Of course, the relevant libraries must be installed on your system. - -. The default starting size (in bytes) of the internal buffer used by pcre2grep - can be set by, for example: - - --with-pcre2grep-bufsize=51200 - - The value must be a plain integer. The default is 20480. The amount of memory - used by pcre2grep is actually three times this number, to allow for "before" - and "after" lines. If very long lines are encountered, the buffer is - automatically enlarged, up to a fixed maximum size. - -. The default maximum size of pcre2grep's internal buffer can be set by, for - example: - - --with-pcre2grep-max-bufsize=2097152 - - The default is either 1048576 or the value of --with-pcre2grep-bufsize, - whichever is the larger. - -. 
It is possible to compile pcre2test so that it links with the libreadline - or libedit libraries, by specifying, respectively, - - --enable-pcre2test-libreadline or --enable-pcre2test-libedit - - If this is done, when pcre2test's input is from a terminal, it reads it using - the readline() function. This provides line-editing and history facilities. - Note that libreadline is GPL-licenced, so if you distribute a binary of - pcre2test linked in this way, there may be licensing issues. These can be - avoided by linking with libedit (which has a BSD licence) instead. - - Enabling libreadline causes the -lreadline option to be added to the - pcre2test build. In many operating environments with a system-installed - readline library this is sufficient. However, in some environments (e.g. if - an unmodified distribution version of readline is in use), it may be - necessary to specify something like LIBS="-lncurses" as well. This is - because, to quote the readline INSTALL, "Readline uses the termcap functions, - but does not link with the termcap or curses library itself, allowing - applications which link with readline the to choose an appropriate library." - If you get error messages about missing functions tgetstr, tgetent, tputs, - tgetflag, or tgoto, this is the problem, and linking with the ncurses library - should fix it. - -. The C99 standard defines formatting modifiers z and t for size_t and - ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in - environments other than Microsoft Visual Studio when __STDC_VERSION__ is - defined and has a value greater than or equal to 199901L (indicating C99). - However, there is at least one environment that claims to be C99 but does not - support these modifiers. If --disable-percent-zt is specified, no use is made - of the z or t modifiers. Instead of %td or %zu, %lu is used, with a cast for - size_t values. - -.
There is a special option called --enable-fuzz-support for use by people who - want to run fuzzing tests on PCRE2. At present this applies only to the 8-bit - library. If set, it causes an extra library called libpcre2-fuzzsupport.a to - be built, but not installed. This contains a single function called - LLVMFuzzerTestOneInput() whose arguments are a pointer to a string and the - length of the string. When called, this function tries to compile the string - as a pattern, and if that succeeds, to match it. This is done both with no - options and with some random options bits that are generated from the string. - Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to - be created. This is normally run under valgrind or used when PCRE2 is - compiled with address sanitizing enabled. It calls the fuzzing function and - outputs information about what it is doing. The input strings are specified by - arguments: if an argument starts with "=" the rest of it is a literal input - string. Otherwise, it is assumed to be a file name, and the contents of the - file are the test string. - -. Releases before 10.30 could be compiled with --disable-stack-for-recursion, - which caused pcre2_match() to use individual blocks on the heap for - backtracking instead of recursive function calls (which use the stack). This - is now obsolete since pcre2_match() was refactored always to use the heap (in - a much more efficient way than before). This option is retained for backwards - compatibility, but has no effect other than to output a warning. - -The "configure" script builds the following files for the basic C library: - -. Makefile the makefile that builds the library -. src/config.h build-time configuration options for the library -. src/pcre2.h the public PCRE2 header file -. pcre2-config script that shows the building settings such as CFLAGS - that were set for "configure" -. libpcre2-8.pc ) -. libpcre2-16.pc ) data for the pkg-config command -. libpcre2-32.pc ) -.
libpcre2-posix.pc ) -. libtool script that builds shared and/or static libraries - -Versions of config.h and pcre2.h are distributed in the src directory of PCRE2 -tarballs under the names config.h.generic and pcre2.h.generic. These are -provided for those who have to build PCRE2 without using "configure" or CMake. -If you use "configure" or CMake, the .generic versions are not used. - -The "configure" script also creates config.status, which is an executable -script that can be run to recreate the configuration, and config.log, which -contains compiler output from tests that "configure" runs. - -Once "configure" has run, you can run "make". This builds whichever of the -libraries libpcre2-8, libpcre2-16 and libpcre2-32 are configured, and a test -program called pcre2test. If you enabled JIT support with --enable-jit, another -test program called pcre2_jit_test is built as well. If the 8-bit library is -built, libpcre2-posix and the pcre2grep command are also built. Running -"make" with the -j option may speed up compilation on multiprocessor systems. - -The command "make check" runs all the appropriate tests. Details of the PCRE2 -tests are given below in a separate section of this document. The -j option of -"make" can also be used when running the tests. - -You can use "make install" to install PCRE2 into live directories on your -system. 
The following are installed (file names are all relative to the -<prefix> that is set when "configure" is run): - - Commands (bin): - pcre2test - pcre2grep (if 8-bit support is enabled) - pcre2-config - - Libraries (lib): - libpcre2-8 (if 8-bit support is enabled) - libpcre2-16 (if 16-bit support is enabled) - libpcre2-32 (if 32-bit support is enabled) - libpcre2-posix (if 8-bit support is enabled) - - Configuration information (lib/pkgconfig): - libpcre2-8.pc - libpcre2-16.pc - libpcre2-32.pc - libpcre2-posix.pc - - Header files (include): - pcre2.h - pcre2posix.h - - Man pages (share/man/man{1,3}): - pcre2grep.1 - pcre2test.1 - pcre2-config.1 - pcre2.3 - pcre2*.3 (lots more pages, all starting "pcre2") - - HTML documentation (share/doc/pcre2/html): - index.html - *.html (lots more pages, hyperlinked from index.html) - - Text file documentation (share/doc/pcre2): - AUTHORS - COPYING - ChangeLog - LICENCE - NEWS - README - pcre2.txt (a concatenation of the man(3) pages) - pcre2test.txt the pcre2test man page - pcre2grep.txt the pcre2grep man page - pcre2-config.txt the pcre2-config man page - -If you want to remove PCRE2 from your system, you can run "make uninstall". -This removes all the files that "make install" installed. However, it does not -remove any directories, because these are often shared with other programs. - - -Retrieving configuration information ------------------------------------- - -Running "make install" installs the command pcre2-config, which can be used to -recall information about the PCRE2 configuration and installation. For example: - - pcre2-config --version - -prints the version number, and - - pcre2-config --libs8 - -outputs information about where the 8-bit library is installed. This command -can be included in makefiles for programs that use PCRE2, saving the programmer -from having to remember too many details. Run pcre2-config with no arguments to -obtain a list of possible arguments.
- -The pkg-config command is another system for saving and retrieving information -about installed libraries. Instead of separate commands for each library, a -single command is used. For example: - - pkg-config --libs libpcre2-16 - -The data is held in *.pc files that are installed in a directory called -<prefix>/lib/pkgconfig. - - -Shared libraries ----------------- - -The default distribution builds PCRE2 as shared libraries and static libraries, -as long as the operating system supports shared libraries. Shared library -support relies on the "libtool" script which is built as part of the -"configure" process. - -The libtool script is used to compile and link both shared and static -libraries. They are placed in a subdirectory called .libs when they are newly -built. The programs pcre2test and pcre2grep are built to use these uninstalled -libraries (by means of wrapper scripts in the case of shared libraries). When -you use "make install" to install shared libraries, pcre2grep and pcre2test are -automatically re-built to use the newly installed shared libraries before being -installed themselves. However, the versions left in the build directory still -use the uninstalled libraries. - -To build PCRE2 using static libraries only you must use --disable-shared when -configuring it. For example: - -./configure --prefix=/usr/gnu --disable-shared - -Then run "make" in the usual way. Similarly, you can use --disable-static to -build only shared libraries. - - -Cross-compiling using autotools -------------------------------- - -You can specify CC and CFLAGS in the normal way to the "configure" command, in -order to cross-compile PCRE2 for some other host. However, you should NOT -specify --enable-rebuild-chartables, because if you do, the pcre2_dftables.c -source file is compiled and run on the local host, in order to generate the -inbuilt character tables (the pcre2_chartables.c file).
This will probably not -work, because pcre2_dftables.c needs to be compiled with the local compiler, -not the cross compiler. - -When --enable-rebuild-chartables is not specified, pcre2_chartables.c is -created by making a copy of pcre2_chartables.c.dist, which is a default set of -tables that assumes ASCII code. Cross-compiling with the default tables should -not be a problem. - -If you need to modify the character tables when cross-compiling, you should -move pcre2_chartables.c.dist out of the way, then compile pcre2_dftables.c by -hand and run it on the local host to make a new version of -pcre2_chartables.c.dist. See the pcre2build section "Creating character tables -at build time" for more details. - - -Making new tarballs -------------------- - -The command "make dist" creates three PCRE2 tarballs, in tar.gz, tar.bz2, and -zip formats. The command "make distcheck" does the same, but then does a trial -build of the new distribution to ensure that it works. - -If you have modified any of the man page sources in the doc directory, you -should first run the PrepareRelease script before making a distribution. This -script creates the .txt and HTML forms of the documentation from the man pages. - - -Testing PCRE2 -------------- - -To test the basic PCRE2 library on a Unix-like system, run the RunTest script. -There is another script called RunGrepTest that tests the pcre2grep command. -When JIT support is enabled, a third test program called pcre2_jit_test is -built. Both the scripts and all the program tests are run if you obey "make -check". For other environments, see the instructions in NON-AUTOTOOLS-BUILD. - -The RunTest script runs the pcre2test test program (which is documented in its -own man page) on each of the relevant testinput files in the testdata -directory, and compares the output with the contents of the corresponding -testoutput files. RunTest uses a file called testtry to hold the main output -from pcre2test. 
Other files whose names begin with "test" are used as working -files in some tests. - -Some tests are relevant only when certain build-time options were selected. For -example, the tests for UTF-8/16/32 features are run only when Unicode support -is available. RunTest outputs a comment when it skips a test. - -Many (but not all) of the tests that are not skipped are run twice if JIT -support is available. On the second run, JIT compilation is forced. This -testing can be suppressed by putting "nojit" on the RunTest command line. - -The entire set of tests is run once for each of the 8-bit, 16-bit and 32-bit -libraries that are enabled. If you want to run just one set of tests, call -RunTest with either the -8, -16 or -32 option. - -If valgrind is installed, you can run the tests under it by putting "valgrind" -on the RunTest command line. To run pcre2test on just one or more specific test -files, give their numbers as arguments to RunTest, for example: - - RunTest 2 7 11 - -You can also specify ranges of tests such as 3-6 or 3- (meaning 3 to the -end), or a number preceded by ~ to exclude a test. For example: - - Runtest 3-15 ~10 - -This runs tests 3 to 15, excluding test 10, and just ~13 runs all the tests -except test 13. Whatever order the arguments are in, the tests are always run -in numerical order. - -You can also call RunTest with the single argument "list" to cause it to output -a list of tests. - -The test sequence starts with "test 0", which is a special test that has no -input file, and whose output is not checked. This is because it will be -different on different hardware and with different configurations. The test -exists in order to exercise some of pcre2test's code that would not otherwise -be run. - -Tests 1 and 2 can always be run, as they expect only plain text strings (not -UTF) and make no use of Unicode properties. The first test file can be fed -directly into the perltest.sh script to check that Perl gives the same results. 
-The only difference you should see is in the first few lines, where the Perl -version is given instead of the PCRE2 version. The second set of tests check -auxiliary functions, error detection, and run-time flags that are specific to -PCRE2. It also uses the debugging flags to check some of the internals of -pcre2_compile(). - -If you build PCRE2 with a locale setting that is not the standard C locale, the -character tables may be different (see next paragraph). In some cases, this may -cause failures in the second set of tests. For example, in a locale where the -isprint() function yields TRUE for characters in the range 128-255, the use of -[:isascii:] inside a character class defines a different set of characters, and -this shows up in this test as a difference in the compiled code, which is being -listed for checking. For example, where the comparison test output contains -[\x00-\x7f] the test might contain [\x00-\xff], and similarly in some other -cases. This is not a bug in PCRE2. - -Test 3 checks pcre2_maketables(), the facility for building a set of character -tables for a specific locale and using them instead of the default tables. The -script uses the "locale" command to check for the availability of the "fr_FR", -"french", or "fr" locale, and uses the first one that it finds. If the "locale" -command fails, or if its output doesn't include "fr_FR", "french", or "fr" in -the list of available locales, the third test cannot be run, and a comment is -output to say why. If running this test produces an error like this: - - ** Failed to set locale "fr_FR" - -it means that the given locale is not available on your system, despite being -listed by "locale". This does not mean that PCRE2 is broken. There are three -alternative output files for the third test, because three different versions -of the French locale have been encountered. The test passes if its output -matches any one of them. 
- -Tests 4 and 5 check UTF and Unicode property support, test 4 being compatible -with the perltest.sh script, and test 5 checking PCRE2-specific things. - -Tests 6 and 7 check the pcre2_dfa_match() alternative matching function, in -non-UTF mode and UTF-mode with Unicode property support, respectively. - -Test 8 checks some internal offsets and code size features, but it is run only -when Unicode support is enabled. The output is different in 8-bit, 16-bit, and -32-bit modes and for different link sizes, so there are different output files -for each mode and link size. - -Tests 9 and 10 are run only in 8-bit mode, and tests 11 and 12 are run only in -16-bit and 32-bit modes. These are tests that generate different output in -8-bit mode. Each pair are for general cases and Unicode support, respectively. - -Test 13 checks the handling of non-UTF characters greater than 255 by -pcre2_dfa_match() in 16-bit and 32-bit modes. - -Test 14 contains some special UTF and UCP tests that give different output for -different code unit widths. - -Test 15 contains a number of tests that must not be run with JIT. They check, -among other non-JIT things, the match-limiting features of the interpretive -matcher. - -Test 16 is run only when JIT support is not available. It checks that an -attempt to use JIT has the expected behaviour. - -Test 17 is run only when JIT support is available. It checks JIT complete and -partial modes, match-limiting under JIT, and other JIT-specific features. - -Tests 18 and 19 are run only in 8-bit mode. They check the POSIX interface to -the 8-bit library, without and with Unicode support, respectively. - -Test 20 checks the serialization functions by writing a set of compiled -patterns to a file, and then reloading and checking them. - -Tests 21 and 22 test \C support when the use of \C is not locked out, without -and with UTF support, respectively. Test 23 tests \C when it is locked out.
- -Tests 24 and 25 test the experimental pattern conversion functions, without and -with UTF support, respectively. - - -Character tables ----------------- - -For speed, PCRE2 uses four tables for manipulating and identifying characters -whose code point values are less than 256. By default, a set of tables that is -built into the library is used. The pcre2_maketables() function can be called -by an application to create a new set of tables in the current locale. These are -passed to PCRE2 by calling pcre2_set_character_tables() to put a pointer into a -compile context. - -The source file called pcre2_chartables.c contains the default set of tables. -By default, this is created as a copy of pcre2_chartables.c.dist, which -contains tables for ASCII coding. However, if --enable-rebuild-chartables is -specified for ./configure, a new version of pcre2_chartables.c is built by the -program pcre2_dftables (compiled from pcre2_dftables.c), which uses the ANSI C -character handling functions such as isalnum(), isalpha(), isupper(), -islower(), etc. to build the table sources. This means that the default C -locale that is set for your system will control the contents of these default -tables. You can change the default tables by editing pcre2_chartables.c and -then re-building PCRE2. If you do this, you should take care to ensure that the -file does not get automatically re-generated. The best way to do this is to -move pcre2_chartables.c.dist out of the way and replace it with your customized -tables. - -When the pcre2_dftables program is run as a result of specifying ---enable-rebuild-chartables, it uses the default C locale that is set on your -system. It does not pay attention to the LC_xxx environment variables. In other -words, it uses the system's default locale rather than whatever the compiling -user happens to have set.
If you really do want to build a source set of -character tables in a locale that is specified by the LC_xxx variables, you can -run the pcre2_dftables program by hand with the -L option. For example: - - ./pcre2_dftables -L pcre2_chartables.c.special - -The second argument names the file where the source code for the tables is -written. The first two 256-byte tables provide lower casing and case flipping -functions, respectively. The next table consists of a number of 32-byte bit -maps which identify certain character classes such as digits, "word" -characters, white space, etc. These are used when building 32-byte bit maps -that represent character classes for code points less than 256. The final -256-byte table has bits indicating various character types, as follows: - - 1 white space character - 2 letter - 4 lower case letter - 8 decimal digit - 16 alphanumeric or '_' - -You can also specify -b (with or without -L) when running pcre2_dftables. This -causes the tables to be written in binary instead of as source code. A set of -binary tables can be loaded into memory by an application and passed to -pcre2_compile() in the same way as tables created dynamically by calling -pcre2_maketables(). The tables are just a string of bytes, independent of -hardware characteristics such as endianness. This means they can be bundled -with an application that runs in different environments, to ensure consistent -behaviour. - -See also the pcre2build section "Creating character tables at build time". - - -File manifest -------------- - -The distribution should contain the files listed below. 
- -(A) Source files for the PCRE2 library functions and their headers are found in - the src directory: - - src/pcre2_dftables.c auxiliary program for building pcre2_chartables.c - when --enable-rebuild-chartables is specified - - src/pcre2_chartables.c.dist a default set of character tables that assume - ASCII coding; unless --enable-rebuild-chartables is - specified, used by copying to pcre2_chartables.c - - src/pcre2posix.c ) - src/pcre2_auto_possess.c ) - src/pcre2_compile.c ) - src/pcre2_config.c ) - src/pcre2_context.c ) - src/pcre2_convert.c ) - src/pcre2_dfa_match.c ) - src/pcre2_error.c ) - src/pcre2_extuni.c ) - src/pcre2_find_bracket.c ) - src/pcre2_jit_compile.c ) - src/pcre2_jit_match.c ) sources for the functions in the library, - src/pcre2_jit_misc.c ) and some internal functions that they use - src/pcre2_maketables.c ) - src/pcre2_match.c ) - src/pcre2_match_data.c ) - src/pcre2_newline.c ) - src/pcre2_ord2utf.c ) - src/pcre2_pattern_info.c ) - src/pcre2_script_run.c ) - src/pcre2_serialize.c ) - src/pcre2_string_utils.c ) - src/pcre2_study.c ) - src/pcre2_substitute.c ) - src/pcre2_substring.c ) - src/pcre2_tables.c ) - src/pcre2_ucd.c ) - src/pcre2_valid_utf.c ) - src/pcre2_xclass.c ) - - src/pcre2_printint.c debugging function that is used by pcre2test, - src/pcre2_fuzzsupport.c function for (optional) fuzzing support - - src/config.h.in template for config.h, when built by "configure" - src/pcre2.h.in template for pcre2.h when built by "configure" - src/pcre2posix.h header for the external POSIX wrapper API - src/pcre2_internal.h header for internal use - src/pcre2_intmodedep.h a mode-specific internal header - src/pcre2_ucp.h header for Unicode property handling - - sljit/* source files for the JIT compiler - -(B) Source files for programs that use PCRE2: - - src/pcre2demo.c simple demonstration of coding calls to PCRE2 - src/pcre2grep.c source of a grep utility that uses PCRE2 - src/pcre2test.c comprehensive test program - src/pcre2_jit_test.c 
JIT test program - -(C) Auxiliary files: - - 132html script to turn "man" pages into HTML - AUTHORS information about the author of PCRE2 - ChangeLog log of changes to the code - CleanTxt script to clean nroff output for txt man pages - Detrail script to remove trailing spaces - HACKING some notes about the internals of PCRE2 - INSTALL generic installation instructions - LICENCE conditions for the use of PCRE2 - COPYING the same, using GNU's standard name - Makefile.in ) template for Unix Makefile, which is built by - ) "configure" - Makefile.am ) the automake input that was used to create - ) Makefile.in - NEWS important changes in this release - NON-AUTOTOOLS-BUILD notes on building PCRE2 without using autotools - PrepareRelease script to make preparations for "make dist" - README this file - RunTest a Unix shell script for running tests - RunGrepTest a Unix shell script for pcre2grep tests - aclocal.m4 m4 macros (generated by "aclocal") - config.guess ) files used by libtool, - config.sub ) used only when building a shared library - configure a configuring shell script (built by autoconf) - configure.ac ) the autoconf input that was used to build - ) "configure" and config.h - depcomp ) script to find program dependencies, generated by - ) automake - doc/*.3 man page sources for PCRE2 - doc/*.1 man page sources for pcre2grep and pcre2test - doc/index.html.src the base HTML page - doc/html/* HTML documentation - doc/pcre2.txt plain text version of the man pages - doc/pcre2test.txt plain text documentation of test program - install-sh a shell script for installing files - libpcre2-8.pc.in template for libpcre2-8.pc for pkg-config - libpcre2-16.pc.in template for libpcre2-16.pc for pkg-config - libpcre2-32.pc.in template for libpcre2-32.pc for pkg-config - libpcre2-posix.pc.in template for libpcre2-posix.pc for pkg-config - ltmain.sh file used to build a libtool script - missing ) common stub for a few missing GNU programs while - ) installing, generated by 
automake - mkinstalldirs script for making install directories - perltest.sh Script for running a Perl test program - pcre2-config.in source of script which retains PCRE2 information - testdata/testinput* test data for main library tests - testdata/testoutput* expected test results - testdata/grep* input and output for pcre2grep tests - testdata/* other supporting test files - -(D) Auxiliary files for cmake support - - cmake/COPYING-CMAKE-SCRIPTS - cmake/FindPackageHandleStandardArgs.cmake - cmake/FindEditline.cmake - cmake/FindReadline.cmake - CMakeLists.txt - config-cmake.h.in - -(E) Auxiliary files for building PCRE2 "by hand" - - src/pcre2.h.generic ) a version of the public PCRE2 header file - ) for use in non-"configure" environments - src/config.h.generic ) a version of config.h for use in non-"configure" - ) environments - -Philip Hazel -Email local part: Philip.Hazel -Email domain: gmail.com -Last updated: 04 December 2020 diff --git a/pcre2/aclocal.m4 b/pcre2/aclocal.m4 deleted file mode 100644 index a1b8aed63..000000000 --- a/pcre2/aclocal.m4 +++ /dev/null @@ -1,1548 +0,0 @@ -# generated automatically by aclocal 1.16.2 -*- Autoconf -*- - -# Copyright (C) 1996-2020 Free Software Foundation, Inc. - -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY, to the extent permitted by law; without -# even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. - -m4_ifndef([AC_CONFIG_MACRO_DIRS], [m4_defun([_AM_CONFIG_MACRO_DIRS], [])m4_defun([AC_CONFIG_MACRO_DIRS], [_AM_CONFIG_MACRO_DIRS($@)])]) -m4_ifndef([AC_AUTOCONF_VERSION], - [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl -m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.69],, -[m4_warning([this file was generated for autoconf 2.69. 
-You have another version of autoconf. It may work, but is not guaranteed to. -If you have problems, you may need to regenerate the build system entirely. -To do so, use the procedure documented by the package, typically 'autoreconf'.])]) - -# pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*- -# serial 11 (pkg-config-0.29.1) - -dnl Copyright © 2004 Scott James Remnant . -dnl Copyright © 2012-2015 Dan Nicholson -dnl -dnl This program is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU General Public License as published by -dnl the Free Software Foundation; either version 2 of the License, or -dnl (at your option) any later version. -dnl -dnl This program is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of -dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -dnl General Public License for more details. -dnl -dnl You should have received a copy of the GNU General Public License -dnl along with this program; if not, write to the Free Software -dnl Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA -dnl 02111-1307, USA. -dnl -dnl As a special exception to the GNU General Public License, if you -dnl distribute this file as part of a program that contains a -dnl configuration script generated by Autoconf, you may include it under -dnl the same distribution terms that you use for the rest of that -dnl program. - -dnl PKG_PREREQ(MIN-VERSION) -dnl ----------------------- -dnl Since: 0.29 -dnl -dnl Verify that the version of the pkg-config macros are at least -dnl MIN-VERSION. Unlike PKG_PROG_PKG_CONFIG, which checks the user's -dnl installed version of pkg-config, this checks the developer's version -dnl of pkg.m4 when generating configure. 
-dnl -dnl To ensure that this macro is defined, also add: -dnl m4_ifndef([PKG_PREREQ], -dnl [m4_fatal([must install pkg-config 0.29 or later before running autoconf/autogen])]) -dnl -dnl See the "Since" comment for each macro you use to see what version -dnl of the macros you require. -m4_defun([PKG_PREREQ], -[m4_define([PKG_MACROS_VERSION], [0.29.1]) -m4_if(m4_version_compare(PKG_MACROS_VERSION, [$1]), -1, - [m4_fatal([pkg.m4 version $1 or higher is required but ]PKG_MACROS_VERSION[ found])]) -])dnl PKG_PREREQ - -dnl PKG_PROG_PKG_CONFIG([MIN-VERSION]) -dnl ---------------------------------- -dnl Since: 0.16 -dnl -dnl Search for the pkg-config tool and set the PKG_CONFIG variable to -dnl first found in the path. Checks that the version of pkg-config found -dnl is at least MIN-VERSION. If MIN-VERSION is not specified, 0.9.0 is -dnl used since that's the first version where most current features of -dnl pkg-config existed. -AC_DEFUN([PKG_PROG_PKG_CONFIG], -[m4_pattern_forbid([^_?PKG_[A-Z_]+$]) -m4_pattern_allow([^PKG_CONFIG(_(PATH|LIBDIR|SYSROOT_DIR|ALLOW_SYSTEM_(CFLAGS|LIBS)))?$]) -m4_pattern_allow([^PKG_CONFIG_(DISABLE_UNINSTALLED|TOP_BUILD_DIR|DEBUG_SPEW)$]) -AC_ARG_VAR([PKG_CONFIG], [path to pkg-config utility]) -AC_ARG_VAR([PKG_CONFIG_PATH], [directories to add to pkg-config's search path]) -AC_ARG_VAR([PKG_CONFIG_LIBDIR], [path overriding pkg-config's built-in search path]) - -if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then - AC_PATH_TOOL([PKG_CONFIG], [pkg-config]) -fi -if test -n "$PKG_CONFIG"; then - _pkg_min_version=m4_default([$1], [0.9.0]) - AC_MSG_CHECKING([pkg-config is at least version $_pkg_min_version]) - if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then - AC_MSG_RESULT([yes]) - else - AC_MSG_RESULT([no]) - PKG_CONFIG="" - fi -fi[]dnl -])dnl PKG_PROG_PKG_CONFIG - -dnl PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) -dnl ------------------------------------------------------------------- -dnl Since: 0.18 -dnl 
-dnl Check to see whether a particular set of modules exists. Similar to -dnl PKG_CHECK_MODULES(), but does not set variables or print errors. -dnl -dnl Please remember that m4 expands AC_REQUIRE([PKG_PROG_PKG_CONFIG]) -dnl only at the first occurence in configure.ac, so if the first place -dnl it's called might be skipped (such as if it is within an "if", you -dnl have to call PKG_CHECK_EXISTS manually -AC_DEFUN([PKG_CHECK_EXISTS], -[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl -if test -n "$PKG_CONFIG" && \ - AC_RUN_LOG([$PKG_CONFIG --exists --print-errors "$1"]); then - m4_default([$2], [:]) -m4_ifvaln([$3], [else - $3])dnl -fi]) - -dnl _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES]) -dnl --------------------------------------------- -dnl Internal wrapper calling pkg-config via PKG_CONFIG and setting -dnl pkg_failed based on the result. -m4_define([_PKG_CONFIG], -[if test -n "$$1"; then - pkg_cv_[]$1="$$1" - elif test -n "$PKG_CONFIG"; then - PKG_CHECK_EXISTS([$3], - [pkg_cv_[]$1=`$PKG_CONFIG --[]$2 "$3" 2>/dev/null` - test "x$?" != "x0" && pkg_failed=yes ], - [pkg_failed=yes]) - else - pkg_failed=untried -fi[]dnl -])dnl _PKG_CONFIG - -dnl _PKG_SHORT_ERRORS_SUPPORTED -dnl --------------------------- -dnl Internal check to see if pkg-config supports short errors. 
-AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED], -[AC_REQUIRE([PKG_PROG_PKG_CONFIG]) -if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then - _pkg_short_errors_supported=yes -else - _pkg_short_errors_supported=no -fi[]dnl -])dnl _PKG_SHORT_ERRORS_SUPPORTED - - -dnl PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND], -dnl [ACTION-IF-NOT-FOUND]) -dnl -------------------------------------------------------------- -dnl Since: 0.4.0 -dnl -dnl Note that if there is a possibility the first call to -dnl PKG_CHECK_MODULES might not happen, you should be sure to include an -dnl explicit call to PKG_PROG_PKG_CONFIG in your configure.ac -AC_DEFUN([PKG_CHECK_MODULES], -[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl -AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl -AC_ARG_VAR([$1][_LIBS], [linker flags for $1, overriding pkg-config])dnl - -pkg_failed=no -AC_MSG_CHECKING([for $1]) - -_PKG_CONFIG([$1][_CFLAGS], [cflags], [$2]) -_PKG_CONFIG([$1][_LIBS], [libs], [$2]) - -m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS -and $1[]_LIBS to avoid the need to call pkg-config. -See the pkg-config man page for more details.]) - -if test $pkg_failed = yes; then - AC_MSG_RESULT([no]) - _PKG_SHORT_ERRORS_SUPPORTED - if test $_pkg_short_errors_supported = yes; then - $1[]_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "$2" 2>&1` - else - $1[]_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "$2" 2>&1` - fi - # Put the nasty error message in config.log where it belongs - echo "$$1[]_PKG_ERRORS" >&AS_MESSAGE_LOG_FD - - m4_default([$4], [AC_MSG_ERROR( -[Package requirements ($2) were not met: - -$$1_PKG_ERRORS - -Consider adjusting the PKG_CONFIG_PATH environment variable if you -installed software in a non-standard prefix. - -_PKG_TEXT])[]dnl - ]) -elif test $pkg_failed = untried; then - AC_MSG_RESULT([no]) - m4_default([$4], [AC_MSG_FAILURE( -[The pkg-config script could not be found or is too old. 
Make sure it -is in your PATH or set the PKG_CONFIG environment variable to the full -path to pkg-config. - -_PKG_TEXT - -To get pkg-config, see .])[]dnl - ]) -else - $1[]_CFLAGS=$pkg_cv_[]$1[]_CFLAGS - $1[]_LIBS=$pkg_cv_[]$1[]_LIBS - AC_MSG_RESULT([yes]) - $3 -fi[]dnl -])dnl PKG_CHECK_MODULES - - -dnl PKG_CHECK_MODULES_STATIC(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND], -dnl [ACTION-IF-NOT-FOUND]) -dnl --------------------------------------------------------------------- -dnl Since: 0.29 -dnl -dnl Checks for existence of MODULES and gathers its build flags with -dnl static libraries enabled. Sets VARIABLE-PREFIX_CFLAGS from --cflags -dnl and VARIABLE-PREFIX_LIBS from --libs. -dnl -dnl Note that if there is a possibility the first call to -dnl PKG_CHECK_MODULES_STATIC might not happen, you should be sure to -dnl include an explicit call to PKG_PROG_PKG_CONFIG in your -dnl configure.ac. -AC_DEFUN([PKG_CHECK_MODULES_STATIC], -[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl -_save_PKG_CONFIG=$PKG_CONFIG -PKG_CONFIG="$PKG_CONFIG --static" -PKG_CHECK_MODULES($@) -PKG_CONFIG=$_save_PKG_CONFIG[]dnl -])dnl PKG_CHECK_MODULES_STATIC - - -dnl PKG_INSTALLDIR([DIRECTORY]) -dnl ------------------------- -dnl Since: 0.27 -dnl -dnl Substitutes the variable pkgconfigdir as the location where a module -dnl should install pkg-config .pc files. By default the directory is -dnl $libdir/pkgconfig, but the default can be changed by passing -dnl DIRECTORY. The user can override through the --with-pkgconfigdir -dnl parameter. 
-AC_DEFUN([PKG_INSTALLDIR], -[m4_pushdef([pkg_default], [m4_default([$1], ['${libdir}/pkgconfig'])]) -m4_pushdef([pkg_description], - [pkg-config installation directory @<:@]pkg_default[@:>@]) -AC_ARG_WITH([pkgconfigdir], - [AS_HELP_STRING([--with-pkgconfigdir], pkg_description)],, - [with_pkgconfigdir=]pkg_default) -AC_SUBST([pkgconfigdir], [$with_pkgconfigdir]) -m4_popdef([pkg_default]) -m4_popdef([pkg_description]) -])dnl PKG_INSTALLDIR - - -dnl PKG_NOARCH_INSTALLDIR([DIRECTORY]) -dnl -------------------------------- -dnl Since: 0.27 -dnl -dnl Substitutes the variable noarch_pkgconfigdir as the location where a -dnl module should install arch-independent pkg-config .pc files. By -dnl default the directory is $datadir/pkgconfig, but the default can be -dnl changed by passing DIRECTORY. The user can override through the -dnl --with-noarch-pkgconfigdir parameter. -AC_DEFUN([PKG_NOARCH_INSTALLDIR], -[m4_pushdef([pkg_default], [m4_default([$1], ['${datadir}/pkgconfig'])]) -m4_pushdef([pkg_description], - [pkg-config arch-independent installation directory @<:@]pkg_default[@:>@]) -AC_ARG_WITH([noarch-pkgconfigdir], - [AS_HELP_STRING([--with-noarch-pkgconfigdir], pkg_description)],, - [with_noarch_pkgconfigdir=]pkg_default) -AC_SUBST([noarch_pkgconfigdir], [$with_noarch_pkgconfigdir]) -m4_popdef([pkg_default]) -m4_popdef([pkg_description]) -])dnl PKG_NOARCH_INSTALLDIR - - -dnl PKG_CHECK_VAR(VARIABLE, MODULE, CONFIG-VARIABLE, -dnl [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) -dnl ------------------------------------------- -dnl Since: 0.28 -dnl -dnl Retrieves the value of the pkg-config variable for the given module. 
-AC_DEFUN([PKG_CHECK_VAR], -[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl -AC_ARG_VAR([$1], [value of $3 for $2, overriding pkg-config])dnl - -_PKG_CONFIG([$1], [variable="][$3]["], [$2]) -AS_VAR_COPY([$1], [pkg_cv_][$1]) - -AS_VAR_IF([$1], [""], [$5], [$4])dnl -])dnl PKG_CHECK_VAR - -dnl PKG_WITH_MODULES(VARIABLE-PREFIX, MODULES, -dnl [ACTION-IF-FOUND],[ACTION-IF-NOT-FOUND], -dnl [DESCRIPTION], [DEFAULT]) -dnl ------------------------------------------ -dnl -dnl Prepare a "--with-" configure option using the lowercase -dnl [VARIABLE-PREFIX] name, merging the behaviour of AC_ARG_WITH and -dnl PKG_CHECK_MODULES in a single macro. -AC_DEFUN([PKG_WITH_MODULES], -[ -m4_pushdef([with_arg], m4_tolower([$1])) - -m4_pushdef([description], - [m4_default([$5], [build with ]with_arg[ support])]) - -m4_pushdef([def_arg], [m4_default([$6], [auto])]) -m4_pushdef([def_action_if_found], [AS_TR_SH([with_]with_arg)=yes]) -m4_pushdef([def_action_if_not_found], [AS_TR_SH([with_]with_arg)=no]) - -m4_case(def_arg, - [yes],[m4_pushdef([with_without], [--without-]with_arg)], - [m4_pushdef([with_without],[--with-]with_arg)]) - -AC_ARG_WITH(with_arg, - AS_HELP_STRING(with_without, description[ @<:@default=]def_arg[@:>@]),, - [AS_TR_SH([with_]with_arg)=def_arg]) - -AS_CASE([$AS_TR_SH([with_]with_arg)], - [yes],[PKG_CHECK_MODULES([$1],[$2],$3,$4)], - [auto],[PKG_CHECK_MODULES([$1],[$2], - [m4_n([def_action_if_found]) $3], - [m4_n([def_action_if_not_found]) $4])]) - -m4_popdef([with_arg]) -m4_popdef([description]) -m4_popdef([def_arg]) - -])dnl PKG_WITH_MODULES - -dnl PKG_HAVE_WITH_MODULES(VARIABLE-PREFIX, MODULES, -dnl [DESCRIPTION], [DEFAULT]) -dnl ----------------------------------------------- -dnl -dnl Convenience macro to trigger AM_CONDITIONAL after PKG_WITH_MODULES -dnl check._[VARIABLE-PREFIX] is exported as make variable. 
-AC_DEFUN([PKG_HAVE_WITH_MODULES], -[ -PKG_WITH_MODULES([$1],[$2],,,[$3],[$4]) - -AM_CONDITIONAL([HAVE_][$1], - [test "$AS_TR_SH([with_]m4_tolower([$1]))" = "yes"]) -])dnl PKG_HAVE_WITH_MODULES - -dnl PKG_HAVE_DEFINE_WITH_MODULES(VARIABLE-PREFIX, MODULES, -dnl [DESCRIPTION], [DEFAULT]) -dnl ------------------------------------------------------ -dnl -dnl Convenience macro to run AM_CONDITIONAL and AC_DEFINE after -dnl PKG_WITH_MODULES check. HAVE_[VARIABLE-PREFIX] is exported as make -dnl and preprocessor variable. -AC_DEFUN([PKG_HAVE_DEFINE_WITH_MODULES], -[ -PKG_HAVE_WITH_MODULES([$1],[$2],[$3],[$4]) - -AS_IF([test "$AS_TR_SH([with_]m4_tolower([$1]))" = "yes"], - [AC_DEFINE([HAVE_][$1], 1, [Enable ]m4_tolower([$1])[ support])]) -])dnl PKG_HAVE_DEFINE_WITH_MODULES - -# Copyright (C) 2002-2020 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# AM_AUTOMAKE_VERSION(VERSION) -# ---------------------------- -# Automake X.Y traces this macro to ensure aclocal.m4 has been -# generated from the m4 files accompanying Automake X.Y. -# (This private macro should not be called outside this file.) -AC_DEFUN([AM_AUTOMAKE_VERSION], -[am__api_version='1.16' -dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to -dnl require some minimum version. Point them to the right macro. -m4_if([$1], [1.16.2], [], - [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl -]) - -# _AM_AUTOCONF_VERSION(VERSION) -# ----------------------------- -# aclocal traces this macro to find the Autoconf version. -# This is a private macro too. Using m4_define simplifies -# the logic in aclocal, which can simply ignore this definition. 
-m4_define([_AM_AUTOCONF_VERSION], []) - -# AM_SET_CURRENT_AUTOMAKE_VERSION -# ------------------------------- -# Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced. -# This function is AC_REQUIREd by AM_INIT_AUTOMAKE. -AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION], -[AM_AUTOMAKE_VERSION([1.16.2])dnl -m4_ifndef([AC_AUTOCONF_VERSION], - [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl -_AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))]) - -# Copyright (C) 2011-2020 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# AM_PROG_AR([ACT-IF-FAIL]) -# ------------------------- -# Try to determine the archiver interface, and trigger the ar-lib wrapper -# if it is needed. If the detection of archiver interface fails, run -# ACT-IF-FAIL (default is to abort configure with a proper error message). -AC_DEFUN([AM_PROG_AR], -[AC_BEFORE([$0], [LT_INIT])dnl -AC_BEFORE([$0], [AC_PROG_LIBTOOL])dnl -AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl -AC_REQUIRE_AUX_FILE([ar-lib])dnl -AC_CHECK_TOOLS([AR], [ar lib "link -lib"], [false]) -: ${AR=ar} - -AC_CACHE_CHECK([the archiver ($AR) interface], [am_cv_ar_interface], - [AC_LANG_PUSH([C]) - am_cv_ar_interface=ar - AC_COMPILE_IFELSE([AC_LANG_SOURCE([[int some_variable = 0;]])], - [am_ar_try='$AR cru libconftest.a conftest.$ac_objext >&AS_MESSAGE_LOG_FD' - AC_TRY_EVAL([am_ar_try]) - if test "$ac_status" -eq 0; then - am_cv_ar_interface=ar - else - am_ar_try='$AR -NOLOGO -OUT:conftest.lib conftest.$ac_objext >&AS_MESSAGE_LOG_FD' - AC_TRY_EVAL([am_ar_try]) - if test "$ac_status" -eq 0; then - am_cv_ar_interface=lib - else - am_cv_ar_interface=unknown - fi - fi - rm -f conftest.lib libconftest.a - ]) - AC_LANG_POP([C])]) - -case $am_cv_ar_interface in -ar) - ;; -lib) - # Microsoft lib, so override with the ar-lib wrapper script. 
- # FIXME: It is wrong to rewrite AR. - # But if we don't then we get into trouble of one sort or another. - # A longer-term fix would be to have automake use am__AR in this case, - # and then we could set am__AR="$am_aux_dir/ar-lib \$(AR)" or something - # similar. - AR="$am_aux_dir/ar-lib $AR" - ;; -unknown) - m4_default([$1], - [AC_MSG_ERROR([could not determine $AR interface])]) - ;; -esac -AC_SUBST([AR])dnl -]) - -# AM_AUX_DIR_EXPAND -*- Autoconf -*- - -# Copyright (C) 2001-2020 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# For projects using AC_CONFIG_AUX_DIR([foo]), Autoconf sets -# $ac_aux_dir to '$srcdir/foo'. In other projects, it is set to -# '$srcdir', '$srcdir/..', or '$srcdir/../..'. -# -# Of course, Automake must honor this variable whenever it calls a -# tool from the auxiliary directory. The problem is that $srcdir (and -# therefore $ac_aux_dir as well) can be either absolute or relative, -# depending on how configure is run. This is pretty annoying, since -# it makes $ac_aux_dir quite unusable in subdirectories: in the top -# source directory, any form will work fine, but in subdirectories a -# relative path needs to be adjusted first. -# -# $ac_aux_dir/missing -# fails when called from a subdirectory if $ac_aux_dir is relative -# $top_srcdir/$ac_aux_dir/missing -# fails if $ac_aux_dir is absolute, -# fails when called from a subdirectory in a VPATH build with -# a relative $ac_aux_dir -# -# The reason of the latter failure is that $top_srcdir and $ac_aux_dir -# are both prefixed by $srcdir. In an in-source build this is usually -# harmless because $srcdir is '.', but things will broke when you -# start a VPATH build or use an absolute $srcdir. 
-# -# So we could use something similar to $top_srcdir/$ac_aux_dir/missing, -# iff we strip the leading $srcdir from $ac_aux_dir. That would be: -# am_aux_dir='\$(top_srcdir)/'`expr "$ac_aux_dir" : "$srcdir//*\(.*\)"` -# and then we would define $MISSING as -# MISSING="\${SHELL} $am_aux_dir/missing" -# This will work as long as MISSING is not called from configure, because -# unfortunately $(top_srcdir) has no meaning in configure. -# However there are other variables, like CC, which are often used in -# configure, and could therefore not use this "fixed" $ac_aux_dir. -# -# Another solution, used here, is to always expand $ac_aux_dir to an -# absolute PATH. The drawback is that using absolute paths prevent a -# configured tree to be moved without reconfiguration. - -AC_DEFUN([AM_AUX_DIR_EXPAND], -[AC_REQUIRE([AC_CONFIG_AUX_DIR_DEFAULT])dnl -# Expand $ac_aux_dir to an absolute path. -am_aux_dir=`cd "$ac_aux_dir" && pwd` -]) - -# AM_CONDITIONAL -*- Autoconf -*- - -# Copyright (C) 1997-2020 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# AM_CONDITIONAL(NAME, SHELL-CONDITION) -# ------------------------------------- -# Define a conditional. -AC_DEFUN([AM_CONDITIONAL], -[AC_PREREQ([2.52])dnl - m4_if([$1], [TRUE], [AC_FATAL([$0: invalid condition: $1])], - [$1], [FALSE], [AC_FATAL([$0: invalid condition: $1])])dnl -AC_SUBST([$1_TRUE])dnl -AC_SUBST([$1_FALSE])dnl -_AM_SUBST_NOTMAKE([$1_TRUE])dnl -_AM_SUBST_NOTMAKE([$1_FALSE])dnl -m4_define([_AM_COND_VALUE_$1], [$2])dnl -if $2; then - $1_TRUE= - $1_FALSE='#' -else - $1_TRUE='#' - $1_FALSE= -fi -AC_CONFIG_COMMANDS_PRE( -[if test -z "${$1_TRUE}" && test -z "${$1_FALSE}"; then - AC_MSG_ERROR([[conditional "$1" was never defined. 
-Usually this means the macro was only invoked conditionally.]]) -fi])]) - -# Copyright (C) 1999-2020 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - - -# There are a few dirty hacks below to avoid letting 'AC_PROG_CC' be -# written in clear, in which case automake, when reading aclocal.m4, -# will think it sees a *use*, and therefore will trigger all it's -# C support machinery. Also note that it means that autoscan, seeing -# CC etc. in the Makefile, will ask for an AC_PROG_CC use... - - -# _AM_DEPENDENCIES(NAME) -# ---------------------- -# See how the compiler implements dependency checking. -# NAME is "CC", "CXX", "OBJC", "OBJCXX", "UPC", or "GJC". -# We try a few techniques and use that to set a single cache variable. -# -# We don't AC_REQUIRE the corresponding AC_PROG_CC since the latter was -# modified to invoke _AM_DEPENDENCIES(CC); we would have a circular -# dependency, and given that the user is not expected to run this macro, -# just rely on AC_PROG_CC. -AC_DEFUN([_AM_DEPENDENCIES], -[AC_REQUIRE([AM_SET_DEPDIR])dnl -AC_REQUIRE([AM_OUTPUT_DEPENDENCY_COMMANDS])dnl -AC_REQUIRE([AM_MAKE_INCLUDE])dnl -AC_REQUIRE([AM_DEP_TRACK])dnl - -m4_if([$1], [CC], [depcc="$CC" am_compiler_list=], - [$1], [CXX], [depcc="$CXX" am_compiler_list=], - [$1], [OBJC], [depcc="$OBJC" am_compiler_list='gcc3 gcc'], - [$1], [OBJCXX], [depcc="$OBJCXX" am_compiler_list='gcc3 gcc'], - [$1], [UPC], [depcc="$UPC" am_compiler_list=], - [$1], [GCJ], [depcc="$GCJ" am_compiler_list='gcc3 gcc'], - [depcc="$$1" am_compiler_list=]) - -AC_CACHE_CHECK([dependency style of $depcc], - [am_cv_$1_dependencies_compiler_type], -[if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then - # We make a subdir and do the tests there. Otherwise we can end up - # making bogus files that we don't know about and never remove. 
For - # instance it was reported that on HP-UX the gcc test will end up - # making a dummy file named 'D' -- because '-MD' means "put the output - # in D". - rm -rf conftest.dir - mkdir conftest.dir - # Copy depcomp to subdir because otherwise we won't find it if we're - # using a relative directory. - cp "$am_depcomp" conftest.dir - cd conftest.dir - # We will build objects and dependencies in a subdirectory because - # it helps to detect inapplicable dependency modes. For instance - # both Tru64's cc and ICC support -MD to output dependencies as a - # side effect of compilation, but ICC will put the dependencies in - # the current directory while Tru64 will put them in the object - # directory. - mkdir sub - - am_cv_$1_dependencies_compiler_type=none - if test "$am_compiler_list" = ""; then - am_compiler_list=`sed -n ['s/^#*\([a-zA-Z0-9]*\))$/\1/p'] < ./depcomp` - fi - am__universal=false - m4_case([$1], [CC], - [case " $depcc " in #( - *\ -arch\ *\ -arch\ *) am__universal=true ;; - esac], - [CXX], - [case " $depcc " in #( - *\ -arch\ *\ -arch\ *) am__universal=true ;; - esac]) - - for depmode in $am_compiler_list; do - # Setup a source with many dependencies, because some compilers - # like to wrap large dependency lists on column 80 (with \), and - # we should not choose a depcomp mode which is confused by this. - # - # We need to recreate these files for each test, as the compiler may - # overwrite some of them when testing with obscure command lines. - # This happens at least with the AIX C compiler. - : > sub/conftest.c - for i in 1 2 3 4 5 6; do - echo '#include "conftst'$i'.h"' >> sub/conftest.c - # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with - # Solaris 10 /bin/sh. - echo '/* dummy */' > sub/conftst$i.h - done - echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf - - # We check with '-c' and '-o' for the sake of the "dashmstdout" - # mode. 
It turns out that the SunPro C++ compiler does not properly - # handle '-M -o', and we need to detect this. Also, some Intel - # versions had trouble with output in subdirs. - am__obj=sub/conftest.${OBJEXT-o} - am__minus_obj="-o $am__obj" - case $depmode in - gcc) - # This depmode causes a compiler race in universal mode. - test "$am__universal" = false || continue - ;; - nosideeffect) - # After this tag, mechanisms are not by side-effect, so they'll - # only be used when explicitly requested. - if test "x$enable_dependency_tracking" = xyes; then - continue - else - break - fi - ;; - msvc7 | msvc7msys | msvisualcpp | msvcmsys) - # This compiler won't grok '-c -o', but also, the minuso test has - # not run yet. These depmodes are late enough in the game, and - # so weak that their functioning should not be impacted. - am__obj=conftest.${OBJEXT-o} - am__minus_obj= - ;; - none) break ;; - esac - if depmode=$depmode \ - source=sub/conftest.c object=$am__obj \ - depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \ - $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \ - >/dev/null 2>conftest.err && - grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 && - grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 && - grep $am__obj sub/conftest.Po > /dev/null 2>&1 && - ${MAKE-make} -s -f confmf > /dev/null 2>&1; then - # icc doesn't choke on unknown options, it will just issue warnings - # or remarks (even with -Werror). So we grep stderr for any message - # that says an option was ignored or not supported. - # When given -MP, icc 7.0 and 7.1 complain thusly: - # icc: Command line warning: ignoring option '-M'; no argument required - # The diagnosis changed in icc 8.0: - # icc: Command line remark: option '-MP' not supported - if (grep 'ignoring option' conftest.err || - grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else - am_cv_$1_dependencies_compiler_type=$depmode - break - fi - fi - done - - cd .. 
- rm -rf conftest.dir -else - am_cv_$1_dependencies_compiler_type=none -fi -]) -AC_SUBST([$1DEPMODE], [depmode=$am_cv_$1_dependencies_compiler_type]) -AM_CONDITIONAL([am__fastdep$1], [ - test "x$enable_dependency_tracking" != xno \ - && test "$am_cv_$1_dependencies_compiler_type" = gcc3]) -]) - - -# AM_SET_DEPDIR -# ------------- -# Choose a directory name for dependency files. -# This macro is AC_REQUIREd in _AM_DEPENDENCIES. -AC_DEFUN([AM_SET_DEPDIR], -[AC_REQUIRE([AM_SET_LEADING_DOT])dnl -AC_SUBST([DEPDIR], ["${am__leading_dot}deps"])dnl -]) - - -# AM_DEP_TRACK -# ------------ -AC_DEFUN([AM_DEP_TRACK], -[AC_ARG_ENABLE([dependency-tracking], [dnl -AS_HELP_STRING( - [--enable-dependency-tracking], - [do not reject slow dependency extractors]) -AS_HELP_STRING( - [--disable-dependency-tracking], - [speeds up one-time build])]) -if test "x$enable_dependency_tracking" != xno; then - am_depcomp="$ac_aux_dir/depcomp" - AMDEPBACKSLASH='\' - am__nodep='_no' -fi -AM_CONDITIONAL([AMDEP], [test "x$enable_dependency_tracking" != xno]) -AC_SUBST([AMDEPBACKSLASH])dnl -_AM_SUBST_NOTMAKE([AMDEPBACKSLASH])dnl -AC_SUBST([am__nodep])dnl -_AM_SUBST_NOTMAKE([am__nodep])dnl -]) - -# Generate code to set up dependency tracking. -*- Autoconf -*- - -# Copyright (C) 1999-2020 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# _AM_OUTPUT_DEPENDENCY_COMMANDS -# ------------------------------ -AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS], -[{ - # Older Autoconf quotes --file arguments for eval, but not when files - # are listed without --file. Let's play safe and only enable the eval - # if we detect the quoting. - # TODO: see whether this extra hack can be removed once we start - # requiring Autoconf 2.70 or later. 
- AS_CASE([$CONFIG_FILES], - [*\'*], [eval set x "$CONFIG_FILES"], - [*], [set x $CONFIG_FILES]) - shift - # Used to flag and report bootstrapping failures. - am_rc=0 - for am_mf - do - # Strip MF so we end up with the name of the file. - am_mf=`AS_ECHO(["$am_mf"]) | sed -e 's/:.*$//'` - # Check whether this is an Automake generated Makefile which includes - # dependency-tracking related rules and includes. - # Grep'ing the whole file directly is not great: AIX grep has a line - # limit of 2048, but all sed's we know have understand at least 4000. - sed -n 's,^am--depfiles:.*,X,p' "$am_mf" | grep X >/dev/null 2>&1 \ - || continue - am_dirpart=`AS_DIRNAME(["$am_mf"])` - am_filepart=`AS_BASENAME(["$am_mf"])` - AM_RUN_LOG([cd "$am_dirpart" \ - && sed -e '/# am--include-marker/d' "$am_filepart" \ - | $MAKE -f - am--depfiles]) || am_rc=$? - done - if test $am_rc -ne 0; then - AC_MSG_FAILURE([Something went wrong bootstrapping makefile fragments - for automatic dependency tracking. If GNU make was not used, consider - re-running the configure script with MAKE="gmake" (or whatever is - necessary). You can also try re-running configure with the - '--disable-dependency-tracking' option to at least be able to build - the package (albeit without support for automatic dependency tracking).]) - fi - AS_UNSET([am_dirpart]) - AS_UNSET([am_filepart]) - AS_UNSET([am_mf]) - AS_UNSET([am_rc]) - rm -f conftest-deps.mk -} -])# _AM_OUTPUT_DEPENDENCY_COMMANDS - - -# AM_OUTPUT_DEPENDENCY_COMMANDS -# ----------------------------- -# This macro should only be invoked once -- use via AC_REQUIRE. -# -# This code is only required when automatic dependency tracking is enabled. -# This creates each '.Po' and '.Plo' makefile fragment that we'll need in -# order to bootstrap the dependency handling code. 
-AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS], -[AC_CONFIG_COMMANDS([depfiles], - [test x"$AMDEP_TRUE" != x"" || _AM_OUTPUT_DEPENDENCY_COMMANDS], - [AMDEP_TRUE="$AMDEP_TRUE" MAKE="${MAKE-make}"])]) - -# Do all the work for Automake. -*- Autoconf -*- - -# Copyright (C) 1996-2020 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# This macro actually does too much. Some checks are only needed if -# your package does certain things. But this isn't really a big deal. - -dnl Redefine AC_PROG_CC to automatically invoke _AM_PROG_CC_C_O. -m4_define([AC_PROG_CC], -m4_defn([AC_PROG_CC]) -[_AM_PROG_CC_C_O -]) - -# AM_INIT_AUTOMAKE(PACKAGE, VERSION, [NO-DEFINE]) -# AM_INIT_AUTOMAKE([OPTIONS]) -# ----------------------------------------------- -# The call with PACKAGE and VERSION arguments is the old style -# call (pre autoconf-2.50), which is being phased out. PACKAGE -# and VERSION should now be passed to AC_INIT and removed from -# the call to AM_INIT_AUTOMAKE. -# We support both call styles for the transition. After -# the next Automake release, Autoconf can make the AC_INIT -# arguments mandatory, and then we can depend on a new Autoconf -# release and drop the old call support. -AC_DEFUN([AM_INIT_AUTOMAKE], -[AC_PREREQ([2.65])dnl -dnl Autoconf wants to disallow AM_ names. We explicitly allow -dnl the ones we care about. -m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl -AC_REQUIRE([AM_SET_CURRENT_AUTOMAKE_VERSION])dnl -AC_REQUIRE([AC_PROG_INSTALL])dnl -if test "`cd $srcdir && pwd`" != "`pwd`"; then - # Use -I$(srcdir) only when $(srcdir) != ., so that make's output - # is not polluted with repeated "-I." 
- AC_SUBST([am__isrc], [' -I$(srcdir)'])_AM_SUBST_NOTMAKE([am__isrc])dnl - # test to see if srcdir already configured - if test -f $srcdir/config.status; then - AC_MSG_ERROR([source directory already configured; run "make distclean" there first]) - fi -fi - -# test whether we have cygpath -if test -z "$CYGPATH_W"; then - if (cygpath --version) >/dev/null 2>/dev/null; then - CYGPATH_W='cygpath -w' - else - CYGPATH_W=echo - fi -fi -AC_SUBST([CYGPATH_W]) - -# Define the identity of the package. -dnl Distinguish between old-style and new-style calls. -m4_ifval([$2], -[AC_DIAGNOSE([obsolete], - [$0: two- and three-arguments forms are deprecated.]) -m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl - AC_SUBST([PACKAGE], [$1])dnl - AC_SUBST([VERSION], [$2])], -[_AM_SET_OPTIONS([$1])dnl -dnl Diagnose old-style AC_INIT with new-style AM_AUTOMAKE_INIT. -m4_if( - m4_ifdef([AC_PACKAGE_NAME], [ok]):m4_ifdef([AC_PACKAGE_VERSION], [ok]), - [ok:ok],, - [m4_fatal([AC_INIT should be called with package and version arguments])])dnl - AC_SUBST([PACKAGE], ['AC_PACKAGE_TARNAME'])dnl - AC_SUBST([VERSION], ['AC_PACKAGE_VERSION'])])dnl - -_AM_IF_OPTION([no-define],, -[AC_DEFINE_UNQUOTED([PACKAGE], ["$PACKAGE"], [Name of package]) - AC_DEFINE_UNQUOTED([VERSION], ["$VERSION"], [Version number of package])])dnl - -# Some tools Automake needs. -AC_REQUIRE([AM_SANITY_CHECK])dnl -AC_REQUIRE([AC_ARG_PROGRAM])dnl -AM_MISSING_PROG([ACLOCAL], [aclocal-${am__api_version}]) -AM_MISSING_PROG([AUTOCONF], [autoconf]) -AM_MISSING_PROG([AUTOMAKE], [automake-${am__api_version}]) -AM_MISSING_PROG([AUTOHEADER], [autoheader]) -AM_MISSING_PROG([MAKEINFO], [makeinfo]) -AC_REQUIRE([AM_PROG_INSTALL_SH])dnl -AC_REQUIRE([AM_PROG_INSTALL_STRIP])dnl -AC_REQUIRE([AC_PROG_MKDIR_P])dnl -# For better backward compatibility. To be removed once Automake 1.9.x -# dies out for good. For more background, see: -# -# -AC_SUBST([mkdir_p], ['$(MKDIR_P)']) -# We need awk for the "check" target (and possibly the TAP driver). 
The -# system "awk" is bad on some platforms. -AC_REQUIRE([AC_PROG_AWK])dnl -AC_REQUIRE([AC_PROG_MAKE_SET])dnl -AC_REQUIRE([AM_SET_LEADING_DOT])dnl -_AM_IF_OPTION([tar-ustar], [_AM_PROG_TAR([ustar])], - [_AM_IF_OPTION([tar-pax], [_AM_PROG_TAR([pax])], - [_AM_PROG_TAR([v7])])]) -_AM_IF_OPTION([no-dependencies],, -[AC_PROVIDE_IFELSE([AC_PROG_CC], - [_AM_DEPENDENCIES([CC])], - [m4_define([AC_PROG_CC], - m4_defn([AC_PROG_CC])[_AM_DEPENDENCIES([CC])])])dnl -AC_PROVIDE_IFELSE([AC_PROG_CXX], - [_AM_DEPENDENCIES([CXX])], - [m4_define([AC_PROG_CXX], - m4_defn([AC_PROG_CXX])[_AM_DEPENDENCIES([CXX])])])dnl -AC_PROVIDE_IFELSE([AC_PROG_OBJC], - [_AM_DEPENDENCIES([OBJC])], - [m4_define([AC_PROG_OBJC], - m4_defn([AC_PROG_OBJC])[_AM_DEPENDENCIES([OBJC])])])dnl -AC_PROVIDE_IFELSE([AC_PROG_OBJCXX], - [_AM_DEPENDENCIES([OBJCXX])], - [m4_define([AC_PROG_OBJCXX], - m4_defn([AC_PROG_OBJCXX])[_AM_DEPENDENCIES([OBJCXX])])])dnl -]) -AC_REQUIRE([AM_SILENT_RULES])dnl -dnl The testsuite driver may need to know about EXEEXT, so add the -dnl 'am__EXEEXT' conditional if _AM_COMPILER_EXEEXT was seen. This -dnl macro is hooked onto _AC_COMPILER_EXEEXT early, see below. -AC_CONFIG_COMMANDS_PRE(dnl -[m4_provide_if([_AM_COMPILER_EXEEXT], - [AM_CONDITIONAL([am__EXEEXT], [test -n "$EXEEXT"])])])dnl - -# POSIX will say in a future version that running "rm -f" with no argument -# is OK; and we want to be able to make that assumption in our Makefile -# recipes. So use an aggressive probe to check that the usage we want is -# actually supported "in the wild" to an acceptable degree. -# See automake bug#10828. -# To make any issue more visible, cause the running configure to be aborted -# by default if the 'rm' program in use doesn't match our expectations; the -# user can still override this though. -if rm -f && rm -fr && rm -rf; then : OK; else - cat >&2 <<'END' -Oops! - -Your 'rm' program seems unable to run without file operands specified -on the command line, even when the '-f' option is present. 
This is contrary -to the behaviour of most rm programs out there, and not conforming with -the upcoming POSIX standard: - -Please tell bug-automake@gnu.org about your system, including the value -of your $PATH and any error possibly output before this message. This -can help us improve future automake versions. - -END - if test x"$ACCEPT_INFERIOR_RM_PROGRAM" = x"yes"; then - echo 'Configuration will proceed anyway, since you have set the' >&2 - echo 'ACCEPT_INFERIOR_RM_PROGRAM variable to "yes"' >&2 - echo >&2 - else - cat >&2 <<'END' -Aborting the configuration process, to ensure you take notice of the issue. - -You can download and install GNU coreutils to get an 'rm' implementation -that behaves properly: . - -If you want to complete the configuration process using your problematic -'rm' anyway, export the environment variable ACCEPT_INFERIOR_RM_PROGRAM -to "yes", and re-run configure. - -END - AC_MSG_ERROR([Your 'rm' program is bad, sorry.]) - fi -fi -dnl The trailing newline in this macro's definition is deliberate, for -dnl backward compatibility and to allow trailing 'dnl'-style comments -dnl after the AM_INIT_AUTOMAKE invocation. See automake bug#16841. -]) - -dnl Hook into '_AC_COMPILER_EXEEXT' early to learn its expansion. Do not -dnl add the conditional right here, as _AC_COMPILER_EXEEXT may be further -dnl mangled by Autoconf and run in a shell conditional statement. -m4_define([_AC_COMPILER_EXEEXT], -m4_defn([_AC_COMPILER_EXEEXT])[m4_provide([_AM_COMPILER_EXEEXT])]) - -# When config.status generates a header, we must update the stamp-h file. -# This file resides in the same directory as the config header -# that is generated. The stamp files are numbered to have different names. - -# Autoconf calls _AC_AM_CONFIG_HEADER_HOOK (when defined) in the -# loop where config.status creates the headers, so we can generate -# our stamp files there. -AC_DEFUN([_AC_AM_CONFIG_HEADER_HOOK], -[# Compute $1's index in $config_headers. 
-_am_arg=$1 -_am_stamp_count=1 -for _am_header in $config_headers :; do - case $_am_header in - $_am_arg | $_am_arg:* ) - break ;; - * ) - _am_stamp_count=`expr $_am_stamp_count + 1` ;; - esac -done -echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count]) - -# Copyright (C) 2001-2020 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# AM_PROG_INSTALL_SH -# ------------------ -# Define $install_sh. -AC_DEFUN([AM_PROG_INSTALL_SH], -[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl -if test x"${install_sh+set}" != xset; then - case $am_aux_dir in - *\ * | *\ *) - install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;; - *) - install_sh="\${SHELL} $am_aux_dir/install-sh" - esac -fi -AC_SUBST([install_sh])]) - -# Copyright (C) 2003-2020 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# Check whether the underlying file-system supports filenames -# with a leading dot. For instance MS-DOS doesn't. -AC_DEFUN([AM_SET_LEADING_DOT], -[rm -rf .tst 2>/dev/null -mkdir .tst 2>/dev/null -if test -d .tst; then - am__leading_dot=. -else - am__leading_dot=_ -fi -rmdir .tst 2>/dev/null -AC_SUBST([am__leading_dot])]) - -# Check to see how 'make' treats includes. -*- Autoconf -*- - -# Copyright (C) 2001-2020 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# AM_MAKE_INCLUDE() -# ----------------- -# Check whether make has an 'include' directive that can support all -# the idioms we need for our automatic dependency tracking code. 
-AC_DEFUN([AM_MAKE_INCLUDE], -[AC_MSG_CHECKING([whether ${MAKE-make} supports the include directive]) -cat > confinc.mk << 'END' -am__doit: - @echo this is the am__doit target >confinc.out -.PHONY: am__doit -END -am__include="#" -am__quote= -# BSD make does it like this. -echo '.include "confinc.mk" # ignored' > confmf.BSD -# Other make implementations (GNU, Solaris 10, AIX) do it like this. -echo 'include confinc.mk # ignored' > confmf.GNU -_am_result=no -for s in GNU BSD; do - AM_RUN_LOG([${MAKE-make} -f confmf.$s && cat confinc.out]) - AS_CASE([$?:`cat confinc.out 2>/dev/null`], - ['0:this is the am__doit target'], - [AS_CASE([$s], - [BSD], [am__include='.include' am__quote='"'], - [am__include='include' am__quote=''])]) - if test "$am__include" != "#"; then - _am_result="yes ($s style)" - break - fi -done -rm -f confinc.* confmf.* -AC_MSG_RESULT([${_am_result}]) -AC_SUBST([am__include])]) -AC_SUBST([am__quote])]) - -# Fake the existence of programs that GNU maintainers use. -*- Autoconf -*- - -# Copyright (C) 1997-2020 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# AM_MISSING_PROG(NAME, PROGRAM) -# ------------------------------ -AC_DEFUN([AM_MISSING_PROG], -[AC_REQUIRE([AM_MISSING_HAS_RUN]) -$1=${$1-"${am_missing_run}$2"} -AC_SUBST($1)]) - -# AM_MISSING_HAS_RUN -# ------------------ -# Define MISSING if not defined so far and test if it is modern enough. -# If it is, set am_missing_run to use it, otherwise, to nothing. 
-AC_DEFUN([AM_MISSING_HAS_RUN], -[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl -AC_REQUIRE_AUX_FILE([missing])dnl -if test x"${MISSING+set}" != xset; then - case $am_aux_dir in - *\ * | *\ *) - MISSING="\${SHELL} \"$am_aux_dir/missing\"" ;; - *) - MISSING="\${SHELL} $am_aux_dir/missing" ;; - esac -fi -# Use eval to expand $SHELL -if eval "$MISSING --is-lightweight"; then - am_missing_run="$MISSING " -else - am_missing_run= - AC_MSG_WARN(['missing' script is too old or missing]) -fi -]) - -# Helper functions for option handling. -*- Autoconf -*- - -# Copyright (C) 2001-2020 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# _AM_MANGLE_OPTION(NAME) -# ----------------------- -AC_DEFUN([_AM_MANGLE_OPTION], -[[_AM_OPTION_]m4_bpatsubst($1, [[^a-zA-Z0-9_]], [_])]) - -# _AM_SET_OPTION(NAME) -# -------------------- -# Set option NAME. Presently that only means defining a flag for this option. -AC_DEFUN([_AM_SET_OPTION], -[m4_define(_AM_MANGLE_OPTION([$1]), [1])]) - -# _AM_SET_OPTIONS(OPTIONS) -# ------------------------ -# OPTIONS is a space-separated list of Automake options. -AC_DEFUN([_AM_SET_OPTIONS], -[m4_foreach_w([_AM_Option], [$1], [_AM_SET_OPTION(_AM_Option)])]) - -# _AM_IF_OPTION(OPTION, IF-SET, [IF-NOT-SET]) -# ------------------------------------------- -# Execute IF-SET if OPTION is set, IF-NOT-SET otherwise. -AC_DEFUN([_AM_IF_OPTION], -[m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])]) - -# Copyright (C) 1999-2020 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# _AM_PROG_CC_C_O -# --------------- -# Like AC_PROG_CC_C_O, but changed for automake. We rewrite AC_PROG_CC -# to automatically call this. 
-AC_DEFUN([_AM_PROG_CC_C_O], -[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl -AC_REQUIRE_AUX_FILE([compile])dnl -AC_LANG_PUSH([C])dnl -AC_CACHE_CHECK( - [whether $CC understands -c and -o together], - [am_cv_prog_cc_c_o], - [AC_LANG_CONFTEST([AC_LANG_PROGRAM([])]) - # Make sure it works both with $CC and with simple cc. - # Following AC_PROG_CC_C_O, we do the test twice because some - # compilers refuse to overwrite an existing .o file with -o, - # though they will create one. - am_cv_prog_cc_c_o=yes - for am_i in 1 2; do - if AM_RUN_LOG([$CC -c conftest.$ac_ext -o conftest2.$ac_objext]) \ - && test -f conftest2.$ac_objext; then - : OK - else - am_cv_prog_cc_c_o=no - break - fi - done - rm -f core conftest* - unset am_i]) -if test "$am_cv_prog_cc_c_o" != yes; then - # Losing compiler, so override with the script. - # FIXME: It is wrong to rewrite CC. - # But if we don't then we get into trouble of one sort or another. - # A longer-term fix would be to have automake use am__CC in this case, - # and then we could set am__CC="\$(top_srcdir)/compile \$(CC)" - CC="$am_aux_dir/compile $CC" -fi -AC_LANG_POP([C])]) - -# For backward compatibility. -AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])]) - -# Copyright (C) 2001-2020 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# AM_RUN_LOG(COMMAND) -# ------------------- -# Run COMMAND, save the exit status in ac_status, and log it. -# (This has been adapted from Autoconf's _AC_RUN_LOG macro.) -AC_DEFUN([AM_RUN_LOG], -[{ echo "$as_me:$LINENO: $1" >&AS_MESSAGE_LOG_FD - ($1) >&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD - ac_status=$? - echo "$as_me:$LINENO: \$? = $ac_status" >&AS_MESSAGE_LOG_FD - (exit $ac_status); }]) - -# Check to make sure that the build environment is sane. -*- Autoconf -*- - -# Copyright (C) 1996-2020 Free Software Foundation, Inc. 
-# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# AM_SANITY_CHECK -# --------------- -AC_DEFUN([AM_SANITY_CHECK], -[AC_MSG_CHECKING([whether build environment is sane]) -# Reject unsafe characters in $srcdir or the absolute working directory -# name. Accept space and tab only in the latter. -am_lf=' -' -case `pwd` in - *[[\\\"\#\$\&\'\`$am_lf]]*) - AC_MSG_ERROR([unsafe absolute working directory name]);; -esac -case $srcdir in - *[[\\\"\#\$\&\'\`$am_lf\ \ ]]*) - AC_MSG_ERROR([unsafe srcdir value: '$srcdir']);; -esac - -# Do 'set' in a subshell so we don't clobber the current shell's -# arguments. Must try -L first in case configure is actually a -# symlink; some systems play weird games with the mod time of symlinks -# (eg FreeBSD returns the mod time of the symlink's containing -# directory). -if ( - am_has_slept=no - for am_try in 1 2; do - echo "timestamp, slept: $am_has_slept" > conftest.file - set X `ls -Lt "$srcdir/configure" conftest.file 2> /dev/null` - if test "$[*]" = "X"; then - # -L didn't work. - set X `ls -t "$srcdir/configure" conftest.file` - fi - if test "$[*]" != "X $srcdir/configure conftest.file" \ - && test "$[*]" != "X conftest.file $srcdir/configure"; then - - # If neither matched, then we have a broken ls. This can happen - # if, for instance, CONFIG_SHELL is bash and it inherits a - # broken ls alias from the environment. This has actually - # happened. Such a system could not be considered "sane". - AC_MSG_ERROR([ls -t appears to fail. Make sure there is not a broken - alias in your environment]) - fi - if test "$[2]" = conftest.file || test $am_try -eq 2; then - break - fi - # Just in case. - sleep 1 - am_has_slept=yes - done - test "$[2]" = conftest.file - ) -then - # Ok. - : -else - AC_MSG_ERROR([newly created file is older than distributed files! 
-Check your system clock]) -fi -AC_MSG_RESULT([yes]) -# If we didn't sleep, we still need to ensure time stamps of config.status and -# generated files are strictly newer. -am_sleep_pid= -if grep 'slept: no' conftest.file >/dev/null 2>&1; then - ( sleep 1 ) & - am_sleep_pid=$! -fi -AC_CONFIG_COMMANDS_PRE( - [AC_MSG_CHECKING([that generated files are newer than configure]) - if test -n "$am_sleep_pid"; then - # Hide warnings about reused PIDs. - wait $am_sleep_pid 2>/dev/null - fi - AC_MSG_RESULT([done])]) -rm -f conftest.file -]) - -# Copyright (C) 2009-2020 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# AM_SILENT_RULES([DEFAULT]) -# -------------------------- -# Enable less verbose build rules; with the default set to DEFAULT -# ("yes" being less verbose, "no" or empty being verbose). -AC_DEFUN([AM_SILENT_RULES], -[AC_ARG_ENABLE([silent-rules], [dnl -AS_HELP_STRING( - [--enable-silent-rules], - [less verbose build output (undo: "make V=1")]) -AS_HELP_STRING( - [--disable-silent-rules], - [verbose build output (undo: "make V=0")])dnl -]) -case $enable_silent_rules in @%:@ ((( - yes) AM_DEFAULT_VERBOSITY=0;; - no) AM_DEFAULT_VERBOSITY=1;; - *) AM_DEFAULT_VERBOSITY=m4_if([$1], [yes], [0], [1]);; -esac -dnl -dnl A few 'make' implementations (e.g., NonStop OS and NextStep) -dnl do not support nested variable expansions. -dnl See automake bug#9928 and bug#10237. 
-am_make=${MAKE-make} -AC_CACHE_CHECK([whether $am_make supports nested variables], - [am_cv_make_support_nested_variables], - [if AS_ECHO([['TRUE=$(BAR$(V)) -BAR0=false -BAR1=true -V=1 -am__doit: - @$(TRUE) -.PHONY: am__doit']]) | $am_make -f - >/dev/null 2>&1; then - am_cv_make_support_nested_variables=yes -else - am_cv_make_support_nested_variables=no -fi]) -if test $am_cv_make_support_nested_variables = yes; then - dnl Using '$V' instead of '$(V)' breaks IRIX make. - AM_V='$(V)' - AM_DEFAULT_V='$(AM_DEFAULT_VERBOSITY)' -else - AM_V=$AM_DEFAULT_VERBOSITY - AM_DEFAULT_V=$AM_DEFAULT_VERBOSITY -fi -AC_SUBST([AM_V])dnl -AM_SUBST_NOTMAKE([AM_V])dnl -AC_SUBST([AM_DEFAULT_V])dnl -AM_SUBST_NOTMAKE([AM_DEFAULT_V])dnl -AC_SUBST([AM_DEFAULT_VERBOSITY])dnl -AM_BACKSLASH='\' -AC_SUBST([AM_BACKSLASH])dnl -_AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl -]) - -# Copyright (C) 2001-2020 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# AM_PROG_INSTALL_STRIP -# --------------------- -# One issue with vendor 'install' (even GNU) is that you can't -# specify the program used to strip binaries. This is especially -# annoying in cross-compiling environments, where the build's strip -# is unlikely to handle the host's binaries. -# Fortunately install-sh will honor a STRIPPROG variable, so we -# always use install-sh in "make install-strip", and initialize -# STRIPPROG with the value of the STRIP variable (set by the user). -AC_DEFUN([AM_PROG_INSTALL_STRIP], -[AC_REQUIRE([AM_PROG_INSTALL_SH])dnl -# Installed binaries are usually stripped using 'strip' when the user -# run "make install-strip". However 'strip' might not be the right -# tool to use in cross-compilation environments, therefore Automake -# will honor the 'STRIP' environment variable to overrule this program. 
-dnl Don't test for $cross_compiling = yes, because it might be 'maybe'. -if test "$cross_compiling" != no; then - AC_CHECK_TOOL([STRIP], [strip], :) -fi -INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s" -AC_SUBST([INSTALL_STRIP_PROGRAM])]) - -# Copyright (C) 2006-2020 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# _AM_SUBST_NOTMAKE(VARIABLE) -# --------------------------- -# Prevent Automake from outputting VARIABLE = @VARIABLE@ in Makefile.in. -# This macro is traced by Automake. -AC_DEFUN([_AM_SUBST_NOTMAKE]) - -# AM_SUBST_NOTMAKE(VARIABLE) -# -------------------------- -# Public sister of _AM_SUBST_NOTMAKE. -AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)]) - -# Check how to create a tarball. -*- Autoconf -*- - -# Copyright (C) 2004-2020 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# _AM_PROG_TAR(FORMAT) -# -------------------- -# Check how to create a tarball in format FORMAT. -# FORMAT should be one of 'v7', 'ustar', or 'pax'. -# -# Substitute a variable $(am__tar) that is a command -# writing to stdout a FORMAT-tarball containing the directory -# $tardir. -# tardir=directory && $(am__tar) > result.tar -# -# Substitute a variable $(am__untar) that extract such -# a tarball read from stdin. -# $(am__untar) < result.tar -# -AC_DEFUN([_AM_PROG_TAR], -[# Always define AMTAR for backward compatibility. Yes, it's still used -# in the wild :-( We should find a proper way to deprecate it ... -AC_SUBST([AMTAR], ['$${TAR-tar}']) - -# We'll loop over all known methods to create a tar archive until one works. 
-_am_tools='gnutar m4_if([$1], [ustar], [plaintar]) pax cpio none' - -m4_if([$1], [v7], - [am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -'], - - [m4_case([$1], - [ustar], - [# The POSIX 1988 'ustar' format is defined with fixed-size fields. - # There is notably a 21 bits limit for the UID and the GID. In fact, - # the 'pax' utility can hang on bigger UID/GID (see automake bug#8343 - # and bug#13588). - am_max_uid=2097151 # 2^21 - 1 - am_max_gid=$am_max_uid - # The $UID and $GID variables are not portable, so we need to resort - # to the POSIX-mandated id(1) utility. Errors in the 'id' calls - # below are definitely unexpected, so allow the users to see them - # (that is, avoid stderr redirection). - am_uid=`id -u || echo unknown` - am_gid=`id -g || echo unknown` - AC_MSG_CHECKING([whether UID '$am_uid' is supported by ustar format]) - if test $am_uid -le $am_max_uid; then - AC_MSG_RESULT([yes]) - else - AC_MSG_RESULT([no]) - _am_tools=none - fi - AC_MSG_CHECKING([whether GID '$am_gid' is supported by ustar format]) - if test $am_gid -le $am_max_gid; then - AC_MSG_RESULT([yes]) - else - AC_MSG_RESULT([no]) - _am_tools=none - fi], - - [pax], - [], - - [m4_fatal([Unknown tar format])]) - - AC_MSG_CHECKING([how to create a $1 tar archive]) - - # Go ahead even if we have the value already cached. We do so because we - # need to set the values for the 'am__tar' and 'am__untar' variables. - _am_tools=${am_cv_prog_tar_$1-$_am_tools} - - for _am_tool in $_am_tools; do - case $_am_tool in - gnutar) - for _am_tar in tar gnutar gtar; do - AM_RUN_LOG([$_am_tar --version]) && break - done - am__tar="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$$tardir"' - am__tar_="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$tardir"' - am__untar="$_am_tar -xf -" - ;; - plaintar) - # Must skip GNU tar: if it does not support --format= it doesn't create - # ustar tarball either. 
- (tar --version) >/dev/null 2>&1 && continue - am__tar='tar chf - "$$tardir"' - am__tar_='tar chf - "$tardir"' - am__untar='tar xf -' - ;; - pax) - am__tar='pax -L -x $1 -w "$$tardir"' - am__tar_='pax -L -x $1 -w "$tardir"' - am__untar='pax -r' - ;; - cpio) - am__tar='find "$$tardir" -print | cpio -o -H $1 -L' - am__tar_='find "$tardir" -print | cpio -o -H $1 -L' - am__untar='cpio -i -H $1 -d' - ;; - none) - am__tar=false - am__tar_=false - am__untar=false - ;; - esac - - # If the value was cached, stop now. We just wanted to have am__tar - # and am__untar set. - test -n "${am_cv_prog_tar_$1}" && break - - # tar/untar a dummy directory, and stop if the command works. - rm -rf conftest.dir - mkdir conftest.dir - echo GrepMe > conftest.dir/file - AM_RUN_LOG([tardir=conftest.dir && eval $am__tar_ >conftest.tar]) - rm -rf conftest.dir - if test -s conftest.tar; then - AM_RUN_LOG([$am__untar /dev/null 2>&1 && break - fi - done - rm -rf conftest.dir - - AC_CACHE_VAL([am_cv_prog_tar_$1], [am_cv_prog_tar_$1=$_am_tool]) - AC_MSG_RESULT([$am_cv_prog_tar_$1])]) - -AC_SUBST([am__tar]) -AC_SUBST([am__untar]) -]) # _AM_PROG_TAR - -m4_include([m4/ax_pthread.m4]) -m4_include([m4/libtool.m4]) -m4_include([m4/ltoptions.m4]) -m4_include([m4/ltsugar.m4]) -m4_include([m4/ltversion.m4]) -m4_include([m4/lt~obsolete.m4]) -m4_include([m4/pcre2_visibility.m4]) diff --git a/pcre2/ar-lib b/pcre2/ar-lib deleted file mode 100755 index 1e9388e2a..000000000 --- a/pcre2/ar-lib +++ /dev/null @@ -1,271 +0,0 @@ -#! /bin/sh -# Wrapper for Microsoft lib.exe - -me=ar-lib -scriptversion=2019-07-04.01; # UTC - -# Copyright (C) 2010-2020 Free Software Foundation, Inc. -# Written by Peter Rosin . -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2, or (at your option) -# any later version. 
-# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . - -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - -# This file is maintained in Automake, please report -# bugs to or send patches to -# . - - -# func_error message -func_error () -{ - echo "$me: $1" 1>&2 - exit 1 -} - -file_conv= - -# func_file_conv build_file -# Convert a $build file to $host form and store it in $file -# Currently only supports Windows hosts. -func_file_conv () -{ - file=$1 - case $file in - / | /[!/]*) # absolute file, and not a UNC file - if test -z "$file_conv"; then - # lazily determine how to convert abs files - case `uname -s` in - MINGW*) - file_conv=mingw - ;; - CYGWIN* | MSYS*) - file_conv=cygwin - ;; - *) - file_conv=wine - ;; - esac - fi - case $file_conv in - mingw) - file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'` - ;; - cygwin | msys) - file=`cygpath -m "$file" || echo "$file"` - ;; - wine) - file=`winepath -w "$file" || echo "$file"` - ;; - esac - ;; - esac -} - -# func_at_file at_file operation archive -# Iterate over all members in AT_FILE performing OPERATION on ARCHIVE -# for each of them. -# When interpreting the content of the @FILE, do NOT use func_file_conv, -# since the user would need to supply preconverted file names to -# binutils ar, at least for MinGW. 
-func_at_file () -{ - operation=$2 - archive=$3 - at_file_contents=`cat "$1"` - eval set x "$at_file_contents" - shift - - for member - do - $AR -NOLOGO $operation:"$member" "$archive" || exit $? - done -} - -case $1 in - '') - func_error "no command. Try '$0 --help' for more information." - ;; - -h | --h*) - cat <_FOUND variable. -# The package is found if all variables listed are TRUE. -# Example: -# -# FIND_PACKAGE_HANDLE_STANDARD_ARGS(LibXml2 DEFAULT_MSG LIBXML2_LIBRARIES LIBXML2_INCLUDE_DIR) -# -# LibXml2 is considered to be found, if both LIBXML2_LIBRARIES and -# LIBXML2_INCLUDE_DIR are valid. Then also LIBXML2_FOUND is set to TRUE. -# If it is not found and REQUIRED was used, it fails with FATAL_ERROR, -# independent whether QUIET was used or not. -# If it is found, the location is reported using the VAR1 argument, so -# here a message "Found LibXml2: /usr/lib/libxml2.so" will be printed out. -# If the second argument is DEFAULT_MSG, the message in the failure case will -# be "Could NOT find LibXml2", if you don't like this message you can specify -# your own custom failure message there. 
- -MACRO(FIND_PACKAGE_HANDLE_STANDARD_ARGS _NAME _FAIL_MSG _VAR1 ) - - IF("${_FAIL_MSG}" STREQUAL "DEFAULT_MSG") - IF (${_NAME}_FIND_REQUIRED) - SET(_FAIL_MESSAGE "Could not find REQUIRED package ${_NAME}") - ELSE (${_NAME}_FIND_REQUIRED) - SET(_FAIL_MESSAGE "Could not find OPTIONAL package ${_NAME}") - ENDIF (${_NAME}_FIND_REQUIRED) - ELSE("${_FAIL_MSG}" STREQUAL "DEFAULT_MSG") - SET(_FAIL_MESSAGE "${_FAIL_MSG}") - ENDIF("${_FAIL_MSG}" STREQUAL "DEFAULT_MSG") - - STRING(TOUPPER ${_NAME} _NAME_UPPER) - - SET(${_NAME_UPPER}_FOUND TRUE) - IF(NOT ${_VAR1}) - SET(${_NAME_UPPER}_FOUND FALSE) - ENDIF(NOT ${_VAR1}) - - FOREACH(_CURRENT_VAR ${ARGN}) - IF(NOT ${_CURRENT_VAR}) - SET(${_NAME_UPPER}_FOUND FALSE) - ENDIF(NOT ${_CURRENT_VAR}) - ENDFOREACH(_CURRENT_VAR) - - IF (${_NAME_UPPER}_FOUND) - IF (NOT ${_NAME}_FIND_QUIETLY) - MESSAGE(STATUS "Found ${_NAME}: ${${_VAR1}}") - ENDIF (NOT ${_NAME}_FIND_QUIETLY) - ELSE (${_NAME_UPPER}_FOUND) - IF (${_NAME}_FIND_REQUIRED) - MESSAGE(FATAL_ERROR "${_FAIL_MESSAGE}") - ELSE (${_NAME}_FIND_REQUIRED) - IF (NOT ${_NAME}_FIND_QUIETLY) - MESSAGE(STATUS "${_FAIL_MESSAGE}") - ENDIF (NOT ${_NAME}_FIND_QUIETLY) - ENDIF (${_NAME}_FIND_REQUIRED) - ENDIF (${_NAME_UPPER}_FOUND) -ENDMACRO(FIND_PACKAGE_HANDLE_STANDARD_ARGS) diff --git a/pcre2/cmake/FindReadline.cmake b/pcre2/cmake/FindReadline.cmake deleted file mode 100644 index 1d4cc5584..000000000 --- a/pcre2/cmake/FindReadline.cmake +++ /dev/null @@ -1,29 +0,0 @@ -# from http://websvn.kde.org/trunk/KDE/kdeedu/cmake/modules/FindReadline.cmake -# http://websvn.kde.org/trunk/KDE/kdeedu/cmake/modules/COPYING-CMAKE-SCRIPTS -# --> BSD licensed -# -# GNU Readline library finder -if(READLINE_INCLUDE_DIR AND READLINE_LIBRARY AND NCURSES_LIBRARY) - set(READLINE_FOUND TRUE) -else(READLINE_INCLUDE_DIR AND READLINE_LIBRARY AND NCURSES_LIBRARY) - FIND_PATH(READLINE_INCLUDE_DIR readline/readline.h - /usr/include/readline - ) - -# 2008-04-22 The next clause used to read like this: -# -# 
FIND_LIBRARY(READLINE_LIBRARY NAMES readline) -# FIND_LIBRARY(NCURSES_LIBRARY NAMES ncurses ) -# include(FindPackageHandleStandardArgs) -# FIND_PACKAGE_HANDLE_STANDARD_ARGS(Readline DEFAULT_MSG NCURSES_LIBRARY READLINE_INCLUDE_DIR READLINE_LIBRARY ) -# -# I was advised to modify it such that it will find an ncurses library if -# required, but not if one was explicitly given, that is, it allows the -# default to be overridden. PH - - FIND_LIBRARY(READLINE_LIBRARY NAMES readline) - include(FindPackageHandleStandardArgs) - FIND_PACKAGE_HANDLE_STANDARD_ARGS(Readline DEFAULT_MSG READLINE_INCLUDE_DIR READLINE_LIBRARY ) - - MARK_AS_ADVANCED(READLINE_INCLUDE_DIR READLINE_LIBRARY) -endif(READLINE_INCLUDE_DIR AND READLINE_LIBRARY AND NCURSES_LIBRARY) diff --git a/pcre2/compile b/pcre2/compile deleted file mode 100755 index 23fcba011..000000000 --- a/pcre2/compile +++ /dev/null @@ -1,348 +0,0 @@ -#! /bin/sh -# Wrapper for compilers which do not understand '-c -o'. - -scriptversion=2018-03-07.03; # UTC - -# Copyright (C) 1999-2020 Free Software Foundation, Inc. -# Written by Tom Tromey . -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2, or (at your option) -# any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . - -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. 
- -# This file is maintained in Automake, please report -# bugs to or send patches to -# . - -nl=' -' - -# We need space, tab and new line, in precisely that order. Quoting is -# there to prevent tools from complaining about whitespace usage. -IFS=" "" $nl" - -file_conv= - -# func_file_conv build_file lazy -# Convert a $build file to $host form and store it in $file -# Currently only supports Windows hosts. If the determined conversion -# type is listed in (the comma separated) LAZY, no conversion will -# take place. -func_file_conv () -{ - file=$1 - case $file in - / | /[!/]*) # absolute file, and not a UNC file - if test -z "$file_conv"; then - # lazily determine how to convert abs files - case `uname -s` in - MINGW*) - file_conv=mingw - ;; - CYGWIN* | MSYS*) - file_conv=cygwin - ;; - *) - file_conv=wine - ;; - esac - fi - case $file_conv/,$2, in - *,$file_conv,*) - ;; - mingw/*) - file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'` - ;; - cygwin/* | msys/*) - file=`cygpath -m "$file" || echo "$file"` - ;; - wine/*) - file=`winepath -w "$file" || echo "$file"` - ;; - esac - ;; - esac -} - -# func_cl_dashL linkdir -# Make cl look for libraries in LINKDIR -func_cl_dashL () -{ - func_file_conv "$1" - if test -z "$lib_path"; then - lib_path=$file - else - lib_path="$lib_path;$file" - fi - linker_opts="$linker_opts -LIBPATH:$file" -} - -# func_cl_dashl library -# Do a library search-path lookup for cl -func_cl_dashl () -{ - lib=$1 - found=no - save_IFS=$IFS - IFS=';' - for dir in $lib_path $LIB - do - IFS=$save_IFS - if $shared && test -f "$dir/$lib.dll.lib"; then - found=yes - lib=$dir/$lib.dll.lib - break - fi - if test -f "$dir/$lib.lib"; then - found=yes - lib=$dir/$lib.lib - break - fi - if test -f "$dir/lib$lib.a"; then - found=yes - lib=$dir/lib$lib.a - break - fi - done - IFS=$save_IFS - - if test "$found" != yes; then - lib=$lib.lib - fi -} - -# func_cl_wrapper cl arg... 
-# Adjust compile command to suit cl -func_cl_wrapper () -{ - # Assume a capable shell - lib_path= - shared=: - linker_opts= - for arg - do - if test -n "$eat"; then - eat= - else - case $1 in - -o) - # configure might choose to run compile as 'compile cc -o foo foo.c'. - eat=1 - case $2 in - *.o | *.[oO][bB][jJ]) - func_file_conv "$2" - set x "$@" -Fo"$file" - shift - ;; - *) - func_file_conv "$2" - set x "$@" -Fe"$file" - shift - ;; - esac - ;; - -I) - eat=1 - func_file_conv "$2" mingw - set x "$@" -I"$file" - shift - ;; - -I*) - func_file_conv "${1#-I}" mingw - set x "$@" -I"$file" - shift - ;; - -l) - eat=1 - func_cl_dashl "$2" - set x "$@" "$lib" - shift - ;; - -l*) - func_cl_dashl "${1#-l}" - set x "$@" "$lib" - shift - ;; - -L) - eat=1 - func_cl_dashL "$2" - ;; - -L*) - func_cl_dashL "${1#-L}" - ;; - -static) - shared=false - ;; - -Wl,*) - arg=${1#-Wl,} - save_ifs="$IFS"; IFS=',' - for flag in $arg; do - IFS="$save_ifs" - linker_opts="$linker_opts $flag" - done - IFS="$save_ifs" - ;; - -Xlinker) - eat=1 - linker_opts="$linker_opts $2" - ;; - -*) - set x "$@" "$1" - shift - ;; - *.cc | *.CC | *.cxx | *.CXX | *.[cC]++) - func_file_conv "$1" - set x "$@" -Tp"$file" - shift - ;; - *.c | *.cpp | *.CPP | *.lib | *.LIB | *.Lib | *.OBJ | *.obj | *.[oO]) - func_file_conv "$1" mingw - set x "$@" "$file" - shift - ;; - *) - set x "$@" "$1" - shift - ;; - esac - fi - shift - done - if test -n "$linker_opts"; then - linker_opts="-link$linker_opts" - fi - exec "$@" $linker_opts - exit 1 -} - -eat= - -case $1 in - '') - echo "$0: No command. Try '$0 --help' for more information." 1>&2 - exit 1; - ;; - -h | --h*) - cat <<\EOF -Usage: compile [--help] [--version] PROGRAM [ARGS] - -Wrapper for compilers which do not understand '-c -o'. -Remove '-o dest.o' from ARGS, run PROGRAM with the remaining -arguments, and rename the output as expected. - -If you are trying to build a whole package this is not the -right script to run: please start by reading the file 'INSTALL'. 
- -Report bugs to . -EOF - exit $? - ;; - -v | --v*) - echo "compile $scriptversion" - exit $? - ;; - cl | *[/\\]cl | cl.exe | *[/\\]cl.exe | \ - icl | *[/\\]icl | icl.exe | *[/\\]icl.exe ) - func_cl_wrapper "$@" # Doesn't return... - ;; -esac - -ofile= -cfile= - -for arg -do - if test -n "$eat"; then - eat= - else - case $1 in - -o) - # configure might choose to run compile as 'compile cc -o foo foo.c'. - # So we strip '-o arg' only if arg is an object. - eat=1 - case $2 in - *.o | *.obj) - ofile=$2 - ;; - *) - set x "$@" -o "$2" - shift - ;; - esac - ;; - *.c) - cfile=$1 - set x "$@" "$1" - shift - ;; - *) - set x "$@" "$1" - shift - ;; - esac - fi - shift -done - -if test -z "$ofile" || test -z "$cfile"; then - # If no '-o' option was seen then we might have been invoked from a - # pattern rule where we don't need one. That is ok -- this is a - # normal compilation that the losing compiler can handle. If no - # '.c' file was seen then we are probably linking. That is also - # ok. - exec "$@" -fi - -# Name of file we expect compiler to create. -cofile=`echo "$cfile" | sed 's|^.*[\\/]||; s|^[a-zA-Z]:||; s/\.c$/.o/'` - -# Create the lock directory. -# Note: use '[/\\:.-]' here to ensure that we don't use the same name -# that we are using for the .o file. Also, base the name on the expected -# object file name, since that is what matters with a parallel build. -lockdir=`echo "$cofile" | sed -e 's|[/\\:.-]|_|g'`.d -while true; do - if mkdir "$lockdir" >/dev/null 2>&1; then - break - fi - sleep 1 -done -# FIXME: race condition here if user kills between mkdir and trap. -trap "rmdir '$lockdir'; exit 1" 1 2 15 - -# Run the compile. -"$@" -ret=$? 
- -if test -f "$cofile"; then - test "$cofile" = "$ofile" || mv "$cofile" "$ofile" -elif test -f "${cofile}bj"; then - test "${cofile}bj" = "$ofile" || mv "${cofile}bj" "$ofile" -fi - -rmdir "$lockdir" -exit $ret - -# Local Variables: -# mode: shell-script -# sh-indentation: 2 -# eval: (add-hook 'before-save-hook 'time-stamp) -# time-stamp-start: "scriptversion=" -# time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-time-zone: "UTC0" -# time-stamp-end: "; # UTC" -# End: diff --git a/pcre2/config-cmake.h.in b/pcre2/config-cmake.h.in deleted file mode 100644 index 7766dd74c..000000000 --- a/pcre2/config-cmake.h.in +++ /dev/null @@ -1,58 +0,0 @@ -/* config.h for CMake builds */ - -#cmakedefine HAVE_ATTRIBUTE_UNINITIALIZED 1 -#cmakedefine HAVE_DIRENT_H 1 -#cmakedefine HAVE_INTTYPES_H 1 -#cmakedefine HAVE_STDINT_H 1 -#cmakedefine HAVE_STRERROR 1 -#cmakedefine HAVE_SYS_STAT_H 1 -#cmakedefine HAVE_SYS_TYPES_H 1 -#cmakedefine HAVE_UNISTD_H 1 -#cmakedefine HAVE_WINDOWS_H 1 - -#cmakedefine HAVE_BCOPY 1 -#cmakedefine HAVE_MEMFD_CREATE 1 -#cmakedefine HAVE_MEMMOVE 1 -#cmakedefine HAVE_SECURE_GETENV 1 -#cmakedefine HAVE_STRERROR 1 - -#cmakedefine PCRE2_STATIC 1 - -#cmakedefine SUPPORT_PCRE2_8 1 -#cmakedefine SUPPORT_PCRE2_16 1 -#cmakedefine SUPPORT_PCRE2_32 1 -#cmakedefine PCRE2_DEBUG 1 -#cmakedefine DISABLE_PERCENT_ZT 1 - -#cmakedefine SUPPORT_LIBBZ2 1 -#cmakedefine SUPPORT_LIBEDIT 1 -#cmakedefine SUPPORT_LIBREADLINE 1 -#cmakedefine SUPPORT_LIBZ 1 - -#cmakedefine SUPPORT_JIT 1 -#cmakedefine SLJIT_PROT_EXECUTABLE_ALLOCATOR 1 -#cmakedefine SUPPORT_PCRE2GREP_JIT 1 -#cmakedefine SUPPORT_PCRE2GREP_CALLOUT 1 -#cmakedefine SUPPORT_PCRE2GREP_CALLOUT_FORK 1 -#cmakedefine SUPPORT_UNICODE 1 -#cmakedefine SUPPORT_VALGRIND 1 - -#cmakedefine BSR_ANYCRLF 1 -#cmakedefine EBCDIC 1 -#cmakedefine EBCDIC_NL25 1 -#cmakedefine HEAP_MATCH_RECURSE 1 -#cmakedefine NEVER_BACKSLASH_C 1 - -#define LINK_SIZE @PCRE2_LINK_SIZE@ -#define HEAP_LIMIT @PCRE2_HEAP_LIMIT@ -#define MATCH_LIMIT 
@PCRE2_MATCH_LIMIT@ -#define MATCH_LIMIT_DEPTH @PCRE2_MATCH_LIMIT_DEPTH@ -#define NEWLINE_DEFAULT @NEWLINE_DEFAULT@ -#define PARENS_NEST_LIMIT @PCRE2_PARENS_NEST_LIMIT@ -#define PCRE2GREP_BUFSIZE @PCRE2GREP_BUFSIZE@ -#define PCRE2GREP_MAX_BUFSIZE @PCRE2GREP_MAX_BUFSIZE@ - -#define MAX_NAME_SIZE 32 -#define MAX_NAME_COUNT 10000 - -/* end config.h for CMake builds */ diff --git a/pcre2/config.guess b/pcre2/config.guess deleted file mode 100755 index 45001cfec..000000000 --- a/pcre2/config.guess +++ /dev/null @@ -1,1667 +0,0 @@ -#! /bin/sh -# Attempt to guess a canonical system name. -# Copyright 1992-2020 Free Software Foundation, Inc. - -timestamp='2020-01-01' - -# This file is free software; you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see . -# -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that -# program. This Exception is an additional permission under section 7 -# of the GNU General Public License, version 3 ("GPLv3"). -# -# Originally written by Per Bothner; maintained since 2000 by Ben Elliston. -# -# You can get the latest version of this script from: -# https://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess -# -# Please send patches to . 
- - -me=`echo "$0" | sed -e 's,.*/,,'` - -usage="\ -Usage: $0 [OPTION] - -Output the configuration name of the system \`$me' is run on. - -Options: - -h, --help print this help, then exit - -t, --time-stamp print date of last modification, then exit - -v, --version print version number, then exit - -Report bugs and patches to ." - -version="\ -GNU config.guess ($timestamp) - -Originally written by Per Bothner. -Copyright 1992-2020 Free Software Foundation, Inc. - -This is free software; see the source for copying conditions. There is NO -warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." - -help=" -Try \`$me --help' for more information." - -# Parse command line -while test $# -gt 0 ; do - case $1 in - --time-stamp | --time* | -t ) - echo "$timestamp" ; exit ;; - --version | -v ) - echo "$version" ; exit ;; - --help | --h* | -h ) - echo "$usage"; exit ;; - -- ) # Stop option processing - shift; break ;; - - ) # Use stdin as input. - break ;; - -* ) - echo "$me: invalid option $1$help" >&2 - exit 1 ;; - * ) - break ;; - esac -done - -if test $# != 0; then - echo "$me: too many arguments$help" >&2 - exit 1 -fi - -# CC_FOR_BUILD -- compiler used by this script. Note that the use of a -# compiler to aid in system detection is discouraged as it requires -# temporary files to be created and, as you can see below, it is a -# headache to deal with in a portable fashion. - -# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still -# use `HOST_CC' if defined, but it is deprecated. - -# Portable tmp directory creation inspired by the Autoconf team. 
- -tmp= -# shellcheck disable=SC2172 -trap 'test -z "$tmp" || rm -fr "$tmp"' 0 1 2 13 15 - -set_cc_for_build() { - # prevent multiple calls if $tmp is already set - test "$tmp" && return 0 - : "${TMPDIR=/tmp}" - # shellcheck disable=SC2039 - { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } || - { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir "$tmp" 2>/dev/null) ; } || - { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir "$tmp" 2>/dev/null) && echo "Warning: creating insecure temp directory" >&2 ; } || - { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } - dummy=$tmp/dummy - case ${CC_FOR_BUILD-},${HOST_CC-},${CC-} in - ,,) echo "int x;" > "$dummy.c" - for driver in cc gcc c89 c99 ; do - if ($driver -c -o "$dummy.o" "$dummy.c") >/dev/null 2>&1 ; then - CC_FOR_BUILD="$driver" - break - fi - done - if test x"$CC_FOR_BUILD" = x ; then - CC_FOR_BUILD=no_compiler_found - fi - ;; - ,,*) CC_FOR_BUILD=$CC ;; - ,*,*) CC_FOR_BUILD=$HOST_CC ;; - esac -} - -# This is needed to find uname on a Pyramid OSx when run in the BSD universe. -# (ghazi@noc.rutgers.edu 1994-08-24) -if test -f /.attbin/uname ; then - PATH=$PATH:/.attbin ; export PATH -fi - -UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown -UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown -UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown -UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown - -case "$UNAME_SYSTEM" in -Linux|GNU|GNU/*) - # If the system lacks a compiler, then just pick glibc. - # We could probably try harder. - LIBC=gnu - - set_cc_for_build - cat <<-EOF > "$dummy.c" - #include - #if defined(__UCLIBC__) - LIBC=uclibc - #elif defined(__dietlibc__) - LIBC=dietlibc - #else - LIBC=gnu - #endif - EOF - eval "`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^LIBC' | sed 's, ,,g'`" - - # If ldd exists, use it to detect musl libc. 
- if command -v ldd >/dev/null && \ - ldd --version 2>&1 | grep -q ^musl - then - LIBC=musl - fi - ;; -esac - -# Note: order is significant - the case branches are not exclusive. - -case "$UNAME_MACHINE:$UNAME_SYSTEM:$UNAME_RELEASE:$UNAME_VERSION" in - *:NetBSD:*:*) - # NetBSD (nbsd) targets should (where applicable) match one or - # more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*, - # *-*-netbsdecoff* and *-*-netbsd*. For targets that recently - # switched to ELF, *-*-netbsd* would select the old - # object file format. This provides both forward - # compatibility and a consistent mechanism for selecting the - # object file format. - # - # Note: NetBSD doesn't particularly care about the vendor - # portion of the name. We always set it to "unknown". - sysctl="sysctl -n hw.machine_arch" - UNAME_MACHINE_ARCH=`(uname -p 2>/dev/null || \ - "/sbin/$sysctl" 2>/dev/null || \ - "/usr/sbin/$sysctl" 2>/dev/null || \ - echo unknown)` - case "$UNAME_MACHINE_ARCH" in - armeb) machine=armeb-unknown ;; - arm*) machine=arm-unknown ;; - sh3el) machine=shl-unknown ;; - sh3eb) machine=sh-unknown ;; - sh5el) machine=sh5le-unknown ;; - earmv*) - arch=`echo "$UNAME_MACHINE_ARCH" | sed -e 's,^e\(armv[0-9]\).*$,\1,'` - endian=`echo "$UNAME_MACHINE_ARCH" | sed -ne 's,^.*\(eb\)$,\1,p'` - machine="${arch}${endian}"-unknown - ;; - *) machine="$UNAME_MACHINE_ARCH"-unknown ;; - esac - # The Operating System including object format, if it has switched - # to ELF recently (or will in the future) and ABI. - case "$UNAME_MACHINE_ARCH" in - earm*) - os=netbsdelf - ;; - arm*|i386|m68k|ns32k|sh3*|sparc|vax) - set_cc_for_build - if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ - | grep -q __ELF__ - then - # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout). - # Return netbsd for either. FIX? - os=netbsd - else - os=netbsdelf - fi - ;; - *) - os=netbsd - ;; - esac - # Determine ABI tags. 
- case "$UNAME_MACHINE_ARCH" in - earm*) - expr='s/^earmv[0-9]/-eabi/;s/eb$//' - abi=`echo "$UNAME_MACHINE_ARCH" | sed -e "$expr"` - ;; - esac - # The OS release - # Debian GNU/NetBSD machines have a different userland, and - # thus, need a distinct triplet. However, they do not need - # kernel version information, so it can be replaced with a - # suitable tag, in the style of linux-gnu. - case "$UNAME_VERSION" in - Debian*) - release='-gnu' - ;; - *) - release=`echo "$UNAME_RELEASE" | sed -e 's/[-_].*//' | cut -d. -f1,2` - ;; - esac - # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: - # contains redundant information, the shorter form: - # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. - echo "$machine-${os}${release}${abi-}" - exit ;; - *:Bitrig:*:*) - UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'` - echo "$UNAME_MACHINE_ARCH"-unknown-bitrig"$UNAME_RELEASE" - exit ;; - *:OpenBSD:*:*) - UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'` - echo "$UNAME_MACHINE_ARCH"-unknown-openbsd"$UNAME_RELEASE" - exit ;; - *:LibertyBSD:*:*) - UNAME_MACHINE_ARCH=`arch | sed 's/^.*BSD\.//'` - echo "$UNAME_MACHINE_ARCH"-unknown-libertybsd"$UNAME_RELEASE" - exit ;; - *:MidnightBSD:*:*) - echo "$UNAME_MACHINE"-unknown-midnightbsd"$UNAME_RELEASE" - exit ;; - *:ekkoBSD:*:*) - echo "$UNAME_MACHINE"-unknown-ekkobsd"$UNAME_RELEASE" - exit ;; - *:SolidBSD:*:*) - echo "$UNAME_MACHINE"-unknown-solidbsd"$UNAME_RELEASE" - exit ;; - *:OS108:*:*) - echo "$UNAME_MACHINE"-unknown-os108_"$UNAME_RELEASE" - exit ;; - macppc:MirBSD:*:*) - echo powerpc-unknown-mirbsd"$UNAME_RELEASE" - exit ;; - *:MirBSD:*:*) - echo "$UNAME_MACHINE"-unknown-mirbsd"$UNAME_RELEASE" - exit ;; - *:Sortix:*:*) - echo "$UNAME_MACHINE"-unknown-sortix - exit ;; - *:Twizzler:*:*) - echo "$UNAME_MACHINE"-unknown-twizzler - exit ;; - *:Redox:*:*) - echo "$UNAME_MACHINE"-unknown-redox - exit ;; - mips:OSF1:*.*) - echo mips-dec-osf1 - exit ;; - alpha:OSF1:*:*) - case $UNAME_RELEASE in - *4.0) - UNAME_RELEASE=`/usr/sbin/sizer -v | 
awk '{print $3}'` - ;; - *5.*) - UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'` - ;; - esac - # According to Compaq, /usr/sbin/psrinfo has been available on - # OSF/1 and Tru64 systems produced since 1995. I hope that - # covers most systems running today. This code pipes the CPU - # types through head -n 1, so we only detect the type of CPU 0. - ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1` - case "$ALPHA_CPU_TYPE" in - "EV4 (21064)") - UNAME_MACHINE=alpha ;; - "EV4.5 (21064)") - UNAME_MACHINE=alpha ;; - "LCA4 (21066/21068)") - UNAME_MACHINE=alpha ;; - "EV5 (21164)") - UNAME_MACHINE=alphaev5 ;; - "EV5.6 (21164A)") - UNAME_MACHINE=alphaev56 ;; - "EV5.6 (21164PC)") - UNAME_MACHINE=alphapca56 ;; - "EV5.7 (21164PC)") - UNAME_MACHINE=alphapca57 ;; - "EV6 (21264)") - UNAME_MACHINE=alphaev6 ;; - "EV6.7 (21264A)") - UNAME_MACHINE=alphaev67 ;; - "EV6.8CB (21264C)") - UNAME_MACHINE=alphaev68 ;; - "EV6.8AL (21264B)") - UNAME_MACHINE=alphaev68 ;; - "EV6.8CX (21264D)") - UNAME_MACHINE=alphaev68 ;; - "EV6.9A (21264/EV69A)") - UNAME_MACHINE=alphaev69 ;; - "EV7 (21364)") - UNAME_MACHINE=alphaev7 ;; - "EV7.9 (21364A)") - UNAME_MACHINE=alphaev79 ;; - esac - # A Pn.n version is a patched version. - # A Vn.n version is a released version. - # A Tn.n version is a released field test version. - # A Xn.n version is an unreleased experimental baselevel. - # 1.2 uses "1.2" for uname -r. - echo "$UNAME_MACHINE"-dec-osf"`echo "$UNAME_RELEASE" | sed -e 's/^[PVTX]//' | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz`" - # Reset EXIT trap before exiting to avoid spurious non-zero exit code. - exitcode=$? 
- trap '' 0 - exit $exitcode ;; - Amiga*:UNIX_System_V:4.0:*) - echo m68k-unknown-sysv4 - exit ;; - *:[Aa]miga[Oo][Ss]:*:*) - echo "$UNAME_MACHINE"-unknown-amigaos - exit ;; - *:[Mm]orph[Oo][Ss]:*:*) - echo "$UNAME_MACHINE"-unknown-morphos - exit ;; - *:OS/390:*:*) - echo i370-ibm-openedition - exit ;; - *:z/VM:*:*) - echo s390-ibm-zvmoe - exit ;; - *:OS400:*:*) - echo powerpc-ibm-os400 - exit ;; - arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) - echo arm-acorn-riscix"$UNAME_RELEASE" - exit ;; - arm*:riscos:*:*|arm*:RISCOS:*:*) - echo arm-unknown-riscos - exit ;; - SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*) - echo hppa1.1-hitachi-hiuxmpp - exit ;; - Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*) - # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE. - if test "`(/bin/universe) 2>/dev/null`" = att ; then - echo pyramid-pyramid-sysv3 - else - echo pyramid-pyramid-bsd - fi - exit ;; - NILE*:*:*:dcosx) - echo pyramid-pyramid-svr4 - exit ;; - DRS?6000:unix:4.0:6*) - echo sparc-icl-nx6 - exit ;; - DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*) - case `/usr/bin/uname -p` in - sparc) echo sparc-icl-nx7; exit ;; - esac ;; - s390x:SunOS:*:*) - echo "$UNAME_MACHINE"-ibm-solaris2"`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'`" - exit ;; - sun4H:SunOS:5.*:*) - echo sparc-hal-solaris2"`echo "$UNAME_RELEASE"|sed -e 's/[^.]*//'`" - exit ;; - sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) - echo sparc-sun-solaris2"`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'`" - exit ;; - i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*) - echo i386-pc-auroraux"$UNAME_RELEASE" - exit ;; - i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*) - set_cc_for_build - SUN_ARCH=i386 - # If there is a compiler, see if it is configured for 64-bit objects. - # Note that the Sun cc does not turn __LP64__ into 1 like gcc does. - # This test works for both compilers. 
- if [ "$CC_FOR_BUILD" != no_compiler_found ]; then - if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \ - (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ - grep IS_64BIT_ARCH >/dev/null - then - SUN_ARCH=x86_64 - fi - fi - echo "$SUN_ARCH"-pc-solaris2"`echo "$UNAME_RELEASE"|sed -e 's/[^.]*//'`" - exit ;; - sun4*:SunOS:6*:*) - # According to config.sub, this is the proper way to canonicalize - # SunOS6. Hard to guess exactly what SunOS6 will be like, but - # it's likely to be more like Solaris than SunOS4. - echo sparc-sun-solaris3"`echo "$UNAME_RELEASE"|sed -e 's/[^.]*//'`" - exit ;; - sun4*:SunOS:*:*) - case "`/usr/bin/arch -k`" in - Series*|S4*) - UNAME_RELEASE=`uname -v` - ;; - esac - # Japanese Language versions have a version number like `4.1.3-JL'. - echo sparc-sun-sunos"`echo "$UNAME_RELEASE"|sed -e 's/-/_/'`" - exit ;; - sun3*:SunOS:*:*) - echo m68k-sun-sunos"$UNAME_RELEASE" - exit ;; - sun*:*:4.2BSD:*) - UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` - test "x$UNAME_RELEASE" = x && UNAME_RELEASE=3 - case "`/bin/arch`" in - sun3) - echo m68k-sun-sunos"$UNAME_RELEASE" - ;; - sun4) - echo sparc-sun-sunos"$UNAME_RELEASE" - ;; - esac - exit ;; - aushp:SunOS:*:*) - echo sparc-auspex-sunos"$UNAME_RELEASE" - exit ;; - # The situation for MiNT is a little confusing. The machine name - # can be virtually everything (everything which is not - # "atarist" or "atariste" at least should have a processor - # > m68000). The system name ranges from "MiNT" over "FreeMiNT" - # to the lowercase version "mint" (or "freemint"). Finally - # the system name "TOS" denotes a system which is actually not - # MiNT. But MiNT is downward compatible to TOS, so this should - # be no problem. 
- atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) - echo m68k-atari-mint"$UNAME_RELEASE" - exit ;; - atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) - echo m68k-atari-mint"$UNAME_RELEASE" - exit ;; - *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) - echo m68k-atari-mint"$UNAME_RELEASE" - exit ;; - milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) - echo m68k-milan-mint"$UNAME_RELEASE" - exit ;; - hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) - echo m68k-hades-mint"$UNAME_RELEASE" - exit ;; - *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) - echo m68k-unknown-mint"$UNAME_RELEASE" - exit ;; - m68k:machten:*:*) - echo m68k-apple-machten"$UNAME_RELEASE" - exit ;; - powerpc:machten:*:*) - echo powerpc-apple-machten"$UNAME_RELEASE" - exit ;; - RISC*:Mach:*:*) - echo mips-dec-mach_bsd4.3 - exit ;; - RISC*:ULTRIX:*:*) - echo mips-dec-ultrix"$UNAME_RELEASE" - exit ;; - VAX*:ULTRIX*:*:*) - echo vax-dec-ultrix"$UNAME_RELEASE" - exit ;; - 2020:CLIX:*:* | 2430:CLIX:*:*) - echo clipper-intergraph-clix"$UNAME_RELEASE" - exit ;; - mips:*:*:UMIPS | mips:*:*:RISCos) - set_cc_for_build - sed 's/^ //' << EOF > "$dummy.c" -#ifdef __cplusplus -#include /* for printf() prototype */ - int main (int argc, char *argv[]) { -#else - int main (argc, argv) int argc; char *argv[]; { -#endif - #if defined (host_mips) && defined (MIPSEB) - #if defined (SYSTYPE_SYSV) - printf ("mips-mips-riscos%ssysv\\n", argv[1]); exit (0); - #endif - #if defined (SYSTYPE_SVR4) - printf ("mips-mips-riscos%ssvr4\\n", argv[1]); exit (0); - #endif - #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD) - printf ("mips-mips-riscos%sbsd\\n", argv[1]); exit (0); - #endif - #endif - exit (-1); - } -EOF - $CC_FOR_BUILD -o "$dummy" "$dummy.c" && - dummyarg=`echo "$UNAME_RELEASE" | sed -n 's/\([0-9]*\).*/\1/p'` && - SYSTEM_NAME=`"$dummy" "$dummyarg"` && - { echo "$SYSTEM_NAME"; exit; } - echo mips-mips-riscos"$UNAME_RELEASE" - exit ;; - Motorola:PowerMAX_OS:*:*) - echo 
powerpc-motorola-powermax - exit ;; - Motorola:*:4.3:PL8-*) - echo powerpc-harris-powermax - exit ;; - Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*) - echo powerpc-harris-powermax - exit ;; - Night_Hawk:Power_UNIX:*:*) - echo powerpc-harris-powerunix - exit ;; - m88k:CX/UX:7*:*) - echo m88k-harris-cxux7 - exit ;; - m88k:*:4*:R4*) - echo m88k-motorola-sysv4 - exit ;; - m88k:*:3*:R3*) - echo m88k-motorola-sysv3 - exit ;; - AViiON:dgux:*:*) - # DG/UX returns AViiON for all architectures - UNAME_PROCESSOR=`/usr/bin/uname -p` - if [ "$UNAME_PROCESSOR" = mc88100 ] || [ "$UNAME_PROCESSOR" = mc88110 ] - then - if [ "$TARGET_BINARY_INTERFACE"x = m88kdguxelfx ] || \ - [ "$TARGET_BINARY_INTERFACE"x = x ] - then - echo m88k-dg-dgux"$UNAME_RELEASE" - else - echo m88k-dg-dguxbcs"$UNAME_RELEASE" - fi - else - echo i586-dg-dgux"$UNAME_RELEASE" - fi - exit ;; - M88*:DolphinOS:*:*) # DolphinOS (SVR3) - echo m88k-dolphin-sysv3 - exit ;; - M88*:*:R3*:*) - # Delta 88k system running SVR3 - echo m88k-motorola-sysv3 - exit ;; - XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3) - echo m88k-tektronix-sysv3 - exit ;; - Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD) - echo m68k-tektronix-bsd - exit ;; - *:IRIX*:*:*) - echo mips-sgi-irix"`echo "$UNAME_RELEASE"|sed -e 's/-/_/g'`" - exit ;; - ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX. 
- echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id - exit ;; # Note that: echo "'`uname -s`'" gives 'AIX ' - i*86:AIX:*:*) - echo i386-ibm-aix - exit ;; - ia64:AIX:*:*) - if [ -x /usr/bin/oslevel ] ; then - IBM_REV=`/usr/bin/oslevel` - else - IBM_REV="$UNAME_VERSION.$UNAME_RELEASE" - fi - echo "$UNAME_MACHINE"-ibm-aix"$IBM_REV" - exit ;; - *:AIX:2:3) - if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then - set_cc_for_build - sed 's/^ //' << EOF > "$dummy.c" - #include - - main() - { - if (!__power_pc()) - exit(1); - puts("powerpc-ibm-aix3.2.5"); - exit(0); - } -EOF - if $CC_FOR_BUILD -o "$dummy" "$dummy.c" && SYSTEM_NAME=`"$dummy"` - then - echo "$SYSTEM_NAME" - else - echo rs6000-ibm-aix3.2.5 - fi - elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then - echo rs6000-ibm-aix3.2.4 - else - echo rs6000-ibm-aix3.2 - fi - exit ;; - *:AIX:*:[4567]) - IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` - if /usr/sbin/lsattr -El "$IBM_CPU_ID" | grep ' POWER' >/dev/null 2>&1; then - IBM_ARCH=rs6000 - else - IBM_ARCH=powerpc - fi - if [ -x /usr/bin/lslpp ] ; then - IBM_REV=`/usr/bin/lslpp -Lqc bos.rte.libc | - awk -F: '{ print $3 }' | sed s/[0-9]*$/0/` - else - IBM_REV="$UNAME_VERSION.$UNAME_RELEASE" - fi - echo "$IBM_ARCH"-ibm-aix"$IBM_REV" - exit ;; - *:AIX:*:*) - echo rs6000-ibm-aix - exit ;; - ibmrt:4.4BSD:*|romp-ibm:4.4BSD:*) - echo romp-ibm-bsd4.4 - exit ;; - ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and - echo romp-ibm-bsd"$UNAME_RELEASE" # 4.3 with uname added to - exit ;; # report: romp-ibm BSD 4.3 - *:BOSX:*:*) - echo rs6000-bull-bosx - exit ;; - DPX/2?00:B.O.S.:*:*) - echo m68k-bull-sysv3 - exit ;; - 9000/[34]??:4.3bsd:1.*:*) - echo m68k-hp-bsd - exit ;; - hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*) - echo m68k-hp-bsd4.4 - exit ;; - 9000/[34678]??:HP-UX:*:*) - HPUX_REV=`echo "$UNAME_RELEASE"|sed -e 's/[^.]*.[0B]*//'` - case "$UNAME_MACHINE" in - 9000/31?) HP_ARCH=m68000 ;; - 9000/[34]??) 
HP_ARCH=m68k ;; - 9000/[678][0-9][0-9]) - if [ -x /usr/bin/getconf ]; then - sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` - sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` - case "$sc_cpu_version" in - 523) HP_ARCH=hppa1.0 ;; # CPU_PA_RISC1_0 - 528) HP_ARCH=hppa1.1 ;; # CPU_PA_RISC1_1 - 532) # CPU_PA_RISC2_0 - case "$sc_kernel_bits" in - 32) HP_ARCH=hppa2.0n ;; - 64) HP_ARCH=hppa2.0w ;; - '') HP_ARCH=hppa2.0 ;; # HP-UX 10.20 - esac ;; - esac - fi - if [ "$HP_ARCH" = "" ]; then - set_cc_for_build - sed 's/^ //' << EOF > "$dummy.c" - - #define _HPUX_SOURCE - #include - #include - - int main () - { - #if defined(_SC_KERNEL_BITS) - long bits = sysconf(_SC_KERNEL_BITS); - #endif - long cpu = sysconf (_SC_CPU_VERSION); - - switch (cpu) - { - case CPU_PA_RISC1_0: puts ("hppa1.0"); break; - case CPU_PA_RISC1_1: puts ("hppa1.1"); break; - case CPU_PA_RISC2_0: - #if defined(_SC_KERNEL_BITS) - switch (bits) - { - case 64: puts ("hppa2.0w"); break; - case 32: puts ("hppa2.0n"); break; - default: puts ("hppa2.0"); break; - } break; - #else /* !defined(_SC_KERNEL_BITS) */ - puts ("hppa2.0"); break; - #endif - default: puts ("hppa1.0"); break; - } - exit (0); - } -EOF - (CCOPTS="" $CC_FOR_BUILD -o "$dummy" "$dummy.c" 2>/dev/null) && HP_ARCH=`"$dummy"` - test -z "$HP_ARCH" && HP_ARCH=hppa - fi ;; - esac - if [ "$HP_ARCH" = hppa2.0w ] - then - set_cc_for_build - - # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating - # 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler - # generating 64-bit code. 
GNU and HP use different nomenclature: - # - # $ CC_FOR_BUILD=cc ./config.guess - # => hppa2.0w-hp-hpux11.23 - # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess - # => hppa64-hp-hpux11.23 - - if echo __LP64__ | (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | - grep -q __LP64__ - then - HP_ARCH=hppa2.0w - else - HP_ARCH=hppa64 - fi - fi - echo "$HP_ARCH"-hp-hpux"$HPUX_REV" - exit ;; - ia64:HP-UX:*:*) - HPUX_REV=`echo "$UNAME_RELEASE"|sed -e 's/[^.]*.[0B]*//'` - echo ia64-hp-hpux"$HPUX_REV" - exit ;; - 3050*:HI-UX:*:*) - set_cc_for_build - sed 's/^ //' << EOF > "$dummy.c" - #include - int - main () - { - long cpu = sysconf (_SC_CPU_VERSION); - /* The order matters, because CPU_IS_HP_MC68K erroneously returns - true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct - results, however. */ - if (CPU_IS_PA_RISC (cpu)) - { - switch (cpu) - { - case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break; - case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break; - case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break; - default: puts ("hppa-hitachi-hiuxwe2"); break; - } - } - else if (CPU_IS_HP_MC68K (cpu)) - puts ("m68k-hitachi-hiuxwe2"); - else puts ("unknown-hitachi-hiuxwe2"); - exit (0); - } -EOF - $CC_FOR_BUILD -o "$dummy" "$dummy.c" && SYSTEM_NAME=`"$dummy"` && - { echo "$SYSTEM_NAME"; exit; } - echo unknown-hitachi-hiuxwe2 - exit ;; - 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:*) - echo hppa1.1-hp-bsd - exit ;; - 9000/8??:4.3bsd:*:*) - echo hppa1.0-hp-bsd - exit ;; - *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*) - echo hppa1.0-hp-mpeix - exit ;; - hp7??:OSF1:*:* | hp8?[79]:OSF1:*:*) - echo hppa1.1-hp-osf - exit ;; - hp8??:OSF1:*:*) - echo hppa1.0-hp-osf - exit ;; - i*86:OSF1:*:*) - if [ -x /usr/sbin/sysversion ] ; then - echo "$UNAME_MACHINE"-unknown-osf1mk - else - echo "$UNAME_MACHINE"-unknown-osf1 - fi - exit ;; - parisc*:Lites*:*:*) - echo hppa1.1-hp-lites - exit ;; - C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*) - echo c1-convex-bsd - exit ;; - C2*:ConvexOS:*:* | 
convex:ConvexOS:C2*:*) - if getsysinfo -f scalar_acc - then echo c32-convex-bsd - else echo c2-convex-bsd - fi - exit ;; - C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*) - echo c34-convex-bsd - exit ;; - C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*) - echo c38-convex-bsd - exit ;; - C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*) - echo c4-convex-bsd - exit ;; - CRAY*Y-MP:*:*:*) - echo ymp-cray-unicos"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/' - exit ;; - CRAY*[A-Z]90:*:*:*) - echo "$UNAME_MACHINE"-cray-unicos"$UNAME_RELEASE" \ - | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \ - -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \ - -e 's/\.[^.]*$/.X/' - exit ;; - CRAY*TS:*:*:*) - echo t90-cray-unicos"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/' - exit ;; - CRAY*T3E:*:*:*) - echo alphaev5-cray-unicosmk"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/' - exit ;; - CRAY*SV1:*:*:*) - echo sv1-cray-unicos"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/' - exit ;; - *:UNICOS/mp:*:*) - echo craynv-cray-unicosmp"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/' - exit ;; - F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) - FUJITSU_PROC=`uname -m | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz` - FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'` - FUJITSU_REL=`echo "$UNAME_RELEASE" | sed -e 's/ /_/'` - echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" - exit ;; - 5000:UNIX_System_V:4.*:*) - FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'` - FUJITSU_REL=`echo "$UNAME_RELEASE" | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/ /_/'` - echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" - exit ;; - i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) - echo "$UNAME_MACHINE"-pc-bsdi"$UNAME_RELEASE" - exit ;; - sparc*:BSD/OS:*:*) - echo sparc-unknown-bsdi"$UNAME_RELEASE" - exit ;; - *:BSD/OS:*:*) - echo "$UNAME_MACHINE"-unknown-bsdi"$UNAME_RELEASE" - exit ;; - 
arm:FreeBSD:*:*) - UNAME_PROCESSOR=`uname -p` - set_cc_for_build - if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \ - | grep -q __ARM_PCS_VFP - then - echo "${UNAME_PROCESSOR}"-unknown-freebsd"`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`"-gnueabi - else - echo "${UNAME_PROCESSOR}"-unknown-freebsd"`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`"-gnueabihf - fi - exit ;; - *:FreeBSD:*:*) - UNAME_PROCESSOR=`/usr/bin/uname -p` - case "$UNAME_PROCESSOR" in - amd64) - UNAME_PROCESSOR=x86_64 ;; - i386) - UNAME_PROCESSOR=i586 ;; - esac - echo "$UNAME_PROCESSOR"-unknown-freebsd"`echo "$UNAME_RELEASE"|sed -e 's/[-(].*//'`" - exit ;; - i*:CYGWIN*:*) - echo "$UNAME_MACHINE"-pc-cygwin - exit ;; - *:MINGW64*:*) - echo "$UNAME_MACHINE"-pc-mingw64 - exit ;; - *:MINGW*:*) - echo "$UNAME_MACHINE"-pc-mingw32 - exit ;; - *:MSYS*:*) - echo "$UNAME_MACHINE"-pc-msys - exit ;; - i*:PW*:*) - echo "$UNAME_MACHINE"-pc-pw32 - exit ;; - *:Interix*:*) - case "$UNAME_MACHINE" in - x86) - echo i586-pc-interix"$UNAME_RELEASE" - exit ;; - authenticamd | genuineintel | EM64T) - echo x86_64-unknown-interix"$UNAME_RELEASE" - exit ;; - IA64) - echo ia64-unknown-interix"$UNAME_RELEASE" - exit ;; - esac ;; - i*:UWIN*:*) - echo "$UNAME_MACHINE"-pc-uwin - exit ;; - amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*) - echo x86_64-pc-cygwin - exit ;; - prep*:SunOS:5.*:*) - echo powerpcle-unknown-solaris2"`echo "$UNAME_RELEASE"|sed -e 's/[^.]*//'`" - exit ;; - *:GNU:*:*) - # the GNU system - echo "`echo "$UNAME_MACHINE"|sed -e 's,[-/].*$,,'`-unknown-$LIBC`echo "$UNAME_RELEASE"|sed -e 's,/.*$,,'`" - exit ;; - *:GNU/*:*:*) - # other systems with GNU libc and userland - echo "$UNAME_MACHINE-unknown-`echo "$UNAME_SYSTEM" | sed 's,^[^/]*/,,' | tr "[:upper:]" "[:lower:]"``echo "$UNAME_RELEASE"|sed -e 's/[-(].*//'`-$LIBC" - exit ;; - *:Minix:*:*) - echo "$UNAME_MACHINE"-unknown-minix - exit ;; - aarch64:Linux:*:*) - echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" - exit ;; - aarch64_be:Linux:*:*) - UNAME_MACHINE=aarch64_be - 
echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" - exit ;; - alpha:Linux:*:*) - case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' /proc/cpuinfo 2>/dev/null` in - EV5) UNAME_MACHINE=alphaev5 ;; - EV56) UNAME_MACHINE=alphaev56 ;; - PCA56) UNAME_MACHINE=alphapca56 ;; - PCA57) UNAME_MACHINE=alphapca56 ;; - EV6) UNAME_MACHINE=alphaev6 ;; - EV67) UNAME_MACHINE=alphaev67 ;; - EV68*) UNAME_MACHINE=alphaev68 ;; - esac - objdump --private-headers /bin/sh | grep -q ld.so.1 - if test "$?" = 0 ; then LIBC=gnulibc1 ; fi - echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" - exit ;; - arc:Linux:*:* | arceb:Linux:*:*) - echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" - exit ;; - arm*:Linux:*:*) - set_cc_for_build - if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \ - | grep -q __ARM_EABI__ - then - echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" - else - if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \ - | grep -q __ARM_PCS_VFP - then - echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"eabi - else - echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"eabihf - fi - fi - exit ;; - avr32*:Linux:*:*) - echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" - exit ;; - cris:Linux:*:*) - echo "$UNAME_MACHINE"-axis-linux-"$LIBC" - exit ;; - crisv32:Linux:*:*) - echo "$UNAME_MACHINE"-axis-linux-"$LIBC" - exit ;; - e2k:Linux:*:*) - echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" - exit ;; - frv:Linux:*:*) - echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" - exit ;; - hexagon:Linux:*:*) - echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" - exit ;; - i*86:Linux:*:*) - echo "$UNAME_MACHINE"-pc-linux-"$LIBC" - exit ;; - ia64:Linux:*:*) - echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" - exit ;; - k1om:Linux:*:*) - echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" - exit ;; - m32r*:Linux:*:*) - echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" - exit ;; - m68*:Linux:*:*) - echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" - exit ;; - mips:Linux:*:* | mips64:Linux:*:*) - set_cc_for_build - IS_GLIBC=0 - test x"${LIBC}" = xgnu && IS_GLIBC=1 - sed 's/^ //' << EOF > 
"$dummy.c" - #undef CPU - #undef mips - #undef mipsel - #undef mips64 - #undef mips64el - #if ${IS_GLIBC} && defined(_ABI64) - LIBCABI=gnuabi64 - #else - #if ${IS_GLIBC} && defined(_ABIN32) - LIBCABI=gnuabin32 - #else - LIBCABI=${LIBC} - #endif - #endif - - #if ${IS_GLIBC} && defined(__mips64) && defined(__mips_isa_rev) && __mips_isa_rev>=6 - CPU=mipsisa64r6 - #else - #if ${IS_GLIBC} && !defined(__mips64) && defined(__mips_isa_rev) && __mips_isa_rev>=6 - CPU=mipsisa32r6 - #else - #if defined(__mips64) - CPU=mips64 - #else - CPU=mips - #endif - #endif - #endif - - #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) - MIPS_ENDIAN=el - #else - #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) - MIPS_ENDIAN= - #else - MIPS_ENDIAN= - #endif - #endif -EOF - eval "`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^CPU\|^MIPS_ENDIAN\|^LIBCABI'`" - test "x$CPU" != x && { echo "$CPU${MIPS_ENDIAN}-unknown-linux-$LIBCABI"; exit; } - ;; - mips64el:Linux:*:*) - echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" - exit ;; - openrisc*:Linux:*:*) - echo or1k-unknown-linux-"$LIBC" - exit ;; - or32:Linux:*:* | or1k*:Linux:*:*) - echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" - exit ;; - padre:Linux:*:*) - echo sparc-unknown-linux-"$LIBC" - exit ;; - parisc64:Linux:*:* | hppa64:Linux:*:*) - echo hppa64-unknown-linux-"$LIBC" - exit ;; - parisc:Linux:*:* | hppa:Linux:*:*) - # Look for CPU level - case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in - PA7*) echo hppa1.1-unknown-linux-"$LIBC" ;; - PA8*) echo hppa2.0-unknown-linux-"$LIBC" ;; - *) echo hppa-unknown-linux-"$LIBC" ;; - esac - exit ;; - ppc64:Linux:*:*) - echo powerpc64-unknown-linux-"$LIBC" - exit ;; - ppc:Linux:*:*) - echo powerpc-unknown-linux-"$LIBC" - exit ;; - ppc64le:Linux:*:*) - echo powerpc64le-unknown-linux-"$LIBC" - exit ;; - ppcle:Linux:*:*) - echo powerpcle-unknown-linux-"$LIBC" - exit ;; - riscv32:Linux:*:* | riscv64:Linux:*:*) - echo 
"$UNAME_MACHINE"-unknown-linux-"$LIBC" - exit ;; - s390:Linux:*:* | s390x:Linux:*:*) - echo "$UNAME_MACHINE"-ibm-linux-"$LIBC" - exit ;; - sh64*:Linux:*:*) - echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" - exit ;; - sh*:Linux:*:*) - echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" - exit ;; - sparc:Linux:*:* | sparc64:Linux:*:*) - echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" - exit ;; - tile*:Linux:*:*) - echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" - exit ;; - vax:Linux:*:*) - echo "$UNAME_MACHINE"-dec-linux-"$LIBC" - exit ;; - x86_64:Linux:*:*) - echo "$UNAME_MACHINE"-pc-linux-"$LIBC" - exit ;; - xtensa*:Linux:*:*) - echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" - exit ;; - i*86:DYNIX/ptx:4*:*) - # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. - # earlier versions are messed up and put the nodename in both - # sysname and nodename. - echo i386-sequent-sysv4 - exit ;; - i*86:UNIX_SV:4.2MP:2.*) - # Unixware is an offshoot of SVR4, but it has its own version - # number series starting with 2... - # I am not positive that other SVR4 systems won't match this, - # I just have to hope. -- rms. - # Use sysv4.2uw... so that sysv4* matches it. - echo "$UNAME_MACHINE"-pc-sysv4.2uw"$UNAME_VERSION" - exit ;; - i*86:OS/2:*:*) - # If we were able to find `uname', then EMX Unix compatibility - # is probably installed. 
- echo "$UNAME_MACHINE"-pc-os2-emx - exit ;; - i*86:XTS-300:*:STOP) - echo "$UNAME_MACHINE"-unknown-stop - exit ;; - i*86:atheos:*:*) - echo "$UNAME_MACHINE"-unknown-atheos - exit ;; - i*86:syllable:*:*) - echo "$UNAME_MACHINE"-pc-syllable - exit ;; - i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*) - echo i386-unknown-lynxos"$UNAME_RELEASE" - exit ;; - i*86:*DOS:*:*) - echo "$UNAME_MACHINE"-pc-msdosdjgpp - exit ;; - i*86:*:4.*:*) - UNAME_REL=`echo "$UNAME_RELEASE" | sed 's/\/MP$//'` - if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then - echo "$UNAME_MACHINE"-univel-sysv"$UNAME_REL" - else - echo "$UNAME_MACHINE"-pc-sysv"$UNAME_REL" - fi - exit ;; - i*86:*:5:[678]*) - # UnixWare 7.x, OpenUNIX and OpenServer 6. - case `/bin/uname -X | grep "^Machine"` in - *486*) UNAME_MACHINE=i486 ;; - *Pentium) UNAME_MACHINE=i586 ;; - *Pent*|*Celeron) UNAME_MACHINE=i686 ;; - esac - echo "$UNAME_MACHINE-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION}" - exit ;; - i*86:*:3.2:*) - if test -f /usr/options/cb.name; then - UNAME_REL=`sed -n 's/.*Version //p' /dev/null >/dev/null ; then - UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')` - (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486 - (/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \ - && UNAME_MACHINE=i586 - (/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \ - && UNAME_MACHINE=i686 - (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \ - && UNAME_MACHINE=i686 - echo "$UNAME_MACHINE"-pc-sco"$UNAME_REL" - else - echo "$UNAME_MACHINE"-pc-sysv32 - fi - exit ;; - pc:*:*:*) - # Left here for compatibility: - # uname -m prints for DJGPP always 'pc', but it prints nothing about - # the processor, so we play safe by assuming i586. - # Note: whatever this is, it MUST be the same as what config.sub - # prints for the "djgpp" host, or else GDB configure will decide that - # this is a cross-build. 
- echo i586-pc-msdosdjgpp - exit ;; - Intel:Mach:3*:*) - echo i386-pc-mach3 - exit ;; - paragon:*:*:*) - echo i860-intel-osf1 - exit ;; - i860:*:4.*:*) # i860-SVR4 - if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then - echo i860-stardent-sysv"$UNAME_RELEASE" # Stardent Vistra i860-SVR4 - else # Add other i860-SVR4 vendors below as they are discovered. - echo i860-unknown-sysv"$UNAME_RELEASE" # Unknown i860-SVR4 - fi - exit ;; - mini*:CTIX:SYS*5:*) - # "miniframe" - echo m68010-convergent-sysv - exit ;; - mc68k:UNIX:SYSTEM5:3.51m) - echo m68k-convergent-sysv - exit ;; - M680?0:D-NIX:5.3:*) - echo m68k-diab-dnix - exit ;; - M68*:*:R3V[5678]*:*) - test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;; - 3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0) - OS_REL='' - test -r /etc/.relid \ - && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` - /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ - && { echo i486-ncr-sysv4.3"$OS_REL"; exit; } - /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ - && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } ;; - 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) - /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ - && { echo i486-ncr-sysv4; exit; } ;; - NCR*:*:4.2:* | MPRAS*:*:4.2:*) - OS_REL='.3' - test -r /etc/.relid \ - && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` - /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ - && { echo i486-ncr-sysv4.3"$OS_REL"; exit; } - /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ - && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } - /bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \ - && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } ;; - m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*) - echo m68k-unknown-lynxos"$UNAME_RELEASE" - exit ;; - mc68030:UNIX_System_V:4.*:*) - echo m68k-atari-sysv4 - exit ;; - 
TSUNAMI:LynxOS:2.*:*) - echo sparc-unknown-lynxos"$UNAME_RELEASE" - exit ;; - rs6000:LynxOS:2.*:*) - echo rs6000-unknown-lynxos"$UNAME_RELEASE" - exit ;; - PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*) - echo powerpc-unknown-lynxos"$UNAME_RELEASE" - exit ;; - SM[BE]S:UNIX_SV:*:*) - echo mips-dde-sysv"$UNAME_RELEASE" - exit ;; - RM*:ReliantUNIX-*:*:*) - echo mips-sni-sysv4 - exit ;; - RM*:SINIX-*:*:*) - echo mips-sni-sysv4 - exit ;; - *:SINIX-*:*:*) - if uname -p 2>/dev/null >/dev/null ; then - UNAME_MACHINE=`(uname -p) 2>/dev/null` - echo "$UNAME_MACHINE"-sni-sysv4 - else - echo ns32k-sni-sysv - fi - exit ;; - PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort - # says - echo i586-unisys-sysv4 - exit ;; - *:UNIX_System_V:4*:FTX*) - # From Gerald Hewes . - # How about differentiating between stratus architectures? -djm - echo hppa1.1-stratus-sysv4 - exit ;; - *:*:*:FTX*) - # From seanf@swdc.stratus.com. - echo i860-stratus-sysv4 - exit ;; - i*86:VOS:*:*) - # From Paul.Green@stratus.com. - echo "$UNAME_MACHINE"-stratus-vos - exit ;; - *:VOS:*:*) - # From Paul.Green@stratus.com. - echo hppa1.1-stratus-vos - exit ;; - mc68*:A/UX:*:*) - echo m68k-apple-aux"$UNAME_RELEASE" - exit ;; - news*:NEWS-OS:6*:*) - echo mips-sony-newsos6 - exit ;; - R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) - if [ -d /usr/nec ]; then - echo mips-nec-sysv"$UNAME_RELEASE" - else - echo mips-unknown-sysv"$UNAME_RELEASE" - fi - exit ;; - BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. - echo powerpc-be-beos - exit ;; - BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only. - echo powerpc-apple-beos - exit ;; - BePC:BeOS:*:*) # BeOS running on Intel PC compatible. - echo i586-pc-beos - exit ;; - BePC:Haiku:*:*) # Haiku running on Intel PC compatible. 
- echo i586-pc-haiku - exit ;; - x86_64:Haiku:*:*) - echo x86_64-unknown-haiku - exit ;; - SX-4:SUPER-UX:*:*) - echo sx4-nec-superux"$UNAME_RELEASE" - exit ;; - SX-5:SUPER-UX:*:*) - echo sx5-nec-superux"$UNAME_RELEASE" - exit ;; - SX-6:SUPER-UX:*:*) - echo sx6-nec-superux"$UNAME_RELEASE" - exit ;; - SX-7:SUPER-UX:*:*) - echo sx7-nec-superux"$UNAME_RELEASE" - exit ;; - SX-8:SUPER-UX:*:*) - echo sx8-nec-superux"$UNAME_RELEASE" - exit ;; - SX-8R:SUPER-UX:*:*) - echo sx8r-nec-superux"$UNAME_RELEASE" - exit ;; - SX-ACE:SUPER-UX:*:*) - echo sxace-nec-superux"$UNAME_RELEASE" - exit ;; - Power*:Rhapsody:*:*) - echo powerpc-apple-rhapsody"$UNAME_RELEASE" - exit ;; - *:Rhapsody:*:*) - echo "$UNAME_MACHINE"-apple-rhapsody"$UNAME_RELEASE" - exit ;; - *:Darwin:*:*) - UNAME_PROCESSOR=`uname -p` - case $UNAME_PROCESSOR in - unknown) UNAME_PROCESSOR=powerpc ;; - esac - if command -v xcode-select > /dev/null 2> /dev/null && \ - ! xcode-select --print-path > /dev/null 2> /dev/null ; then - # Avoid executing cc if there is no toolchain installed as - # cc will be a stub that puts up a graphical alert - # prompting the user to install developer tools. 
- CC_FOR_BUILD=no_compiler_found - else - set_cc_for_build - fi - if [ "$CC_FOR_BUILD" != no_compiler_found ]; then - if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ - (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ - grep IS_64BIT_ARCH >/dev/null - then - case $UNAME_PROCESSOR in - i386) UNAME_PROCESSOR=x86_64 ;; - powerpc) UNAME_PROCESSOR=powerpc64 ;; - esac - fi - # On 10.4-10.6 one might compile for PowerPC via gcc -arch ppc - if (echo '#ifdef __POWERPC__'; echo IS_PPC; echo '#endif') | \ - (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ - grep IS_PPC >/dev/null - then - UNAME_PROCESSOR=powerpc - fi - elif test "$UNAME_PROCESSOR" = i386 ; then - # uname -m returns i386 or x86_64 - UNAME_PROCESSOR=$UNAME_MACHINE - fi - echo "$UNAME_PROCESSOR"-apple-darwin"$UNAME_RELEASE" - exit ;; - *:procnto*:*:* | *:QNX:[0123456789]*:*) - UNAME_PROCESSOR=`uname -p` - if test "$UNAME_PROCESSOR" = x86; then - UNAME_PROCESSOR=i386 - UNAME_MACHINE=pc - fi - echo "$UNAME_PROCESSOR"-"$UNAME_MACHINE"-nto-qnx"$UNAME_RELEASE" - exit ;; - *:QNX:*:4*) - echo i386-pc-qnx - exit ;; - NEO-*:NONSTOP_KERNEL:*:*) - echo neo-tandem-nsk"$UNAME_RELEASE" - exit ;; - NSE-*:NONSTOP_KERNEL:*:*) - echo nse-tandem-nsk"$UNAME_RELEASE" - exit ;; - NSR-*:NONSTOP_KERNEL:*:*) - echo nsr-tandem-nsk"$UNAME_RELEASE" - exit ;; - NSV-*:NONSTOP_KERNEL:*:*) - echo nsv-tandem-nsk"$UNAME_RELEASE" - exit ;; - NSX-*:NONSTOP_KERNEL:*:*) - echo nsx-tandem-nsk"$UNAME_RELEASE" - exit ;; - *:NonStop-UX:*:*) - echo mips-compaq-nonstopux - exit ;; - BS2000:POSIX*:*:*) - echo bs2000-siemens-sysv - exit ;; - DS/*:UNIX_System_V:*:*) - echo "$UNAME_MACHINE"-"$UNAME_SYSTEM"-"$UNAME_RELEASE" - exit ;; - *:Plan9:*:*) - # "uname -m" is not consistent, so use $cputype instead. 386 - # is converted to i386 for consistency with other x86 - # operating systems. 
- # shellcheck disable=SC2154 - if test "$cputype" = 386; then - UNAME_MACHINE=i386 - else - UNAME_MACHINE="$cputype" - fi - echo "$UNAME_MACHINE"-unknown-plan9 - exit ;; - *:TOPS-10:*:*) - echo pdp10-unknown-tops10 - exit ;; - *:TENEX:*:*) - echo pdp10-unknown-tenex - exit ;; - KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*) - echo pdp10-dec-tops20 - exit ;; - XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*) - echo pdp10-xkl-tops20 - exit ;; - *:TOPS-20:*:*) - echo pdp10-unknown-tops20 - exit ;; - *:ITS:*:*) - echo pdp10-unknown-its - exit ;; - SEI:*:*:SEIUX) - echo mips-sei-seiux"$UNAME_RELEASE" - exit ;; - *:DragonFly:*:*) - echo "$UNAME_MACHINE"-unknown-dragonfly"`echo "$UNAME_RELEASE"|sed -e 's/[-(].*//'`" - exit ;; - *:*VMS:*:*) - UNAME_MACHINE=`(uname -p) 2>/dev/null` - case "$UNAME_MACHINE" in - A*) echo alpha-dec-vms ; exit ;; - I*) echo ia64-dec-vms ; exit ;; - V*) echo vax-dec-vms ; exit ;; - esac ;; - *:XENIX:*:SysV) - echo i386-pc-xenix - exit ;; - i*86:skyos:*:*) - echo "$UNAME_MACHINE"-pc-skyos"`echo "$UNAME_RELEASE" | sed -e 's/ .*$//'`" - exit ;; - i*86:rdos:*:*) - echo "$UNAME_MACHINE"-pc-rdos - exit ;; - i*86:AROS:*:*) - echo "$UNAME_MACHINE"-pc-aros - exit ;; - x86_64:VMkernel:*:*) - echo "$UNAME_MACHINE"-unknown-esx - exit ;; - amd64:Isilon\ OneFS:*:*) - echo x86_64-unknown-onefs - exit ;; - *:Unleashed:*:*) - echo "$UNAME_MACHINE"-unknown-unleashed"$UNAME_RELEASE" - exit ;; -esac - -# No uname command or uname output not recognized. -set_cc_for_build -cat > "$dummy.c" < -#include -#endif -#if defined(ultrix) || defined(_ultrix) || defined(__ultrix) || defined(__ultrix__) -#if defined (vax) || defined (__vax) || defined (__vax__) || defined(mips) || defined(__mips) || defined(__mips__) || defined(MIPS) || defined(__MIPS__) -#include -#if defined(_SIZE_T_) || defined(SIGLOST) -#include -#endif -#endif -#endif -main () -{ -#if defined (sony) -#if defined (MIPSEB) - /* BFD wants "bsd" instead of "newsos". 
Perhaps BFD should be changed, - I don't know.... */ - printf ("mips-sony-bsd\n"); exit (0); -#else -#include - printf ("m68k-sony-newsos%s\n", -#ifdef NEWSOS4 - "4" -#else - "" -#endif - ); exit (0); -#endif -#endif - -#if defined (NeXT) -#if !defined (__ARCHITECTURE__) -#define __ARCHITECTURE__ "m68k" -#endif - int version; - version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`; - if (version < 4) - printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version); - else - printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version); - exit (0); -#endif - -#if defined (MULTIMAX) || defined (n16) -#if defined (UMAXV) - printf ("ns32k-encore-sysv\n"); exit (0); -#else -#if defined (CMU) - printf ("ns32k-encore-mach\n"); exit (0); -#else - printf ("ns32k-encore-bsd\n"); exit (0); -#endif -#endif -#endif - -#if defined (__386BSD__) - printf ("i386-pc-bsd\n"); exit (0); -#endif - -#if defined (sequent) -#if defined (i386) - printf ("i386-sequent-dynix\n"); exit (0); -#endif -#if defined (ns32000) - printf ("ns32k-sequent-dynix\n"); exit (0); -#endif -#endif - -#if defined (_SEQUENT_) - struct utsname un; - - uname(&un); - if (strncmp(un.version, "V2", 2) == 0) { - printf ("i386-sequent-ptx2\n"); exit (0); - } - if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? 
*/ - printf ("i386-sequent-ptx1\n"); exit (0); - } - printf ("i386-sequent-ptx\n"); exit (0); -#endif - -#if defined (vax) -#if !defined (ultrix) -#include -#if defined (BSD) -#if BSD == 43 - printf ("vax-dec-bsd4.3\n"); exit (0); -#else -#if BSD == 199006 - printf ("vax-dec-bsd4.3reno\n"); exit (0); -#else - printf ("vax-dec-bsd\n"); exit (0); -#endif -#endif -#else - printf ("vax-dec-bsd\n"); exit (0); -#endif -#else -#if defined(_SIZE_T_) || defined(SIGLOST) - struct utsname un; - uname (&un); - printf ("vax-dec-ultrix%s\n", un.release); exit (0); -#else - printf ("vax-dec-ultrix\n"); exit (0); -#endif -#endif -#endif -#if defined(ultrix) || defined(_ultrix) || defined(__ultrix) || defined(__ultrix__) -#if defined(mips) || defined(__mips) || defined(__mips__) || defined(MIPS) || defined(__MIPS__) -#if defined(_SIZE_T_) || defined(SIGLOST) - struct utsname *un; - uname (&un); - printf ("mips-dec-ultrix%s\n", un.release); exit (0); -#else - printf ("mips-dec-ultrix\n"); exit (0); -#endif -#endif -#endif - -#if defined (alliant) && defined (i860) - printf ("i860-alliant-bsd\n"); exit (0); -#endif - - exit (1); -} -EOF - -$CC_FOR_BUILD -o "$dummy" "$dummy.c" 2>/dev/null && SYSTEM_NAME=`$dummy` && - { echo "$SYSTEM_NAME"; exit; } - -# Apollos put the system type in the environment. -test -d /usr/apollo && { echo "$ISP-apollo-$SYSTYPE"; exit; } - -echo "$0: unable to guess system type" >&2 - -case "$UNAME_MACHINE:$UNAME_SYSTEM" in - mips:Linux | mips64:Linux) - # If we got here on MIPS GNU/Linux, output extra information. 
- cat >&2 <&2 </dev/null || echo unknown` -uname -r = `(uname -r) 2>/dev/null || echo unknown` -uname -s = `(uname -s) 2>/dev/null || echo unknown` -uname -v = `(uname -v) 2>/dev/null || echo unknown` - -/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null` -/bin/uname -X = `(/bin/uname -X) 2>/dev/null` - -hostinfo = `(hostinfo) 2>/dev/null` -/bin/universe = `(/bin/universe) 2>/dev/null` -/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null` -/bin/arch = `(/bin/arch) 2>/dev/null` -/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null` -/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null` - -UNAME_MACHINE = "$UNAME_MACHINE" -UNAME_RELEASE = "$UNAME_RELEASE" -UNAME_SYSTEM = "$UNAME_SYSTEM" -UNAME_VERSION = "$UNAME_VERSION" -EOF - -exit 1 - -# Local variables: -# eval: (add-hook 'before-save-hook 'time-stamp) -# time-stamp-start: "timestamp='" -# time-stamp-format: "%:y-%02m-%02d" -# time-stamp-end: "'" -# End: diff --git a/pcre2/config.sub b/pcre2/config.sub deleted file mode 100755 index f02d43ad5..000000000 --- a/pcre2/config.sub +++ /dev/null @@ -1,1793 +0,0 @@ -#! /bin/sh -# Configuration validation subroutine script. -# Copyright 1992-2020 Free Software Foundation, Inc. - -timestamp='2020-01-01' - -# This file is free software; you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see . 
-# -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that -# program. This Exception is an additional permission under section 7 -# of the GNU General Public License, version 3 ("GPLv3"). - - -# Please send patches to . -# -# Configuration subroutine to validate and canonicalize a configuration type. -# Supply the specified configuration type as an argument. -# If it is invalid, we print an error message on stderr and exit with code 1. -# Otherwise, we print the canonical config type on stdout and succeed. - -# You can get the latest version of this script from: -# https://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub - -# This file is supposed to be the same for all GNU packages -# and recognize all the CPU types, system types and aliases -# that are meaningful with *any* GNU software. -# Each package is responsible for reporting which valid configurations -# it does not support. The user should be able to distinguish -# a failure to support a valid configuration from a meaningless -# configuration. - -# The goal of this file is to map all the various variations of a given -# machine specification into a single specification in the form: -# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM -# or in some cases, the newer four-part form: -# CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM -# It is wrong to echo any other type of specification. - -me=`echo "$0" | sed -e 's,.*/,,'` - -usage="\ -Usage: $0 [OPTION] CPU-MFR-OPSYS or ALIAS - -Canonicalize a configuration name. - -Options: - -h, --help print this help, then exit - -t, --time-stamp print date of last modification, then exit - -v, --version print version number, then exit - -Report bugs and patches to ." 
- -version="\ -GNU config.sub ($timestamp) - -Copyright 1992-2020 Free Software Foundation, Inc. - -This is free software; see the source for copying conditions. There is NO -warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." - -help=" -Try \`$me --help' for more information." - -# Parse command line -while test $# -gt 0 ; do - case $1 in - --time-stamp | --time* | -t ) - echo "$timestamp" ; exit ;; - --version | -v ) - echo "$version" ; exit ;; - --help | --h* | -h ) - echo "$usage"; exit ;; - -- ) # Stop option processing - shift; break ;; - - ) # Use stdin as input. - break ;; - -* ) - echo "$me: invalid option $1$help" >&2 - exit 1 ;; - - *local*) - # First pass through any local machine types. - echo "$1" - exit ;; - - * ) - break ;; - esac -done - -case $# in - 0) echo "$me: missing argument$help" >&2 - exit 1;; - 1) ;; - *) echo "$me: too many arguments$help" >&2 - exit 1;; -esac - -# Split fields of configuration type -# shellcheck disable=SC2162 -IFS="-" read field1 field2 field3 field4 <&2 - exit 1 - ;; - *-*-*-*) - basic_machine=$field1-$field2 - os=$field3-$field4 - ;; - *-*-*) - # Ambiguous whether COMPANY is present, or skipped and KERNEL-OS is two - # parts - maybe_os=$field2-$field3 - case $maybe_os in - nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc \ - | linux-newlib* | linux-musl* | linux-uclibc* | uclinux-uclibc* \ - | uclinux-gnu* | kfreebsd*-gnu* | knetbsd*-gnu* | netbsd*-gnu* \ - | netbsd*-eabi* | kopensolaris*-gnu* | cloudabi*-eabi* \ - | storm-chaos* | os2-emx* | rtmk-nova*) - basic_machine=$field1 - os=$maybe_os - ;; - android-linux) - basic_machine=$field1-unknown - os=linux-android - ;; - *) - basic_machine=$field1-$field2 - os=$field3 - ;; - esac - ;; - *-*) - # A lone config we happen to match not fitting any pattern - case $field1-$field2 in - decstation-3100) - basic_machine=mips-dec - os= - ;; - *-*) - # Second component is usually, but not always the OS - case $field2 in - # Prevent following 
clause from handling this valid os - sun*os*) - basic_machine=$field1 - os=$field2 - ;; - # Manufacturers - dec* | mips* | sequent* | encore* | pc533* | sgi* | sony* \ - | att* | 7300* | 3300* | delta* | motorola* | sun[234]* \ - | unicom* | ibm* | next | hp | isi* | apollo | altos* \ - | convergent* | ncr* | news | 32* | 3600* | 3100* \ - | hitachi* | c[123]* | convex* | sun | crds | omron* | dg \ - | ultra | tti* | harris | dolphin | highlevel | gould \ - | cbm | ns | masscomp | apple | axis | knuth | cray \ - | microblaze* | sim | cisco \ - | oki | wec | wrs | winbond) - basic_machine=$field1-$field2 - os= - ;; - *) - basic_machine=$field1 - os=$field2 - ;; - esac - ;; - esac - ;; - *) - # Convert single-component short-hands not valid as part of - # multi-component configurations. - case $field1 in - 386bsd) - basic_machine=i386-pc - os=bsd - ;; - a29khif) - basic_machine=a29k-amd - os=udi - ;; - adobe68k) - basic_machine=m68010-adobe - os=scout - ;; - alliant) - basic_machine=fx80-alliant - os= - ;; - altos | altos3068) - basic_machine=m68k-altos - os= - ;; - am29k) - basic_machine=a29k-none - os=bsd - ;; - amdahl) - basic_machine=580-amdahl - os=sysv - ;; - amiga) - basic_machine=m68k-unknown - os= - ;; - amigaos | amigados) - basic_machine=m68k-unknown - os=amigaos - ;; - amigaunix | amix) - basic_machine=m68k-unknown - os=sysv4 - ;; - apollo68) - basic_machine=m68k-apollo - os=sysv - ;; - apollo68bsd) - basic_machine=m68k-apollo - os=bsd - ;; - aros) - basic_machine=i386-pc - os=aros - ;; - aux) - basic_machine=m68k-apple - os=aux - ;; - balance) - basic_machine=ns32k-sequent - os=dynix - ;; - blackfin) - basic_machine=bfin-unknown - os=linux - ;; - cegcc) - basic_machine=arm-unknown - os=cegcc - ;; - convex-c1) - basic_machine=c1-convex - os=bsd - ;; - convex-c2) - basic_machine=c2-convex - os=bsd - ;; - convex-c32) - basic_machine=c32-convex - os=bsd - ;; - convex-c34) - basic_machine=c34-convex - os=bsd - ;; - convex-c38) - basic_machine=c38-convex - 
os=bsd - ;; - cray) - basic_machine=j90-cray - os=unicos - ;; - crds | unos) - basic_machine=m68k-crds - os= - ;; - da30) - basic_machine=m68k-da30 - os= - ;; - decstation | pmax | pmin | dec3100 | decstatn) - basic_machine=mips-dec - os= - ;; - delta88) - basic_machine=m88k-motorola - os=sysv3 - ;; - dicos) - basic_machine=i686-pc - os=dicos - ;; - djgpp) - basic_machine=i586-pc - os=msdosdjgpp - ;; - ebmon29k) - basic_machine=a29k-amd - os=ebmon - ;; - es1800 | OSE68k | ose68k | ose | OSE) - basic_machine=m68k-ericsson - os=ose - ;; - gmicro) - basic_machine=tron-gmicro - os=sysv - ;; - go32) - basic_machine=i386-pc - os=go32 - ;; - h8300hms) - basic_machine=h8300-hitachi - os=hms - ;; - h8300xray) - basic_machine=h8300-hitachi - os=xray - ;; - h8500hms) - basic_machine=h8500-hitachi - os=hms - ;; - harris) - basic_machine=m88k-harris - os=sysv3 - ;; - hp300 | hp300hpux) - basic_machine=m68k-hp - os=hpux - ;; - hp300bsd) - basic_machine=m68k-hp - os=bsd - ;; - hppaosf) - basic_machine=hppa1.1-hp - os=osf - ;; - hppro) - basic_machine=hppa1.1-hp - os=proelf - ;; - i386mach) - basic_machine=i386-mach - os=mach - ;; - isi68 | isi) - basic_machine=m68k-isi - os=sysv - ;; - m68knommu) - basic_machine=m68k-unknown - os=linux - ;; - magnum | m3230) - basic_machine=mips-mips - os=sysv - ;; - merlin) - basic_machine=ns32k-utek - os=sysv - ;; - mingw64) - basic_machine=x86_64-pc - os=mingw64 - ;; - mingw32) - basic_machine=i686-pc - os=mingw32 - ;; - mingw32ce) - basic_machine=arm-unknown - os=mingw32ce - ;; - monitor) - basic_machine=m68k-rom68k - os=coff - ;; - morphos) - basic_machine=powerpc-unknown - os=morphos - ;; - moxiebox) - basic_machine=moxie-unknown - os=moxiebox - ;; - msdos) - basic_machine=i386-pc - os=msdos - ;; - msys) - basic_machine=i686-pc - os=msys - ;; - mvs) - basic_machine=i370-ibm - os=mvs - ;; - nacl) - basic_machine=le32-unknown - os=nacl - ;; - ncr3000) - basic_machine=i486-ncr - os=sysv4 - ;; - netbsd386) - basic_machine=i386-pc - os=netbsd - 
;; - netwinder) - basic_machine=armv4l-rebel - os=linux - ;; - news | news700 | news800 | news900) - basic_machine=m68k-sony - os=newsos - ;; - news1000) - basic_machine=m68030-sony - os=newsos - ;; - necv70) - basic_machine=v70-nec - os=sysv - ;; - nh3000) - basic_machine=m68k-harris - os=cxux - ;; - nh[45]000) - basic_machine=m88k-harris - os=cxux - ;; - nindy960) - basic_machine=i960-intel - os=nindy - ;; - mon960) - basic_machine=i960-intel - os=mon960 - ;; - nonstopux) - basic_machine=mips-compaq - os=nonstopux - ;; - os400) - basic_machine=powerpc-ibm - os=os400 - ;; - OSE68000 | ose68000) - basic_machine=m68000-ericsson - os=ose - ;; - os68k) - basic_machine=m68k-none - os=os68k - ;; - paragon) - basic_machine=i860-intel - os=osf - ;; - parisc) - basic_machine=hppa-unknown - os=linux - ;; - pw32) - basic_machine=i586-unknown - os=pw32 - ;; - rdos | rdos64) - basic_machine=x86_64-pc - os=rdos - ;; - rdos32) - basic_machine=i386-pc - os=rdos - ;; - rom68k) - basic_machine=m68k-rom68k - os=coff - ;; - sa29200) - basic_machine=a29k-amd - os=udi - ;; - sei) - basic_machine=mips-sei - os=seiux - ;; - sequent) - basic_machine=i386-sequent - os= - ;; - sps7) - basic_machine=m68k-bull - os=sysv2 - ;; - st2000) - basic_machine=m68k-tandem - os= - ;; - stratus) - basic_machine=i860-stratus - os=sysv4 - ;; - sun2) - basic_machine=m68000-sun - os= - ;; - sun2os3) - basic_machine=m68000-sun - os=sunos3 - ;; - sun2os4) - basic_machine=m68000-sun - os=sunos4 - ;; - sun3) - basic_machine=m68k-sun - os= - ;; - sun3os3) - basic_machine=m68k-sun - os=sunos3 - ;; - sun3os4) - basic_machine=m68k-sun - os=sunos4 - ;; - sun4) - basic_machine=sparc-sun - os= - ;; - sun4os3) - basic_machine=sparc-sun - os=sunos3 - ;; - sun4os4) - basic_machine=sparc-sun - os=sunos4 - ;; - sun4sol2) - basic_machine=sparc-sun - os=solaris2 - ;; - sun386 | sun386i | roadrunner) - basic_machine=i386-sun - os= - ;; - sv1) - basic_machine=sv1-cray - os=unicos - ;; - symmetry) - basic_machine=i386-sequent - 
os=dynix - ;; - t3e) - basic_machine=alphaev5-cray - os=unicos - ;; - t90) - basic_machine=t90-cray - os=unicos - ;; - toad1) - basic_machine=pdp10-xkl - os=tops20 - ;; - tpf) - basic_machine=s390x-ibm - os=tpf - ;; - udi29k) - basic_machine=a29k-amd - os=udi - ;; - ultra3) - basic_machine=a29k-nyu - os=sym1 - ;; - v810 | necv810) - basic_machine=v810-nec - os=none - ;; - vaxv) - basic_machine=vax-dec - os=sysv - ;; - vms) - basic_machine=vax-dec - os=vms - ;; - vsta) - basic_machine=i386-pc - os=vsta - ;; - vxworks960) - basic_machine=i960-wrs - os=vxworks - ;; - vxworks68) - basic_machine=m68k-wrs - os=vxworks - ;; - vxworks29k) - basic_machine=a29k-wrs - os=vxworks - ;; - xbox) - basic_machine=i686-pc - os=mingw32 - ;; - ymp) - basic_machine=ymp-cray - os=unicos - ;; - *) - basic_machine=$1 - os= - ;; - esac - ;; -esac - -# Decode 1-component or ad-hoc basic machines -case $basic_machine in - # Here we handle the default manufacturer of certain CPU types. It is in - # some cases the only manufacturer, in others, it is the most popular. - w89k) - cpu=hppa1.1 - vendor=winbond - ;; - op50n) - cpu=hppa1.1 - vendor=oki - ;; - op60c) - cpu=hppa1.1 - vendor=oki - ;; - ibm*) - cpu=i370 - vendor=ibm - ;; - orion105) - cpu=clipper - vendor=highlevel - ;; - mac | mpw | mac-mpw) - cpu=m68k - vendor=apple - ;; - pmac | pmac-mpw) - cpu=powerpc - vendor=apple - ;; - - # Recognize the various machine names and aliases which stand - # for a CPU type and a company and sometimes even an OS. 
- 3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc) - cpu=m68000 - vendor=att - ;; - 3b*) - cpu=we32k - vendor=att - ;; - bluegene*) - cpu=powerpc - vendor=ibm - os=cnk - ;; - decsystem10* | dec10*) - cpu=pdp10 - vendor=dec - os=tops10 - ;; - decsystem20* | dec20*) - cpu=pdp10 - vendor=dec - os=tops20 - ;; - delta | 3300 | motorola-3300 | motorola-delta \ - | 3300-motorola | delta-motorola) - cpu=m68k - vendor=motorola - ;; - dpx2*) - cpu=m68k - vendor=bull - os=sysv3 - ;; - encore | umax | mmax) - cpu=ns32k - vendor=encore - ;; - elxsi) - cpu=elxsi - vendor=elxsi - os=${os:-bsd} - ;; - fx2800) - cpu=i860 - vendor=alliant - ;; - genix) - cpu=ns32k - vendor=ns - ;; - h3050r* | hiux*) - cpu=hppa1.1 - vendor=hitachi - os=hiuxwe2 - ;; - hp3k9[0-9][0-9] | hp9[0-9][0-9]) - cpu=hppa1.0 - vendor=hp - ;; - hp9k2[0-9][0-9] | hp9k31[0-9]) - cpu=m68000 - vendor=hp - ;; - hp9k3[2-9][0-9]) - cpu=m68k - vendor=hp - ;; - hp9k6[0-9][0-9] | hp6[0-9][0-9]) - cpu=hppa1.0 - vendor=hp - ;; - hp9k7[0-79][0-9] | hp7[0-79][0-9]) - cpu=hppa1.1 - vendor=hp - ;; - hp9k78[0-9] | hp78[0-9]) - # FIXME: really hppa2.0-hp - cpu=hppa1.1 - vendor=hp - ;; - hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893) - # FIXME: really hppa2.0-hp - cpu=hppa1.1 - vendor=hp - ;; - hp9k8[0-9][13679] | hp8[0-9][13679]) - cpu=hppa1.1 - vendor=hp - ;; - hp9k8[0-9][0-9] | hp8[0-9][0-9]) - cpu=hppa1.0 - vendor=hp - ;; - i*86v32) - cpu=`echo "$1" | sed -e 's/86.*/86/'` - vendor=pc - os=sysv32 - ;; - i*86v4*) - cpu=`echo "$1" | sed -e 's/86.*/86/'` - vendor=pc - os=sysv4 - ;; - i*86v) - cpu=`echo "$1" | sed -e 's/86.*/86/'` - vendor=pc - os=sysv - ;; - i*86sol2) - cpu=`echo "$1" | sed -e 's/86.*/86/'` - vendor=pc - os=solaris2 - ;; - j90 | j90-cray) - cpu=j90 - vendor=cray - os=${os:-unicos} - ;; - iris | iris4d) - cpu=mips - vendor=sgi - case $os in - irix*) - ;; - *) - os=irix4 - ;; - esac - ;; - miniframe) - cpu=m68000 - vendor=convergent - ;; - *mint | mint[0-9]* | 
*MiNT | *MiNT[0-9]*) - cpu=m68k - vendor=atari - os=mint - ;; - news-3600 | risc-news) - cpu=mips - vendor=sony - os=newsos - ;; - next | m*-next) - cpu=m68k - vendor=next - case $os in - openstep*) - ;; - nextstep*) - ;; - ns2*) - os=nextstep2 - ;; - *) - os=nextstep3 - ;; - esac - ;; - np1) - cpu=np1 - vendor=gould - ;; - op50n-* | op60c-*) - cpu=hppa1.1 - vendor=oki - os=proelf - ;; - pa-hitachi) - cpu=hppa1.1 - vendor=hitachi - os=hiuxwe2 - ;; - pbd) - cpu=sparc - vendor=tti - ;; - pbb) - cpu=m68k - vendor=tti - ;; - pc532) - cpu=ns32k - vendor=pc532 - ;; - pn) - cpu=pn - vendor=gould - ;; - power) - cpu=power - vendor=ibm - ;; - ps2) - cpu=i386 - vendor=ibm - ;; - rm[46]00) - cpu=mips - vendor=siemens - ;; - rtpc | rtpc-*) - cpu=romp - vendor=ibm - ;; - sde) - cpu=mipsisa32 - vendor=sde - os=${os:-elf} - ;; - simso-wrs) - cpu=sparclite - vendor=wrs - os=vxworks - ;; - tower | tower-32) - cpu=m68k - vendor=ncr - ;; - vpp*|vx|vx-*) - cpu=f301 - vendor=fujitsu - ;; - w65) - cpu=w65 - vendor=wdc - ;; - w89k-*) - cpu=hppa1.1 - vendor=winbond - os=proelf - ;; - none) - cpu=none - vendor=none - ;; - leon|leon[3-9]) - cpu=sparc - vendor=$basic_machine - ;; - leon-*|leon[3-9]-*) - cpu=sparc - vendor=`echo "$basic_machine" | sed 's/-.*//'` - ;; - - *-*) - # shellcheck disable=SC2162 - IFS="-" read cpu vendor <&2 - exit 1 - ;; - esac - ;; -esac - -# Here we canonicalize certain aliases for manufacturers. -case $vendor in - digital*) - vendor=dec - ;; - commodore*) - vendor=cbm - ;; - *) - ;; -esac - -# Decode manufacturer-specific aliases for certain operating systems. - -if [ x$os != x ] -then -case $os in - # First match some system type aliases that might get confused - # with valid system types. - # solaris* is a basic system type, with this one exception. 
- auroraux) - os=auroraux - ;; - bluegene*) - os=cnk - ;; - solaris1 | solaris1.*) - os=`echo $os | sed -e 's|solaris1|sunos4|'` - ;; - solaris) - os=solaris2 - ;; - unixware*) - os=sysv4.2uw - ;; - gnu/linux*) - os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'` - ;; - # es1800 is here to avoid being matched by es* (a different OS) - es1800*) - os=ose - ;; - # Some version numbers need modification - chorusos*) - os=chorusos - ;; - isc) - os=isc2.2 - ;; - sco6) - os=sco5v6 - ;; - sco5) - os=sco3.2v5 - ;; - sco4) - os=sco3.2v4 - ;; - sco3.2.[4-9]*) - os=`echo $os | sed -e 's/sco3.2./sco3.2v/'` - ;; - sco3.2v[4-9]* | sco5v6*) - # Don't forget version if it is 3.2v4 or newer. - ;; - scout) - # Don't match below - ;; - sco*) - os=sco3.2v2 - ;; - psos*) - os=psos - ;; - # Now accept the basic system types. - # The portable systems comes first. - # Each alternative MUST end in a * to match a version number. - # sysv* is not here because it comes later, after sysvr4. - gnu* | bsd* | mach* | minix* | genix* | ultrix* | irix* \ - | *vms* | esix* | aix* | cnk* | sunos | sunos[34]*\ - | hpux* | unos* | osf* | luna* | dgux* | auroraux* | solaris* \ - | sym* | kopensolaris* | plan9* \ - | amigaos* | amigados* | msdos* | newsos* | unicos* | aof* \ - | aos* | aros* | cloudabi* | sortix* | twizzler* \ - | nindy* | vxsim* | vxworks* | ebmon* | hms* | mvs* \ - | clix* | riscos* | uniplus* | iris* | isc* | rtu* | xenix* \ - | knetbsd* | mirbsd* | netbsd* \ - | bitrig* | openbsd* | solidbsd* | libertybsd* | os108* \ - | ekkobsd* | kfreebsd* | freebsd* | riscix* | lynxos* \ - | bosx* | nextstep* | cxux* | aout* | elf* | oabi* \ - | ptx* | coff* | ecoff* | winnt* | domain* | vsta* \ - | udi* | eabi* | lites* | ieee* | go32* | aux* | hcos* \ - | chorusrdb* | cegcc* | glidix* \ - | cygwin* | msys* | pe* | moss* | proelf* | rtems* \ - | midipix* | mingw32* | mingw64* | linux-gnu* | linux-android* \ - | linux-newlib* | linux-musl* | linux-uclibc* \ - | uxpv* | beos* | mpeix* | udk* | 
moxiebox* \ - | interix* | uwin* | mks* | rhapsody* | darwin* \ - | openstep* | oskit* | conix* | pw32* | nonstopux* \ - | storm-chaos* | tops10* | tenex* | tops20* | its* \ - | os2* | vos* | palmos* | uclinux* | nucleus* \ - | morphos* | superux* | rtmk* | windiss* \ - | powermax* | dnix* | nx6 | nx7 | sei* | dragonfly* \ - | skyos* | haiku* | rdos* | toppers* | drops* | es* \ - | onefs* | tirtos* | phoenix* | fuchsia* | redox* | bme* \ - | midnightbsd* | amdhsa* | unleashed* | emscripten* | wasi* \ - | nsk* | powerunix) - # Remember, each alternative MUST END IN *, to match a version number. - ;; - qnx*) - case $cpu in - x86 | i*86) - ;; - *) - os=nto-$os - ;; - esac - ;; - hiux*) - os=hiuxwe2 - ;; - nto-qnx*) - ;; - nto*) - os=`echo $os | sed -e 's|nto|nto-qnx|'` - ;; - sim | xray | os68k* | v88r* \ - | windows* | osx | abug | netware* | os9* \ - | macos* | mpw* | magic* | mmixware* | mon960* | lnews*) - ;; - linux-dietlibc) - os=linux-dietlibc - ;; - linux*) - os=`echo $os | sed -e 's|linux|linux-gnu|'` - ;; - lynx*178) - os=lynxos178 - ;; - lynx*5) - os=lynxos5 - ;; - lynx*) - os=lynxos - ;; - mac*) - os=`echo "$os" | sed -e 's|mac|macos|'` - ;; - opened*) - os=openedition - ;; - os400*) - os=os400 - ;; - sunos5*) - os=`echo "$os" | sed -e 's|sunos5|solaris2|'` - ;; - sunos6*) - os=`echo "$os" | sed -e 's|sunos6|solaris3|'` - ;; - wince*) - os=wince - ;; - utek*) - os=bsd - ;; - dynix*) - os=bsd - ;; - acis*) - os=aos - ;; - atheos*) - os=atheos - ;; - syllable*) - os=syllable - ;; - 386bsd) - os=bsd - ;; - ctix* | uts*) - os=sysv - ;; - nova*) - os=rtmk-nova - ;; - ns2) - os=nextstep2 - ;; - # Preserve the version number of sinix5. - sinix5.*) - os=`echo $os | sed -e 's|sinix|sysv|'` - ;; - sinix*) - os=sysv4 - ;; - tpf*) - os=tpf - ;; - triton*) - os=sysv3 - ;; - oss*) - os=sysv3 - ;; - svr4*) - os=sysv4 - ;; - svr3) - os=sysv3 - ;; - sysvr4) - os=sysv4 - ;; - # This must come after sysvr4. 
- sysv*) - ;; - ose*) - os=ose - ;; - *mint | mint[0-9]* | *MiNT | MiNT[0-9]*) - os=mint - ;; - zvmoe) - os=zvmoe - ;; - dicos*) - os=dicos - ;; - pikeos*) - # Until real need of OS specific support for - # particular features comes up, bare metal - # configurations are quite functional. - case $cpu in - arm*) - os=eabi - ;; - *) - os=elf - ;; - esac - ;; - nacl*) - ;; - ios) - ;; - none) - ;; - *-eabi) - ;; - *) - echo Invalid configuration \`"$1"\': system \`"$os"\' not recognized 1>&2 - exit 1 - ;; -esac -else - -# Here we handle the default operating systems that come with various machines. -# The value should be what the vendor currently ships out the door with their -# machine or put another way, the most popular os provided with the machine. - -# Note that if you're going to try to match "-MANUFACTURER" here (say, -# "-sun"), then you have to tell the case statement up towards the top -# that MANUFACTURER isn't an operating system. Otherwise, code above -# will signal an error saying that MANUFACTURER isn't an operating -# system, and we'll never get to this point. - -case $cpu-$vendor in - score-*) - os=elf - ;; - spu-*) - os=elf - ;; - *-acorn) - os=riscix1.2 - ;; - arm*-rebel) - os=linux - ;; - arm*-semi) - os=aout - ;; - c4x-* | tic4x-*) - os=coff - ;; - c8051-*) - os=elf - ;; - clipper-intergraph) - os=clix - ;; - hexagon-*) - os=elf - ;; - tic54x-*) - os=coff - ;; - tic55x-*) - os=coff - ;; - tic6x-*) - os=coff - ;; - # This must come before the *-dec entry. - pdp10-*) - os=tops20 - ;; - pdp11-*) - os=none - ;; - *-dec | vax-*) - os=ultrix4.2 - ;; - m68*-apollo) - os=domain - ;; - i386-sun) - os=sunos4.0.2 - ;; - m68000-sun) - os=sunos3 - ;; - m68*-cisco) - os=aout - ;; - mep-*) - os=elf - ;; - mips*-cisco) - os=elf - ;; - mips*-*) - os=elf - ;; - or32-*) - os=coff - ;; - *-tti) # must be before sparc entry or we get the wrong os. 
- os=sysv3 - ;; - sparc-* | *-sun) - os=sunos4.1.1 - ;; - pru-*) - os=elf - ;; - *-be) - os=beos - ;; - *-ibm) - os=aix - ;; - *-knuth) - os=mmixware - ;; - *-wec) - os=proelf - ;; - *-winbond) - os=proelf - ;; - *-oki) - os=proelf - ;; - *-hp) - os=hpux - ;; - *-hitachi) - os=hiux - ;; - i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent) - os=sysv - ;; - *-cbm) - os=amigaos - ;; - *-dg) - os=dgux - ;; - *-dolphin) - os=sysv3 - ;; - m68k-ccur) - os=rtu - ;; - m88k-omron*) - os=luna - ;; - *-next) - os=nextstep - ;; - *-sequent) - os=ptx - ;; - *-crds) - os=unos - ;; - *-ns) - os=genix - ;; - i370-*) - os=mvs - ;; - *-gould) - os=sysv - ;; - *-highlevel) - os=bsd - ;; - *-encore) - os=bsd - ;; - *-sgi) - os=irix - ;; - *-siemens) - os=sysv4 - ;; - *-masscomp) - os=rtu - ;; - f30[01]-fujitsu | f700-fujitsu) - os=uxpv - ;; - *-rom68k) - os=coff - ;; - *-*bug) - os=coff - ;; - *-apple) - os=macos - ;; - *-atari*) - os=mint - ;; - *-wrs) - os=vxworks - ;; - *) - os=none - ;; -esac -fi - -# Here we handle the case where we know the os, and the CPU type, but not the -# manufacturer. We pick the logical manufacturer. 
-case $vendor in - unknown) - case $os in - riscix*) - vendor=acorn - ;; - sunos*) - vendor=sun - ;; - cnk*|-aix*) - vendor=ibm - ;; - beos*) - vendor=be - ;; - hpux*) - vendor=hp - ;; - mpeix*) - vendor=hp - ;; - hiux*) - vendor=hitachi - ;; - unos*) - vendor=crds - ;; - dgux*) - vendor=dg - ;; - luna*) - vendor=omron - ;; - genix*) - vendor=ns - ;; - clix*) - vendor=intergraph - ;; - mvs* | opened*) - vendor=ibm - ;; - os400*) - vendor=ibm - ;; - ptx*) - vendor=sequent - ;; - tpf*) - vendor=ibm - ;; - vxsim* | vxworks* | windiss*) - vendor=wrs - ;; - aux*) - vendor=apple - ;; - hms*) - vendor=hitachi - ;; - mpw* | macos*) - vendor=apple - ;; - *mint | mint[0-9]* | *MiNT | MiNT[0-9]*) - vendor=atari - ;; - vos*) - vendor=stratus - ;; - esac - ;; -esac - -echo "$cpu-$vendor-$os" -exit - -# Local variables: -# eval: (add-hook 'before-save-hook 'time-stamp) -# time-stamp-start: "timestamp='" -# time-stamp-format: "%:y-%02m-%02d" -# time-stamp-end: "'" -# End: diff --git a/pcre2/configure.ac b/pcre2/configure.ac deleted file mode 100644 index af26f0b8f..000000000 --- a/pcre2/configure.ac +++ /dev/null @@ -1,1117 +0,0 @@ -dnl Process this file with autoconf to produce a configure script. - -dnl NOTE FOR MAINTAINERS: Do not use minor version numbers 08 or 09 because -dnl the leading zeros may cause them to be treated as invalid octal constants -dnl if a PCRE2 user writes code that uses PCRE2_MINOR as a number. There is now -dnl a check further down that throws an error if 08 or 09 are used. - -dnl The PCRE2_PRERELEASE feature is for identifying release candidates. It might -dnl be defined as -RC2, for example. For real releases, it should be empty. 
- -m4_define(pcre2_major, [10]) -m4_define(pcre2_minor, [36]) -m4_define(pcre2_prerelease, []) -m4_define(pcre2_date, [2020-12-04]) - -# Libtool shared library interface versions (current:revision:age) -m4_define(libpcre2_8_version, [10:1:10]) -m4_define(libpcre2_16_version, [10:1:10]) -m4_define(libpcre2_32_version, [10:1:10]) -m4_define(libpcre2_posix_version, [2:3:0]) - -# NOTE: The CMakeLists.txt file searches for the above variables in the first -# 50 lines of this file. Please update that if the variables above are moved. - -AC_PREREQ(2.57) -AC_INIT(PCRE2, pcre2_major.pcre2_minor[]pcre2_prerelease, , pcre2) -AC_CONFIG_SRCDIR([src/pcre2.h.in]) -AM_INIT_AUTOMAKE([dist-bzip2 dist-zip]) -m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) -AC_CONFIG_HEADERS(src/config.h) - -# This was added at the suggestion of libtoolize (03-Jan-10) -AC_CONFIG_MACRO_DIR([m4]) - -# The default CFLAGS in Autoconf are "-g -O2" for gcc and just "-g" for any -# other compiler. There doesn't seem to be a standard way of getting rid of the -# -g (which I don't think is needed for a production library). This fudge seems -# to achieve the necessary. First, we remember the externally set values of -# CFLAGS. Then call the AC_PROG_CC macro to find the compiler - if CFLAGS is -# not set, it will be set to Autoconf's defaults. Afterwards, if the original -# values were not set, remove the -g from the Autoconf defaults. 
- -remember_set_CFLAGS="$CFLAGS" - -AC_PROG_CC -AM_PROG_CC_C_O -AC_USE_SYSTEM_EXTENSIONS - -if test "x$remember_set_CFLAGS" = "x" -then - if test "$CFLAGS" = "-g -O2" - then - CFLAGS="-O2" - elif test "$CFLAGS" = "-g" - then - CFLAGS="" - fi -fi - -# This is a new thing required to stop a warning from automake 1.12 -m4_ifdef([AM_PROG_AR], [AM_PROG_AR]) - -# Check for a 64-bit integer type -AC_TYPE_INT64_T - -AC_PROG_INSTALL -AC_LIBTOOL_WIN32_DLL -LT_INIT -AC_PROG_LN_S - -# Check for GCC visibility feature - -PCRE2_VISIBILITY - -# Check for Clang __attribute__((uninitialized)) feature - -AC_MSG_CHECKING([for __attribute__((uninitialized))]) -AC_LANG_PUSH([C]) -tmp_CFLAGS=$CFLAGS -CFLAGS="$CFLAGS -Werror" -AC_COMPILE_IFELSE([AC_LANG_PROGRAM(, - [[char buf[128] __attribute__((uninitialized));(void)buf]])], - [pcre2_cc_cv_attribute_uninitialized=yes], - [pcre2_cc_cv_attribute_uninitialized=no]) -AC_MSG_RESULT([$pcre2_cc_cv_attribute_uninitialized]) -if test "$pcre2_cc_cv_attribute_uninitialized" = yes; then - AC_DEFINE([HAVE_ATTRIBUTE_UNINITIALIZED], 1, [Define this if your compiler - supports __attribute__((uninitialized))]) -fi -CFLAGS=$tmp_CFLAGS -AC_LANG_POP([C]) - -# Versioning - -PCRE2_MAJOR="pcre2_major" -PCRE2_MINOR="pcre2_minor" -PCRE2_PRERELEASE="pcre2_prerelease" -PCRE2_DATE="pcre2_date" - -if test "$PCRE2_MINOR" = "08" -o "$PCRE2_MINOR" = "09" -then - echo "***" - echo "*** Minor version number $PCRE2_MINOR must not be used. ***" - echo "*** Use only 00 to 07 or 10 onwards, to avoid octal issues. ***" - echo "***" - exit 1 -fi - -AC_SUBST(PCRE2_MAJOR) -AC_SUBST(PCRE2_MINOR) -AC_SUBST(PCRE2_PRERELEASE) -AC_SUBST(PCRE2_DATE) - -# Set a more sensible default value for $(htmldir). 
-if test "x$htmldir" = 'x${docdir}' -then - htmldir='${docdir}/html' -fi - -# Force an error for PCRE1 size options -AC_ARG_ENABLE(pcre8,,,enable_pcre8=no) -AC_ARG_ENABLE(pcre16,,,enable_pcre16=no) -AC_ARG_ENABLE(pcre32,,,enable_pcre32=no) - -if test "$enable_pcre8$enable_pcre16$enable_pcre32" != "nonono" -then - echo "** ERROR: Use --[[en|dis]]able-pcre2-[[8|16|32]], not --[[en|dis]]able-pcre[[8|16|32]]" - exit 1 -fi - -# Handle --disable-pcre2-8 (enabled by default) -AC_ARG_ENABLE(pcre2-8, - AS_HELP_STRING([--disable-pcre2-8], - [disable 8 bit character support]), - , enable_pcre2_8=unset) -AC_SUBST(enable_pcre2_8) - -# Handle --enable-pcre2-16 (disabled by default) -AC_ARG_ENABLE(pcre2-16, - AS_HELP_STRING([--enable-pcre2-16], - [enable 16 bit character support]), - , enable_pcre2_16=unset) -AC_SUBST(enable_pcre2_16) - -# Handle --enable-pcre2-32 (disabled by default) -AC_ARG_ENABLE(pcre2-32, - AS_HELP_STRING([--enable-pcre2-32], - [enable 32 bit character support]), - , enable_pcre2_32=unset) -AC_SUBST(enable_pcre2_32) - -# Handle --enable-debug (disabled by default) -AC_ARG_ENABLE(debug, - AS_HELP_STRING([--enable-debug], - [enable debugging code]), - , enable_debug=no) - -# Handle --enable-jit (disabled by default) -AC_ARG_ENABLE(jit, - AS_HELP_STRING([--enable-jit], - [enable Just-In-Time compiling support]), - , enable_jit=no) - -# This code enables JIT if the hardware supports it. -if test "$enable_jit" = "auto"; then - AC_LANG(C) - SAVE_CPPFLAGS=$CPPFLAGS - CPPFLAGS=-I$srcdir - AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ - #define SLJIT_CONFIG_AUTO 1 - #include "src/sljit/sljitConfigInternal.h" - #if (defined SLJIT_CONFIG_UNSUPPORTED && SLJIT_CONFIG_UNSUPPORTED) - #error unsupported - #endif]])], enable_jit=yes, enable_jit=no) - CPPFLAGS=$SAVE_CPPFLAGS - echo checking for JIT support on this hardware... 
$enable_jit -fi - -# Handle --enable-jit-sealloc (disabled by default and only experimental) -case $host_os in - linux* | netbsd*) - AC_ARG_ENABLE(jit-sealloc, - AS_HELP_STRING([--enable-jit-sealloc], - [enable SELinux compatible execmem allocator in JIT (experimental)]), - ,enable_jit_sealloc=no) - ;; - *) - enable_jit_sealloc=unsupported - ;; -esac - -# Handle --disable-pcre2grep-jit (enabled by default) -AC_ARG_ENABLE(pcre2grep-jit, - AS_HELP_STRING([--disable-pcre2grep-jit], - [disable JIT support in pcre2grep]), - , enable_pcre2grep_jit=yes) - -# Handle --disable-pcre2grep-callout (enabled by default) -AC_ARG_ENABLE(pcre2grep-callout, - AS_HELP_STRING([--disable-pcre2grep-callout], - [disable callout script support in pcre2grep]), - , enable_pcre2grep_callout=yes) - -# Handle --disable-pcre2grep-callout-fork (enabled by default) -AC_ARG_ENABLE(pcre2grep-callout-fork, - AS_HELP_STRING([--disable-pcre2grep-callout-fork], - [disable callout script fork support in pcre2grep]), - , enable_pcre2grep_callout_fork=yes) - -# Handle --enable-rebuild-chartables -AC_ARG_ENABLE(rebuild-chartables, - AS_HELP_STRING([--enable-rebuild-chartables], - [rebuild character tables in current locale]), - , enable_rebuild_chartables=no) - -# Handle --disable-unicode (enabled by default) -AC_ARG_ENABLE(unicode, - AS_HELP_STRING([--disable-unicode], - [disable Unicode support]), - , enable_unicode=unset) - -# Handle newline options -ac_pcre2_newline=lf -AC_ARG_ENABLE(newline-is-cr, - AS_HELP_STRING([--enable-newline-is-cr], - [use CR as newline character]), - ac_pcre2_newline=cr) -AC_ARG_ENABLE(newline-is-lf, - AS_HELP_STRING([--enable-newline-is-lf], - [use LF as newline character (default)]), - ac_pcre2_newline=lf) -AC_ARG_ENABLE(newline-is-crlf, - AS_HELP_STRING([--enable-newline-is-crlf], - [use CRLF as newline sequence]), - ac_pcre2_newline=crlf) -AC_ARG_ENABLE(newline-is-anycrlf, - AS_HELP_STRING([--enable-newline-is-anycrlf], - [use CR, LF, or CRLF as newline sequence]), - 
ac_pcre2_newline=anycrlf) -AC_ARG_ENABLE(newline-is-any, - AS_HELP_STRING([--enable-newline-is-any], - [use any valid Unicode newline sequence]), - ac_pcre2_newline=any) -AC_ARG_ENABLE(newline-is-nul, - AS_HELP_STRING([--enable-newline-is-nul], - [use NUL (binary zero) as newline character]), - ac_pcre2_newline=nul) -enable_newline="$ac_pcre2_newline" - -# Handle --enable-bsr-anycrlf -AC_ARG_ENABLE(bsr-anycrlf, - AS_HELP_STRING([--enable-bsr-anycrlf], - [\R matches only CR, LF, CRLF by default]), - , enable_bsr_anycrlf=no) - -# Handle --enable-never-backslash-C -AC_ARG_ENABLE(never-backslash-C, - AS_HELP_STRING([--enable-never-backslash-C], - [use of \C causes an error]), - , enable_never_backslash_C=no) - -# Handle --enable-ebcdic -AC_ARG_ENABLE(ebcdic, - AS_HELP_STRING([--enable-ebcdic], - [assume EBCDIC coding rather than ASCII; incompatible with --enable-utf; use only in (uncommon) EBCDIC environments; it implies --enable-rebuild-chartables]), - , enable_ebcdic=no) - -# Handle --enable-ebcdic-nl25 -AC_ARG_ENABLE(ebcdic-nl25, - AS_HELP_STRING([--enable-ebcdic-nl25], - [set EBCDIC code for NL to 0x25 instead of 0x15; it implies --enable-ebcdic]), - , enable_ebcdic_nl25=no) - -# Handle --enable-pcre2grep-libz -AC_ARG_ENABLE(pcre2grep-libz, - AS_HELP_STRING([--enable-pcre2grep-libz], - [link pcre2grep with libz to handle .gz files]), - , enable_pcre2grep_libz=no) - -# Handle --enable-pcre2grep-libbz2 -AC_ARG_ENABLE(pcre2grep-libbz2, - AS_HELP_STRING([--enable-pcre2grep-libbz2], - [link pcre2grep with libbz2 to handle .bz2 files]), - , enable_pcre2grep_libbz2=no) - -# Handle --with-pcre2grep-bufsize=N -AC_ARG_WITH(pcre2grep-bufsize, - AS_HELP_STRING([--with-pcre2grep-bufsize=N], - [pcre2grep initial buffer size (default=20480, minimum=8192)]), - , with_pcre2grep_bufsize=20480) - -# Handle --with-pcre2grep-max-bufsize=N -AC_ARG_WITH(pcre2grep-max-bufsize, - AS_HELP_STRING([--with-pcre2grep-max-bufsize=N], - [pcre2grep maximum buffer size (default=1048576, 
minimum=8192)]), - , with_pcre2grep_max_bufsize=1048576) - -# Handle --enable-pcre2test-libedit -AC_ARG_ENABLE(pcre2test-libedit, - AS_HELP_STRING([--enable-pcre2test-libedit], - [link pcre2test with libedit]), - , enable_pcre2test_libedit=no) - -# Handle --enable-pcre2test-libreadline -AC_ARG_ENABLE(pcre2test-libreadline, - AS_HELP_STRING([--enable-pcre2test-libreadline], - [link pcre2test with libreadline]), - , enable_pcre2test_libreadline=no) - -# Handle --with-link-size=N -AC_ARG_WITH(link-size, - AS_HELP_STRING([--with-link-size=N], - [internal link size (2, 3, or 4 allowed; default=2)]), - , with_link_size=2) - -# Handle --with-parens-nest-limit=N -AC_ARG_WITH(parens-nest-limit, - AS_HELP_STRING([--with-parens-nest-limit=N], - [nested parentheses limit (default=250)]), - , with_parens_nest_limit=250) - -# Handle --with-heap-limit -AC_ARG_WITH(heap-limit, - AS_HELP_STRING([--with-heap-limit=N], - [default limit on heap memory (kibibytes, default=20000000)]), - , with_heap_limit=20000000) - -# Handle --with-match-limit=N -AC_ARG_WITH(match-limit, - AS_HELP_STRING([--with-match-limit=N], - [default limit on internal looping (default=10000000)]), - , with_match_limit=10000000) - -# Handle --with-match-limit-depth=N -# Recognize old synonym --with-match-limit-recursion -# -# Note: In config.h, the default is to define MATCH_LIMIT_DEPTH symbolically as -# MATCH_LIMIT, which in turn is defined to be some numeric value (e.g. -# 10000000). MATCH_LIMIT_DEPTH can otherwise be set to some different numeric -# value (or even the same numeric value as MATCH_LIMIT, though no longer -# defined in terms of the latter). 
-# -AC_ARG_WITH(match-limit-depth, - AS_HELP_STRING([--with-match-limit-depth=N], - [default limit on match tree depth (default=MATCH_LIMIT)]), - , with_match_limit_depth=MATCH_LIMIT) - -AC_ARG_WITH(match-limit-recursion,, - , with_match_limit_recursion=UNSET) - -# Handle --enable-valgrind -AC_ARG_ENABLE(valgrind, - AS_HELP_STRING([--enable-valgrind], - [enable valgrind support]), - , enable_valgrind=no) - -# Enable code coverage reports using gcov -AC_ARG_ENABLE(coverage, - AS_HELP_STRING([--enable-coverage], - [enable code coverage reports using gcov]), - , enable_coverage=no) - -# Handle --enable-fuzz-support -AC_ARG_ENABLE(fuzz_support, - AS_HELP_STRING([--enable-fuzz-support], - [enable fuzzer support]), - , enable_fuzz_support=no) - -# Handle --disable-stack-for-recursion -# This option became obsolete at release 10.30. -AC_ARG_ENABLE(stack-for-recursion,, - , enable_stack_for_recursion=yes) - -# Original code -# AC_ARG_ENABLE(stack-for-recursion, -# AS_HELP_STRING([--disable-stack-for-recursion], -# [don't use stack recursion when matching]), -# , enable_stack_for_recursion=yes) - -# Handle --disable-percent_zt (set as "auto" by default) -AC_ARG_ENABLE(percent-zt, - AS_HELP_STRING([--disable-percent-zt], - [disable the use of z and t formatting modifiers]), - , enable_percent_zt=auto) - -# Set the default value for pcre2-8 -if test "x$enable_pcre2_8" = "xunset" -then - enable_pcre2_8=yes -fi - -# Set the default value for pcre2-16 -if test "x$enable_pcre2_16" = "xunset" -then - enable_pcre2_16=no -fi - -# Set the default value for pcre2-32 -if test "x$enable_pcre2_32" = "xunset" -then - enable_pcre2_32=no -fi - -# Make sure at least one library is selected -if test "x$enable_pcre2_8$enable_pcre2_16$enable_pcre2_32" = "xnonono" -then - AC_MSG_ERROR([At least one of the 8, 16 or 32 bit libraries must be enabled]) -fi - -# Unicode is enabled by default. 
-if test "x$enable_unicode" = "xunset" -then - enable_unicode=yes -fi - -# Convert the newline identifier into the appropriate integer value. These must -# agree with the PCRE2_NEWLINE_xxx values in pcre2.h. - -case "$enable_newline" in - cr) ac_pcre2_newline_value=1 ;; - lf) ac_pcre2_newline_value=2 ;; - crlf) ac_pcre2_newline_value=3 ;; - any) ac_pcre2_newline_value=4 ;; - anycrlf) ac_pcre2_newline_value=5 ;; - nul) ac_pcre2_newline_value=6 ;; - *) - AC_MSG_ERROR([invalid argument \"$enable_newline\" to --enable-newline option]) - ;; -esac - -# --enable-ebcdic-nl25 implies --enable-ebcdic -if test "x$enable_ebcdic_nl25" = "xyes"; then - enable_ebcdic=yes -fi - -# Make sure that if enable_ebcdic is set, rebuild_chartables is also enabled. -# Also check that UTF support is not requested, because PCRE2 cannot handle -# EBCDIC and UTF in the same build. To do so it would need to use different -# character constants depending on the mode. Also, EBCDIC cannot be used with -# 16-bit and 32-bit libraries. -# -if test "x$enable_ebcdic" = "xyes"; then - enable_rebuild_chartables=yes - if test "x$enable_unicode" = "xyes"; then - AC_MSG_ERROR([support for EBCDIC and Unicode cannot be enabled at the same time]) - fi - if test "x$enable_pcre2_16" = "xyes" -o "x$enable_pcre2_32" = "xyes"; then - AC_MSG_ERROR([EBCDIC support is available only for the 8-bit library]) - fi -fi - -# Check argument to --with-link-size -case "$with_link_size" in - 2|3|4) ;; - *) - AC_MSG_ERROR([invalid argument \"$with_link_size\" to --with-link-size option]) - ;; -esac - -AH_TOP([ -/* PCRE2 is written in Standard C, but there are a few non-standard things it -can cope with, allowing it to run on SunOS4 and other "close to standard" -systems. - -In environments that support the GNU autotools, config.h.in is converted into -config.h by the "configure" script. In environments that use CMake, -config-cmake.in is converted into config.h. 
If you are going to build PCRE2 "by -hand" without using "configure" or CMake, you should copy the distributed -config.h.generic to config.h, and edit the macro definitions to be the way you -need them. You must then add -DHAVE_CONFIG_H to all of your compile commands, -so that config.h is included at the start of every source. - -Alternatively, you can avoid editing by using -D on the compiler command line -to set the macro values. In this case, you do not have to set -DHAVE_CONFIG_H, -but if you do, default values will be taken from config.h for non-boolean -macros that are not defined on the command line. - -Boolean macros such as HAVE_STDLIB_H and SUPPORT_PCRE2_8 should either be -defined (conventionally to 1) for TRUE, and not defined at all for FALSE. All -such macros are listed as a commented #undef in config.h.generic. Macros such -as MATCH_LIMIT, whose actual value is relevant, have defaults defined, but are -surrounded by #ifndef/#endif lines so that the value can be overridden by -D. - -PCRE2 uses memmove() if HAVE_MEMMOVE is defined; otherwise it uses bcopy() if -HAVE_BCOPY is defined. If your system has neither bcopy() nor memmove(), make -sure both macros are undefined; an emulation function will then be used. */]) - -# Checks for header files. 
-AC_HEADER_STDC -AC_CHECK_HEADERS(limits.h sys/types.h sys/stat.h dirent.h) -AC_CHECK_HEADERS([windows.h], [HAVE_WINDOWS_H=1]) -AC_CHECK_HEADERS([sys/wait.h], [HAVE_SYS_WAIT_H=1]) - -# Conditional compilation -AM_CONDITIONAL(WITH_PCRE2_8, test "x$enable_pcre2_8" = "xyes") -AM_CONDITIONAL(WITH_PCRE2_16, test "x$enable_pcre2_16" = "xyes") -AM_CONDITIONAL(WITH_PCRE2_32, test "x$enable_pcre2_32" = "xyes") -AM_CONDITIONAL(WITH_DEBUG, test "x$enable_debug" = "xyes") -AM_CONDITIONAL(WITH_REBUILD_CHARTABLES, test "x$enable_rebuild_chartables" = "xyes") -AM_CONDITIONAL(WITH_JIT, test "x$enable_jit" = "xyes") -AM_CONDITIONAL(WITH_UNICODE, test "x$enable_unicode" = "xyes") -AM_CONDITIONAL(WITH_VALGRIND, test "x$enable_valgrind" = "xyes") -AM_CONDITIONAL(WITH_FUZZ_SUPPORT, test "x$enable_fuzz_support" = "xyes") - -if test "$enable_fuzz_support" = "yes" -a "$enable_pcre2_8" = "no"; then - echo "** ERROR: Fuzzer support requires the 8-bit library" - exit 1 -fi - -# Checks for typedefs, structures, and compiler characteristics. - -AC_C_CONST -AC_TYPE_SIZE_T - -# Checks for library functions. - -AC_CHECK_FUNCS(bcopy memfd_create memmove mkostemp secure_getenv strerror) - -# Check for the availability of libz (aka zlib) - -AC_CHECK_HEADERS([zlib.h], [HAVE_ZLIB_H=1]) -AC_CHECK_LIB([z], [gzopen], [HAVE_LIBZ=1]) - -# Check for the availability of libbz2. Originally we just used AC_CHECK_LIB, -# as for libz. However, this had the following problem, diagnosed and fixed by -# a user: -# -# - libbz2 uses the Pascal calling convention (WINAPI) for the functions -# under Win32. -# - The standard autoconf AC_CHECK_LIB fails to include "bzlib.h", -# therefore missing the function definition. -# - The compiler thus generates a "C" signature for the test function. -# - The linker fails to find the "C" function. -# - PCRE2 fails to configure if asked to do so against libbz2. -# -# Solution: -# -# - Replace the AC_CHECK_LIB test with a custom test. 
- -AC_CHECK_HEADERS([bzlib.h], [HAVE_BZLIB_H=1]) -# Original test -# AC_CHECK_LIB([bz2], [BZ2_bzopen], [HAVE_LIBBZ2=1]) -# -# Custom test follows - -AC_MSG_CHECKING([for libbz2]) -OLD_LIBS="$LIBS" -LIBS="$LIBS -lbz2" -AC_LINK_IFELSE([AC_LANG_PROGRAM([[ -#ifdef HAVE_BZLIB_H -#include -#endif]], -[[return (int)BZ2_bzopen("conftest", "rb");]])], -[AC_MSG_RESULT([yes]);HAVE_LIBBZ2=1; break;], -AC_MSG_RESULT([no])) -LIBS="$OLD_LIBS" - -# Check for the availabiity of libreadline - -if test "$enable_pcre2test_libreadline" = "yes"; then - AC_CHECK_HEADERS([readline/readline.h], [HAVE_READLINE_H=1]) - AC_CHECK_HEADERS([readline/history.h], [HAVE_HISTORY_H=1]) - AC_CHECK_LIB([readline], [readline], [LIBREADLINE="-lreadline"], - [unset ac_cv_lib_readline_readline; - AC_CHECK_LIB([readline], [readline], [LIBREADLINE="-ltinfo"], - [unset ac_cv_lib_readline_readline; - AC_CHECK_LIB([readline], [readline], [LIBREADLINE="-lcurses"], - [unset ac_cv_lib_readline_readline; - AC_CHECK_LIB([readline], [readline], [LIBREADLINE="-lncurses"], - [unset ac_cv_lib_readline_readline; - AC_CHECK_LIB([readline], [readline], [LIBREADLINE="-lncursesw"], - [unset ac_cv_lib_readline_readline; - AC_CHECK_LIB([readline], [readline], [LIBREADLINE="-ltermcap"], - [LIBREADLINE=""], - [-ltermcap])], - [-lncursesw])], - [-lncurses])], - [-lcurses])], - [-ltinfo])]) - AC_SUBST(LIBREADLINE) - if test -n "$LIBREADLINE"; then - if test "$LIBREADLINE" != "-lreadline"; then - echo "-lreadline needs $LIBREADLINE" - LIBREADLINE="-lreadline $LIBREADLINE" - fi - fi -fi - - -# Check for the availability of libedit. Different distributions put its -# headers in different places. Try to cover the most common ones. 
- -if test "$enable_pcre2test_libedit" = "yes"; then - AC_CHECK_HEADERS([editline/readline.h], [HAVE_EDITLINE_READLINE_H=1], - [AC_CHECK_HEADERS([edit/readline/readline.h], [HAVE_READLINE_READLINE_H=1], - [AC_CHECK_HEADERS([readline/readline.h], [HAVE_READLINE_READLINE_H=1])])]) - AC_CHECK_LIB([edit], [readline], [LIBEDIT="-ledit"]) -fi - -PCRE2_STATIC_CFLAG="" -if test "x$enable_shared" = "xno" ; then - AC_DEFINE([PCRE2_STATIC], [1], [ - Define to any value if linking statically (TODO: make nice with Libtool)]) - PCRE2_STATIC_CFLAG="-DPCRE2_STATIC" -fi -AC_SUBST(PCRE2_STATIC_CFLAG) - -# Here is where PCRE2-specific defines are handled - -if test "$enable_pcre2_8" = "yes"; then - AC_DEFINE([SUPPORT_PCRE2_8], [], [ - Define to any value to enable the 8 bit PCRE2 library.]) -fi - -if test "$enable_pcre2_16" = "yes"; then - AC_DEFINE([SUPPORT_PCRE2_16], [], [ - Define to any value to enable the 16 bit PCRE2 library.]) -fi - -if test "$enable_pcre2_32" = "yes"; then - AC_DEFINE([SUPPORT_PCRE2_32], [], [ - Define to any value to enable the 32 bit PCRE2 library.]) -fi - -if test "$enable_debug" = "yes"; then - AC_DEFINE([PCRE2_DEBUG], [], [ - Define to any value to include debugging code.]) -fi - -if test "$enable_percent_zt" = "no"; then - AC_DEFINE([DISABLE_PERCENT_ZT], [], [ - Define to any value to disable the use of the z and t modifiers in - formatting settings such as %zu or %td (this is rarely needed).]) -else - enable_percent_zt=auto -fi - -# Unless running under Windows, JIT support requires pthreads. 
- -if test "$enable_jit" = "yes"; then - if test "$HAVE_WINDOWS_H" != "1"; then - AX_PTHREAD([], [AC_MSG_ERROR([JIT support requires pthreads])]) - CC="$PTHREAD_CC" - CFLAGS="$PTHREAD_CFLAGS $CFLAGS" - LIBS="$PTHREAD_LIBS $LIBS" - fi - AC_DEFINE([SUPPORT_JIT], [], [ - Define to any value to enable support for Just-In-Time compiling.]) -else - enable_pcre2grep_jit="no" -fi - -if test "$enable_jit_sealloc" = "yes"; then - AC_DEFINE([SLJIT_PROT_EXECUTABLE_ALLOCATOR], [1], [ - Define to any non-zero number to enable support for SELinux - compatible executable memory allocator in JIT. Note that this - will have no effect unless SUPPORT_JIT is also defined.]) -fi - -if test "$enable_pcre2grep_jit" = "yes"; then - AC_DEFINE([SUPPORT_PCRE2GREP_JIT], [], [ - Define to any value to enable JIT support in pcre2grep. Note that this will - have no effect unless SUPPORT_JIT is also defined.]) -fi - -if test "$enable_pcre2grep_callout" = "yes"; then - if test "$enable_pcre2grep_callout_fork" = "yes"; then - if test "$HAVE_WINDOWS_H" != "1"; then - if test "$HAVE_SYS_WAIT_H" != "1"; then - AC_MSG_ERROR([Callout script support needs sys/wait.h.]) - fi - fi - AC_DEFINE([SUPPORT_PCRE2GREP_CALLOUT_FORK], [], [ - Define to any value to enable fork support in pcre2grep callout scripts. - This will have no effect unless SUPPORT_PCRE2GREP_CALLOUT is also - defined.]) - fi - AC_DEFINE([SUPPORT_PCRE2GREP_CALLOUT], [], [ - Define to any value to enable callout script support in pcre2grep.]) -else - enable_pcre2grep_callout_fork="no" -fi - -if test "$enable_unicode" = "yes"; then - AC_DEFINE([SUPPORT_UNICODE], [], [ - Define to any value to enable support for Unicode and UTF encoding. - This will work even in an EBCDIC environment, but it is incompatible - with the EBCDIC macro. 
That is, PCRE2 can support *either* EBCDIC - code *or* ASCII/Unicode, but not both at once.]) -fi - -if test "$enable_pcre2grep_libz" = "yes"; then - AC_DEFINE([SUPPORT_LIBZ], [], [ - Define to any value to allow pcre2grep to be linked with libz, so that it is - able to handle .gz files.]) -fi - -if test "$enable_pcre2grep_libbz2" = "yes"; then - AC_DEFINE([SUPPORT_LIBBZ2], [], [ - Define to any value to allow pcre2grep to be linked with libbz2, so that it - is able to handle .bz2 files.]) -fi - -if test $with_pcre2grep_bufsize -lt 8192 ; then - AC_MSG_WARN([$with_pcre2grep_bufsize is too small for --with-pcre2grep-bufsize; using 8192]) - with_pcre2grep_bufsize="8192" -else - if test $? -gt 1 ; then - AC_MSG_ERROR([Bad value for --with-pcre2grep-bufsize]) - fi -fi - -if test $with_pcre2grep_max_bufsize -lt $with_pcre2grep_bufsize ; then - with_pcre2grep_max_bufsize="$with_pcre2grep_bufsize" -else - if test $? -gt 1 ; then - AC_MSG_ERROR([Bad value for --with-pcre2grep-max-bufsize]) - fi -fi - -AC_DEFINE_UNQUOTED([PCRE2GREP_BUFSIZE], [$with_pcre2grep_bufsize], [ - The value of PCRE2GREP_BUFSIZE is the starting size of the buffer used by - pcre2grep to hold parts of the file it is searching. The buffer will be - expanded up to PCRE2GREP_MAX_BUFSIZE if necessary, for files containing very - long lines. The actual amount of memory used by pcre2grep is three times this - number, because it allows for the buffering of "before" and "after" lines.]) - -AC_DEFINE_UNQUOTED([PCRE2GREP_MAX_BUFSIZE], [$with_pcre2grep_max_bufsize], [ - The value of PCRE2GREP_MAX_BUFSIZE specifies the maximum size of the buffer - used by pcre2grep to hold parts of the file it is searching. 
The actual - amount of memory used by pcre2grep is three times this number, because it - allows for the buffering of "before" and "after" lines.]) - -if test "$enable_pcre2test_libedit" = "yes"; then - AC_DEFINE([SUPPORT_LIBEDIT], [], [ - Define to any value to allow pcre2test to be linked with libedit.]) - LIBREADLINE="$LIBEDIT" -elif test "$enable_pcre2test_libreadline" = "yes"; then - AC_DEFINE([SUPPORT_LIBREADLINE], [], [ - Define to any value to allow pcre2test to be linked with libreadline.]) -fi - -AC_DEFINE_UNQUOTED([NEWLINE_DEFAULT], [$ac_pcre2_newline_value], [ - The value of NEWLINE_DEFAULT determines the default newline character - sequence. PCRE2 client programs can override this by selecting other values - at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), - 5 (ANYCRLF), and 6 (NUL).]) - -if test "$enable_bsr_anycrlf" = "yes"; then - AC_DEFINE([BSR_ANYCRLF], [], [ - By default, the \R escape sequence matches any Unicode line ending - character or sequence of characters. If BSR_ANYCRLF is defined (to any - value), this is changed so that backslash-R matches only CR, LF, or CRLF. - The build-time default can be overridden by the user of PCRE2 at runtime.]) -fi - -if test "$enable_never_backslash_C" = "yes"; then - AC_DEFINE([NEVER_BACKSLASH_C], [], [ - Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns.]) -fi - -AC_DEFINE_UNQUOTED([LINK_SIZE], [$with_link_size], [ - The value of LINK_SIZE determines the number of bytes used to store - links as offsets within the compiled regex. The default is 2, which - allows for compiled patterns up to 65535 code units long. This covers the - vast majority of cases. However, PCRE2 can also be compiled to use 3 or 4 - bytes instead. This allows for longer patterns in extreme cases.]) - -AC_DEFINE_UNQUOTED([PARENS_NEST_LIMIT], [$with_parens_nest_limit], [ - The value of PARENS_NEST_LIMIT specifies the maximum depth of nested - parentheses (of any kind) in a pattern. 
This limits the amount of system - stack that is used while compiling a pattern.]) - -AC_DEFINE_UNQUOTED([MATCH_LIMIT], [$with_match_limit], [ - The value of MATCH_LIMIT determines the default number of times the - pcre2_match() function can record a backtrack position during a single - matching attempt. The value is also used to limit a loop counter in - pcre2_dfa_match(). There is a runtime interface for setting a different - limit. The limit exists in order to catch runaway regular expressions that - take for ever to determine that they do not match. The default is set very - large so that it does not accidentally catch legitimate cases.]) - -# --with-match-limit-recursion is an obsolete synonym for --with-match-limit-depth - -if test "$with_match_limit_recursion" != "UNSET"; then -cat <. - -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - -# Originally written by Alexandre Oliva . - -case $1 in - '') - echo "$0: No command. Try '$0 --help' for more information." 1>&2 - exit 1; - ;; - -h | --h*) - cat <<\EOF -Usage: depcomp [--help] [--version] PROGRAM [ARGS] - -Run PROGRAMS ARGS to compile a file, generating dependencies -as side-effects. - -Environment variables: - depmode Dependency tracking mode. - source Source file read by 'PROGRAMS ARGS'. - object Object file output by 'PROGRAMS ARGS'. - DEPDIR directory where to store dependencies. - depfile Dependency file to output. - tmpdepfile Temporary file to use when outputting dependencies. - libtool Whether libtool is used (yes/no). - -Report bugs to . -EOF - exit $? - ;; - -v | --v*) - echo "depcomp $scriptversion" - exit $? - ;; -esac - -# Get the directory component of the given path, and save it in the -# global variables '$dir'. 
Note that this directory component will -# be either empty or ending with a '/' character. This is deliberate. -set_dir_from () -{ - case $1 in - */*) dir=`echo "$1" | sed -e 's|/[^/]*$|/|'`;; - *) dir=;; - esac -} - -# Get the suffix-stripped basename of the given path, and save it the -# global variable '$base'. -set_base_from () -{ - base=`echo "$1" | sed -e 's|^.*/||' -e 's/\.[^.]*$//'` -} - -# If no dependency file was actually created by the compiler invocation, -# we still have to create a dummy depfile, to avoid errors with the -# Makefile "include basename.Plo" scheme. -make_dummy_depfile () -{ - echo "#dummy" > "$depfile" -} - -# Factor out some common post-processing of the generated depfile. -# Requires the auxiliary global variable '$tmpdepfile' to be set. -aix_post_process_depfile () -{ - # If the compiler actually managed to produce a dependency file, - # post-process it. - if test -f "$tmpdepfile"; then - # Each line is of the form 'foo.o: dependency.h'. - # Do two passes, one to just change these to - # $object: dependency.h - # and one to simply output - # dependency.h: - # which is needed to avoid the deleted-header problem. - { sed -e "s,^.*\.[$lower]*:,$object:," < "$tmpdepfile" - sed -e "s,^.*\.[$lower]*:[$tab ]*,," -e 's,$,:,' < "$tmpdepfile" - } > "$depfile" - rm -f "$tmpdepfile" - else - make_dummy_depfile - fi -} - -# A tabulation character. -tab=' ' -# A newline character. -nl=' -' -# Character ranges might be problematic outside the C locale. -# These definitions help. -upper=ABCDEFGHIJKLMNOPQRSTUVWXYZ -lower=abcdefghijklmnopqrstuvwxyz -digits=0123456789 -alpha=${upper}${lower} - -if test -z "$depmode" || test -z "$source" || test -z "$object"; then - echo "depcomp: Variables source, object and depmode must be set" 1>&2 - exit 1 -fi - -# Dependencies for sub/bar.o or sub/bar.obj go into sub/.deps/bar.Po. 
-depfile=${depfile-`echo "$object" | - sed 's|[^\\/]*$|'${DEPDIR-.deps}'/&|;s|\.\([^.]*\)$|.P\1|;s|Pobj$|Po|'`} -tmpdepfile=${tmpdepfile-`echo "$depfile" | sed 's/\.\([^.]*\)$/.T\1/'`} - -rm -f "$tmpdepfile" - -# Avoid interferences from the environment. -gccflag= dashmflag= - -# Some modes work just like other modes, but use different flags. We -# parameterize here, but still list the modes in the big case below, -# to make depend.m4 easier to write. Note that we *cannot* use a case -# here, because this file can only contain one case statement. -if test "$depmode" = hp; then - # HP compiler uses -M and no extra arg. - gccflag=-M - depmode=gcc -fi - -if test "$depmode" = dashXmstdout; then - # This is just like dashmstdout with a different argument. - dashmflag=-xM - depmode=dashmstdout -fi - -cygpath_u="cygpath -u -f -" -if test "$depmode" = msvcmsys; then - # This is just like msvisualcpp but w/o cygpath translation. - # Just convert the backslash-escaped backslashes to single forward - # slashes to satisfy depend.m4 - cygpath_u='sed s,\\\\,/,g' - depmode=msvisualcpp -fi - -if test "$depmode" = msvc7msys; then - # This is just like msvc7 but w/o cygpath translation. - # Just convert the backslash-escaped backslashes to single forward - # slashes to satisfy depend.m4 - cygpath_u='sed s,\\\\,/,g' - depmode=msvc7 -fi - -if test "$depmode" = xlc; then - # IBM C/C++ Compilers xlc/xlC can output gcc-like dependency information. - gccflag=-qmakedep=gcc,-MF - depmode=gcc -fi - -case "$depmode" in -gcc3) -## gcc 3 implements dependency tracking that does exactly what -## we want. Yay! Note: for some reason libtool 1.4 doesn't like -## it if -MD -MP comes after the -MF stuff. Hmm. -## Unfortunately, FreeBSD c89 acceptance of flags depends upon -## the command line argument order; so add the flags where they -## appear in depend2.am. Note that the slowdown incurred here -## affects only configure: in makefiles, %FASTDEP% shortcuts this. 
- for arg - do - case $arg in - -c) set fnord "$@" -MT "$object" -MD -MP -MF "$tmpdepfile" "$arg" ;; - *) set fnord "$@" "$arg" ;; - esac - shift # fnord - shift # $arg - done - "$@" - stat=$? - if test $stat -ne 0; then - rm -f "$tmpdepfile" - exit $stat - fi - mv "$tmpdepfile" "$depfile" - ;; - -gcc) -## Note that this doesn't just cater to obsosete pre-3.x GCC compilers. -## but also to in-use compilers like IMB xlc/xlC and the HP C compiler. -## (see the conditional assignment to $gccflag above). -## There are various ways to get dependency output from gcc. Here's -## why we pick this rather obscure method: -## - Don't want to use -MD because we'd like the dependencies to end -## up in a subdir. Having to rename by hand is ugly. -## (We might end up doing this anyway to support other compilers.) -## - The DEPENDENCIES_OUTPUT environment variable makes gcc act like -## -MM, not -M (despite what the docs say). Also, it might not be -## supported by the other compilers which use the 'gcc' depmode. -## - Using -M directly means running the compiler twice (even worse -## than renaming). - if test -z "$gccflag"; then - gccflag=-MD, - fi - "$@" -Wp,"$gccflag$tmpdepfile" - stat=$? - if test $stat -ne 0; then - rm -f "$tmpdepfile" - exit $stat - fi - rm -f "$depfile" - echo "$object : \\" > "$depfile" - # The second -e expression handles DOS-style file names with drive - # letters. - sed -e 's/^[^:]*: / /' \ - -e 's/^['$alpha']:\/[^:]*: / /' < "$tmpdepfile" >> "$depfile" -## This next piece of magic avoids the "deleted header file" problem. -## The problem is that when a header file which appears in a .P file -## is deleted, the dependency causes make to die (because there is -## typically no way to rebuild the header). We avoid this by adding -## dummy dependencies for each header file. Too bad gcc doesn't do -## this for us directly. -## Some versions of gcc put a space before the ':'. 
On the theory -## that the space means something, we add a space to the output as -## well. hp depmode also adds that space, but also prefixes the VPATH -## to the object. Take care to not repeat it in the output. -## Some versions of the HPUX 10.20 sed can't process this invocation -## correctly. Breaking it into two sed invocations is a workaround. - tr ' ' "$nl" < "$tmpdepfile" \ - | sed -e 's/^\\$//' -e '/^$/d' -e "s|.*$object$||" -e '/:$/d' \ - | sed -e 's/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -hp) - # This case exists only to let depend.m4 do its work. It works by - # looking at the text of this script. This case will never be run, - # since it is checked for above. - exit 1 - ;; - -sgi) - if test "$libtool" = yes; then - "$@" "-Wp,-MDupdate,$tmpdepfile" - else - "$@" -MDupdate "$tmpdepfile" - fi - stat=$? - if test $stat -ne 0; then - rm -f "$tmpdepfile" - exit $stat - fi - rm -f "$depfile" - - if test -f "$tmpdepfile"; then # yes, the sourcefile depend on other files - echo "$object : \\" > "$depfile" - # Clip off the initial element (the dependent). Don't try to be - # clever and replace this with sed code, as IRIX sed won't handle - # lines with more than a fixed number of characters (4096 in - # IRIX 6.2 sed, 8192 in IRIX 6.5). We also remove comment lines; - # the IRIX cc adds comments like '#:fec' to the end of the - # dependency line. - tr ' ' "$nl" < "$tmpdepfile" \ - | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' \ - | tr "$nl" ' ' >> "$depfile" - echo >> "$depfile" - # The second pass generates a dummy entry for each header file. - tr ' ' "$nl" < "$tmpdepfile" \ - | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' -e 's/$/:/' \ - >> "$depfile" - else - make_dummy_depfile - fi - rm -f "$tmpdepfile" - ;; - -xlc) - # This case exists only to let depend.m4 do its work. It works by - # looking at the text of this script. This case will never be run, - # since it is checked for above. 
- exit 1 - ;; - -aix) - # The C for AIX Compiler uses -M and outputs the dependencies - # in a .u file. In older versions, this file always lives in the - # current directory. Also, the AIX compiler puts '$object:' at the - # start of each line; $object doesn't have directory information. - # Version 6 uses the directory in both cases. - set_dir_from "$object" - set_base_from "$object" - if test "$libtool" = yes; then - tmpdepfile1=$dir$base.u - tmpdepfile2=$base.u - tmpdepfile3=$dir.libs/$base.u - "$@" -Wc,-M - else - tmpdepfile1=$dir$base.u - tmpdepfile2=$dir$base.u - tmpdepfile3=$dir$base.u - "$@" -M - fi - stat=$? - if test $stat -ne 0; then - rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" - exit $stat - fi - - for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" - do - test -f "$tmpdepfile" && break - done - aix_post_process_depfile - ;; - -tcc) - # tcc (Tiny C Compiler) understand '-MD -MF file' since version 0.9.26 - # FIXME: That version still under development at the moment of writing. - # Make that this statement remains true also for stable, released - # versions. - # It will wrap lines (doesn't matter whether long or short) with a - # trailing '\', as in: - # - # foo.o : \ - # foo.c \ - # foo.h \ - # - # It will put a trailing '\' even on the last line, and will use leading - # spaces rather than leading tabs (at least since its commit 0394caf7 - # "Emit spaces for -MD"). - "$@" -MD -MF "$tmpdepfile" - stat=$? - if test $stat -ne 0; then - rm -f "$tmpdepfile" - exit $stat - fi - rm -f "$depfile" - # Each non-empty line is of the form 'foo.o : \' or ' dep.h \'. - # We have to change lines of the first kind to '$object: \'. - sed -e "s|.*:|$object :|" < "$tmpdepfile" > "$depfile" - # And for each line of the second kind, we have to emit a 'dep.h:' - # dummy dependency, to avoid the deleted-header problem. 
- sed -n -e 's|^ *\(.*\) *\\$|\1:|p' < "$tmpdepfile" >> "$depfile" - rm -f "$tmpdepfile" - ;; - -## The order of this option in the case statement is important, since the -## shell code in configure will try each of these formats in the order -## listed in this file. A plain '-MD' option would be understood by many -## compilers, so we must ensure this comes after the gcc and icc options. -pgcc) - # Portland's C compiler understands '-MD'. - # Will always output deps to 'file.d' where file is the root name of the - # source file under compilation, even if file resides in a subdirectory. - # The object file name does not affect the name of the '.d' file. - # pgcc 10.2 will output - # foo.o: sub/foo.c sub/foo.h - # and will wrap long lines using '\' : - # foo.o: sub/foo.c ... \ - # sub/foo.h ... \ - # ... - set_dir_from "$object" - # Use the source, not the object, to determine the base name, since - # that's sadly what pgcc will do too. - set_base_from "$source" - tmpdepfile=$base.d - - # For projects that build the same source file twice into different object - # files, the pgcc approach of using the *source* file root name can cause - # problems in parallel builds. Use a locking strategy to avoid stomping on - # the same $tmpdepfile. - lockdir=$base.d-lock - trap " - echo '$0: caught signal, cleaning up...' >&2 - rmdir '$lockdir' - exit 1 - " 1 2 13 15 - numtries=100 - i=$numtries - while test $i -gt 0; do - # mkdir is a portable test-and-set. - if mkdir "$lockdir" 2>/dev/null; then - # This process acquired the lock. - "$@" -MD - stat=$? - # Release the lock. - rmdir "$lockdir" - break - else - # If the lock is being held by a different process, wait - # until the winning process is done or we timeout. 
- while test -d "$lockdir" && test $i -gt 0; do - sleep 1 - i=`expr $i - 1` - done - fi - i=`expr $i - 1` - done - trap - 1 2 13 15 - if test $i -le 0; then - echo "$0: failed to acquire lock after $numtries attempts" >&2 - echo "$0: check lockdir '$lockdir'" >&2 - exit 1 - fi - - if test $stat -ne 0; then - rm -f "$tmpdepfile" - exit $stat - fi - rm -f "$depfile" - # Each line is of the form `foo.o: dependent.h', - # or `foo.o: dep1.h dep2.h \', or ` dep3.h dep4.h \'. - # Do two passes, one to just change these to - # `$object: dependent.h' and one to simply `dependent.h:'. - sed "s,^[^:]*:,$object :," < "$tmpdepfile" > "$depfile" - # Some versions of the HPUX 10.20 sed can't process this invocation - # correctly. Breaking it into two sed invocations is a workaround. - sed 's,^[^:]*: \(.*\)$,\1,;s/^\\$//;/^$/d;/:$/d' < "$tmpdepfile" \ - | sed -e 's/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -hp2) - # The "hp" stanza above does not work with aCC (C++) and HP's ia64 - # compilers, which have integrated preprocessors. The correct option - # to use with these is +Maked; it writes dependencies to a file named - # 'foo.d', which lands next to the object file, wherever that - # happens to be. - # Much of this is similar to the tru64 case; see comments there. - set_dir_from "$object" - set_base_from "$object" - if test "$libtool" = yes; then - tmpdepfile1=$dir$base.d - tmpdepfile2=$dir.libs/$base.d - "$@" -Wc,+Maked - else - tmpdepfile1=$dir$base.d - tmpdepfile2=$dir$base.d - "$@" +Maked - fi - stat=$? - if test $stat -ne 0; then - rm -f "$tmpdepfile1" "$tmpdepfile2" - exit $stat - fi - - for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" - do - test -f "$tmpdepfile" && break - done - if test -f "$tmpdepfile"; then - sed -e "s,^.*\.[$lower]*:,$object:," "$tmpdepfile" > "$depfile" - # Add 'dependent.h:' lines. 
- sed -ne '2,${ - s/^ *// - s/ \\*$// - s/$/:/ - p - }' "$tmpdepfile" >> "$depfile" - else - make_dummy_depfile - fi - rm -f "$tmpdepfile" "$tmpdepfile2" - ;; - -tru64) - # The Tru64 compiler uses -MD to generate dependencies as a side - # effect. 'cc -MD -o foo.o ...' puts the dependencies into 'foo.o.d'. - # At least on Alpha/Redhat 6.1, Compaq CCC V6.2-504 seems to put - # dependencies in 'foo.d' instead, so we check for that too. - # Subdirectories are respected. - set_dir_from "$object" - set_base_from "$object" - - if test "$libtool" = yes; then - # Libtool generates 2 separate objects for the 2 libraries. These - # two compilations output dependencies in $dir.libs/$base.o.d and - # in $dir$base.o.d. We have to check for both files, because - # one of the two compilations can be disabled. We should prefer - # $dir$base.o.d over $dir.libs/$base.o.d because the latter is - # automatically cleaned when .libs/ is deleted, while ignoring - # the former would cause a distcleancheck panic. - tmpdepfile1=$dir$base.o.d # libtool 1.5 - tmpdepfile2=$dir.libs/$base.o.d # Likewise. - tmpdepfile3=$dir.libs/$base.d # Compaq CCC V6.2-504 - "$@" -Wc,-MD - else - tmpdepfile1=$dir$base.d - tmpdepfile2=$dir$base.d - tmpdepfile3=$dir$base.d - "$@" -MD - fi - - stat=$? - if test $stat -ne 0; then - rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" - exit $stat - fi - - for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" - do - test -f "$tmpdepfile" && break - done - # Same post-processing that is required for AIX mode. - aix_post_process_depfile - ;; - -msvc7) - if test "$libtool" = yes; then - showIncludes=-Wc,-showIncludes - else - showIncludes=-showIncludes - fi - "$@" $showIncludes > "$tmpdepfile" - stat=$? 
- grep -v '^Note: including file: ' "$tmpdepfile" - if test $stat -ne 0; then - rm -f "$tmpdepfile" - exit $stat - fi - rm -f "$depfile" - echo "$object : \\" > "$depfile" - # The first sed program below extracts the file names and escapes - # backslashes for cygpath. The second sed program outputs the file - # name when reading, but also accumulates all include files in the - # hold buffer in order to output them again at the end. This only - # works with sed implementations that can handle large buffers. - sed < "$tmpdepfile" -n ' -/^Note: including file: *\(.*\)/ { - s//\1/ - s/\\/\\\\/g - p -}' | $cygpath_u | sort -u | sed -n ' -s/ /\\ /g -s/\(.*\)/'"$tab"'\1 \\/p -s/.\(.*\) \\/\1:/ -H -$ { - s/.*/'"$tab"'/ - G - p -}' >> "$depfile" - echo >> "$depfile" # make sure the fragment doesn't end with a backslash - rm -f "$tmpdepfile" - ;; - -msvc7msys) - # This case exists only to let depend.m4 do its work. It works by - # looking at the text of this script. This case will never be run, - # since it is checked for above. - exit 1 - ;; - -#nosideeffect) - # This comment above is used by automake to tell side-effect - # dependency tracking mechanisms from slower ones. - -dashmstdout) - # Important note: in order to support this mode, a compiler *must* - # always write the preprocessed file to stdout, regardless of -o. - "$@" || exit $? - - # Remove the call to Libtool. - if test "$libtool" = yes; then - while test "X$1" != 'X--mode=compile'; do - shift - done - shift - fi - - # Remove '-o $object'. - IFS=" " - for arg - do - case $arg in - -o) - shift - ;; - $object) - shift - ;; - *) - set fnord "$@" "$arg" - shift # fnord - shift # $arg - ;; - esac - done - - test -z "$dashmflag" && dashmflag=-M - # Require at least two characters before searching for ':' - # in the target name. This is to cope with DOS-style filenames: - # a dependency such as 'c:/foo/bar' could be seen as target 'c' otherwise. 
- "$@" $dashmflag | - sed "s|^[$tab ]*[^:$tab ][^:][^:]*:[$tab ]*|$object: |" > "$tmpdepfile" - rm -f "$depfile" - cat < "$tmpdepfile" > "$depfile" - # Some versions of the HPUX 10.20 sed can't process this sed invocation - # correctly. Breaking it into two sed invocations is a workaround. - tr ' ' "$nl" < "$tmpdepfile" \ - | sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' \ - | sed -e 's/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -dashXmstdout) - # This case only exists to satisfy depend.m4. It is never actually - # run, as this mode is specially recognized in the preamble. - exit 1 - ;; - -makedepend) - "$@" || exit $? - # Remove any Libtool call - if test "$libtool" = yes; then - while test "X$1" != 'X--mode=compile'; do - shift - done - shift - fi - # X makedepend - shift - cleared=no eat=no - for arg - do - case $cleared in - no) - set ""; shift - cleared=yes ;; - esac - if test $eat = yes; then - eat=no - continue - fi - case "$arg" in - -D*|-I*) - set fnord "$@" "$arg"; shift ;; - # Strip any option that makedepend may not understand. Remove - # the object too, otherwise makedepend will parse it as a source file. - -arch) - eat=yes ;; - -*|$object) - ;; - *) - set fnord "$@" "$arg"; shift ;; - esac - done - obj_suffix=`echo "$object" | sed 's/^.*\././'` - touch "$tmpdepfile" - ${MAKEDEPEND-makedepend} -o"$obj_suffix" -f"$tmpdepfile" "$@" - rm -f "$depfile" - # makedepend may prepend the VPATH from the source file name to the object. - # No need to regex-escape $object, excess matching of '.' is harmless. - sed "s|^.*\($object *:\)|\1|" "$tmpdepfile" > "$depfile" - # Some versions of the HPUX 10.20 sed can't process the last invocation - # correctly. Breaking it into two sed invocations is a workaround. 
- sed '1,2d' "$tmpdepfile" \ - | tr ' ' "$nl" \ - | sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' \ - | sed -e 's/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" "$tmpdepfile".bak - ;; - -cpp) - # Important note: in order to support this mode, a compiler *must* - # always write the preprocessed file to stdout. - "$@" || exit $? - - # Remove the call to Libtool. - if test "$libtool" = yes; then - while test "X$1" != 'X--mode=compile'; do - shift - done - shift - fi - - # Remove '-o $object'. - IFS=" " - for arg - do - case $arg in - -o) - shift - ;; - $object) - shift - ;; - *) - set fnord "$@" "$arg" - shift # fnord - shift # $arg - ;; - esac - done - - "$@" -E \ - | sed -n -e '/^# [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \ - -e '/^#line [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \ - | sed '$ s: \\$::' > "$tmpdepfile" - rm -f "$depfile" - echo "$object : \\" > "$depfile" - cat < "$tmpdepfile" >> "$depfile" - sed < "$tmpdepfile" '/^$/d;s/^ //;s/ \\$//;s/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -msvisualcpp) - # Important note: in order to support this mode, a compiler *must* - # always write the preprocessed file to stdout. - "$@" || exit $? - - # Remove the call to Libtool. - if test "$libtool" = yes; then - while test "X$1" != 'X--mode=compile'; do - shift - done - shift - fi - - IFS=" " - for arg - do - case "$arg" in - -o) - shift - ;; - $object) - shift - ;; - "-Gm"|"/Gm"|"-Gi"|"/Gi"|"-ZI"|"/ZI") - set fnord "$@" - shift - shift - ;; - *) - set fnord "$@" "$arg" - shift - shift - ;; - esac - done - "$@" -E 2>/dev/null | - sed -n '/^#line [0-9][0-9]* "\([^"]*\)"/ s::\1:p' | $cygpath_u | sort -u > "$tmpdepfile" - rm -f "$depfile" - echo "$object : \\" > "$depfile" - sed < "$tmpdepfile" -n -e 's% %\\ %g' -e '/^\(.*\)$/ s::'"$tab"'\1 \\:p' >> "$depfile" - echo "$tab" >> "$depfile" - sed < "$tmpdepfile" -n -e 's% %\\ %g' -e '/^\(.*\)$/ s::\1\::p' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -msvcmsys) - # This case exists only to let depend.m4 do its work. 
It works by - # looking at the text of this script. This case will never be run, - # since it is checked for above. - exit 1 - ;; - -none) - exec "$@" - ;; - -*) - echo "Unknown depmode $depmode" 1>&2 - exit 1 - ;; -esac - -exit 0 - -# Local Variables: -# mode: shell-script -# sh-indentation: 2 -# eval: (add-hook 'before-save-hook 'time-stamp) -# time-stamp-start: "scriptversion=" -# time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-time-zone: "UTC0" -# time-stamp-end: "; # UTC" -# End: diff --git a/pcre2/install-sh b/pcre2/install-sh deleted file mode 100755 index 20d8b2eae..000000000 --- a/pcre2/install-sh +++ /dev/null @@ -1,529 +0,0 @@ -#!/bin/sh -# install - install a program, script, or datafile - -scriptversion=2018-03-11.20; # UTC - -# This originates from X11R5 (mit/util/scripts/install.sh), which was -# later released in X11R6 (xc/config/util/install.sh) with the -# following copyright and license. -# -# Copyright (C) 1994 X Consortium -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to -# deal in the Software without restriction, including without limitation the -# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -# sell copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN -# AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC- -# TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# Except as contained in this notice, the name of the X Consortium shall not -# be used in advertising or otherwise to promote the sale, use or other deal- -# ings in this Software without prior written authorization from the X Consor- -# tium. -# -# -# FSF changes to this file are in the public domain. -# -# Calling this script install-sh is preferred over install.sh, to prevent -# 'make' implicit rules from creating a file called install from it -# when there is no Makefile. -# -# This script is compatible with the BSD install script, but was written -# from scratch. - -tab=' ' -nl=' -' -IFS=" $tab$nl" - -# Set DOITPROG to "echo" to test this script. - -doit=${DOITPROG-} -doit_exec=${doit:-exec} - -# Put in absolute file names if you don't have them in your path; -# or use environment vars. - -chgrpprog=${CHGRPPROG-chgrp} -chmodprog=${CHMODPROG-chmod} -chownprog=${CHOWNPROG-chown} -cmpprog=${CMPPROG-cmp} -cpprog=${CPPROG-cp} -mkdirprog=${MKDIRPROG-mkdir} -mvprog=${MVPROG-mv} -rmprog=${RMPROG-rm} -stripprog=${STRIPPROG-strip} - -posix_mkdir= - -# Desired mode of installed file. -mode=0755 - -chgrpcmd= -chmodcmd=$chmodprog -chowncmd= -mvcmd=$mvprog -rmcmd="$rmprog -f" -stripcmd= - -src= -dst= -dir_arg= -dst_arg= - -copy_on_change=false -is_target_a_directory=possibly - -usage="\ -Usage: $0 [OPTION]... [-T] SRCFILE DSTFILE - or: $0 [OPTION]... SRCFILES... DIRECTORY - or: $0 [OPTION]... -t DIRECTORY SRCFILES... - or: $0 [OPTION]... -d DIRECTORIES... - -In the 1st form, copy SRCFILE to DSTFILE. -In the 2nd and 3rd, copy all SRCFILES to DIRECTORY. -In the 4th, create DIRECTORIES. - -Options: - --help display this help and exit. - --version display version info and exit. 
- - -c (ignored) - -C install only if different (preserve the last data modification time) - -d create directories instead of installing files. - -g GROUP $chgrpprog installed files to GROUP. - -m MODE $chmodprog installed files to MODE. - -o USER $chownprog installed files to USER. - -s $stripprog installed files. - -t DIRECTORY install into DIRECTORY. - -T report an error if DSTFILE is a directory. - -Environment variables override the default commands: - CHGRPPROG CHMODPROG CHOWNPROG CMPPROG CPPROG MKDIRPROG MVPROG - RMPROG STRIPPROG -" - -while test $# -ne 0; do - case $1 in - -c) ;; - - -C) copy_on_change=true;; - - -d) dir_arg=true;; - - -g) chgrpcmd="$chgrpprog $2" - shift;; - - --help) echo "$usage"; exit $?;; - - -m) mode=$2 - case $mode in - *' '* | *"$tab"* | *"$nl"* | *'*'* | *'?'* | *'['*) - echo "$0: invalid mode: $mode" >&2 - exit 1;; - esac - shift;; - - -o) chowncmd="$chownprog $2" - shift;; - - -s) stripcmd=$stripprog;; - - -t) - is_target_a_directory=always - dst_arg=$2 - # Protect names problematic for 'test' and other utilities. - case $dst_arg in - -* | [=\(\)!]) dst_arg=./$dst_arg;; - esac - shift;; - - -T) is_target_a_directory=never;; - - --version) echo "$0 $scriptversion"; exit $?;; - - --) shift - break;; - - -*) echo "$0: invalid option: $1" >&2 - exit 1;; - - *) break;; - esac - shift -done - -# We allow the use of options -d and -T together, by making -d -# take the precedence; this is for compatibility with GNU install. - -if test -n "$dir_arg"; then - if test -n "$dst_arg"; then - echo "$0: target directory not allowed when installing a directory." >&2 - exit 1 - fi -fi - -if test $# -ne 0 && test -z "$dir_arg$dst_arg"; then - # When -d is used, all remaining arguments are directories to create. - # When -t is used, the destination is already specified. - # Otherwise, the last argument is the destination. Remove it from $@. - for arg - do - if test -n "$dst_arg"; then - # $@ is not empty: it contains at least $arg. 
- set fnord "$@" "$dst_arg" - shift # fnord - fi - shift # arg - dst_arg=$arg - # Protect names problematic for 'test' and other utilities. - case $dst_arg in - -* | [=\(\)!]) dst_arg=./$dst_arg;; - esac - done -fi - -if test $# -eq 0; then - if test -z "$dir_arg"; then - echo "$0: no input file specified." >&2 - exit 1 - fi - # It's OK to call 'install-sh -d' without argument. - # This can happen when creating conditional directories. - exit 0 -fi - -if test -z "$dir_arg"; then - if test $# -gt 1 || test "$is_target_a_directory" = always; then - if test ! -d "$dst_arg"; then - echo "$0: $dst_arg: Is not a directory." >&2 - exit 1 - fi - fi -fi - -if test -z "$dir_arg"; then - do_exit='(exit $ret); exit $ret' - trap "ret=129; $do_exit" 1 - trap "ret=130; $do_exit" 2 - trap "ret=141; $do_exit" 13 - trap "ret=143; $do_exit" 15 - - # Set umask so as not to create temps with too-generous modes. - # However, 'strip' requires both read and write access to temps. - case $mode in - # Optimize common cases. - *644) cp_umask=133;; - *755) cp_umask=22;; - - *[0-7]) - if test -z "$stripcmd"; then - u_plus_rw= - else - u_plus_rw='% 200' - fi - cp_umask=`expr '(' 777 - $mode % 1000 ')' $u_plus_rw`;; - *) - if test -z "$stripcmd"; then - u_plus_rw= - else - u_plus_rw=,u+rw - fi - cp_umask=$mode$u_plus_rw;; - esac -fi - -for src -do - # Protect names problematic for 'test' and other utilities. - case $src in - -* | [=\(\)!]) src=./$src;; - esac - - if test -n "$dir_arg"; then - dst=$src - dstdir=$dst - test -d "$dstdir" - dstdir_status=$? - else - - # Waiting for this to be detected by the "$cpprog $src $dsttmp" command - # might cause directories to be created, which would be especially bad - # if $src (and thus $dsttmp) contains '*'. - if test ! -f "$src" && test ! -d "$src"; then - echo "$0: $src does not exist." >&2 - exit 1 - fi - - if test -z "$dst_arg"; then - echo "$0: no destination specified." 
>&2 - exit 1 - fi - dst=$dst_arg - - # If destination is a directory, append the input filename. - if test -d "$dst"; then - if test "$is_target_a_directory" = never; then - echo "$0: $dst_arg: Is a directory" >&2 - exit 1 - fi - dstdir=$dst - dstbase=`basename "$src"` - case $dst in - */) dst=$dst$dstbase;; - *) dst=$dst/$dstbase;; - esac - dstdir_status=0 - else - dstdir=`dirname "$dst"` - test -d "$dstdir" - dstdir_status=$? - fi - fi - - case $dstdir in - */) dstdirslash=$dstdir;; - *) dstdirslash=$dstdir/;; - esac - - obsolete_mkdir_used=false - - if test $dstdir_status != 0; then - case $posix_mkdir in - '') - # Create intermediate dirs using mode 755 as modified by the umask. - # This is like FreeBSD 'install' as of 1997-10-28. - umask=`umask` - case $stripcmd.$umask in - # Optimize common cases. - *[2367][2367]) mkdir_umask=$umask;; - .*0[02][02] | .[02][02] | .[02]) mkdir_umask=22;; - - *[0-7]) - mkdir_umask=`expr $umask + 22 \ - - $umask % 100 % 40 + $umask % 20 \ - - $umask % 10 % 4 + $umask % 2 - `;; - *) mkdir_umask=$umask,go-w;; - esac - - # With -d, create the new directory with the user-specified mode. - # Otherwise, rely on $mkdir_umask. - if test -n "$dir_arg"; then - mkdir_mode=-m$mode - else - mkdir_mode= - fi - - posix_mkdir=false - case $umask in - *[123567][0-7][0-7]) - # POSIX mkdir -p sets u+wx bits regardless of umask, which - # is incompatible with FreeBSD 'install' when (umask & 300) != 0. - ;; - *) - # Note that $RANDOM variable is not portable (e.g. dash); Use it - # here however when possible just to lower collision chance. - tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$ - - trap 'ret=$?; rmdir "$tmpdir/a/b" "$tmpdir/a" "$tmpdir" 2>/dev/null; exit $ret' 0 - - # Because "mkdir -p" follows existing symlinks and we likely work - # directly in world-writeable /tmp, make sure that the '$tmpdir' - # directory is successfully created first before we actually test - # 'mkdir -p' feature. 
- if (umask $mkdir_umask && - $mkdirprog $mkdir_mode "$tmpdir" && - exec $mkdirprog $mkdir_mode -p -- "$tmpdir/a/b") >/dev/null 2>&1 - then - if test -z "$dir_arg" || { - # Check for POSIX incompatibilities with -m. - # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or - # other-writable bit of parent directory when it shouldn't. - # FreeBSD 6.1 mkdir -m -p sets mode of existing directory. - test_tmpdir="$tmpdir/a" - ls_ld_tmpdir=`ls -ld "$test_tmpdir"` - case $ls_ld_tmpdir in - d????-?r-*) different_mode=700;; - d????-?--*) different_mode=755;; - *) false;; - esac && - $mkdirprog -m$different_mode -p -- "$test_tmpdir" && { - ls_ld_tmpdir_1=`ls -ld "$test_tmpdir"` - test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1" - } - } - then posix_mkdir=: - fi - rmdir "$tmpdir/a/b" "$tmpdir/a" "$tmpdir" - else - # Remove any dirs left behind by ancient mkdir implementations. - rmdir ./$mkdir_mode ./-p ./-- "$tmpdir" 2>/dev/null - fi - trap '' 0;; - esac;; - esac - - if - $posix_mkdir && ( - umask $mkdir_umask && - $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir" - ) - then : - else - - # The umask is ridiculous, or mkdir does not conform to POSIX, - # or it failed possibly due to a race condition. Create the - # directory the slow way, step by step, checking for races as we go. - - case $dstdir in - /*) prefix='/';; - [-=\(\)!]*) prefix='./';; - *) prefix='';; - esac - - oIFS=$IFS - IFS=/ - set -f - set fnord $dstdir - shift - set +f - IFS=$oIFS - - prefixes= - - for d - do - test X"$d" = X && continue - - prefix=$prefix$d - if test -d "$prefix"; then - prefixes= - else - if $posix_mkdir; then - (umask=$mkdir_umask && - $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir") && break - # Don't fail if two instances are running concurrently. 
- test -d "$prefix" || exit 1 - else - case $prefix in - *\'*) qprefix=`echo "$prefix" | sed "s/'/'\\\\\\\\''/g"`;; - *) qprefix=$prefix;; - esac - prefixes="$prefixes '$qprefix'" - fi - fi - prefix=$prefix/ - done - - if test -n "$prefixes"; then - # Don't fail if two instances are running concurrently. - (umask $mkdir_umask && - eval "\$doit_exec \$mkdirprog $prefixes") || - test -d "$dstdir" || exit 1 - obsolete_mkdir_used=true - fi - fi - fi - - if test -n "$dir_arg"; then - { test -z "$chowncmd" || $doit $chowncmd "$dst"; } && - { test -z "$chgrpcmd" || $doit $chgrpcmd "$dst"; } && - { test "$obsolete_mkdir_used$chowncmd$chgrpcmd" = false || - test -z "$chmodcmd" || $doit $chmodcmd $mode "$dst"; } || exit 1 - else - - # Make a couple of temp file names in the proper directory. - dsttmp=${dstdirslash}_inst.$$_ - rmtmp=${dstdirslash}_rm.$$_ - - # Trap to clean up those temp files at exit. - trap 'ret=$?; rm -f "$dsttmp" "$rmtmp" && exit $ret' 0 - - # Copy the file name to the temp name. - (umask $cp_umask && - { test -z "$stripcmd" || { - # Create $dsttmp read-write so that cp doesn't create it read-only, - # which would cause strip to fail. - if test -z "$doit"; then - : >"$dsttmp" # No need to fork-exec 'touch'. - else - $doit touch "$dsttmp" - fi - } - } && - $doit_exec $cpprog "$src" "$dsttmp") && - - # and set any options; do chmod last to preserve setuid bits. - # - # If any of these fail, we abort the whole thing. If we want to - # ignore errors from any of these, just make sure not to ignore - # errors from the above "$doit $cpprog $src $dsttmp" command. - # - { test -z "$chowncmd" || $doit $chowncmd "$dsttmp"; } && - { test -z "$chgrpcmd" || $doit $chgrpcmd "$dsttmp"; } && - { test -z "$stripcmd" || $doit $stripcmd "$dsttmp"; } && - { test -z "$chmodcmd" || $doit $chmodcmd $mode "$dsttmp"; } && - - # If -C, don't bother to copy if it wouldn't change the file. 
- if $copy_on_change && - old=`LC_ALL=C ls -dlL "$dst" 2>/dev/null` && - new=`LC_ALL=C ls -dlL "$dsttmp" 2>/dev/null` && - set -f && - set X $old && old=:$2:$4:$5:$6 && - set X $new && new=:$2:$4:$5:$6 && - set +f && - test "$old" = "$new" && - $cmpprog "$dst" "$dsttmp" >/dev/null 2>&1 - then - rm -f "$dsttmp" - else - # Rename the file to the real destination. - $doit $mvcmd -f "$dsttmp" "$dst" 2>/dev/null || - - # The rename failed, perhaps because mv can't rename something else - # to itself, or perhaps because mv is so ancient that it does not - # support -f. - { - # Now remove or move aside any old file at destination location. - # We try this two ways since rm can't unlink itself on some - # systems and the destination file might be busy for other - # reasons. In this case, the final cleanup might fail but the new - # file should still install successfully. - { - test ! -f "$dst" || - $doit $rmcmd -f "$dst" 2>/dev/null || - { $doit $mvcmd -f "$dst" "$rmtmp" 2>/dev/null && - { $doit $rmcmd -f "$rmtmp" 2>/dev/null; :; } - } || - { echo "$0: cannot unlink or rename $dst" >&2 - (exit 1); exit 1 - } - } && - - # Now rename the file to the real destination. 
- $doit $mvcmd "$dsttmp" "$dst" - } - fi || exit 1 - - trap '' 0 - fi -done - -# Local variables: -# eval: (add-hook 'before-save-hook 'time-stamp) -# time-stamp-start: "scriptversion=" -# time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-time-zone: "UTC0" -# time-stamp-end: "; # UTC" -# End: diff --git a/pcre2/libpcre2-16.pc.in b/pcre2/libpcre2-16.pc.in deleted file mode 100644 index bacb46651..000000000 --- a/pcre2/libpcre2-16.pc.in +++ /dev/null @@ -1,13 +0,0 @@ -# Package Information for pkg-config - -prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ - -Name: libpcre2-16 -Description: PCRE2 - Perl compatible regular expressions C library (2nd API) with 16 bit character support -Version: @PACKAGE_VERSION@ -Libs: -L${libdir} -lpcre2-16@LIB_POSTFIX@ -Libs.private: @PTHREAD_CFLAGS@ @PTHREAD_LIBS@ -Cflags: -I${includedir} @PCRE2_STATIC_CFLAG@ diff --git a/pcre2/libpcre2-32.pc.in b/pcre2/libpcre2-32.pc.in deleted file mode 100644 index 06241f066..000000000 --- a/pcre2/libpcre2-32.pc.in +++ /dev/null @@ -1,13 +0,0 @@ -# Package Information for pkg-config - -prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ - -Name: libpcre2-32 -Description: PCRE2 - Perl compatible regular expressions C library (2nd API) with 32 bit character support -Version: @PACKAGE_VERSION@ -Libs: -L${libdir} -lpcre2-32@LIB_POSTFIX@ -Libs.private: @PTHREAD_CFLAGS@ @PTHREAD_LIBS@ -Cflags: -I${includedir} @PCRE2_STATIC_CFLAG@ diff --git a/pcre2/libpcre2-8.pc.in b/pcre2/libpcre2-8.pc.in deleted file mode 100644 index 246bb9ea3..000000000 --- a/pcre2/libpcre2-8.pc.in +++ /dev/null @@ -1,13 +0,0 @@ -# Package Information for pkg-config - -prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ - -Name: libpcre2-8 -Description: PCRE2 - Perl compatible regular expressions C library (2nd API) with 8 bit character support -Version: @PACKAGE_VERSION@ -Libs: -L${libdir} -lpcre2-8@LIB_POSTFIX@ -Libs.private: 
@PTHREAD_CFLAGS@ @PTHREAD_LIBS@ -Cflags: -I${includedir} @PCRE2_STATIC_CFLAG@ diff --git a/pcre2/libpcre2-posix.pc.in b/pcre2/libpcre2-posix.pc.in deleted file mode 100644 index 758c30688..000000000 --- a/pcre2/libpcre2-posix.pc.in +++ /dev/null @@ -1,13 +0,0 @@ -# Package Information for pkg-config - -prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ - -Name: libpcre2-posix -Description: Posix compatible interface to libpcre2-8 -Version: @PACKAGE_VERSION@ -Libs: -L${libdir} -lpcre2-posix@LIB_POSTFIX@ -Cflags: -I${includedir} @PCRE2_STATIC_CFLAG@ -Requires.private: libpcre2-8 diff --git a/pcre2/missing b/pcre2/missing deleted file mode 100755 index 8d0eaad25..000000000 --- a/pcre2/missing +++ /dev/null @@ -1,215 +0,0 @@ -#! /bin/sh -# Common wrapper for a few potentially missing GNU programs. - -scriptversion=2018-03-07.03; # UTC - -# Copyright (C) 1996-2020 Free Software Foundation, Inc. -# Originally written by Fran,cois Pinard , 1996. - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2, or (at your option) -# any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . - -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. 
- -if test $# -eq 0; then - echo 1>&2 "Try '$0 --help' for more information" - exit 1 -fi - -case $1 in - - --is-lightweight) - # Used by our autoconf macros to check whether the available missing - # script is modern enough. - exit 0 - ;; - - --run) - # Back-compat with the calling convention used by older automake. - shift - ;; - - -h|--h|--he|--hel|--help) - echo "\ -$0 [OPTION]... PROGRAM [ARGUMENT]... - -Run 'PROGRAM [ARGUMENT]...', returning a proper advice when this fails due -to PROGRAM being missing or too old. - -Options: - -h, --help display this help and exit - -v, --version output version information and exit - -Supported PROGRAM values: - aclocal autoconf autoheader autom4te automake makeinfo - bison yacc flex lex help2man - -Version suffixes to PROGRAM as well as the prefixes 'gnu-', 'gnu', and -'g' are ignored when checking the name. - -Send bug reports to ." - exit $? - ;; - - -v|--v|--ve|--ver|--vers|--versi|--versio|--version) - echo "missing $scriptversion (GNU Automake)" - exit $? - ;; - - -*) - echo 1>&2 "$0: unknown '$1' option" - echo 1>&2 "Try '$0 --help' for more information" - exit 1 - ;; - -esac - -# Run the given program, remember its exit status. -"$@"; st=$? - -# If it succeeded, we are done. -test $st -eq 0 && exit 0 - -# Also exit now if we it failed (or wasn't found), and '--version' was -# passed; such an option is passed most likely to detect whether the -# program is present and works. -case $2 in --version|--help) exit $st;; esac - -# Exit code 63 means version mismatch. This often happens when the user -# tries to use an ancient version of a tool on a file that requires a -# minimum version. -if test $st -eq 63; then - msg="probably too old" -elif test $st -eq 127; then - # Program was missing. - msg="missing on your system" -else - # Program was found and executed, but failed. Give up. 
- exit $st -fi - -perl_URL=https://www.perl.org/ -flex_URL=https://github.com/westes/flex -gnu_software_URL=https://www.gnu.org/software - -program_details () -{ - case $1 in - aclocal|automake) - echo "The '$1' program is part of the GNU Automake package:" - echo "<$gnu_software_URL/automake>" - echo "It also requires GNU Autoconf, GNU m4 and Perl in order to run:" - echo "<$gnu_software_URL/autoconf>" - echo "<$gnu_software_URL/m4/>" - echo "<$perl_URL>" - ;; - autoconf|autom4te|autoheader) - echo "The '$1' program is part of the GNU Autoconf package:" - echo "<$gnu_software_URL/autoconf/>" - echo "It also requires GNU m4 and Perl in order to run:" - echo "<$gnu_software_URL/m4/>" - echo "<$perl_URL>" - ;; - esac -} - -give_advice () -{ - # Normalize program name to check for. - normalized_program=`echo "$1" | sed ' - s/^gnu-//; t - s/^gnu//; t - s/^g//; t'` - - printf '%s\n' "'$1' is $msg." - - configure_deps="'configure.ac' or m4 files included by 'configure.ac'" - case $normalized_program in - autoconf*) - echo "You should only need it if you modified 'configure.ac'," - echo "or m4 files included by it." - program_details 'autoconf' - ;; - autoheader*) - echo "You should only need it if you modified 'acconfig.h' or" - echo "$configure_deps." - program_details 'autoheader' - ;; - automake*) - echo "You should only need it if you modified 'Makefile.am' or" - echo "$configure_deps." - program_details 'automake' - ;; - aclocal*) - echo "You should only need it if you modified 'acinclude.m4' or" - echo "$configure_deps." - program_details 'aclocal' - ;; - autom4te*) - echo "You might have modified some maintainer files that require" - echo "the 'autom4te' program to be rebuilt." - program_details 'autom4te' - ;; - bison*|yacc*) - echo "You should only need it if you modified a '.y' file." - echo "You may want to install the GNU Bison package:" - echo "<$gnu_software_URL/bison/>" - ;; - lex*|flex*) - echo "You should only need it if you modified a '.l' file." 
- echo "You may want to install the Fast Lexical Analyzer package:" - echo "<$flex_URL>" - ;; - help2man*) - echo "You should only need it if you modified a dependency" \ - "of a man page." - echo "You may want to install the GNU Help2man package:" - echo "<$gnu_software_URL/help2man/>" - ;; - makeinfo*) - echo "You should only need it if you modified a '.texi' file, or" - echo "any other file indirectly affecting the aspect of the manual." - echo "You might want to install the Texinfo package:" - echo "<$gnu_software_URL/texinfo/>" - echo "The spurious makeinfo call might also be the consequence of" - echo "using a buggy 'make' (AIX, DU, IRIX), in which case you might" - echo "want to install GNU make:" - echo "<$gnu_software_URL/make/>" - ;; - *) - echo "You might have modified some files without having the proper" - echo "tools for further handling them. Check the 'README' file, it" - echo "often tells you about the needed prerequisites for installing" - echo "this package. You may also peek at any GNU archive site, in" - echo "case some other package contains this missing '$1' program." - ;; - esac -} - -give_advice "$1" | sed -e '1s/^/WARNING: /' \ - -e '2,$s/^/ /' >&2 - -# Propagate the correct exit status (expected to be 127 for a program -# not found, 63 for a program that failed due to version mismatch). 
-exit $st - -# Local variables: -# eval: (add-hook 'before-save-hook 'time-stamp) -# time-stamp-start: "scriptversion=" -# time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-time-zone: "UTC0" -# time-stamp-end: "; # UTC" -# End: diff --git a/pcre2/pcre2-config.in b/pcre2/pcre2-config.in deleted file mode 100644 index bacea876e..000000000 --- a/pcre2/pcre2-config.in +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/sh - -prefix=@prefix@ -exec_prefix=@exec_prefix@ -exec_prefix_set=no - -cflags="[--cflags]" -libs= - -if test @enable_pcre2_16@ = yes ; then - libs="[--libs16] $libs" -fi - -if test @enable_pcre2_32@ = yes ; then - libs="[--libs32] $libs" -fi - -if test @enable_pcre2_8@ = yes ; then - libs="[--libs8] [--libs-posix] $libs" - cflags="$cflags [--cflags-posix]" -fi - -usage="Usage: pcre2-config [--prefix] [--exec-prefix] [--version] $libs $cflags" - -if test $# -eq 0; then - echo "${usage}" 1>&2 - exit 1 -fi - -libR= -case `uname -s` in - *SunOS*) - libR=" -R@libdir@" - ;; - *BSD*) - libR=" -Wl,-R@libdir@" - ;; -esac - -libS= -if test @libdir@ != /usr/lib ; then - libS=-L@libdir@ -fi - -while test $# -gt 0; do - case "$1" in - -*=*) optarg=`echo "$1" | sed 's/[-_a-zA-Z0-9]*=//'` ;; - *) optarg= ;; - esac - - case $1 in - --prefix=*) - prefix=$optarg - if test $exec_prefix_set = no ; then - exec_prefix=$optarg - fi - ;; - --prefix) - echo $prefix - ;; - --exec-prefix=*) - exec_prefix=$optarg - exec_prefix_set=yes - ;; - --exec-prefix) - echo $exec_prefix - ;; - --version) - echo @PACKAGE_VERSION@ - ;; - --cflags) - if test @includedir@ != /usr/include ; then - includes=-I@includedir@ - fi - echo $includes @PCRE2_STATIC_CFLAG@ - ;; - --cflags-posix) - if test @enable_pcre2_8@ = yes ; then - if test @includedir@ != /usr/include ; then - includes=-I@includedir@ - fi - echo $includes @PCRE2_STATIC_CFLAG@ - else - echo "${usage}" 1>&2 - fi - ;; - --libs-posix) - if test @enable_pcre2_8@ = yes ; then - echo $libS$libR -lpcre2-posix@LIB_POSTFIX@ -lpcre2-8@LIB_POSTFIX@ - 
else - echo "${usage}" 1>&2 - fi - ;; - --libs8) - if test @enable_pcre2_8@ = yes ; then - echo $libS$libR -lpcre2-8@LIB_POSTFIX@ - else - echo "${usage}" 1>&2 - fi - ;; - --libs16) - if test @enable_pcre2_16@ = yes ; then - echo $libS$libR -lpcre2-16@LIB_POSTFIX@ - else - echo "${usage}" 1>&2 - fi - ;; - --libs32) - if test @enable_pcre2_32@ = yes ; then - echo $libS$libR -lpcre2-32@LIB_POSTFIX@ - else - echo "${usage}" 1>&2 - fi - ;; - *) - echo "${usage}" 1>&2 - exit 1 - ;; - esac - shift -done diff --git a/pcre2/src/config.h.generic b/pcre2/src/config.h.generic deleted file mode 100644 index 10f410479..000000000 --- a/pcre2/src/config.h.generic +++ /dev/null @@ -1,381 +0,0 @@ -/* src/config.h. Generated from config.h.in by configure. */ -/* src/config.h.in. Generated from configure.ac by autoheader. */ - -/* PCRE2 is written in Standard C, but there are a few non-standard things it -can cope with, allowing it to run on SunOS4 and other "close to standard" -systems. - -In environments that support the GNU autotools, config.h.in is converted into -config.h by the "configure" script. In environments that use CMake, -config-cmake.in is converted into config.h. If you are going to build PCRE2 "by -hand" without using "configure" or CMake, you should copy the distributed -config.h.generic to config.h, and edit the macro definitions to be the way you -need them. You must then add -DHAVE_CONFIG_H to all of your compile commands, -so that config.h is included at the start of every source. - -Alternatively, you can avoid editing by using -D on the compiler command line -to set the macro values. In this case, you do not have to set -DHAVE_CONFIG_H, -but if you do, default values will be taken from config.h for non-boolean -macros that are not defined on the command line. - -Boolean macros such as HAVE_STDLIB_H and SUPPORT_PCRE2_8 should either be -defined (conventionally to 1) for TRUE, and not defined at all for FALSE. 
All -such macros are listed as a commented #undef in config.h.generic. Macros such -as MATCH_LIMIT, whose actual value is relevant, have defaults defined, but are -surrounded by #ifndef/#endif lines so that the value can be overridden by -D. - -PCRE2 uses memmove() if HAVE_MEMMOVE is defined; otherwise it uses bcopy() if -HAVE_BCOPY is defined. If your system has neither bcopy() nor memmove(), make -sure both macros are undefined; an emulation function will then be used. */ - -/* By default, the \R escape sequence matches any Unicode line ending - character or sequence of characters. If BSR_ANYCRLF is defined (to any - value), this is changed so that backslash-R matches only CR, LF, or CRLF. - The build-time default can be overridden by the user of PCRE2 at runtime. - */ -/* #undef BSR_ANYCRLF */ - -/* Define to any value to disable the use of the z and t modifiers in - formatting settings such as %zu or %td (this is rarely needed). */ -/* #undef DISABLE_PERCENT_ZT */ - -/* If you are compiling for a system that uses EBCDIC instead of ASCII - character codes, define this macro to any value. When EBCDIC is set, PCRE2 - assumes that all input strings are in EBCDIC. If you do not define this - macro, PCRE2 will assume input strings are ASCII or UTF-8/16/32 Unicode. It - is not possible to build a version of PCRE2 that supports both EBCDIC and - UTF-8/16/32. */ -/* #undef EBCDIC */ - -/* In an EBCDIC environment, define this macro to any value to arrange for the - NL character to be 0x25 instead of the default 0x15. NL plays the role that - LF does in an ASCII/Unicode environment. */ -/* #undef EBCDIC_NL25 */ - -/* Define this if your compiler supports __attribute__((uninitialized)) */ -/* #undef HAVE_ATTRIBUTE_UNINITIALIZED */ - -/* Define to 1 if you have the `bcopy' function. */ -/* #undef HAVE_BCOPY */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_BZLIB_H */ - -/* Define to 1 if you have the header file. 
*/ -/* #undef HAVE_DIRENT_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_DLFCN_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_EDITLINE_READLINE_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_EDIT_READLINE_READLINE_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_INTTYPES_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_LIMITS_H */ - -/* Define to 1 if you have the `memfd_create' function. */ -/* #undef HAVE_MEMFD_CREATE */ - -/* Define to 1 if you have the `memmove' function. */ -/* #undef HAVE_MEMMOVE */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_MEMORY_H */ - -/* Define to 1 if you have the `mkostemp' function. */ -/* #undef HAVE_MKOSTEMP */ - -/* Define if you have POSIX threads libraries and header files. */ -/* #undef HAVE_PTHREAD */ - -/* Have PTHREAD_PRIO_INHERIT. */ -/* #undef HAVE_PTHREAD_PRIO_INHERIT */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_READLINE_HISTORY_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_READLINE_READLINE_H */ - -/* Define to 1 if you have the `secure_getenv' function. */ -/* #undef HAVE_SECURE_GETENV */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_STDINT_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_STDLIB_H */ - -/* Define to 1 if you have the `strerror' function. */ -/* #undef HAVE_STRERROR */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_STRINGS_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_STRING_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SYS_STAT_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SYS_TYPES_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SYS_WAIT_H */ - -/* Define to 1 if you have the header file. 
*/ -/* #undef HAVE_UNISTD_H */ - -/* Define to 1 if the compiler supports simple visibility declarations. */ -/* #undef HAVE_VISIBILITY */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_WINDOWS_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_ZLIB_H */ - -/* This limits the amount of memory that may be used while matching a pattern. - It applies to both pcre2_match() and pcre2_dfa_match(). It does not apply - to JIT matching. The value is in kibibytes (units of 1024 bytes). */ -#ifndef HEAP_LIMIT -#define HEAP_LIMIT 20000000 -#endif - -/* The value of LINK_SIZE determines the number of bytes used to store links - as offsets within the compiled regex. The default is 2, which allows for - compiled patterns up to 65535 code units long. This covers the vast - majority of cases. However, PCRE2 can also be compiled to use 3 or 4 bytes - instead. This allows for longer patterns in extreme cases. */ -#ifndef LINK_SIZE -#define LINK_SIZE 2 -#endif - -/* Define to the sub-directory where libtool stores uninstalled libraries. */ -/* This is ignored unless you are using libtool. */ -#ifndef LT_OBJDIR -#define LT_OBJDIR ".libs/" -#endif - -/* The value of MATCH_LIMIT determines the default number of times the - pcre2_match() function can record a backtrack position during a single - matching attempt. The value is also used to limit a loop counter in - pcre2_dfa_match(). There is a runtime interface for setting a different - limit. The limit exists in order to catch runaway regular expressions that - take for ever to determine that they do not match. The default is set very - large so that it does not accidentally catch legitimate cases. */ -#ifndef MATCH_LIMIT -#define MATCH_LIMIT 10000000 -#endif - -/* The above limit applies to all backtracks, whether or not they are nested. 
- In some environments it is desirable to limit the nesting of backtracking - (that is, the depth of tree that is searched) more strictly, in order to - restrict the maximum amount of heap memory that is used. The value of - MATCH_LIMIT_DEPTH provides this facility. To have any useful effect, it - must be less than the value of MATCH_LIMIT. The default is to use the same - value as MATCH_LIMIT. There is a runtime method for setting a different - limit. In the case of pcre2_dfa_match(), this limit controls the depth of - the internal nested function calls that are used for pattern recursions, - lookarounds, and atomic groups. */ -#ifndef MATCH_LIMIT_DEPTH -#define MATCH_LIMIT_DEPTH MATCH_LIMIT -#endif - -/* This limit is parameterized just in case anybody ever wants to change it. - Care must be taken if it is increased, because it guards against integer - overflow caused by enormously large patterns. */ -#ifndef MAX_NAME_COUNT -#define MAX_NAME_COUNT 10000 -#endif - -/* This limit is parameterized just in case anybody ever wants to change it. - Care must be taken if it is increased, because it guards against integer - overflow caused by enormously large patterns. */ -#ifndef MAX_NAME_SIZE -#define MAX_NAME_SIZE 32 -#endif - -/* Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns. */ -/* #undef NEVER_BACKSLASH_C */ - -/* The value of NEWLINE_DEFAULT determines the default newline character - sequence. PCRE2 client programs can override this by selecting other values - at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), 5 - (ANYCRLF), and 6 (NUL). */ -#ifndef NEWLINE_DEFAULT -#define NEWLINE_DEFAULT 2 -#endif - -/* Name of package */ -#define PACKAGE "pcre2" - -/* Define to the address where bug reports for this package should be sent. */ -#define PACKAGE_BUGREPORT "" - -/* Define to the full name of this package. */ -#define PACKAGE_NAME "PCRE2" - -/* Define to the full name and version of this package. 
*/ -#define PACKAGE_STRING "PCRE2 10.36" - -/* Define to the one symbol short name of this package. */ -#define PACKAGE_TARNAME "pcre2" - -/* Define to the home page for this package. */ -#define PACKAGE_URL "" - -/* Define to the version of this package. */ -#define PACKAGE_VERSION "10.36" - -/* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested - parentheses (of any kind) in a pattern. This limits the amount of system - stack that is used while compiling a pattern. */ -#ifndef PARENS_NEST_LIMIT -#define PARENS_NEST_LIMIT 250 -#endif - -/* The value of PCRE2GREP_BUFSIZE is the starting size of the buffer used by - pcre2grep to hold parts of the file it is searching. The buffer will be - expanded up to PCRE2GREP_MAX_BUFSIZE if necessary, for files containing - very long lines. The actual amount of memory used by pcre2grep is three - times this number, because it allows for the buffering of "before" and - "after" lines. */ -#ifndef PCRE2GREP_BUFSIZE -#define PCRE2GREP_BUFSIZE 20480 -#endif - -/* The value of PCRE2GREP_MAX_BUFSIZE specifies the maximum size of the buffer - used by pcre2grep to hold parts of the file it is searching. The actual - amount of memory used by pcre2grep is three times this number, because it - allows for the buffering of "before" and "after" lines. */ -#ifndef PCRE2GREP_MAX_BUFSIZE -#define PCRE2GREP_MAX_BUFSIZE 1048576 -#endif - -/* Define to any value to include debugging code. */ -/* #undef PCRE2_DEBUG */ - -/* If you are compiling for a system other than a Unix-like system or - Win32, and it needs some magic to be inserted before the definition - of a function that is exported by the library, define this macro to - contain the relevant magic. If you do not define this macro, a suitable - __declspec value is used for Windows systems; in other environments - "extern" is used for a C compiler and "extern C" for a C++ compiler. - This macro apears at the start of every exported function that is part - of the external API. 
It does not appear on functions that are "external" - in the C sense, but which are internal to the library. */ -/* #undef PCRE2_EXP_DEFN */ - -/* Define to any value if linking statically (TODO: make nice with Libtool) */ -/* #undef PCRE2_STATIC */ - -/* Define to necessary symbol if this constant uses a non-standard name on - your system. */ -/* #undef PTHREAD_CREATE_JOINABLE */ - -/* Define to any non-zero number to enable support for SELinux compatible - executable memory allocator in JIT. Note that this will have no effect - unless SUPPORT_JIT is also defined. */ -/* #undef SLJIT_PROT_EXECUTABLE_ALLOCATOR */ - -/* Define to 1 if you have the ANSI C header files. */ -/* #undef STDC_HEADERS */ - -/* Define to any value to enable support for Just-In-Time compiling. */ -/* #undef SUPPORT_JIT */ - -/* Define to any value to allow pcre2grep to be linked with libbz2, so that it - is able to handle .bz2 files. */ -/* #undef SUPPORT_LIBBZ2 */ - -/* Define to any value to allow pcre2test to be linked with libedit. */ -/* #undef SUPPORT_LIBEDIT */ - -/* Define to any value to allow pcre2test to be linked with libreadline. */ -/* #undef SUPPORT_LIBREADLINE */ - -/* Define to any value to allow pcre2grep to be linked with libz, so that it - is able to handle .gz files. */ -/* #undef SUPPORT_LIBZ */ - -/* Define to any value to enable callout script support in pcre2grep. */ -/* #undef SUPPORT_PCRE2GREP_CALLOUT */ - -/* Define to any value to enable fork support in pcre2grep callout scripts. - This will have no effect unless SUPPORT_PCRE2GREP_CALLOUT is also defined. - */ -/* #undef SUPPORT_PCRE2GREP_CALLOUT_FORK */ - -/* Define to any value to enable JIT support in pcre2grep. Note that this will - have no effect unless SUPPORT_JIT is also defined. */ -/* #undef SUPPORT_PCRE2GREP_JIT */ - -/* Define to any value to enable the 16 bit PCRE2 library. */ -/* #undef SUPPORT_PCRE2_16 */ - -/* Define to any value to enable the 32 bit PCRE2 library. 
*/ -/* #undef SUPPORT_PCRE2_32 */ - -/* Define to any value to enable the 8 bit PCRE2 library. */ -/* #undef SUPPORT_PCRE2_8 */ - -/* Define to any value to enable support for Unicode and UTF encoding. This - will work even in an EBCDIC environment, but it is incompatible with the - EBCDIC macro. That is, PCRE2 can support *either* EBCDIC code *or* - ASCII/Unicode, but not both at once. */ -/* #undef SUPPORT_UNICODE */ - -/* Define to any value for valgrind support to find invalid memory reads. */ -/* #undef SUPPORT_VALGRIND */ - -/* Enable extensions on AIX 3, Interix. */ -#ifndef _ALL_SOURCE -# define _ALL_SOURCE 1 -#endif -/* Enable GNU extensions on systems that have them. */ -#ifndef _GNU_SOURCE -# define _GNU_SOURCE 1 -#endif -/* Enable threading extensions on Solaris. */ -#ifndef _POSIX_PTHREAD_SEMANTICS -# define _POSIX_PTHREAD_SEMANTICS 1 -#endif -/* Enable extensions on HP NonStop. */ -#ifndef _TANDEM_SOURCE -# define _TANDEM_SOURCE 1 -#endif -/* Enable general extensions on Solaris. */ -#ifndef __EXTENSIONS__ -# define __EXTENSIONS__ 1 -#endif - -/* Version number of package */ -#define VERSION "10.36" - -/* Define to 1 if on MINIX. */ -/* #undef _MINIX */ - -/* Define to 2 if the system does not provide POSIX.1 features except with - this defined. */ -/* #undef _POSIX_1_SOURCE */ - -/* Define to 1 if you need to in order for `stat' and other things to work. */ -/* #undef _POSIX_SOURCE */ - -/* Define to empty if `const' does not conform to ANSI C. */ -/* #undef const */ - -/* Define to the type of a signed integer type of width exactly 64 bits if - such a type exists and the standard includes do not define it. */ -/* #undef int64_t */ - -/* Define to `unsigned int' if does not define. */ -/* #undef size_t */ diff --git a/pcre2/src/config.h.in b/pcre2/src/config.h.in deleted file mode 100644 index d42cc0053..000000000 --- a/pcre2/src/config.h.in +++ /dev/null @@ -1,369 +0,0 @@ -/* src/config.h.in. Generated from configure.ac by autoheader. 
*/ - - -/* PCRE2 is written in Standard C, but there are a few non-standard things it -can cope with, allowing it to run on SunOS4 and other "close to standard" -systems. - -In environments that support the GNU autotools, config.h.in is converted into -config.h by the "configure" script. In environments that use CMake, -config-cmake.in is converted into config.h. If you are going to build PCRE2 "by -hand" without using "configure" or CMake, you should copy the distributed -config.h.generic to config.h, and edit the macro definitions to be the way you -need them. You must then add -DHAVE_CONFIG_H to all of your compile commands, -so that config.h is included at the start of every source. - -Alternatively, you can avoid editing by using -D on the compiler command line -to set the macro values. In this case, you do not have to set -DHAVE_CONFIG_H, -but if you do, default values will be taken from config.h for non-boolean -macros that are not defined on the command line. - -Boolean macros such as HAVE_STDLIB_H and SUPPORT_PCRE2_8 should either be -defined (conventionally to 1) for TRUE, and not defined at all for FALSE. All -such macros are listed as a commented #undef in config.h.generic. Macros such -as MATCH_LIMIT, whose actual value is relevant, have defaults defined, but are -surrounded by #ifndef/#endif lines so that the value can be overridden by -D. - -PCRE2 uses memmove() if HAVE_MEMMOVE is defined; otherwise it uses bcopy() if -HAVE_BCOPY is defined. If your system has neither bcopy() nor memmove(), make -sure both macros are undefined; an emulation function will then be used. */ - -/* By default, the \R escape sequence matches any Unicode line ending - character or sequence of characters. If BSR_ANYCRLF is defined (to any - value), this is changed so that backslash-R matches only CR, LF, or CRLF. - The build-time default can be overridden by the user of PCRE2 at runtime. 
- */ -#undef BSR_ANYCRLF - -/* Define to any value to disable the use of the z and t modifiers in - formatting settings such as %zu or %td (this is rarely needed). */ -#undef DISABLE_PERCENT_ZT - -/* If you are compiling for a system that uses EBCDIC instead of ASCII - character codes, define this macro to any value. When EBCDIC is set, PCRE2 - assumes that all input strings are in EBCDIC. If you do not define this - macro, PCRE2 will assume input strings are ASCII or UTF-8/16/32 Unicode. It - is not possible to build a version of PCRE2 that supports both EBCDIC and - UTF-8/16/32. */ -#undef EBCDIC - -/* In an EBCDIC environment, define this macro to any value to arrange for the - NL character to be 0x25 instead of the default 0x15. NL plays the role that - LF does in an ASCII/Unicode environment. */ -#undef EBCDIC_NL25 - -/* Define this if your compiler supports __attribute__((uninitialized)) */ -#undef HAVE_ATTRIBUTE_UNINITIALIZED - -/* Define to 1 if you have the `bcopy' function. */ -#undef HAVE_BCOPY - -/* Define to 1 if you have the header file. */ -#undef HAVE_BZLIB_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_DIRENT_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_DLFCN_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_EDITLINE_READLINE_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_EDIT_READLINE_READLINE_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_INTTYPES_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_LIMITS_H - -/* Define to 1 if you have the `memfd_create' function. */ -#undef HAVE_MEMFD_CREATE - -/* Define to 1 if you have the `memmove' function. */ -#undef HAVE_MEMMOVE - -/* Define to 1 if you have the header file. */ -#undef HAVE_MEMORY_H - -/* Define to 1 if you have the `mkostemp' function. */ -#undef HAVE_MKOSTEMP - -/* Define if you have POSIX threads libraries and header files. */ -#undef HAVE_PTHREAD - -/* Have PTHREAD_PRIO_INHERIT. 
*/ -#undef HAVE_PTHREAD_PRIO_INHERIT - -/* Define to 1 if you have the header file. */ -#undef HAVE_READLINE_HISTORY_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_READLINE_READLINE_H - -/* Define to 1 if you have the `secure_getenv' function. */ -#undef HAVE_SECURE_GETENV - -/* Define to 1 if you have the header file. */ -#undef HAVE_STDINT_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STDLIB_H - -/* Define to 1 if you have the `strerror' function. */ -#undef HAVE_STRERROR - -/* Define to 1 if you have the header file. */ -#undef HAVE_STRINGS_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STRING_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_STAT_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_TYPES_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_WAIT_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_UNISTD_H - -/* Define to 1 if the compiler supports simple visibility declarations. */ -#undef HAVE_VISIBILITY - -/* Define to 1 if you have the header file. */ -#undef HAVE_WINDOWS_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_ZLIB_H - -/* This limits the amount of memory that may be used while matching a pattern. - It applies to both pcre2_match() and pcre2_dfa_match(). It does not apply - to JIT matching. The value is in kibibytes (units of 1024 bytes). */ -#undef HEAP_LIMIT - -/* The value of LINK_SIZE determines the number of bytes used to store links - as offsets within the compiled regex. The default is 2, which allows for - compiled patterns up to 65535 code units long. This covers the vast - majority of cases. However, PCRE2 can also be compiled to use 3 or 4 bytes - instead. This allows for longer patterns in extreme cases. */ -#undef LINK_SIZE - -/* Define to the sub-directory where libtool stores uninstalled libraries. 
*/ -#undef LT_OBJDIR - -/* The value of MATCH_LIMIT determines the default number of times the - pcre2_match() function can record a backtrack position during a single - matching attempt. The value is also used to limit a loop counter in - pcre2_dfa_match(). There is a runtime interface for setting a different - limit. The limit exists in order to catch runaway regular expressions that - take for ever to determine that they do not match. The default is set very - large so that it does not accidentally catch legitimate cases. */ -#undef MATCH_LIMIT - -/* The above limit applies to all backtracks, whether or not they are nested. - In some environments it is desirable to limit the nesting of backtracking - (that is, the depth of tree that is searched) more strictly, in order to - restrict the maximum amount of heap memory that is used. The value of - MATCH_LIMIT_DEPTH provides this facility. To have any useful effect, it - must be less than the value of MATCH_LIMIT. The default is to use the same - value as MATCH_LIMIT. There is a runtime method for setting a different - limit. In the case of pcre2_dfa_match(), this limit controls the depth of - the internal nested function calls that are used for pattern recursions, - lookarounds, and atomic groups. */ -#undef MATCH_LIMIT_DEPTH - -/* This limit is parameterized just in case anybody ever wants to change it. - Care must be taken if it is increased, because it guards against integer - overflow caused by enormously large patterns. */ -#undef MAX_NAME_COUNT - -/* This limit is parameterized just in case anybody ever wants to change it. - Care must be taken if it is increased, because it guards against integer - overflow caused by enormously large patterns. */ -#undef MAX_NAME_SIZE - -/* Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns. */ -#undef NEVER_BACKSLASH_C - -/* The value of NEWLINE_DEFAULT determines the default newline character - sequence. 
PCRE2 client programs can override this by selecting other values - at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), 5 - (ANYCRLF), and 6 (NUL). */ -#undef NEWLINE_DEFAULT - -/* Name of package */ -#undef PACKAGE - -/* Define to the address where bug reports for this package should be sent. */ -#undef PACKAGE_BUGREPORT - -/* Define to the full name of this package. */ -#undef PACKAGE_NAME - -/* Define to the full name and version of this package. */ -#undef PACKAGE_STRING - -/* Define to the one symbol short name of this package. */ -#undef PACKAGE_TARNAME - -/* Define to the home page for this package. */ -#undef PACKAGE_URL - -/* Define to the version of this package. */ -#undef PACKAGE_VERSION - -/* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested - parentheses (of any kind) in a pattern. This limits the amount of system - stack that is used while compiling a pattern. */ -#undef PARENS_NEST_LIMIT - -/* The value of PCRE2GREP_BUFSIZE is the starting size of the buffer used by - pcre2grep to hold parts of the file it is searching. The buffer will be - expanded up to PCRE2GREP_MAX_BUFSIZE if necessary, for files containing - very long lines. The actual amount of memory used by pcre2grep is three - times this number, because it allows for the buffering of "before" and - "after" lines. */ -#undef PCRE2GREP_BUFSIZE - -/* The value of PCRE2GREP_MAX_BUFSIZE specifies the maximum size of the buffer - used by pcre2grep to hold parts of the file it is searching. The actual - amount of memory used by pcre2grep is three times this number, because it - allows for the buffering of "before" and "after" lines. */ -#undef PCRE2GREP_MAX_BUFSIZE - -/* to make a symbol visible */ -#undef PCRE2POSIX_EXP_DECL - -/* to make a symbol visible */ -#undef PCRE2POSIX_EXP_DEFN - -/* Define to any value to include debugging code. 
*/ -#undef PCRE2_DEBUG - -/* to make a symbol visible */ -#undef PCRE2_EXP_DECL - - -/* If you are compiling for a system other than a Unix-like system or - Win32, and it needs some magic to be inserted before the definition - of a function that is exported by the library, define this macro to - contain the relevant magic. If you do not define this macro, a suitable - __declspec value is used for Windows systems; in other environments - "extern" is used for a C compiler and "extern C" for a C++ compiler. - This macro apears at the start of every exported function that is part - of the external API. It does not appear on functions that are "external" - in the C sense, but which are internal to the library. */ -#undef PCRE2_EXP_DEFN - -/* Define to any value if linking statically (TODO: make nice with Libtool) */ -#undef PCRE2_STATIC - -/* Define to necessary symbol if this constant uses a non-standard name on - your system. */ -#undef PTHREAD_CREATE_JOINABLE - -/* Define to any non-zero number to enable support for SELinux compatible - executable memory allocator in JIT. Note that this will have no effect - unless SUPPORT_JIT is also defined. */ -#undef SLJIT_PROT_EXECUTABLE_ALLOCATOR - -/* Define to 1 if you have the ANSI C header files. */ -#undef STDC_HEADERS - -/* Define to any value to enable support for Just-In-Time compiling. */ -#undef SUPPORT_JIT - -/* Define to any value to allow pcre2grep to be linked with libbz2, so that it - is able to handle .bz2 files. */ -#undef SUPPORT_LIBBZ2 - -/* Define to any value to allow pcre2test to be linked with libedit. */ -#undef SUPPORT_LIBEDIT - -/* Define to any value to allow pcre2test to be linked with libreadline. */ -#undef SUPPORT_LIBREADLINE - -/* Define to any value to allow pcre2grep to be linked with libz, so that it - is able to handle .gz files. */ -#undef SUPPORT_LIBZ - -/* Define to any value to enable callout script support in pcre2grep. 
*/ -#undef SUPPORT_PCRE2GREP_CALLOUT - -/* Define to any value to enable fork support in pcre2grep callout scripts. - This will have no effect unless SUPPORT_PCRE2GREP_CALLOUT is also defined. - */ -#undef SUPPORT_PCRE2GREP_CALLOUT_FORK - -/* Define to any value to enable JIT support in pcre2grep. Note that this will - have no effect unless SUPPORT_JIT is also defined. */ -#undef SUPPORT_PCRE2GREP_JIT - -/* Define to any value to enable the 16 bit PCRE2 library. */ -#undef SUPPORT_PCRE2_16 - -/* Define to any value to enable the 32 bit PCRE2 library. */ -#undef SUPPORT_PCRE2_32 - -/* Define to any value to enable the 8 bit PCRE2 library. */ -#undef SUPPORT_PCRE2_8 - -/* Define to any value to enable support for Unicode and UTF encoding. This - will work even in an EBCDIC environment, but it is incompatible with the - EBCDIC macro. That is, PCRE2 can support *either* EBCDIC code *or* - ASCII/Unicode, but not both at once. */ -#undef SUPPORT_UNICODE - -/* Define to any value for valgrind support to find invalid memory reads. */ -#undef SUPPORT_VALGRIND - -/* Enable extensions on AIX 3, Interix. */ -#ifndef _ALL_SOURCE -# undef _ALL_SOURCE -#endif -/* Enable GNU extensions on systems that have them. */ -#ifndef _GNU_SOURCE -# undef _GNU_SOURCE -#endif -/* Enable threading extensions on Solaris. */ -#ifndef _POSIX_PTHREAD_SEMANTICS -# undef _POSIX_PTHREAD_SEMANTICS -#endif -/* Enable extensions on HP NonStop. */ -#ifndef _TANDEM_SOURCE -# undef _TANDEM_SOURCE -#endif -/* Enable general extensions on Solaris. */ -#ifndef __EXTENSIONS__ -# undef __EXTENSIONS__ -#endif - - -/* Version number of package */ -#undef VERSION - -/* Define to 1 if on MINIX. */ -#undef _MINIX - -/* Define to 2 if the system does not provide POSIX.1 features except with - this defined. */ -#undef _POSIX_1_SOURCE - -/* Define to 1 if you need to in order for `stat' and other things to work. */ -#undef _POSIX_SOURCE - -/* Define to empty if `const' does not conform to ANSI C. 
*/ -#undef const - -/* Define to the type of a signed integer type of width exactly 64 bits if - such a type exists and the standard includes do not define it. */ -#undef int64_t - -/* Define to `unsigned int' if does not define. */ -#undef size_t diff --git a/pcre2/src/pcre2.h.generic b/pcre2/src/pcre2.h.generic deleted file mode 100644 index f204ec818..000000000 --- a/pcre2/src/pcre2.h.generic +++ /dev/null @@ -1,991 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* This is the public header file for the PCRE library, second API, to be -#included by applications that call PCRE2 functions. - - Copyright (c) 2016-2020 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - -#ifndef PCRE2_H_IDEMPOTENT_GUARD -#define PCRE2_H_IDEMPOTENT_GUARD - -/* The current PCRE version information. */ - -#define PCRE2_MAJOR 10 -#define PCRE2_MINOR 36 -#define PCRE2_PRERELEASE -#define PCRE2_DATE 2020-12-04 - -/* When an application links to a PCRE DLL in Windows, the symbols that are -imported have to be identified as such. When building PCRE2, the appropriate -export setting is defined in pcre2_internal.h, which includes this file. So we -don't change existing definitions of PCRE2_EXP_DECL. */ - -#if defined(_WIN32) && !defined(PCRE2_STATIC) -# ifndef PCRE2_EXP_DECL -# define PCRE2_EXP_DECL extern __declspec(dllimport) -# endif -#endif - -/* By default, we use the standard "extern" declarations. */ - -#ifndef PCRE2_EXP_DECL -# ifdef __cplusplus -# define PCRE2_EXP_DECL extern "C" -# else -# define PCRE2_EXP_DECL extern -# endif -#endif - -/* When compiling with the MSVC compiler, it is sometimes necessary to include -a "calling convention" before exported function names. (This is secondhand -information; I know nothing about MSVC myself). For example, something like - - void __cdecl function(....) - -might be needed. In order so make this easy, all the exported functions have -PCRE2_CALL_CONVENTION just before their names. It is rarely needed; if not -set, we ensure here that it has no effect. 
*/ - -#ifndef PCRE2_CALL_CONVENTION -#define PCRE2_CALL_CONVENTION -#endif - -/* Have to include limits.h, stdlib.h, and inttypes.h to ensure that size_t and -uint8_t, UCHAR_MAX, etc are defined. Some systems that do have inttypes.h do -not have stdint.h, which is why we use inttypes.h, which according to the C -standard is a superset of stdint.h. If none of these headers are available, -the relevant values must be provided by some other means. */ - -#include -#include -#include - -/* Allow for C++ users compiling this directly. */ - -#ifdef __cplusplus -extern "C" { -#endif - -/* The following option bits can be passed to pcre2_compile(), pcre2_match(), -or pcre2_dfa_match(). PCRE2_NO_UTF_CHECK affects only the function to which it -is passed. Put these bits at the most significant end of the options word so -others can be added next to them */ - -#define PCRE2_ANCHORED 0x80000000u -#define PCRE2_NO_UTF_CHECK 0x40000000u -#define PCRE2_ENDANCHORED 0x20000000u - -/* The following option bits can be passed only to pcre2_compile(). However, -they may affect compilation, JIT compilation, and/or interpretive execution. 
-The following tags indicate which: - -C alters what is compiled by pcre2_compile() -J alters what is compiled by pcre2_jit_compile() -M is inspected during pcre2_match() execution -D is inspected during pcre2_dfa_match() execution -*/ - -#define PCRE2_ALLOW_EMPTY_CLASS 0x00000001u /* C */ -#define PCRE2_ALT_BSUX 0x00000002u /* C */ -#define PCRE2_AUTO_CALLOUT 0x00000004u /* C */ -#define PCRE2_CASELESS 0x00000008u /* C */ -#define PCRE2_DOLLAR_ENDONLY 0x00000010u /* J M D */ -#define PCRE2_DOTALL 0x00000020u /* C */ -#define PCRE2_DUPNAMES 0x00000040u /* C */ -#define PCRE2_EXTENDED 0x00000080u /* C */ -#define PCRE2_FIRSTLINE 0x00000100u /* J M D */ -#define PCRE2_MATCH_UNSET_BACKREF 0x00000200u /* C J M */ -#define PCRE2_MULTILINE 0x00000400u /* C */ -#define PCRE2_NEVER_UCP 0x00000800u /* C */ -#define PCRE2_NEVER_UTF 0x00001000u /* C */ -#define PCRE2_NO_AUTO_CAPTURE 0x00002000u /* C */ -#define PCRE2_NO_AUTO_POSSESS 0x00004000u /* C */ -#define PCRE2_NO_DOTSTAR_ANCHOR 0x00008000u /* C */ -#define PCRE2_NO_START_OPTIMIZE 0x00010000u /* J M D */ -#define PCRE2_UCP 0x00020000u /* C J M D */ -#define PCRE2_UNGREEDY 0x00040000u /* C */ -#define PCRE2_UTF 0x00080000u /* C J M D */ -#define PCRE2_NEVER_BACKSLASH_C 0x00100000u /* C */ -#define PCRE2_ALT_CIRCUMFLEX 0x00200000u /* J M D */ -#define PCRE2_ALT_VERBNAMES 0x00400000u /* C */ -#define PCRE2_USE_OFFSET_LIMIT 0x00800000u /* J M D */ -#define PCRE2_EXTENDED_MORE 0x01000000u /* C */ -#define PCRE2_LITERAL 0x02000000u /* C */ -#define PCRE2_MATCH_INVALID_UTF 0x04000000u /* J M D */ - -/* An additional compile options word is available in the compile context. 
*/ - -#define PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES 0x00000001u /* C */ -#define PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL 0x00000002u /* C */ -#define PCRE2_EXTRA_MATCH_WORD 0x00000004u /* C */ -#define PCRE2_EXTRA_MATCH_LINE 0x00000008u /* C */ -#define PCRE2_EXTRA_ESCAPED_CR_IS_LF 0x00000010u /* C */ -#define PCRE2_EXTRA_ALT_BSUX 0x00000020u /* C */ - -/* These are for pcre2_jit_compile(). */ - -#define PCRE2_JIT_COMPLETE 0x00000001u /* For full matching */ -#define PCRE2_JIT_PARTIAL_SOFT 0x00000002u -#define PCRE2_JIT_PARTIAL_HARD 0x00000004u -#define PCRE2_JIT_INVALID_UTF 0x00000100u - -/* These are for pcre2_match(), pcre2_dfa_match(), pcre2_jit_match(), and -pcre2_substitute(). Some are allowed only for one of the functions, and in -these cases it is noted below. Note that PCRE2_ANCHORED, PCRE2_ENDANCHORED and -PCRE2_NO_UTF_CHECK can also be passed to these functions (though -pcre2_jit_match() ignores the latter since it bypasses all sanity checks). */ - -#define PCRE2_NOTBOL 0x00000001u -#define PCRE2_NOTEOL 0x00000002u -#define PCRE2_NOTEMPTY 0x00000004u /* ) These two must be kept */ -#define PCRE2_NOTEMPTY_ATSTART 0x00000008u /* ) adjacent to each other. 
*/ -#define PCRE2_PARTIAL_SOFT 0x00000010u -#define PCRE2_PARTIAL_HARD 0x00000020u -#define PCRE2_DFA_RESTART 0x00000040u /* pcre2_dfa_match() only */ -#define PCRE2_DFA_SHORTEST 0x00000080u /* pcre2_dfa_match() only */ -#define PCRE2_SUBSTITUTE_GLOBAL 0x00000100u /* pcre2_substitute() only */ -#define PCRE2_SUBSTITUTE_EXTENDED 0x00000200u /* pcre2_substitute() only */ -#define PCRE2_SUBSTITUTE_UNSET_EMPTY 0x00000400u /* pcre2_substitute() only */ -#define PCRE2_SUBSTITUTE_UNKNOWN_UNSET 0x00000800u /* pcre2_substitute() only */ -#define PCRE2_SUBSTITUTE_OVERFLOW_LENGTH 0x00001000u /* pcre2_substitute() only */ -#define PCRE2_NO_JIT 0x00002000u /* Not for pcre2_dfa_match() */ -#define PCRE2_COPY_MATCHED_SUBJECT 0x00004000u -#define PCRE2_SUBSTITUTE_LITERAL 0x00008000u /* pcre2_substitute() only */ -#define PCRE2_SUBSTITUTE_MATCHED 0x00010000u /* pcre2_substitute() only */ -#define PCRE2_SUBSTITUTE_REPLACEMENT_ONLY 0x00020000u /* pcre2_substitute() only */ - -/* Options for pcre2_pattern_convert(). */ - -#define PCRE2_CONVERT_UTF 0x00000001u -#define PCRE2_CONVERT_NO_UTF_CHECK 0x00000002u -#define PCRE2_CONVERT_POSIX_BASIC 0x00000004u -#define PCRE2_CONVERT_POSIX_EXTENDED 0x00000008u -#define PCRE2_CONVERT_GLOB 0x00000010u -#define PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR 0x00000030u -#define PCRE2_CONVERT_GLOB_NO_STARSTAR 0x00000050u - -/* Newline and \R settings, for use in compile contexts. The newline values -must be kept in step with values set in config.h and both sets must all be -greater than zero. */ - -#define PCRE2_NEWLINE_CR 1 -#define PCRE2_NEWLINE_LF 2 -#define PCRE2_NEWLINE_CRLF 3 -#define PCRE2_NEWLINE_ANY 4 -#define PCRE2_NEWLINE_ANYCRLF 5 -#define PCRE2_NEWLINE_NUL 6 - -#define PCRE2_BSR_UNICODE 1 -#define PCRE2_BSR_ANYCRLF 2 - -/* Error codes for pcre2_compile(). Some of these are also used by -pcre2_pattern_convert(). 
*/ - -#define PCRE2_ERROR_END_BACKSLASH 101 -#define PCRE2_ERROR_END_BACKSLASH_C 102 -#define PCRE2_ERROR_UNKNOWN_ESCAPE 103 -#define PCRE2_ERROR_QUANTIFIER_OUT_OF_ORDER 104 -#define PCRE2_ERROR_QUANTIFIER_TOO_BIG 105 -#define PCRE2_ERROR_MISSING_SQUARE_BRACKET 106 -#define PCRE2_ERROR_ESCAPE_INVALID_IN_CLASS 107 -#define PCRE2_ERROR_CLASS_RANGE_ORDER 108 -#define PCRE2_ERROR_QUANTIFIER_INVALID 109 -#define PCRE2_ERROR_INTERNAL_UNEXPECTED_REPEAT 110 -#define PCRE2_ERROR_INVALID_AFTER_PARENS_QUERY 111 -#define PCRE2_ERROR_POSIX_CLASS_NOT_IN_CLASS 112 -#define PCRE2_ERROR_POSIX_NO_SUPPORT_COLLATING 113 -#define PCRE2_ERROR_MISSING_CLOSING_PARENTHESIS 114 -#define PCRE2_ERROR_BAD_SUBPATTERN_REFERENCE 115 -#define PCRE2_ERROR_NULL_PATTERN 116 -#define PCRE2_ERROR_BAD_OPTIONS 117 -#define PCRE2_ERROR_MISSING_COMMENT_CLOSING 118 -#define PCRE2_ERROR_PARENTHESES_NEST_TOO_DEEP 119 -#define PCRE2_ERROR_PATTERN_TOO_LARGE 120 -#define PCRE2_ERROR_HEAP_FAILED 121 -#define PCRE2_ERROR_UNMATCHED_CLOSING_PARENTHESIS 122 -#define PCRE2_ERROR_INTERNAL_CODE_OVERFLOW 123 -#define PCRE2_ERROR_MISSING_CONDITION_CLOSING 124 -#define PCRE2_ERROR_LOOKBEHIND_NOT_FIXED_LENGTH 125 -#define PCRE2_ERROR_ZERO_RELATIVE_REFERENCE 126 -#define PCRE2_ERROR_TOO_MANY_CONDITION_BRANCHES 127 -#define PCRE2_ERROR_CONDITION_ASSERTION_EXPECTED 128 -#define PCRE2_ERROR_BAD_RELATIVE_REFERENCE 129 -#define PCRE2_ERROR_UNKNOWN_POSIX_CLASS 130 -#define PCRE2_ERROR_INTERNAL_STUDY_ERROR 131 -#define PCRE2_ERROR_UNICODE_NOT_SUPPORTED 132 -#define PCRE2_ERROR_PARENTHESES_STACK_CHECK 133 -#define PCRE2_ERROR_CODE_POINT_TOO_BIG 134 -#define PCRE2_ERROR_LOOKBEHIND_TOO_COMPLICATED 135 -#define PCRE2_ERROR_LOOKBEHIND_INVALID_BACKSLASH_C 136 -#define PCRE2_ERROR_UNSUPPORTED_ESCAPE_SEQUENCE 137 -#define PCRE2_ERROR_CALLOUT_NUMBER_TOO_BIG 138 -#define PCRE2_ERROR_MISSING_CALLOUT_CLOSING 139 -#define PCRE2_ERROR_ESCAPE_INVALID_IN_VERB 140 -#define PCRE2_ERROR_UNRECOGNIZED_AFTER_QUERY_P 141 -#define 
PCRE2_ERROR_MISSING_NAME_TERMINATOR 142 -#define PCRE2_ERROR_DUPLICATE_SUBPATTERN_NAME 143 -#define PCRE2_ERROR_INVALID_SUBPATTERN_NAME 144 -#define PCRE2_ERROR_UNICODE_PROPERTIES_UNAVAILABLE 145 -#define PCRE2_ERROR_MALFORMED_UNICODE_PROPERTY 146 -#define PCRE2_ERROR_UNKNOWN_UNICODE_PROPERTY 147 -#define PCRE2_ERROR_SUBPATTERN_NAME_TOO_LONG 148 -#define PCRE2_ERROR_TOO_MANY_NAMED_SUBPATTERNS 149 -#define PCRE2_ERROR_CLASS_INVALID_RANGE 150 -#define PCRE2_ERROR_OCTAL_BYTE_TOO_BIG 151 -#define PCRE2_ERROR_INTERNAL_OVERRAN_WORKSPACE 152 -#define PCRE2_ERROR_INTERNAL_MISSING_SUBPATTERN 153 -#define PCRE2_ERROR_DEFINE_TOO_MANY_BRANCHES 154 -#define PCRE2_ERROR_BACKSLASH_O_MISSING_BRACE 155 -#define PCRE2_ERROR_INTERNAL_UNKNOWN_NEWLINE 156 -#define PCRE2_ERROR_BACKSLASH_G_SYNTAX 157 -#define PCRE2_ERROR_PARENS_QUERY_R_MISSING_CLOSING 158 -/* Error 159 is obsolete and should now never occur */ -#define PCRE2_ERROR_VERB_ARGUMENT_NOT_ALLOWED 159 -#define PCRE2_ERROR_VERB_UNKNOWN 160 -#define PCRE2_ERROR_SUBPATTERN_NUMBER_TOO_BIG 161 -#define PCRE2_ERROR_SUBPATTERN_NAME_EXPECTED 162 -#define PCRE2_ERROR_INTERNAL_PARSED_OVERFLOW 163 -#define PCRE2_ERROR_INVALID_OCTAL 164 -#define PCRE2_ERROR_SUBPATTERN_NAMES_MISMATCH 165 -#define PCRE2_ERROR_MARK_MISSING_ARGUMENT 166 -#define PCRE2_ERROR_INVALID_HEXADECIMAL 167 -#define PCRE2_ERROR_BACKSLASH_C_SYNTAX 168 -#define PCRE2_ERROR_BACKSLASH_K_SYNTAX 169 -#define PCRE2_ERROR_INTERNAL_BAD_CODE_LOOKBEHINDS 170 -#define PCRE2_ERROR_BACKSLASH_N_IN_CLASS 171 -#define PCRE2_ERROR_CALLOUT_STRING_TOO_LONG 172 -#define PCRE2_ERROR_UNICODE_DISALLOWED_CODE_POINT 173 -#define PCRE2_ERROR_UTF_IS_DISABLED 174 -#define PCRE2_ERROR_UCP_IS_DISABLED 175 -#define PCRE2_ERROR_VERB_NAME_TOO_LONG 176 -#define PCRE2_ERROR_BACKSLASH_U_CODE_POINT_TOO_BIG 177 -#define PCRE2_ERROR_MISSING_OCTAL_OR_HEX_DIGITS 178 -#define PCRE2_ERROR_VERSION_CONDITION_SYNTAX 179 -#define PCRE2_ERROR_INTERNAL_BAD_CODE_AUTO_POSSESS 180 -#define 
PCRE2_ERROR_CALLOUT_NO_STRING_DELIMITER 181 -#define PCRE2_ERROR_CALLOUT_BAD_STRING_DELIMITER 182 -#define PCRE2_ERROR_BACKSLASH_C_CALLER_DISABLED 183 -#define PCRE2_ERROR_QUERY_BARJX_NEST_TOO_DEEP 184 -#define PCRE2_ERROR_BACKSLASH_C_LIBRARY_DISABLED 185 -#define PCRE2_ERROR_PATTERN_TOO_COMPLICATED 186 -#define PCRE2_ERROR_LOOKBEHIND_TOO_LONG 187 -#define PCRE2_ERROR_PATTERN_STRING_TOO_LONG 188 -#define PCRE2_ERROR_INTERNAL_BAD_CODE 189 -#define PCRE2_ERROR_INTERNAL_BAD_CODE_IN_SKIP 190 -#define PCRE2_ERROR_NO_SURROGATES_IN_UTF16 191 -#define PCRE2_ERROR_BAD_LITERAL_OPTIONS 192 -#define PCRE2_ERROR_SUPPORTED_ONLY_IN_UNICODE 193 -#define PCRE2_ERROR_INVALID_HYPHEN_IN_OPTIONS 194 -#define PCRE2_ERROR_ALPHA_ASSERTION_UNKNOWN 195 -#define PCRE2_ERROR_SCRIPT_RUN_NOT_AVAILABLE 196 -#define PCRE2_ERROR_TOO_MANY_CAPTURES 197 -#define PCRE2_ERROR_CONDITION_ATOMIC_ASSERTION_EXPECTED 198 - - -/* "Expected" matching error codes: no match and partial match. */ - -#define PCRE2_ERROR_NOMATCH (-1) -#define PCRE2_ERROR_PARTIAL (-2) - -/* Error codes for UTF-8 validity checks */ - -#define PCRE2_ERROR_UTF8_ERR1 (-3) -#define PCRE2_ERROR_UTF8_ERR2 (-4) -#define PCRE2_ERROR_UTF8_ERR3 (-5) -#define PCRE2_ERROR_UTF8_ERR4 (-6) -#define PCRE2_ERROR_UTF8_ERR5 (-7) -#define PCRE2_ERROR_UTF8_ERR6 (-8) -#define PCRE2_ERROR_UTF8_ERR7 (-9) -#define PCRE2_ERROR_UTF8_ERR8 (-10) -#define PCRE2_ERROR_UTF8_ERR9 (-11) -#define PCRE2_ERROR_UTF8_ERR10 (-12) -#define PCRE2_ERROR_UTF8_ERR11 (-13) -#define PCRE2_ERROR_UTF8_ERR12 (-14) -#define PCRE2_ERROR_UTF8_ERR13 (-15) -#define PCRE2_ERROR_UTF8_ERR14 (-16) -#define PCRE2_ERROR_UTF8_ERR15 (-17) -#define PCRE2_ERROR_UTF8_ERR16 (-18) -#define PCRE2_ERROR_UTF8_ERR17 (-19) -#define PCRE2_ERROR_UTF8_ERR18 (-20) -#define PCRE2_ERROR_UTF8_ERR19 (-21) -#define PCRE2_ERROR_UTF8_ERR20 (-22) -#define PCRE2_ERROR_UTF8_ERR21 (-23) - -/* Error codes for UTF-16 validity checks */ - -#define PCRE2_ERROR_UTF16_ERR1 (-24) -#define PCRE2_ERROR_UTF16_ERR2 (-25) -#define 
PCRE2_ERROR_UTF16_ERR3 (-26) - -/* Error codes for UTF-32 validity checks */ - -#define PCRE2_ERROR_UTF32_ERR1 (-27) -#define PCRE2_ERROR_UTF32_ERR2 (-28) - -/* Miscellaneous error codes for pcre2[_dfa]_match(), substring extraction -functions, context functions, and serializing functions. They are in numerical -order. Originally they were in alphabetical order too, but now that PCRE2 is -released, the numbers must not be changed. */ - -#define PCRE2_ERROR_BADDATA (-29) -#define PCRE2_ERROR_MIXEDTABLES (-30) /* Name was changed */ -#define PCRE2_ERROR_BADMAGIC (-31) -#define PCRE2_ERROR_BADMODE (-32) -#define PCRE2_ERROR_BADOFFSET (-33) -#define PCRE2_ERROR_BADOPTION (-34) -#define PCRE2_ERROR_BADREPLACEMENT (-35) -#define PCRE2_ERROR_BADUTFOFFSET (-36) -#define PCRE2_ERROR_CALLOUT (-37) /* Never used by PCRE2 itself */ -#define PCRE2_ERROR_DFA_BADRESTART (-38) -#define PCRE2_ERROR_DFA_RECURSE (-39) -#define PCRE2_ERROR_DFA_UCOND (-40) -#define PCRE2_ERROR_DFA_UFUNC (-41) -#define PCRE2_ERROR_DFA_UITEM (-42) -#define PCRE2_ERROR_DFA_WSSIZE (-43) -#define PCRE2_ERROR_INTERNAL (-44) -#define PCRE2_ERROR_JIT_BADOPTION (-45) -#define PCRE2_ERROR_JIT_STACKLIMIT (-46) -#define PCRE2_ERROR_MATCHLIMIT (-47) -#define PCRE2_ERROR_NOMEMORY (-48) -#define PCRE2_ERROR_NOSUBSTRING (-49) -#define PCRE2_ERROR_NOUNIQUESUBSTRING (-50) -#define PCRE2_ERROR_NULL (-51) -#define PCRE2_ERROR_RECURSELOOP (-52) -#define PCRE2_ERROR_DEPTHLIMIT (-53) -#define PCRE2_ERROR_RECURSIONLIMIT (-53) /* Obsolete synonym */ -#define PCRE2_ERROR_UNAVAILABLE (-54) -#define PCRE2_ERROR_UNSET (-55) -#define PCRE2_ERROR_BADOFFSETLIMIT (-56) -#define PCRE2_ERROR_BADREPESCAPE (-57) -#define PCRE2_ERROR_REPMISSINGBRACE (-58) -#define PCRE2_ERROR_BADSUBSTITUTION (-59) -#define PCRE2_ERROR_BADSUBSPATTERN (-60) -#define PCRE2_ERROR_TOOMANYREPLACE (-61) -#define PCRE2_ERROR_BADSERIALIZEDDATA (-62) -#define PCRE2_ERROR_HEAPLIMIT (-63) -#define PCRE2_ERROR_CONVERT_SYNTAX (-64) -#define PCRE2_ERROR_INTERNAL_DUPMATCH 
(-65) -#define PCRE2_ERROR_DFA_UINVALID_UTF (-66) - - -/* Request types for pcre2_pattern_info() */ - -#define PCRE2_INFO_ALLOPTIONS 0 -#define PCRE2_INFO_ARGOPTIONS 1 -#define PCRE2_INFO_BACKREFMAX 2 -#define PCRE2_INFO_BSR 3 -#define PCRE2_INFO_CAPTURECOUNT 4 -#define PCRE2_INFO_FIRSTCODEUNIT 5 -#define PCRE2_INFO_FIRSTCODETYPE 6 -#define PCRE2_INFO_FIRSTBITMAP 7 -#define PCRE2_INFO_HASCRORLF 8 -#define PCRE2_INFO_JCHANGED 9 -#define PCRE2_INFO_JITSIZE 10 -#define PCRE2_INFO_LASTCODEUNIT 11 -#define PCRE2_INFO_LASTCODETYPE 12 -#define PCRE2_INFO_MATCHEMPTY 13 -#define PCRE2_INFO_MATCHLIMIT 14 -#define PCRE2_INFO_MAXLOOKBEHIND 15 -#define PCRE2_INFO_MINLENGTH 16 -#define PCRE2_INFO_NAMECOUNT 17 -#define PCRE2_INFO_NAMEENTRYSIZE 18 -#define PCRE2_INFO_NAMETABLE 19 -#define PCRE2_INFO_NEWLINE 20 -#define PCRE2_INFO_DEPTHLIMIT 21 -#define PCRE2_INFO_RECURSIONLIMIT 21 /* Obsolete synonym */ -#define PCRE2_INFO_SIZE 22 -#define PCRE2_INFO_HASBACKSLASHC 23 -#define PCRE2_INFO_FRAMESIZE 24 -#define PCRE2_INFO_HEAPLIMIT 25 -#define PCRE2_INFO_EXTRAOPTIONS 26 - -/* Request types for pcre2_config(). */ - -#define PCRE2_CONFIG_BSR 0 -#define PCRE2_CONFIG_JIT 1 -#define PCRE2_CONFIG_JITTARGET 2 -#define PCRE2_CONFIG_LINKSIZE 3 -#define PCRE2_CONFIG_MATCHLIMIT 4 -#define PCRE2_CONFIG_NEWLINE 5 -#define PCRE2_CONFIG_PARENSLIMIT 6 -#define PCRE2_CONFIG_DEPTHLIMIT 7 -#define PCRE2_CONFIG_RECURSIONLIMIT 7 /* Obsolete synonym */ -#define PCRE2_CONFIG_STACKRECURSE 8 /* Obsolete */ -#define PCRE2_CONFIG_UNICODE 9 -#define PCRE2_CONFIG_UNICODE_VERSION 10 -#define PCRE2_CONFIG_VERSION 11 -#define PCRE2_CONFIG_HEAPLIMIT 12 -#define PCRE2_CONFIG_NEVER_BACKSLASH_C 13 -#define PCRE2_CONFIG_COMPILED_WIDTHS 14 -#define PCRE2_CONFIG_TABLES_LENGTH 15 - - -/* Types for code units in patterns and subject strings. 
*/ - -typedef uint8_t PCRE2_UCHAR8; -typedef uint16_t PCRE2_UCHAR16; -typedef uint32_t PCRE2_UCHAR32; - -typedef const PCRE2_UCHAR8 *PCRE2_SPTR8; -typedef const PCRE2_UCHAR16 *PCRE2_SPTR16; -typedef const PCRE2_UCHAR32 *PCRE2_SPTR32; - -/* The PCRE2_SIZE type is used for all string lengths and offsets in PCRE2, -including pattern offsets for errors and subject offsets after a match. We -define special values to indicate zero-terminated strings and unset offsets in -the offset vector (ovector). */ - -#define PCRE2_SIZE size_t -#define PCRE2_SIZE_MAX SIZE_MAX -#define PCRE2_ZERO_TERMINATED (~(PCRE2_SIZE)0) -#define PCRE2_UNSET (~(PCRE2_SIZE)0) - -/* Generic types for opaque structures and JIT callback functions. These -declarations are defined in a macro that is expanded for each width later. */ - -#define PCRE2_TYPES_LIST \ -struct pcre2_real_general_context; \ -typedef struct pcre2_real_general_context pcre2_general_context; \ -\ -struct pcre2_real_compile_context; \ -typedef struct pcre2_real_compile_context pcre2_compile_context; \ -\ -struct pcre2_real_match_context; \ -typedef struct pcre2_real_match_context pcre2_match_context; \ -\ -struct pcre2_real_convert_context; \ -typedef struct pcre2_real_convert_context pcre2_convert_context; \ -\ -struct pcre2_real_code; \ -typedef struct pcre2_real_code pcre2_code; \ -\ -struct pcre2_real_match_data; \ -typedef struct pcre2_real_match_data pcre2_match_data; \ -\ -struct pcre2_real_jit_stack; \ -typedef struct pcre2_real_jit_stack pcre2_jit_stack; \ -\ -typedef pcre2_jit_stack *(*pcre2_jit_callback)(void *); - - -/* The structures for passing out data via callout functions. We use structures -so that new fields can be added on the end in future versions, without changing -the API of the function, thereby allowing old clients to work without -modification. Define the generic versions in a macro; the width-specific -versions are generated from this macro below. */ - -/* Flags for the callout_flags field. 
These are cleared after a callout. */ - -#define PCRE2_CALLOUT_STARTMATCH 0x00000001u /* Set for each bumpalong */ -#define PCRE2_CALLOUT_BACKTRACK 0x00000002u /* Set after a backtrack */ - -#define PCRE2_STRUCTURE_LIST \ -typedef struct pcre2_callout_block { \ - uint32_t version; /* Identifies version of block */ \ - /* ------------------------ Version 0 ------------------------------- */ \ - uint32_t callout_number; /* Number compiled into pattern */ \ - uint32_t capture_top; /* Max current capture */ \ - uint32_t capture_last; /* Most recently closed capture */ \ - PCRE2_SIZE *offset_vector; /* The offset vector */ \ - PCRE2_SPTR mark; /* Pointer to current mark or NULL */ \ - PCRE2_SPTR subject; /* The subject being matched */ \ - PCRE2_SIZE subject_length; /* The length of the subject */ \ - PCRE2_SIZE start_match; /* Offset to start of this match attempt */ \ - PCRE2_SIZE current_position; /* Where we currently are in the subject */ \ - PCRE2_SIZE pattern_position; /* Offset to next item in the pattern */ \ - PCRE2_SIZE next_item_length; /* Length of next item in the pattern */ \ - /* ------------------- Added for Version 1 -------------------------- */ \ - PCRE2_SIZE callout_string_offset; /* Offset to string within pattern */ \ - PCRE2_SIZE callout_string_length; /* Length of string compiled into pattern */ \ - PCRE2_SPTR callout_string; /* String compiled into pattern */ \ - /* ------------------- Added for Version 2 -------------------------- */ \ - uint32_t callout_flags; /* See above for list */ \ - /* ------------------------------------------------------------------ */ \ -} pcre2_callout_block; \ -\ -typedef struct pcre2_callout_enumerate_block { \ - uint32_t version; /* Identifies version of block */ \ - /* ------------------------ Version 0 ------------------------------- */ \ - PCRE2_SIZE pattern_position; /* Offset to next item in the pattern */ \ - PCRE2_SIZE next_item_length; /* Length of next item in the pattern */ \ - uint32_t callout_number; 
/* Number compiled into pattern */ \ - PCRE2_SIZE callout_string_offset; /* Offset to string within pattern */ \ - PCRE2_SIZE callout_string_length; /* Length of string compiled into pattern */ \ - PCRE2_SPTR callout_string; /* String compiled into pattern */ \ - /* ------------------------------------------------------------------ */ \ -} pcre2_callout_enumerate_block; \ -\ -typedef struct pcre2_substitute_callout_block { \ - uint32_t version; /* Identifies version of block */ \ - /* ------------------------ Version 0 ------------------------------- */ \ - PCRE2_SPTR input; /* Pointer to input subject string */ \ - PCRE2_SPTR output; /* Pointer to output buffer */ \ - PCRE2_SIZE output_offsets[2]; /* Changed portion of the output */ \ - PCRE2_SIZE *ovector; /* Pointer to current ovector */ \ - uint32_t oveccount; /* Count of pairs set in ovector */ \ - uint32_t subscount; /* Substitution number */ \ - /* ------------------------------------------------------------------ */ \ -} pcre2_substitute_callout_block; - - -/* List the generic forms of all other functions in macros, which will be -expanded for each width below. Start with functions that give general -information. */ - -#define PCRE2_GENERAL_INFO_FUNCTIONS \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION pcre2_config(uint32_t, void *); - - -/* Functions for manipulating contexts. 
*/ - -#define PCRE2_GENERAL_CONTEXT_FUNCTIONS \ -PCRE2_EXP_DECL pcre2_general_context PCRE2_CALL_CONVENTION \ - *pcre2_general_context_copy(pcre2_general_context *); \ -PCRE2_EXP_DECL pcre2_general_context PCRE2_CALL_CONVENTION \ - *pcre2_general_context_create(void *(*)(PCRE2_SIZE, void *), \ - void (*)(void *, void *), void *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_general_context_free(pcre2_general_context *); - -#define PCRE2_COMPILE_CONTEXT_FUNCTIONS \ -PCRE2_EXP_DECL pcre2_compile_context PCRE2_CALL_CONVENTION \ - *pcre2_compile_context_copy(pcre2_compile_context *); \ -PCRE2_EXP_DECL pcre2_compile_context PCRE2_CALL_CONVENTION \ - *pcre2_compile_context_create(pcre2_general_context *);\ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_compile_context_free(pcre2_compile_context *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_bsr(pcre2_compile_context *, uint32_t); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_character_tables(pcre2_compile_context *, const uint8_t *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_compile_extra_options(pcre2_compile_context *, uint32_t); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_max_pattern_length(pcre2_compile_context *, PCRE2_SIZE); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_newline(pcre2_compile_context *, uint32_t); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_parens_nest_limit(pcre2_compile_context *, uint32_t); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_compile_recursion_guard(pcre2_compile_context *, \ - int (*)(uint32_t, void *), void *); - -#define PCRE2_MATCH_CONTEXT_FUNCTIONS \ -PCRE2_EXP_DECL pcre2_match_context PCRE2_CALL_CONVENTION \ - *pcre2_match_context_copy(pcre2_match_context *); \ -PCRE2_EXP_DECL pcre2_match_context PCRE2_CALL_CONVENTION \ - *pcre2_match_context_create(pcre2_general_context *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_match_context_free(pcre2_match_context *); 
\ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_callout(pcre2_match_context *, \ - int (*)(pcre2_callout_block *, void *), void *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_substitute_callout(pcre2_match_context *, \ - int (*)(pcre2_substitute_callout_block *, void *), void *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_depth_limit(pcre2_match_context *, uint32_t); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_heap_limit(pcre2_match_context *, uint32_t); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_match_limit(pcre2_match_context *, uint32_t); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_offset_limit(pcre2_match_context *, PCRE2_SIZE); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_recursion_limit(pcre2_match_context *, uint32_t); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_recursion_memory_management(pcre2_match_context *, \ - void *(*)(PCRE2_SIZE, void *), void (*)(void *, void *), void *); - -#define PCRE2_CONVERT_CONTEXT_FUNCTIONS \ -PCRE2_EXP_DECL pcre2_convert_context PCRE2_CALL_CONVENTION \ - *pcre2_convert_context_copy(pcre2_convert_context *); \ -PCRE2_EXP_DECL pcre2_convert_context PCRE2_CALL_CONVENTION \ - *pcre2_convert_context_create(pcre2_general_context *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_convert_context_free(pcre2_convert_context *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_glob_escape(pcre2_convert_context *, uint32_t); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_glob_separator(pcre2_convert_context *, uint32_t); - - -/* Functions concerned with compiling a pattern to PCRE internal code. 
*/ - -#define PCRE2_COMPILE_FUNCTIONS \ -PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \ - *pcre2_compile(PCRE2_SPTR, PCRE2_SIZE, uint32_t, int *, PCRE2_SIZE *, \ - pcre2_compile_context *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_code_free(pcre2_code *); \ -PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \ - *pcre2_code_copy(const pcre2_code *); \ -PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \ - *pcre2_code_copy_with_tables(const pcre2_code *); - - -/* Functions that give information about a compiled pattern. */ - -#define PCRE2_PATTERN_INFO_FUNCTIONS \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_pattern_info(const pcre2_code *, uint32_t, void *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_callout_enumerate(const pcre2_code *, \ - int (*)(pcre2_callout_enumerate_block *, void *), void *); - - -/* Functions for running a match and inspecting the result. */ - -#define PCRE2_MATCH_FUNCTIONS \ -PCRE2_EXP_DECL pcre2_match_data PCRE2_CALL_CONVENTION \ - *pcre2_match_data_create(uint32_t, pcre2_general_context *); \ -PCRE2_EXP_DECL pcre2_match_data PCRE2_CALL_CONVENTION \ - *pcre2_match_data_create_from_pattern(const pcre2_code *, \ - pcre2_general_context *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_dfa_match(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \ - uint32_t, pcre2_match_data *, pcre2_match_context *, int *, PCRE2_SIZE); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_match(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \ - uint32_t, pcre2_match_data *, pcre2_match_context *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_match_data_free(pcre2_match_data *); \ -PCRE2_EXP_DECL PCRE2_SPTR PCRE2_CALL_CONVENTION \ - pcre2_get_mark(pcre2_match_data *); \ -PCRE2_EXP_DECL PCRE2_SIZE PCRE2_CALL_CONVENTION \ - pcre2_get_match_data_size(pcre2_match_data *); \ -PCRE2_EXP_DECL uint32_t PCRE2_CALL_CONVENTION \ - pcre2_get_ovector_count(pcre2_match_data *); \ -PCRE2_EXP_DECL PCRE2_SIZE 
PCRE2_CALL_CONVENTION \ - *pcre2_get_ovector_pointer(pcre2_match_data *); \ -PCRE2_EXP_DECL PCRE2_SIZE PCRE2_CALL_CONVENTION \ - pcre2_get_startchar(pcre2_match_data *); - - -/* Convenience functions for handling matched substrings. */ - -#define PCRE2_SUBSTRING_FUNCTIONS \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_substring_copy_byname(pcre2_match_data *, PCRE2_SPTR, PCRE2_UCHAR *, \ - PCRE2_SIZE *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_substring_copy_bynumber(pcre2_match_data *, uint32_t, PCRE2_UCHAR *, \ - PCRE2_SIZE *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_substring_free(PCRE2_UCHAR *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_substring_get_byname(pcre2_match_data *, PCRE2_SPTR, PCRE2_UCHAR **, \ - PCRE2_SIZE *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_substring_get_bynumber(pcre2_match_data *, uint32_t, PCRE2_UCHAR **, \ - PCRE2_SIZE *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_substring_length_byname(pcre2_match_data *, PCRE2_SPTR, PCRE2_SIZE *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_substring_length_bynumber(pcre2_match_data *, uint32_t, PCRE2_SIZE *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_substring_nametable_scan(const pcre2_code *, PCRE2_SPTR, PCRE2_SPTR *, \ - PCRE2_SPTR *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_substring_number_from_name(const pcre2_code *, PCRE2_SPTR); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_substring_list_free(PCRE2_SPTR *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_substring_list_get(pcre2_match_data *, PCRE2_UCHAR ***, PCRE2_SIZE **); - -/* Functions for serializing / deserializing compiled patterns. 
*/ - -#define PCRE2_SERIALIZE_FUNCTIONS \ -PCRE2_EXP_DECL int32_t PCRE2_CALL_CONVENTION \ - pcre2_serialize_encode(const pcre2_code **, int32_t, uint8_t **, \ - PCRE2_SIZE *, pcre2_general_context *); \ -PCRE2_EXP_DECL int32_t PCRE2_CALL_CONVENTION \ - pcre2_serialize_decode(pcre2_code **, int32_t, const uint8_t *, \ - pcre2_general_context *); \ -PCRE2_EXP_DECL int32_t PCRE2_CALL_CONVENTION \ - pcre2_serialize_get_number_of_codes(const uint8_t *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_serialize_free(uint8_t *); - - -/* Convenience function for match + substitute. */ - -#define PCRE2_SUBSTITUTE_FUNCTION \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_substitute(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \ - uint32_t, pcre2_match_data *, pcre2_match_context *, PCRE2_SPTR, \ - PCRE2_SIZE, PCRE2_UCHAR *, PCRE2_SIZE *); - - -/* Functions for converting pattern source strings. */ - -#define PCRE2_CONVERT_FUNCTIONS \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_pattern_convert(PCRE2_SPTR, PCRE2_SIZE, uint32_t, PCRE2_UCHAR **, \ - PCRE2_SIZE *, pcre2_convert_context *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_converted_pattern_free(PCRE2_UCHAR *); - - -/* Functions for JIT processing */ - -#define PCRE2_JIT_FUNCTIONS \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_jit_compile(pcre2_code *, uint32_t); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_jit_match(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \ - uint32_t, pcre2_match_data *, pcre2_match_context *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_jit_free_unused_memory(pcre2_general_context *); \ -PCRE2_EXP_DECL pcre2_jit_stack PCRE2_CALL_CONVENTION \ - *pcre2_jit_stack_create(PCRE2_SIZE, PCRE2_SIZE, pcre2_general_context *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_jit_stack_assign(pcre2_match_context *, pcre2_jit_callback, void *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - 
pcre2_jit_stack_free(pcre2_jit_stack *); - - -/* Other miscellaneous functions. */ - -#define PCRE2_OTHER_FUNCTIONS \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_get_error_message(int, PCRE2_UCHAR *, PCRE2_SIZE); \ -PCRE2_EXP_DECL const uint8_t PCRE2_CALL_CONVENTION \ - *pcre2_maketables(pcre2_general_context *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_maketables_free(pcre2_general_context *, const uint8_t *); - -/* Define macros that generate width-specific names from generic versions. The -three-level macro scheme is necessary to get the macros expanded when we want -them to be. First we get the width from PCRE2_LOCAL_WIDTH, which is used for -generating three versions of everything below. After that, PCRE2_SUFFIX will be -re-defined to use PCRE2_CODE_UNIT_WIDTH, for use when macros such as -pcre2_compile are called by application code. */ - -#define PCRE2_JOIN(a,b) a ## b -#define PCRE2_GLUE(a,b) PCRE2_JOIN(a,b) -#define PCRE2_SUFFIX(a) PCRE2_GLUE(a,PCRE2_LOCAL_WIDTH) - - -/* Data types */ - -#define PCRE2_UCHAR PCRE2_SUFFIX(PCRE2_UCHAR) -#define PCRE2_SPTR PCRE2_SUFFIX(PCRE2_SPTR) - -#define pcre2_code PCRE2_SUFFIX(pcre2_code_) -#define pcre2_jit_callback PCRE2_SUFFIX(pcre2_jit_callback_) -#define pcre2_jit_stack PCRE2_SUFFIX(pcre2_jit_stack_) - -#define pcre2_real_code PCRE2_SUFFIX(pcre2_real_code_) -#define pcre2_real_general_context PCRE2_SUFFIX(pcre2_real_general_context_) -#define pcre2_real_compile_context PCRE2_SUFFIX(pcre2_real_compile_context_) -#define pcre2_real_convert_context PCRE2_SUFFIX(pcre2_real_convert_context_) -#define pcre2_real_match_context PCRE2_SUFFIX(pcre2_real_match_context_) -#define pcre2_real_jit_stack PCRE2_SUFFIX(pcre2_real_jit_stack_) -#define pcre2_real_match_data PCRE2_SUFFIX(pcre2_real_match_data_) - - -/* Data blocks */ - -#define pcre2_callout_block PCRE2_SUFFIX(pcre2_callout_block_) -#define pcre2_callout_enumerate_block PCRE2_SUFFIX(pcre2_callout_enumerate_block_) -#define 
pcre2_substitute_callout_block PCRE2_SUFFIX(pcre2_substitute_callout_block_) -#define pcre2_general_context PCRE2_SUFFIX(pcre2_general_context_) -#define pcre2_compile_context PCRE2_SUFFIX(pcre2_compile_context_) -#define pcre2_convert_context PCRE2_SUFFIX(pcre2_convert_context_) -#define pcre2_match_context PCRE2_SUFFIX(pcre2_match_context_) -#define pcre2_match_data PCRE2_SUFFIX(pcre2_match_data_) - - -/* Functions: the complete list in alphabetical order */ - -#define pcre2_callout_enumerate PCRE2_SUFFIX(pcre2_callout_enumerate_) -#define pcre2_code_copy PCRE2_SUFFIX(pcre2_code_copy_) -#define pcre2_code_copy_with_tables PCRE2_SUFFIX(pcre2_code_copy_with_tables_) -#define pcre2_code_free PCRE2_SUFFIX(pcre2_code_free_) -#define pcre2_compile PCRE2_SUFFIX(pcre2_compile_) -#define pcre2_compile_context_copy PCRE2_SUFFIX(pcre2_compile_context_copy_) -#define pcre2_compile_context_create PCRE2_SUFFIX(pcre2_compile_context_create_) -#define pcre2_compile_context_free PCRE2_SUFFIX(pcre2_compile_context_free_) -#define pcre2_config PCRE2_SUFFIX(pcre2_config_) -#define pcre2_convert_context_copy PCRE2_SUFFIX(pcre2_convert_context_copy_) -#define pcre2_convert_context_create PCRE2_SUFFIX(pcre2_convert_context_create_) -#define pcre2_convert_context_free PCRE2_SUFFIX(pcre2_convert_context_free_) -#define pcre2_converted_pattern_free PCRE2_SUFFIX(pcre2_converted_pattern_free_) -#define pcre2_dfa_match PCRE2_SUFFIX(pcre2_dfa_match_) -#define pcre2_general_context_copy PCRE2_SUFFIX(pcre2_general_context_copy_) -#define pcre2_general_context_create PCRE2_SUFFIX(pcre2_general_context_create_) -#define pcre2_general_context_free PCRE2_SUFFIX(pcre2_general_context_free_) -#define pcre2_get_error_message PCRE2_SUFFIX(pcre2_get_error_message_) -#define pcre2_get_mark PCRE2_SUFFIX(pcre2_get_mark_) -#define pcre2_get_match_data_size PCRE2_SUFFIX(pcre2_get_match_data_size_) -#define pcre2_get_ovector_pointer PCRE2_SUFFIX(pcre2_get_ovector_pointer_) -#define pcre2_get_ovector_count 
PCRE2_SUFFIX(pcre2_get_ovector_count_) -#define pcre2_get_startchar PCRE2_SUFFIX(pcre2_get_startchar_) -#define pcre2_jit_compile PCRE2_SUFFIX(pcre2_jit_compile_) -#define pcre2_jit_match PCRE2_SUFFIX(pcre2_jit_match_) -#define pcre2_jit_free_unused_memory PCRE2_SUFFIX(pcre2_jit_free_unused_memory_) -#define pcre2_jit_stack_assign PCRE2_SUFFIX(pcre2_jit_stack_assign_) -#define pcre2_jit_stack_create PCRE2_SUFFIX(pcre2_jit_stack_create_) -#define pcre2_jit_stack_free PCRE2_SUFFIX(pcre2_jit_stack_free_) -#define pcre2_maketables PCRE2_SUFFIX(pcre2_maketables_) -#define pcre2_maketables_free PCRE2_SUFFIX(pcre2_maketables_free_) -#define pcre2_match PCRE2_SUFFIX(pcre2_match_) -#define pcre2_match_context_copy PCRE2_SUFFIX(pcre2_match_context_copy_) -#define pcre2_match_context_create PCRE2_SUFFIX(pcre2_match_context_create_) -#define pcre2_match_context_free PCRE2_SUFFIX(pcre2_match_context_free_) -#define pcre2_match_data_create PCRE2_SUFFIX(pcre2_match_data_create_) -#define pcre2_match_data_create_from_pattern PCRE2_SUFFIX(pcre2_match_data_create_from_pattern_) -#define pcre2_match_data_free PCRE2_SUFFIX(pcre2_match_data_free_) -#define pcre2_pattern_convert PCRE2_SUFFIX(pcre2_pattern_convert_) -#define pcre2_pattern_info PCRE2_SUFFIX(pcre2_pattern_info_) -#define pcre2_serialize_decode PCRE2_SUFFIX(pcre2_serialize_decode_) -#define pcre2_serialize_encode PCRE2_SUFFIX(pcre2_serialize_encode_) -#define pcre2_serialize_free PCRE2_SUFFIX(pcre2_serialize_free_) -#define pcre2_serialize_get_number_of_codes PCRE2_SUFFIX(pcre2_serialize_get_number_of_codes_) -#define pcre2_set_bsr PCRE2_SUFFIX(pcre2_set_bsr_) -#define pcre2_set_callout PCRE2_SUFFIX(pcre2_set_callout_) -#define pcre2_set_character_tables PCRE2_SUFFIX(pcre2_set_character_tables_) -#define pcre2_set_compile_extra_options PCRE2_SUFFIX(pcre2_set_compile_extra_options_) -#define pcre2_set_compile_recursion_guard PCRE2_SUFFIX(pcre2_set_compile_recursion_guard_) -#define pcre2_set_depth_limit 
PCRE2_SUFFIX(pcre2_set_depth_limit_) -#define pcre2_set_glob_escape PCRE2_SUFFIX(pcre2_set_glob_escape_) -#define pcre2_set_glob_separator PCRE2_SUFFIX(pcre2_set_glob_separator_) -#define pcre2_set_heap_limit PCRE2_SUFFIX(pcre2_set_heap_limit_) -#define pcre2_set_match_limit PCRE2_SUFFIX(pcre2_set_match_limit_) -#define pcre2_set_max_pattern_length PCRE2_SUFFIX(pcre2_set_max_pattern_length_) -#define pcre2_set_newline PCRE2_SUFFIX(pcre2_set_newline_) -#define pcre2_set_parens_nest_limit PCRE2_SUFFIX(pcre2_set_parens_nest_limit_) -#define pcre2_set_offset_limit PCRE2_SUFFIX(pcre2_set_offset_limit_) -#define pcre2_set_substitute_callout PCRE2_SUFFIX(pcre2_set_substitute_callout_) -#define pcre2_substitute PCRE2_SUFFIX(pcre2_substitute_) -#define pcre2_substring_copy_byname PCRE2_SUFFIX(pcre2_substring_copy_byname_) -#define pcre2_substring_copy_bynumber PCRE2_SUFFIX(pcre2_substring_copy_bynumber_) -#define pcre2_substring_free PCRE2_SUFFIX(pcre2_substring_free_) -#define pcre2_substring_get_byname PCRE2_SUFFIX(pcre2_substring_get_byname_) -#define pcre2_substring_get_bynumber PCRE2_SUFFIX(pcre2_substring_get_bynumber_) -#define pcre2_substring_length_byname PCRE2_SUFFIX(pcre2_substring_length_byname_) -#define pcre2_substring_length_bynumber PCRE2_SUFFIX(pcre2_substring_length_bynumber_) -#define pcre2_substring_list_get PCRE2_SUFFIX(pcre2_substring_list_get_) -#define pcre2_substring_list_free PCRE2_SUFFIX(pcre2_substring_list_free_) -#define pcre2_substring_nametable_scan PCRE2_SUFFIX(pcre2_substring_nametable_scan_) -#define pcre2_substring_number_from_name PCRE2_SUFFIX(pcre2_substring_number_from_name_) - -/* Keep this old function name for backwards compatibility */ -#define pcre2_set_recursion_limit PCRE2_SUFFIX(pcre2_set_recursion_limit_) - -/* Keep this obsolete function for backwards compatibility: it is now a noop. 
*/ -#define pcre2_set_recursion_memory_management PCRE2_SUFFIX(pcre2_set_recursion_memory_management_) - -/* Now generate all three sets of width-specific structures and function -prototypes. */ - -#define PCRE2_TYPES_STRUCTURES_AND_FUNCTIONS \ -PCRE2_TYPES_LIST \ -PCRE2_STRUCTURE_LIST \ -PCRE2_GENERAL_INFO_FUNCTIONS \ -PCRE2_GENERAL_CONTEXT_FUNCTIONS \ -PCRE2_COMPILE_CONTEXT_FUNCTIONS \ -PCRE2_CONVERT_CONTEXT_FUNCTIONS \ -PCRE2_CONVERT_FUNCTIONS \ -PCRE2_MATCH_CONTEXT_FUNCTIONS \ -PCRE2_COMPILE_FUNCTIONS \ -PCRE2_PATTERN_INFO_FUNCTIONS \ -PCRE2_MATCH_FUNCTIONS \ -PCRE2_SUBSTRING_FUNCTIONS \ -PCRE2_SERIALIZE_FUNCTIONS \ -PCRE2_SUBSTITUTE_FUNCTION \ -PCRE2_JIT_FUNCTIONS \ -PCRE2_OTHER_FUNCTIONS - -#define PCRE2_LOCAL_WIDTH 8 -PCRE2_TYPES_STRUCTURES_AND_FUNCTIONS -#undef PCRE2_LOCAL_WIDTH - -#define PCRE2_LOCAL_WIDTH 16 -PCRE2_TYPES_STRUCTURES_AND_FUNCTIONS -#undef PCRE2_LOCAL_WIDTH - -#define PCRE2_LOCAL_WIDTH 32 -PCRE2_TYPES_STRUCTURES_AND_FUNCTIONS -#undef PCRE2_LOCAL_WIDTH - -/* Undefine the list macros; they are no longer needed. */ - -#undef PCRE2_TYPES_LIST -#undef PCRE2_STRUCTURE_LIST -#undef PCRE2_GENERAL_INFO_FUNCTIONS -#undef PCRE2_GENERAL_CONTEXT_FUNCTIONS -#undef PCRE2_COMPILE_CONTEXT_FUNCTIONS -#undef PCRE2_CONVERT_CONTEXT_FUNCTIONS -#undef PCRE2_MATCH_CONTEXT_FUNCTIONS -#undef PCRE2_COMPILE_FUNCTIONS -#undef PCRE2_PATTERN_INFO_FUNCTIONS -#undef PCRE2_MATCH_FUNCTIONS -#undef PCRE2_SUBSTRING_FUNCTIONS -#undef PCRE2_SERIALIZE_FUNCTIONS -#undef PCRE2_SUBSTITUTE_FUNCTION -#undef PCRE2_JIT_FUNCTIONS -#undef PCRE2_OTHER_FUNCTIONS -#undef PCRE2_TYPES_STRUCTURES_AND_FUNCTIONS - -/* PCRE2_CODE_UNIT_WIDTH must be defined. If it is 8, 16, or 32, redefine -PCRE2_SUFFIX to use it. If it is 0, undefine the other macros and make -PCRE2_SUFFIX a no-op. Otherwise, generate an error. */ - -#undef PCRE2_SUFFIX -#ifndef PCRE2_CODE_UNIT_WIDTH -#error PCRE2_CODE_UNIT_WIDTH must be defined before including pcre2.h. -#error Use 8, 16, or 32; or 0 for a multi-width application. 
-#else /* PCRE2_CODE_UNIT_WIDTH is defined */ -#if PCRE2_CODE_UNIT_WIDTH == 8 || \ - PCRE2_CODE_UNIT_WIDTH == 16 || \ - PCRE2_CODE_UNIT_WIDTH == 32 -#define PCRE2_SUFFIX(a) PCRE2_GLUE(a, PCRE2_CODE_UNIT_WIDTH) -#elif PCRE2_CODE_UNIT_WIDTH == 0 -#undef PCRE2_JOIN -#undef PCRE2_GLUE -#define PCRE2_SUFFIX(a) a -#else -#error PCRE2_CODE_UNIT_WIDTH must be 0, 8, 16, or 32. -#endif -#endif /* PCRE2_CODE_UNIT_WIDTH is defined */ - -#ifdef __cplusplus -} /* extern "C" */ -#endif - -#endif /* PCRE2_H_IDEMPOTENT_GUARD */ - -/* End of pcre2.h */ diff --git a/pcre2/src/pcre2.h.in b/pcre2/src/pcre2.h.in deleted file mode 100644 index 4fd6a1e30..000000000 --- a/pcre2/src/pcre2.h.in +++ /dev/null @@ -1,991 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* This is the public header file for the PCRE library, second API, to be -#included by applications that call PCRE2 functions. - - Copyright (c) 2016-2020 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - -#ifndef PCRE2_H_IDEMPOTENT_GUARD -#define PCRE2_H_IDEMPOTENT_GUARD - -/* The current PCRE version information. */ - -#define PCRE2_MAJOR @PCRE2_MAJOR@ -#define PCRE2_MINOR @PCRE2_MINOR@ -#define PCRE2_PRERELEASE @PCRE2_PRERELEASE@ -#define PCRE2_DATE @PCRE2_DATE@ - -/* When an application links to a PCRE DLL in Windows, the symbols that are -imported have to be identified as such. When building PCRE2, the appropriate -export setting is defined in pcre2_internal.h, which includes this file. So we -don't change existing definitions of PCRE2_EXP_DECL. */ - -#if defined(_WIN32) && !defined(PCRE2_STATIC) -# ifndef PCRE2_EXP_DECL -# define PCRE2_EXP_DECL extern __declspec(dllimport) -# endif -#endif - -/* By default, we use the standard "extern" declarations. */ - -#ifndef PCRE2_EXP_DECL -# ifdef __cplusplus -# define PCRE2_EXP_DECL extern "C" -# else -# define PCRE2_EXP_DECL extern -# endif -#endif - -/* When compiling with the MSVC compiler, it is sometimes necessary to include -a "calling convention" before exported function names. (This is secondhand -information; I know nothing about MSVC myself). 
For example, something like - - void __cdecl function(....) - -might be needed. In order so make this easy, all the exported functions have -PCRE2_CALL_CONVENTION just before their names. It is rarely needed; if not -set, we ensure here that it has no effect. */ - -#ifndef PCRE2_CALL_CONVENTION -#define PCRE2_CALL_CONVENTION -#endif - -/* Have to include limits.h, stdlib.h, and inttypes.h to ensure that size_t and -uint8_t, UCHAR_MAX, etc are defined. Some systems that do have inttypes.h do -not have stdint.h, which is why we use inttypes.h, which according to the C -standard is a superset of stdint.h. If none of these headers are available, -the relevant values must be provided by some other means. */ - -#include -#include -#include - -/* Allow for C++ users compiling this directly. */ - -#ifdef __cplusplus -extern "C" { -#endif - -/* The following option bits can be passed to pcre2_compile(), pcre2_match(), -or pcre2_dfa_match(). PCRE2_NO_UTF_CHECK affects only the function to which it -is passed. Put these bits at the most significant end of the options word so -others can be added next to them */ - -#define PCRE2_ANCHORED 0x80000000u -#define PCRE2_NO_UTF_CHECK 0x40000000u -#define PCRE2_ENDANCHORED 0x20000000u - -/* The following option bits can be passed only to pcre2_compile(). However, -they may affect compilation, JIT compilation, and/or interpretive execution. 
-The following tags indicate which: - -C alters what is compiled by pcre2_compile() -J alters what is compiled by pcre2_jit_compile() -M is inspected during pcre2_match() execution -D is inspected during pcre2_dfa_match() execution -*/ - -#define PCRE2_ALLOW_EMPTY_CLASS 0x00000001u /* C */ -#define PCRE2_ALT_BSUX 0x00000002u /* C */ -#define PCRE2_AUTO_CALLOUT 0x00000004u /* C */ -#define PCRE2_CASELESS 0x00000008u /* C */ -#define PCRE2_DOLLAR_ENDONLY 0x00000010u /* J M D */ -#define PCRE2_DOTALL 0x00000020u /* C */ -#define PCRE2_DUPNAMES 0x00000040u /* C */ -#define PCRE2_EXTENDED 0x00000080u /* C */ -#define PCRE2_FIRSTLINE 0x00000100u /* J M D */ -#define PCRE2_MATCH_UNSET_BACKREF 0x00000200u /* C J M */ -#define PCRE2_MULTILINE 0x00000400u /* C */ -#define PCRE2_NEVER_UCP 0x00000800u /* C */ -#define PCRE2_NEVER_UTF 0x00001000u /* C */ -#define PCRE2_NO_AUTO_CAPTURE 0x00002000u /* C */ -#define PCRE2_NO_AUTO_POSSESS 0x00004000u /* C */ -#define PCRE2_NO_DOTSTAR_ANCHOR 0x00008000u /* C */ -#define PCRE2_NO_START_OPTIMIZE 0x00010000u /* J M D */ -#define PCRE2_UCP 0x00020000u /* C J M D */ -#define PCRE2_UNGREEDY 0x00040000u /* C */ -#define PCRE2_UTF 0x00080000u /* C J M D */ -#define PCRE2_NEVER_BACKSLASH_C 0x00100000u /* C */ -#define PCRE2_ALT_CIRCUMFLEX 0x00200000u /* J M D */ -#define PCRE2_ALT_VERBNAMES 0x00400000u /* C */ -#define PCRE2_USE_OFFSET_LIMIT 0x00800000u /* J M D */ -#define PCRE2_EXTENDED_MORE 0x01000000u /* C */ -#define PCRE2_LITERAL 0x02000000u /* C */ -#define PCRE2_MATCH_INVALID_UTF 0x04000000u /* J M D */ - -/* An additional compile options word is available in the compile context. 
*/ - -#define PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES 0x00000001u /* C */ -#define PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL 0x00000002u /* C */ -#define PCRE2_EXTRA_MATCH_WORD 0x00000004u /* C */ -#define PCRE2_EXTRA_MATCH_LINE 0x00000008u /* C */ -#define PCRE2_EXTRA_ESCAPED_CR_IS_LF 0x00000010u /* C */ -#define PCRE2_EXTRA_ALT_BSUX 0x00000020u /* C */ - -/* These are for pcre2_jit_compile(). */ - -#define PCRE2_JIT_COMPLETE 0x00000001u /* For full matching */ -#define PCRE2_JIT_PARTIAL_SOFT 0x00000002u -#define PCRE2_JIT_PARTIAL_HARD 0x00000004u -#define PCRE2_JIT_INVALID_UTF 0x00000100u - -/* These are for pcre2_match(), pcre2_dfa_match(), pcre2_jit_match(), and -pcre2_substitute(). Some are allowed only for one of the functions, and in -these cases it is noted below. Note that PCRE2_ANCHORED, PCRE2_ENDANCHORED and -PCRE2_NO_UTF_CHECK can also be passed to these functions (though -pcre2_jit_match() ignores the latter since it bypasses all sanity checks). */ - -#define PCRE2_NOTBOL 0x00000001u -#define PCRE2_NOTEOL 0x00000002u -#define PCRE2_NOTEMPTY 0x00000004u /* ) These two must be kept */ -#define PCRE2_NOTEMPTY_ATSTART 0x00000008u /* ) adjacent to each other. 
*/ -#define PCRE2_PARTIAL_SOFT 0x00000010u -#define PCRE2_PARTIAL_HARD 0x00000020u -#define PCRE2_DFA_RESTART 0x00000040u /* pcre2_dfa_match() only */ -#define PCRE2_DFA_SHORTEST 0x00000080u /* pcre2_dfa_match() only */ -#define PCRE2_SUBSTITUTE_GLOBAL 0x00000100u /* pcre2_substitute() only */ -#define PCRE2_SUBSTITUTE_EXTENDED 0x00000200u /* pcre2_substitute() only */ -#define PCRE2_SUBSTITUTE_UNSET_EMPTY 0x00000400u /* pcre2_substitute() only */ -#define PCRE2_SUBSTITUTE_UNKNOWN_UNSET 0x00000800u /* pcre2_substitute() only */ -#define PCRE2_SUBSTITUTE_OVERFLOW_LENGTH 0x00001000u /* pcre2_substitute() only */ -#define PCRE2_NO_JIT 0x00002000u /* Not for pcre2_dfa_match() */ -#define PCRE2_COPY_MATCHED_SUBJECT 0x00004000u -#define PCRE2_SUBSTITUTE_LITERAL 0x00008000u /* pcre2_substitute() only */ -#define PCRE2_SUBSTITUTE_MATCHED 0x00010000u /* pcre2_substitute() only */ -#define PCRE2_SUBSTITUTE_REPLACEMENT_ONLY 0x00020000u /* pcre2_substitute() only */ - -/* Options for pcre2_pattern_convert(). */ - -#define PCRE2_CONVERT_UTF 0x00000001u -#define PCRE2_CONVERT_NO_UTF_CHECK 0x00000002u -#define PCRE2_CONVERT_POSIX_BASIC 0x00000004u -#define PCRE2_CONVERT_POSIX_EXTENDED 0x00000008u -#define PCRE2_CONVERT_GLOB 0x00000010u -#define PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR 0x00000030u -#define PCRE2_CONVERT_GLOB_NO_STARSTAR 0x00000050u - -/* Newline and \R settings, for use in compile contexts. The newline values -must be kept in step with values set in config.h and both sets must all be -greater than zero. */ - -#define PCRE2_NEWLINE_CR 1 -#define PCRE2_NEWLINE_LF 2 -#define PCRE2_NEWLINE_CRLF 3 -#define PCRE2_NEWLINE_ANY 4 -#define PCRE2_NEWLINE_ANYCRLF 5 -#define PCRE2_NEWLINE_NUL 6 - -#define PCRE2_BSR_UNICODE 1 -#define PCRE2_BSR_ANYCRLF 2 - -/* Error codes for pcre2_compile(). Some of these are also used by -pcre2_pattern_convert(). 
*/ - -#define PCRE2_ERROR_END_BACKSLASH 101 -#define PCRE2_ERROR_END_BACKSLASH_C 102 -#define PCRE2_ERROR_UNKNOWN_ESCAPE 103 -#define PCRE2_ERROR_QUANTIFIER_OUT_OF_ORDER 104 -#define PCRE2_ERROR_QUANTIFIER_TOO_BIG 105 -#define PCRE2_ERROR_MISSING_SQUARE_BRACKET 106 -#define PCRE2_ERROR_ESCAPE_INVALID_IN_CLASS 107 -#define PCRE2_ERROR_CLASS_RANGE_ORDER 108 -#define PCRE2_ERROR_QUANTIFIER_INVALID 109 -#define PCRE2_ERROR_INTERNAL_UNEXPECTED_REPEAT 110 -#define PCRE2_ERROR_INVALID_AFTER_PARENS_QUERY 111 -#define PCRE2_ERROR_POSIX_CLASS_NOT_IN_CLASS 112 -#define PCRE2_ERROR_POSIX_NO_SUPPORT_COLLATING 113 -#define PCRE2_ERROR_MISSING_CLOSING_PARENTHESIS 114 -#define PCRE2_ERROR_BAD_SUBPATTERN_REFERENCE 115 -#define PCRE2_ERROR_NULL_PATTERN 116 -#define PCRE2_ERROR_BAD_OPTIONS 117 -#define PCRE2_ERROR_MISSING_COMMENT_CLOSING 118 -#define PCRE2_ERROR_PARENTHESES_NEST_TOO_DEEP 119 -#define PCRE2_ERROR_PATTERN_TOO_LARGE 120 -#define PCRE2_ERROR_HEAP_FAILED 121 -#define PCRE2_ERROR_UNMATCHED_CLOSING_PARENTHESIS 122 -#define PCRE2_ERROR_INTERNAL_CODE_OVERFLOW 123 -#define PCRE2_ERROR_MISSING_CONDITION_CLOSING 124 -#define PCRE2_ERROR_LOOKBEHIND_NOT_FIXED_LENGTH 125 -#define PCRE2_ERROR_ZERO_RELATIVE_REFERENCE 126 -#define PCRE2_ERROR_TOO_MANY_CONDITION_BRANCHES 127 -#define PCRE2_ERROR_CONDITION_ASSERTION_EXPECTED 128 -#define PCRE2_ERROR_BAD_RELATIVE_REFERENCE 129 -#define PCRE2_ERROR_UNKNOWN_POSIX_CLASS 130 -#define PCRE2_ERROR_INTERNAL_STUDY_ERROR 131 -#define PCRE2_ERROR_UNICODE_NOT_SUPPORTED 132 -#define PCRE2_ERROR_PARENTHESES_STACK_CHECK 133 -#define PCRE2_ERROR_CODE_POINT_TOO_BIG 134 -#define PCRE2_ERROR_LOOKBEHIND_TOO_COMPLICATED 135 -#define PCRE2_ERROR_LOOKBEHIND_INVALID_BACKSLASH_C 136 -#define PCRE2_ERROR_UNSUPPORTED_ESCAPE_SEQUENCE 137 -#define PCRE2_ERROR_CALLOUT_NUMBER_TOO_BIG 138 -#define PCRE2_ERROR_MISSING_CALLOUT_CLOSING 139 -#define PCRE2_ERROR_ESCAPE_INVALID_IN_VERB 140 -#define PCRE2_ERROR_UNRECOGNIZED_AFTER_QUERY_P 141 -#define 
PCRE2_ERROR_MISSING_NAME_TERMINATOR 142 -#define PCRE2_ERROR_DUPLICATE_SUBPATTERN_NAME 143 -#define PCRE2_ERROR_INVALID_SUBPATTERN_NAME 144 -#define PCRE2_ERROR_UNICODE_PROPERTIES_UNAVAILABLE 145 -#define PCRE2_ERROR_MALFORMED_UNICODE_PROPERTY 146 -#define PCRE2_ERROR_UNKNOWN_UNICODE_PROPERTY 147 -#define PCRE2_ERROR_SUBPATTERN_NAME_TOO_LONG 148 -#define PCRE2_ERROR_TOO_MANY_NAMED_SUBPATTERNS 149 -#define PCRE2_ERROR_CLASS_INVALID_RANGE 150 -#define PCRE2_ERROR_OCTAL_BYTE_TOO_BIG 151 -#define PCRE2_ERROR_INTERNAL_OVERRAN_WORKSPACE 152 -#define PCRE2_ERROR_INTERNAL_MISSING_SUBPATTERN 153 -#define PCRE2_ERROR_DEFINE_TOO_MANY_BRANCHES 154 -#define PCRE2_ERROR_BACKSLASH_O_MISSING_BRACE 155 -#define PCRE2_ERROR_INTERNAL_UNKNOWN_NEWLINE 156 -#define PCRE2_ERROR_BACKSLASH_G_SYNTAX 157 -#define PCRE2_ERROR_PARENS_QUERY_R_MISSING_CLOSING 158 -/* Error 159 is obsolete and should now never occur */ -#define PCRE2_ERROR_VERB_ARGUMENT_NOT_ALLOWED 159 -#define PCRE2_ERROR_VERB_UNKNOWN 160 -#define PCRE2_ERROR_SUBPATTERN_NUMBER_TOO_BIG 161 -#define PCRE2_ERROR_SUBPATTERN_NAME_EXPECTED 162 -#define PCRE2_ERROR_INTERNAL_PARSED_OVERFLOW 163 -#define PCRE2_ERROR_INVALID_OCTAL 164 -#define PCRE2_ERROR_SUBPATTERN_NAMES_MISMATCH 165 -#define PCRE2_ERROR_MARK_MISSING_ARGUMENT 166 -#define PCRE2_ERROR_INVALID_HEXADECIMAL 167 -#define PCRE2_ERROR_BACKSLASH_C_SYNTAX 168 -#define PCRE2_ERROR_BACKSLASH_K_SYNTAX 169 -#define PCRE2_ERROR_INTERNAL_BAD_CODE_LOOKBEHINDS 170 -#define PCRE2_ERROR_BACKSLASH_N_IN_CLASS 171 -#define PCRE2_ERROR_CALLOUT_STRING_TOO_LONG 172 -#define PCRE2_ERROR_UNICODE_DISALLOWED_CODE_POINT 173 -#define PCRE2_ERROR_UTF_IS_DISABLED 174 -#define PCRE2_ERROR_UCP_IS_DISABLED 175 -#define PCRE2_ERROR_VERB_NAME_TOO_LONG 176 -#define PCRE2_ERROR_BACKSLASH_U_CODE_POINT_TOO_BIG 177 -#define PCRE2_ERROR_MISSING_OCTAL_OR_HEX_DIGITS 178 -#define PCRE2_ERROR_VERSION_CONDITION_SYNTAX 179 -#define PCRE2_ERROR_INTERNAL_BAD_CODE_AUTO_POSSESS 180 -#define 
PCRE2_ERROR_CALLOUT_NO_STRING_DELIMITER 181 -#define PCRE2_ERROR_CALLOUT_BAD_STRING_DELIMITER 182 -#define PCRE2_ERROR_BACKSLASH_C_CALLER_DISABLED 183 -#define PCRE2_ERROR_QUERY_BARJX_NEST_TOO_DEEP 184 -#define PCRE2_ERROR_BACKSLASH_C_LIBRARY_DISABLED 185 -#define PCRE2_ERROR_PATTERN_TOO_COMPLICATED 186 -#define PCRE2_ERROR_LOOKBEHIND_TOO_LONG 187 -#define PCRE2_ERROR_PATTERN_STRING_TOO_LONG 188 -#define PCRE2_ERROR_INTERNAL_BAD_CODE 189 -#define PCRE2_ERROR_INTERNAL_BAD_CODE_IN_SKIP 190 -#define PCRE2_ERROR_NO_SURROGATES_IN_UTF16 191 -#define PCRE2_ERROR_BAD_LITERAL_OPTIONS 192 -#define PCRE2_ERROR_SUPPORTED_ONLY_IN_UNICODE 193 -#define PCRE2_ERROR_INVALID_HYPHEN_IN_OPTIONS 194 -#define PCRE2_ERROR_ALPHA_ASSERTION_UNKNOWN 195 -#define PCRE2_ERROR_SCRIPT_RUN_NOT_AVAILABLE 196 -#define PCRE2_ERROR_TOO_MANY_CAPTURES 197 -#define PCRE2_ERROR_CONDITION_ATOMIC_ASSERTION_EXPECTED 198 - - -/* "Expected" matching error codes: no match and partial match. */ - -#define PCRE2_ERROR_NOMATCH (-1) -#define PCRE2_ERROR_PARTIAL (-2) - -/* Error codes for UTF-8 validity checks */ - -#define PCRE2_ERROR_UTF8_ERR1 (-3) -#define PCRE2_ERROR_UTF8_ERR2 (-4) -#define PCRE2_ERROR_UTF8_ERR3 (-5) -#define PCRE2_ERROR_UTF8_ERR4 (-6) -#define PCRE2_ERROR_UTF8_ERR5 (-7) -#define PCRE2_ERROR_UTF8_ERR6 (-8) -#define PCRE2_ERROR_UTF8_ERR7 (-9) -#define PCRE2_ERROR_UTF8_ERR8 (-10) -#define PCRE2_ERROR_UTF8_ERR9 (-11) -#define PCRE2_ERROR_UTF8_ERR10 (-12) -#define PCRE2_ERROR_UTF8_ERR11 (-13) -#define PCRE2_ERROR_UTF8_ERR12 (-14) -#define PCRE2_ERROR_UTF8_ERR13 (-15) -#define PCRE2_ERROR_UTF8_ERR14 (-16) -#define PCRE2_ERROR_UTF8_ERR15 (-17) -#define PCRE2_ERROR_UTF8_ERR16 (-18) -#define PCRE2_ERROR_UTF8_ERR17 (-19) -#define PCRE2_ERROR_UTF8_ERR18 (-20) -#define PCRE2_ERROR_UTF8_ERR19 (-21) -#define PCRE2_ERROR_UTF8_ERR20 (-22) -#define PCRE2_ERROR_UTF8_ERR21 (-23) - -/* Error codes for UTF-16 validity checks */ - -#define PCRE2_ERROR_UTF16_ERR1 (-24) -#define PCRE2_ERROR_UTF16_ERR2 (-25) -#define 
PCRE2_ERROR_UTF16_ERR3 (-26) - -/* Error codes for UTF-32 validity checks */ - -#define PCRE2_ERROR_UTF32_ERR1 (-27) -#define PCRE2_ERROR_UTF32_ERR2 (-28) - -/* Miscellaneous error codes for pcre2[_dfa]_match(), substring extraction -functions, context functions, and serializing functions. They are in numerical -order. Originally they were in alphabetical order too, but now that PCRE2 is -released, the numbers must not be changed. */ - -#define PCRE2_ERROR_BADDATA (-29) -#define PCRE2_ERROR_MIXEDTABLES (-30) /* Name was changed */ -#define PCRE2_ERROR_BADMAGIC (-31) -#define PCRE2_ERROR_BADMODE (-32) -#define PCRE2_ERROR_BADOFFSET (-33) -#define PCRE2_ERROR_BADOPTION (-34) -#define PCRE2_ERROR_BADREPLACEMENT (-35) -#define PCRE2_ERROR_BADUTFOFFSET (-36) -#define PCRE2_ERROR_CALLOUT (-37) /* Never used by PCRE2 itself */ -#define PCRE2_ERROR_DFA_BADRESTART (-38) -#define PCRE2_ERROR_DFA_RECURSE (-39) -#define PCRE2_ERROR_DFA_UCOND (-40) -#define PCRE2_ERROR_DFA_UFUNC (-41) -#define PCRE2_ERROR_DFA_UITEM (-42) -#define PCRE2_ERROR_DFA_WSSIZE (-43) -#define PCRE2_ERROR_INTERNAL (-44) -#define PCRE2_ERROR_JIT_BADOPTION (-45) -#define PCRE2_ERROR_JIT_STACKLIMIT (-46) -#define PCRE2_ERROR_MATCHLIMIT (-47) -#define PCRE2_ERROR_NOMEMORY (-48) -#define PCRE2_ERROR_NOSUBSTRING (-49) -#define PCRE2_ERROR_NOUNIQUESUBSTRING (-50) -#define PCRE2_ERROR_NULL (-51) -#define PCRE2_ERROR_RECURSELOOP (-52) -#define PCRE2_ERROR_DEPTHLIMIT (-53) -#define PCRE2_ERROR_RECURSIONLIMIT (-53) /* Obsolete synonym */ -#define PCRE2_ERROR_UNAVAILABLE (-54) -#define PCRE2_ERROR_UNSET (-55) -#define PCRE2_ERROR_BADOFFSETLIMIT (-56) -#define PCRE2_ERROR_BADREPESCAPE (-57) -#define PCRE2_ERROR_REPMISSINGBRACE (-58) -#define PCRE2_ERROR_BADSUBSTITUTION (-59) -#define PCRE2_ERROR_BADSUBSPATTERN (-60) -#define PCRE2_ERROR_TOOMANYREPLACE (-61) -#define PCRE2_ERROR_BADSERIALIZEDDATA (-62) -#define PCRE2_ERROR_HEAPLIMIT (-63) -#define PCRE2_ERROR_CONVERT_SYNTAX (-64) -#define PCRE2_ERROR_INTERNAL_DUPMATCH 
(-65) -#define PCRE2_ERROR_DFA_UINVALID_UTF (-66) - - -/* Request types for pcre2_pattern_info() */ - -#define PCRE2_INFO_ALLOPTIONS 0 -#define PCRE2_INFO_ARGOPTIONS 1 -#define PCRE2_INFO_BACKREFMAX 2 -#define PCRE2_INFO_BSR 3 -#define PCRE2_INFO_CAPTURECOUNT 4 -#define PCRE2_INFO_FIRSTCODEUNIT 5 -#define PCRE2_INFO_FIRSTCODETYPE 6 -#define PCRE2_INFO_FIRSTBITMAP 7 -#define PCRE2_INFO_HASCRORLF 8 -#define PCRE2_INFO_JCHANGED 9 -#define PCRE2_INFO_JITSIZE 10 -#define PCRE2_INFO_LASTCODEUNIT 11 -#define PCRE2_INFO_LASTCODETYPE 12 -#define PCRE2_INFO_MATCHEMPTY 13 -#define PCRE2_INFO_MATCHLIMIT 14 -#define PCRE2_INFO_MAXLOOKBEHIND 15 -#define PCRE2_INFO_MINLENGTH 16 -#define PCRE2_INFO_NAMECOUNT 17 -#define PCRE2_INFO_NAMEENTRYSIZE 18 -#define PCRE2_INFO_NAMETABLE 19 -#define PCRE2_INFO_NEWLINE 20 -#define PCRE2_INFO_DEPTHLIMIT 21 -#define PCRE2_INFO_RECURSIONLIMIT 21 /* Obsolete synonym */ -#define PCRE2_INFO_SIZE 22 -#define PCRE2_INFO_HASBACKSLASHC 23 -#define PCRE2_INFO_FRAMESIZE 24 -#define PCRE2_INFO_HEAPLIMIT 25 -#define PCRE2_INFO_EXTRAOPTIONS 26 - -/* Request types for pcre2_config(). */ - -#define PCRE2_CONFIG_BSR 0 -#define PCRE2_CONFIG_JIT 1 -#define PCRE2_CONFIG_JITTARGET 2 -#define PCRE2_CONFIG_LINKSIZE 3 -#define PCRE2_CONFIG_MATCHLIMIT 4 -#define PCRE2_CONFIG_NEWLINE 5 -#define PCRE2_CONFIG_PARENSLIMIT 6 -#define PCRE2_CONFIG_DEPTHLIMIT 7 -#define PCRE2_CONFIG_RECURSIONLIMIT 7 /* Obsolete synonym */ -#define PCRE2_CONFIG_STACKRECURSE 8 /* Obsolete */ -#define PCRE2_CONFIG_UNICODE 9 -#define PCRE2_CONFIG_UNICODE_VERSION 10 -#define PCRE2_CONFIG_VERSION 11 -#define PCRE2_CONFIG_HEAPLIMIT 12 -#define PCRE2_CONFIG_NEVER_BACKSLASH_C 13 -#define PCRE2_CONFIG_COMPILED_WIDTHS 14 -#define PCRE2_CONFIG_TABLES_LENGTH 15 - - -/* Types for code units in patterns and subject strings. 
*/ - -typedef uint8_t PCRE2_UCHAR8; -typedef uint16_t PCRE2_UCHAR16; -typedef uint32_t PCRE2_UCHAR32; - -typedef const PCRE2_UCHAR8 *PCRE2_SPTR8; -typedef const PCRE2_UCHAR16 *PCRE2_SPTR16; -typedef const PCRE2_UCHAR32 *PCRE2_SPTR32; - -/* The PCRE2_SIZE type is used for all string lengths and offsets in PCRE2, -including pattern offsets for errors and subject offsets after a match. We -define special values to indicate zero-terminated strings and unset offsets in -the offset vector (ovector). */ - -#define PCRE2_SIZE size_t -#define PCRE2_SIZE_MAX SIZE_MAX -#define PCRE2_ZERO_TERMINATED (~(PCRE2_SIZE)0) -#define PCRE2_UNSET (~(PCRE2_SIZE)0) - -/* Generic types for opaque structures and JIT callback functions. These -declarations are defined in a macro that is expanded for each width later. */ - -#define PCRE2_TYPES_LIST \ -struct pcre2_real_general_context; \ -typedef struct pcre2_real_general_context pcre2_general_context; \ -\ -struct pcre2_real_compile_context; \ -typedef struct pcre2_real_compile_context pcre2_compile_context; \ -\ -struct pcre2_real_match_context; \ -typedef struct pcre2_real_match_context pcre2_match_context; \ -\ -struct pcre2_real_convert_context; \ -typedef struct pcre2_real_convert_context pcre2_convert_context; \ -\ -struct pcre2_real_code; \ -typedef struct pcre2_real_code pcre2_code; \ -\ -struct pcre2_real_match_data; \ -typedef struct pcre2_real_match_data pcre2_match_data; \ -\ -struct pcre2_real_jit_stack; \ -typedef struct pcre2_real_jit_stack pcre2_jit_stack; \ -\ -typedef pcre2_jit_stack *(*pcre2_jit_callback)(void *); - - -/* The structures for passing out data via callout functions. We use structures -so that new fields can be added on the end in future versions, without changing -the API of the function, thereby allowing old clients to work without -modification. Define the generic versions in a macro; the width-specific -versions are generated from this macro below. */ - -/* Flags for the callout_flags field. 
These are cleared after a callout. */ - -#define PCRE2_CALLOUT_STARTMATCH 0x00000001u /* Set for each bumpalong */ -#define PCRE2_CALLOUT_BACKTRACK 0x00000002u /* Set after a backtrack */ - -#define PCRE2_STRUCTURE_LIST \ -typedef struct pcre2_callout_block { \ - uint32_t version; /* Identifies version of block */ \ - /* ------------------------ Version 0 ------------------------------- */ \ - uint32_t callout_number; /* Number compiled into pattern */ \ - uint32_t capture_top; /* Max current capture */ \ - uint32_t capture_last; /* Most recently closed capture */ \ - PCRE2_SIZE *offset_vector; /* The offset vector */ \ - PCRE2_SPTR mark; /* Pointer to current mark or NULL */ \ - PCRE2_SPTR subject; /* The subject being matched */ \ - PCRE2_SIZE subject_length; /* The length of the subject */ \ - PCRE2_SIZE start_match; /* Offset to start of this match attempt */ \ - PCRE2_SIZE current_position; /* Where we currently are in the subject */ \ - PCRE2_SIZE pattern_position; /* Offset to next item in the pattern */ \ - PCRE2_SIZE next_item_length; /* Length of next item in the pattern */ \ - /* ------------------- Added for Version 1 -------------------------- */ \ - PCRE2_SIZE callout_string_offset; /* Offset to string within pattern */ \ - PCRE2_SIZE callout_string_length; /* Length of string compiled into pattern */ \ - PCRE2_SPTR callout_string; /* String compiled into pattern */ \ - /* ------------------- Added for Version 2 -------------------------- */ \ - uint32_t callout_flags; /* See above for list */ \ - /* ------------------------------------------------------------------ */ \ -} pcre2_callout_block; \ -\ -typedef struct pcre2_callout_enumerate_block { \ - uint32_t version; /* Identifies version of block */ \ - /* ------------------------ Version 0 ------------------------------- */ \ - PCRE2_SIZE pattern_position; /* Offset to next item in the pattern */ \ - PCRE2_SIZE next_item_length; /* Length of next item in the pattern */ \ - uint32_t callout_number; 
/* Number compiled into pattern */ \ - PCRE2_SIZE callout_string_offset; /* Offset to string within pattern */ \ - PCRE2_SIZE callout_string_length; /* Length of string compiled into pattern */ \ - PCRE2_SPTR callout_string; /* String compiled into pattern */ \ - /* ------------------------------------------------------------------ */ \ -} pcre2_callout_enumerate_block; \ -\ -typedef struct pcre2_substitute_callout_block { \ - uint32_t version; /* Identifies version of block */ \ - /* ------------------------ Version 0 ------------------------------- */ \ - PCRE2_SPTR input; /* Pointer to input subject string */ \ - PCRE2_SPTR output; /* Pointer to output buffer */ \ - PCRE2_SIZE output_offsets[2]; /* Changed portion of the output */ \ - PCRE2_SIZE *ovector; /* Pointer to current ovector */ \ - uint32_t oveccount; /* Count of pairs set in ovector */ \ - uint32_t subscount; /* Substitution number */ \ - /* ------------------------------------------------------------------ */ \ -} pcre2_substitute_callout_block; - - -/* List the generic forms of all other functions in macros, which will be -expanded for each width below. Start with functions that give general -information. */ - -#define PCRE2_GENERAL_INFO_FUNCTIONS \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION pcre2_config(uint32_t, void *); - - -/* Functions for manipulating contexts. 
*/ - -#define PCRE2_GENERAL_CONTEXT_FUNCTIONS \ -PCRE2_EXP_DECL pcre2_general_context PCRE2_CALL_CONVENTION \ - *pcre2_general_context_copy(pcre2_general_context *); \ -PCRE2_EXP_DECL pcre2_general_context PCRE2_CALL_CONVENTION \ - *pcre2_general_context_create(void *(*)(PCRE2_SIZE, void *), \ - void (*)(void *, void *), void *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_general_context_free(pcre2_general_context *); - -#define PCRE2_COMPILE_CONTEXT_FUNCTIONS \ -PCRE2_EXP_DECL pcre2_compile_context PCRE2_CALL_CONVENTION \ - *pcre2_compile_context_copy(pcre2_compile_context *); \ -PCRE2_EXP_DECL pcre2_compile_context PCRE2_CALL_CONVENTION \ - *pcre2_compile_context_create(pcre2_general_context *);\ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_compile_context_free(pcre2_compile_context *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_bsr(pcre2_compile_context *, uint32_t); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_character_tables(pcre2_compile_context *, const uint8_t *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_compile_extra_options(pcre2_compile_context *, uint32_t); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_max_pattern_length(pcre2_compile_context *, PCRE2_SIZE); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_newline(pcre2_compile_context *, uint32_t); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_parens_nest_limit(pcre2_compile_context *, uint32_t); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_compile_recursion_guard(pcre2_compile_context *, \ - int (*)(uint32_t, void *), void *); - -#define PCRE2_MATCH_CONTEXT_FUNCTIONS \ -PCRE2_EXP_DECL pcre2_match_context PCRE2_CALL_CONVENTION \ - *pcre2_match_context_copy(pcre2_match_context *); \ -PCRE2_EXP_DECL pcre2_match_context PCRE2_CALL_CONVENTION \ - *pcre2_match_context_create(pcre2_general_context *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_match_context_free(pcre2_match_context *); 
\ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_callout(pcre2_match_context *, \ - int (*)(pcre2_callout_block *, void *), void *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_substitute_callout(pcre2_match_context *, \ - int (*)(pcre2_substitute_callout_block *, void *), void *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_depth_limit(pcre2_match_context *, uint32_t); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_heap_limit(pcre2_match_context *, uint32_t); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_match_limit(pcre2_match_context *, uint32_t); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_offset_limit(pcre2_match_context *, PCRE2_SIZE); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_recursion_limit(pcre2_match_context *, uint32_t); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_recursion_memory_management(pcre2_match_context *, \ - void *(*)(PCRE2_SIZE, void *), void (*)(void *, void *), void *); - -#define PCRE2_CONVERT_CONTEXT_FUNCTIONS \ -PCRE2_EXP_DECL pcre2_convert_context PCRE2_CALL_CONVENTION \ - *pcre2_convert_context_copy(pcre2_convert_context *); \ -PCRE2_EXP_DECL pcre2_convert_context PCRE2_CALL_CONVENTION \ - *pcre2_convert_context_create(pcre2_general_context *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_convert_context_free(pcre2_convert_context *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_glob_escape(pcre2_convert_context *, uint32_t); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_set_glob_separator(pcre2_convert_context *, uint32_t); - - -/* Functions concerned with compiling a pattern to PCRE internal code. 
*/ - -#define PCRE2_COMPILE_FUNCTIONS \ -PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \ - *pcre2_compile(PCRE2_SPTR, PCRE2_SIZE, uint32_t, int *, PCRE2_SIZE *, \ - pcre2_compile_context *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_code_free(pcre2_code *); \ -PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \ - *pcre2_code_copy(const pcre2_code *); \ -PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \ - *pcre2_code_copy_with_tables(const pcre2_code *); - - -/* Functions that give information about a compiled pattern. */ - -#define PCRE2_PATTERN_INFO_FUNCTIONS \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_pattern_info(const pcre2_code *, uint32_t, void *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_callout_enumerate(const pcre2_code *, \ - int (*)(pcre2_callout_enumerate_block *, void *), void *); - - -/* Functions for running a match and inspecting the result. */ - -#define PCRE2_MATCH_FUNCTIONS \ -PCRE2_EXP_DECL pcre2_match_data PCRE2_CALL_CONVENTION \ - *pcre2_match_data_create(uint32_t, pcre2_general_context *); \ -PCRE2_EXP_DECL pcre2_match_data PCRE2_CALL_CONVENTION \ - *pcre2_match_data_create_from_pattern(const pcre2_code *, \ - pcre2_general_context *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_dfa_match(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \ - uint32_t, pcre2_match_data *, pcre2_match_context *, int *, PCRE2_SIZE); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_match(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \ - uint32_t, pcre2_match_data *, pcre2_match_context *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_match_data_free(pcre2_match_data *); \ -PCRE2_EXP_DECL PCRE2_SPTR PCRE2_CALL_CONVENTION \ - pcre2_get_mark(pcre2_match_data *); \ -PCRE2_EXP_DECL PCRE2_SIZE PCRE2_CALL_CONVENTION \ - pcre2_get_match_data_size(pcre2_match_data *); \ -PCRE2_EXP_DECL uint32_t PCRE2_CALL_CONVENTION \ - pcre2_get_ovector_count(pcre2_match_data *); \ -PCRE2_EXP_DECL PCRE2_SIZE 
PCRE2_CALL_CONVENTION \ - *pcre2_get_ovector_pointer(pcre2_match_data *); \ -PCRE2_EXP_DECL PCRE2_SIZE PCRE2_CALL_CONVENTION \ - pcre2_get_startchar(pcre2_match_data *); - - -/* Convenience functions for handling matched substrings. */ - -#define PCRE2_SUBSTRING_FUNCTIONS \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_substring_copy_byname(pcre2_match_data *, PCRE2_SPTR, PCRE2_UCHAR *, \ - PCRE2_SIZE *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_substring_copy_bynumber(pcre2_match_data *, uint32_t, PCRE2_UCHAR *, \ - PCRE2_SIZE *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_substring_free(PCRE2_UCHAR *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_substring_get_byname(pcre2_match_data *, PCRE2_SPTR, PCRE2_UCHAR **, \ - PCRE2_SIZE *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_substring_get_bynumber(pcre2_match_data *, uint32_t, PCRE2_UCHAR **, \ - PCRE2_SIZE *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_substring_length_byname(pcre2_match_data *, PCRE2_SPTR, PCRE2_SIZE *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_substring_length_bynumber(pcre2_match_data *, uint32_t, PCRE2_SIZE *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_substring_nametable_scan(const pcre2_code *, PCRE2_SPTR, PCRE2_SPTR *, \ - PCRE2_SPTR *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_substring_number_from_name(const pcre2_code *, PCRE2_SPTR); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_substring_list_free(PCRE2_SPTR *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_substring_list_get(pcre2_match_data *, PCRE2_UCHAR ***, PCRE2_SIZE **); - -/* Functions for serializing / deserializing compiled patterns. 
*/ - -#define PCRE2_SERIALIZE_FUNCTIONS \ -PCRE2_EXP_DECL int32_t PCRE2_CALL_CONVENTION \ - pcre2_serialize_encode(const pcre2_code **, int32_t, uint8_t **, \ - PCRE2_SIZE *, pcre2_general_context *); \ -PCRE2_EXP_DECL int32_t PCRE2_CALL_CONVENTION \ - pcre2_serialize_decode(pcre2_code **, int32_t, const uint8_t *, \ - pcre2_general_context *); \ -PCRE2_EXP_DECL int32_t PCRE2_CALL_CONVENTION \ - pcre2_serialize_get_number_of_codes(const uint8_t *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_serialize_free(uint8_t *); - - -/* Convenience function for match + substitute. */ - -#define PCRE2_SUBSTITUTE_FUNCTION \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_substitute(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \ - uint32_t, pcre2_match_data *, pcre2_match_context *, PCRE2_SPTR, \ - PCRE2_SIZE, PCRE2_UCHAR *, PCRE2_SIZE *); - - -/* Functions for converting pattern source strings. */ - -#define PCRE2_CONVERT_FUNCTIONS \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_pattern_convert(PCRE2_SPTR, PCRE2_SIZE, uint32_t, PCRE2_UCHAR **, \ - PCRE2_SIZE *, pcre2_convert_context *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_converted_pattern_free(PCRE2_UCHAR *); - - -/* Functions for JIT processing */ - -#define PCRE2_JIT_FUNCTIONS \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_jit_compile(pcre2_code *, uint32_t); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_jit_match(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \ - uint32_t, pcre2_match_data *, pcre2_match_context *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_jit_free_unused_memory(pcre2_general_context *); \ -PCRE2_EXP_DECL pcre2_jit_stack PCRE2_CALL_CONVENTION \ - *pcre2_jit_stack_create(PCRE2_SIZE, PCRE2_SIZE, pcre2_general_context *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_jit_stack_assign(pcre2_match_context *, pcre2_jit_callback, void *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - 
pcre2_jit_stack_free(pcre2_jit_stack *); - - -/* Other miscellaneous functions. */ - -#define PCRE2_OTHER_FUNCTIONS \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_get_error_message(int, PCRE2_UCHAR *, PCRE2_SIZE); \ -PCRE2_EXP_DECL const uint8_t PCRE2_CALL_CONVENTION \ - *pcre2_maketables(pcre2_general_context *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_maketables_free(pcre2_general_context *, const uint8_t *); - -/* Define macros that generate width-specific names from generic versions. The -three-level macro scheme is necessary to get the macros expanded when we want -them to be. First we get the width from PCRE2_LOCAL_WIDTH, which is used for -generating three versions of everything below. After that, PCRE2_SUFFIX will be -re-defined to use PCRE2_CODE_UNIT_WIDTH, for use when macros such as -pcre2_compile are called by application code. */ - -#define PCRE2_JOIN(a,b) a ## b -#define PCRE2_GLUE(a,b) PCRE2_JOIN(a,b) -#define PCRE2_SUFFIX(a) PCRE2_GLUE(a,PCRE2_LOCAL_WIDTH) - - -/* Data types */ - -#define PCRE2_UCHAR PCRE2_SUFFIX(PCRE2_UCHAR) -#define PCRE2_SPTR PCRE2_SUFFIX(PCRE2_SPTR) - -#define pcre2_code PCRE2_SUFFIX(pcre2_code_) -#define pcre2_jit_callback PCRE2_SUFFIX(pcre2_jit_callback_) -#define pcre2_jit_stack PCRE2_SUFFIX(pcre2_jit_stack_) - -#define pcre2_real_code PCRE2_SUFFIX(pcre2_real_code_) -#define pcre2_real_general_context PCRE2_SUFFIX(pcre2_real_general_context_) -#define pcre2_real_compile_context PCRE2_SUFFIX(pcre2_real_compile_context_) -#define pcre2_real_convert_context PCRE2_SUFFIX(pcre2_real_convert_context_) -#define pcre2_real_match_context PCRE2_SUFFIX(pcre2_real_match_context_) -#define pcre2_real_jit_stack PCRE2_SUFFIX(pcre2_real_jit_stack_) -#define pcre2_real_match_data PCRE2_SUFFIX(pcre2_real_match_data_) - - -/* Data blocks */ - -#define pcre2_callout_block PCRE2_SUFFIX(pcre2_callout_block_) -#define pcre2_callout_enumerate_block PCRE2_SUFFIX(pcre2_callout_enumerate_block_) -#define 
pcre2_substitute_callout_block PCRE2_SUFFIX(pcre2_substitute_callout_block_) -#define pcre2_general_context PCRE2_SUFFIX(pcre2_general_context_) -#define pcre2_compile_context PCRE2_SUFFIX(pcre2_compile_context_) -#define pcre2_convert_context PCRE2_SUFFIX(pcre2_convert_context_) -#define pcre2_match_context PCRE2_SUFFIX(pcre2_match_context_) -#define pcre2_match_data PCRE2_SUFFIX(pcre2_match_data_) - - -/* Functions: the complete list in alphabetical order */ - -#define pcre2_callout_enumerate PCRE2_SUFFIX(pcre2_callout_enumerate_) -#define pcre2_code_copy PCRE2_SUFFIX(pcre2_code_copy_) -#define pcre2_code_copy_with_tables PCRE2_SUFFIX(pcre2_code_copy_with_tables_) -#define pcre2_code_free PCRE2_SUFFIX(pcre2_code_free_) -#define pcre2_compile PCRE2_SUFFIX(pcre2_compile_) -#define pcre2_compile_context_copy PCRE2_SUFFIX(pcre2_compile_context_copy_) -#define pcre2_compile_context_create PCRE2_SUFFIX(pcre2_compile_context_create_) -#define pcre2_compile_context_free PCRE2_SUFFIX(pcre2_compile_context_free_) -#define pcre2_config PCRE2_SUFFIX(pcre2_config_) -#define pcre2_convert_context_copy PCRE2_SUFFIX(pcre2_convert_context_copy_) -#define pcre2_convert_context_create PCRE2_SUFFIX(pcre2_convert_context_create_) -#define pcre2_convert_context_free PCRE2_SUFFIX(pcre2_convert_context_free_) -#define pcre2_converted_pattern_free PCRE2_SUFFIX(pcre2_converted_pattern_free_) -#define pcre2_dfa_match PCRE2_SUFFIX(pcre2_dfa_match_) -#define pcre2_general_context_copy PCRE2_SUFFIX(pcre2_general_context_copy_) -#define pcre2_general_context_create PCRE2_SUFFIX(pcre2_general_context_create_) -#define pcre2_general_context_free PCRE2_SUFFIX(pcre2_general_context_free_) -#define pcre2_get_error_message PCRE2_SUFFIX(pcre2_get_error_message_) -#define pcre2_get_mark PCRE2_SUFFIX(pcre2_get_mark_) -#define pcre2_get_match_data_size PCRE2_SUFFIX(pcre2_get_match_data_size_) -#define pcre2_get_ovector_pointer PCRE2_SUFFIX(pcre2_get_ovector_pointer_) -#define pcre2_get_ovector_count 
PCRE2_SUFFIX(pcre2_get_ovector_count_) -#define pcre2_get_startchar PCRE2_SUFFIX(pcre2_get_startchar_) -#define pcre2_jit_compile PCRE2_SUFFIX(pcre2_jit_compile_) -#define pcre2_jit_match PCRE2_SUFFIX(pcre2_jit_match_) -#define pcre2_jit_free_unused_memory PCRE2_SUFFIX(pcre2_jit_free_unused_memory_) -#define pcre2_jit_stack_assign PCRE2_SUFFIX(pcre2_jit_stack_assign_) -#define pcre2_jit_stack_create PCRE2_SUFFIX(pcre2_jit_stack_create_) -#define pcre2_jit_stack_free PCRE2_SUFFIX(pcre2_jit_stack_free_) -#define pcre2_maketables PCRE2_SUFFIX(pcre2_maketables_) -#define pcre2_maketables_free PCRE2_SUFFIX(pcre2_maketables_free_) -#define pcre2_match PCRE2_SUFFIX(pcre2_match_) -#define pcre2_match_context_copy PCRE2_SUFFIX(pcre2_match_context_copy_) -#define pcre2_match_context_create PCRE2_SUFFIX(pcre2_match_context_create_) -#define pcre2_match_context_free PCRE2_SUFFIX(pcre2_match_context_free_) -#define pcre2_match_data_create PCRE2_SUFFIX(pcre2_match_data_create_) -#define pcre2_match_data_create_from_pattern PCRE2_SUFFIX(pcre2_match_data_create_from_pattern_) -#define pcre2_match_data_free PCRE2_SUFFIX(pcre2_match_data_free_) -#define pcre2_pattern_convert PCRE2_SUFFIX(pcre2_pattern_convert_) -#define pcre2_pattern_info PCRE2_SUFFIX(pcre2_pattern_info_) -#define pcre2_serialize_decode PCRE2_SUFFIX(pcre2_serialize_decode_) -#define pcre2_serialize_encode PCRE2_SUFFIX(pcre2_serialize_encode_) -#define pcre2_serialize_free PCRE2_SUFFIX(pcre2_serialize_free_) -#define pcre2_serialize_get_number_of_codes PCRE2_SUFFIX(pcre2_serialize_get_number_of_codes_) -#define pcre2_set_bsr PCRE2_SUFFIX(pcre2_set_bsr_) -#define pcre2_set_callout PCRE2_SUFFIX(pcre2_set_callout_) -#define pcre2_set_character_tables PCRE2_SUFFIX(pcre2_set_character_tables_) -#define pcre2_set_compile_extra_options PCRE2_SUFFIX(pcre2_set_compile_extra_options_) -#define pcre2_set_compile_recursion_guard PCRE2_SUFFIX(pcre2_set_compile_recursion_guard_) -#define pcre2_set_depth_limit 
PCRE2_SUFFIX(pcre2_set_depth_limit_) -#define pcre2_set_glob_escape PCRE2_SUFFIX(pcre2_set_glob_escape_) -#define pcre2_set_glob_separator PCRE2_SUFFIX(pcre2_set_glob_separator_) -#define pcre2_set_heap_limit PCRE2_SUFFIX(pcre2_set_heap_limit_) -#define pcre2_set_match_limit PCRE2_SUFFIX(pcre2_set_match_limit_) -#define pcre2_set_max_pattern_length PCRE2_SUFFIX(pcre2_set_max_pattern_length_) -#define pcre2_set_newline PCRE2_SUFFIX(pcre2_set_newline_) -#define pcre2_set_parens_nest_limit PCRE2_SUFFIX(pcre2_set_parens_nest_limit_) -#define pcre2_set_offset_limit PCRE2_SUFFIX(pcre2_set_offset_limit_) -#define pcre2_set_substitute_callout PCRE2_SUFFIX(pcre2_set_substitute_callout_) -#define pcre2_substitute PCRE2_SUFFIX(pcre2_substitute_) -#define pcre2_substring_copy_byname PCRE2_SUFFIX(pcre2_substring_copy_byname_) -#define pcre2_substring_copy_bynumber PCRE2_SUFFIX(pcre2_substring_copy_bynumber_) -#define pcre2_substring_free PCRE2_SUFFIX(pcre2_substring_free_) -#define pcre2_substring_get_byname PCRE2_SUFFIX(pcre2_substring_get_byname_) -#define pcre2_substring_get_bynumber PCRE2_SUFFIX(pcre2_substring_get_bynumber_) -#define pcre2_substring_length_byname PCRE2_SUFFIX(pcre2_substring_length_byname_) -#define pcre2_substring_length_bynumber PCRE2_SUFFIX(pcre2_substring_length_bynumber_) -#define pcre2_substring_list_get PCRE2_SUFFIX(pcre2_substring_list_get_) -#define pcre2_substring_list_free PCRE2_SUFFIX(pcre2_substring_list_free_) -#define pcre2_substring_nametable_scan PCRE2_SUFFIX(pcre2_substring_nametable_scan_) -#define pcre2_substring_number_from_name PCRE2_SUFFIX(pcre2_substring_number_from_name_) - -/* Keep this old function name for backwards compatibility */ -#define pcre2_set_recursion_limit PCRE2_SUFFIX(pcre2_set_recursion_limit_) - -/* Keep this obsolete function for backwards compatibility: it is now a noop. 
*/ -#define pcre2_set_recursion_memory_management PCRE2_SUFFIX(pcre2_set_recursion_memory_management_) - -/* Now generate all three sets of width-specific structures and function -prototypes. */ - -#define PCRE2_TYPES_STRUCTURES_AND_FUNCTIONS \ -PCRE2_TYPES_LIST \ -PCRE2_STRUCTURE_LIST \ -PCRE2_GENERAL_INFO_FUNCTIONS \ -PCRE2_GENERAL_CONTEXT_FUNCTIONS \ -PCRE2_COMPILE_CONTEXT_FUNCTIONS \ -PCRE2_CONVERT_CONTEXT_FUNCTIONS \ -PCRE2_CONVERT_FUNCTIONS \ -PCRE2_MATCH_CONTEXT_FUNCTIONS \ -PCRE2_COMPILE_FUNCTIONS \ -PCRE2_PATTERN_INFO_FUNCTIONS \ -PCRE2_MATCH_FUNCTIONS \ -PCRE2_SUBSTRING_FUNCTIONS \ -PCRE2_SERIALIZE_FUNCTIONS \ -PCRE2_SUBSTITUTE_FUNCTION \ -PCRE2_JIT_FUNCTIONS \ -PCRE2_OTHER_FUNCTIONS - -#define PCRE2_LOCAL_WIDTH 8 -PCRE2_TYPES_STRUCTURES_AND_FUNCTIONS -#undef PCRE2_LOCAL_WIDTH - -#define PCRE2_LOCAL_WIDTH 16 -PCRE2_TYPES_STRUCTURES_AND_FUNCTIONS -#undef PCRE2_LOCAL_WIDTH - -#define PCRE2_LOCAL_WIDTH 32 -PCRE2_TYPES_STRUCTURES_AND_FUNCTIONS -#undef PCRE2_LOCAL_WIDTH - -/* Undefine the list macros; they are no longer needed. */ - -#undef PCRE2_TYPES_LIST -#undef PCRE2_STRUCTURE_LIST -#undef PCRE2_GENERAL_INFO_FUNCTIONS -#undef PCRE2_GENERAL_CONTEXT_FUNCTIONS -#undef PCRE2_COMPILE_CONTEXT_FUNCTIONS -#undef PCRE2_CONVERT_CONTEXT_FUNCTIONS -#undef PCRE2_MATCH_CONTEXT_FUNCTIONS -#undef PCRE2_COMPILE_FUNCTIONS -#undef PCRE2_PATTERN_INFO_FUNCTIONS -#undef PCRE2_MATCH_FUNCTIONS -#undef PCRE2_SUBSTRING_FUNCTIONS -#undef PCRE2_SERIALIZE_FUNCTIONS -#undef PCRE2_SUBSTITUTE_FUNCTION -#undef PCRE2_JIT_FUNCTIONS -#undef PCRE2_OTHER_FUNCTIONS -#undef PCRE2_TYPES_STRUCTURES_AND_FUNCTIONS - -/* PCRE2_CODE_UNIT_WIDTH must be defined. If it is 8, 16, or 32, redefine -PCRE2_SUFFIX to use it. If it is 0, undefine the other macros and make -PCRE2_SUFFIX a no-op. Otherwise, generate an error. */ - -#undef PCRE2_SUFFIX -#ifndef PCRE2_CODE_UNIT_WIDTH -#error PCRE2_CODE_UNIT_WIDTH must be defined before including pcre2.h. -#error Use 8, 16, or 32; or 0 for a multi-width application. 
-#else /* PCRE2_CODE_UNIT_WIDTH is defined */ -#if PCRE2_CODE_UNIT_WIDTH == 8 || \ - PCRE2_CODE_UNIT_WIDTH == 16 || \ - PCRE2_CODE_UNIT_WIDTH == 32 -#define PCRE2_SUFFIX(a) PCRE2_GLUE(a, PCRE2_CODE_UNIT_WIDTH) -#elif PCRE2_CODE_UNIT_WIDTH == 0 -#undef PCRE2_JOIN -#undef PCRE2_GLUE -#define PCRE2_SUFFIX(a) a -#else -#error PCRE2_CODE_UNIT_WIDTH must be 0, 8, 16, or 32. -#endif -#endif /* PCRE2_CODE_UNIT_WIDTH is defined */ - -#ifdef __cplusplus -} /* extern "C" */ -#endif - -#endif /* PCRE2_H_IDEMPOTENT_GUARD */ - -/* End of pcre2.h */ diff --git a/pcre2/src/pcre2_auto_possess.c b/pcre2/src/pcre2_auto_possess.c deleted file mode 100644 index c64cf856d..000000000 --- a/pcre2/src/pcre2_auto_possess.c +++ /dev/null @@ -1,1343 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2020 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - -/* This module contains functions that scan a compiled pattern and change -repeats into possessive repeats where possible. */ - - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - - -#include "pcre2_internal.h" - - -/************************************************* -* Tables for auto-possessification * -*************************************************/ - -/* This table is used to check whether auto-possessification is possible -between adjacent character-type opcodes. The left-hand (repeated) opcode is -used to select the row, and the right-hand opcode is use to select the column. -A value of 1 means that auto-possessification is OK. For example, the second -value in the first row means that \D+\d can be turned into \D++\d. - -The Unicode property types (\P and \p) have to be present to fill out the table -because of what their opcode values are, but the table values should always be -zero because property types are handled separately in the code. The last four -columns apply to items that cannot be repeated, so there is no need to have -rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is -*not* set. 
When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ - -#define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1) -#define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1) - -static const uint8_t autoposstab[APTROWS][APTCOLS] = { -/* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */ - { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */ - { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */ - { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */ - { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */ - { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */ - { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */ - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */ - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */ - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */ - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */ - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */ - { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */ - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */ - { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */ - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */ - { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */ - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */ -}; - -#ifdef SUPPORT_UNICODE -/* This table is used to check whether auto-possessification is possible -between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The -left-hand (repeated) opcode is used to select the row, and the right-hand -opcode is used to select the column. 
The values are as follows: - - 0 Always return FALSE (never auto-possessify) - 1 Character groups are distinct (possessify if both are OP_PROP) - 2 Check character categories in the same group (general or particular) - 3 TRUE if the two opcodes are not the same (PROP vs NOTPROP) - - 4 Check left general category vs right particular category - 5 Check right general category vs left particular category - - 6 Left alphanum vs right general category - 7 Left space vs right general category - 8 Left word vs right general category - - 9 Right alphanum vs left general category - 10 Right space vs left general category - 11 Right word vs left general category - - 12 Left alphanum vs right particular category - 13 Left space vs right particular category - 14 Left word vs right particular category - - 15 Right alphanum vs left particular category - 16 Right space vs left particular category - 17 Right word vs left particular category -*/ - -static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = { -/* ANY LAMP GC PC SC ALNUM SPACE PXSPACE WORD CLIST UCNC */ - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */ - { 0, 3, 0, 0, 0, 3, 1, 1, 0, 0, 0 }, /* PT_LAMP */ - { 0, 0, 2, 4, 0, 9, 10, 10, 11, 0, 0 }, /* PT_GC */ - { 0, 0, 5, 2, 0, 15, 16, 16, 17, 0, 0 }, /* PT_PC */ - { 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, /* PT_SC */ - { 0, 3, 6, 12, 0, 3, 1, 1, 0, 0, 0 }, /* PT_ALNUM */ - { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_SPACE */ - { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_PXSPACE */ - { 0, 0, 8, 14, 0, 0, 1, 1, 3, 0, 0 }, /* PT_WORD */ - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */ - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 } /* PT_UCNC */ -}; - -/* This table is used to check whether auto-possessification is possible -between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one -specifies a general category and the other specifies a particular category. The -row is selected by the general category and the column by the particular -category. 
The value is 1 if the particular category is not part of the general -category. */ - -static const uint8_t catposstab[7][30] = { -/* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */ - { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */ - { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */ - { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */ - { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */ - { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */ - { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */ - { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */ -}; - -/* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against -a general or particular category. The properties in each row are those -that apply to the character set in question. Duplication means that a little -unnecessary work is done when checking, but this keeps things much simpler -because they can all use the same code. For more details see the comment where -this table is used. - -Note: SPACE and PXSPACE used to be different because Perl excluded VT from -"space", but from Perl 5.18 it's included, so both categories are treated the -same here. 
*/ - -static const uint8_t posspropstab[3][4] = { - { ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */ - { ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */ - { ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */ -}; -#endif /* SUPPORT_UNICODE */ - - - -#ifdef SUPPORT_UNICODE -/************************************************* -* Check a character and a property * -*************************************************/ - -/* This function is called by compare_opcodes() when a property item is -adjacent to a fixed character. - -Arguments: - c the character - ptype the property type - pdata the data for the type - negated TRUE if it's a negated property (\P or \p{^) - -Returns: TRUE if auto-possessifying is OK -*/ - -static BOOL -check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata, - BOOL negated) -{ -const uint32_t *p; -const ucd_record *prop = GET_UCD(c); - -switch(ptype) - { - case PT_LAMP: - return (prop->chartype == ucp_Lu || - prop->chartype == ucp_Ll || - prop->chartype == ucp_Lt) == negated; - - case PT_GC: - return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated; - - case PT_PC: - return (pdata == prop->chartype) == negated; - - case PT_SC: - return (pdata == prop->script) == negated; - - /* These are specials */ - - case PT_ALNUM: - return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated; - - /* Perl space used to exclude VT, but from Perl 5.18 it is included, which - means that Perl space and POSIX space are now identical. PCRE was changed - at release 8.34. 
*/ - - case PT_SPACE: /* Perl space */ - case PT_PXSPACE: /* POSIX space */ - switch(c) - { - HSPACE_CASES: - VSPACE_CASES: - return negated; - - default: - return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated; - } - break; /* Control never reaches here */ - - case PT_WORD: - return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N || - c == CHAR_UNDERSCORE) == negated; - - case PT_CLIST: - p = PRIV(ucd_caseless_sets) + prop->caseset; - for (;;) - { - if (c < *p) return !negated; - if (c == *p++) return negated; - } - break; /* Control never reaches here */ - } - -return FALSE; -} -#endif /* SUPPORT_UNICODE */ - - - -/************************************************* -* Base opcode of repeated opcodes * -*************************************************/ - -/* Returns the base opcode for repeated single character type opcodes. If the -opcode is not a repeated character type, it returns with the original value. - -Arguments: c opcode -Returns: base opcode for the type -*/ - -static PCRE2_UCHAR -get_repeat_base(PCRE2_UCHAR c) -{ -return (c > OP_TYPEPOSUPTO)? c : - (c >= OP_TYPESTAR)? OP_TYPESTAR : - (c >= OP_NOTSTARI)? OP_NOTSTARI : - (c >= OP_NOTSTAR)? OP_NOTSTAR : - (c >= OP_STARI)? OP_STARI : - OP_STAR; -} - - -/************************************************* -* Fill the character property list * -*************************************************/ - -/* Checks whether the code points to an opcode that can take part in auto- -possessification, and if so, fills a list with its properties. 
- -Arguments: - code points to start of expression - utf TRUE if in UTF mode - ucp TRUE if in UCP mode - fcc points to the case-flipping table - list points to output list - list[0] will be filled with the opcode - list[1] will be non-zero if this opcode - can match an empty character string - list[2..7] depends on the opcode - -Returns: points to the start of the next opcode if *code is accepted - NULL if *code is not accepted -*/ - -static PCRE2_SPTR -get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc, - uint32_t *list) -{ -PCRE2_UCHAR c = *code; -PCRE2_UCHAR base; -PCRE2_SPTR end; -uint32_t chr; - -#ifdef SUPPORT_UNICODE -uint32_t *clist_dest; -const uint32_t *clist_src; -#else -(void)utf; /* Suppress "unused parameter" compiler warnings */ -(void)ucp; -#endif - -list[0] = c; -list[1] = FALSE; -code++; - -if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) - { - base = get_repeat_base(c); - c -= (base - OP_STAR); - - if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO) - code += IMM2_SIZE; - - list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && - c != OP_POSPLUS); - - switch(base) - { - case OP_STAR: - list[0] = OP_CHAR; - break; - - case OP_STARI: - list[0] = OP_CHARI; - break; - - case OP_NOTSTAR: - list[0] = OP_NOT; - break; - - case OP_NOTSTARI: - list[0] = OP_NOTI; - break; - - case OP_TYPESTAR: - list[0] = *code; - code++; - break; - } - c = list[0]; - } - -switch(c) - { - case OP_NOT_DIGIT: - case OP_DIGIT: - case OP_NOT_WHITESPACE: - case OP_WHITESPACE: - case OP_NOT_WORDCHAR: - case OP_WORDCHAR: - case OP_ANY: - case OP_ALLANY: - case OP_ANYNL: - case OP_NOT_HSPACE: - case OP_HSPACE: - case OP_NOT_VSPACE: - case OP_VSPACE: - case OP_EXTUNI: - case OP_EODN: - case OP_EOD: - case OP_DOLL: - case OP_DOLLM: - return code; - - case OP_CHAR: - case OP_NOT: - GETCHARINCTEST(chr, code); - list[2] = chr; - list[3] = NOTACHAR; - return code; - - case OP_CHARI: - case OP_NOTI: - list[0] = (c == OP_CHARI) ? 
OP_CHAR : OP_NOT; - GETCHARINCTEST(chr, code); - list[2] = chr; - -#ifdef SUPPORT_UNICODE - if (chr < 128 || (chr < 256 && !utf && !ucp)) - list[3] = fcc[chr]; - else - list[3] = UCD_OTHERCASE(chr); -#elif defined SUPPORT_WIDE_CHARS - list[3] = (chr < 256) ? fcc[chr] : chr; -#else - list[3] = fcc[chr]; -#endif - - /* The othercase might be the same value. */ - - if (chr == list[3]) - list[3] = NOTACHAR; - else - list[4] = NOTACHAR; - return code; - -#ifdef SUPPORT_UNICODE - case OP_PROP: - case OP_NOTPROP: - if (code[0] != PT_CLIST) - { - list[2] = code[0]; - list[3] = code[1]; - return code + 2; - } - - /* Convert only if we have enough space. */ - - clist_src = PRIV(ucd_caseless_sets) + code[1]; - clist_dest = list + 2; - code += 2; - - do { - if (clist_dest >= list + 8) - { - /* Early return if there is not enough space. This should never - happen, since all clists are shorter than 5 character now. */ - list[2] = code[0]; - list[3] = code[1]; - return code; - } - *clist_dest++ = *clist_src; - } - while(*clist_src++ != NOTACHAR); - - /* All characters are stored. The terminating NOTACHAR is copied from the - clist itself. */ - - list[0] = (c == OP_PROP) ? 
OP_CHAR : OP_NOT; - return code; -#endif - - case OP_NCLASS: - case OP_CLASS: -#ifdef SUPPORT_WIDE_CHARS - case OP_XCLASS: - if (c == OP_XCLASS) - end = code + GET(code, 0) - 1; - else -#endif - end = code + 32 / sizeof(PCRE2_UCHAR); - - switch(*end) - { - case OP_CRSTAR: - case OP_CRMINSTAR: - case OP_CRQUERY: - case OP_CRMINQUERY: - case OP_CRPOSSTAR: - case OP_CRPOSQUERY: - list[1] = TRUE; - end++; - break; - - case OP_CRPLUS: - case OP_CRMINPLUS: - case OP_CRPOSPLUS: - end++; - break; - - case OP_CRRANGE: - case OP_CRMINRANGE: - case OP_CRPOSRANGE: - list[1] = (GET2(end, 1) == 0); - end += 1 + 2 * IMM2_SIZE; - break; - } - list[2] = (uint32_t)(end - code); - return end; - } -return NULL; /* Opcode not accepted */ -} - - - -/************************************************* -* Scan further character sets for match * -*************************************************/ - -/* Checks whether the base and the current opcode have a common character, in -which case the base cannot be possessified. 
- -Arguments: - code points to the byte code - utf TRUE in UTF mode - ucp TRUE in UCP mode - cb compile data block - base_list the data list of the base opcode - base_end the end of the base opcode - rec_limit points to recursion depth counter - -Returns: TRUE if the auto-possessification is possible -*/ - -static BOOL -compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb, - const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit) -{ -PCRE2_UCHAR c; -uint32_t list[8]; -const uint32_t *chr_ptr; -const uint32_t *ochr_ptr; -const uint32_t *list_ptr; -PCRE2_SPTR next_code; -#ifdef SUPPORT_WIDE_CHARS -PCRE2_SPTR xclass_flags; -#endif -const uint8_t *class_bitset; -const uint8_t *set1, *set2, *set_end; -uint32_t chr; -BOOL accepted, invert_bits; -BOOL entered_a_group = FALSE; - -if (--(*rec_limit) <= 0) return FALSE; /* Recursion has gone too deep */ - -/* Note: the base_list[1] contains whether the current opcode has a greedy -(represented by a non-zero value) quantifier. This is a different from -other character type lists, which store here that the character iterator -matches to an empty string (also represented by a non-zero value). */ - -for(;;) - { - /* All operations move the code pointer forward. - Therefore infinite recursions are not possible. */ - - c = *code; - - /* Skip over callouts */ - - if (c == OP_CALLOUT) - { - code += PRIV(OP_lengths)[c]; - continue; - } - - if (c == OP_CALLOUT_STR) - { - code += GET(code, 1 + 2*LINK_SIZE); - continue; - } - - /* At the end of a branch, skip to the end of the group. */ - - if (c == OP_ALT) - { - do code += GET(code, 1); while (*code == OP_ALT); - c = *code; - } - - /* Inspect the next opcode. */ - - switch(c) - { - /* We can always possessify a greedy iterator at the end of the pattern, - which is reached after skipping over the final OP_KET. A non-greedy - iterator must never be possessified. 
*/ - - case OP_END: - return base_list[1] != 0; - - /* When an iterator is at the end of certain kinds of group we can inspect - what follows the group by skipping over the closing ket. Note that this - does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given - iteration is variable (could be another iteration or could be the next - item). As these two opcodes are not listed in the next switch, they will - end up as the next code to inspect, and return FALSE by virtue of being - unsupported. */ - - case OP_KET: - case OP_KETRPOS: - /* The non-greedy case cannot be converted to a possessive form. */ - - if (base_list[1] == 0) return FALSE; - - /* If the bracket is capturing it might be referenced by an OP_RECURSE - so its last iterator can never be possessified if the pattern contains - recursions. (This could be improved by keeping a list of group numbers that - are called by recursion.) */ - - switch(*(code - GET(code, 1))) - { - case OP_CBRA: - case OP_SCBRA: - case OP_CBRAPOS: - case OP_SCBRAPOS: - if (cb->had_recurse) return FALSE; - break; - - /* A script run might have to backtrack if the iterated item can match - characters from more than one script. So give up unless repeating an - explicit character. */ - - case OP_SCRIPT_RUN: - if (base_list[0] != OP_CHAR && base_list[0] != OP_CHARI) - return FALSE; - break; - - /* Atomic sub-patterns and assertions can always auto-possessify their - last iterator. However, if the group was entered as a result of checking - a previous iterator, this is not possible. */ - - case OP_ASSERT: - case OP_ASSERT_NOT: - case OP_ASSERTBACK: - case OP_ASSERTBACK_NOT: - case OP_ONCE: - return !entered_a_group; - - /* Non-atomic assertions - don't possessify last iterator. This needs - more thought. */ - - case OP_ASSERT_NA: - case OP_ASSERTBACK_NA: - return FALSE; - } - - /* Skip over the bracket and inspect what comes next. 
*/ - - code += PRIV(OP_lengths)[c]; - continue; - - /* Handle cases where the next item is a group. */ - - case OP_ONCE: - case OP_BRA: - case OP_CBRA: - next_code = code + GET(code, 1); - code += PRIV(OP_lengths)[c]; - - /* Check each branch. We have to recurse a level for all but the last - branch. */ - - while (*next_code == OP_ALT) - { - if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit)) - return FALSE; - code = next_code + 1 + LINK_SIZE; - next_code += GET(next_code, 1); - } - - entered_a_group = TRUE; - continue; - - case OP_BRAZERO: - case OP_BRAMINZERO: - - next_code = code + 1; - if (*next_code != OP_BRA && *next_code != OP_CBRA && - *next_code != OP_ONCE) return FALSE; - - do next_code += GET(next_code, 1); while (*next_code == OP_ALT); - - /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */ - - next_code += 1 + LINK_SIZE; - if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end, - rec_limit)) - return FALSE; - - code += PRIV(OP_lengths)[c]; - continue; - - /* The next opcode does not need special handling; fall through and use it - to see if the base can be possessified. */ - - default: - break; - } - - /* We now have the next appropriate opcode to compare with the base. Check - for a supported opcode, and load its properties. */ - - code = get_chr_property_list(code, utf, ucp, cb->fcc, list); - if (code == NULL) return FALSE; /* Unsupported */ - - /* If either opcode is a small character list, set pointers for comparing - characters from that list with another list, or with a property. */ - - if (base_list[0] == OP_CHAR) - { - chr_ptr = base_list + 2; - list_ptr = list; - } - else if (list[0] == OP_CHAR) - { - chr_ptr = list + 2; - list_ptr = base_list; - } - - /* Character bitsets can also be compared to certain opcodes. */ - - else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS -#if PCRE2_CODE_UNIT_WIDTH == 8 - /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. 
*/ - || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS)) -#endif - ) - { -#if PCRE2_CODE_UNIT_WIDTH == 8 - if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS)) -#else - if (base_list[0] == OP_CLASS) -#endif - { - set1 = (uint8_t *)(base_end - base_list[2]); - list_ptr = list; - } - else - { - set1 = (uint8_t *)(code - list[2]); - list_ptr = base_list; - } - - invert_bits = FALSE; - switch(list_ptr[0]) - { - case OP_CLASS: - case OP_NCLASS: - set2 = (uint8_t *) - ((list_ptr == list ? code : base_end) - list_ptr[2]); - break; - -#ifdef SUPPORT_WIDE_CHARS - case OP_XCLASS: - xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE; - if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE; - if ((*xclass_flags & XCL_MAP) == 0) - { - /* No bits are set for characters < 256. */ - if (list[1] == 0) return (*xclass_flags & XCL_NOT) == 0; - /* Might be an empty repeat. */ - continue; - } - set2 = (uint8_t *)(xclass_flags + 1); - break; -#endif - - case OP_NOT_DIGIT: - invert_bits = TRUE; - /* Fall through */ - case OP_DIGIT: - set2 = (uint8_t *)(cb->cbits + cbit_digit); - break; - - case OP_NOT_WHITESPACE: - invert_bits = TRUE; - /* Fall through */ - case OP_WHITESPACE: - set2 = (uint8_t *)(cb->cbits + cbit_space); - break; - - case OP_NOT_WORDCHAR: - invert_bits = TRUE; - /* Fall through */ - case OP_WORDCHAR: - set2 = (uint8_t *)(cb->cbits + cbit_word); - break; - - default: - return FALSE; - } - - /* Because the bit sets are unaligned bytes, we need to perform byte - comparison here. */ - - set_end = set1 + 32; - if (invert_bits) - { - do - { - if ((*set1++ & ~(*set2++)) != 0) return FALSE; - } - while (set1 < set_end); - } - else - { - do - { - if ((*set1++ & *set2++) != 0) return FALSE; - } - while (set1 < set_end); - } - - if (list[1] == 0) return TRUE; - /* Might be an empty repeat. */ - continue; - } - - /* Some property combinations also acceptable. 
Unicode property opcodes are - processed specially; the rest can be handled with a lookup table. */ - - else - { - uint32_t leftop, rightop; - - leftop = base_list[0]; - rightop = list[0]; - -#ifdef SUPPORT_UNICODE - accepted = FALSE; /* Always set in non-unicode case. */ - if (leftop == OP_PROP || leftop == OP_NOTPROP) - { - if (rightop == OP_EOD) - accepted = TRUE; - else if (rightop == OP_PROP || rightop == OP_NOTPROP) - { - int n; - const uint8_t *p; - BOOL same = leftop == rightop; - BOOL lisprop = leftop == OP_PROP; - BOOL risprop = rightop == OP_PROP; - BOOL bothprop = lisprop && risprop; - - /* There's a table that specifies how each combination is to be - processed: - 0 Always return FALSE (never auto-possessify) - 1 Character groups are distinct (possessify if both are OP_PROP) - 2 Check character categories in the same group (general or particular) - 3 Return TRUE if the two opcodes are not the same - ... see comments below - */ - - n = propposstab[base_list[2]][list[2]]; - switch(n) - { - case 0: break; - case 1: accepted = bothprop; break; - case 2: accepted = (base_list[3] == list[3]) != same; break; - case 3: accepted = !same; break; - - case 4: /* Left general category, right particular category */ - accepted = risprop && catposstab[base_list[3]][list[3]] == same; - break; - - case 5: /* Right general category, left particular category */ - accepted = lisprop && catposstab[list[3]][base_list[3]] == same; - break; - - /* This code is logically tricky. Think hard before fiddling with it. - The posspropstab table has four entries per row. Each row relates to - one of PCRE's special properties such as ALNUM or SPACE or WORD. - Only WORD actually needs all four entries, but using repeats for the - others means they can all use the same code below. - - The first two entries in each row are Unicode general categories, and - apply always, because all the characters they include are part of the - PCRE character set. 
The third and fourth entries are a general and a - particular category, respectively, that include one or more relevant - characters. One or the other is used, depending on whether the check - is for a general or a particular category. However, in both cases the - category contains more characters than the specials that are defined - for the property being tested against. Therefore, it cannot be used - in a NOTPROP case. - - Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po. - Underscore is covered by ucp_P or ucp_Po. */ - - case 6: /* Left alphanum vs right general category */ - case 7: /* Left space vs right general category */ - case 8: /* Left word vs right general category */ - p = posspropstab[n-6]; - accepted = risprop && lisprop == - (list[3] != p[0] && - list[3] != p[1] && - (list[3] != p[2] || !lisprop)); - break; - - case 9: /* Right alphanum vs left general category */ - case 10: /* Right space vs left general category */ - case 11: /* Right word vs left general category */ - p = posspropstab[n-9]; - accepted = lisprop && risprop == - (base_list[3] != p[0] && - base_list[3] != p[1] && - (base_list[3] != p[2] || !risprop)); - break; - - case 12: /* Left alphanum vs right particular category */ - case 13: /* Left space vs right particular category */ - case 14: /* Left word vs right particular category */ - p = posspropstab[n-12]; - accepted = risprop && lisprop == - (catposstab[p[0]][list[3]] && - catposstab[p[1]][list[3]] && - (list[3] != p[3] || !lisprop)); - break; - - case 15: /* Right alphanum vs left particular category */ - case 16: /* Right space vs left particular category */ - case 17: /* Right word vs left particular category */ - p = posspropstab[n-15]; - accepted = lisprop && risprop == - (catposstab[p[0]][base_list[3]] && - catposstab[p[1]][base_list[3]] && - (base_list[3] != p[3] || !risprop)); - break; - } - } - } - - else -#endif /* SUPPORT_UNICODE */ - - accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= 
LAST_AUTOTAB_LEFT_OP && - rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP && - autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP]; - - if (!accepted) return FALSE; - - if (list[1] == 0) return TRUE; - /* Might be an empty repeat. */ - continue; - } - - /* Control reaches here only if one of the items is a small character list. - All characters are checked against the other side. */ - - do - { - chr = *chr_ptr; - - switch(list_ptr[0]) - { - case OP_CHAR: - ochr_ptr = list_ptr + 2; - do - { - if (chr == *ochr_ptr) return FALSE; - ochr_ptr++; - } - while(*ochr_ptr != NOTACHAR); - break; - - case OP_NOT: - ochr_ptr = list_ptr + 2; - do - { - if (chr == *ochr_ptr) - break; - ochr_ptr++; - } - while(*ochr_ptr != NOTACHAR); - if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */ - break; - - /* Note that OP_DIGIT etc. are generated only when PCRE2_UCP is *not* - set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ - - case OP_DIGIT: - if (chr < 256 && (cb->ctypes[chr] & ctype_digit) != 0) return FALSE; - break; - - case OP_NOT_DIGIT: - if (chr > 255 || (cb->ctypes[chr] & ctype_digit) == 0) return FALSE; - break; - - case OP_WHITESPACE: - if (chr < 256 && (cb->ctypes[chr] & ctype_space) != 0) return FALSE; - break; - - case OP_NOT_WHITESPACE: - if (chr > 255 || (cb->ctypes[chr] & ctype_space) == 0) return FALSE; - break; - - case OP_WORDCHAR: - if (chr < 255 && (cb->ctypes[chr] & ctype_word) != 0) return FALSE; - break; - - case OP_NOT_WORDCHAR: - if (chr > 255 || (cb->ctypes[chr] & ctype_word) == 0) return FALSE; - break; - - case OP_HSPACE: - switch(chr) - { - HSPACE_CASES: return FALSE; - default: break; - } - break; - - case OP_NOT_HSPACE: - switch(chr) - { - HSPACE_CASES: break; - default: return FALSE; - } - break; - - case OP_ANYNL: - case OP_VSPACE: - switch(chr) - { - VSPACE_CASES: return FALSE; - default: break; - } - break; - - case OP_NOT_VSPACE: - switch(chr) - { - VSPACE_CASES: break; - default: return 
FALSE; - } - break; - - case OP_DOLL: - case OP_EODN: - switch (chr) - { - case CHAR_CR: - case CHAR_LF: - case CHAR_VT: - case CHAR_FF: - case CHAR_NEL: -#ifndef EBCDIC - case 0x2028: - case 0x2029: -#endif /* Not EBCDIC */ - return FALSE; - } - break; - - case OP_EOD: /* Can always possessify before \z */ - break; - -#ifdef SUPPORT_UNICODE - case OP_PROP: - case OP_NOTPROP: - if (!check_char_prop(chr, list_ptr[2], list_ptr[3], - list_ptr[0] == OP_NOTPROP)) - return FALSE; - break; -#endif - - case OP_NCLASS: - if (chr > 255) return FALSE; - /* Fall through */ - - case OP_CLASS: - if (chr > 255) break; - class_bitset = (uint8_t *) - ((list_ptr == list ? code : base_end) - list_ptr[2]); - if ((class_bitset[chr >> 3] & (1u << (chr & 7))) != 0) return FALSE; - break; - -#ifdef SUPPORT_WIDE_CHARS - case OP_XCLASS: - if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) - - list_ptr[2] + LINK_SIZE, utf)) return FALSE; - break; -#endif - - default: - return FALSE; - } - - chr_ptr++; - } - while(*chr_ptr != NOTACHAR); - - /* At least one character must be matched from this opcode. */ - - if (list[1] == 0) return TRUE; - } - -/* Control never reaches here. There used to be a fail-save return FALSE; here, -but some compilers complain about an unreachable statement. */ -} - - - -/************************************************* -* Scan compiled regex for auto-possession * -*************************************************/ - -/* Replaces single character iterations with their possessive alternatives -if appropriate. This function modifies the compiled opcode! Hitting a -non-existent opcode may indicate a bug in PCRE2, but it can also be caused if a -bad UTF string was compiled with PCRE2_NO_UTF_CHECK. The rec_limit catches -overly complicated or large patterns. In these cases, the check just stops, -leaving the remainder of the pattern unpossessified. 
- -Arguments: - code points to start of the byte code - cb compile data block - -Returns: 0 for success - -1 if a non-existant opcode is encountered -*/ - -int -PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb) -{ -PCRE2_UCHAR c; -PCRE2_SPTR end; -PCRE2_UCHAR *repeat_opcode; -uint32_t list[8]; -int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */ -BOOL utf = (cb->external_options & PCRE2_UTF) != 0; -BOOL ucp = (cb->external_options & PCRE2_UCP) != 0; - -for (;;) - { - c = *code; - - if (c >= OP_TABLE_LENGTH) return -1; /* Something gone wrong */ - - if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) - { - c -= get_repeat_base(c) - OP_STAR; - end = (c <= OP_MINUPTO) ? - get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL; - list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO; - - if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end, - &rec_limit)) - { - switch(c) - { - case OP_STAR: - *code += OP_POSSTAR - OP_STAR; - break; - - case OP_MINSTAR: - *code += OP_POSSTAR - OP_MINSTAR; - break; - - case OP_PLUS: - *code += OP_POSPLUS - OP_PLUS; - break; - - case OP_MINPLUS: - *code += OP_POSPLUS - OP_MINPLUS; - break; - - case OP_QUERY: - *code += OP_POSQUERY - OP_QUERY; - break; - - case OP_MINQUERY: - *code += OP_POSQUERY - OP_MINQUERY; - break; - - case OP_UPTO: - *code += OP_POSUPTO - OP_UPTO; - break; - - case OP_MINUPTO: - *code += OP_POSUPTO - OP_MINUPTO; - break; - } - } - c = *code; - } - else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS) - { -#ifdef SUPPORT_WIDE_CHARS - if (c == OP_XCLASS) - repeat_opcode = code + GET(code, 1); - else -#endif - repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR)); - - c = *repeat_opcode; - if (c >= OP_CRSTAR && c <= OP_CRMINRANGE) - { - /* end must not be NULL. 
*/ - end = get_chr_property_list(code, utf, ucp, cb->fcc, list); - - list[1] = (c & 1) == 0; - - if (compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit)) - { - switch (c) - { - case OP_CRSTAR: - case OP_CRMINSTAR: - *repeat_opcode = OP_CRPOSSTAR; - break; - - case OP_CRPLUS: - case OP_CRMINPLUS: - *repeat_opcode = OP_CRPOSPLUS; - break; - - case OP_CRQUERY: - case OP_CRMINQUERY: - *repeat_opcode = OP_CRPOSQUERY; - break; - - case OP_CRRANGE: - case OP_CRMINRANGE: - *repeat_opcode = OP_CRPOSRANGE; - break; - } - } - } - c = *code; - } - - switch(c) - { - case OP_END: - return 0; - - case OP_TYPESTAR: - case OP_TYPEMINSTAR: - case OP_TYPEPLUS: - case OP_TYPEMINPLUS: - case OP_TYPEQUERY: - case OP_TYPEMINQUERY: - case OP_TYPEPOSSTAR: - case OP_TYPEPOSPLUS: - case OP_TYPEPOSQUERY: - if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; - break; - - case OP_TYPEUPTO: - case OP_TYPEMINUPTO: - case OP_TYPEEXACT: - case OP_TYPEPOSUPTO: - if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) - code += 2; - break; - - case OP_CALLOUT_STR: - code += GET(code, 1 + 2*LINK_SIZE); - break; - -#ifdef SUPPORT_WIDE_CHARS - case OP_XCLASS: - code += GET(code, 1); - break; -#endif - - case OP_MARK: - case OP_COMMIT_ARG: - case OP_PRUNE_ARG: - case OP_SKIP_ARG: - case OP_THEN_ARG: - code += code[1]; - break; - } - - /* Add in the fixed length from the table */ - - code += PRIV(OP_lengths)[c]; - - /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be - followed by a multi-byte character. The length in the table is a minimum, so - we have to arrange to skip the extra code units. 
*/ - -#ifdef MAYBE_UTF_MULTI - if (utf) switch(c) - { - case OP_CHAR: - case OP_CHARI: - case OP_NOT: - case OP_NOTI: - case OP_STAR: - case OP_MINSTAR: - case OP_PLUS: - case OP_MINPLUS: - case OP_QUERY: - case OP_MINQUERY: - case OP_UPTO: - case OP_MINUPTO: - case OP_EXACT: - case OP_POSSTAR: - case OP_POSPLUS: - case OP_POSQUERY: - case OP_POSUPTO: - case OP_STARI: - case OP_MINSTARI: - case OP_PLUSI: - case OP_MINPLUSI: - case OP_QUERYI: - case OP_MINQUERYI: - case OP_UPTOI: - case OP_MINUPTOI: - case OP_EXACTI: - case OP_POSSTARI: - case OP_POSPLUSI: - case OP_POSQUERYI: - case OP_POSUPTOI: - case OP_NOTSTAR: - case OP_NOTMINSTAR: - case OP_NOTPLUS: - case OP_NOTMINPLUS: - case OP_NOTQUERY: - case OP_NOTMINQUERY: - case OP_NOTUPTO: - case OP_NOTMINUPTO: - case OP_NOTEXACT: - case OP_NOTPOSSTAR: - case OP_NOTPOSPLUS: - case OP_NOTPOSQUERY: - case OP_NOTPOSUPTO: - case OP_NOTSTARI: - case OP_NOTMINSTARI: - case OP_NOTPLUSI: - case OP_NOTMINPLUSI: - case OP_NOTQUERYI: - case OP_NOTMINQUERYI: - case OP_NOTUPTOI: - case OP_NOTMINUPTOI: - case OP_NOTEXACTI: - case OP_NOTPOSSTARI: - case OP_NOTPOSPLUSI: - case OP_NOTPOSQUERYI: - case OP_NOTPOSUPTOI: - if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); - break; - } -#else - (void)(utf); /* Keep compiler happy by referencing function argument */ -#endif /* SUPPORT_WIDE_CHARS */ - } -} - -/* End of pcre2_auto_possess.c */ diff --git a/pcre2/src/pcre2_chartables.c.dist b/pcre2/src/pcre2_chartables.c.dist deleted file mode 100644 index 861914d1a..000000000 --- a/pcre2/src/pcre2_chartables.c.dist +++ /dev/null @@ -1,202 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* This file was automatically written by the pcre2_dftables auxiliary -program. It contains character tables that are used when no external -tables are passed to PCRE2 by the application that calls it. 
The tables -are used only for characters whose code values are less than 256. */ - -/* This set of tables was written in the C locale. */ - -/* The pcre2_ftables program (which is distributed with PCRE2) can be used -to build alternative versions of this file. This is necessary if you are -running in an EBCDIC environment, or if you want to default to a different -encoding, for example ISO-8859-1. When pcre2_dftables is run, it creates -these tables in the "C" locale by default. This happens automatically if -PCRE2 is configured with --enable-rebuild-chartables. However, you can run -pcre2_dftables manually with the -L option to build tables using the LC_ALL -locale. */ - -/* The following #include is present because without it gcc 4.x may remove -the array definition from the final binary if PCRE2 is built into a static -library and dead code stripping is activated. This leads to link errors. -Pulling in the header ensures that the array gets flagged as "someone -outside this compilation unit might reference this" and so it will always -be supplied to the linker. */ - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include "pcre2_internal.h" - -const uint8_t PRIV(default_tables)[] = { - -/* This table is a lower casing table. 
*/ - - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, - 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, - 56, 57, 58, 59, 60, 61, 62, 63, - 64, 97, 98, 99,100,101,102,103, - 104,105,106,107,108,109,110,111, - 112,113,114,115,116,117,118,119, - 120,121,122, 91, 92, 93, 94, 95, - 96, 97, 98, 99,100,101,102,103, - 104,105,106,107,108,109,110,111, - 112,113,114,115,116,117,118,119, - 120,121,122,123,124,125,126,127, - 128,129,130,131,132,133,134,135, - 136,137,138,139,140,141,142,143, - 144,145,146,147,148,149,150,151, - 152,153,154,155,156,157,158,159, - 160,161,162,163,164,165,166,167, - 168,169,170,171,172,173,174,175, - 176,177,178,179,180,181,182,183, - 184,185,186,187,188,189,190,191, - 192,193,194,195,196,197,198,199, - 200,201,202,203,204,205,206,207, - 208,209,210,211,212,213,214,215, - 216,217,218,219,220,221,222,223, - 224,225,226,227,228,229,230,231, - 232,233,234,235,236,237,238,239, - 240,241,242,243,244,245,246,247, - 248,249,250,251,252,253,254,255, - -/* This table is a case flipping table. 
*/ - - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, - 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, - 56, 57, 58, 59, 60, 61, 62, 63, - 64, 97, 98, 99,100,101,102,103, - 104,105,106,107,108,109,110,111, - 112,113,114,115,116,117,118,119, - 120,121,122, 91, 92, 93, 94, 95, - 96, 65, 66, 67, 68, 69, 70, 71, - 72, 73, 74, 75, 76, 77, 78, 79, - 80, 81, 82, 83, 84, 85, 86, 87, - 88, 89, 90,123,124,125,126,127, - 128,129,130,131,132,133,134,135, - 136,137,138,139,140,141,142,143, - 144,145,146,147,148,149,150,151, - 152,153,154,155,156,157,158,159, - 160,161,162,163,164,165,166,167, - 168,169,170,171,172,173,174,175, - 176,177,178,179,180,181,182,183, - 184,185,186,187,188,189,190,191, - 192,193,194,195,196,197,198,199, - 200,201,202,203,204,205,206,207, - 208,209,210,211,212,213,214,215, - 216,217,218,219,220,221,222,223, - 224,225,226,227,228,229,230,231, - 232,233,234,235,236,237,238,239, - 240,241,242,243,244,245,246,247, - 248,249,250,251,252,253,254,255, - -/* This table contains bit maps for various character classes. Each map is 32 -bytes long and the bits run from the least significant end of each byte. The -classes that have their own maps are: space, xdigit, digit, upper, lower, word, -graph, print, punct, and cntrl. Other classes are built from combinations. 
*/ - - 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00, /* space */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - - 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, /* xdigit */ - 0x7e,0x00,0x00,0x00,0x7e,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - - 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, /* digit */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* upper */ - 0xfe,0xff,0xff,0x07,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* lower */ - 0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0x07, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - - 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, /* word */ - 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - - 0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0xff, /* graph */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - - 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff, /* print */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - - 0x00,0x00,0x00,0x00,0xfe,0xff,0x00,0xfc, /* punct */ - 0x01,0x00,0x00,0xf8,0x01,0x00,0x00,0x78, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - - 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00, /* cntrl */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - -/* This table identifies various classes of character by individual bits: - 0x01 white space character - 0x02 letter - 0x04 lower 
case letter - 0x08 decimal digit - 0x10 alphanumeric or '_' -*/ - - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ - 0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ - 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */ - 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, /* 0 - 7 */ - 0x18,0x18,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */ - 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* @ - G */ - 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */ - 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */ - 0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x10, /* X - _ */ - 0x00,0x16,0x16,0x16,0x16,0x16,0x16,0x16, /* ` - g */ - 0x16,0x16,0x16,0x16,0x16,0x16,0x16,0x16, /* h - o */ - 0x16,0x16,0x16,0x16,0x16,0x16,0x16,0x16, /* p - w */ - 0x16,0x16,0x16,0x00,0x00,0x00,0x00,0x00, /* x -127 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ - -/* End of pcre2_chartables.c */ diff --git a/pcre2/src/pcre2_compile.c b/pcre2/src/pcre2_compile.c deleted file mode 100644 index 
e811f12f0..000000000 --- a/pcre2/src/pcre2_compile.c +++ /dev/null @@ -1,10499 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2020 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#define NLBLOCK cb /* Block containing newline information */ -#define PSSTART start_pattern /* Field containing processed string start */ -#define PSEND end_pattern /* Field containing processed string end */ - -#include "pcre2_internal.h" - -/* In rare error cases debugging might require calling pcre2_printint(). */ - -#if 0 -#ifdef EBCDIC -#define PRINTABLE(c) ((c) >= 64 && (c) < 255) -#else -#define PRINTABLE(c) ((c) >= 32 && (c) < 127) -#endif -#include "pcre2_printint.c" -#define DEBUG_CALL_PRINTINT -#endif - -/* Other debugging code can be enabled by these defines. */ - -/* #define DEBUG_SHOW_CAPTURES */ -/* #define DEBUG_SHOW_PARSED */ - -/* There are a few things that vary with different code unit sizes. Handle them -by defining macros in order to minimize #if usage. */ - -#if PCRE2_CODE_UNIT_WIDTH == 8 -#define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 5 -#define XDIGIT(c) xdigitab[c] - -#else /* Either 16-bit or 32-bit */ -#define XDIGIT(c) (MAX_255(c)? xdigitab[c] : 0xff) - -#if PCRE2_CODE_UNIT_WIDTH == 16 -#define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 6 - -#else /* 32-bit */ -#define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 6 -#endif -#endif - -/* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which -consists of uint32_t elements. 
Assume that if uint32_t can't hold it, two of -them will be able to (i.e. assume a 64-bit world). */ - -#if PCRE2_SIZE_MAX <= UINT32_MAX -#define PUTOFFSET(s,p) *p++ = s -#define GETOFFSET(s,p) s = *p++ -#define GETPLUSOFFSET(s,p) s = *(++p) -#define READPLUSOFFSET(s,p) s = p[1] -#define SKIPOFFSET(p) p++ -#define SIZEOFFSET 1 -#else -#define PUTOFFSET(s,p) \ - { *p++ = (uint32_t)(s >> 32); *p++ = (uint32_t)(s & 0xffffffff); } -#define GETOFFSET(s,p) \ - { s = ((PCRE2_SIZE)p[0] << 32) | (PCRE2_SIZE)p[1]; p += 2; } -#define GETPLUSOFFSET(s,p) \ - { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; p += 2; } -#define READPLUSOFFSET(s,p) \ - { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; } -#define SKIPOFFSET(p) p += 2 -#define SIZEOFFSET 2 -#endif - -/* Macros for manipulating elements of the parsed pattern vector. */ - -#define META_CODE(x) (x & 0xffff0000u) -#define META_DATA(x) (x & 0x0000ffffu) -#define META_DIFF(x,y) ((x-y)>>16) - -/* Function definitions to allow mutual recursion */ - -#ifdef SUPPORT_UNICODE -static unsigned int - add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t, - compile_block *, const uint32_t *, unsigned int); -#endif - -static int - compile_regex(uint32_t, PCRE2_UCHAR **, uint32_t **, int *, uint32_t, - uint32_t *, int32_t *, uint32_t *, int32_t *, branch_chain *, - compile_block *, PCRE2_SIZE *); - -static int - get_branchlength(uint32_t **, int *, int *, parsed_recurse_check *, - compile_block *); - -static BOOL - set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *, - compile_block *); - -static int - check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *, - compile_block *); - - -/************************************************* -* Code parameters and static tables * -*************************************************/ - -#define MAX_GROUP_NUMBER 65535u -#define MAX_REPEAT_COUNT 65535u -#define REPEAT_UNLIMITED (MAX_REPEAT_COUNT+1) - -/* COMPILE_WORK_SIZE specifies the size of stack 
workspace, which is used in -different ways in the different pattern scans. The parsing and group- -identifying pre-scan uses it to handle nesting, and needs it to be 16-bit -aligned for this. Having defined the size in code units, we set up -C16_WORK_SIZE as the number of elements in the 16-bit vector. - -During the first compiling phase, when determining how much memory is required, -the regex is partly compiled into this space, but the compiled parts are -discarded as soon as they can be, so that hopefully there will never be an -overrun. The code does, however, check for an overrun, which can occur for -pathological patterns. The size of the workspace depends on LINK_SIZE because -the length of compiled items varies with this. - -In the real compile phase, this workspace is not currently used. */ - -#define COMPILE_WORK_SIZE (3000*LINK_SIZE) /* Size in code units */ - -#define C16_WORK_SIZE \ - ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t)) - -/* A uint32_t vector is used for caching information about the size of -capturing groups, to improve performance. A default is created on the stack of -this size. */ - -#define GROUPINFO_DEFAULT_SIZE 256 - -/* The overrun tests check for a slightly smaller size so that they detect the -overrun before it actually does run off the end of the data block. */ - -#define WORK_SIZE_SAFETY_MARGIN (100) - -/* This value determines the size of the initial vector that is used for -remembering named groups during the pre-compile. It is allocated on the stack, -but if it is too small, it is expanded, in a similar way to the workspace. The -value is the number of slots in the list. */ - -#define NAMED_GROUP_LIST_SIZE 20 - -/* The pre-compiling pass over the pattern creates a parsed pattern in a vector -of uint32_t. For short patterns this lives on the stack, with this size. Heap -memory is used for longer patterns. 
*/ - -#define PARSED_PATTERN_DEFAULT_SIZE 1024 - -/* Maximum length value to check against when making sure that the variable -that holds the compiled pattern length does not overflow. We make it a bit less -than INT_MAX to allow for adding in group terminating code units, so that we -don't have to check them every time. */ - -#define OFLOW_MAX (INT_MAX - 20) - -/* Code values for parsed patterns, which are stored in a vector of 32-bit -unsigned ints. Values less than META_END are literal data values. The coding -for identifying the item is in the top 16-bits, leaving 16 bits for the -additional data that some of them need. The META_CODE, META_DATA, and META_DIFF -macros are used to manipulate parsed pattern elements. - -NOTE: When these definitions are changed, the table of extra lengths for each -code (meta_extra_lengths, just below) must be updated to remain in step. */ - -#define META_END 0x80000000u /* End of pattern */ - -#define META_ALT 0x80010000u /* alternation */ -#define META_ATOMIC 0x80020000u /* atomic group */ -#define META_BACKREF 0x80030000u /* Back ref */ -#define META_BACKREF_BYNAME 0x80040000u /* \k'name' */ -#define META_BIGVALUE 0x80050000u /* Next is a literal > META_END */ -#define META_CALLOUT_NUMBER 0x80060000u /* (?C with numerical argument */ -#define META_CALLOUT_STRING 0x80070000u /* (?C with string argument */ -#define META_CAPTURE 0x80080000u /* Capturing parenthesis */ -#define META_CIRCUMFLEX 0x80090000u /* ^ metacharacter */ -#define META_CLASS 0x800a0000u /* start non-empty class */ -#define META_CLASS_EMPTY 0x800b0000u /* empty class */ -#define META_CLASS_EMPTY_NOT 0x800c0000u /* negative empty class */ -#define META_CLASS_END 0x800d0000u /* end of non-empty class */ -#define META_CLASS_NOT 0x800e0000u /* start non-empty negative class */ -#define META_COND_ASSERT 0x800f0000u /* (?(?assertion)... */ -#define META_COND_DEFINE 0x80100000u /* (?(DEFINE)... */ -#define META_COND_NAME 0x80110000u /* (?()... 
*/ -#define META_COND_NUMBER 0x80120000u /* (?(digits)... */ -#define META_COND_RNAME 0x80130000u /* (?(R&name)... */ -#define META_COND_RNUMBER 0x80140000u /* (?(Rdigits)... */ -#define META_COND_VERSION 0x80150000u /* (?(VERSIONx.y)... */ -#define META_DOLLAR 0x80160000u /* $ metacharacter */ -#define META_DOT 0x80170000u /* . metacharacter */ -#define META_ESCAPE 0x80180000u /* \d and friends */ -#define META_KET 0x80190000u /* closing parenthesis */ -#define META_NOCAPTURE 0x801a0000u /* no capture parens */ -#define META_OPTIONS 0x801b0000u /* (?i) and friends */ -#define META_POSIX 0x801c0000u /* POSIX class item */ -#define META_POSIX_NEG 0x801d0000u /* negative POSIX class item */ -#define META_RANGE_ESCAPED 0x801e0000u /* range with at least one escape */ -#define META_RANGE_LITERAL 0x801f0000u /* range defined literally */ -#define META_RECURSE 0x80200000u /* Recursion */ -#define META_RECURSE_BYNAME 0x80210000u /* (?&name) */ -#define META_SCRIPT_RUN 0x80220000u /* (*script_run:...) */ - -/* These must be kept together to make it easy to check that an assertion -is present where expected in a conditional group. */ - -#define META_LOOKAHEAD 0x80230000u /* (?= */ -#define META_LOOKAHEADNOT 0x80240000u /* (?! 
*/ -#define META_LOOKBEHIND 0x80250000u /* (?<= */ -#define META_LOOKBEHINDNOT 0x80260000u /* (?= 10 */ - 1+SIZEOFFSET, /* META_BACKREF_BYNAME */ - 1, /* META_BIGVALUE */ - 3, /* META_CALLOUT_NUMBER */ - 3+SIZEOFFSET, /* META_CALLOUT_STRING */ - 0, /* META_CAPTURE */ - 0, /* META_CIRCUMFLEX */ - 0, /* META_CLASS */ - 0, /* META_CLASS_EMPTY */ - 0, /* META_CLASS_EMPTY_NOT */ - 0, /* META_CLASS_END */ - 0, /* META_CLASS_NOT */ - 0, /* META_COND_ASSERT */ - SIZEOFFSET, /* META_COND_DEFINE */ - 1+SIZEOFFSET, /* META_COND_NAME */ - 1+SIZEOFFSET, /* META_COND_NUMBER */ - 1+SIZEOFFSET, /* META_COND_RNAME */ - 1+SIZEOFFSET, /* META_COND_RNUMBER */ - 3, /* META_COND_VERSION */ - 0, /* META_DOLLAR */ - 0, /* META_DOT */ - 0, /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */ - 0, /* META_KET */ - 0, /* META_NOCAPTURE */ - 1, /* META_OPTIONS */ - 1, /* META_POSIX */ - 1, /* META_POSIX_NEG */ - 0, /* META_RANGE_ESCAPED */ - 0, /* META_RANGE_LITERAL */ - SIZEOFFSET, /* META_RECURSE */ - 1+SIZEOFFSET, /* META_RECURSE_BYNAME */ - 0, /* META_SCRIPT_RUN */ - 0, /* META_LOOKAHEAD */ - 0, /* META_LOOKAHEADNOT */ - SIZEOFFSET, /* META_LOOKBEHIND */ - SIZEOFFSET, /* META_LOOKBEHINDNOT */ - 0, /* META_LOOKAHEAD_NA */ - SIZEOFFSET, /* META_LOOKBEHIND_NA */ - 1, /* META_MARK - plus the string length */ - 0, /* META_ACCEPT */ - 0, /* META_FAIL */ - 0, /* META_COMMIT */ - 1, /* META_COMMIT_ARG - plus the string length */ - 0, /* META_PRUNE */ - 1, /* META_PRUNE_ARG - plus the string length */ - 0, /* META_SKIP */ - 1, /* META_SKIP_ARG - plus the string length */ - 0, /* META_THEN */ - 1, /* META_THEN_ARG - plus the string length */ - 0, /* META_ASTERISK */ - 0, /* META_ASTERISK_PLUS */ - 0, /* META_ASTERISK_QUERY */ - 0, /* META_PLUS */ - 0, /* META_PLUS_PLUS */ - 0, /* META_PLUS_QUERY */ - 0, /* META_QUERY */ - 0, /* META_QUERY_PLUS */ - 0, /* META_QUERY_QUERY */ - 2, /* META_MINMAX */ - 2, /* META_MINMAX_PLUS */ - 2 /* META_MINMAX_QUERY */ -}; - -/* Types for skipping parts of a 
parsed pattern. */ - -enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET }; - -/* Macro for setting individual bits in class bitmaps. It took some -experimenting to figure out how to stop gcc 5.3.0 from warning with --Wconversion. This version gets a warning: - - #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7)) - -Let's hope the apparently less efficient version isn't actually so bad if the -compiler is clever with identical subexpressions. */ - -#define SETBIT(a,b) a[(b)/8] = (uint8_t)(a[(b)/8] | (1u << ((b)&7))) - -/* Private flags added to firstcu and reqcu. */ - -#define REQ_CASELESS (1u << 0) /* Indicates caselessness */ -#define REQ_VARY (1u << 1) /* reqcu followed non-literal item */ -/* Negative values for the firstcu and reqcu flags */ -#define REQ_UNSET (-2) /* Not yet found anything */ -#define REQ_NONE (-1) /* Found not fixed char */ - -/* These flags are used in the groupinfo vector. */ - -#define GI_SET_FIXED_LENGTH 0x80000000u -#define GI_NOT_FIXED_LENGTH 0x40000000u -#define GI_FIXED_LENGTH_MASK 0x0000ffffu - -/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC -and is fast (a good compiler can turn it into a subtraction and unsigned -comparison). */ - -#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9) - -/* Table to identify hex digits. The tables in chartables are dependent on the -locale, and may mark arbitrary characters as digits. We want to recognize only -0-9, a-z, and A-Z as hex digits, which is why we have a private table here. It -costs 256 bytes, but it is a lot faster than doing character value tests (at -least in some simple cases I timed), and in some applications one wants PCRE2 -to compile efficiently as well as match efficiently. The value in the table is -the binary hex digit value, or 0xff for non-hex digits. */ - -/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in -UTF-8 mode. 
*/ - -#ifndef EBCDIC -static const uint8_t xdigitab[] = - { - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - ' */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ( - / */ - 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 */ - 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /* 8 - ? */ - 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* @ - G */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H - O */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* P - W */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* X - _ */ - 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* ` - g */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h - o */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* p - w */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* x -127 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 2ff-207 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */ - -#else - -/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. 
*/ - -static const uint8_t xdigitab[] = - { - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 0 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 10 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 32- 39 20 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 40- 47 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 48- 55 30 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 56- 63 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - 71 40 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 72- | */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* & - 87 50 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 88- 95 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - -103 60 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ? */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- " */ - 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g 80 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h -143 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p 90 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* q -159 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x A0 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* y -175 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ^ -183 B0 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */ - 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* { - G C0 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H -207 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* } - P D0 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Q -223 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* \ - X E0 */ - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Y -239 */ - 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 F0 */ - 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/* 8 -255 */ -#endif /* EBCDIC */ - - -/* Table for handling alphanumeric escaped characters. 
Positive returns are -simple data values; negative values are for special things like \d and so on. -Zero means further processing is needed (for things like \x), or the escape is -invalid. */ - -/* This is the "normal" table for ASCII systems or for EBCDIC systems running -in UTF-8 mode. It runs from '0' to 'z'. */ - -#ifndef EBCDIC -#define ESCAPES_FIRST CHAR_0 -#define ESCAPES_LAST CHAR_z -#define UPPER_CASE(c) (c-32) - -static const short int escapes[] = { - 0, 0, - 0, 0, - 0, 0, - 0, 0, - 0, 0, - CHAR_COLON, CHAR_SEMICOLON, - CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, - CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, - CHAR_COMMERCIAL_AT, -ESC_A, - -ESC_B, -ESC_C, - -ESC_D, -ESC_E, - 0, -ESC_G, - -ESC_H, 0, - 0, -ESC_K, - 0, 0, - -ESC_N, 0, - -ESC_P, -ESC_Q, - -ESC_R, -ESC_S, - 0, 0, - -ESC_V, -ESC_W, - -ESC_X, 0, - -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, - CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, - CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, - CHAR_GRAVE_ACCENT, CHAR_BEL, - -ESC_b, 0, - -ESC_d, CHAR_ESC, - CHAR_FF, 0, - -ESC_h, 0, - 0, -ESC_k, - 0, 0, - CHAR_LF, 0, - -ESC_p, 0, - CHAR_CR, -ESC_s, - CHAR_HT, 0, - -ESC_v, -ESC_w, - 0, 0, - -ESC_z -}; - -#else - -/* This is the "abnormal" table for EBCDIC systems without UTF-8 support. -It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code -is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a -because it is defined as 'a', which of course picks up the ASCII value. 
*/ - -#if 'a' == 0x81 /* Check for a real EBCDIC environment */ -#define ESCAPES_FIRST CHAR_a -#define ESCAPES_LAST CHAR_9 -#define UPPER_CASE(c) (c+64) -#else /* Testing in an ASCII environment */ -#define ESCAPES_FIRST ((unsigned char)'\x81') /* EBCDIC 'a' */ -#define ESCAPES_LAST ((unsigned char)'\xf9') /* EBCDIC '9' */ -#define UPPER_CASE(c) (c-32) -#endif - -static const short int escapes[] = { -/* 80 */ CHAR_BEL, -ESC_b, 0, -ESC_d, CHAR_ESC, CHAR_FF, 0, -/* 88 */ -ESC_h, 0, 0, '{', 0, 0, 0, 0, -/* 90 */ 0, 0, -ESC_k, 0, 0, CHAR_LF, 0, -ESC_p, -/* 98 */ 0, CHAR_CR, 0, '}', 0, 0, 0, 0, -/* A0 */ 0, '~', -ESC_s, CHAR_HT, 0, -ESC_v, -ESC_w, 0, -/* A8 */ 0, -ESC_z, 0, 0, 0, '[', 0, 0, -/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, -/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', -/* C0 */ '{', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, -/* C8 */ -ESC_H, 0, 0, 0, 0, 0, 0, 0, -/* D0 */ '}', 0, -ESC_K, 0, 0, -ESC_N, 0, -ESC_P, -/* D8 */ -ESC_Q, -ESC_R, 0, 0, 0, 0, 0, 0, -/* E0 */ '\\', 0, -ESC_S, 0, 0, -ESC_V, -ESC_W, -ESC_X, -/* E8 */ 0, -ESC_Z, 0, 0, 0, 0, 0, 0, -/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, -/* F8 */ 0, 0 -}; - -/* We also need a table of characters that may follow \c in an EBCDIC -environment for characters 0-31. */ - -static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_"; - -#endif /* EBCDIC */ - - -/* Table of special "verbs" like (*PRUNE). This is a short table, so it is -searched linearly. Put all the names into a single string, in order to reduce -the number of relocations when a shared library is dynamically linked. The -string is built from string macros so that it works in UTF-8 mode on EBCDIC -platforms. 
*/ - -typedef struct verbitem { - unsigned int len; /* Length of verb name */ - uint32_t meta; /* Base META_ code */ - int has_arg; /* Argument requirement */ -} verbitem; - -static const char verbnames[] = - "\0" /* Empty name is a shorthand for MARK */ - STRING_MARK0 - STRING_ACCEPT0 - STRING_F0 - STRING_FAIL0 - STRING_COMMIT0 - STRING_PRUNE0 - STRING_SKIP0 - STRING_THEN; - -static const verbitem verbs[] = { - { 0, META_MARK, +1 }, /* > 0 => must have an argument */ - { 4, META_MARK, +1 }, - { 6, META_ACCEPT, -1 }, /* < 0 => Optional argument, convert to pre-MARK */ - { 1, META_FAIL, -1 }, - { 4, META_FAIL, -1 }, - { 6, META_COMMIT, 0 }, - { 5, META_PRUNE, 0 }, /* Optional argument; bump META code if found */ - { 4, META_SKIP, 0 }, - { 4, META_THEN, 0 } -}; - -static const int verbcount = sizeof(verbs)/sizeof(verbitem); - -/* Verb opcodes, indexed by their META code offset from META_MARK. */ - -static const uint32_t verbops[] = { - OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE, - OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG }; - -/* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. 
*/ - -typedef struct alasitem { - unsigned int len; /* Length of name */ - uint32_t meta; /* Base META_ code */ -} alasitem; - -static const char alasnames[] = - STRING_pla0 - STRING_plb0 - STRING_napla0 - STRING_naplb0 - STRING_nla0 - STRING_nlb0 - STRING_positive_lookahead0 - STRING_positive_lookbehind0 - STRING_non_atomic_positive_lookahead0 - STRING_non_atomic_positive_lookbehind0 - STRING_negative_lookahead0 - STRING_negative_lookbehind0 - STRING_atomic0 - STRING_sr0 - STRING_asr0 - STRING_script_run0 - STRING_atomic_script_run; - -static const alasitem alasmeta[] = { - { 3, META_LOOKAHEAD }, - { 3, META_LOOKBEHIND }, - { 5, META_LOOKAHEAD_NA }, - { 5, META_LOOKBEHIND_NA }, - { 3, META_LOOKAHEADNOT }, - { 3, META_LOOKBEHINDNOT }, - { 18, META_LOOKAHEAD }, - { 19, META_LOOKBEHIND }, - { 29, META_LOOKAHEAD_NA }, - { 30, META_LOOKBEHIND_NA }, - { 18, META_LOOKAHEADNOT }, - { 19, META_LOOKBEHINDNOT }, - { 6, META_ATOMIC }, - { 2, META_SCRIPT_RUN }, /* sr = script run */ - { 3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */ - { 10, META_SCRIPT_RUN }, /* script run */ - { 17, META_ATOMIC_SCRIPT_RUN } /* atomic script run */ -}; - -static const int alascount = sizeof(alasmeta)/sizeof(alasitem); - -/* Offsets from OP_STAR for case-independent and negative repeat opcodes. */ - -static uint32_t chartypeoffset[] = { - OP_STAR - OP_STAR, OP_STARI - OP_STAR, - OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR }; - -/* Tables of names of POSIX character classes and their lengths. The names are -now all in a single string, to reduce the number of relocations when a shared -library is dynamically loaded. The list of lengths is terminated by a zero -length entry. The first three must be alpha, lower, upper, as this is assumed -for handling case independence. The indices for graph, print, and punct are -needed, so identify them. 
*/ - -static const char posix_names[] = - STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0 - STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0 - STRING_graph0 STRING_print0 STRING_punct0 STRING_space0 - STRING_word0 STRING_xdigit; - -static const uint8_t posix_name_lengths[] = { - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; - -#define PC_GRAPH 8 -#define PC_PRINT 9 -#define PC_PUNCT 10 - -/* Table of class bit maps for each POSIX class. Each class is formed from a -base map, with an optional addition or removal of another map. Then, for some -classes, there is some additional tweaking: for [:blank:] the vertical space -characters are removed, and for [:alpha:] and [:alnum:] the underscore -character is removed. The triples in the table consist of the base map offset, -second map offset or -1 if no second map, and a non-negative value for map -addition or a negative value for map subtraction (if there are two maps). The -absolute value of the third field has these meanings: 0 => no tweaking, 1 => -remove vertical space characters, 2 => remove underscore. */ - -static const int posix_class_maps[] = { - cbit_word, cbit_digit, -2, /* alpha */ - cbit_lower, -1, 0, /* lower */ - cbit_upper, -1, 0, /* upper */ - cbit_word, -1, 2, /* alnum - word without underscore */ - cbit_print, cbit_cntrl, 0, /* ascii */ - cbit_space, -1, 1, /* blank - a GNU extension */ - cbit_cntrl, -1, 0, /* cntrl */ - cbit_digit, -1, 0, /* digit */ - cbit_graph, -1, 0, /* graph */ - cbit_print, -1, 0, /* print */ - cbit_punct, -1, 0, /* punct */ - cbit_space, -1, 0, /* space */ - cbit_word, -1, 0, /* word - a Perl extension */ - cbit_xdigit,-1, 0 /* xdigit */ -}; - -#ifdef SUPPORT_UNICODE - -/* The POSIX class Unicode property substitutes that are used in UCP mode must -be in the order of the POSIX class names, defined above. 
*/ - -static int posix_substitutes[] = { - PT_GC, ucp_L, /* alpha */ - PT_PC, ucp_Ll, /* lower */ - PT_PC, ucp_Lu, /* upper */ - PT_ALNUM, 0, /* alnum */ - -1, 0, /* ascii, treat as non-UCP */ - -1, 1, /* blank, treat as \h */ - PT_PC, ucp_Cc, /* cntrl */ - PT_PC, ucp_Nd, /* digit */ - PT_PXGRAPH, 0, /* graph */ - PT_PXPRINT, 0, /* print */ - PT_PXPUNCT, 0, /* punct */ - PT_PXSPACE, 0, /* space */ /* Xps is POSIX space, but from 8.34 */ - PT_WORD, 0, /* word */ /* Perl and POSIX space are the same */ - -1, 0 /* xdigit, treat as non-UCP */ -}; -#define POSIX_SUBSIZE (sizeof(posix_substitutes) / (2*sizeof(uint32_t))) -#endif /* SUPPORT_UNICODE */ - -/* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset -are allowed. */ - -#define PUBLIC_LITERAL_COMPILE_OPTIONS \ - (PCRE2_ANCHORED|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_ENDANCHORED| \ - PCRE2_FIRSTLINE|PCRE2_LITERAL|PCRE2_MATCH_INVALID_UTF| \ - PCRE2_NO_START_OPTIMIZE|PCRE2_NO_UTF_CHECK|PCRE2_USE_OFFSET_LIMIT|PCRE2_UTF) - -#define PUBLIC_COMPILE_OPTIONS \ - (PUBLIC_LITERAL_COMPILE_OPTIONS| \ - PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \ - PCRE2_ALT_VERBNAMES|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \ - PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \ - PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \ - PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \ - PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY) - -#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \ - (PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD) - -#define PUBLIC_COMPILE_EXTRA_OPTIONS \ - (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \ - PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \ - PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX) - -/* Compile time error code numbers. They are given names so that they can more -easily be tracked. 
When a new number is added, the tables called eint1 and -eint2 in pcre2posix.c may need to be updated, and a new error text must be -added to compile_error_texts in pcre2_error.c. */ - -enum { ERR0 = COMPILE_ERROR_BASE, - ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10, - ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20, - ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30, - ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40, - ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50, - ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60, - ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70, - ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80, - ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90, - ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98 }; - -/* This is a table of start-of-pattern options such as (*UTF) and settings such -as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward -compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is -generic and always supported. 
*/ - -enum { PSO_OPT, /* Value is an option bit */ - PSO_FLG, /* Value is a flag bit */ - PSO_NL, /* Value is a newline type */ - PSO_BSR, /* Value is a \R type */ - PSO_LIMH, /* Read integer value for heap limit */ - PSO_LIMM, /* Read integer value for match limit */ - PSO_LIMD }; /* Read integer value for depth limit */ - -typedef struct pso { - const uint8_t *name; - uint16_t length; - uint16_t type; - uint32_t value; -} pso; - -/* NB: STRING_UTFn_RIGHTPAR contains the length as well */ - -static pso pso_list[] = { - { (uint8_t *)STRING_UTFn_RIGHTPAR, PSO_OPT, PCRE2_UTF }, - { (uint8_t *)STRING_UTF_RIGHTPAR, 4, PSO_OPT, PCRE2_UTF }, - { (uint8_t *)STRING_UCP_RIGHTPAR, 4, PSO_OPT, PCRE2_UCP }, - { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR, 9, PSO_FLG, PCRE2_NOTEMPTY_SET }, - { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR, 17, PSO_FLG, PCRE2_NE_ATST_SET }, - { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPT, PCRE2_NO_AUTO_POSSESS }, - { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR }, - { (uint8_t *)STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT }, - { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPT, PCRE2_NO_START_OPTIMIZE }, - { (uint8_t *)STRING_LIMIT_HEAP_EQ, 11, PSO_LIMH, 0 }, - { (uint8_t *)STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 }, - { (uint8_t *)STRING_LIMIT_DEPTH_EQ, 12, PSO_LIMD, 0 }, - { (uint8_t *)STRING_LIMIT_RECURSION_EQ, 16, PSO_LIMD, 0 }, - { (uint8_t *)STRING_CR_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_CR }, - { (uint8_t *)STRING_LF_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_LF }, - { (uint8_t *)STRING_CRLF_RIGHTPAR, 5, PSO_NL, PCRE2_NEWLINE_CRLF }, - { (uint8_t *)STRING_ANY_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_ANY }, - { (uint8_t *)STRING_NUL_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_NUL }, - { (uint8_t *)STRING_ANYCRLF_RIGHTPAR, 8, PSO_NL, PCRE2_NEWLINE_ANYCRLF }, - { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_ANYCRLF }, - { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_UNICODE } -}; - -/* 
This table is used when converting repeating opcodes into possessified -versions as a result of an explicit possessive quantifier such as ++. A zero -value means there is no possessified version - in those cases the item in -question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT -because all relevant opcodes are less than that. */ - -static const uint8_t opcode_possessify[] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */ - - 0, /* NOTI */ - OP_POSSTAR, 0, /* STAR, MINSTAR */ - OP_POSPLUS, 0, /* PLUS, MINPLUS */ - OP_POSQUERY, 0, /* QUERY, MINQUERY */ - OP_POSUPTO, 0, /* UPTO, MINUPTO */ - 0, /* EXACT */ - 0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */ - - OP_POSSTARI, 0, /* STARI, MINSTARI */ - OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */ - OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */ - OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */ - 0, /* EXACTI */ - 0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */ - - OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */ - OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */ - OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */ - OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */ - 0, /* NOTEXACT */ - 0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */ - - OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */ - OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */ - OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */ - OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */ - 0, /* NOTEXACTI */ - 0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */ - - OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */ - OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */ - OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */ - OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */ - 0, /* TYPEEXACT */ - 0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */ - - OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */ - OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */ - OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */ - OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */ - 0, 0, 0, 0, /* 
CRPOS{STAR,PLUS,QUERY,RANGE} */ - - 0, 0, 0, /* CLASS, NCLASS, XCLASS */ - 0, 0, /* REF, REFI */ - 0, 0, /* DNREF, DNREFI */ - 0, 0 /* RECURSE, CALLOUT */ -}; - - -#ifdef DEBUG_SHOW_PARSED -/************************************************* -* Show the parsed pattern for debugging * -*************************************************/ - -/* For debugging the pre-scan, this code, which outputs the parsed data vector, -can be enabled. */ - -static void show_parsed(compile_block *cb) -{ -uint32_t *pptr = cb->parsed_pattern; - -for (;;) - { - int max, min; - PCRE2_SIZE offset; - uint32_t i; - uint32_t length; - uint32_t meta_arg = META_DATA(*pptr); - - fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr); - - if (*pptr < META_END) - { - if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", *pptr); - pptr++; - } - - else switch (META_CODE(*pptr++)) - { - default: - fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n"); - return; - - case META_END: - fprintf(stderr, "META_END\n"); - return; - - case META_CAPTURE: - fprintf(stderr, "META_CAPTURE %d", meta_arg); - break; - - case META_RECURSE: - GETOFFSET(offset, pptr); - fprintf(stderr, "META_RECURSE %d %zd", meta_arg, offset); - break; - - case META_BACKREF: - if (meta_arg < 10) - offset = cb->small_ref_offset[meta_arg]; - else - GETOFFSET(offset, pptr); - fprintf(stderr, "META_BACKREF %d %zd", meta_arg, offset); - break; - - case META_ESCAPE: - if (meta_arg == ESC_P || meta_arg == ESC_p) - { - uint32_t ptype = *pptr >> 16; - uint32_t pvalue = *pptr++ & 0xffff; - fprintf(stderr, "META \\%c %d %d", (meta_arg == ESC_P)? 'P':'p', - ptype, pvalue); - } - else - { - uint32_t cc; - /* There's just one escape we might have here that isn't negated in the - escapes table. 
*/ - if (meta_arg == ESC_g) cc = CHAR_g; - else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++) - { - if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break; - } - if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK; - fprintf(stderr, "META \\%c", cc); - } - break; - - case META_MINMAX: - min = *pptr++; - max = *pptr++; - if (max != REPEAT_UNLIMITED) - fprintf(stderr, "META {%d,%d}", min, max); - else - fprintf(stderr, "META {%d,}", min); - break; - - case META_MINMAX_QUERY: - min = *pptr++; - max = *pptr++; - if (max != REPEAT_UNLIMITED) - fprintf(stderr, "META {%d,%d}?", min, max); - else - fprintf(stderr, "META {%d,}?", min); - break; - - case META_MINMAX_PLUS: - min = *pptr++; - max = *pptr++; - if (max != REPEAT_UNLIMITED) - fprintf(stderr, "META {%d,%d}+", min, max); - else - fprintf(stderr, "META {%d,}+", min); - break; - - case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break; - case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break; - case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break; - case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break; - case META_DOT: fprintf(stderr, "META_DOT"); break; - case META_ASTERISK: fprintf(stderr, "META *"); break; - case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break; - case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break; - case META_PLUS: fprintf(stderr, "META +"); break; - case META_PLUS_QUERY: fprintf(stderr, "META +?"); break; - case META_PLUS_PLUS: fprintf(stderr, "META ++"); break; - case META_QUERY: fprintf(stderr, "META ?"); break; - case META_QUERY_QUERY: fprintf(stderr, "META ??"); break; - case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break; - - case META_ATOMIC: fprintf(stderr, "META (?>"); break; - case META_NOCAPTURE: fprintf(stderr, "META (?:"); break; - case META_LOOKAHEAD: fprintf(stderr, "META (?="); break; - case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break; - case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break; 
- case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break; - case META_KET: fprintf(stderr, "META )"); break; - case META_ALT: fprintf(stderr, "META | %d", meta_arg); break; - - case META_CLASS: fprintf(stderr, "META ["); break; - case META_CLASS_NOT: fprintf(stderr, "META [^"); break; - case META_CLASS_END: fprintf(stderr, "META ]"); break; - case META_CLASS_EMPTY: fprintf(stderr, "META []"); break; - case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break; - - case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break; - case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break; - - case META_POSIX: fprintf(stderr, "META_POSIX %d", *pptr++); break; - case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %d", *pptr++); break; - - case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break; - case META_FAIL: fprintf(stderr, "META (*FAIL)"); break; - case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break; - case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break; - case META_SKIP: fprintf(stderr, "META (*SKIP)"); break; - case META_THEN: fprintf(stderr, "META (*THEN)"); break; - - case META_OPTIONS: fprintf(stderr, "META_OPTIONS 0x%02x", *pptr++); break; - - case META_LOOKBEHIND: - fprintf(stderr, "META (?<= %d offset=", meta_arg); - GETOFFSET(offset, pptr); - fprintf(stderr, "%zd", offset); - break; - - case META_LOOKBEHIND_NA: - fprintf(stderr, "META (*naplb: %d offset=", meta_arg); - GETOFFSET(offset, pptr); - fprintf(stderr, "%zd", offset); - break; - - case META_LOOKBEHINDNOT: - fprintf(stderr, "META (?="); - fprintf(stderr, "%d.", *pptr++); - fprintf(stderr, "%d)", *pptr++); - break; - - case META_COND_NAME: - fprintf(stderr, "META (?() length=%d offset=", *pptr++); - GETOFFSET(offset, pptr); - fprintf(stderr, "%zd", offset); - break; - - case META_COND_RNAME: - fprintf(stderr, "META (?(R&name) length=%d offset=", *pptr++); - GETOFFSET(offset, pptr); - fprintf(stderr, "%zd", offset); - break; - - /* This is kept as a name, 
because it might be. */ - - case META_COND_RNUMBER: - fprintf(stderr, "META (?(Rnumber) length=%d offset=", *pptr++); - GETOFFSET(offset, pptr); - fprintf(stderr, "%zd", offset); - break; - - case META_MARK: - fprintf(stderr, "META (*MARK:"); - goto SHOWARG; - - case META_COMMIT_ARG: - fprintf(stderr, "META (*COMMIT:"); - goto SHOWARG; - - case META_PRUNE_ARG: - fprintf(stderr, "META (*PRUNE:"); - goto SHOWARG; - - case META_SKIP_ARG: - fprintf(stderr, "META (*SKIP:"); - goto SHOWARG; - - case META_THEN_ARG: - fprintf(stderr, "META (*THEN:"); - SHOWARG: - length = *pptr++; - for (i = 0; i < length; i++) - { - uint32_t cc = *pptr++; - if (cc > 32 && cc < 128) fprintf(stderr, "%c", cc); - else fprintf(stderr, "\\x{%x}", cc); - } - fprintf(stderr, ") length=%u", length); - break; - } - fprintf(stderr, "\n"); - } -return; -} -#endif /* DEBUG_SHOW_PARSED */ - - - -/************************************************* -* Copy compiled code * -*************************************************/ - -/* Compiled JIT code cannot be copied, so the new compiled block has no -associated JIT data. */ - -PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION -pcre2_code_copy(const pcre2_code *code) -{ -PCRE2_SIZE* ref_count; -pcre2_code *newcode; - -if (code == NULL) return NULL; -newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data); -if (newcode == NULL) return NULL; -memcpy(newcode, code, code->blocksize); -newcode->executable_jit = NULL; - -/* If the code is one that has been deserialized, increment the reference count -in the decoded tables. */ - -if ((code->flags & PCRE2_DEREF_TABLES) != 0) - { - ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH); - (*ref_count)++; - } - -return newcode; -} - - - -/************************************************* -* Copy compiled code and character tables * -*************************************************/ - -/* Compiled JIT code cannot be copied, so the new compiled block has no -associated JIT data. 
This version of code_copy also makes a separate copy of -the character tables. */ - -PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION -pcre2_code_copy_with_tables(const pcre2_code *code) -{ -PCRE2_SIZE* ref_count; -pcre2_code *newcode; -uint8_t *newtables; - -if (code == NULL) return NULL; -newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data); -if (newcode == NULL) return NULL; -memcpy(newcode, code, code->blocksize); -newcode->executable_jit = NULL; - -newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE), - code->memctl.memory_data); -if (newtables == NULL) - { - code->memctl.free((void *)newcode, code->memctl.memory_data); - return NULL; - } -memcpy(newtables, code->tables, TABLES_LENGTH); -ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH); -*ref_count = 1; - -newcode->tables = newtables; -newcode->flags |= PCRE2_DEREF_TABLES; -return newcode; -} - - - -/************************************************* -* Free compiled code * -*************************************************/ - -PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION -pcre2_code_free(pcre2_code *code) -{ -PCRE2_SIZE* ref_count; - -if (code != NULL) - { - if (code->executable_jit != NULL) - PRIV(jit_free)(code->executable_jit, &code->memctl); - - if ((code->flags & PCRE2_DEREF_TABLES) != 0) - { - /* Decoded tables belong to the codes after deserialization, and they must - be freed when there are no more references to them. The *ref_count should - always be > 0. */ - - ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH); - if (*ref_count > 0) - { - (*ref_count)--; - if (*ref_count == 0) - code->memctl.free((void *)code->tables, code->memctl.memory_data); - } - } - - code->memctl.free(code, code->memctl.memory_data); - } -} - - - -/************************************************* -* Read a number, possibly signed * -*************************************************/ - -/* This function is used to read numbers in the pattern. 
The initial pointer -must be the sign or first digit of the number. When relative values (introduced -by + or -) are allowed, they are relative group numbers, and the result must be -greater than zero. - -Arguments: - ptrptr points to the character pointer variable - ptrend points to the end of the input string - allow_sign if < 0, sign not allowed; if >= 0, sign is relative to this - max_value the largest number allowed - max_error the error to give for an over-large number - intptr where to put the result - errcodeptr where to put an error code - -Returns: TRUE - a number was read - FALSE - errorcode == 0 => no number was found - errorcode != 0 => an error occurred -*/ - -static BOOL -read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign, - uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr) -{ -int sign = 0; -uint32_t n = 0; -PCRE2_SPTR ptr = *ptrptr; -BOOL yield = FALSE; - -*errorcodeptr = 0; - -if (allow_sign >= 0 && ptr < ptrend) - { - if (*ptr == CHAR_PLUS) - { - sign = +1; - max_value -= allow_sign; - ptr++; - } - else if (*ptr == CHAR_MINUS) - { - sign = -1; - ptr++; - } - } - -if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE; -while (ptr < ptrend && IS_DIGIT(*ptr)) - { - n = n * 10 + *ptr++ - CHAR_0; - if (n > max_value) - { - *errorcodeptr = max_error; - goto EXIT; - } - } - -if (allow_sign >= 0 && sign != 0) - { - if (n == 0) - { - *errorcodeptr = ERR26; /* +0 and -0 are not allowed */ - goto EXIT; - } - - if (sign > 0) n += allow_sign; - else if ((int)n > allow_sign) - { - *errorcodeptr = ERR15; /* Non-existent subpattern */ - goto EXIT; - } - else n = allow_sign + 1 - n; - } - -yield = TRUE; - -EXIT: -*intptr = n; -*ptrptr = ptr; -return yield; -} - - - -/************************************************* -* Read repeat counts * -*************************************************/ - -/* Read an item of the form {n,m} and return the values if non-NULL pointers -are supplied. 
Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a -larger value is used for "unlimited". We have to use signed arguments for -read_number() because it is capable of returning a signed value. - -Arguments: - ptrptr points to pointer to character after'{' - ptrend pointer to end of input - minp if not NULL, pointer to int for min - maxp if not NULL, pointer to int for max (-1 if no max) - returned as -1 if no max - errorcodeptr points to error code variable - -Returns: FALSE if not a repeat quantifier, errorcode set zero - FALSE on error, with errorcode set non-zero - TRUE on success, with pointer updated to point after '}' -*/ - -static BOOL -read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp, - uint32_t *maxp, int *errorcodeptr) -{ -PCRE2_SPTR p = *ptrptr; -BOOL yield = FALSE; -int32_t min = 0; -int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */ - -/* NB read_number() initializes the error code to zero. The only error is for a -number that is too big. */ - -if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr)) - goto EXIT; - -if (p >= ptrend) goto EXIT; - -if (*p == CHAR_RIGHT_CURLY_BRACKET) - { - p++; - max = min; - } - -else - { - if (*p++ != CHAR_COMMA || p >= ptrend) goto EXIT; - if (*p != CHAR_RIGHT_CURLY_BRACKET) - { - if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, - errorcodeptr) || p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET) - goto EXIT; - if (max < min) - { - *errorcodeptr = ERR4; - goto EXIT; - } - } - p++; - } - -yield = TRUE; -if (minp != NULL) *minp = (uint32_t)min; -if (maxp != NULL) *maxp = (uint32_t)max; - -/* Update the pattern pointer on success, or after an error, but not when -the result is "not a repeat quantifier". 
*/ - -EXIT: -if (yield || *errorcodeptr != 0) *ptrptr = p; -return yield; -} - - - -/************************************************* -* Handle escapes * -*************************************************/ - -/* This function is called when a \ has been encountered. It either returns a -positive value for a simple escape such as \d, or 0 for a data character, which -is placed in chptr. A backreference to group n is returned as negative n. On -entry, ptr is pointing at the character after \. On exit, it points after the -final code unit of the escape sequence. - -This function is also called from pcre2_substitute() to handle escape sequences -in replacement strings. In this case, the cb argument is NULL, and in the case -of escapes that have further processing, only sequences that define a data -character are recognised. The isclass argument is not relevant; the options -argument is the final value of the compiled pattern's options. - -Arguments: - ptrptr points to the input position pointer - ptrend points to the end of the input - chptr points to a returned data character - errorcodeptr points to the errorcode variable (containing zero) - options the current options bits - isclass TRUE if inside a character class - cb compile data block or NULL when called from pcre2_substitute() - -Returns: zero => a data character - positive => a special escape sequence - negative => a numerical back reference - on error, errorcodeptr is set non-zero -*/ - -int -PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr, - int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass, - compile_block *cb) -{ -BOOL utf = (options & PCRE2_UTF) != 0; -PCRE2_SPTR ptr = *ptrptr; -uint32_t c, cc; -int escape = 0; -int i; - -/* If backslash is at the end of the string, it's an error. 
*/ - -if (ptr >= ptrend) - { - *errorcodeptr = ERR1; - return 0; - } - -GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ -*errorcodeptr = 0; /* Be optimistic */ - -/* Non-alphanumerics are literals, so we just leave the value in c. An initial -value test saves a memory lookup for code points outside the alphanumeric -range. */ - -if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */ - -/* Otherwise, do a table lookup. Non-zero values need little processing here. A -positive value is a literal value for something like \n. A negative value is -the negation of one of the ESC_ macros that is passed back for handling by the -calling function. Some extra checking is needed for \N because only \N{U+dddd} -is supported. If the value is zero, further processing is handled below. */ - -else if ((i = escapes[c - ESCAPES_FIRST]) != 0) - { - if (i > 0) - { - c = (uint32_t)i; - if (c == CHAR_CR && (extra_options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0) - c = CHAR_LF; - } - else /* Negative table entry */ - { - escape = -i; /* Else return a special escape */ - if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X)) - cb->external_flags |= PCRE2_HASBKPORX; /* Note \P, \p, or \X */ - - /* Perl supports \N{name} for character names and \N{U+dddd} for numerical - Unicode code points, as well as plain \N for "not newline". PCRE does not - support \N{name}. However, it does support quantification such as \N{2,3}, - so if \N{ is not followed by U+dddd we check for a quantifier. */ - - if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET) - { - PCRE2_SPTR p = ptr + 1; - - /* \N{U+ can be handled by the \x{ code. However, this construction is - not valid in EBCDIC environments because it specifies a Unicode - character, not a codepoint in the local code. For example \N{U+0041} - must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode - casing semantics for the entire pattern, so allow it only in UTF (i.e. 
- Unicode) mode. */ - - if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS) - { -#ifdef EBCDIC - *errorcodeptr = ERR93; -#else - if (utf) - { - ptr = p + 1; - escape = 0; /* Not a fancy escape after all */ - goto COME_FROM_NU; - } - else *errorcodeptr = ERR93; -#endif - } - - /* Give an error if what follows is not a quantifier, but don't override - an error set by the quantifier reader (e.g. number overflow). */ - - else - { - if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) && - *errorcodeptr == 0) - *errorcodeptr = ERR37; - } - } - } - } - -/* Escapes that need further processing, including those that are unknown, have -a zero entry in the lookup table. When called from pcre2_substitute(), only \c, -\o, and \x are recognized (\u and \U can never appear as they are used for case -forcing). */ - -else - { - int s; - PCRE2_SPTR oldptr; - BOOL overflow; - BOOL alt_bsux = - ((options & PCRE2_ALT_BSUX) | (extra_options & PCRE2_EXTRA_ALT_BSUX)) != 0; - - /* Filter calls from pcre2_substitute(). */ - - if (cb == NULL) - { - if (c != CHAR_c && c != CHAR_o && c != CHAR_x) - { - *errorcodeptr = ERR3; - return 0; - } - alt_bsux = FALSE; /* Do not modify \x handling */ - } - - switch (c) - { - /* A number of Perl escapes are not handled by PCRE. We give an explicit - error. */ - - case CHAR_F: - case CHAR_l: - case CHAR_L: - *errorcodeptr = ERR37; - break; - - /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX - is set. Otherwise, \u must be followed by exactly four hex digits or, if - PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces. - Otherwise it is a lowercase u letter. This gives some compatibility with - ECMAScript (aka JavaScript). 
*/ - - case CHAR_u: - if (!alt_bsux) *errorcodeptr = ERR37; else - { - uint32_t xc; - - if (ptr >= ptrend) break; - if (*ptr == CHAR_LEFT_CURLY_BRACKET && - (extra_options & PCRE2_EXTRA_ALT_BSUX) != 0) - { - PCRE2_SPTR hptr = ptr + 1; - cc = 0; - - while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff) - { - if ((cc & 0xf0000000) != 0) /* Test for 32-bit overflow */ - { - *errorcodeptr = ERR77; - ptr = hptr; /* Show where */ - break; /* *hptr != } will cause another break below */ - } - cc = (cc << 4) | xc; - hptr++; - } - - if (hptr == ptr + 1 || /* No hex digits */ - hptr >= ptrend || /* Hit end of input */ - *hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */ - break; /* Hex escape not recognized */ - - c = cc; /* Accept the code point */ - ptr = hptr + 1; - } - - else /* Must be exactly 4 hex digits */ - { - if (ptrend - ptr < 4) break; /* Less than 4 chars */ - if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */ - if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ - cc = (cc << 4) | xc; - if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */ - cc = (cc << 4) | xc; - if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */ - c = (cc << 4) | xc; - ptr += 4; - } - - if (utf) - { - if (c > 0x10ffffU) *errorcodeptr = ERR77; - else - if (c >= 0xd800 && c <= 0xdfff && - (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0) - *errorcodeptr = ERR73; - } - else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77; - } - break; - - /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, - in which case it is an upper case letter. */ - - case CHAR_U: - if (!alt_bsux) *errorcodeptr = ERR37; - break; - - /* In a character class, \g is just a literal "g". Outside a character - class, \g must be followed by one of a number of specific things: - - (1) A number, either plain or braced. If positive, it is an absolute - backreference. If negative, it is a relative backreference. This is a Perl - 5.10 feature. 
- - (2) Perl 5.10 also supports \g{name} as a reference to a named group. This - is part of Perl's movement towards a unified syntax for back references. As - this is synonymous with \k{name}, we fudge it up by pretending it really - was \k{name}. - - (3) For Oniguruma compatibility we also support \g followed by a name or a - number either in angle brackets or in single quotes. However, these are - (possibly recursive) subroutine calls, _not_ backreferences. We return - the ESC_g code. - - Summary: Return a negative number for a numerical back reference, ESC_k for - a named back reference, and ESC_g for a named or numbered subroutine call. - */ - - case CHAR_g: - if (isclass) break; - - if (ptr >= ptrend) - { - *errorcodeptr = ERR57; - break; - } - - if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE) - { - escape = ESC_g; - break; - } - - /* If there is a brace delimiter, try to read a numerical reference. If - there isn't one, assume we have a name and treat it as \k. */ - - if (*ptr == CHAR_LEFT_CURLY_BRACKET) - { - PCRE2_SPTR p = ptr + 1; - if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s, - errorcodeptr)) - { - if (*errorcodeptr == 0) escape = ESC_k; /* No number found */ - break; - } - if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET) - { - *errorcodeptr = ERR57; - break; - } - ptr = p + 1; - } - - /* Read an undelimited number */ - - else - { - if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s, - errorcodeptr)) - { - if (*errorcodeptr == 0) *errorcodeptr = ERR57; /* No number found */ - break; - } - } - - if (s <= 0) - { - *errorcodeptr = ERR15; - break; - } - - escape = -s; - break; - - /* The handling of escape sequences consisting of a string of digits - starting with one that is not zero is not straightforward. Perl has changed - over the years. Nowadays \g{} for backreferences and \o{} for octal are - recommended to avoid the ambiguities in the old syntax. 
- - Outside a character class, the digits are read as a decimal number. If the - number is less than 10, or if there are that many previous extracting left - brackets, it is a back reference. Otherwise, up to three octal digits are - read to form an escaped character code. Thus \123 is likely to be octal 123 - (cf \0123, which is octal 012 followed by the literal 3). - - Inside a character class, \ followed by a digit is always either a literal - 8 or 9 or an octal number. */ - - case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5: - case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: - - if (!isclass) - { - oldptr = ptr; - ptr--; /* Back to the digit */ - if (!read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, ERR61, &s, - errorcodeptr)) - break; - - /* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x - are octal escapes if there are not that many previous captures. */ - - if (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount) - { - if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61; - else escape = -s; /* Indicates a back reference */ - break; - } - ptr = oldptr; /* Put the pointer back and fall through */ - } - - /* Handle a digit following \ when the number is not a back reference, or - we are within a character class. If the first digit is 8 or 9, Perl used to - generate a binary zero and then treat the digit as a following literal. At - least by Perl 5.18 this changed so as not to insert the binary zero. */ - - if (c >= CHAR_8) break; - - /* Fall through */ - - /* \0 always starts an octal number, but we may drop through to here with a - larger first octal digit. The original code used just to take the least - significant 8 bits of octal numbers (I think this is what early Perls used - to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode, - but no more than 3 octal digits. 
*/ - - case CHAR_0: - c -= CHAR_0; - while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) - c = c * 8 + *ptr++ - CHAR_0; -#if PCRE2_CODE_UNIT_WIDTH == 8 - if (!utf && c > 0xff) *errorcodeptr = ERR51; -#endif - break; - - /* \o is a relatively new Perl feature, supporting a more general way of - specifying character codes in octal. The only supported form is \o{ddd}. */ - - case CHAR_o: - if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET) - { - ptr--; - *errorcodeptr = ERR55; - } - else if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET) - *errorcodeptr = ERR78; - else - { - c = 0; - overflow = FALSE; - while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) - { - cc = *ptr++; - if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */ -#if PCRE2_CODE_UNIT_WIDTH == 32 - if (c >= 0x20000000l) { overflow = TRUE; break; } -#endif - c = (c << 3) + (cc - CHAR_0); -#if PCRE2_CODE_UNIT_WIDTH == 8 - if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; } -#elif PCRE2_CODE_UNIT_WIDTH == 16 - if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; } -#elif PCRE2_CODE_UNIT_WIDTH == 32 - if (utf && c > 0x10ffffU) { overflow = TRUE; break; } -#endif - } - if (overflow) - { - while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++; - *errorcodeptr = ERR34; - } - else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET) - { - if (utf && c >= 0xd800 && c <= 0xdfff && - (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0) - { - ptr--; - *errorcodeptr = ERR73; - } - } - else - { - ptr--; - *errorcodeptr = ERR64; - } - } - break; - - /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed - by two hexadecimal digits. Otherwise it is a lowercase x letter. 
*/ - - case CHAR_x: - if (alt_bsux) - { - uint32_t xc; - if (ptrend - ptr < 2) break; /* Less than 2 characters */ - if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */ - if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ - c = (cc << 4) | xc; - ptr += 2; - } - - /* Handle \x in Perl's style. \x{ddd} is a character code which can be - greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex - digits. If not, { used to be treated as a data character. However, Perl - seems to read hex digits up to the first non-such, and ignore the rest, so - that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE - now gives an error. */ - - else - { - if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET) - { -#ifndef EBCDIC - COME_FROM_NU: -#endif - if (++ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET) - { - *errorcodeptr = ERR78; - break; - } - c = 0; - overflow = FALSE; - - while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff) - { - ptr++; - if (c == 0 && cc == 0) continue; /* Leading zeroes */ -#if PCRE2_CODE_UNIT_WIDTH == 32 - if (c >= 0x10000000l) { overflow = TRUE; break; } -#endif - c = (c << 4) | cc; - if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR)) - { - overflow = TRUE; - break; - } - } - - if (overflow) - { - while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++; - *errorcodeptr = ERR34; - } - else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET) - { - if (utf && c >= 0xd800 && c <= 0xdfff && - (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0) - { - ptr--; - *errorcodeptr = ERR73; - } - } - - /* If the sequence of hex digits does not end with '}', give an error. - We used just to recognize this construct and fall through to the normal - \x handling, but nowadays Perl gives an error, which seems much more - sensible, so we do too. 
*/ - - else - { - ptr--; - *errorcodeptr = ERR67; - } - } /* End of \x{} processing */ - - /* Read a up to two hex digits after \x */ - - else - { - c = 0; - if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */ - ptr++; - c = cc; - if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */ - ptr++; - c = (c << 4) | cc; - } /* End of \xdd handling */ - } /* End of Perl-style \x handling */ - break; - - /* The handling of \c is different in ASCII and EBCDIC environments. In an - ASCII (or Unicode) environment, an error is given if the character - following \c is not a printable ASCII character. Otherwise, the following - character is upper-cased if it is a letter, and after that the 0x40 bit is - flipped. The result is the value of the escape. - - In an EBCDIC environment the handling of \c is compatible with the - specification in the perlebcdic document. The following character must be - a letter or one of small number of special characters. These provide a - means of defining the character values 0-31. - - For testing the EBCDIC handling of \c in an ASCII environment, recognize - the EBCDIC value of 'c' explicitly. */ - -#if defined EBCDIC && 'a' != 0x81 - case 0x83: -#else - case CHAR_c: -#endif - if (ptr >= ptrend) - { - *errorcodeptr = ERR2; - break; - } - c = *ptr; - if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c); - - /* Handle \c in an ASCII/Unicode environment. */ - -#ifndef EBCDIC /* ASCII/UTF-8 coding */ - if (c < 32 || c > 126) /* Excludes all non-printable ASCII */ - { - *errorcodeptr = ERR68; - break; - } - c ^= 0x40; - - /* Handle \c in an EBCDIC environment. The special case \c? is converted to - 255 (0xff) or 95 (0x5f) if other characters suggest we are using the - POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.) - The other valid sequences correspond to a list of specific characters. */ - -#else - if (c == CHAR_QUESTION_MARK) - c = ('\\' == 188 && '`' == 74)? 
0x5f : 0xff; - else - { - for (i = 0; i < 32; i++) - { - if (c == ebcdic_escape_c[i]) break; - } - if (i < 32) c = i; else *errorcodeptr = ERR68; - } -#endif /* EBCDIC */ - - ptr++; - break; - - /* Any other alphanumeric following \ is an error. Perl gives an error only - if in warning mode, but PCRE doesn't have a warning mode. */ - - default: - *errorcodeptr = ERR3; - *ptrptr = ptr - 1; /* Point to the character at fault */ - return 0; - } - } - -/* Set the pointer to the next character before returning. */ - -*ptrptr = ptr; -*chptr = c; -return escape; -} - - - -#ifdef SUPPORT_UNICODE -/************************************************* -* Handle \P and \p * -*************************************************/ - -/* This function is called after \P or \p has been encountered, provided that -PCRE2 is compiled with support for UTF and Unicode properties. On entry, the -contents of ptrptr are pointing after the P or p. On exit, it is left pointing -after the final code unit of the escape sequence. - -Arguments: - ptrptr the pattern position pointer - negptr a boolean that is set TRUE for negation else FALSE - ptypeptr an unsigned int that is set to the type value - pdataptr an unsigned int that is set to the detailed property value - errorcodeptr the error code variable - cb the compile data - -Returns: TRUE if the type value was found, or FALSE for an invalid type -*/ - -static BOOL -get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr, - uint16_t *pdataptr, int *errorcodeptr, compile_block *cb) -{ -PCRE2_UCHAR c; -PCRE2_SIZE i, bot, top; -PCRE2_SPTR ptr = *ptrptr; -PCRE2_UCHAR name[32]; - -if (ptr >= cb->end_pattern) goto ERROR_RETURN; -c = *ptr++; -*negptr = FALSE; - -/* \P or \p can be followed by a name in {}, optionally preceded by ^ for -negation. 
*/ - -if (c == CHAR_LEFT_CURLY_BRACKET) - { - if (ptr >= cb->end_pattern) goto ERROR_RETURN; - if (*ptr == CHAR_CIRCUMFLEX_ACCENT) - { - *negptr = TRUE; - ptr++; - } - for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++) - { - if (ptr >= cb->end_pattern) goto ERROR_RETURN; - c = *ptr++; - if (c == CHAR_NUL) goto ERROR_RETURN; - if (c == CHAR_RIGHT_CURLY_BRACKET) break; - name[i] = c; - } - if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN; - name[i] = 0; - } - -/* Otherwise there is just one following character, which must be an ASCII -letter. */ - -else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0) - { - name[0] = c; - name[1] = 0; - } -else goto ERROR_RETURN; - -*ptrptr = ptr; - -/* Search for a recognized property name using binary chop. */ - -bot = 0; -top = PRIV(utt_size); - -while (bot < top) - { - int r; - i = (bot + top) >> 1; - r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset); - if (r == 0) - { - *ptypeptr = PRIV(utt)[i].type; - *pdataptr = PRIV(utt)[i].value; - return TRUE; - } - if (r > 0) bot = i + 1; else top = i; - } -*errorcodeptr = ERR47; /* Unrecognized name */ -return FALSE; - -ERROR_RETURN: /* Malformed \P or \p */ -*errorcodeptr = ERR46; -*ptrptr = ptr; -return FALSE; -} -#endif - - - -/************************************************* -* Check for POSIX class syntax * -*************************************************/ - -/* This function is called when the sequence "[:" or "[." or "[=" is -encountered in a character class. It checks whether this is followed by a -sequence of characters terminated by a matching ":]" or ".]" or "=]". If we -reach an unescaped ']' without the special preceding character, return FALSE. - -Originally, this function only recognized a sequence of letters between the -terminators, but it seems that Perl recognizes any sequence of characters, -though of course unknown POSIX names are subsequently rejected. 
Perl gives an -"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE -didn't consider this to be a POSIX class. Likewise for [:1234:]. - -The problem in trying to be exactly like Perl is in the handling of escapes. We -have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX -class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code -below handles the special cases \\ and \], but does not try to do any other -escape processing. This makes it different from Perl for cases such as -[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does -not recognize "l\ower". This is a lesser evil than not diagnosing bad classes -when Perl does, I think. - -A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not. -It seems that the appearance of a nested POSIX class supersedes an apparent -external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or -a digit. This is handled by returning FALSE if the start of a new group with -the same terminator is encountered, since the next closing sequence must close -the nested group, not the outer one. - -In Perl, unescaped square brackets may also appear as part of class names. For -example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for -[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not -seem right at all. PCRE does not allow closing square brackets in POSIX class -names. - -Arguments: - ptr pointer to the character after the initial [ (colon, dot, equals) - ptrend pointer to the end of the pattern - endptr where to return a pointer to the terminating ':', '.', or '=' - -Returns: TRUE or FALSE -*/ - -static BOOL -check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr) -{ -PCRE2_UCHAR terminator; /* Don't combine these lines; the Solaris cc */ -terminator = *ptr++; /* compiler warns about "non-constant" initializer. 
*/ - -for (; ptrend - ptr >= 2; ptr++) - { - if (*ptr == CHAR_BACKSLASH && - (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH)) - ptr++; - - else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) || - *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE; - - else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) - { - *endptr = ptr; - return TRUE; - } - } - -return FALSE; -} - - - -/************************************************* -* Check POSIX class name * -*************************************************/ - -/* This function is called to check the name given in a POSIX-style class entry -such as [:alnum:]. - -Arguments: - ptr points to the first letter - len the length of the name - -Returns: a value representing the name, or -1 if unknown -*/ - -static int -check_posix_name(PCRE2_SPTR ptr, int len) -{ -const char *pn = posix_names; -int yield = 0; -while (posix_name_lengths[yield] != 0) - { - if (len == posix_name_lengths[yield] && - PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield; - pn += posix_name_lengths[yield] + 1; - yield++; - } -return -1; -} - - - -/************************************************* -* Read a subpattern or VERB name * -*************************************************/ - -/* This function is called from parse_regex() below whenever it needs to read -the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial -pointer must be to the character before the name. If that character is '*' we -are reading a verb or alpha assertion name. The pointer is updated to point -after the name, for a VERB or alpha assertion name, or after tha name's -terminator for a subpattern name. Returning both the offset and the name -pointer is redundant information, but some callers use one and some the other, -so it is simplest just to return both. 
- -Arguments: - ptrptr points to the character pointer variable - ptrend points to the end of the input string - utf true if the input is UTF-encoded - terminator the terminator of a subpattern name must be this - offsetptr where to put the offset from the start of the pattern - nameptr where to put a pointer to the name in the input - namelenptr where to put the length of the name - errcodeptr where to put an error code - cb pointer to the compile data block - -Returns: TRUE if a name was read - FALSE otherwise, with error code set -*/ - -static BOOL -read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator, - PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr, - int *errorcodeptr, compile_block *cb) -{ -PCRE2_SPTR ptr = *ptrptr; -BOOL is_group = (*ptr != CHAR_ASTERISK); - -if (++ptr >= ptrend) /* No characters in name */ - { - *errorcodeptr = is_group? ERR62: /* Subpattern name expected */ - ERR60; /* Verb not recognized or malformed */ - goto FAILED; - } - -*nameptr = ptr; -*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern); - -/* In UTF mode, a group name may contain letters and decimal digits as defined -by Unicode properties, and underscores, but must not start with a digit. */ - -#ifdef SUPPORT_UNICODE -if (utf && is_group) - { - uint32_t c, type; - - GETCHAR(c, ptr); - type = UCD_CHARTYPE(c); - - if (type == ucp_Nd) - { - *errorcodeptr = ERR44; - goto FAILED; - } - - for(;;) - { - if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L && - c != CHAR_UNDERSCORE) break; - ptr++; - FORWARDCHARTEST(ptr, ptrend); - if (ptr >= ptrend) break; - GETCHAR(c, ptr); - type = UCD_CHARTYPE(c); - } - } -else -#else -(void)utf; /* Avoid compiler warning */ -#endif /* SUPPORT_UNICODE */ - -/* Handle non-group names and group names in non-UTF modes. A group name must -not start with a digit. If either of the others start with a digit it just -won't be recognized. 
*/ - - { - if (is_group && IS_DIGIT(*ptr)) - { - *errorcodeptr = ERR44; - goto FAILED; - } - - while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0) - { - ptr++; - } - } - -/* Check name length */ - -if (ptr > *nameptr + MAX_NAME_SIZE) - { - *errorcodeptr = ERR48; - goto FAILED; - } -*namelenptr = (uint32_t)(ptr - *nameptr); - -/* Subpattern names must not be empty, and their terminator is checked here. -(What follows a verb or alpha assertion name is checked separately.) */ - -if (is_group) - { - if (ptr == *nameptr) - { - *errorcodeptr = ERR62; /* Subpattern name expected */ - goto FAILED; - } - if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator) - { - *errorcodeptr = ERR42; - goto FAILED; - } - ptr++; - } - -*ptrptr = ptr; -return TRUE; - -FAILED: -*ptrptr = ptr; -return FALSE; -} - - - -/************************************************* -* Manage callouts at start of cycle * -*************************************************/ - -/* At the start of a new item in parse_regex() we are able to record the -details of the previous item in a prior callout, and also to set up an -automatic callout if enabled. Avoid having two adjacent automatic callouts, -which would otherwise happen for items such as \Q that contribute nothing to -the parsed pattern. - -Arguments: - ptr current pattern pointer - pcalloutptr points to a pointer to previous callout, or NULL - auto_callout TRUE if auto_callouts are enabled - parsed_pattern the parsed pattern pointer - cb compile block - -Returns: possibly updated parsed_pattern pointer. 
-*/ - -static uint32_t * -manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout, - uint32_t *parsed_pattern, compile_block *cb) -{ -uint32_t *previous_callout = *pcalloutptr; - -if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr - - cb->start_pattern - (PCRE2_SIZE)previous_callout[1]); - -if (!auto_callout) previous_callout = NULL; else - { - if (previous_callout == NULL || - previous_callout != parsed_pattern - 4 || - previous_callout[3] != 255) - { - previous_callout = parsed_pattern; /* Set up new automatic callout */ - parsed_pattern += 4; - previous_callout[0] = META_CALLOUT_NUMBER; - previous_callout[2] = 0; - previous_callout[3] = 255; - } - previous_callout[1] = (uint32_t)(ptr - cb->start_pattern); - } - -*pcalloutptr = previous_callout; -return parsed_pattern; -} - - - -/************************************************* -* Parse regex and identify named groups * -*************************************************/ - -/* This function is called first of all. It scans the pattern and does two -things: (1) It identifies capturing groups and makes a table of named capturing -groups so that information about them is fully available to both the compiling -scans. (2) It writes a parsed version of the pattern with comments omitted and -escapes processed into the parsed_pattern vector. - -Arguments: - ptr points to the start of the pattern - options compiling dynamic options (may change during the scan) - has_lookbehind points to a boolean, set TRUE if a lookbehind is found - cb pointer to the compile data block - -Returns: zero on success or a non-zero error code, with the - error offset placed in the cb field -*/ - -/* A structure and some flags for dealing with nested groups. 
*/ - -typedef struct nest_save { - uint16_t nest_depth; - uint16_t reset_group; - uint16_t max_group; - uint16_t flags; - uint32_t options; -} nest_save; - -#define NSF_RESET 0x0001u -#define NSF_CONDASSERT 0x0002u -#define NSF_ATOMICSR 0x0004u - -/* Options that are changeable within the pattern must be tracked during -parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing, -but all must be tracked so that META_OPTIONS items set the correct values for -the main compiling phase. */ - -#define PARSE_TRACKED_OPTIONS (PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_DUPNAMES| \ - PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \ - PCRE2_UNGREEDY) - -/* States used for analyzing ranges in character classes. The two OK values -must be last. */ - -enum { RANGE_NO, RANGE_STARTED, RANGE_OK_ESCAPED, RANGE_OK_LITERAL }; - -/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates -the storing of literal values in the main parsed pattern, where they can always -be quantified. */ - -#if PCRE2_CODE_UNIT_WIDTH == 32 -#define PARSED_LITERAL(c, p) \ - { \ - if (c >= META_END) *p++ = META_BIGVALUE; \ - *p++ = c; \ - okquantifier = TRUE; \ - } -#else -#define PARSED_LITERAL(c, p) *p++ = c; okquantifier = TRUE; -#endif - -/* Here's the actual function. 
*/ - -static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind, - compile_block *cb) -{ -uint32_t c; -uint32_t delimiter; -uint32_t namelen; -uint32_t class_range_state; -uint32_t *verblengthptr = NULL; /* Value avoids compiler warning */ -uint32_t *verbstartptr = NULL; -uint32_t *previous_callout = NULL; -uint32_t *parsed_pattern = cb->parsed_pattern; -uint32_t *parsed_pattern_end = cb->parsed_pattern_end; -uint32_t meta_quantifier = 0; -uint32_t add_after_mark = 0; -uint32_t extra_options = cb->cx->extra_options; -uint16_t nest_depth = 0; -int after_manual_callout = 0; -int expect_cond_assert = 0; -int errorcode = 0; -int escape; -int i; -BOOL inescq = FALSE; -BOOL inverbname = FALSE; -BOOL utf = (options & PCRE2_UTF) != 0; -BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0; -BOOL isdupname; -BOOL negate_class; -BOOL okquantifier = FALSE; -PCRE2_SPTR thisptr; -PCRE2_SPTR name; -PCRE2_SPTR ptrend = cb->end_pattern; -PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */ -named_group *ng; -nest_save *top_nest, *end_nests; - -/* Insert leading items for word and line matching (features provided for the -benefit of pcre2grep). */ - -if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0) - { - *parsed_pattern++ = META_CIRCUMFLEX; - *parsed_pattern++ = META_NOCAPTURE; - } -else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0) - { - *parsed_pattern++ = META_ESCAPE + ESC_b; - *parsed_pattern++ = META_NOCAPTURE; - } - -/* If the pattern is actually a literal string, process it separately to avoid -cluttering up the main loop. 
*/ - -if ((options & PCRE2_LITERAL) != 0) - { - while (ptr < ptrend) - { - if (parsed_pattern >= parsed_pattern_end) - { - errorcode = ERR63; /* Internal error (parsed pattern overflow) */ - goto FAILED; - } - thisptr = ptr; - GETCHARINCTEST(c, ptr); - if (auto_callout) - parsed_pattern = manage_callouts(thisptr, &previous_callout, - auto_callout, parsed_pattern, cb); - PARSED_LITERAL(c, parsed_pattern); - } - goto PARSED_END; - } - -/* Process a real regex which may contain meta-characters. */ - -top_nest = NULL; -end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size); - -/* The size of the nest_save structure might not be a factor of the size of the -workspace. Therefore we must round down end_nests so as to correctly avoid -creating a nest_save that spans the end of the workspace. */ - -end_nests = (nest_save *)((char *)end_nests - - ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save))); - -/* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */ - -if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED; - -/* Now scan the pattern */ - -while (ptr < ptrend) - { - int prev_expect_cond_assert; - uint32_t min_repeat, max_repeat; - uint32_t set, unset, *optset; - uint32_t terminator; - uint32_t prev_meta_quantifier; - BOOL prev_okquantifier; - PCRE2_SPTR tempptr; - PCRE2_SIZE offset; - - if (parsed_pattern >= parsed_pattern_end) - { - errorcode = ERR63; /* Internal error (parsed pattern overflow) */ - goto FAILED; - } - - if (nest_depth > cb->cx->parens_nest_limit) - { - errorcode = ERR19; - goto FAILED; /* Parentheses too deeply nested */ - } - - /* Get next input character, save its position for callout handling. */ - - thisptr = ptr; - GETCHARINCTEST(c, ptr); - - /* Copy quoted literals until \E, allowing for the possibility of automatic - callouts, except when processing a (*VERB) "name". 
*/ - - if (inescq) - { - if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E) - { - inescq = FALSE; - ptr++; /* Skip E */ - } - else - { - if (expect_cond_assert > 0) /* A literal is not allowed if we are */ - { /* expecting a conditional assertion, */ - ptr--; /* but an empty \Q\E sequence is OK. */ - errorcode = ERR28; - goto FAILED; - } - if (inverbname) - { /* Don't use PARSED_LITERAL() because it */ -#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */ - if (c >= META_END) *parsed_pattern++ = META_BIGVALUE; -#endif - *parsed_pattern++ = c; - } - else - { - if (after_manual_callout-- <= 0) - parsed_pattern = manage_callouts(thisptr, &previous_callout, - auto_callout, parsed_pattern, cb); - PARSED_LITERAL(c, parsed_pattern); - } - meta_quantifier = 0; - } - continue; /* Next character */ - } - - /* If we are processing the "name" part of a (*VERB:NAME) item, all - characters up to the closing parenthesis are literals except when - PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q - and \E and escaped characters are allowed (no character types such as \d). If - PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do - this by not entering the special (*VERB:NAME) processing - they are then - picked up below. Note that c is a character, not a code unit, so we must not - use MAX_255 to test its size because MAX_255 tests code units and is assumed - TRUE in 8-bit mode. 
*/ - - if (inverbname && - ( - /* EITHER: not both options set */ - ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) != - (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) || -#ifdef SUPPORT_UNICODE - /* OR: character > 255 AND not Unicode Pattern White Space */ - (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) || -#endif - /* OR: not a # comment or isspace() white space */ - (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0 -#ifdef SUPPORT_UNICODE - /* and not CHAR_NEL when Unicode is supported */ - && c != CHAR_NEL -#endif - ))) - { - PCRE2_SIZE verbnamelength; - - switch(c) - { - default: /* Don't use PARSED_LITERAL() because it */ -#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */ - if (c >= META_END) *parsed_pattern++ = META_BIGVALUE; -#endif - *parsed_pattern++ = c; - break; - - case CHAR_RIGHT_PARENTHESIS: - inverbname = FALSE; - /* This is the length in characters */ - verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1); - /* But the limit on the length is in code units */ - if (ptr - verbnamestart - 1 > (int)MAX_MARK) - { - ptr--; - errorcode = ERR76; - goto FAILED; - } - *verblengthptr = (uint32_t)verbnamelength; - - /* If this name was on a verb such as (*ACCEPT) which does not continue, - a (*MARK) was generated for the name. We now add the original verb as the - next item. */ - - if (add_after_mark != 0) - { - *parsed_pattern++ = add_after_mark; - add_after_mark = 0; - } - break; - - case CHAR_BACKSLASH: - if ((options & PCRE2_ALT_VERBNAMES) != 0) - { - escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options, - cb->cx->extra_options, FALSE, cb); - if (errorcode != 0) goto FAILED; - } - else escape = 0; /* Treat all as literal */ - - switch(escape) - { - case 0: /* Don't use PARSED_LITERAL() because it */ -#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. 
*/ - if (c >= META_END) *parsed_pattern++ = META_BIGVALUE; -#endif - *parsed_pattern++ = c; - break; - - case ESC_Q: - inescq = TRUE; - break; - - case ESC_E: /* Ignore */ - break; - - default: - errorcode = ERR40; /* Invalid in verb name */ - goto FAILED; - } - } - continue; /* Next character in pattern */ - } - - /* Not a verb name character. At this point we must process everything that - must not change the quantification state. This is mainly comments, but we - handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as - A+, as in Perl. An isolated \E is ignored. */ - - if (c == CHAR_BACKSLASH && ptr < ptrend) - { - if (*ptr == CHAR_Q || *ptr == CHAR_E) - { - inescq = *ptr == CHAR_Q; - ptr++; - continue; - } - } - - /* Skip over whitespace and # comments in extended mode. Note that c is a - character, not a code unit, so we must not use MAX_255 to test its size - because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The - whitespace characters are those designated as "Pattern White Space" by - Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is - U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a - subset of space characters that match \h and \v. */ - - if ((options & PCRE2_EXTENDED) != 0) - { - if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue; -#ifdef SUPPORT_UNICODE - if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue; -#endif - if (c == CHAR_NUMBER_SIGN) - { - while (ptr < ptrend) - { - if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */ - { /* IS_NEWLINE sets cb->nllen. 
*/ - ptr += cb->nllen; - break; - } - ptr++; -#ifdef SUPPORT_UNICODE - if (utf) FORWARDCHARTEST(ptr, ptrend); -#endif - } - continue; /* Next character in pattern */ - } - } - - /* Skip over bracketed comments */ - - if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 && - ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN) - { - while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS); - if (ptr >= ptrend) - { - errorcode = ERR18; /* A special error for missing ) in a comment */ - goto FAILED; /* to make it easier to debug. */ - } - ptr++; - continue; /* Next character in pattern */ - } - - /* If the next item is not a quantifier, fill in length of any previous - callout and create an auto callout if required. */ - - if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK && - (c != CHAR_LEFT_CURLY_BRACKET || - (tempptr = ptr, - !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode)))) - { - if (after_manual_callout-- <= 0) - parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout, - parsed_pattern, cb); - } - - /* If expect_cond_assert is 2, we have just passed (?( and are expecting an - assertion, possibly preceded by a callout. If the value is 1, we have just - had the callout and expect an assertion. There must be at least 3 more - characters in all cases. When expect_cond_assert is 2, we know that the - current character is an opening parenthesis, as otherwise we wouldn't be - here. However, when it is 1, we need to check, and it's easiest just to check - always. Note that expect_cond_assert may be negative, since all callouts just - decrement it. 
*/ - - if (expect_cond_assert > 0) - { - BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 && - (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK); - if (ok) - { - if (ptr[0] == CHAR_ASTERISK) /* New alpha assertion format, possibly */ - { - ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0; - } - else switch(ptr[1]) /* Traditional symbolic format */ - { - case CHAR_C: - ok = expect_cond_assert == 2; - break; - - case CHAR_EQUALS_SIGN: - case CHAR_EXCLAMATION_MARK: - break; - - case CHAR_LESS_THAN_SIGN: - ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK; - break; - - default: - ok = FALSE; - } - } - - if (!ok) - { - ptr--; /* Adjust error offset */ - errorcode = ERR28; - goto FAILED; - } - } - - /* Remember whether we are expecting a conditional assertion, and set the - default for this item. */ - - prev_expect_cond_assert = expect_cond_assert; - expect_cond_assert = 0; - - /* Remember quantification status for the previous significant item, then set - default for this item. */ - - prev_okquantifier = okquantifier; - prev_meta_quantifier = meta_quantifier; - okquantifier = FALSE; - meta_quantifier = 0; - - /* If the previous significant item was a quantifier, adjust the parsed code - if there is a following modifier. The base meta value is always followed by - the PLUS and QUERY values, in that order. We do this here rather than after - reading a quantifier so that intervening comments and /x whitespace can be - ignored without having to replicate code. */ - - if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS)) - { - parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] = - prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)? - 0x00020000u : 0x00010000u); - continue; /* Next character in pattern */ - } - - - /* Process the next item in the main part of a pattern. 
*/ - - switch(c) - { - default: /* Non-special character */ - PARSED_LITERAL(c, parsed_pattern); - break; - - - /* ---- Escape sequence ---- */ - - case CHAR_BACKSLASH: - tempptr = ptr; - escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options, - cb->cx->extra_options, FALSE, cb); - if (errorcode != 0) - { - ESCAPE_FAILED: - if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0) - goto FAILED; - ptr = tempptr; - if (ptr >= ptrend) c = CHAR_BACKSLASH; else - { - GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ - } - escape = 0; /* Treat as literal character */ - } - - /* The escape was a data escape or literal character. */ - - if (escape == 0) - { - PARSED_LITERAL(c, parsed_pattern); - } - - /* The escape was a back (or forward) reference. We keep the offset in - order to give a more useful diagnostic for a bad forward reference. For - references to groups numbered less than 10 we can't use more than two items - in parsed_pattern because they may be just two characters in the input (and - in a 64-bit world an offset may need two elements). So for them, the offset - of the first occurrent is held in a special vector. */ - - else if (escape < 0) - { - offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1); - escape = -escape; - *parsed_pattern++ = META_BACKREF | (uint32_t)escape; - if (escape < 10) - { - if (cb->small_ref_offset[escape] == PCRE2_UNSET) - cb->small_ref_offset[escape] = offset; - } - else - { - PUTOFFSET(offset, parsed_pattern); - } - okquantifier = TRUE; - } - - /* The escape was a character class such as \d etc. or other special - escape indicator such as \A or \X. Most of them generate just a single - parsed item, but \P and \p are followed by a 16-bit type and a 16-bit - value. They are supported only when Unicode is available. The type and - value are packed into a single 32-bit value so that the whole sequences - uses only two elements in the parsed_vector. 
This is because the same - coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is - set. - - There are also some cases where the escape sequence is followed by a name: - \k{name}, \k, and \k'name' are backreferences by name, and \g - and \g'name' are subroutine calls by name; \g{name} is a synonym for - \k{name}. Note that \g and \g'number' are handled by check_escape() - and returned as a negative value (handled above). A name is coded as an - offset into the pattern and a length. */ - - else switch (escape) - { - case ESC_C: -#ifdef NEVER_BACKSLASH_C - errorcode = ERR85; - goto ESCAPE_FAILED; -#else - if ((options & PCRE2_NEVER_BACKSLASH_C) != 0) - { - errorcode = ERR83; - goto ESCAPE_FAILED; - } -#endif - okquantifier = TRUE; - *parsed_pattern++ = META_ESCAPE + escape; - break; - - case ESC_X: -#ifndef SUPPORT_UNICODE - errorcode = ERR45; /* Supported only with Unicode support */ - goto ESCAPE_FAILED; -#endif - case ESC_H: - case ESC_h: - case ESC_N: - case ESC_R: - case ESC_V: - case ESC_v: - okquantifier = TRUE; - *parsed_pattern++ = META_ESCAPE + escape; - break; - - default: /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */ - *parsed_pattern++ = META_ESCAPE + escape; - break; - - /* Escapes that change in UCP mode. Note that PCRE2_UCP will never be set - without Unicode support because it is checked when pcre2_compile() is - called. */ - - case ESC_d: - case ESC_D: - case ESC_s: - case ESC_S: - case ESC_w: - case ESC_W: - okquantifier = TRUE; - if ((options & PCRE2_UCP) == 0) - { - *parsed_pattern++ = META_ESCAPE + escape; - } - else - { - *parsed_pattern++ = META_ESCAPE + - ((escape == ESC_d || escape == ESC_s || escape == ESC_w)? 
- ESC_p : ESC_P); - switch(escape) - { - case ESC_d: - case ESC_D: - *parsed_pattern++ = (PT_PC << 16) | ucp_Nd; - break; - - case ESC_s: - case ESC_S: - *parsed_pattern++ = PT_SPACE << 16; - break; - - case ESC_w: - case ESC_W: - *parsed_pattern++ = PT_WORD << 16; - break; - } - } - break; - - /* Unicode property matching */ - - case ESC_P: - case ESC_p: -#ifdef SUPPORT_UNICODE - { - BOOL negated; - uint16_t ptype = 0, pdata = 0; - if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb)) - goto ESCAPE_FAILED; - if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P; - *parsed_pattern++ = META_ESCAPE + escape; - *parsed_pattern++ = (ptype << 16) | pdata; - okquantifier = TRUE; - } -#else - errorcode = ERR45; - goto ESCAPE_FAILED; -#endif - break; /* End \P and \p */ - - /* When \g is used with quotes or angle brackets as delimiters, it is a - numerical or named subroutine call, and control comes here. When used - with brace delimiters it is a numberical back reference and does not come - here because check_escape() returns it directly as a reference. \k is - always a named back reference. */ - - case ESC_g: - case ESC_k: - if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET && - *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE)) - { - errorcode = (escape == ESC_g)? ERR57 : ERR69; - goto ESCAPE_FAILED; - } - terminator = (*ptr == CHAR_LESS_THAN_SIGN)? - CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)? - CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET; - - /* For a non-braced \g, check for a numerical recursion. 
*/ - - if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET) - { - PCRE2_SPTR p = ptr + 1; - - if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i, - &errorcode)) - { - if (p >= ptrend || *p != terminator) - { - errorcode = ERR57; - goto ESCAPE_FAILED; - } - ptr = p; - goto SET_RECURSION; - } - if (errorcode != 0) goto ESCAPE_FAILED; - } - - /* Not a numerical recursion */ - - if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen, - &errorcode, cb)) goto ESCAPE_FAILED; - - /* \k and \g when used with braces are back references, whereas \g used - with quotes or angle brackets is a recursion */ - - *parsed_pattern++ = - (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)? - META_BACKREF_BYNAME : META_RECURSE_BYNAME; - *parsed_pattern++ = namelen; - - PUTOFFSET(offset, parsed_pattern); - okquantifier = TRUE; - break; /* End special escape processing */ - } - break; /* End escape sequence processing */ - - - /* ---- Single-character special items ---- */ - - case CHAR_CIRCUMFLEX_ACCENT: - *parsed_pattern++ = META_CIRCUMFLEX; - break; - - case CHAR_DOLLAR_SIGN: - *parsed_pattern++ = META_DOLLAR; - break; - - case CHAR_DOT: - *parsed_pattern++ = META_DOT; - okquantifier = TRUE; - break; - - - /* ---- Single-character quantifiers ---- */ - - case CHAR_ASTERISK: - meta_quantifier = META_ASTERISK; - goto CHECK_QUANTIFIER; - - case CHAR_PLUS: - meta_quantifier = META_PLUS; - goto CHECK_QUANTIFIER; - - case CHAR_QUESTION_MARK: - meta_quantifier = META_QUERY; - goto CHECK_QUANTIFIER; - - - /* ---- Potential {n,m} quantifier ---- */ - - case CHAR_LEFT_CURLY_BRACKET: - if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat, - &errorcode)) - { - if (errorcode != 0) goto FAILED; /* Error in quantifier. 
*/ - PARSED_LITERAL(c, parsed_pattern); /* Not a quantifier */ - break; /* No more quantifier processing */ - } - meta_quantifier = META_MINMAX; - /* Fall through */ - - - /* ---- Quantifier post-processing ---- */ - - /* Check that a quantifier is allowed after the previous item. */ - - CHECK_QUANTIFIER: - if (!prev_okquantifier) - { - errorcode = ERR9; - goto FAILED_BACK; - } - - /* Most (*VERB)s are not allowed to be quantified, but an ungreedy - quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a - sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by - wrapping it in non-capturing brackets, but we have to allow for a preceding - (*MARK) for when (*ACCEPT) has an argument. */ - - if (parsed_pattern[-1] == META_ACCEPT) - { - uint32_t *p; - for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0]; - *verbstartptr = META_NOCAPTURE; - parsed_pattern[1] = META_KET; - parsed_pattern += 2; - } - - /* Now we can put the quantifier into the parsed pattern vector. At this - stage, we have only the basic quantifier. The check for a following + or ? - modifier happens at the top of the loop, after any intervening comments - have been removed. */ - - *parsed_pattern++ = meta_quantifier; - if (c == CHAR_LEFT_CURLY_BRACKET) - { - *parsed_pattern++ = min_repeat; - *parsed_pattern++ = max_repeat; - } - break; - - - /* ---- Character class ---- */ - - case CHAR_LEFT_SQUARE_BRACKET: - okquantifier = TRUE; - - /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is - used for "start of word" and "end of word". As these are otherwise illegal - sequences, we don't break anything by recognizing them. They are replaced - by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are - erroneous and are handled by the normal code below. 
*/ - - if (ptrend - ptr >= 6 && - (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 || - PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0)) - { - *parsed_pattern++ = META_ESCAPE + ESC_b; - - if (ptr[2] == CHAR_LESS_THAN_SIGN) - { - *parsed_pattern++ = META_LOOKAHEAD; - } - else - { - *parsed_pattern++ = META_LOOKBEHIND; - *has_lookbehind = TRUE; - - /* The offset is used only for the "non-fixed length" error; this won't - occur here, so just store zero. */ - - PUTOFFSET((PCRE2_SIZE)0, parsed_pattern); - } - - if ((options & PCRE2_UCP) == 0) - *parsed_pattern++ = META_ESCAPE + ESC_w; - else - { - *parsed_pattern++ = META_ESCAPE + ESC_p; - *parsed_pattern++ = PT_WORD << 16; - } - *parsed_pattern++ = META_KET; - ptr += 6; - break; - } - - /* PCRE supports POSIX class stuff inside a class. Perl gives an error if - they are encountered at the top level, so we'll do that too. */ - - if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT || - *ptr == CHAR_EQUALS_SIGN) && - check_posix_syntax(ptr, ptrend, &tempptr)) - { - errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13; - goto FAILED; - } - - /* Process a regular character class. If the first character is '^', set - the negation flag. If the first few characters (either before or after ^) - are \Q\E or \E or space or tab in extended-more mode, we skip them too. - This makes for compatibility with Perl. */ - - negate_class = FALSE; - while (ptr < ptrend) - { - GETCHARINCTEST(c, ptr); - if (c == CHAR_BACKSLASH) - { - if (ptr < ptrend && *ptr == CHAR_E) ptr++; - else if (ptrend - ptr >= 3 && - PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0) - ptr += 3; - else - break; - } - else if ((options & PCRE2_EXTENDED_MORE) != 0 && - (c == CHAR_SPACE || c == CHAR_HT)) /* Note: just these two */ - continue; - else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT) - negate_class = TRUE; - else break; - } - - /* Now the real contents of the class; c has the first "real" character. 
- Empty classes are permitted only if the option is set. */ - - if (c == CHAR_RIGHT_SQUARE_BRACKET && - (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0) - { - *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY; - break; /* End of class processing */ - } - - /* Process a non-empty class. */ - - *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS; - class_range_state = RANGE_NO; - - /* In an EBCDIC environment, Perl treats alphabetic ranges specially - because there are holes in the encoding, and simply using the range A-Z - (for example) would include the characters in the holes. This applies only - to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z] - in this respect. In order to accommodate this, we keep track of whether - character values are literal or not, and a state variable for handling - ranges. */ - - /* Loop for the contents of the class */ - - for (;;) - { - BOOL char_is_literal = TRUE; - - /* Inside \Q...\E everything is literal except \E */ - - if (inescq) - { - if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E) - { - inescq = FALSE; /* Reset literal state */ - ptr++; /* Skip the 'E' */ - goto CLASS_CONTINUE; - } - goto CLASS_LITERAL; - } - - /* Skip over space and tab (only) in extended-more mode. */ - - if ((options & PCRE2_EXTENDED_MORE) != 0 && - (c == CHAR_SPACE || c == CHAR_HT)) - goto CLASS_CONTINUE; - - /* Handle POSIX class names. Perl allows a negation extension of the - form [:^name:]. A square bracket that doesn't match the syntax is - treated as a literal. We also recognize the POSIX constructions - [.ch.] and [=ch=] ("collating elements") and fault them, as Perl - 5.6 and 5.8 do. 
*/ - - if (c == CHAR_LEFT_SQUARE_BRACKET && - ptrend - ptr >= 3 && - (*ptr == CHAR_COLON || *ptr == CHAR_DOT || - *ptr == CHAR_EQUALS_SIGN) && - check_posix_syntax(ptr, ptrend, &tempptr)) - { - BOOL posix_negate = FALSE; - int posix_class; - - /* Perl treats a hyphen before a POSIX class as a literal, not the - start of a range. However, it gives a warning in its warning mode. PCRE - does not have a warning mode, so we give an error, because this is - likely an error on the user's part. */ - - if (class_range_state == RANGE_STARTED) - { - errorcode = ERR50; - goto FAILED; - } - - if (*ptr != CHAR_COLON) - { - errorcode = ERR13; - goto FAILED_BACK; - } - - if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT) - { - posix_negate = TRUE; - ptr++; - } - - posix_class = check_posix_name(ptr, (int)(tempptr - ptr)); - if (posix_class < 0) - { - errorcode = ERR30; - goto FAILED; - } - ptr = tempptr + 2; - - /* Perl treats a hyphen after a POSIX class as a literal, not the - start of a range. However, it gives a warning in its warning mode - unless the hyphen is the last character in the class. PCRE does not - have a warning mode, so we give an error, because this is likely an - error on the user's part. */ - - if (ptr < ptrend - 1 && *ptr == CHAR_MINUS && - ptr[1] != CHAR_RIGHT_SQUARE_BRACKET) - { - errorcode = ERR50; - goto FAILED; - } - - /* Set "a hyphen is not the start of a range" for the -] case, and also - in case the POSIX class is followed by \E or \Q\E (possibly repeated - - fuzzers do that kind of thing) and *then* a hyphen. This causes that - hyphen to be treated as a literal. I don't think it's worth setting up - special apparatus to do otherwise. */ - - class_range_state = RANGE_NO; - - /* When PCRE2_UCP is set, some of the POSIX classes are converted to - use Unicode properties \p or \P or, in one case, \h or \H. The - substitutes table has two values per class, containing the type and - value of a \p or \P item. 
The special cases are specified with a - negative type: a non-zero value causes \h or \H to be used, and a zero - value falls through to behave like a non-UCP POSIX class. */ - -#ifdef SUPPORT_UNICODE - if ((options & PCRE2_UCP) != 0) - { - int ptype = posix_substitutes[2*posix_class]; - int pvalue = posix_substitutes[2*posix_class + 1]; - if (ptype >= 0) - { - *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p); - *parsed_pattern++ = (ptype << 16) | pvalue; - goto CLASS_CONTINUE; - } - - if (pvalue != 0) - { - *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h); - goto CLASS_CONTINUE; - } - - /* Fall through */ - } -#endif /* SUPPORT_UNICODE */ - - /* Non-UCP POSIX class */ - - *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX; - *parsed_pattern++ = posix_class; - } - - /* Handle potential start of range */ - - else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED) - { - *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)? - META_RANGE_LITERAL : META_RANGE_ESCAPED; - class_range_state = RANGE_STARTED; - } - - /* Handle a literal character */ - - else if (c != CHAR_BACKSLASH) - { - CLASS_LITERAL: - if (class_range_state == RANGE_STARTED) - { - if (c == parsed_pattern[-2]) /* Optimize one-char range */ - parsed_pattern--; - else if (parsed_pattern[-2] > c) /* Check range is in order */ - { - errorcode = ERR8; - goto FAILED_BACK; - } - else - { - if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL) - parsed_pattern[-1] = META_RANGE_ESCAPED; - PARSED_LITERAL(c, parsed_pattern); - } - class_range_state = RANGE_NO; - } - else /* Potential start of range */ - { - class_range_state = char_is_literal? 
- RANGE_OK_LITERAL : RANGE_OK_ESCAPED; - PARSED_LITERAL(c, parsed_pattern); - } - } - - /* Handle escapes in a class */ - - else - { - tempptr = ptr; - escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options, - cb->cx->extra_options, TRUE, cb); - - if (errorcode != 0) - { - if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0) - goto FAILED; - ptr = tempptr; - if (ptr >= ptrend) c = CHAR_BACKSLASH; else - { - GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ - } - escape = 0; /* Treat as literal character */ - } - - switch(escape) - { - case 0: /* Escaped character code point is in c */ - char_is_literal = FALSE; - goto CLASS_LITERAL; - - case ESC_b: - c = CHAR_BS; /* \b is backspace in a class */ - char_is_literal = FALSE; - goto CLASS_LITERAL; - - case ESC_Q: - inescq = TRUE; /* Enter literal mode */ - goto CLASS_CONTINUE; - - case ESC_E: /* Ignore orphan \E */ - goto CLASS_CONTINUE; - - case ESC_B: /* Always an error in a class */ - case ESC_R: - case ESC_X: - errorcode = ERR7; - ptr--; - goto FAILED; - } - - /* The second part of a range can be a single-character escape - sequence (detected above), but not any of the other escapes. Perl - treats a hyphen as a literal in such circumstances. However, in Perl's - warning mode, a warning is given, so PCRE now faults it, as it is - almost certainly a mistake on the user's part. */ - - if (class_range_state == RANGE_STARTED) - { - errorcode = ERR50; - goto FAILED; /* Not CLASS_ESCAPE_FAILED; always an error */ - } - - /* Of the remaining escapes, only those that define characters are - allowed in a class. None may start a range. */ - - class_range_state = RANGE_NO; - switch(escape) - { - case ESC_N: - errorcode = ERR71; - goto FAILED; - - case ESC_H: - case ESC_h: - case ESC_V: - case ESC_v: - *parsed_pattern++ = META_ESCAPE + escape; - break; - - /* These escapes are converted to Unicode property tests when - PCRE2_UCP is set. 
*/ - - case ESC_d: - case ESC_D: - case ESC_s: - case ESC_S: - case ESC_w: - case ESC_W: - if ((options & PCRE2_UCP) == 0) - { - *parsed_pattern++ = META_ESCAPE + escape; - } - else - { - *parsed_pattern++ = META_ESCAPE + - ((escape == ESC_d || escape == ESC_s || escape == ESC_w)? - ESC_p : ESC_P); - switch(escape) - { - case ESC_d: - case ESC_D: - *parsed_pattern++ = (PT_PC << 16) | ucp_Nd; - break; - - case ESC_s: - case ESC_S: - *parsed_pattern++ = PT_SPACE << 16; - break; - - case ESC_w: - case ESC_W: - *parsed_pattern++ = PT_WORD << 16; - break; - } - } - break; - - /* Explicit Unicode property matching */ - - case ESC_P: - case ESC_p: -#ifdef SUPPORT_UNICODE - { - BOOL negated; - uint16_t ptype = 0, pdata = 0; - if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb)) - goto FAILED; - if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P; - *parsed_pattern++ = META_ESCAPE + escape; - *parsed_pattern++ = (ptype << 16) | pdata; - } -#else - errorcode = ERR45; - goto FAILED; -#endif - break; /* End \P and \p */ - - default: /* All others are not allowed in a class */ - errorcode = ERR7; - ptr--; - goto FAILED; - } - - /* Perl gives a warning unless a following hyphen is the last character - in the class. PCRE throws an error. */ - - if (ptr < ptrend - 1 && *ptr == CHAR_MINUS && - ptr[1] != CHAR_RIGHT_SQUARE_BRACKET) - { - errorcode = ERR50; - goto FAILED; - } - } - - /* Proceed to next thing in the class. 
*/ - - CLASS_CONTINUE: - if (ptr >= ptrend) - { - errorcode = ERR6; /* Missing terminating ']' */ - goto FAILED; - } - GETCHARINCTEST(c, ptr); - if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break; - } /* End of class-processing loop */ - - /* -] at the end of a class is a literal '-' */ - - if (class_range_state == RANGE_STARTED) - { - parsed_pattern[-1] = CHAR_MINUS; - class_range_state = RANGE_NO; - } - - *parsed_pattern++ = META_CLASS_END; - break; /* End of character class */ - - - /* ---- Opening parenthesis ---- */ - - case CHAR_LEFT_PARENTHESIS: - if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS; - - /* If ( is not followed by ? it is either a capture or a special verb or an - alpha assertion or a positive non-atomic lookahead. */ - - if (*ptr != CHAR_QUESTION_MARK) - { - const char *vn; - - /* Handle capturing brackets (or non-capturing if auto-capture is turned - off). */ - - if (*ptr != CHAR_ASTERISK) - { - nest_depth++; - if ((options & PCRE2_NO_AUTO_CAPTURE) == 0) - { - if (cb->bracount >= MAX_GROUP_NUMBER) - { - errorcode = ERR97; - goto FAILED; - } - cb->bracount++; - *parsed_pattern++ = META_CAPTURE | cb->bracount; - } - else *parsed_pattern++ = META_NOCAPTURE; - } - - /* Do nothing for (* followed by end of pattern or ) so it gives a "bad - quantifier" error rather than "(*MARK) must have an argument". */ - - else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS) - break; - - /* Handle "alpha assertions" such as (*pla:...). Most of these are - synonyms for the historical symbolic assertions, but the script run and - non-atomic lookaround ones are new. They are distinguished by starting - with a lower case letter. Checking both ends of the alphabet makes this - work in all character codes. 
*/ - - else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0) - { - uint32_t meta; - - vn = alasnames; - if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen, - &errorcode, cb)) goto FAILED; - if (ptr >= ptrend || *ptr != CHAR_COLON) - { - errorcode = ERR95; /* Malformed */ - goto FAILED; - } - - /* Scan the table of alpha assertion names */ - - for (i = 0; i < alascount; i++) - { - if (namelen == alasmeta[i].len && - PRIV(strncmp_c8)(name, vn, namelen) == 0) - break; - vn += alasmeta[i].len + 1; - } - - if (i >= alascount) - { - errorcode = ERR95; /* Alpha assertion not recognized */ - goto FAILED; - } - - /* Check for expecting an assertion condition. If so, only atomic - lookaround assertions are valid. */ - - meta = alasmeta[i].meta; - if (prev_expect_cond_assert > 0 && - (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT)) - { - errorcode = (meta == META_LOOKAHEAD_NA || meta == META_LOOKBEHIND_NA)? - ERR98 : ERR28; /* (Atomic) assertion expected */ - goto FAILED; - } - - /* The lookaround alphabetic synonyms can mostly be handled by jumping - to the code that handles the traditional symbolic forms. */ - - switch(meta) - { - default: - errorcode = ERR89; /* Unknown code; should never occur because */ - goto FAILED; /* the meta values come from a table above. */ - - case META_ATOMIC: - goto ATOMIC_GROUP; - - case META_LOOKAHEAD: - goto POSITIVE_LOOK_AHEAD; - - case META_LOOKAHEAD_NA: - goto POSITIVE_NONATOMIC_LOOK_AHEAD; - - case META_LOOKAHEADNOT: - goto NEGATIVE_LOOK_AHEAD; - - case META_LOOKBEHIND: - case META_LOOKBEHINDNOT: - case META_LOOKBEHIND_NA: - *parsed_pattern++ = meta; - ptr--; - goto POST_LOOKBEHIND; - - /* The script run facilities are handled here. Unicode support is - required (give an error if not, as this is a security issue). Always - record a META_SCRIPT_RUN item. Then, for the atomic version, insert - META_ATOMIC and remember that we need two META_KETs at the end. 
*/ - - case META_SCRIPT_RUN: - case META_ATOMIC_SCRIPT_RUN: -#ifdef SUPPORT_UNICODE - *parsed_pattern++ = META_SCRIPT_RUN; - nest_depth++; - ptr++; - if (meta == META_ATOMIC_SCRIPT_RUN) - { - *parsed_pattern++ = META_ATOMIC; - if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace); - else if (++top_nest >= end_nests) - { - errorcode = ERR84; - goto FAILED; - } - top_nest->nest_depth = nest_depth; - top_nest->flags = NSF_ATOMICSR; - top_nest->options = options & PARSE_TRACKED_OPTIONS; - } - break; -#else /* SUPPORT_UNICODE */ - errorcode = ERR96; - goto FAILED; -#endif - } - } - - - /* ---- Handle (*VERB) and (*VERB:NAME) ---- */ - - else - { - vn = verbnames; - if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen, - &errorcode, cb)) goto FAILED; - if (ptr >= ptrend || (*ptr != CHAR_COLON && - *ptr != CHAR_RIGHT_PARENTHESIS)) - { - errorcode = ERR60; /* Malformed */ - goto FAILED; - } - - /* Scan the table of verb names */ - - for (i = 0; i < verbcount; i++) - { - if (namelen == verbs[i].len && - PRIV(strncmp_c8)(name, vn, namelen) == 0) - break; - vn += verbs[i].len + 1; - } - - if (i >= verbcount) - { - errorcode = ERR60; /* Verb not recognized */ - goto FAILED; - } - - /* An empty argument is treated as no argument. */ - - if (*ptr == CHAR_COLON && ptr + 1 < ptrend && - ptr[1] == CHAR_RIGHT_PARENTHESIS) - ptr++; /* Advance to the closing parens */ - - /* Check for mandatory non-empty argument; this is (*MARK) */ - - if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON) - { - errorcode = ERR66; - goto FAILED; - } - - /* Remember where this verb, possibly with a preceding (*MARK), starts, - for handling quantified (*ACCEPT). */ - - verbstartptr = parsed_pattern; - okquantifier = (verbs[i].meta == META_ACCEPT); - - /* It appears that Perl allows any characters whatsoever, other than a - closing parenthesis, to appear in arguments ("names"), so we no longer - insist on letters, digits, and underscores. 
Perl does not, however, do - any interpretation within arguments, and has no means of including a - closing parenthesis. PCRE supports escape processing but only when it - is requested by an option. We set inverbname TRUE here, and let the - main loop take care of this so that escape and \x processing is done by - the main code above. */ - - if (*ptr++ == CHAR_COLON) /* Skip past : or ) */ - { - /* Some optional arguments can be treated as a preceding (*MARK) */ - - if (verbs[i].has_arg < 0) - { - add_after_mark = verbs[i].meta; - *parsed_pattern++ = META_MARK; - } - - /* The remaining verbs with arguments (except *MARK) need a different - opcode. */ - - else - { - *parsed_pattern++ = verbs[i].meta + - ((verbs[i].meta != META_MARK)? 0x00010000u:0); - } - - /* Set up for reading the name in the main loop. */ - - verblengthptr = parsed_pattern++; - verbnamestart = ptr; - inverbname = TRUE; - } - else /* No verb "name" argument */ - { - *parsed_pattern++ = verbs[i].meta; - } - } /* End of (*VERB) handling */ - break; /* Done with this parenthesis */ - } /* End of groups that don't start with (? */ - - - /* ---- Items starting (? ---- */ - - /* The type of item is determined by what follows (?. Handle (?| and option - changes under "default" because both need a new block on the nest stack. - Comments starting with (?# are handled above. Note that there is some - ambiguity about the sequence (?- because if a digit follows it's a relative - recursion or subroutine call whereas otherwise it's an option unsetting. */ - - if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS; - - switch(*ptr) - { - default: - if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1])) - goto RECURSION_BYNUMBER; /* The + case is handled by CHAR_PLUS */ - - /* We now have either (?| or a (possibly empty) option setting, - optionally followed by a non-capturing group. 
*/ - - nest_depth++; - if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace); - else if (++top_nest >= end_nests) - { - errorcode = ERR84; - goto FAILED; - } - top_nest->nest_depth = nest_depth; - top_nest->flags = 0; - top_nest->options = options & PARSE_TRACKED_OPTIONS; - - /* Start of non-capturing group that resets the capture count for each - branch. */ - - if (*ptr == CHAR_VERTICAL_LINE) - { - top_nest->reset_group = (uint16_t)cb->bracount; - top_nest->max_group = (uint16_t)cb->bracount; - top_nest->flags |= NSF_RESET; - cb->external_flags |= PCRE2_DUPCAPUSED; - *parsed_pattern++ = META_NOCAPTURE; - ptr++; - } - - /* Scan for options imnsxJU to be set or unset. */ - - else - { - BOOL hyphenok = TRUE; - uint32_t oldoptions = options; - - top_nest->reset_group = 0; - top_nest->max_group = 0; - set = unset = 0; - optset = &set; - - /* ^ at the start unsets imnsx and disables the subsequent use of - */ - - if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT) - { - options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| - PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE); - hyphenok = FALSE; - ptr++; - } - - while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS && - *ptr != CHAR_COLON) - { - switch (*ptr++) - { - case CHAR_MINUS: - if (!hyphenok) - { - errorcode = ERR94; - ptr--; /* Correct the offset */ - goto FAILED; - } - optset = &unset; - hyphenok = FALSE; - break; - - case CHAR_J: /* Record that it changed in the external options */ - *optset |= PCRE2_DUPNAMES; - cb->external_flags |= PCRE2_JCHANGED; - break; - - case CHAR_i: *optset |= PCRE2_CASELESS; break; - case CHAR_m: *optset |= PCRE2_MULTILINE; break; - case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break; - case CHAR_s: *optset |= PCRE2_DOTALL; break; - case CHAR_U: *optset |= PCRE2_UNGREEDY; break; - - /* If x appears twice it sets the extended extended option. 
*/ - - case CHAR_x: - *optset |= PCRE2_EXTENDED; - if (ptr < ptrend && *ptr == CHAR_x) - { - *optset |= PCRE2_EXTENDED_MORE; - ptr++; - } - break; - - default: - errorcode = ERR11; - ptr--; /* Correct the offset */ - goto FAILED; - } - } - - /* If we are setting extended without extended-more, ensure that any - existing extended-more gets unset. Also, unsetting extended must also - unset extended-more. */ - - if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED || - (unset & PCRE2_EXTENDED) != 0) - unset |= PCRE2_EXTENDED_MORE; - - options = (options | set) & (~unset); - - /* If the options ended with ')' this is not the start of a nested - group with option changes, so the options change at this level. - In this case, if the previous level set up a nest block, discard the - one we have just created. Otherwise adjust it for the previous level. - If the options ended with ':' we are starting a non-capturing group, - possibly with an options setting. */ - - if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS; - if (*ptr++ == CHAR_RIGHT_PARENTHESIS) - { - nest_depth--; /* This is not a nested group after all. */ - if (top_nest > (nest_save *)(cb->start_workspace) && - (top_nest-1)->nest_depth == nest_depth) top_nest--; - else top_nest->nest_depth = nest_depth; - } - else *parsed_pattern++ = META_NOCAPTURE; - - /* If nothing changed, no need to record. */ - - if (options != oldoptions) - { - *parsed_pattern++ = META_OPTIONS; - *parsed_pattern++ = options; - } - } /* End options processing */ - break; /* End default case after (? */ - - - /* ---- Python syntax support ---- */ - - case CHAR_P: - if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS; - - /* (?P is the same as (?, which defines a named group. */ - - if (*ptr == CHAR_LESS_THAN_SIGN) - { - terminator = CHAR_GREATER_THAN_SIGN; - goto DEFINE_NAME; - } - - /* (?P>name) is the same as (?&name), which is a recursion or subroutine - call. 
*/ - - if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME; - - /* (?P=name) is the same as \k, a back reference by name. Anything - else after (?P is an error. */ - - if (*ptr != CHAR_EQUALS_SIGN) - { - errorcode = ERR41; - goto FAILED; - } - if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name, - &namelen, &errorcode, cb)) goto FAILED; - *parsed_pattern++ = META_BACKREF_BYNAME; - *parsed_pattern++ = namelen; - PUTOFFSET(offset, parsed_pattern); - okquantifier = TRUE; - break; /* End of (?P processing */ - - - /* ---- Recursion/subroutine calls by number ---- */ - - case CHAR_R: - i = 0; /* (?R) == (?R0) */ - ptr++; - if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) - { - errorcode = ERR58; - goto FAILED; - } - goto SET_RECURSION; - - /* An item starting (?- followed by a digit comes here via the "default" - case because (?- followed by a non-digit is an options setting. */ - - case CHAR_PLUS: - if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1])) - { - errorcode = ERR29; /* Missing number */ - goto FAILED; - } - /* Fall through */ - - case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: - case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: - RECURSION_BYNUMBER: - if (!read_number(&ptr, ptrend, - (IS_DIGIT(*ptr))? 
-1:(int)(cb->bracount), /* + and - are relative */ - MAX_GROUP_NUMBER, ERR61, - &i, &errorcode)) goto FAILED; - if (i < 0) /* NB (?0) is permitted */ - { - errorcode = ERR15; /* Unknown group */ - goto FAILED_BACK; - } - if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) - goto UNCLOSED_PARENTHESIS; - - SET_RECURSION: - *parsed_pattern++ = META_RECURSE | (uint32_t)i; - offset = (PCRE2_SIZE)(ptr - cb->start_pattern); - ptr++; - PUTOFFSET(offset, parsed_pattern); - okquantifier = TRUE; - break; /* End of recursive call by number handling */ - - - /* ---- Recursion/subroutine calls by name ---- */ - - case CHAR_AMPERSAND: - RECURSE_BY_NAME: - if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name, - &namelen, &errorcode, cb)) goto FAILED; - *parsed_pattern++ = META_RECURSE_BYNAME; - *parsed_pattern++ = namelen; - PUTOFFSET(offset, parsed_pattern); - okquantifier = TRUE; - break; - - /* ---- Callout with numerical or string argument ---- */ - - case CHAR_C: - if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS; - - /* If the previous item was a condition starting (?(? an assertion, - optionally preceded by a callout, is expected. This is checked later on, - during actual compilation. However we need to identify this kind of - assertion in this pass because it must not be qualified. The value of - expect_cond_assert is set to 2 after (?(? is processed. We decrement it - for a callout - still leaving a positive value that identifies the - assertion. Multiple callouts or any other items will make it zero or - less, which doesn't matter because they will cause an error later. */ - - expect_cond_assert = prev_expect_cond_assert - 1; - - /* If previous_callout is not NULL, it means this follows a previous - callout. If it was a manual callout, do nothing; this means its "length - of next pattern item" field will remain zero. If it was an automatic - callout, abolish it. 
*/ - - if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 && - previous_callout == parsed_pattern - 4 && - parsed_pattern[-1] == 255) - parsed_pattern = previous_callout; - - /* Save for updating next pattern item length, and skip one item before - completing. */ - - previous_callout = parsed_pattern; - after_manual_callout = 1; - - /* Handle a string argument; specific delimiter is required. */ - - if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr)) - { - PCRE2_SIZE calloutlength; - PCRE2_SPTR startptr = ptr; - - delimiter = 0; - for (i = 0; PRIV(callout_start_delims)[i] != 0; i++) - { - if (*ptr == PRIV(callout_start_delims)[i]) - { - delimiter = PRIV(callout_end_delims)[i]; - break; - } - } - if (delimiter == 0) - { - errorcode = ERR82; - goto FAILED; - } - - *parsed_pattern = META_CALLOUT_STRING; - parsed_pattern += 3; /* Skip pattern info */ - - for (;;) - { - if (++ptr >= ptrend) - { - errorcode = ERR81; - ptr = startptr; /* To give a more useful message */ - goto FAILED; - } - if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter)) - break; - } - - calloutlength = (PCRE2_SIZE)(ptr - startptr); - if (calloutlength > UINT32_MAX) - { - errorcode = ERR72; - goto FAILED; - } - *parsed_pattern++ = (uint32_t)calloutlength; - offset = (PCRE2_SIZE)(startptr - cb->start_pattern); - PUTOFFSET(offset, parsed_pattern); - } - - /* Handle a callout with an optional numerical argument, which must be - less than or equal to 255. A missing argument gives 0. 
*/ - - else - { - int n = 0; - *parsed_pattern = META_CALLOUT_NUMBER; /* Numerical callout */ - parsed_pattern += 3; /* Skip pattern info */ - while (ptr < ptrend && IS_DIGIT(*ptr)) - { - n = n * 10 + *ptr++ - CHAR_0; - if (n > 255) - { - errorcode = ERR38; - goto FAILED; - } - } - *parsed_pattern++ = n; - } - - /* Both formats must have a closing parenthesis */ - - if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) - { - errorcode = ERR39; - goto FAILED; - } - ptr++; - - /* Remember the offset to the next item in the pattern, and set a default - length. This should get updated after the next item is read. */ - - previous_callout[1] = (uint32_t)(ptr - cb->start_pattern); - previous_callout[2] = 0; - break; /* End callout */ - - - /* ---- Conditional group ---- */ - - /* A condition can be an assertion, a number (referring to a numbered - group's having been set), a name (referring to a named group), or 'R', - referring to overall recursion. R and R&name are also permitted - for recursion state tests. Numbers may be preceded by + or - to specify a - relative group number. - - There are several syntaxes for testing a named group: (?(name)) is used - by Python; Perl 5.10 onwards uses (?() or (?('name')). - - There are two unfortunate ambiguities. 'R' can be the recursive thing or - the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be - the Perl DEFINE feature or the Python named test. We look for a name - first; if not found, we try the other case. - - For compatibility with auto-callouts, we allow a callout to be specified - before a condition that is an assertion. */ - - case CHAR_LEFT_PARENTHESIS: - if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS; - nest_depth++; - - /* If the next character is ? or * there must be an assertion next - (optionally preceded by a callout). We do not check this here, but - instead we set expect_cond_assert to 2. 
If this is still greater than - zero (callouts decrement it) when the next assertion is read, it will be - marked as a condition that must not be repeated. A value greater than - zero also causes checking that an assertion (possibly with callout) - follows. */ - - if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK) - { - *parsed_pattern++ = META_COND_ASSERT; - ptr--; /* Pull pointer back to the opening parenthesis. */ - expect_cond_assert = 2; - break; /* End of conditional */ - } - - /* Handle (?([+-]number)... */ - - if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i, - &errorcode)) - { - if (i <= 0) - { - errorcode = ERR15; - goto FAILED; - } - *parsed_pattern++ = META_COND_NUMBER; - offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2); - PUTOFFSET(offset, parsed_pattern); - *parsed_pattern++ = i; - } - else if (errorcode != 0) goto FAILED; /* Number too big */ - - /* No number found. Handle the special case (?(VERSION[>]=n.m)... */ - - else if (ptrend - ptr >= 10 && - PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 && - ptr[7] != CHAR_RIGHT_PARENTHESIS) - { - uint32_t ge = 0; - int major = 0; - int minor = 0; - - ptr += 7; - if (*ptr == CHAR_GREATER_THAN_SIGN) - { - ge = 1; - ptr++; - } - - /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT - references its argument twice. 
*/ - - if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr))) - goto BAD_VERSION_CONDITION; - - if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode)) - goto FAILED; - - if (ptr >= ptrend) goto BAD_VERSION_CONDITION; - if (*ptr == CHAR_DOT) - { - if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION; - minor = (*ptr++ - CHAR_0) * 10; - if (ptr >= ptrend) goto BAD_VERSION_CONDITION; - if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0; - if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) - goto BAD_VERSION_CONDITION; - } - - *parsed_pattern++ = META_COND_VERSION; - *parsed_pattern++ = ge; - *parsed_pattern++ = major; - *parsed_pattern++ = minor; - } - - /* All the remaining cases now require us to read a name. We cannot at - this stage distinguish ambiguous cases such as (?(R12) which might be a - recursion test by number or a name, because the named groups have not yet - all been identified. Those cases are treated as names, but given a - different META code. */ - - else - { - BOOL was_r_ampersand = FALSE; - - if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND) - { - terminator = CHAR_RIGHT_PARENTHESIS; - was_r_ampersand = TRUE; - ptr++; - } - else if (*ptr == CHAR_LESS_THAN_SIGN) - terminator = CHAR_GREATER_THAN_SIGN; - else if (*ptr == CHAR_APOSTROPHE) - terminator = CHAR_APOSTROPHE; - else - { - terminator = CHAR_RIGHT_PARENTHESIS; - ptr--; /* Point to char before name */ - } - if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen, - &errorcode, cb)) goto FAILED; - - /* Handle (?(R&name) */ - - if (was_r_ampersand) - { - *parsed_pattern = META_COND_RNAME; - ptr--; /* Back to closing parens */ - } - - /* Handle (?(name). If the name is "DEFINE" we identify it with a - special code. Likewise if the name consists of R followed only by - digits. Otherwise, handle it like a quoted name. 
*/ - - else if (terminator == CHAR_RIGHT_PARENTHESIS) - { - if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0) - *parsed_pattern = META_COND_DEFINE; - else - { - for (i = 1; i < (int)namelen; i++) - if (!IS_DIGIT(name[i])) break; - *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)? - META_COND_RNUMBER : META_COND_NAME; - } - ptr--; /* Back to closing parens */ - } - - /* Handle (?('name') or (?() */ - - else *parsed_pattern = META_COND_NAME; - - /* All these cases except DEFINE end with the name length and offset; - DEFINE just has an offset (for the "too many branches" error). */ - - if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen; - PUTOFFSET(offset, parsed_pattern); - } /* End cases that read a name */ - - /* Check the closing parenthesis of the condition */ - - if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) - { - errorcode = ERR24; - goto FAILED; - } - ptr++; - break; /* End of condition processing */ - - - /* ---- Atomic group ---- */ - - case CHAR_GREATER_THAN_SIGN: - ATOMIC_GROUP: /* Come from (*atomic: */ - *parsed_pattern++ = META_ATOMIC; - nest_depth++; - ptr++; - break; - - - /* ---- Lookahead assertions ---- */ - - case CHAR_EQUALS_SIGN: - POSITIVE_LOOK_AHEAD: /* Come from (*pla: */ - *parsed_pattern++ = META_LOOKAHEAD; - ptr++; - goto POST_ASSERTION; - - case CHAR_ASTERISK: - POSITIVE_NONATOMIC_LOOK_AHEAD: /* Come from (?* */ - *parsed_pattern++ = META_LOOKAHEAD_NA; - ptr++; - goto POST_ASSERTION; - - case CHAR_EXCLAMATION_MARK: - NEGATIVE_LOOK_AHEAD: /* Come from (*nla: */ - *parsed_pattern++ = META_LOOKAHEADNOT; - ptr++; - goto POST_ASSERTION; - - - /* ---- Lookbehind assertions ---- */ - - /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?< - is the start of the name of a capturing group. 
*/ - - case CHAR_LESS_THAN_SIGN: - if (ptrend - ptr <= 1 || - (ptr[1] != CHAR_EQUALS_SIGN && - ptr[1] != CHAR_EXCLAMATION_MARK && - ptr[1] != CHAR_ASTERISK)) - { - terminator = CHAR_GREATER_THAN_SIGN; - goto DEFINE_NAME; - } - *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)? - META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)? - META_LOOKBEHINDNOT : META_LOOKBEHIND_NA; - - POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */ - *has_lookbehind = TRUE; - offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2); - PUTOFFSET(offset, parsed_pattern); - ptr += 2; - /* Fall through */ - - /* If the previous item was a condition starting (?(? an assertion, - optionally preceded by a callout, is expected. This is checked later on, - during actual compilation. However we need to identify this kind of - assertion in this pass because it must not be qualified. The value of - expect_cond_assert is set to 2 after (?(? is processed. We decrement it - for a callout - still leaving a positive value that identifies the - assertion. Multiple callouts or any other items will make it zero or - less, which doesn't matter because they will cause an error later. */ - - POST_ASSERTION: - nest_depth++; - if (prev_expect_cond_assert > 0) - { - if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace); - else if (++top_nest >= end_nests) - { - errorcode = ERR84; - goto FAILED; - } - top_nest->nest_depth = nest_depth; - top_nest->flags = NSF_CONDASSERT; - top_nest->options = options & PARSE_TRACKED_OPTIONS; - } - break; - - - /* ---- Define a named group ---- */ - - /* A named group may be defined as (?'name') or (?). In the latter - case we jump to DEFINE_NAME from the disambiguation of (?< above with the - terminator set to '>'. */ - - case CHAR_APOSTROPHE: - terminator = CHAR_APOSTROPHE; /* Terminator */ - - DEFINE_NAME: - if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen, - &errorcode, cb)) goto FAILED; - - /* We have a name for this capturing group. 
It is also assigned a number, - which is its primary means of identification. */ - - if (cb->bracount >= MAX_GROUP_NUMBER) - { - errorcode = ERR97; - goto FAILED; - } - cb->bracount++; - *parsed_pattern++ = META_CAPTURE | cb->bracount; - nest_depth++; - - /* Check not too many names */ - - if (cb->names_found >= MAX_NAME_COUNT) - { - errorcode = ERR49; - goto FAILED; - } - - /* Adjust the entry size to accommodate the longest name found. */ - - if (namelen + IMM2_SIZE + 1 > cb->name_entry_size) - cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1); - - /* Scan the list to check for duplicates. For duplicate names, if the - number is the same, break the loop, which causes the name to be - discarded; otherwise, if DUPNAMES is not set, give an error. - If it is set, allow the name with a different number, but continue - scanning in case this is a duplicate with the same number. For - non-duplicate names, give an error if the number is duplicated. */ - - isdupname = FALSE; - ng = cb->named_groups; - for (i = 0; i < cb->names_found; i++, ng++) - { - if (namelen == ng->length && - PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0) - { - if (ng->number == cb->bracount) break; - if ((options & PCRE2_DUPNAMES) == 0) - { - errorcode = ERR43; - goto FAILED; - } - isdupname = ng->isdup = TRUE; /* Mark as a duplicate */ - cb->dupnames = TRUE; /* Duplicate names exist */ - } - else if (ng->number == cb->bracount) - { - errorcode = ERR65; - goto FAILED; - } - } - - if (i < cb->names_found) break; /* Ignore duplicate with same number */ - - /* Increase the list size if necessary */ - - if (cb->names_found >= cb->named_group_list_size) - { - uint32_t newsize = cb->named_group_list_size * 2; - named_group *newspace = - cb->cx->memctl.malloc(newsize * sizeof(named_group), - cb->cx->memctl.memory_data); - if (newspace == NULL) - { - errorcode = ERR21; - goto FAILED; - } - - memcpy(newspace, cb->named_groups, - cb->named_group_list_size * sizeof(named_group)); - if 
(cb->named_group_list_size > NAMED_GROUP_LIST_SIZE) - cb->cx->memctl.free((void *)cb->named_groups, - cb->cx->memctl.memory_data); - cb->named_groups = newspace; - cb->named_group_list_size = newsize; - } - - /* Add this name to the list */ - - cb->named_groups[cb->names_found].name = name; - cb->named_groups[cb->names_found].length = (uint16_t)namelen; - cb->named_groups[cb->names_found].number = cb->bracount; - cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname; - cb->names_found++; - break; - } /* End of (? switch */ - break; /* End of ( handling */ - - - /* ---- Branch terminators ---- */ - - /* Alternation: reset the capture count if we are in a (?| group. */ - - case CHAR_VERTICAL_LINE: - if (top_nest != NULL && top_nest->nest_depth == nest_depth && - (top_nest->flags & NSF_RESET) != 0) - { - if (cb->bracount > top_nest->max_group) - top_nest->max_group = (uint16_t)cb->bracount; - cb->bracount = top_nest->reset_group; - } - *parsed_pattern++ = META_ALT; - break; - - /* End of group; reset the capture count to the maximum if we are in a (?| - group and/or reset the options that are tracked during parsing. Disallow - quantifier for a condition that is an assertion. 
*/ - - case CHAR_RIGHT_PARENTHESIS: - okquantifier = TRUE; - if (top_nest != NULL && top_nest->nest_depth == nest_depth) - { - options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options; - if ((top_nest->flags & NSF_RESET) != 0 && - top_nest->max_group > cb->bracount) - cb->bracount = top_nest->max_group; - if ((top_nest->flags & NSF_CONDASSERT) != 0) - okquantifier = FALSE; - - if ((top_nest->flags & NSF_ATOMICSR) != 0) - { - *parsed_pattern++ = META_KET; - } - - if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL; - else top_nest--; - } - if (nest_depth == 0) /* Unmatched closing parenthesis */ - { - errorcode = ERR22; - goto FAILED_BACK; - } - nest_depth--; - *parsed_pattern++ = META_KET; - break; - } /* End of switch on pattern character */ - } /* End of main character scan loop */ - -/* End of pattern reached. Check for missing ) at the end of a verb name. */ - -if (inverbname && ptr >= ptrend) - { - errorcode = ERR60; - goto FAILED; - } - -/* Manage callout for the final item */ - -PARSED_END: -parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout, - parsed_pattern, cb); - -/* Insert trailing items for word and line matching (features provided for the -benefit of pcre2grep). */ - -if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0) - { - *parsed_pattern++ = META_KET; - *parsed_pattern++ = META_DOLLAR; - } -else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0) - { - *parsed_pattern++ = META_KET; - *parsed_pattern++ = META_ESCAPE + ESC_b; - } - -/* Terminate the parsed pattern, then return success if all groups are closed. -Otherwise we have unclosed parentheses. */ - -if (parsed_pattern >= parsed_pattern_end) - { - errorcode = ERR63; /* Internal error (parsed pattern overflow) */ - goto FAILED; - } - -*parsed_pattern = META_END; -if (nest_depth == 0) return 0; - -UNCLOSED_PARENTHESIS: -errorcode = ERR14; - -/* Come here for all failures. 
*/ - -FAILED: -cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern); -return errorcode; - -/* Some errors need to indicate the previous character. */ - -FAILED_BACK: -ptr--; -goto FAILED; - -/* This failure happens several times. */ - -BAD_VERSION_CONDITION: -errorcode = ERR79; -goto FAILED; -} - - - -/************************************************* -* Find first significant opcode * -*************************************************/ - -/* This is called by several functions that scan a compiled expression looking -for a fixed first character, or an anchoring opcode etc. It skips over things -that do not influence this. For some calls, it makes sense to skip negative -forward and all backward assertions, and also the \b assertion; for others it -does not. - -Arguments: - code pointer to the start of the group - skipassert TRUE if certain assertions are to be skipped - -Returns: pointer to the first significant opcode -*/ - -static const PCRE2_UCHAR* -first_significant_code(PCRE2_SPTR code, BOOL skipassert) -{ -for (;;) - { - switch ((int)*code) - { - case OP_ASSERT_NOT: - case OP_ASSERTBACK: - case OP_ASSERTBACK_NOT: - case OP_ASSERTBACK_NA: - if (!skipassert) return code; - do code += GET(code, 1); while (*code == OP_ALT); - code += PRIV(OP_lengths)[*code]; - break; - - case OP_WORD_BOUNDARY: - case OP_NOT_WORD_BOUNDARY: - if (!skipassert) return code; - /* Fall through */ - - case OP_CALLOUT: - case OP_CREF: - case OP_DNCREF: - case OP_RREF: - case OP_DNRREF: - case OP_FALSE: - case OP_TRUE: - code += PRIV(OP_lengths)[*code]; - break; - - case OP_CALLOUT_STR: - code += GET(code, 1 + 2*LINK_SIZE); - break; - - case OP_SKIPZERO: - code += 2 + GET(code, 2) + LINK_SIZE; - break; - - case OP_COND: - case OP_SCOND: - if (code[1+LINK_SIZE] != OP_FALSE || /* Not DEFINE */ - code[GET(code, 1)] != OP_KET) /* More than one branch */ - return code; - code += GET(code, 1) + 1 + LINK_SIZE; - break; - - case OP_MARK: - case OP_COMMIT_ARG: - case OP_PRUNE_ARG: - case 
OP_SKIP_ARG: - case OP_THEN_ARG: - code += code[1] + PRIV(OP_lengths)[*code]; - break; - - default: - return code; - } - } -/* Control never reaches here */ -} - - - -#ifdef SUPPORT_UNICODE -/************************************************* -* Get othercase range * -*************************************************/ - -/* This function is passed the start and end of a class range in UCP mode. It -searches up the characters, looking for ranges of characters in the "other" -case. Each call returns the next one, updating the start address. A character -with multiple other cases is returned on its own with a special return value. - -Arguments: - cptr points to starting character value; updated - d end value - ocptr where to put start of othercase range - odptr where to put end of othercase range - -Yield: -1 when no more - 0 when a range is returned - >0 the CASESET offset for char with multiple other cases - in this case, ocptr contains the original -*/ - -static int -get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr, - uint32_t *odptr) -{ -uint32_t c, othercase, next; -unsigned int co; - -/* Find the first character that has an other case. If it has multiple other -cases, return its case offset value. */ - -for (c = *cptr; c <= d; c++) - { - if ((co = UCD_CASESET(c)) != 0) - { - *ocptr = c++; /* Character that has the set */ - *cptr = c; /* Rest of input range */ - return (int)co; - } - if ((othercase = UCD_OTHERCASE(c)) != c) break; - } - -if (c > d) return -1; /* Reached end of range */ - -/* Found a character that has a single other case. Search for the end of the -range, which is either the end of the input range, or a character that has zero -or more than one other cases. 
*/ - -*ocptr = othercase; -next = othercase + 1; - -for (++c; c <= d; c++) - { - if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break; - next++; - } - -*odptr = next - 1; /* End of othercase range */ -*cptr = c; /* Rest of input range */ -return 0; -} -#endif /* SUPPORT_UNICODE */ - - - -/************************************************* -* Add a character or range to a class (internal) * -*************************************************/ - -/* This function packages up the logic of adding a character or range of -characters to a class. The character values in the arguments will be within the -valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is -called only from within the "add to class" group of functions, some of which -are recursive and mutually recursive. The external entry point is -add_to_class(). - -Arguments: - classbits the bit map for characters < 256 - uchardptr points to the pointer for extra data - options the options word - cb compile data - start start of range character - end end of range character - -Returns: the number of < 256 characters added - the pointer to extra data is updated -*/ - -static unsigned int -add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr, - uint32_t options, compile_block *cb, uint32_t start, uint32_t end) -{ -uint32_t c; -uint32_t classbits_end = (end <= 0xff ? end : 0xff); -unsigned int n8 = 0; - -/* If caseless matching is required, scan the range and process alternate -cases. In Unicode, there are 8-bit characters that have alternate cases that -are greater than 255 and vice-versa. Sometimes we can just extend the original -range. 
*/ - -if ((options & PCRE2_CASELESS) != 0) - { -#ifdef SUPPORT_UNICODE - if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0) - { - int rc; - uint32_t oc, od; - - options &= ~PCRE2_CASELESS; /* Remove for recursive calls */ - c = start; - - while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0) - { - /* Handle a single character that has more than one other case. */ - - if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr, options, cb, - PRIV(ucd_caseless_sets) + rc, oc); - - /* Do nothing if the other case range is within the original range. */ - - else if (oc >= cb->class_range_start && od <= cb->class_range_end) continue; - - /* Extend the original range if there is overlap, noting that if oc < c, we - can't have od > end because a subrange is always shorter than the basic - range. Otherwise, use a recursive call to add the additional range. */ - - else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */ - else if (od > end && oc <= end + 1) - { - end = od; /* Extend upwards */ - if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff); - } - else n8 += add_to_class_internal(classbits, uchardptr, options, cb, oc, od); - } - } - else -#endif /* SUPPORT_UNICODE */ - - /* Not UTF mode */ - - for (c = start; c <= classbits_end; c++) - { - SETBIT(classbits, cb->fcc[c]); - n8++; - } - } - -/* Now handle the originally supplied range. Adjust the final value according -to the bit length - this means that the same lists of (e.g.) horizontal spaces -can be used in all cases. */ - -if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR) - end = MAX_NON_UTF_CHAR; - -if (start > cb->class_range_start && end < cb->class_range_end) return n8; - -/* Use the bitmap for characters < 256. Otherwise use extra data.*/ - -for (c = start; c <= classbits_end; c++) - { - /* Regardless of start, c will always be <= 255. 
*/ - SETBIT(classbits, c); - n8++; - } - -#ifdef SUPPORT_WIDE_CHARS -if (start <= 0xff) start = 0xff + 1; - -if (end >= start) - { - PCRE2_UCHAR *uchardata = *uchardptr; - -#ifdef SUPPORT_UNICODE - if ((options & PCRE2_UTF) != 0) - { - if (start < end) - { - *uchardata++ = XCL_RANGE; - uchardata += PRIV(ord2utf)(start, uchardata); - uchardata += PRIV(ord2utf)(end, uchardata); - } - else if (start == end) - { - *uchardata++ = XCL_SINGLE; - uchardata += PRIV(ord2utf)(start, uchardata); - } - } - else -#endif /* SUPPORT_UNICODE */ - - /* Without UTF support, character values are constrained by the bit length, - and can only be > 256 for 16-bit and 32-bit libraries. */ - -#if PCRE2_CODE_UNIT_WIDTH == 8 - {} -#else - if (start < end) - { - *uchardata++ = XCL_RANGE; - *uchardata++ = start; - *uchardata++ = end; - } - else if (start == end) - { - *uchardata++ = XCL_SINGLE; - *uchardata++ = start; - } -#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ - *uchardptr = uchardata; /* Updata extra data pointer */ - } -#else /* SUPPORT_WIDE_CHARS */ - (void)uchardptr; /* Avoid compiler warning */ -#endif /* SUPPORT_WIDE_CHARS */ - -return n8; /* Number of 8-bit characters */ -} - - - -#ifdef SUPPORT_UNICODE -/************************************************* -* Add a list of characters to a class (internal) * -*************************************************/ - -/* This function is used for adding a list of case-equivalent characters to a -class when in UTF mode. This function is called only from within -add_to_class_internal(), with which it is mutually recursive. - -Arguments: - classbits the bit map for characters < 256 - uchardptr points to the pointer for extra data - options the options word - cb contains pointers to tables etc. 
- p points to row of 32-bit values, terminated by NOTACHAR - except character to omit; this is used when adding lists of - case-equivalent characters to avoid including the one we - already know about - -Returns: the number of < 256 characters added - the pointer to extra data is updated -*/ - -static unsigned int -add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr, - uint32_t options, compile_block *cb, const uint32_t *p, unsigned int except) -{ -unsigned int n8 = 0; -while (p[0] < NOTACHAR) - { - unsigned int n = 0; - if (p[0] != except) - { - while(p[n+1] == p[0] + n + 1) n++; - n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]); - } - p += n + 1; - } -return n8; -} -#endif - - - -/************************************************* -* External entry point for add range to class * -*************************************************/ - -/* This function sets the overall range so that the internal functions can try -to avoid duplication when handling case-independence. - -Arguments: - classbits the bit map for characters < 256 - uchardptr points to the pointer for extra data - options the options word - cb compile data - start start of range character - end end of range character - -Returns: the number of < 256 characters added - the pointer to extra data is updated -*/ - -static unsigned int -add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, - compile_block *cb, uint32_t start, uint32_t end) -{ -cb->class_range_start = start; -cb->class_range_end = end; -return add_to_class_internal(classbits, uchardptr, options, cb, start, end); -} - - -/************************************************* -* External entry point for add list to class * -*************************************************/ - -/* This function is used for adding a list of horizontal or vertical whitespace -characters to a class. The list must be in order so that ranges of characters -can be detected and handled appropriately. 
This function sets the overall range -so that the internal functions can try to avoid duplication when handling -case-independence. - -Arguments: - classbits the bit map for characters < 256 - uchardptr points to the pointer for extra data - options the options word - cb contains pointers to tables etc. - p points to row of 32-bit values, terminated by NOTACHAR - except character to omit; this is used when adding lists of - case-equivalent characters to avoid including the one we - already know about - -Returns: the number of < 256 characters added - the pointer to extra data is updated -*/ - -static unsigned int -add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, - compile_block *cb, const uint32_t *p, unsigned int except) -{ -unsigned int n8 = 0; -while (p[0] < NOTACHAR) - { - unsigned int n = 0; - if (p[0] != except) - { - while(p[n+1] == p[0] + n + 1) n++; - cb->class_range_start = p[0]; - cb->class_range_end = p[n]; - n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]); - } - p += n + 1; - } -return n8; -} - - - -/************************************************* -* Add characters not in a list to a class * -*************************************************/ - -/* This function is used for adding the complement of a list of horizontal or -vertical whitespace to a class. The list must be in order. - -Arguments: - classbits the bit map for characters < 256 - uchardptr points to the pointer for extra data - options the options word - cb contains pointers to tables etc. 
- p points to row of 32-bit values, terminated by NOTACHAR - -Returns: the number of < 256 characters added - the pointer to extra data is updated -*/ - -static unsigned int -add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, - uint32_t options, compile_block *cb, const uint32_t *p) -{ -BOOL utf = (options & PCRE2_UTF) != 0; -unsigned int n8 = 0; -if (p[0] > 0) - n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1); -while (p[0] < NOTACHAR) - { - while (p[1] == p[0] + 1) p++; - n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1, - (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1); - p++; - } -return n8; -} - - - -/************************************************* -* Find details of duplicate group names * -*************************************************/ - -/* This is called from compile_branch() when it needs to know the index and -count of duplicates in the names table when processing named backreferences, -either directly, or as conditions. - -Arguments: - name points to the name - length the length of the name - indexptr where to put the index - countptr where to put the count of duplicates - errorcodeptr where to put an error code - cb the compile block - -Returns: TRUE if OK, FALSE if not, error code set -*/ - -static BOOL -find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr, - int *countptr, int *errorcodeptr, compile_block *cb) -{ -uint32_t i, groupnumber; -int count; -PCRE2_UCHAR *slot = cb->name_table; - -/* Find the first entry in the table */ - -for (i = 0; i < cb->names_found; i++) - { - if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 && - slot[IMM2_SIZE+length] == 0) break; - slot += cb->name_entry_size; - } - -/* This should not occur, because this function is called only when we know we -have duplicate names. Give an internal error. 
*/ - -if (i >= cb->names_found) - { - *errorcodeptr = ERR53; - cb->erroroffset = name - cb->start_pattern; - return FALSE; - } - -/* Record the index and then see how many duplicates there are, updating the -backref map and maximum back reference as we do. */ - -*indexptr = i; -count = 0; - -for (;;) - { - count++; - groupnumber = GET2(slot,0); - cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1; - if (groupnumber > cb->top_backref) cb->top_backref = groupnumber; - if (++i >= cb->names_found) break; - slot += cb->name_entry_size; - if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 || - (slot+IMM2_SIZE)[length] != 0) break; - } - -*countptr = count; -return TRUE; -} - - - -/************************************************* -* Compile one branch * -*************************************************/ - -/* Scan the parsed pattern, compiling it into the a vector of PCRE2_UCHAR. If -the options are changed during the branch, the pointer is used to change the -external options bits. This function is used during the pre-compile phase when -we are trying to find out the amount of memory needed, as well as during the -real compile phase. The value of lengthptr distinguishes the two phases. - -Arguments: - optionsptr pointer to the option bits - codeptr points to the pointer to the current code point - pptrptr points to the current parsed pattern pointer - errorcodeptr points to error code variable - firstcuptr place to put the first required code unit - firstcuflagsptr place to put the first code unit flags, or a negative number - reqcuptr place to put the last required code unit - reqcuflagsptr place to put the last required code unit flags, or a negative number - bcptr points to current branch chain - cb contains pointers to tables etc. 
- lengthptr NULL during the real compile phase - points to length accumulator during pre-compile phase - -Returns: 0 There's been an error, *errorcodeptr is non-zero - +1 Success, this branch must match at least one character - -1 Success, this branch may match an empty string -*/ - -static int -compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr, uint32_t **pptrptr, - int *errorcodeptr, uint32_t *firstcuptr, int32_t *firstcuflagsptr, - uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr, - compile_block *cb, PCRE2_SIZE *lengthptr) -{ -int bravalue = 0; -int okreturn = -1; -int group_return = 0; -uint32_t repeat_min = 0, repeat_max = 0; /* To please picky compilers */ -uint32_t greedy_default, greedy_non_default; -uint32_t repeat_type, op_type; -uint32_t options = *optionsptr; /* May change dynamically */ -uint32_t firstcu, reqcu; -uint32_t zeroreqcu, zerofirstcu; -uint32_t escape; -uint32_t *pptr = *pptrptr; -uint32_t meta, meta_arg; -int32_t firstcuflags, reqcuflags; -int32_t zeroreqcuflags, zerofirstcuflags; -int32_t req_caseopt, reqvary, tempreqvary; -PCRE2_SIZE offset = 0; -PCRE2_SIZE length_prevgroup = 0; -PCRE2_UCHAR *code = *codeptr; -PCRE2_UCHAR *last_code = code; -PCRE2_UCHAR *orig_code = code; -PCRE2_UCHAR *tempcode; -PCRE2_UCHAR *previous = NULL; -PCRE2_UCHAR op_previous; -BOOL groupsetfirstcu = FALSE; -BOOL had_accept = FALSE; -BOOL matched_char = FALSE; -BOOL previous_matched_char = FALSE; -BOOL reset_caseful = FALSE; -const uint8_t *cbits = cb->cbits; -uint8_t classbits[32]; - -/* We can fish out the UTF setting once and for all into a BOOL, but we must -not do this for other options (e.g. PCRE2_EXTENDED) because they may change -dynamically as we process the pattern. */ - -#ifdef SUPPORT_UNICODE -BOOL utf = (options & PCRE2_UTF) != 0; -BOOL ucp = (options & PCRE2_UCP) != 0; -#else /* No Unicode support */ -BOOL utf = FALSE; -#endif - -/* Helper variables for OP_XCLASS opcode (for characters > 255). 
We define -class_uchardata always so that it can be passed to add_to_class() always, -though it will not be used in non-UTF 8-bit cases. This avoids having to supply -alternative calls for the different cases. */ - -PCRE2_UCHAR *class_uchardata; -#ifdef SUPPORT_WIDE_CHARS -BOOL xclass; -PCRE2_UCHAR *class_uchardata_base; -#endif - -/* Set up the default and non-default settings for greediness */ - -greedy_default = ((options & PCRE2_UNGREEDY) != 0); -greedy_non_default = greedy_default ^ 1; - -/* Initialize no first unit, no required unit. REQ_UNSET means "no char -matching encountered yet". It gets changed to REQ_NONE if we hit something that -matches a non-fixed first unit; reqcu just remains unset if we never find one. - -When we hit a repeat whose minimum is zero, we may have to adjust these values -to take the zero repeat into account. This is implemented by setting them to -zerofirstcu and zeroreqcu when such a repeat is encountered. The individual -item types that can be repeated set these backoff variables appropriately. */ - -firstcu = reqcu = zerofirstcu = zeroreqcu = 0; -firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET; - -/* The variable req_caseopt contains either the REQ_CASELESS value or zero, -according to the current setting of the caseless flag. The REQ_CASELESS value -leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables -to record the case status of the value. This is used only for ASCII characters. -*/ - -req_caseopt = ((options & PCRE2_CASELESS) != 0)? 
REQ_CASELESS:0; - -/* Switch on next META item until the end of the branch */ - -for (;; pptr++) - { -#ifdef SUPPORT_WIDE_CHARS - BOOL xclass_has_prop; -#endif - BOOL negate_class; - BOOL should_flip_negation; - BOOL match_all_or_no_wide_chars; - BOOL possessive_quantifier; - BOOL note_group_empty; - int class_has_8bitchar; - int i; - uint32_t mclength; - uint32_t skipunits; - uint32_t subreqcu, subfirstcu; - uint32_t groupnumber; - uint32_t verbarglen, verbculen; - int32_t subreqcuflags, subfirstcuflags; /* Must be signed */ - open_capitem *oc; - PCRE2_UCHAR mcbuffer[8]; - - /* Get next META item in the pattern and its potential argument. */ - - meta = META_CODE(*pptr); - meta_arg = META_DATA(*pptr); - - /* If we are in the pre-compile phase, accumulate the length used for the - previous cycle of this loop, unless the next item is a quantifier. */ - - if (lengthptr != NULL) - { - if (code > cb->start_workspace + cb->workspace_size - - WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */ - { - *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)? - ERR52 : ERR86; - return 0; - } - - /* There is at least one situation where code goes backwards: this is the - case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier - is processed, the whole class is eliminated. However, it is created first, - so we have to allow memory for it. Therefore, don't ever reduce the length - at this point. */ - - if (code < last_code) code = last_code; - - /* If the next thing is not a quantifier, we add the length of the previous - item into the total, and reset the code pointer to the start of the - workspace. Otherwise leave the previous item available to be quantified. 
*/ - - if (meta < META_ASTERISK || meta > META_MINMAX_QUERY) - { - if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code)) - { - *errorcodeptr = ERR20; /* Integer overflow */ - return 0; - } - *lengthptr += (PCRE2_SIZE)(code - orig_code); - if (*lengthptr > MAX_PATTERN_SIZE) - { - *errorcodeptr = ERR20; /* Pattern is too large */ - return 0; - } - code = orig_code; - } - - /* Remember where this code item starts so we can catch the "backwards" - case above next time round. */ - - last_code = code; - } - - /* Process the next parsed pattern item. If it is not a quantifier, remember - where it starts so that it can be quantified when a quantifier follows. - Checking for the legality of quantifiers happens in parse_regex(), except for - a quantifier after an assertion that is a condition. */ - - if (meta < META_ASTERISK || meta > META_MINMAX_QUERY) - { - previous = code; - if (matched_char && !had_accept) okreturn = 1; - } - - previous_matched_char = matched_char; - matched_char = FALSE; - note_group_empty = FALSE; - skipunits = 0; /* Default value for most subgroups */ - - switch(meta) - { - /* ===================================================================*/ - /* The branch terminates at pattern end or | or ) */ - - case META_END: - case META_ALT: - case META_KET: - *firstcuptr = firstcu; - *firstcuflagsptr = firstcuflags; - *reqcuptr = reqcu; - *reqcuflagsptr = reqcuflags; - *codeptr = code; - *pptrptr = pptr; - return okreturn; - - - /* ===================================================================*/ - /* Handle single-character metacharacters. In multiline mode, ^ disables - the setting of any following char as a first character. */ - - case META_CIRCUMFLEX: - if ((options & PCRE2_MULTILINE) != 0) - { - if (firstcuflags == REQ_UNSET) - zerofirstcuflags = firstcuflags = REQ_NONE; - *code++ = OP_CIRCM; - } - else *code++ = OP_CIRC; - break; - - case META_DOLLAR: - *code++ = ((options & PCRE2_MULTILINE) != 0)? 
OP_DOLLM : OP_DOLL; - break; - - /* There can never be a first char if '.' is first, whatever happens about - repeats. The value of reqcu doesn't change either. */ - - case META_DOT: - matched_char = TRUE; - if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; - zerofirstcu = firstcu; - zerofirstcuflags = firstcuflags; - zeroreqcu = reqcu; - zeroreqcuflags = reqcuflags; - *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY; - break; - - - /* ===================================================================*/ - /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set. - Otherwise, an initial ']' is taken as a data character. When empty classes - are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must - match any character, so generate OP_ALLANY. */ - - case META_CLASS_EMPTY: - case META_CLASS_EMPTY_NOT: - matched_char = TRUE; - *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL; - if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; - zerofirstcu = firstcu; - zerofirstcuflags = firstcuflags; - break; - - - /* ===================================================================*/ - /* Non-empty character class. If the included characters are all < 256, we - build a 32-byte bitmap of the permitted characters, except in the special - case where there is only one such character. For negated classes, we build - the map as usual, then invert it at the end. However, we use a different - opcode so that data characters > 255 can be handled correctly. - - If the class contains characters outside the 0-255 range, a different - opcode is compiled. It may optionally have a bit map for characters < 256, - but those above are are explicitly listed afterwards. A flag code unit - tells whether the bitmap is present, and whether this is a negated class or - not. 
*/ - - case META_CLASS_NOT: - case META_CLASS: - matched_char = TRUE; - negate_class = meta == META_CLASS_NOT; - - /* We can optimize the case of a single character in a class by generating - OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's - negative. In the negative case there can be no first char if this item is - first, whatever repeat count may follow. In the case of reqcu, save the - previous value for reinstating. */ - - /* NOTE: at present this optimization is not effective if the only - character in a class in 32-bit, non-UCP mode has its top bit set. */ - - if (pptr[1] < META_END && pptr[2] == META_CLASS_END) - { -#ifdef SUPPORT_UNICODE - uint32_t d; -#endif - uint32_t c = pptr[1]; - - pptr += 2; /* Move on to class end */ - if (meta == META_CLASS) /* A positive one-char class can be */ - { /* handled as a normal literal character. */ - meta = c; /* Set up the character */ - goto NORMAL_CHAR_SET; - } - - /* Handle a negative one-character class */ - - zeroreqcu = reqcu; - zeroreqcuflags = reqcuflags; - if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; - zerofirstcu = firstcu; - zerofirstcuflags = firstcuflags; - - /* For caseless UTF or UCP mode, check whether this character has more - than one other case. If so, generate a special OP_NOTPROP item instead of - OP_NOTI. */ - -#ifdef SUPPORT_UNICODE - if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 && - (d = UCD_CASESET(c)) != 0) - { - *code++ = OP_NOTPROP; - *code++ = PT_CLIST; - *code++ = d; - break; /* We are finished with this class */ - } -#endif - /* Char has only one other case, or UCP not available */ - - *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT; - code += PUTCHAR(c, code); - break; /* We are finished with this class */ - } /* End of 1-char optimization */ - - /* Handle character classes that contain more than just one literal - character. If there are exactly two characters in a positive class, see if - they are case partners. 
This can be optimized to generate a caseless single - character match (which also sets first/required code units if relevant). */ - - if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END && - pptr[3] == META_CLASS_END) - { - uint32_t c = pptr[1]; - -#ifdef SUPPORT_UNICODE - if (UCD_CASESET(c) == 0) -#endif - { - uint32_t d; - -#ifdef SUPPORT_UNICODE - if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else -#endif - { -#if PCRE2_CODE_UNIT_WIDTH != 8 - if (c > 255) d = c; else -#endif - d = TABLE_GET(c, cb->fcc, c); - } - - if (c != d && pptr[2] == d) - { - pptr += 3; /* Move on to class end */ - meta = c; - if ((options & PCRE2_CASELESS) == 0) - { - reset_caseful = TRUE; - options |= PCRE2_CASELESS; - req_caseopt = REQ_CASELESS; - } - goto CLASS_CASELESS_CHAR; - } - } - } - - /* If a non-extended class contains a negative special such as \S, we need - to flip the negation flag at the end, so that support for characters > 255 - works correctly (they are all included in the class). An extended class may - need to insert specific matching or non-matching code for wide characters. - */ - - should_flip_negation = match_all_or_no_wide_chars = FALSE; - - /* Extended class (xclass) will be used when characters > 255 - might match. */ - -#ifdef SUPPORT_WIDE_CHARS - xclass = FALSE; - class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */ - class_uchardata_base = class_uchardata; /* Save the start */ -#endif - - /* For optimization purposes, we track some properties of the class: - class_has_8bitchar will be non-zero if the class contains at least one - character with a code point less than 256; xclass_has_prop will be TRUE if - Unicode property checks are present in the class. */ - - class_has_8bitchar = 0; -#ifdef SUPPORT_WIDE_CHARS - xclass_has_prop = FALSE; -#endif - - /* Initialize the 256-bit (32-byte) bit map to all zeros. 
We build the map - in a temporary bit of memory, in case the class contains fewer than two - 8-bit characters because in that case the compiled code doesn't use the bit - map. */ - - memset(classbits, 0, 32 * sizeof(uint8_t)); - - /* Process items until META_CLASS_END is reached. */ - - while ((meta = *(++pptr)) != META_CLASS_END) - { - /* Handle POSIX classes such as [:alpha:] etc. */ - - if (meta == META_POSIX || meta == META_POSIX_NEG) - { - BOOL local_negate = (meta == META_POSIX_NEG); - int posix_class = *(++pptr); - int taboffset, tabopt; - uint8_t pbits[32]; - - should_flip_negation = local_negate; /* Note negative special */ - - /* If matching is caseless, upper and lower are converted to alpha. - This relies on the fact that the class table starts with alpha, - lower, upper as the first 3 entries. */ - - if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2) - posix_class = 0; - - /* When PCRE2_UCP is set, some of the POSIX classes are converted to - different escape sequences that use Unicode properties \p or \P. - Others that are not available via \p or \P have to generate - XCL_PROP/XCL_NOTPROP directly, which is done here. */ - -#ifdef SUPPORT_UNICODE - if ((options & PCRE2_UCP) != 0) switch(posix_class) - { - case PC_GRAPH: - case PC_PRINT: - case PC_PUNCT: - *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP; - *class_uchardata++ = (PCRE2_UCHAR) - ((posix_class == PC_GRAPH)? PT_PXGRAPH : - (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT); - *class_uchardata++ = 0; - xclass_has_prop = TRUE; - goto CONTINUE_CLASS; - - /* For the other POSIX classes (ascii, xdigit) we are going to - fall through to the non-UCP case and build a bit map for - characters with code points less than 256. However, if we are in - a negated POSIX class, characters with code points greater than - 255 must either all match or all not match, depending on whether - the whole class is not or is negated. For example, for - [[:^ascii:]... 
they must all match, whereas for [^[:^xdigit:]... - they must not. - - In the special case where there are no xclass items, this is - automatically handled by the use of OP_CLASS or OP_NCLASS, but an - explicit range is needed for OP_XCLASS. Setting a flag here - causes the range to be generated later when it is known that - OP_XCLASS is required. In the 8-bit library this is relevant only in - utf mode, since no wide characters can exist otherwise. */ - - default: -#if PCRE2_CODE_UNIT_WIDTH == 8 - if (utf) -#endif - match_all_or_no_wide_chars |= local_negate; - break; - } -#endif /* SUPPORT_UNICODE */ - - /* In the non-UCP case, or when UCP makes no difference, we build the - bit map for the POSIX class in a chunk of local store because we may - be adding and subtracting from it, and we don't want to subtract bits - that may be in the main map already. At the end we or the result into - the bit map that is being built. */ - - posix_class *= 3; - - /* Copy in the first table (always present) */ - - memcpy(pbits, cbits + posix_class_maps[posix_class], - 32 * sizeof(uint8_t)); - - /* If there is a second table, add or remove it as required. */ - - taboffset = posix_class_maps[posix_class + 1]; - tabopt = posix_class_maps[posix_class + 2]; - - if (taboffset >= 0) - { - if (tabopt >= 0) - for (i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset]; - else - for (i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset]; - } - - /* Now see if we need to remove any special characters. An option - value of 1 removes vertical space and 2 removes underscore. */ - - if (tabopt < 0) tabopt = -tabopt; - if (tabopt == 1) pbits[1] &= ~0x3c; - else if (tabopt == 2) pbits[11] &= 0x7f; - - /* Add the POSIX table or its complement into the main table that is - being built and we are done. 
*/ - - if (local_negate) - for (i = 0; i < 32; i++) classbits[i] |= ~pbits[i]; - else - for (i = 0; i < 32; i++) classbits[i] |= pbits[i]; - - /* Every class contains at least one < 256 character. */ - - class_has_8bitchar = 1; - goto CONTINUE_CLASS; /* End of POSIX handling */ - } - - /* Other than POSIX classes, the only items we should encounter are - \d-type escapes and literal characters (possibly as ranges). */ - - if (meta == META_BIGVALUE) - { - meta = *(++pptr); - goto CLASS_LITERAL; - } - - /* Any other non-literal must be an escape */ - - if (meta >= META_END) - { - if (META_CODE(meta) != META_ESCAPE) - { -#ifdef DEBUG_SHOW_PARSED - fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x " - "in character class\n", meta); -#endif - *errorcodeptr = ERR89; /* Internal error - unrecognized. */ - return 0; - } - escape = META_DATA(meta); - - /* Every class contains at least one < 256 character. */ - - class_has_8bitchar++; - - switch(escape) - { - case ESC_d: - for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit]; - break; - - case ESC_D: - should_flip_negation = TRUE; - for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_digit]; - break; - - case ESC_w: - for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word]; - break; - - case ESC_W: - should_flip_negation = TRUE; - for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_word]; - break; - - /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl - 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was - previously set by something earlier in the character class. - Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so - we could just adjust the appropriate bit. From PCRE 8.34 we no - longer treat \s and \S specially. 
*/ - - case ESC_s: - for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space]; - break; - - case ESC_S: - should_flip_negation = TRUE; - for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_space]; - break; - - /* When adding the horizontal or vertical space lists to a class, or - their complements, disable PCRE2_CASELESS, because it justs wastes - time, and in the "not-x" UTF cases can create unwanted duplicates in - the XCLASS list (provoked by characters that have more than one other - case and by both cases being in the same "not-x" sublist). */ - - case ESC_h: - (void)add_list_to_class(classbits, &class_uchardata, - options & ~PCRE2_CASELESS, cb, PRIV(hspace_list), NOTACHAR); - break; - - case ESC_H: - (void)add_not_list_to_class(classbits, &class_uchardata, - options & ~PCRE2_CASELESS, cb, PRIV(hspace_list)); - break; - - case ESC_v: - (void)add_list_to_class(classbits, &class_uchardata, - options & ~PCRE2_CASELESS, cb, PRIV(vspace_list), NOTACHAR); - break; - - case ESC_V: - (void)add_not_list_to_class(classbits, &class_uchardata, - options & ~PCRE2_CASELESS, cb, PRIV(vspace_list)); - break; - - /* If Unicode is not supported, \P and \p are not allowed and are - faulted at parse time, so will never appear here. */ - -#ifdef SUPPORT_UNICODE - case ESC_p: - case ESC_P: - { - uint32_t ptype = *(++pptr) >> 16; - uint32_t pdata = *pptr & 0xffff; - *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP; - *class_uchardata++ = ptype; - *class_uchardata++ = pdata; - xclass_has_prop = TRUE; - class_has_8bitchar--; /* Undo! */ - } - break; -#endif - } - - goto CONTINUE_CLASS; - } /* End handling \d-type escapes */ - - /* A literal character may be followed by a range meta. At parse time - there are checks for out-of-order characters, for ranges where the two - characters are equal, and for hyphens that cannot indicate a range. At - this point, therefore, no checking is needed. 
*/ - - else - { - uint32_t c, d; - - CLASS_LITERAL: - c = d = meta; - - /* Remember if \r or \n were explicitly used */ - - if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF; - - /* Process a character range */ - - if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED) - { -#ifdef EBCDIC - BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL); -#endif - pptr += 2; - d = *pptr; - if (d == META_BIGVALUE) d = *(++pptr); - - /* Remember an explicit \r or \n, and add the range to the class. */ - - if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF; - - /* In an EBCDIC environment, Perl treats alphabetic ranges specially - because there are holes in the encoding, and simply using the range - A-Z (for example) would include the characters in the holes. This - applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */ - -#ifdef EBCDIC - if (range_is_literal && - (cb->ctypes[c] & ctype_letter) != 0 && - (cb->ctypes[d] & ctype_letter) != 0 && - (c <= CHAR_z) == (d <= CHAR_z)) - { - uint32_t uc = (d <= CHAR_z)? 0 : 64; - uint32_t C = c - uc; - uint32_t D = d - uc; - - if (C <= CHAR_i) - { - class_has_8bitchar += - add_to_class(classbits, &class_uchardata, options, cb, C + uc, - ((D < CHAR_i)? D : CHAR_i) + uc); - C = CHAR_j; - } - - if (C <= D && C <= CHAR_r) - { - class_has_8bitchar += - add_to_class(classbits, &class_uchardata, options, cb, C + uc, - ((D < CHAR_r)? D : CHAR_r) + uc); - C = CHAR_s; - } - - if (C <= D) - { - class_has_8bitchar += - add_to_class(classbits, &class_uchardata, options, cb, C + uc, - D + uc); - } - } - else -#endif - /* Not an EBCDIC special range */ - - class_has_8bitchar += - add_to_class(classbits, &class_uchardata, options, cb, c, d); - goto CONTINUE_CLASS; /* Go get the next char in the class */ - } /* End of range handling */ - - - /* Handle a single character. 
*/ - - class_has_8bitchar += - add_to_class(classbits, &class_uchardata, options, cb, meta, meta); - } - - /* Continue to the next item in the class. */ - - CONTINUE_CLASS: - -#ifdef SUPPORT_WIDE_CHARS - /* If any wide characters or Unicode properties have been encountered, - set xclass = TRUE. Then, in the pre-compile phase, accumulate the length - of the extra data and reset the pointer. This is so that very large - classes that contain a zillion wide characters or Unicode property tests - do not overwrite the workspace (which is on the stack). */ - - if (class_uchardata > class_uchardata_base) - { - xclass = TRUE; - if (lengthptr != NULL) - { - *lengthptr += class_uchardata - class_uchardata_base; - class_uchardata = class_uchardata_base; - } - } -#endif - - continue; /* Needed to avoid error when not supporting wide chars */ - } /* End of main class-processing loop */ - - /* If this class is the first thing in the branch, there can be no first - char setting, whatever the repeat count. Any reqcu setting must remain - unchanged after any kind of repeat. */ - - if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; - zerofirstcu = firstcu; - zerofirstcuflags = firstcuflags; - zeroreqcu = reqcu; - zeroreqcuflags = reqcuflags; - - /* If there are characters with values > 255, or Unicode property settings - (\p or \P), we have to compile an extended class, with its own opcode, - unless there were no property settings and there was a negated special such - as \S in the class, and PCRE2_UCP is not set, because in that case all - characters > 255 are in or not in the class, so any that were explicitly - given as well can be ignored. - - In the UCP case, if certain negated POSIX classes ([:^ascii:] or - [^:xdigit:]) were present in a class, we either have to match or not match - all wide characters (depending on whether the whole class is or is not - negated). This requirement is indicated by match_all_or_no_wide_chars being - true. 
We do this by including an explicit range, which works in both cases. - This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there - cannot be any wide characters in 8-bit non-UTF mode. - - When there *are* properties in a positive UTF-8 or any 16-bit or 32_bit - class where \S etc is present without PCRE2_UCP, causing an extended class - to be compiled, we make sure that all characters > 255 are included by - forcing match_all_or_no_wide_chars to be true. - - If, when generating an xclass, there are no characters < 256, we can omit - the bitmap in the actual compiled code. */ - -#ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */ - if (xclass && ( -#ifdef SUPPORT_UNICODE - (options & PCRE2_UCP) != 0 || -#endif - xclass_has_prop || !should_flip_negation)) - { - if (match_all_or_no_wide_chars || ( -#if PCRE2_CODE_UNIT_WIDTH == 8 - utf && -#endif - should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0)) - { - *class_uchardata++ = XCL_RANGE; - if (utf) /* Will always be utf in the 8-bit library */ - { - class_uchardata += PRIV(ord2utf)(0x100, class_uchardata); - class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata); - } - else /* Can only happen for the 16-bit & 32-bit libraries */ - { -#if PCRE2_CODE_UNIT_WIDTH == 16 - *class_uchardata++ = 0x100; - *class_uchardata++ = 0xffffu; -#elif PCRE2_CODE_UNIT_WIDTH == 32 - *class_uchardata++ = 0x100; - *class_uchardata++ = 0xffffffffu; -#endif - } - } - *class_uchardata++ = XCL_END; /* Marks the end of extra data */ - *code++ = OP_XCLASS; - code += LINK_SIZE; - *code = negate_class? XCL_NOT:0; - if (xclass_has_prop) *code |= XCL_HASPROP; - - /* If the map is required, move up the extra data to make room for it; - otherwise just move the code pointer to the end of the extra data. 
*/ - - if (class_has_8bitchar > 0) - { - *code++ |= XCL_MAP; - (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code, - CU2BYTES(class_uchardata - code)); - if (negate_class && !xclass_has_prop) - { - /* Using 255 ^ instead of ~ avoids clang sanitize warning. */ - for (i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i]; - } - memcpy(code, classbits, 32); - code = class_uchardata + (32 / sizeof(PCRE2_UCHAR)); - } - else code = class_uchardata; - - /* Now fill in the complete length of the item */ - - PUT(previous, 1, (int)(code - previous)); - break; /* End of class handling */ - } -#endif /* SUPPORT_WIDE_CHARS */ - - /* If there are no characters > 255, or they are all to be included or - excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the - whole class was negated and whether there were negative specials such as \S - (non-UCP) in the class. Then copy the 32-byte map into the code vector, - negating it if necessary. */ - - *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS; - if (lengthptr == NULL) /* Save time in the pre-compile phase */ - { - if (negate_class) - { - /* Using 255 ^ instead of ~ avoids clang sanitize warning. */ - for (i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i]; - } - memcpy(code, classbits, 32); - } - code += 32 / sizeof(PCRE2_UCHAR); - break; /* End of class processing */ - - - /* ===================================================================*/ - /* Deal with (*VERB)s. */ - - /* Check for open captures before ACCEPT and close those that are within - the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an - assertion. In the first pass, just accumulate the length required; - otherwise hitting (*ACCEPT) inside many nested parentheses can cause - workspace overflow. Do not set firstcu after *ACCEPT. 
*/ - - case META_ACCEPT: - cb->had_accept = had_accept = TRUE; - for (oc = cb->open_caps; - oc != NULL && oc->assert_depth >= cb->assert_depth; - oc = oc->next) - { - if (lengthptr != NULL) - { - *lengthptr += CU2BYTES(1) + IMM2_SIZE; - } - else - { - *code++ = OP_CLOSE; - PUT2INC(code, 0, oc->number); - } - } - *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT; - if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; - break; - - case META_PRUNE: - case META_SKIP: - cb->had_pruneorskip = TRUE; - /* Fall through */ - case META_COMMIT: - case META_FAIL: - *code++ = verbops[(meta - META_MARK) >> 16]; - break; - - case META_THEN: - cb->external_flags |= PCRE2_HASTHEN; - *code++ = OP_THEN; - break; - - /* Handle verbs with arguments. Arguments can be very long, especially in - 16- and 32-bit modes, and can overflow the workspace in the first pass. - However, the argument length is constrained to be small enough to fit in - one code unit. This check happens in parse_regex(). In the first pass, - instead of putting the argument into memory, we just update the length - counter and set up an empty argument. */ - - case META_THEN_ARG: - cb->external_flags |= PCRE2_HASTHEN; - goto VERB_ARG; - - case META_PRUNE_ARG: - case META_SKIP_ARG: - cb->had_pruneorskip = TRUE; - /* Fall through */ - case META_MARK: - case META_COMMIT_ARG: - VERB_ARG: - *code++ = verbops[(meta - META_MARK) >> 16]; - /* The length is in characters. 
*/ - verbarglen = *(++pptr); - verbculen = 0; - tempcode = code++; - for (i = 0; i < (int)verbarglen; i++) - { - meta = *(++pptr); -#ifdef SUPPORT_UNICODE - if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else -#endif - { - mclength = 1; - mcbuffer[0] = meta; - } - if (lengthptr != NULL) *lengthptr += mclength; else - { - memcpy(code, mcbuffer, CU2BYTES(mclength)); - code += mclength; - verbculen += mclength; - } - } - - *tempcode = verbculen; /* Fill in the code unit length */ - *code++ = 0; /* Terminating zero */ - break; - - - /* ===================================================================*/ - /* Handle options change. The new setting must be passed back for use in - subsequent branches. Reset the greedy defaults and the case value for - firstcu and reqcu. */ - - case META_OPTIONS: - *optionsptr = options = *(++pptr); - greedy_default = ((options & PCRE2_UNGREEDY) != 0); - greedy_non_default = greedy_default ^ 1; - req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0; - break; - - - /* ===================================================================*/ - /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous - because it could be a numerical check on recursion, or a name check on a - group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that - we can handle it either way. We first try for a name; if not found, process - the number. */ - - case META_COND_RNUMBER: /* (?(Rdigits) */ - case META_COND_NAME: /* (?(name) or (?'name') or ?() */ - case META_COND_RNAME: /* (?(R&name) - test for recursion */ - bravalue = OP_COND; - { - int count, index; - PCRE2_SPTR name; - named_group *ng = cb->named_groups; - uint32_t length = *(++pptr); - - GETPLUSOFFSET(offset, pptr); - name = cb->start_pattern + offset; - - /* In the first pass, the names generated in the pre-pass are available, - but the main name table has not yet been created. 
Scan the list of names - generated in the pre-pass in order to get a number and whether or not - this name is duplicated. If it is not duplicated, we can handle it as a - numerical group. */ - - for (i = 0; i < cb->names_found; i++, ng++) - { - if (length == ng->length && - PRIV(strncmp)(name, ng->name, length) == 0) - { - if (!ng->isdup) - { - code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF; - PUT2(code, 2+LINK_SIZE, ng->number); - if (ng->number > cb->top_backref) cb->top_backref = ng->number; - skipunits = 1+IMM2_SIZE; - goto GROUP_PROCESS_NOTE_EMPTY; - } - break; /* Found a duplicated name */ - } - } - - /* If the name was not found we have a bad reference, unless we are - dealing with R, which is treated as a recursion test by number. - */ - - if (i >= cb->names_found) - { - groupnumber = 0; - if (meta == META_COND_RNUMBER) - { - for (i = 1; i < (int)length; i++) - { - groupnumber = groupnumber * 10 + name[i] - CHAR_0; - if (groupnumber > MAX_GROUP_NUMBER) - { - *errorcodeptr = ERR61; - cb->erroroffset = offset + i; - return 0; - } - } - } - - if (meta != META_COND_RNUMBER || groupnumber > cb->bracount) - { - *errorcodeptr = ERR15; - cb->erroroffset = offset; - return 0; - } - - /* (?Rdigits) treated as a recursion reference by number. A value of - zero (which is the result of both (?R) and (?R0)) means "any", and is - translated into RREF_ANY (which is 0xffff). */ - - if (groupnumber == 0) groupnumber = RREF_ANY; - code[1+LINK_SIZE] = OP_RREF; - PUT2(code, 2+LINK_SIZE, groupnumber); - skipunits = 1+IMM2_SIZE; - goto GROUP_PROCESS_NOTE_EMPTY; - } - - /* A duplicated name was found. Note that if an R name is found - (META_COND_RNUMBER), it is a reference test, not a recursion test. */ - - code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF; - - /* We have a duplicated name. In the compile pass we have to search the - main table in order to get the index and count values. 
*/ - - count = 0; /* Values for first pass (avoids compiler warning) */ - index = 0; - if (lengthptr == NULL && !find_dupname_details(name, length, &index, - &count, errorcodeptr, cb)) return 0; - - /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and - insert appropriate data values. */ - - code[1+LINK_SIZE]++; - skipunits = 1+2*IMM2_SIZE; - PUT2(code, 2+LINK_SIZE, index); - PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count); - } - goto GROUP_PROCESS_NOTE_EMPTY; - - /* The DEFINE condition is always false. Its internal groups may never - be called, so matched_char must remain false, hence the jump to - GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */ - - case META_COND_DEFINE: - bravalue = OP_COND; - GETPLUSOFFSET(offset, pptr); - code[1+LINK_SIZE] = OP_DEFINE; - skipunits = 1; - goto GROUP_PROCESS; - - /* Conditional test of a group's being set. */ - - case META_COND_NUMBER: - bravalue = OP_COND; - GETPLUSOFFSET(offset, pptr); - groupnumber = *(++pptr); - if (groupnumber > cb->bracount) - { - *errorcodeptr = ERR15; - cb->erroroffset = offset; - return 0; - } - if (groupnumber > cb->top_backref) cb->top_backref = groupnumber; - offset -= 2; /* Point at initial ( for too many branches error */ - code[1+LINK_SIZE] = OP_CREF; - skipunits = 1+IMM2_SIZE; - PUT2(code, 2+LINK_SIZE, groupnumber); - goto GROUP_PROCESS_NOTE_EMPTY; - - /* Test for the PCRE2 version. */ - - case META_COND_VERSION: - bravalue = OP_COND; - if (pptr[1] > 0) - code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) || - (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))? - OP_TRUE : OP_FALSE; - else - code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])? - OP_TRUE : OP_FALSE; - skipunits = 1; - pptr += 3; - goto GROUP_PROCESS_NOTE_EMPTY; - - /* The condition is an assertion, possibly preceded by a callout. 
*/ - - case META_COND_ASSERT: - bravalue = OP_COND; - goto GROUP_PROCESS_NOTE_EMPTY; - - - /* ===================================================================*/ - /* Handle all kinds of nested bracketed groups. The non-capturing, - non-conditional cases are here; others come to GROUP_PROCESS via goto. */ - - case META_LOOKAHEAD: - bravalue = OP_ASSERT; - cb->assert_depth += 1; - goto GROUP_PROCESS; - - case META_LOOKAHEAD_NA: - bravalue = OP_ASSERT_NA; - cb->assert_depth += 1; - goto GROUP_PROCESS; - - /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird - thing to do, but Perl allows all assertions to be quantified, and when - they contain capturing parentheses there may be a potential use for - this feature. Not that that applies to a quantified (?!) but we allow - it for uniformity. */ - - case META_LOOKAHEADNOT: - if (pptr[1] == META_KET && - (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY)) - { - *code++ = OP_FAIL; - pptr++; - } - else - { - bravalue = OP_ASSERT_NOT; - cb->assert_depth += 1; - goto GROUP_PROCESS; - } - break; - - case META_LOOKBEHIND: - bravalue = OP_ASSERTBACK; - cb->assert_depth += 1; - goto GROUP_PROCESS; - - case META_LOOKBEHINDNOT: - bravalue = OP_ASSERTBACK_NOT; - cb->assert_depth += 1; - goto GROUP_PROCESS; - - case META_LOOKBEHIND_NA: - bravalue = OP_ASSERTBACK_NA; - cb->assert_depth += 1; - goto GROUP_PROCESS; - - case META_ATOMIC: - bravalue = OP_ONCE; - goto GROUP_PROCESS_NOTE_EMPTY; - - case META_SCRIPT_RUN: - bravalue = OP_SCRIPT_RUN; - goto GROUP_PROCESS_NOTE_EMPTY; - - case META_NOCAPTURE: - bravalue = OP_BRA; - /* Fall through */ - - /* Process nested bracketed regex. The nesting depth is maintained for the - benefit of the stackguard function. The test for too deep nesting is now - done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS; - others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take - note of whether or not they may match an empty string. 
*/ - - GROUP_PROCESS_NOTE_EMPTY: - note_group_empty = TRUE; - - GROUP_PROCESS: - cb->parens_depth += 1; - *code = bravalue; - pptr++; - tempcode = code; - tempreqvary = cb->req_varyopt; /* Save value before group */ - length_prevgroup = 0; /* Initialize for pre-compile phase */ - - if ((group_return = - compile_regex( - options, /* The option state */ - &tempcode, /* Where to put code (updated) */ - &pptr, /* Input pointer (updated) */ - errorcodeptr, /* Where to put an error message */ - skipunits, /* Skip over bracket number */ - &subfirstcu, /* For possible first char */ - &subfirstcuflags, - &subreqcu, /* For possible last char */ - &subreqcuflags, - bcptr, /* Current branch chain */ - cb, /* Compile data block */ - (lengthptr == NULL)? NULL : /* Actual compile phase */ - &length_prevgroup /* Pre-compile phase */ - )) == 0) - return 0; /* Error */ - - cb->parens_depth -= 1; - - /* If that was a non-conditional significant group (not an assertion, not a - DEFINE) that matches at least one character, then the current item matches - a character. Conditionals are handled below. */ - - if (note_group_empty && bravalue != OP_COND && group_return > 0) - matched_char = TRUE; - - /* If we've just compiled an assertion, pop the assert depth. */ - - if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NA) - cb->assert_depth -= 1; - - /* At the end of compiling, code is still pointing to the start of the - group, while tempcode has been updated to point past the end of the group. - The parsed pattern pointer (pptr) is on the closing META_KET. - - If this is a conditional bracket, check that there are no more than - two branches in the group, or just one if it's a DEFINE group. We do this - in the real compile phase, not in the pre-pass, where the whole group may - not be available. 
*/ - - if (bravalue == OP_COND && lengthptr == NULL) - { - PCRE2_UCHAR *tc = code; - int condcount = 0; - - do { - condcount++; - tc += GET(tc,1); - } - while (*tc != OP_KET); - - /* A DEFINE group is never obeyed inline (the "condition" is always - false). It must have only one branch. Having checked this, change the - opcode to OP_FALSE. */ - - if (code[LINK_SIZE+1] == OP_DEFINE) - { - if (condcount > 1) - { - cb->erroroffset = offset; - *errorcodeptr = ERR54; - return 0; - } - code[LINK_SIZE+1] = OP_FALSE; - bravalue = OP_DEFINE; /* A flag to suppress char handling below */ - } - - /* A "normal" conditional group. If there is just one branch, we must not - make use of its firstcu or reqcu, because this is equivalent to an - empty second branch. Also, it may match an empty string. If there are two - branches, this item must match a character if the group must. */ - - else - { - if (condcount > 2) - { - cb->erroroffset = offset; - *errorcodeptr = ERR27; - return 0; - } - if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE; - else if (group_return > 0) matched_char = TRUE; - } - } - - /* In the pre-compile phase, update the length by the length of the group, - less the brackets at either end. Then reduce the compiled code to just a - set of non-capturing brackets so that it doesn't use much memory if it is - duplicated by a quantifier.*/ - - if (lengthptr != NULL) - { - if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE) - { - *errorcodeptr = ERR20; - return 0; - } - *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE; - code++; /* This already contains bravalue */ - PUTINC(code, 0, 1 + LINK_SIZE); - *code++ = OP_KET; - PUTINC(code, 0, 1 + LINK_SIZE); - break; /* No need to waste time with special character handling */ - } - - /* Otherwise update the main code pointer to the end of the group. */ - - code = tempcode; - - /* For a DEFINE group, required and first character settings are not - relevant. 
*/ - - if (bravalue == OP_DEFINE) break; - - /* Handle updating of the required and first code units for other types of - group. Update for normal brackets of all kinds, and conditions with two - branches (see code above). If the bracket is followed by a quantifier with - zero repeat, we have to back off. Hence the definition of zeroreqcu and - zerofirstcu outside the main loop so that they can be accessed for the back - off. */ - - zeroreqcu = reqcu; - zeroreqcuflags = reqcuflags; - zerofirstcu = firstcu; - zerofirstcuflags = firstcuflags; - groupsetfirstcu = FALSE; - - if (bravalue >= OP_ONCE) /* Not an assertion */ - { - /* If we have not yet set a firstcu in this branch, take it from the - subpattern, remembering that it was set here so that a repeat of more - than one can replicate it as reqcu if necessary. If the subpattern has - no firstcu, set "none" for the whole branch. In both cases, a zero - repeat forces firstcu to "none". */ - - if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET) - { - if (subfirstcuflags >= 0) - { - firstcu = subfirstcu; - firstcuflags = subfirstcuflags; - groupsetfirstcu = TRUE; - } - else firstcuflags = REQ_NONE; - zerofirstcuflags = REQ_NONE; - } - - /* If firstcu was previously set, convert the subpattern's firstcu - into reqcu if there wasn't one, using the vary flag that was in - existence beforehand. */ - - else if (subfirstcuflags >= 0 && subreqcuflags < 0) - { - subreqcu = subfirstcu; - subreqcuflags = subfirstcuflags | tempreqvary; - } - - /* If the subpattern set a required code unit (or set a first code unit - that isn't really the first code unit - see above), set it. */ - - if (subreqcuflags >= 0) - { - reqcu = subreqcu; - reqcuflags = subreqcuflags; - } - } - - /* For a forward assertion, we take the reqcu, if set, provided that the - group has also set a firstcu. This can be helpful if the pattern that - follows the assertion doesn't set a different char. For example, it's - useful for /(?=abcde).+/. 
We can't set firstcu for an assertion, however - because it leads to incorrect effect for patterns such as /(?=a)a.+/ when - the "real" "a" would then become a reqcu instead of a firstcu. This is - overcome by a scan at the end if there's no firstcu, looking for an - asserted first char. A similar effect for patterns like /(?=.*X)X$/ means - we must only take the reqcu when the group also set a firstcu. Otherwise, - in that example, 'X' ends up set for both. */ - - else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) && - subreqcuflags >= 0 && subfirstcuflags >= 0) - { - reqcu = subreqcu; - reqcuflags = subreqcuflags; - } - - break; /* End of nested group handling */ - - - /* ===================================================================*/ - /* Handle named backreferences and recursions. */ - - case META_BACKREF_BYNAME: - case META_RECURSE_BYNAME: - { - int count, index; - PCRE2_SPTR name; - BOOL is_dupname = FALSE; - named_group *ng = cb->named_groups; - uint32_t length = *(++pptr); - - GETPLUSOFFSET(offset, pptr); - name = cb->start_pattern + offset; - - /* In the first pass, the names generated in the pre-pass are available, - but the main name table has not yet been created. Scan the list of names - generated in the pre-pass in order to get a number and whether or not - this name is duplicated. */ - - groupnumber = 0; - for (i = 0; i < cb->names_found; i++, ng++) - { - if (length == ng->length && - PRIV(strncmp)(name, ng->name, length) == 0) - { - is_dupname = ng->isdup; - groupnumber = ng->number; - - /* For a recursion, that's all that is needed. We can now go to - the code that handles numerical recursion, applying it to the first - group with the given name. */ - - if (meta == META_RECURSE_BYNAME) - { - meta_arg = groupnumber; - goto HANDLE_NUMERICAL_RECURSION; - } - - /* For a back reference, update the back reference map and the - maximum back reference. */ - - cb->backref_map |= (groupnumber < 32)? 
(1u << groupnumber) : 1; - if (groupnumber > cb->top_backref) - cb->top_backref = groupnumber; - } - } - - /* If the name was not found we have a bad reference. */ - - if (groupnumber == 0) - { - *errorcodeptr = ERR15; - cb->erroroffset = offset; - return 0; - } - - /* If a back reference name is not duplicated, we can handle it as - a numerical reference. */ - - if (!is_dupname) - { - meta_arg = groupnumber; - goto HANDLE_SINGLE_REFERENCE; - } - - /* If a back reference name is duplicated, we generate a different - opcode to a numerical back reference. In the second pass we must - search for the index and count in the final name table. */ - - count = 0; /* Values for first pass (avoids compiler warning) */ - index = 0; - if (lengthptr == NULL && !find_dupname_details(name, length, &index, - &count, errorcodeptr, cb)) return 0; - - if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; - *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF; - PUT2INC(code, 0, index); - PUT2INC(code, 0, count); - } - break; - - - /* ===================================================================*/ - /* Handle a numerical callout. */ - - case META_CALLOUT_NUMBER: - code[0] = OP_CALLOUT; - PUT(code, 1, pptr[1]); /* Offset to next pattern item */ - PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */ - code[1 + 2*LINK_SIZE] = pptr[3]; - pptr += 3; - code += PRIV(OP_lengths)[OP_CALLOUT]; - break; - - - /* ===================================================================*/ - /* Handle a callout with a string argument. In the pre-pass we just compute - the length without generating anything. The length in pptr[3] includes both - delimiters; in the actual compile only the first one is copied, but a - terminating zero is added. Any doubled delimiters within the string make - this an overestimate, but it is not worth bothering about. 
*/ - - case META_CALLOUT_STRING: - if (lengthptr != NULL) - { - *lengthptr += pptr[3] + (1 + 4*LINK_SIZE); - pptr += 3; - SKIPOFFSET(pptr); - } - - /* In the real compile we can copy the string. The starting delimiter is - included so that the client can discover it if they want. We also pass the - start offset to help a script language give better error messages. */ - - else - { - PCRE2_SPTR pp; - uint32_t delimiter; - uint32_t length = pptr[3]; - PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE); - - code[0] = OP_CALLOUT_STR; - PUT(code, 1, pptr[1]); /* Offset to next pattern item */ - PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */ - - pptr += 3; - GETPLUSOFFSET(offset, pptr); /* Offset to string in pattern */ - pp = cb->start_pattern + offset; - delimiter = *callout_string++ = *pp++; - if (delimiter == CHAR_LEFT_CURLY_BRACKET) - delimiter = CHAR_RIGHT_CURLY_BRACKET; - PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1)); /* One after delimiter */ - - /* The syntax of the pattern was checked in the parsing scan. The length - includes both delimiters, but we have passed the opening one just above, - so we reduce length before testing it. The test is for > 1 because we do - not want to copy the final delimiter. This also ensures that pp[1] is - accessible. */ - - while (--length > 1) - { - if (*pp == delimiter && pp[1] == delimiter) - { - *callout_string++ = delimiter; - pp += 2; - length--; - } - else *callout_string++ = *pp++; - } - *callout_string++ = CHAR_NUL; - - /* Set the length of the entire item, the advance to its end. */ - - PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code)); - code = callout_string; - } - break; - - - /* ===================================================================*/ - /* Handle repetition. The different types are all sorted out in the parsing - pass. 
*/ - - case META_MINMAX_PLUS: - case META_MINMAX_QUERY: - case META_MINMAX: - repeat_min = *(++pptr); - repeat_max = *(++pptr); - goto REPEAT; - - case META_ASTERISK: - case META_ASTERISK_PLUS: - case META_ASTERISK_QUERY: - repeat_min = 0; - repeat_max = REPEAT_UNLIMITED; - goto REPEAT; - - case META_PLUS: - case META_PLUS_PLUS: - case META_PLUS_QUERY: - repeat_min = 1; - repeat_max = REPEAT_UNLIMITED; - goto REPEAT; - - case META_QUERY: - case META_QUERY_PLUS: - case META_QUERY_QUERY: - repeat_min = 0; - repeat_max = 1; - - REPEAT: - if (previous_matched_char && repeat_min > 0) matched_char = TRUE; - - /* Remember whether this is a variable length repeat, and default to - single-char opcodes. */ - - reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY; - op_type = 0; - - /* Adjust first and required code units for a zero repeat. */ - - if (repeat_min == 0) - { - firstcu = zerofirstcu; - firstcuflags = zerofirstcuflags; - reqcu = zeroreqcu; - reqcuflags = zeroreqcuflags; - } - - /* Note the greediness and possessiveness. */ - - switch (meta) - { - case META_MINMAX_PLUS: - case META_ASTERISK_PLUS: - case META_PLUS_PLUS: - case META_QUERY_PLUS: - repeat_type = 0; /* Force greedy */ - possessive_quantifier = TRUE; - break; - - case META_MINMAX_QUERY: - case META_ASTERISK_QUERY: - case META_PLUS_QUERY: - case META_QUERY_QUERY: - repeat_type = greedy_non_default; - possessive_quantifier = FALSE; - break; - - default: - repeat_type = greedy_default; - possessive_quantifier = FALSE; - break; - } - - /* Save start of previous item, in case we have to move it up in order to - insert something before it, and remember what it was. */ - - tempcode = previous; - op_previous = *previous; - - /* Now handle repetition for the different types of item. If the repeat - minimum and the repeat maximum are both 1, we can ignore the quantifier for - non-parenthesized items, as they have only one alternative. For anything in - parentheses, we must not ignore if {1} is possessive. 
*/ - - switch (op_previous) - { - /* If previous was a character or negated character match, abolish the - item and generate a repeat item instead. If a char item has a minimum of - more than one, ensure that it is set in reqcu - it might not be if a - sequence such as x{3} is the first thing in a branch because the x will - have gone into firstcu instead. */ - - case OP_CHAR: - case OP_CHARI: - case OP_NOT: - case OP_NOTI: - if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT; - op_type = chartypeoffset[op_previous - OP_CHAR]; - - /* Deal with UTF characters that take up more than one code unit. */ - -#ifdef MAYBE_UTF_MULTI - if (utf && NOT_FIRSTCU(code[-1])) - { - PCRE2_UCHAR *lastchar = code - 1; - BACKCHAR(lastchar); - mclength = (uint32_t)(code - lastchar); /* Length of UTF character */ - memcpy(mcbuffer, lastchar, CU2BYTES(mclength)); /* Save the char */ - } - else -#endif /* MAYBE_UTF_MULTI */ - - /* Handle the case of a single code unit - either with no UTF support, or - with UTF disabled, or for a single-code-unit UTF character. */ - { - mcbuffer[0] = code[-1]; - mclength = 1; - if (op_previous <= OP_CHARI && repeat_min > 1) - { - reqcu = mcbuffer[0]; - reqcuflags = req_caseopt | cb->req_varyopt; - } - } - goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ - - /* If previous was a character class or a back reference, we put the - repeat stuff after it, but just skip the item if the repeat was {0,0}. 
*/ - -#ifdef SUPPORT_WIDE_CHARS - case OP_XCLASS: -#endif - case OP_CLASS: - case OP_NCLASS: - case OP_REF: - case OP_REFI: - case OP_DNREF: - case OP_DNREFI: - - if (repeat_max == 0) - { - code = previous; - goto END_REPEAT; - } - if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT; - - if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED) - *code++ = OP_CRSTAR + repeat_type; - else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED) - *code++ = OP_CRPLUS + repeat_type; - else if (repeat_min == 0 && repeat_max == 1) - *code++ = OP_CRQUERY + repeat_type; - else - { - *code++ = OP_CRRANGE + repeat_type; - PUT2INC(code, 0, repeat_min); - if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0; /* 2-byte encoding for max */ - PUT2INC(code, 0, repeat_max); - } - break; - - /* If previous is OP_FAIL, it was generated by an empty class [] - (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be - generated, that is by (*FAIL) or (?!), disallow a quantifier at parse - time. We can just ignore this repeat. */ - - case OP_FAIL: - goto END_REPEAT; - - /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets - because pcre2_match() could not handle backtracking into recursively - called groups. Now that this backtracking is available, we no longer need - to do this. However, we still need to replicate recursions as we do for - groups so as to have independent backtracking points. We can replicate - for the minimum number of repeats directly. For optional repeats we now - wrap the recursion in OP_BRA brackets and make use of the bracket - repetition. */ - - case OP_RECURSE: - if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier) - goto END_REPEAT; - - /* Generate unwrapped repeats for a non-zero minimum, except when the - minimum is 1 and the maximum unlimited, because that can be handled with - OP_BRA terminated by OP_KETRMAX/MIN. 
When the maximum is equal to the - minimum, we just need to generate the appropriate additional copies. - Otherwise we need to generate one more, to simulate the situation when - the minimum is zero. */ - - if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED)) - { - int replicate = repeat_min; - if (repeat_min == repeat_max) replicate--; - - /* In the pre-compile phase, we don't actually do the replication. We - just adjust the length as if we had. Do some paranoid checks for - potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit - integer type when available, otherwise double. */ - - if (lengthptr != NULL) - { - PCRE2_SIZE delta = replicate*(1 + LINK_SIZE); - if ((INT64_OR_DOUBLE)replicate* - (INT64_OR_DOUBLE)(1 + LINK_SIZE) > - (INT64_OR_DOUBLE)INT_MAX || - OFLOW_MAX - *lengthptr < delta) - { - *errorcodeptr = ERR20; - return 0; - } - *lengthptr += delta; - } - - else for (i = 0; i < replicate; i++) - { - memcpy(code, previous, CU2BYTES(1 + LINK_SIZE)); - previous = code; - code += 1 + LINK_SIZE; - } - - /* If the number of repeats is fixed, we are done. Otherwise, adjust - the counts and fall through. */ - - if (repeat_min == repeat_max) break; - if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min; - repeat_min = 0; - } - - /* Wrap the recursion call in OP_BRA brackets. */ - - (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE)); - op_previous = *previous = OP_BRA; - PUT(previous, 1, 2 + 2*LINK_SIZE); - previous[2 + 2*LINK_SIZE] = OP_KET; - PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE); - code += 2 + 2 * LINK_SIZE; - length_prevgroup = 3 + 3*LINK_SIZE; - group_return = -1; /* Set "may match empty string" */ - - /* Now treat as a repeated OP_BRA. */ - /* Fall through */ - - /* If previous was a bracket group, we may have to replicate it in - certain cases. 
Note that at this point we can encounter only the "basic" - bracket opcodes such as BRA and CBRA, as this is the place where they get - converted into the more special varieties such as BRAPOS and SBRA. - Originally, PCRE did not allow repetition of assertions, but now it does, - for Perl compatibility. */ - - case OP_ASSERT: - case OP_ASSERT_NOT: - case OP_ASSERT_NA: - case OP_ASSERTBACK: - case OP_ASSERTBACK_NOT: - case OP_ASSERTBACK_NA: - case OP_ONCE: - case OP_SCRIPT_RUN: - case OP_BRA: - case OP_CBRA: - case OP_COND: - { - int len = (int)(code - previous); - PCRE2_UCHAR *bralink = NULL; - PCRE2_UCHAR *brazeroptr = NULL; - - if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier) - goto END_REPEAT; - - /* Repeating a DEFINE group (or any group where the condition is always - FALSE and there is only one branch) is pointless, but Perl allows the - syntax, so we just ignore the repeat. */ - - if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE && - previous[GET(previous, 1)] != OP_ALT) - goto END_REPEAT; - - /* Perl allows all assertions to be quantified, and when they contain - capturing parentheses and/or are optional there are potential uses for - this feature. PCRE2 used to force the maximum quantifier to 1 on the - invalid grounds that further repetition was never useful. This was - always a bit pointless, since an assertion could be wrapped with a - repeated group to achieve the effect. General repetition is now - permitted, but if the maximum is unlimited it is set to one more than - the minimum. */ - - if (op_previous < OP_ONCE) /* Assertion */ - { - if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1; - } - - /* The case of a zero minimum is special because of the need to stick - OP_BRAZERO in front of it, and because the group appears once in the - data, whereas in other cases it appears the minimum number of times. 
For - this reason, it is simplest to treat this case separately, as otherwise - the code gets far too messy. There are several special subcases when the - minimum is zero. */ - - if (repeat_min == 0) - { - /* If the maximum is also zero, we used to just omit the group from - the output altogether, like this: - - ** if (repeat_max == 0) - ** { - ** code = previous; - ** goto END_REPEAT; - ** } - - However, that fails when a group or a subgroup within it is - referenced as a subroutine from elsewhere in the pattern, so now we - stick in OP_SKIPZERO in front of it so that it is skipped on - execution. As we don't have a list of which groups are referenced, we - cannot do this selectively. - - If the maximum is 1 or unlimited, we just have to stick in the - BRAZERO and do no more at this point. */ - - if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED) - { - (void)memmove(previous + 1, previous, CU2BYTES(len)); - code++; - if (repeat_max == 0) - { - *previous++ = OP_SKIPZERO; - goto END_REPEAT; - } - brazeroptr = previous; /* Save for possessive optimizing */ - *previous++ = OP_BRAZERO + repeat_type; - } - - /* If the maximum is greater than 1 and limited, we have to replicate - in a nested fashion, sticking OP_BRAZERO before each set of brackets. - The first one has to be handled carefully because it's the original - copy, which has to be moved up. The remainder can be handled by code - that is common with the non-zero minimum case below. We have to - adjust the value or repeat_max, since one less copy is required. */ - - else - { - int linkoffset; - (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len)); - code += 2 + LINK_SIZE; - *previous++ = OP_BRAZERO + repeat_type; - *previous++ = OP_BRA; - - /* We chain together the bracket link offset fields that have to be - filled in later when the ends of the brackets are reached. */ - - linkoffset = (bralink == NULL)? 
0 : (int)(previous - bralink); - bralink = previous; - PUTINC(previous, 0, linkoffset); - } - - if (repeat_max != REPEAT_UNLIMITED) repeat_max--; - } - - /* If the minimum is greater than zero, replicate the group as many - times as necessary, and adjust the maximum to the number of subsequent - copies that we need. */ - - else - { - if (repeat_min > 1) - { - /* In the pre-compile phase, we don't actually do the replication. - We just adjust the length as if we had. Do some paranoid checks for - potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit - integer type when available, otherwise double. */ - - if (lengthptr != NULL) - { - PCRE2_SIZE delta = (repeat_min - 1)*length_prevgroup; - if ((INT64_OR_DOUBLE)(repeat_min - 1)* - (INT64_OR_DOUBLE)length_prevgroup > - (INT64_OR_DOUBLE)INT_MAX || - OFLOW_MAX - *lengthptr < delta) - { - *errorcodeptr = ERR20; - return 0; - } - *lengthptr += delta; - } - - /* This is compiling for real. If there is a set first code unit - for the group, and we have not yet set a "required code unit", set - it. */ - - else - { - if (groupsetfirstcu && reqcuflags < 0) - { - reqcu = firstcu; - reqcuflags = firstcuflags; - } - for (i = 1; (uint32_t)i < repeat_min; i++) - { - memcpy(code, previous, CU2BYTES(len)); - code += len; - } - } - } - - if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min; - } - - /* This code is common to both the zero and non-zero minimum cases. If - the maximum is limited, it replicates the group in a nested fashion, - remembering the bracket starts on a stack. In the case of a zero - minimum, the first one was set up above. In all cases the repeat_max - now specifies the number of additional copies needed. Again, we must - remember to replicate entries on the forward reference list. */ - - if (repeat_max != REPEAT_UNLIMITED) - { - /* In the pre-compile phase, we don't actually do the replication. We - just adjust the length as if we had. 
For each repetition we must add - 1 to the length for BRAZERO and for all but the last repetition we - must add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some - paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type - is a 64-bit integer type when available, otherwise double. */ - - if (lengthptr != NULL && repeat_max > 0) - { - PCRE2_SIZE delta = repeat_max*(length_prevgroup + 1 + 2 + 2*LINK_SIZE) - - 2 - 2*LINK_SIZE; /* Last one doesn't nest */ - if ((INT64_OR_DOUBLE)repeat_max * - (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE) - > (INT64_OR_DOUBLE)INT_MAX || - OFLOW_MAX - *lengthptr < delta) - { - *errorcodeptr = ERR20; - return 0; - } - *lengthptr += delta; - } - - /* This is compiling for real */ - - else for (i = repeat_max - 1; i >= 0; i--) - { - *code++ = OP_BRAZERO + repeat_type; - - /* All but the final copy start a new nesting, maintaining the - chain of brackets outstanding. */ - - if (i != 0) - { - int linkoffset; - *code++ = OP_BRA; - linkoffset = (bralink == NULL)? 0 : (int)(code - bralink); - bralink = code; - PUTINC(code, 0, linkoffset); - } - - memcpy(code, previous, CU2BYTES(len)); - code += len; - } - - /* Now chain through the pending brackets, and fill in their length - fields (which are holding the chain links pro tem). */ - - while (bralink != NULL) - { - int oldlinkoffset; - int linkoffset = (int)(code - bralink + 1); - PCRE2_UCHAR *bra = code - linkoffset; - oldlinkoffset = GET(bra, 1); - bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset; - *code++ = OP_KET; - PUTINC(code, 0, linkoffset); - PUT(bra, 1, linkoffset); - } - } - - /* If the maximum is unlimited, set a repeater in the final copy. For - SCRIPT_RUN and ONCE brackets, that's all we need to do. However, - possessively repeated ONCE brackets can be converted into non-capturing - brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this - saves having to deal with possessive ONCEs specially. 
- - Otherwise, when we are doing the actual compile phase, check to see - whether this group is one that could match an empty string. If so, - convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so - that runtime checking can be done. [This check is also applied to ONCE - and SCRIPT_RUN groups at runtime, but in a different way.] - - Then, if the quantifier was possessive and the bracket is not a - conditional, we convert the BRA code to the POS form, and the KET code - to KETRPOS. (It turns out to be convenient at runtime to detect this - kind of subpattern at both the start and at the end.) The use of - special opcodes makes it possible to reduce greatly the stack usage in - pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to - OP_BRAPOSZERO. - - Then, if the minimum number of matches is 1 or 0, cancel the possessive - flag so that the default action below, of wrapping everything inside - atomic brackets, does not happen. When the minimum is greater than 1, - there will be earlier copies of the group, and so we still have to wrap - the whole thing. */ - - else - { - PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE; - PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1); - - /* Convert possessive ONCE brackets to non-capturing */ - - if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA; - - /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need - to do is to set the KET. */ - - if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN) - *ketcode = OP_KETRMAX + repeat_type; - - /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs - (which have been converted to non-capturing above). */ - - else - { - /* In the compile phase, adjust the opcode if the group can match - an empty string. For a conditional group with only one branch, the - value of group_return will not show "could be empty", so we must - check that separately. 
*/ - - if (lengthptr == NULL) - { - if (group_return < 0) *bracode += OP_SBRA - OP_BRA; - if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT) - *bracode = OP_SCOND; - } - - /* Handle possessive quantifiers. */ - - if (possessive_quantifier) - { - /* For COND brackets, we wrap the whole thing in a possessively - repeated non-capturing bracket, because we have not invented POS - versions of the COND opcodes. */ - - if (*bracode == OP_COND || *bracode == OP_SCOND) - { - int nlen = (int)(code - bracode); - (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen)); - code += 1 + LINK_SIZE; - nlen += 1 + LINK_SIZE; - *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS; - *code++ = OP_KETRPOS; - PUTINC(code, 0, nlen); - PUT(bracode, 1, nlen); - } - - /* For non-COND brackets, we modify the BRA code and use KETRPOS. */ - - else - { - *bracode += 1; /* Switch to xxxPOS opcodes */ - *ketcode = OP_KETRPOS; - } - - /* If the minimum is zero, mark it as possessive, then unset the - possessive flag when the minimum is 0 or 1. */ - - if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO; - if (repeat_min < 2) possessive_quantifier = FALSE; - } - - /* Non-possessive quantifier */ - - else *ketcode = OP_KETRMAX + repeat_type; - } - } - } - break; - - /* If previous was a character type match (\d or similar), abolish it and - create a suitable repeat item. The code is shared with single-character - repeats by setting op_type to add a suitable offset into repeat_type. - Note the the Unicode property types will be present only when - SUPPORT_UNICODE is defined, but we don't wrap the little bits of code - here because it just makes it horribly messy. 
*/ - - default: - if (op_previous >= OP_EODN) /* Not a character type - internal error */ - { - *errorcodeptr = ERR10; - return 0; - } - else - { - int prop_type, prop_value; - PCRE2_UCHAR *oldcode; - - if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT; - - op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ - mclength = 0; /* Not a character */ - - if (op_previous == OP_PROP || op_previous == OP_NOTPROP) - { - prop_type = previous[1]; - prop_value = previous[2]; - } - else - { - /* Come here from just above with a character in mcbuffer/mclength. */ - OUTPUT_SINGLE_REPEAT: - prop_type = prop_value = -1; - } - - /* At this point, if prop_type == prop_value == -1 we either have a - character in mcbuffer when mclength is greater than zero, or we have - mclength zero, in which case there is a non-property character type in - op_previous. If prop_type/value are not negative, we have a property - character type in op_previous. */ - - oldcode = code; /* Save where we were */ - code = previous; /* Usually overwrite previous item */ - - /* If the maximum is zero then the minimum must also be zero; Perl allows - this case, so we do too - by simply omitting the item altogether. */ - - if (repeat_max == 0) goto END_REPEAT; - - /* Combine the op_type with the repeat_type */ - - repeat_type += op_type; - - /* A minimum of zero is handled either as the special case * or ?, or as - an UPTO, with the maximum given. */ - - if (repeat_min == 0) - { - if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type; - else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type; - else - { - *code++ = OP_UPTO + repeat_type; - PUT2INC(code, 0, repeat_max); - } - } - - /* A repeat minimum of 1 is optimized into some special cases. If the - maximum is unlimited, we use OP_PLUS. Otherwise, the original item is - left in place and, if the maximum is greater than 1, we use OP_UPTO with - one less than the maximum. 
*/ - - else if (repeat_min == 1) - { - if (repeat_max == REPEAT_UNLIMITED) - *code++ = OP_PLUS + repeat_type; - else - { - code = oldcode; /* Leave previous item in place */ - if (repeat_max == 1) goto END_REPEAT; - *code++ = OP_UPTO + repeat_type; - PUT2INC(code, 0, repeat_max - 1); - } - } - - /* The case {n,n} is just an EXACT, while the general case {n,m} is - handled as an EXACT followed by an UPTO or STAR or QUERY. */ - - else - { - *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */ - PUT2INC(code, 0, repeat_min); - - /* Unless repeat_max equals repeat_min, fill in the data for EXACT, - and then generate the second opcode. For a repeated Unicode property - match, there are two extra values that define the required property, - and mclength is set zero to indicate this. */ - - if (repeat_max != repeat_min) - { - if (mclength > 0) - { - memcpy(code, mcbuffer, CU2BYTES(mclength)); - code += mclength; - } - else - { - *code++ = op_previous; - if (prop_type >= 0) - { - *code++ = prop_type; - *code++ = prop_value; - } - } - - /* Now set up the following opcode */ - - if (repeat_max == REPEAT_UNLIMITED) - *code++ = OP_STAR + repeat_type; - else - { - repeat_max -= repeat_min; - if (repeat_max == 1) - { - *code++ = OP_QUERY + repeat_type; - } - else - { - *code++ = OP_UPTO + repeat_type; - PUT2INC(code, 0, repeat_max); - } - } - } - } - - /* Fill in the character or character type for the final opcode. */ - - if (mclength > 0) - { - memcpy(code, mcbuffer, CU2BYTES(mclength)); - code += mclength; - } - else - { - *code++ = op_previous; - if (prop_type >= 0) - { - *code++ = prop_type; - *code++ = prop_value; - } - } - } - break; - } /* End of switch on different op_previous values */ - - - /* If the character following a repeat is '+', possessive_quantifier is - TRUE. For some opcodes, there are special alternative opcodes for this - case. For anything else, we wrap the entire repeated item inside OP_ONCE - brackets. 
Logically, the '+' notation is just syntactic sugar, taken from - Sun's Java package, but the special opcodes can optimize it. - - Some (but not all) possessively repeated subpatterns have already been - completely handled in the code just above. For them, possessive_quantifier - is always FALSE at this stage. Note that the repeated item starts at - tempcode, not at previous, which might be the first part of a string whose - (former) last char we repeated. */ - - if (possessive_quantifier) - { - int len; - - /* Possessifying an EXACT quantifier has no effect, so we can ignore it. - However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6}, - {5,}, or {5,10}). We skip over an EXACT item; if the length of what - remains is greater than zero, there's a further opcode that can be - handled. If not, do nothing, leaving the EXACT alone. */ - - switch(*tempcode) - { - case OP_TYPEEXACT: - tempcode += PRIV(OP_lengths)[*tempcode] + - ((tempcode[1 + IMM2_SIZE] == OP_PROP - || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0); - break; - - /* CHAR opcodes are used for exacts whose count is 1. */ - - case OP_CHAR: - case OP_CHARI: - case OP_NOT: - case OP_NOTI: - case OP_EXACT: - case OP_EXACTI: - case OP_NOTEXACT: - case OP_NOTEXACTI: - tempcode += PRIV(OP_lengths)[*tempcode]; -#ifdef SUPPORT_UNICODE - if (utf && HAS_EXTRALEN(tempcode[-1])) - tempcode += GET_EXTRALEN(tempcode[-1]); -#endif - break; - - /* For the class opcodes, the repeat operator appears at the end; - adjust tempcode to point to it. */ - - case OP_CLASS: - case OP_NCLASS: - tempcode += 1 + 32/sizeof(PCRE2_UCHAR); - break; - -#ifdef SUPPORT_WIDE_CHARS - case OP_XCLASS: - tempcode += GET(tempcode, 1); - break; -#endif - } - - /* If tempcode is equal to code (which points to the end of the repeated - item), it means we have skipped an EXACT item but there is no following - QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. 
In - all other cases, tempcode will be pointing to the repeat opcode, and will - be less than code, so the value of len will be greater than 0. */ - - len = (int)(code - tempcode); - if (len > 0) - { - unsigned int repcode = *tempcode; - - /* There is a table for possessifying opcodes, all of which are less - than OP_CALLOUT. A zero entry means there is no possessified version. - */ - - if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0) - *tempcode = opcode_possessify[repcode]; - - /* For opcode without a special possessified version, wrap the item in - ONCE brackets. */ - - else - { - (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len)); - code += 1 + LINK_SIZE; - len += 1 + LINK_SIZE; - tempcode[0] = OP_ONCE; - *code++ = OP_KET; - PUTINC(code, 0, len); - PUT(tempcode, 1, len); - } - } - } - - /* We set the "follows varying string" flag for subsequently encountered - reqcus if it isn't already set and we have just passed a varying length - item. */ - - END_REPEAT: - cb->req_varyopt |= reqvary; - break; - - - /* ===================================================================*/ - /* Handle a 32-bit data character with a value greater than META_END. */ - - case META_BIGVALUE: - pptr++; - goto NORMAL_CHAR; - - - /* ===============================================================*/ - /* Handle a back reference by number, which is the meta argument. The - pattern offsets for back references to group numbers less than 10 are held - in a special vector, to avoid using more than two parsed pattern elements - in 64-bit environments. We only need the offset to the first occurrence, - because if that doesn't fail, subsequent ones will also be OK. 
*/ - - case META_BACKREF: - if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg]; - else GETPLUSOFFSET(offset, pptr); - - if (meta_arg > cb->bracount) - { - cb->erroroffset = offset; - *errorcodeptr = ERR15; /* Non-existent subpattern */ - return 0; - } - - /* Come here from named backref handling when the reference is to a - single group (that is, not to a duplicated name). The back reference - data will have already been updated. We must disable firstcu if not - set, to cope with cases like (?=(\w+))\1: which would otherwise set ':' - later. */ - - HANDLE_SINGLE_REFERENCE: - if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE; - *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF; - PUT2INC(code, 0, meta_arg); - - /* Update the map of back references, and keep the highest one. We - could do this in parse_regex() for numerical back references, but not - for named back references, because we don't know the numbers to which - named back references refer. So we do it all in this function. */ - - cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1; - if (meta_arg > cb->top_backref) cb->top_backref = meta_arg; - break; - - - /* ===============================================================*/ - /* Handle recursion by inserting the number of the called group (which is - the meta argument) after OP_RECURSE. At the end of compiling the pattern is - scanned and these numbers are replaced by offsets within the pattern. It is - done like this to avoid problems with forward references and adjusting - offsets when groups are duplicated and moved (as discovered in previous - implementations). Note that a recursion does not have a set first - character. 
*/ - - case META_RECURSE: - GETPLUSOFFSET(offset, pptr); - if (meta_arg > cb->bracount) - { - cb->erroroffset = offset; - *errorcodeptr = ERR15; /* Non-existent subpattern */ - return 0; - } - HANDLE_NUMERICAL_RECURSION: - *code = OP_RECURSE; - PUT(code, 1, meta_arg); - code += 1 + LINK_SIZE; - groupsetfirstcu = FALSE; - cb->had_recurse = TRUE; - if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; - zerofirstcu = firstcu; - zerofirstcuflags = firstcuflags; - break; - - - /* ===============================================================*/ - /* Handle capturing parentheses; the number is the meta argument. */ - - case META_CAPTURE: - bravalue = OP_CBRA; - skipunits = IMM2_SIZE; - PUT2(code, 1+LINK_SIZE, meta_arg); - cb->lastcapture = meta_arg; - goto GROUP_PROCESS_NOTE_EMPTY; - - - /* ===============================================================*/ - /* Handle escape sequence items. For ones like \d, the ESC_values are - arranged to be the same as the corresponding OP_values in the default case - when PCRE2_UCP is not set (which is the only case in which they will appear - here). - - Note: \Q and \E are never seen here, as they were dealt with in - parse_pattern(). Neither are numerical back references or recursions, which - were turned into META_BACKREF or META_RECURSE items, respectively. \k and - \g, when followed by names, are turned into META_BACKREF_BYNAME or - META_RECURSE_BYNAME. */ - - case META_ESCAPE: - - /* We can test for escape sequences that consume a character because their - values lie between ESC_b and ESC_Z; this may have to change if any new ones - are ever created. For these sequences, we disable the setting of a first - character if it hasn't already been set. */ - - if (meta_arg > ESC_b && meta_arg < ESC_Z) - { - matched_char = TRUE; - if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; - } - - /* Set values to reset to if this is followed by a zero repeat. 
*/ - - zerofirstcu = firstcu; - zerofirstcuflags = firstcuflags; - zeroreqcu = reqcu; - zeroreqcuflags = reqcuflags; - - /* If Unicode is not supported, \P and \p are not allowed and are - faulted at parse time, so will never appear here. */ - -#ifdef SUPPORT_UNICODE - if (meta_arg == ESC_P || meta_arg == ESC_p) - { - uint32_t ptype = *(++pptr) >> 16; - uint32_t pdata = *pptr & 0xffff; - - /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit - from the auto-anchoring code. */ - - if (meta_arg == ESC_p && ptype == PT_ANY) - { - *code++ = OP_ALLANY; - } - else - { - *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP; - *code++ = ptype; - *code++ = pdata; - } - break; /* End META_ESCAPE */ - } -#endif - - /* For the rest (including \X when Unicode is supported - if not it's - faulted at parse time), the OP value is the escape value when PCRE2_UCP is - not set; if it is set, these escapes do not show up here because they are - converted into Unicode property tests in parse_regex(). Note that \b and \B - do a one-character lookbehind, and \A also behaves as if it does. */ - - if (meta_arg == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */ - if ((meta_arg == ESC_b || meta_arg == ESC_B || meta_arg == ESC_A) && - cb->max_lookbehind == 0) - cb->max_lookbehind = 1; - - /* In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY - instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds. */ - -#if PCRE2_CODE_UNIT_WIDTH == 32 - *code++ = (meta_arg == ESC_C)? OP_ALLANY : meta_arg; -#else - *code++ = (!utf && meta_arg == ESC_C)? OP_ALLANY : meta_arg; -#endif - break; /* End META_ESCAPE */ - - - /* ===================================================================*/ - /* Handle an unrecognized meta value. A parsed pattern value less than - META_END is a literal. Otherwise we have a problem. 
*/ - - default: - if (meta >= META_END) - { -#ifdef DEBUG_SHOW_PARSED - fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr); -#endif - *errorcodeptr = ERR89; /* Internal error - unrecognized. */ - return 0; - } - - /* Handle a literal character. We come here by goto in the case of a - 32-bit, non-UTF character whose value is greater than META_END. */ - - NORMAL_CHAR: - meta = *pptr; /* Get the full 32 bits */ - NORMAL_CHAR_SET: /* Character is already in meta */ - matched_char = TRUE; - - /* For caseless UTF or UCP mode, check whether this character has more than - one other case. If so, generate a special OP_PROP item instead of OP_CHARI. - */ - -#ifdef SUPPORT_UNICODE - if ((utf||ucp) && (options & PCRE2_CASELESS) != 0) - { - uint32_t caseset = UCD_CASESET(meta); - if (caseset != 0) - { - *code++ = OP_PROP; - *code++ = PT_CLIST; - *code++ = caseset; - if (firstcuflags == REQ_UNSET) - firstcuflags = zerofirstcuflags = REQ_NONE; - break; /* End handling this meta item */ - } - } -#endif - - /* Caseful matches, or caseless and not one of the multicase characters. We - come here by goto in the case of a positive class that contains only - case-partners of a character with just two cases; matched_char has already - been set TRUE and options fudged if necessary. */ - - CLASS_CASELESS_CHAR: - - /* Get the character's code units into mcbuffer, with the length in - mclength. When not in UTF mode, the length is always 1. */ - -#ifdef SUPPORT_UNICODE - if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else -#endif - { - mclength = 1; - mcbuffer[0] = meta; - } - - /* Generate the appropriate code */ - - *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR; - memcpy(code, mcbuffer, CU2BYTES(mclength)); - code += mclength; - - /* Remember if \r or \n were seen */ - - if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL) - cb->external_flags |= PCRE2_HASCRORLF; - - /* Set the first and required code units appropriately. 
If no previous - first code unit, set it from this character, but revert to none on a zero - repeat. Otherwise, leave the firstcu value alone, and don't change it on - a zero repeat. */ - - if (firstcuflags == REQ_UNSET) - { - zerofirstcuflags = REQ_NONE; - zeroreqcu = reqcu; - zeroreqcuflags = reqcuflags; - - /* If the character is more than one code unit long, we can set a single - firstcu only if it is not to be matched caselessly. Multiple possible - starting code units may be picked up later in the studying code. */ - - if (mclength == 1 || req_caseopt == 0) - { - firstcu = mcbuffer[0]; - firstcuflags = req_caseopt; - if (mclength != 1) - { - reqcu = code[-1]; - reqcuflags = cb->req_varyopt; - } - } - else firstcuflags = reqcuflags = REQ_NONE; - } - - /* firstcu was previously set; we can set reqcu only if the length is - 1 or the matching is caseful. */ - - else - { - zerofirstcu = firstcu; - zerofirstcuflags = firstcuflags; - zeroreqcu = reqcu; - zeroreqcuflags = reqcuflags; - if (mclength == 1 || req_caseopt == 0) - { - reqcu = code[-1]; - reqcuflags = req_caseopt | cb->req_varyopt; - } - } - - /* If caselessness was temporarily instated, reset it. */ - - if (reset_caseful) - { - options &= ~PCRE2_CASELESS; - req_caseopt = 0; - reset_caseful = FALSE; - } - - break; /* End literal character handling */ - } /* End of big switch */ - } /* End of big loop */ - -/* Control never reaches here. */ -} - - - -/************************************************* -* Compile regex: a sequence of alternatives * -*************************************************/ - -/* On entry, pptr is pointing past the bracket meta, but on return it points to -the closing bracket or META_END. The code variable is pointing at the code unit -into which the BRA operator has been stored. This function is used during the -pre-compile phase when we are trying to find out the amount of memory needed, -as well as during the real compile phase. 
The value of lengthptr distinguishes -the two phases. - -Arguments: - options option bits, including any changes for this subpattern - codeptr -> the address of the current code pointer - pptrptr -> the address of the current parsed pattern pointer - errorcodeptr -> pointer to error code variable - skipunits skip this many code units at start (for brackets and OP_COND) - firstcuptr place to put the first required code unit - firstcuflagsptr place to put the first code unit flags, or a negative number - reqcuptr place to put the last required code unit - reqcuflagsptr place to put the last required code unit flags, or a negative number - bcptr pointer to the chain of currently open branches - cb points to the data block with tables pointers etc. - lengthptr NULL during the real compile phase - points to length accumulator during pre-compile phase - -Returns: 0 There has been an error - +1 Success, this group must match at least one character - -1 Success, this group may match an empty string -*/ - -static int -compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr, - int *errorcodeptr, uint32_t skipunits, uint32_t *firstcuptr, - int32_t *firstcuflagsptr, uint32_t *reqcuptr,int32_t *reqcuflagsptr, - branch_chain *bcptr, compile_block *cb, PCRE2_SIZE *lengthptr) -{ -PCRE2_UCHAR *code = *codeptr; -PCRE2_UCHAR *last_branch = code; -PCRE2_UCHAR *start_bracket = code; -BOOL lookbehind; -open_capitem capitem; -int capnumber = 0; -int okreturn = 1; -uint32_t *pptr = *pptrptr; -uint32_t firstcu, reqcu; -uint32_t lookbehindlength; -int32_t firstcuflags, reqcuflags; -uint32_t branchfirstcu, branchreqcu; -int32_t branchfirstcuflags, branchreqcuflags; -PCRE2_SIZE length; -branch_chain bc; - -/* If set, call the external function that checks for stack availability. 
*/ - -if (cb->cx->stack_guard != NULL && - cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data)) - { - *errorcodeptr= ERR33; - return 0; - } - -/* Miscellaneous initialization */ - -bc.outer = bcptr; -bc.current_branch = code; - -firstcu = reqcu = 0; -firstcuflags = reqcuflags = REQ_UNSET; - -/* Accumulate the length for use in the pre-compile phase. Start with the -length of the BRA and KET and any extra code units that are required at the -beginning. We accumulate in a local variable to save frequent testing of -lengthptr for NULL. We cannot do this by looking at the value of 'code' at the -start and end of each alternative, because compiled items are discarded during -the pre-compile phase so that the workspace is not exceeded. */ - -length = 2 + 2*LINK_SIZE + skipunits; - -/* Remember if this is a lookbehind assertion, and if it is, save its length -and skip over the pattern offset. */ - -lookbehind = *code == OP_ASSERTBACK || - *code == OP_ASSERTBACK_NOT || - *code == OP_ASSERTBACK_NA; - -if (lookbehind) - { - lookbehindlength = META_DATA(pptr[-1]); - pptr += SIZEOFFSET; - } -else lookbehindlength = 0; - -/* If this is a capturing subpattern, add to the chain of open capturing items -so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA -need be tested here; changing this opcode to one of its variants, e.g. -OP_SCBRAPOS, happens later, after the group has been compiled. */ - -if (*code == OP_CBRA) - { - capnumber = GET2(code, 1 + LINK_SIZE); - capitem.number = capnumber; - capitem.next = cb->open_caps; - capitem.assert_depth = cb->assert_depth; - cb->open_caps = &capitem; - } - -/* Offset is set zero to mark that this bracket is still open */ - -PUT(code, 1, 0); -code += 1 + LINK_SIZE + skipunits; - -/* Loop for each alternative branch */ - -for (;;) - { - int branch_return; - - /* Insert OP_REVERSE if this is as lookbehind assertion. 
*/ - - if (lookbehind && lookbehindlength > 0) - { - *code++ = OP_REVERSE; - PUTINC(code, 0, lookbehindlength); - length += 1 + LINK_SIZE; - } - - /* Now compile the branch; in the pre-compile phase its length gets added - into the length. */ - - if ((branch_return = - compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu, - &branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc, - cb, (lengthptr == NULL)? NULL : &length)) == 0) - return 0; - - /* If a branch can match an empty string, so can the whole group. */ - - if (branch_return < 0) okreturn = -1; - - /* In the real compile phase, there is some post-processing to be done. */ - - if (lengthptr == NULL) - { - /* If this is the first branch, the firstcu and reqcu values for the - branch become the values for the regex. */ - - if (*last_branch != OP_ALT) - { - firstcu = branchfirstcu; - firstcuflags = branchfirstcuflags; - reqcu = branchreqcu; - reqcuflags = branchreqcuflags; - } - - /* If this is not the first branch, the first char and reqcu have to - match the values from all the previous branches, except that if the - previous value for reqcu didn't have REQ_VARY set, it can still match, - and we set REQ_VARY for the group from this branch's value. */ - - else - { - /* If we previously had a firstcu, but it doesn't match the new branch, - we have to abandon the firstcu for the regex, but if there was - previously no reqcu, it takes on the value of the old firstcu. */ - - if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu) - { - if (firstcuflags >= 0) - { - if (reqcuflags < 0) - { - reqcu = firstcu; - reqcuflags = firstcuflags; - } - } - firstcuflags = REQ_NONE; - } - - /* If we (now or from before) have no firstcu, a firstcu from the - branch becomes a reqcu if there isn't a branch reqcu. 
*/ - - if (firstcuflags < 0 && branchfirstcuflags >= 0 && - branchreqcuflags < 0) - { - branchreqcu = branchfirstcu; - branchreqcuflags = branchfirstcuflags; - } - - /* Now ensure that the reqcus match */ - - if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) || - reqcu != branchreqcu) - reqcuflags = REQ_NONE; - else - { - reqcu = branchreqcu; - reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */ - } - } - } - - /* Handle reaching the end of the expression, either ')' or end of pattern. - In the real compile phase, go back through the alternative branches and - reverse the chain of offsets, with the field in the BRA item now becoming an - offset to the first alternative. If there are no alternatives, it points to - the end of the group. The length in the terminating ket is always the length - of the whole bracketed item. Return leaving the pointer at the terminating - char. */ - - if (META_CODE(*pptr) != META_ALT) - { - if (lengthptr == NULL) - { - PCRE2_SIZE branch_length = code - last_branch; - do - { - PCRE2_SIZE prev_length = GET(last_branch, 1); - PUT(last_branch, 1, branch_length); - branch_length = prev_length; - last_branch -= branch_length; - } - while (branch_length > 0); - } - - /* Fill in the ket */ - - *code = OP_KET; - PUT(code, 1, (int)(code - start_bracket)); - code += 1 + LINK_SIZE; - - /* If it was a capturing subpattern, remove the block from the chain. */ - - if (capnumber > 0) cb->open_caps = cb->open_caps->next; - - /* Set values to pass back */ - - *codeptr = code; - *pptrptr = pptr; - *firstcuptr = firstcu; - *firstcuflagsptr = firstcuflags; - *reqcuptr = reqcu; - *reqcuflagsptr = reqcuflags; - if (lengthptr != NULL) - { - if (OFLOW_MAX - *lengthptr < length) - { - *errorcodeptr = ERR20; - return 0; - } - *lengthptr += length; - } - return okreturn; - } - - /* Another branch follows. In the pre-compile phase, we can move the code - pointer back to where it was for the start of the first branch. 
(That is, - pretend that each branch is the only one.) - - In the real compile phase, insert an ALT node. Its length field points back - to the previous branch while the bracket remains open. At the end the chain - is reversed. It's done like this so that the start of the bracket has a - zero offset until it is closed, making it possible to detect recursion. */ - - if (lengthptr != NULL) - { - code = *codeptr + 1 + LINK_SIZE + skipunits; - length += 1 + LINK_SIZE; - } - else - { - *code = OP_ALT; - PUT(code, 1, (int)(code - last_branch)); - bc.current_branch = last_branch = code; - code += 1 + LINK_SIZE; - } - - /* Set the lookbehind length (if not in a lookbehind the value will be zero) - and then advance past the vertical bar. */ - - lookbehindlength = META_DATA(*pptr); - pptr++; - } -/* Control never reaches here */ -} - - - -/************************************************* -* Check for anchored pattern * -*************************************************/ - -/* Try to find out if this is an anchored regular expression. Consider each -alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket -all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then -it's anchored. However, if this is a multiline pattern, then only OP_SOD will -be found, because ^ generates OP_CIRCM in that mode. - -We can also consider a regex to be anchored if OP_SOM starts all its branches. -This is the code for \G, which means "match at start of match position, taking -into account the match offset". - -A branch is also implicitly anchored if it starts with .* and DOTALL is set, -because that will try the rest of the pattern at all possible matching points, -so there is no point trying again.... er .... - -.... except when the .* appears inside capturing parentheses, and there is a -subsequent back reference to those parentheses. We haven't enough information -to catch that case precisely. 
- -At first, the best we could do was to detect when .* was in capturing brackets -and the highest back reference was greater than or equal to that level. -However, by keeping a bitmap of the first 31 back references, we can catch some -of the more common cases more precisely. - -... A second exception is when the .* appears inside an atomic group, because -this prevents the number of characters it matches from being adjusted. - -Arguments: - code points to start of the compiled pattern - bracket_map a bitmap of which brackets we are inside while testing; this - handles up to substring 31; after that we just have to take - the less precise approach - cb points to the compile data block - atomcount atomic group level - inassert TRUE if in an assertion - -Returns: TRUE or FALSE -*/ - -static BOOL -is_anchored(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb, - int atomcount, BOOL inassert) -{ -do { - PCRE2_SPTR scode = first_significant_code( - code + PRIV(OP_lengths)[*code], FALSE); - int op = *scode; - - /* Non-capturing brackets */ - - if (op == OP_BRA || op == OP_BRAPOS || - op == OP_SBRA || op == OP_SBRAPOS) - { - if (!is_anchored(scode, bracket_map, cb, atomcount, inassert)) - return FALSE; - } - - /* Capturing brackets */ - - else if (op == OP_CBRA || op == OP_CBRAPOS || - op == OP_SCBRA || op == OP_SCBRAPOS) - { - int n = GET2(scode, 1+LINK_SIZE); - int new_map = bracket_map | ((n < 32)? (1u << n) : 1); - if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE; - } - - /* Positive forward assertion */ - - else if (op == OP_ASSERT || op == OP_ASSERT_NA) - { - if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE; - } - - /* Condition. If there is no second branch, it can't be anchored. 
*/ - - else if (op == OP_COND || op == OP_SCOND) - { - if (scode[GET(scode,1)] != OP_ALT) return FALSE; - if (!is_anchored(scode, bracket_map, cb, atomcount, inassert)) - return FALSE; - } - - /* Atomic groups */ - - else if (op == OP_ONCE) - { - if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert)) - return FALSE; - } - - /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and - it isn't in brackets that are or may be referenced or inside an atomic - group or an assertion. Also the pattern must not contain *PRUNE or *SKIP, - because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/ - with the subject "aab", which matches "b", i.e. not at the start of a line. - There is also an option that disables auto-anchoring. */ - - else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR || - op == OP_TYPEPOSSTAR)) - { - if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 || - atomcount > 0 || cb->had_pruneorskip || inassert || - (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0) - return FALSE; - } - - /* Check for explicit anchoring */ - - else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE; - - code += GET(code, 1); - } -while (*code == OP_ALT); /* Loop for each alternative */ -return TRUE; -} - - - -/************************************************* -* Check for starting with ^ or .* * -*************************************************/ - -/* This is called to find out if every branch starts with ^ or .* so that -"first char" processing can be done to speed things up in multiline -matching and for non-DOTALL patterns that start with .* (which must start at -the beginning or after \n). As in the case of is_anchored() (see above), we -have to take account of back references to capturing brackets that contain .* -because in that case we can't make the assumption. 
Also, the appearance of .* -inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE -or *SKIP does not count, because once again the assumption no longer holds. - -Arguments: - code points to start of the compiled pattern or a group - bracket_map a bitmap of which brackets we are inside while testing; this - handles up to substring 31; after that we just have to take - the less precise approach - cb points to the compile data - atomcount atomic group level - inassert TRUE if in an assertion - -Returns: TRUE or FALSE -*/ - -static BOOL -is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb, - int atomcount, BOOL inassert) -{ -do { - PCRE2_SPTR scode = first_significant_code( - code + PRIV(OP_lengths)[*code], FALSE); - int op = *scode; - - /* If we are at the start of a conditional assertion group, *both* the - conditional assertion *and* what follows the condition must satisfy the test - for start of line. Other kinds of condition fail. Note that there may be an - auto-callout at the start of a condition. 
*/ - - if (op == OP_COND) - { - scode += 1 + LINK_SIZE; - - if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT]; - else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE); - - switch (*scode) - { - case OP_CREF: - case OP_DNCREF: - case OP_RREF: - case OP_DNRREF: - case OP_FAIL: - case OP_FALSE: - case OP_TRUE: - return FALSE; - - default: /* Assertion */ - if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE; - do scode += GET(scode, 1); while (*scode == OP_ALT); - scode += 1 + LINK_SIZE; - break; - } - scode = first_significant_code(scode, FALSE); - op = *scode; - } - - /* Non-capturing brackets */ - - if (op == OP_BRA || op == OP_BRAPOS || - op == OP_SBRA || op == OP_SBRAPOS) - { - if (!is_startline(scode, bracket_map, cb, atomcount, inassert)) - return FALSE; - } - - /* Capturing brackets */ - - else if (op == OP_CBRA || op == OP_CBRAPOS || - op == OP_SCBRA || op == OP_SCBRAPOS) - { - int n = GET2(scode, 1+LINK_SIZE); - int new_map = bracket_map | ((n < 32)? (1u << n) : 1); - if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE; - } - - /* Positive forward assertions */ - - else if (op == OP_ASSERT || op == OP_ASSERT_NA) - { - if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) - return FALSE; - } - - /* Atomic brackets */ - - else if (op == OP_ONCE) - { - if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert)) - return FALSE; - } - - /* .* means "start at start or after \n" if it isn't in atomic brackets or - brackets that may be referenced or an assertion, and as long as the pattern - does not contain *PRUNE or *SKIP, because these break the feature. Consider, - for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", - i.e. not at the start of a line. There is also an option that disables this - optimization. 
*/ - - else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR) - { - if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 || - atomcount > 0 || cb->had_pruneorskip || inassert || - (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0) - return FALSE; - } - - /* Check for explicit circumflex; anything else gives a FALSE result. Note - in particular that this includes atomic brackets OP_ONCE because the number - of characters matched by .* cannot be adjusted inside them. */ - - else if (op != OP_CIRC && op != OP_CIRCM) return FALSE; - - /* Move on to the next alternative */ - - code += GET(code, 1); - } -while (*code == OP_ALT); /* Loop for each alternative */ -return TRUE; -} - - - -/************************************************* -* Scan compiled regex for recursion reference * -*************************************************/ - -/* This function scans through a compiled pattern until it finds an instance of -OP_RECURSE. - -Arguments: - code points to start of expression - utf TRUE in UTF mode - -Returns: pointer to the opcode for OP_RECURSE, or NULL if not found -*/ - -static PCRE2_SPTR -find_recurse(PCRE2_SPTR code, BOOL utf) -{ -for (;;) - { - PCRE2_UCHAR c = *code; - if (c == OP_END) return NULL; - if (c == OP_RECURSE) return code; - - /* XCLASS is used for classes that cannot be represented just by a bit map. - This includes negated single high-valued characters. CALLOUT_STR is used for - callouts with string arguments. In both cases the length in the table is - zero; the actual length is stored in the compiled code. */ - - if (c == OP_XCLASS) code += GET(code, 1); - else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE); - - /* Otherwise, we can get the item's length from the table, except that for - repeated character types, we have to test for \p and \P, which have an extra - two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, - we must add in its length. 
*/ - - else - { - switch(c) - { - case OP_TYPESTAR: - case OP_TYPEMINSTAR: - case OP_TYPEPLUS: - case OP_TYPEMINPLUS: - case OP_TYPEQUERY: - case OP_TYPEMINQUERY: - case OP_TYPEPOSSTAR: - case OP_TYPEPOSPLUS: - case OP_TYPEPOSQUERY: - if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; - break; - - case OP_TYPEPOSUPTO: - case OP_TYPEUPTO: - case OP_TYPEMINUPTO: - case OP_TYPEEXACT: - if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) - code += 2; - break; - - case OP_MARK: - case OP_COMMIT_ARG: - case OP_PRUNE_ARG: - case OP_SKIP_ARG: - case OP_THEN_ARG: - code += code[1]; - break; - } - - /* Add in the fixed length from the table */ - - code += PRIV(OP_lengths)[c]; - - /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may - be followed by a multi-unit character. The length in the table is a - minimum, so we have to arrange to skip the extra units. */ - -#ifdef MAYBE_UTF_MULTI - if (utf) switch(c) - { - case OP_CHAR: - case OP_CHARI: - case OP_NOT: - case OP_NOTI: - case OP_EXACT: - case OP_EXACTI: - case OP_NOTEXACT: - case OP_NOTEXACTI: - case OP_UPTO: - case OP_UPTOI: - case OP_NOTUPTO: - case OP_NOTUPTOI: - case OP_MINUPTO: - case OP_MINUPTOI: - case OP_NOTMINUPTO: - case OP_NOTMINUPTOI: - case OP_POSUPTO: - case OP_POSUPTOI: - case OP_NOTPOSUPTO: - case OP_NOTPOSUPTOI: - case OP_STAR: - case OP_STARI: - case OP_NOTSTAR: - case OP_NOTSTARI: - case OP_MINSTAR: - case OP_MINSTARI: - case OP_NOTMINSTAR: - case OP_NOTMINSTARI: - case OP_POSSTAR: - case OP_POSSTARI: - case OP_NOTPOSSTAR: - case OP_NOTPOSSTARI: - case OP_PLUS: - case OP_PLUSI: - case OP_NOTPLUS: - case OP_NOTPLUSI: - case OP_MINPLUS: - case OP_MINPLUSI: - case OP_NOTMINPLUS: - case OP_NOTMINPLUSI: - case OP_POSPLUS: - case OP_POSPLUSI: - case OP_NOTPOSPLUS: - case OP_NOTPOSPLUSI: - case OP_QUERY: - case OP_QUERYI: - case OP_NOTQUERY: - case OP_NOTQUERYI: - case OP_MINQUERY: - case OP_MINQUERYI: - case OP_NOTMINQUERY: - case OP_NOTMINQUERYI: - case 
OP_POSQUERY: - case OP_POSQUERYI: - case OP_NOTPOSQUERY: - case OP_NOTPOSQUERYI: - if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); - break; - } -#else - (void)(utf); /* Keep compiler happy by referencing function argument */ -#endif /* MAYBE_UTF_MULTI */ - } - } -} - - - -/************************************************* -* Check for asserted fixed first code unit * -*************************************************/ - -/* During compilation, the "first code unit" settings from forward assertions -are discarded, because they can cause conflicts with actual literals that -follow. However, if we end up without a first code unit setting for an -unanchored pattern, it is worth scanning the regex to see if there is an -initial asserted first code unit. If all branches start with the same asserted -code unit, or with a non-conditional bracket all of whose alternatives start -with the same asserted code unit (recurse ad lib), then we return that code -unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with -REQ_NONE in the flags. - -Arguments: - code points to start of compiled pattern - flags points to the first code unit flags - inassert non-zero if in an assertion - -Returns: the fixed first code unit, or 0 with REQ_NONE in flags -*/ - -static uint32_t -find_firstassertedcu(PCRE2_SPTR code, int32_t *flags, uint32_t inassert) -{ -uint32_t c = 0; -int cflags = REQ_NONE; - -*flags = REQ_NONE; -do { - uint32_t d; - int dflags; - int xl = (*code == OP_CBRA || *code == OP_SCBRA || - *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? 
IMM2_SIZE:0; - PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE); - PCRE2_UCHAR op = *scode; - - switch(op) - { - default: - return 0; - - case OP_BRA: - case OP_BRAPOS: - case OP_CBRA: - case OP_SCBRA: - case OP_CBRAPOS: - case OP_SCBRAPOS: - case OP_ASSERT: - case OP_ASSERT_NA: - case OP_ONCE: - case OP_SCRIPT_RUN: - d = find_firstassertedcu(scode, &dflags, inassert + - ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0)); - if (dflags < 0) - return 0; - if (cflags < 0) { c = d; cflags = dflags; } - else if (c != d || cflags != dflags) return 0; - break; - - case OP_EXACT: - scode += IMM2_SIZE; - /* Fall through */ - - case OP_CHAR: - case OP_PLUS: - case OP_MINPLUS: - case OP_POSPLUS: - if (inassert == 0) return 0; - if (cflags < 0) { c = scode[1]; cflags = 0; } - else if (c != scode[1]) return 0; - break; - - case OP_EXACTI: - scode += IMM2_SIZE; - /* Fall through */ - - case OP_CHARI: - case OP_PLUSI: - case OP_MINPLUSI: - case OP_POSPLUSI: - if (inassert == 0) return 0; - - /* If the character is more than one code unit long, we cannot set its - first code unit when matching caselessly. Later scanning may pick up - multiple code units. */ - -#ifdef SUPPORT_UNICODE -#if PCRE2_CODE_UNIT_WIDTH == 8 - if (scode[1] >= 0x80) return 0; -#elif PCRE2_CODE_UNIT_WIDTH == 16 - if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0; -#endif -#endif - - if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; } - else if (c != scode[1]) return 0; - break; - } - - code += GET(code, 1); - } -while (*code == OP_ALT); - -*flags = cflags; -return c; -} - - - -/************************************************* -* Add an entry to the name/number table * -*************************************************/ - -/* This function is called between compiling passes to add an entry to the -name/number table, maintaining alphabetical order. Checking for permitted -and forbidden duplicates has already been done. 
- -Arguments: - cb the compile data block - name the name to add - length the length of the name - groupno the group number - tablecount the count of names in the table so far - -Returns: nothing -*/ - -static void -add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length, - unsigned int groupno, uint32_t tablecount) -{ -uint32_t i; -PCRE2_UCHAR *slot = cb->name_table; - -for (i = 0; i < tablecount; i++) - { - int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length)); - if (crc == 0 && slot[IMM2_SIZE+length] != 0) - crc = -1; /* Current name is a substring */ - - /* Make space in the table and break the loop for an earlier name. For a - duplicate or later name, carry on. We do this for duplicates so that in the - simple case (when ?(| is not used) they are in order of their numbers. In all - cases they are in the order in which they appear in the pattern. */ - - if (crc < 0) - { - (void)memmove(slot + cb->name_entry_size, slot, - CU2BYTES((tablecount - i) * cb->name_entry_size)); - break; - } - - /* Continue the loop for a later or duplicate name */ - - slot += cb->name_entry_size; - } - -PUT2(slot, 0, groupno); -memcpy(slot + IMM2_SIZE, name, CU2BYTES(length)); - -/* Add a terminating zero and fill the rest of the slot with zeroes so that -the memory is all initialized. Otherwise valgrind moans about uninitialized -memory when saving serialized compiled patterns. */ - -memset(slot + IMM2_SIZE + length, 0, - CU2BYTES(cb->name_entry_size - length - IMM2_SIZE)); -} - - - -/************************************************* -* Skip in parsed pattern * -*************************************************/ - -/* This function is called to skip parts of the parsed pattern when finding the -length of a lookbehind branch. 
It is called after (*ACCEPT) and (*FAIL) to find -the end of the branch, it is called to skip over an internal lookaround or -(DEFINE) group, and it is also called to skip to the end of a class, during -which it will never encounter nested groups (but there's no need to have -special code for that). - -When called to find the end of a branch or group, pptr must point to the first -meta code inside the branch, not the branch-starting code. In other cases it -can point to the item that causes the function to be called. - -Arguments: - pptr current pointer to skip from - skiptype PSKIP_CLASS when skipping to end of class - PSKIP_ALT when META_ALT ends the skip - PSKIP_KET when only META_KET ends the skip - -Returns: new value of pptr - NULL if META_END is reached - should never occur - or for an unknown meta value - likewise -*/ - -static uint32_t * -parsed_skip(uint32_t *pptr, uint32_t skiptype) -{ -uint32_t nestlevel = 0; - -for (;; pptr++) - { - uint32_t meta = META_CODE(*pptr); - - switch(meta) - { - default: /* Just skip over most items */ - if (meta < META_END) continue; /* Literal */ - break; - - /* This should never occur. */ - - case META_END: - return NULL; - - /* The data for these items is variable in length. */ - - case META_BACKREF: /* Offset is present only if group >= 10 */ - if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET; - break; - - case META_ESCAPE: /* A few escapes are followed by data items. */ - switch (META_DATA(*pptr)) - { - case ESC_P: - case ESC_p: - pptr += 1; - break; - - case ESC_g: - case ESC_k: - pptr += 1 + SIZEOFFSET; - break; - } - break; - - case META_MARK: /* Add the length of the name. */ - case META_COMMIT_ARG: - case META_PRUNE_ARG: - case META_SKIP_ARG: - case META_THEN_ARG: - pptr += pptr[1]; - break; - - /* These are the "active" items in this loop. 
*/ - - case META_CLASS_END: - if (skiptype == PSKIP_CLASS) return pptr; - break; - - case META_ATOMIC: - case META_CAPTURE: - case META_COND_ASSERT: - case META_COND_DEFINE: - case META_COND_NAME: - case META_COND_NUMBER: - case META_COND_RNAME: - case META_COND_RNUMBER: - case META_COND_VERSION: - case META_LOOKAHEAD: - case META_LOOKAHEADNOT: - case META_LOOKAHEAD_NA: - case META_LOOKBEHIND: - case META_LOOKBEHINDNOT: - case META_LOOKBEHIND_NA: - case META_NOCAPTURE: - case META_SCRIPT_RUN: - nestlevel++; - break; - - case META_ALT: - if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr; - break; - - case META_KET: - if (nestlevel == 0) return pptr; - nestlevel--; - break; - } - - /* The extra data item length for each meta is in a table. */ - - meta = (meta >> 16) & 0x7fff; - if (meta >= sizeof(meta_extra_lengths)) return NULL; - pptr += meta_extra_lengths[meta]; - } -/* Control never reaches here */ -return pptr; -} - - - -/************************************************* -* Find length of a parsed group * -*************************************************/ - -/* This is called for nested groups within a branch of a lookbehind whose -length is being computed. If all the branches in the nested group have the same -length, that is OK. On entry, the pointer must be at the first element after -the group initializing code. On exit it points to OP_KET. Caching is used to -improve processing speed when the same capturing group occurs many times. 
- -Arguments: - pptrptr pointer to pointer in the parsed pattern - isinline FALSE if a reference or recursion; TRUE for inline group - errcodeptr pointer to the errorcode - lcptr pointer to the loop counter - group number of captured group or -1 for a non-capturing group - recurses chain of recurse_check to catch mutual recursion - cb pointer to the compile data - -Returns: the group length or a negative number -*/ - -static int -get_grouplength(uint32_t **pptrptr, BOOL isinline, int *errcodeptr, int *lcptr, - int group, parsed_recurse_check *recurses, compile_block *cb) -{ -int branchlength; -int grouplength = -1; - -/* The cache can be used only if there is no possibility of there being two -groups with the same number. We do not need to set the end pointer for a group -that is being processed as a back reference or recursion, but we must do so for -an inline group. */ - -if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0) - { - uint32_t groupinfo = cb->groupinfo[group]; - if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1; - if ((groupinfo & GI_SET_FIXED_LENGTH) != 0) - { - if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET); - return groupinfo & GI_FIXED_LENGTH_MASK; - } - } - -/* Scan the group. In this case we find the end pointer of necessity. 
*/ - -for(;;) - { - branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb); - if (branchlength < 0) goto ISNOTFIXED; - if (grouplength == -1) grouplength = branchlength; - else if (grouplength != branchlength) goto ISNOTFIXED; - if (**pptrptr == META_KET) break; - *pptrptr += 1; /* Skip META_ALT */ - } - -if (group > 0) - cb->groupinfo[group] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength); -return grouplength; - -ISNOTFIXED: -if (group > 0) cb->groupinfo[group] |= GI_NOT_FIXED_LENGTH; -return -1; -} - - - -/************************************************* -* Find length of a parsed branch * -*************************************************/ - -/* Return a fixed length for a branch in a lookbehind, giving an error if the -length is not fixed. On entry, *pptrptr points to the first element inside the -branch. On exit it is set to point to the ALT or KET. - -Arguments: - pptrptr pointer to pointer in the parsed pattern - errcodeptr pointer to error code - lcptr pointer to loop counter - recurses chain of recurse_check to catch mutual recursion - cb pointer to compile block - -Returns: the length, or a negative value on error -*/ - -static int -get_branchlength(uint32_t **pptrptr, int *errcodeptr, int *lcptr, - parsed_recurse_check *recurses, compile_block *cb) -{ -int branchlength = 0; -int grouplength; -uint32_t lastitemlength = 0; -uint32_t *pptr = *pptrptr; -PCRE2_SIZE offset; -parsed_recurse_check this_recurse; - -/* A large and/or complex regex can take too long to process. This can happen -more often when (?| groups are present in the pattern because their length -cannot be cached. */ - -if ((*lcptr)++ > 2000) - { - *errcodeptr = ERR35; /* Lookbehind is too complicated */ - return -1; - } - -/* Scan the branch, accumulating the length. 
*/ - -for (;; pptr++) - { - parsed_recurse_check *r; - uint32_t *gptr, *gptrend; - uint32_t escape; - uint32_t group = 0; - uint32_t itemlength = 0; - - if (*pptr < META_END) - { - itemlength = 1; - } - - else switch (META_CODE(*pptr)) - { - case META_KET: - case META_ALT: - goto EXIT; - - /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the - actual termination. */ - - case META_ACCEPT: - case META_FAIL: - pptr = parsed_skip(pptr, PSKIP_ALT); - if (pptr == NULL) goto PARSED_SKIP_FAILED; - goto EXIT; - - case META_MARK: - case META_COMMIT_ARG: - case META_PRUNE_ARG: - case META_SKIP_ARG: - case META_THEN_ARG: - pptr += pptr[1] + 1; - break; - - case META_CIRCUMFLEX: - case META_COMMIT: - case META_DOLLAR: - case META_PRUNE: - case META_SKIP: - case META_THEN: - break; - - case META_OPTIONS: - pptr += 1; - break; - - case META_BIGVALUE: - itemlength = 1; - pptr += 1; - break; - - case META_CLASS: - case META_CLASS_NOT: - itemlength = 1; - pptr = parsed_skip(pptr, PSKIP_CLASS); - if (pptr == NULL) goto PARSED_SKIP_FAILED; - break; - - case META_CLASS_EMPTY_NOT: - case META_DOT: - itemlength = 1; - break; - - case META_CALLOUT_NUMBER: - pptr += 3; - break; - - case META_CALLOUT_STRING: - pptr += 3 + SIZEOFFSET; - break; - - /* Only some escapes consume a character. Of those, \R and \X are never - allowed because they might match more than character. \C is allowed only in - 32-bit and non-UTF 8/16-bit modes. */ - - case META_ESCAPE: - escape = META_DATA(*pptr); - if (escape == ESC_R || escape == ESC_X) return -1; - if (escape > ESC_b && escape < ESC_Z) - { -#if PCRE2_CODE_UNIT_WIDTH != 32 - if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C) - { - *errcodeptr = ERR36; - return -1; - } -#endif - itemlength = 1; - if (escape == ESC_p || escape == ESC_P) pptr++; /* Skip prop data */ - } - break; - - /* Lookaheads do not contribute to the length of this branch, but they may - contain lookbehinds within them whose lengths need to be set. 
*/ - - case META_LOOKAHEAD: - case META_LOOKAHEADNOT: - case META_LOOKAHEAD_NA: - *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb); - if (*errcodeptr != 0) return -1; - - /* Ignore any qualifiers that follow a lookahead assertion. */ - - switch (pptr[1]) - { - case META_ASTERISK: - case META_ASTERISK_PLUS: - case META_ASTERISK_QUERY: - case META_PLUS: - case META_PLUS_PLUS: - case META_PLUS_QUERY: - case META_QUERY: - case META_QUERY_PLUS: - case META_QUERY_QUERY: - pptr++; - break; - - case META_MINMAX: - case META_MINMAX_PLUS: - case META_MINMAX_QUERY: - pptr += 3; - break; - - default: - break; - } - break; - - /* A nested lookbehind does not contribute any length to this lookbehind, - but must itself be checked and have its lengths set. */ - - case META_LOOKBEHIND: - case META_LOOKBEHINDNOT: - case META_LOOKBEHIND_NA: - if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb)) - return -1; - break; - - /* Back references and recursions are handled by very similar code. At this - stage, the names generated in the parsing pass are available, but the main - name table has not yet been created. So for the named varieties, scan the - list of names in order to get the number of the first one in the pattern, - and whether or not this name is duplicated. 
*/ - - case META_BACKREF_BYNAME: - if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0) - goto ISNOTFIXED; - /* Fall through */ - - case META_RECURSE_BYNAME: - { - int i; - PCRE2_SPTR name; - BOOL is_dupname = FALSE; - named_group *ng = cb->named_groups; - uint32_t meta_code = META_CODE(*pptr); - uint32_t length = *(++pptr); - - GETPLUSOFFSET(offset, pptr); - name = cb->start_pattern + offset; - for (i = 0; i < cb->names_found; i++, ng++) - { - if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0) - { - group = ng->number; - is_dupname = ng->isdup; - break; - } - } - - if (group == 0) - { - *errcodeptr = ERR15; /* Non-existent subpattern */ - cb->erroroffset = offset; - return -1; - } - - /* A numerical back reference can be fixed length if duplicate capturing - groups are not being used. A non-duplicate named back reference can also - be handled. */ - - if (meta_code == META_RECURSE_BYNAME || - (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)) - goto RECURSE_OR_BACKREF_LENGTH; /* Handle as a numbered version. */ - } - goto ISNOTFIXED; /* Duplicate name or number */ - - /* The offset values for back references < 10 are in a separate vector - because otherwise they would use more than two parsed pattern elements on - 64-bit systems. */ - - case META_BACKREF: - if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 || - (cb->external_flags & PCRE2_DUPCAPUSED) != 0) - goto ISNOTFIXED; - group = META_DATA(*pptr); - if (group < 10) - { - offset = cb->small_ref_offset[group]; - goto RECURSE_OR_BACKREF_LENGTH; - } - - /* Fall through */ - /* For groups >= 10 - picking up group twice does no harm. */ - - /* A true recursion implies not fixed length, but a subroutine call may - be OK. Back reference "recursions" are also failed. 
*/ - - case META_RECURSE: - group = META_DATA(*pptr); - GETPLUSOFFSET(offset, pptr); - - RECURSE_OR_BACKREF_LENGTH: - if (group > cb->bracount) - { - cb->erroroffset = offset; - *errcodeptr = ERR15; /* Non-existent subpattern */ - return -1; - } - if (group == 0) goto ISNOTFIXED; /* Local recursion */ - for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++) - { - if (META_CODE(*gptr) == META_BIGVALUE) gptr++; - else if (*gptr == (META_CAPTURE | group)) break; - } - - /* We must start the search for the end of the group at the first meta code - inside the group. Otherwise it will be treated as an enclosed group. */ - - gptrend = parsed_skip(gptr + 1, PSKIP_KET); - if (gptrend == NULL) goto PARSED_SKIP_FAILED; - if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED; /* Local recursion */ - for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break; - if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */ - this_recurse.prev = recurses; - this_recurse.groupptr = gptr; - - /* We do not need to know the position of the end of the group, that is, - gptr is not used after the call to get_grouplength(). Setting the second - argument FALSE stops it scanning for the end when the length can be found - in the cache. */ - - gptr++; - grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group, - &this_recurse, cb); - if (grouplength < 0) - { - if (*errcodeptr == 0) goto ISNOTFIXED; - return -1; /* Error already set */ - } - itemlength = grouplength; - break; - - /* A (DEFINE) group is never obeyed inline and so it does not contribute to - the length of this branch. Skip from the following item to the next - unpaired ket. */ - - case META_COND_DEFINE: - pptr = parsed_skip(pptr + 1, PSKIP_KET); - break; - - /* Check other nested groups - advance past the initial data for each type - and then seek a fixed length with get_grouplength(). 
*/ - - case META_COND_NAME: - case META_COND_NUMBER: - case META_COND_RNAME: - case META_COND_RNUMBER: - pptr += 2 + SIZEOFFSET; - goto CHECK_GROUP; - - case META_COND_ASSERT: - pptr += 1; - goto CHECK_GROUP; - - case META_COND_VERSION: - pptr += 4; - goto CHECK_GROUP; - - case META_CAPTURE: - group = META_DATA(*pptr); - /* Fall through */ - - case META_ATOMIC: - case META_NOCAPTURE: - case META_SCRIPT_RUN: - pptr++; - CHECK_GROUP: - grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group, - recurses, cb); - if (grouplength < 0) return -1; - itemlength = grouplength; - break; - - /* Exact repetition is OK; variable repetition is not. A repetition of zero - must subtract the length that has already been added. */ - - case META_MINMAX: - case META_MINMAX_PLUS: - case META_MINMAX_QUERY: - if (pptr[1] == pptr[2]) - { - switch(pptr[1]) - { - case 0: - branchlength -= lastitemlength; - break; - - case 1: - itemlength = 0; - break; - - default: /* Check for integer overflow */ - if (lastitemlength != 0 && /* Should not occur, but just in case */ - INT_MAX/lastitemlength < pptr[1] - 1) - { - *errcodeptr = ERR87; /* Integer overflow; lookbehind too big */ - return -1; - } - itemlength = (pptr[1] - 1) * lastitemlength; - break; - } - pptr += 2; - break; - } - /* Fall through */ - - /* Any other item means this branch does not have a fixed length. */ - - default: - ISNOTFIXED: - *errcodeptr = ERR25; /* Not fixed length */ - return -1; - } - - /* Add the item length to the branchlength, checking for integer overflow and - for the branch length exceeding the limit. */ - - if (INT_MAX - branchlength < (int)itemlength || - (branchlength += itemlength) > LOOKBEHIND_MAX) - { - *errcodeptr = ERR87; - return -1; - } - - /* Save this item length for use if the next item is a quantifier. 
*/ - - lastitemlength = itemlength; - } - -EXIT: -*pptrptr = pptr; -return branchlength; - -PARSED_SKIP_FAILED: -*errcodeptr = ERR90; -return -1; -} - - - -/************************************************* -* Set lengths in a lookbehind * -*************************************************/ - -/* This function is called for each lookbehind, to set the lengths in its -branches. An error occurs if any branch does not have a fixed length that is -less than the maximum (65535). On exit, the pointer must be left on the final -ket. - -The function also maintains the max_lookbehind value. Any lookbehind branch -that contains a nested lookbehind may actually look further back than the -length of the branch. The additional amount is passed back from -get_branchlength() as an "extra" value. - -Arguments: - pptrptr pointer to pointer in the parsed pattern - errcodeptr pointer to error code - lcptr pointer to loop counter - recurses chain of recurse_check to catch mutual recursion - cb pointer to compile block - -Returns: TRUE if all is well - FALSE otherwise, with error code and offset set -*/ - -static BOOL -set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr, - parsed_recurse_check *recurses, compile_block *cb) -{ -PCRE2_SIZE offset; -int branchlength; -uint32_t *bptr = *pptrptr; - -READPLUSOFFSET(offset, bptr); /* Offset for error messages */ -*pptrptr += SIZEOFFSET; - -do - { - *pptrptr += 1; - branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb); - if (branchlength < 0) - { - /* The errorcode and offset may already be set from a nested lookbehind. 
*/ - if (*errcodeptr == 0) *errcodeptr = ERR25; - if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset; - return FALSE; - } - if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength; - *bptr |= branchlength; /* branchlength never more than 65535 */ - bptr = *pptrptr; - } -while (*bptr == META_ALT); - -return TRUE; -} - - - -/************************************************* -* Check parsed pattern lookbehinds * -*************************************************/ - -/* This function is called at the end of parsing a pattern if any lookbehinds -were encountered. It scans the parsed pattern for them, calling -set_lookbehind_lengths() for each one. At the start, the errorcode is zero and -the error offset is marked unset. The enables the functions above not to -override settings from deeper nestings. - -This function is called recursively from get_branchlength() for lookaheads in -order to process any lookbehinds that they may contain. It stops when it hits a -non-nested closing parenthesis in this case, returning a pointer to it. 
- -Arguments - pptr points to where to start (start of pattern or start of lookahead) - retptr if not NULL, return the ket pointer here - recurses chain of recurse_check to catch mutual recursion - cb points to the compile block - -Returns: 0 on success, or an errorcode (cb->erroroffset will be set) -*/ - -static int -check_lookbehinds(uint32_t *pptr, uint32_t **retptr, - parsed_recurse_check *recurses, compile_block *cb) -{ -int errorcode = 0; -int loopcount = 0; -int nestlevel = 0; - -cb->erroroffset = PCRE2_UNSET; - -for (; *pptr != META_END; pptr++) - { - if (*pptr < META_END) continue; /* Literal */ - - switch (META_CODE(*pptr)) - { - default: - return ERR70; /* Unrecognized meta code */ - - case META_ESCAPE: - if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p) - pptr += 1; - break; - - case META_KET: - if (--nestlevel < 0) - { - if (retptr != NULL) *retptr = pptr; - return 0; - } - break; - - case META_ATOMIC: - case META_CAPTURE: - case META_COND_ASSERT: - case META_LOOKAHEAD: - case META_LOOKAHEADNOT: - case META_LOOKAHEAD_NA: - case META_NOCAPTURE: - case META_SCRIPT_RUN: - nestlevel++; - break; - - case META_ACCEPT: - case META_ALT: - case META_ASTERISK: - case META_ASTERISK_PLUS: - case META_ASTERISK_QUERY: - case META_BACKREF: - case META_CIRCUMFLEX: - case META_CLASS: - case META_CLASS_EMPTY: - case META_CLASS_EMPTY_NOT: - case META_CLASS_END: - case META_CLASS_NOT: - case META_COMMIT: - case META_DOLLAR: - case META_DOT: - case META_FAIL: - case META_PLUS: - case META_PLUS_PLUS: - case META_PLUS_QUERY: - case META_PRUNE: - case META_QUERY: - case META_QUERY_PLUS: - case META_QUERY_QUERY: - case META_RANGE_ESCAPED: - case META_RANGE_LITERAL: - case META_SKIP: - case META_THEN: - break; - - case META_RECURSE: - pptr += SIZEOFFSET; - break; - - case META_BACKREF_BYNAME: - case META_RECURSE_BYNAME: - pptr += 1 + SIZEOFFSET; - break; - - case META_COND_DEFINE: - pptr += SIZEOFFSET; - nestlevel++; - break; - - case META_COND_NAME: - case 
META_COND_NUMBER: - case META_COND_RNAME: - case META_COND_RNUMBER: - pptr += 1 + SIZEOFFSET; - nestlevel++; - break; - - case META_COND_VERSION: - pptr += 3; - nestlevel++; - break; - - case META_CALLOUT_STRING: - pptr += 3 + SIZEOFFSET; - break; - - case META_BIGVALUE: - case META_OPTIONS: - case META_POSIX: - case META_POSIX_NEG: - pptr += 1; - break; - - case META_MINMAX: - case META_MINMAX_QUERY: - case META_MINMAX_PLUS: - pptr += 2; - break; - - case META_CALLOUT_NUMBER: - pptr += 3; - break; - - case META_MARK: - case META_COMMIT_ARG: - case META_PRUNE_ARG: - case META_SKIP_ARG: - case META_THEN_ARG: - pptr += 1 + pptr[1]; - break; - - case META_LOOKBEHIND: - case META_LOOKBEHINDNOT: - case META_LOOKBEHIND_NA: - if (!set_lookbehind_lengths(&pptr, &errorcode, &loopcount, recurses, cb)) - return errorcode; - break; - } - } - -return 0; -} - - - -/************************************************* -* External function to compile a pattern * -*************************************************/ - -/* This function reads a regular expression in the form of a string and returns -a pointer to a block of store holding a compiled version of the expression. 
- -Arguments: - pattern the regular expression - patlen the length of the pattern, or PCRE2_ZERO_TERMINATED - options option bits - errorptr pointer to errorcode - erroroffset pointer to error offset - ccontext points to a compile context or is NULL - -Returns: pointer to compiled data block, or NULL on error, - with errorcode and erroroffset set -*/ - -PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION -pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options, - int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext) -{ -BOOL utf; /* Set TRUE for UTF mode */ -BOOL ucp; /* Set TRUE for UCP mode */ -BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */ -BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */ -pcre2_real_code *re = NULL; /* What we will return */ -compile_block cb; /* "Static" compile-time data */ -const uint8_t *tables; /* Char tables base pointer */ - -PCRE2_UCHAR *code; /* Current pointer in compiled code */ -PCRE2_SPTR codestart; /* Start of compiled code */ -PCRE2_SPTR ptr; /* Current pointer in pattern */ -uint32_t *pptr; /* Current pointer in parsed pattern */ - -PCRE2_SIZE length = 1; /* Allow for final END opcode */ -PCRE2_SIZE usedlength; /* Actual length used */ -PCRE2_SIZE re_blocksize; /* Size of memory block */ -PCRE2_SIZE big32count = 0; /* 32-bit literals >= 0x80000000 */ -PCRE2_SIZE parsed_size_needed; /* Needed for parsed pattern */ - -int32_t firstcuflags, reqcuflags; /* Type of first/req code unit */ -uint32_t firstcu, reqcu; /* Value of first/req code unit */ -uint32_t setflags = 0; /* NL and BSR set flags */ - -uint32_t skipatstart; /* When checking (*UTF) etc */ -uint32_t limit_heap = UINT32_MAX; -uint32_t limit_match = UINT32_MAX; /* Unset match limits */ -uint32_t limit_depth = UINT32_MAX; - -int newline = 0; /* Unset; can be set by the pattern */ -int bsr = 0; /* Unset; can be set by the pattern */ -int errorcode = 0; /* Initialize to avoid compiler warn */ -int regexrc; 
/* Return from compile */ - -uint32_t i; /* Local loop counter */ - -/* Comments at the head of this file explain about these variables. */ - -uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE]; -uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE]; -named_group named_groups[NAMED_GROUP_LIST_SIZE]; - -/* The workspace is used in different ways in the different compiling phases. -It needs to be 16-bit aligned for the preliminary parsing scan. */ - -uint32_t c16workspace[C16_WORK_SIZE]; -PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace; - - -/* -------------- Check arguments and set up the pattern ----------------- */ - -/* There must be error code and offset pointers. */ - -if (errorptr == NULL || erroroffset == NULL) return NULL; -*errorptr = ERR0; -*erroroffset = 0; - -/* There must be a pattern! */ - -if (pattern == NULL) - { - *errorptr = ERR16; - return NULL; - } - -/* A NULL compile context means "use a default context" */ - -if (ccontext == NULL) - ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context)); - -/* PCRE2_MATCH_INVALID_UTF implies UTF */ - -if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF; - -/* Check that all undefined public option bits are zero. */ - -if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 || - (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0) - { - *errorptr = ERR17; - return NULL; - } - -if ((options & PCRE2_LITERAL) != 0 && - ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 || - (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0)) - { - *errorptr = ERR92; - return NULL; - } - -/* A zero-terminated pattern is indicated by the special length value -PCRE2_ZERO_TERMINATED. Check for an overlong pattern. 
*/ - -if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED))) - patlen = PRIV(strlen)(pattern); - -if (patlen > ccontext->max_pattern_length) - { - *errorptr = ERR88; - return NULL; - } - -/* From here on, all returns from this function should end up going via the -EXIT label. */ - - -/* ------------ Initialize the "static" compile data -------------- */ - -tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables); - -cb.lcc = tables + lcc_offset; /* Individual */ -cb.fcc = tables + fcc_offset; /* character */ -cb.cbits = tables + cbits_offset; /* tables */ -cb.ctypes = tables + ctypes_offset; - -cb.assert_depth = 0; -cb.bracount = 0; -cb.cx = ccontext; -cb.dupnames = FALSE; -cb.end_pattern = pattern + patlen; -cb.erroroffset = 0; -cb.external_flags = 0; -cb.external_options = options; -cb.groupinfo = stack_groupinfo; -cb.had_recurse = FALSE; -cb.lastcapture = 0; -cb.max_lookbehind = 0; -cb.name_entry_size = 0; -cb.name_table = NULL; -cb.named_groups = named_groups; -cb.named_group_list_size = NAMED_GROUP_LIST_SIZE; -cb.names_found = 0; -cb.open_caps = NULL; -cb.parens_depth = 0; -cb.parsed_pattern = stack_parsed_pattern; -cb.req_varyopt = 0; -cb.start_code = cworkspace; -cb.start_pattern = pattern; -cb.start_workspace = cworkspace; -cb.workspace_size = COMPILE_WORK_SIZE; - -/* Maximum back reference and backref bitmap. The bitmap records up to 31 back -references to help in deciding whether (.*) can be treated as anchored or not. -*/ - -cb.top_backref = 0; -cb.backref_map = 0; - -/* Escape sequences \1 to \9 are always back references, but as they are only -two characters long, only two elements can be used in the parsed_pattern -vector. The first contains the reference, and we'd like to use the second to -record the offset in the pattern, so that forward references to non-existent -groups can be diagnosed later with an offset. However, on 64-bit systems, -PCRE2_SIZE won't fit. 
Instead, we have a vector of offsets for the first -occurrence of \1 to \9, indexed by the second parsed_pattern value. All other -references have enough space for the offset to be put into the parsed pattern. -*/ - -for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET; - - -/* --------------- Start looking at the pattern --------------- */ - -/* Unless PCRE2_LITERAL is set, check for global one-time option settings at -the start of the pattern, and remember the offset to the actual regex. With -valgrind support, make the terminator of a zero-terminated pattern -inaccessible. This catches bugs that would otherwise only show up for -non-zero-terminated patterns. */ - -#ifdef SUPPORT_VALGRIND -if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1)); -#endif - -ptr = pattern; -skipatstart = 0; - -if ((options & PCRE2_LITERAL) == 0) - { - while (patlen - skipatstart >= 2 && - ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && - ptr[skipatstart+1] == CHAR_ASTERISK) - { - for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++) - { - uint32_t c, pp; - pso *p = pso_list + i; - - if (patlen - skipatstart - 2 >= p->length && - PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name), - p->length) == 0) - { - skipatstart += p->length + 2; - switch(p->type) - { - case PSO_OPT: - cb.external_options |= p->value; - break; - - case PSO_FLG: - setflags |= p->value; - break; - - case PSO_NL: - newline = p->value; - setflags |= PCRE2_NL_SET; - break; - - case PSO_BSR: - bsr = p->value; - setflags |= PCRE2_BSR_SET; - break; - - case PSO_LIMM: - case PSO_LIMD: - case PSO_LIMH: - c = 0; - pp = skipatstart; - if (!IS_DIGIT(ptr[pp])) - { - errorcode = ERR60; - ptr += pp; - goto HAD_EARLY_ERROR; - } - while (IS_DIGIT(ptr[pp])) - { - if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */ - c = c*10 + (ptr[pp++] - CHAR_0); - } - if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS) - { - errorcode = ERR60; - ptr += pp; - goto HAD_EARLY_ERROR; - } - if (p->type == PSO_LIMH) 
limit_heap = c; - else if (p->type == PSO_LIMM) limit_match = c; - else limit_depth = c; - skipatstart += pp - skipatstart; - break; - } - break; /* Out of the table scan loop */ - } - } - if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */ - } - } - -/* End of pattern-start options; advance to start of real regex. */ - -ptr += skipatstart; - -/* Can't support UTF or UCP if PCRE2 was built without Unicode support. */ - -#ifndef SUPPORT_UNICODE -if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0) - { - errorcode = ERR32; - goto HAD_EARLY_ERROR; - } -#endif - -/* Check UTF. We have the original options in 'options', with that value as -modified by (*UTF) etc in cb->external_options. The extra option -PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the -surrogate code points cannot be represented in UTF-16. */ - -utf = (cb.external_options & PCRE2_UTF) != 0; -if (utf) - { - if ((options & PCRE2_NEVER_UTF) != 0) - { - errorcode = ERR74; - goto HAD_EARLY_ERROR; - } - if ((options & PCRE2_NO_UTF_CHECK) == 0 && - (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0) - goto HAD_ERROR; /* Offset was set by valid_utf() */ - -#if PCRE2_CODE_UNIT_WIDTH == 16 - if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0) - { - errorcode = ERR91; - goto HAD_EARLY_ERROR; - } -#endif - } - -/* Check UCP lockout. */ - -ucp = (cb.external_options & PCRE2_UCP) != 0; -if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0) - { - errorcode = ERR75; - goto HAD_EARLY_ERROR; - } - -/* Process the BSR setting. */ - -if (bsr == 0) bsr = ccontext->bsr_convention; - -/* Process the newline setting. 
*/ - -if (newline == 0) newline = ccontext->newline_convention; -cb.nltype = NLTYPE_FIXED; -switch(newline) - { - case PCRE2_NEWLINE_CR: - cb.nllen = 1; - cb.nl[0] = CHAR_CR; - break; - - case PCRE2_NEWLINE_LF: - cb.nllen = 1; - cb.nl[0] = CHAR_NL; - break; - - case PCRE2_NEWLINE_NUL: - cb.nllen = 1; - cb.nl[0] = CHAR_NUL; - break; - - case PCRE2_NEWLINE_CRLF: - cb.nllen = 2; - cb.nl[0] = CHAR_CR; - cb.nl[1] = CHAR_NL; - break; - - case PCRE2_NEWLINE_ANY: - cb.nltype = NLTYPE_ANY; - break; - - case PCRE2_NEWLINE_ANYCRLF: - cb.nltype = NLTYPE_ANYCRLF; - break; - - default: - errorcode = ERR56; - goto HAD_EARLY_ERROR; - } - -/* Pre-scan the pattern to do two things: (1) Discover the named groups and -their numerical equivalents, so that this information is always available for -the remaining processing. (2) At the same time, parse the pattern and put a -processed version into the parsed_pattern vector. This has escapes interpreted -and comments removed (amongst other things). - -In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned -32-bit ints in the parsed pattern is bounded by the length of the pattern plus -one (for the terminator) plus four if PCRE2_EXTRA_WORD or PCRE2_EXTRA_LINE is -set. The exceptional case is when running in 32-bit, non-UTF mode, when literal -characters greater than META_END (0x80000000) have to be coded as two units. In -this case, therefore, we scan the pattern to check for such values. */ - -#if PCRE2_CODE_UNIT_WIDTH == 32 -if (!utf) - { - PCRE2_SPTR p; - for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++; - } -#endif - -/* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT -is set we have to assume a numerical callout (4 elements) for each character -plus one at the end. This is overkill, but memory is plentiful these days. For -many smaller patterns the vector on the stack (which was set up above) can be -used. 
*/ - -parsed_size_needed = patlen - skipatstart + big32count; - -if ((ccontext->extra_options & - (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0) - parsed_size_needed += 4; - -if ((options & PCRE2_AUTO_CALLOUT) != 0) - parsed_size_needed = (parsed_size_needed + 1) * 5; - -if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE) - { - uint32_t *heap_parsed_pattern = ccontext->memctl.malloc( - (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data); - if (heap_parsed_pattern == NULL) - { - *errorptr = ERR21; - goto EXIT; - } - cb.parsed_pattern = heap_parsed_pattern; - } -cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1; - -/* Do the parsing scan. */ - -errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb); -if (errorcode != 0) goto HAD_CB_ERROR; - -/* Workspace is needed to remember information about numbered groups: whether a -group can match an empty string and what its fixed length is. This is done to -avoid the possibility of recursive references causing very long compile times -when checking these features. Unnumbered groups do not have this exposure since -they cannot be referenced. We use an indexed vector for this purpose. If there -are sufficiently few groups, the default vector on the stack, as set up above, -can be used. Otherwise we have to get/free a special vector. The vector must be -initialized to zero. */ - -if (cb.bracount >= GROUPINFO_DEFAULT_SIZE) - { - cb.groupinfo = ccontext->memctl.malloc( - (cb.bracount + 1)*sizeof(uint32_t), ccontext->memctl.memory_data); - if (cb.groupinfo == NULL) - { - errorcode = ERR21; - cb.erroroffset = 0; - goto HAD_CB_ERROR; - } - } -memset(cb.groupinfo, 0, (cb.bracount + 1) * sizeof(uint32_t)); - -/* If there were any lookbehinds, scan the parsed pattern to figure out their -lengths. 
*/ - -if (has_lookbehind) - { - errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb); - if (errorcode != 0) goto HAD_CB_ERROR; - } - -/* For debugging, there is a function that shows the parsed data vector. */ - -#ifdef DEBUG_SHOW_PARSED -fprintf(stderr, "+++ Pre-scan complete:\n"); -show_parsed(&cb); -#endif - -/* For debugging capturing information this code can be enabled. */ - -#ifdef DEBUG_SHOW_CAPTURES - { - named_group *ng = cb.named_groups; - fprintf(stderr, "+++Captures: %d\n", cb.bracount); - for (i = 0; i < cb.names_found; i++, ng++) - { - fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name); - } - } -#endif - -/* Pretend to compile the pattern while actually just accumulating the amount -of memory required in the 'length' variable. This behaviour is triggered by -passing a non-NULL final argument to compile_regex(). We pass a block of -workspace (cworkspace) for it to compile parts of the pattern into; the -compiled code is discarded when it is no longer needed, so hopefully this -workspace will never overflow, though there is a test for its doing so. - -On error, errorcode will be set non-zero, so we don't need to look at the -result of the function. The initial options have been put into the cb block, -but we still have to pass a separate options variable (the first argument) -because the options may change as the pattern is processed. */ - -cb.erroroffset = patlen; /* For any subsequent errors that do not set it */ -pptr = cb.parsed_pattern; -code = cworkspace; -*code = OP_BRA; - -(void)compile_regex(cb.external_options, &code, &pptr, &errorcode, 0, &firstcu, - &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, &length); - -if (errorcode != 0) goto HAD_CB_ERROR; /* Offset is in cb.erroroffset */ - -/* This should be caught in compile_regex(), but just in case... 
*/ - -if (length > MAX_PATTERN_SIZE) - { - errorcode = ERR20; - goto HAD_CB_ERROR; - } - -/* Compute the size of, and then get and initialize, the data block for storing -the compiled pattern and names table. Integer overflow should no longer be -possible because nowadays we limit the maximum value of cb.names_found and -cb.name_entry_size. */ - -re_blocksize = sizeof(pcre2_real_code) + - CU2BYTES(length + - (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size); -re = (pcre2_real_code *) - ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data); -if (re == NULL) - { - errorcode = ERR21; - goto HAD_CB_ERROR; - } - -/* The compiler may put padding at the end of the pcre2_real_code structure in -order to round it up to a multiple of 4 or 8 bytes. This means that when a -compiled pattern is copied (for example, when serialized) undefined bytes are -read, and this annoys debuggers such as valgrind. To avoid this, we explicitly -write to the last 8 bytes of the structure before setting the fields. */ - -memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8); -re->memctl = ccontext->memctl; -re->tables = tables; -re->executable_jit = NULL; -memset(re->start_bitmap, 0, 32 * sizeof(uint8_t)); -re->blocksize = re_blocksize; -re->magic_number = MAGIC_NUMBER; -re->compile_options = options; -re->overall_options = cb.external_options; -re->extra_options = ccontext->extra_options; -re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags; -re->limit_heap = limit_heap; -re->limit_match = limit_match; -re->limit_depth = limit_depth; -re->first_codeunit = 0; -re->last_codeunit = 0; -re->bsr_convention = bsr; -re->newline_convention = newline; -re->max_lookbehind = 0; -re->minlength = 0; -re->top_bracket = 0; -re->top_backref = 0; -re->name_entry_size = cb.name_entry_size; -re->name_count = cb.names_found; - -/* The basic block is immediately followed by the name table, and the compiled -code follows after that. 
*/ - -codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) + - re->name_entry_size * re->name_count; - -/* Update the compile data block for the actual compile. The starting points of -the name/number translation table and of the code are passed around in the -compile data block. The start/end pattern and initial options are already set -from the pre-compile phase, as is the name_entry_size field. */ - -cb.parens_depth = 0; -cb.assert_depth = 0; -cb.lastcapture = 0; -cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)); -cb.start_code = codestart; -cb.req_varyopt = 0; -cb.had_accept = FALSE; -cb.had_pruneorskip = FALSE; -cb.open_caps = NULL; - -/* If any named groups were found, create the name/number table from the list -created in the pre-pass. */ - -if (cb.names_found > 0) - { - named_group *ng = cb.named_groups; - for (i = 0; i < cb.names_found; i++, ng++) - add_name_to_table(&cb, ng->name, ng->length, ng->number, i); - } - -/* Set up a starting, non-extracting bracket, then compile the expression. On -error, errorcode will be set non-zero, so we don't need to look at the result -of the function here. */ - -pptr = cb.parsed_pattern; -code = (PCRE2_UCHAR *)codestart; -*code = OP_BRA; -regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0, - &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL); -if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY; -re->top_bracket = cb.bracount; -re->top_backref = cb.top_backref; -re->max_lookbehind = cb.max_lookbehind; - -if (cb.had_accept) - { - reqcu = 0; /* Must disable after (*ACCEPT) */ - reqcuflags = REQ_NONE; - re->flags |= PCRE2_HASACCEPT; /* Disables minimum length */ - } - -/* Fill in the final opcode and check for disastrous overflow. 
If no overflow, -but the estimated length exceeds the really used length, adjust the value of -re->blocksize, and if valgrind support is configured, mark the extra allocated -memory as unaddressable, so that any out-of-bound reads can be detected. */ - -*code++ = OP_END; -usedlength = code - codestart; -if (usedlength > length) errorcode = ERR23; else - { - re->blocksize -= CU2BYTES(length - usedlength); -#ifdef SUPPORT_VALGRIND - VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength)); -#endif - } - -/* Scan the pattern for recursion/subroutine calls and convert the group -numbers into offsets. Maintain a small cache so that repeated groups containing -recursions are efficiently handled. */ - -#define RSCAN_CACHE_SIZE 8 - -if (errorcode == 0 && cb.had_recurse) - { - PCRE2_UCHAR *rcode; - PCRE2_SPTR rgroup; - unsigned int ccount = 0; - int start = RSCAN_CACHE_SIZE; - recurse_cache rc[RSCAN_CACHE_SIZE]; - - for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf); - rcode != NULL; - rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf)) - { - int p, groupnumber; - - groupnumber = (int)GET(rcode, 1); - if (groupnumber == 0) rgroup = codestart; else - { - PCRE2_SPTR search_from = codestart; - rgroup = NULL; - for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7) - { - if (groupnumber == rc[p].groupnumber) - { - rgroup = rc[p].group; - break; - } - - /* Group n+1 must always start to the right of group n, so we can save - search time below when the new group number is greater than any of the - previously found groups. 
*/ - - if (groupnumber > rc[p].groupnumber) search_from = rc[p].group; - } - - if (rgroup == NULL) - { - rgroup = PRIV(find_bracket)(search_from, utf, groupnumber); - if (rgroup == NULL) - { - errorcode = ERR53; - break; - } - if (--start < 0) start = RSCAN_CACHE_SIZE - 1; - rc[start].groupnumber = groupnumber; - rc[start].group = rgroup; - if (ccount < RSCAN_CACHE_SIZE) ccount++; - } - } - - PUT(rcode, 1, rgroup - codestart); - } - } - -/* In rare debugging situations we sometimes need to look at the compiled code -at this stage. */ - -#ifdef DEBUG_CALL_PRINTINT -pcre2_printint(re, stderr, TRUE); -fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength); -#endif - -/* Unless disabled, check whether any single character iterators can be -auto-possessified. The function overwrites the appropriate opcode values, so -the type of the pointer must be cast. NOTE: the intermediate variable "temp" is -used in this code because at least one compiler gives a warning about loss of -"const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the -function call. */ - -if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0) - { - PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart; - if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80; - } - -/* Failed to compile, or error while post-processing. */ - -if (errorcode != 0) goto HAD_CB_ERROR; - -/* Successful compile. If the anchored option was not passed, set it if -we can determine that the pattern is anchored by virtue of ^ characters or \A -or anything else, such as starting with non-atomic .* when DOTALL is set and -there are no occurrences of *PRUNE or *SKIP (though there is an option to -disable this case). */ - -if ((re->overall_options & PCRE2_ANCHORED) == 0 && - is_anchored(codestart, 0, &cb, 0, FALSE)) - re->overall_options |= PCRE2_ANCHORED; - -/* Set up the first code unit or startline flag, the required code unit, and -then study the pattern. 
This code need not be obeyed if PCRE2_NO_START_OPTIMIZE -is set, as the data it would create will not be used. Note that a first code -unit (but not the startline flag) is useful for anchored patterns because it -can still give a quick "no match" and also avoid searching for a last code -unit. */ - -if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) - { - int minminlength = 0; /* For minimal minlength from first/required CU */ - - /* If we do not have a first code unit, see if there is one that is asserted - (these are not saved during the compile because they can cause conflicts with - actual literals that follow). */ - - if (firstcuflags < 0) - firstcu = find_firstassertedcu(codestart, &firstcuflags, 0); - - /* Save the data for a first code unit. The existence of one means the - minimum length must be at least 1. */ - - if (firstcuflags >= 0) - { - re->first_codeunit = firstcu; - re->flags |= PCRE2_FIRSTSET; - minminlength++; - - /* Handle caseless first code units. */ - - if ((firstcuflags & REQ_CASELESS) != 0) - { - if (firstcu < 128 || (!utf && !ucp && firstcu < 255)) - { - if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS; - } - - /* The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise. - In 8-bit UTF mode, codepoints in the range 128-255 are introductory code - points and cannot have another case, but if UCP is set they may do. */ - -#ifdef SUPPORT_UNICODE -#if PCRE2_CODE_UNIT_WIDTH == 8 - else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu) - re->flags |= PCRE2_FIRSTCASELESS; -#else - else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT && - UCD_OTHERCASE(firstcu) != firstcu) - re->flags |= PCRE2_FIRSTCASELESS; -#endif -#endif /* SUPPORT_UNICODE */ - } - } - - /* When there is no first code unit, for non-anchored patterns, see if we can - set the PCRE2_STARTLINE flag. 
This is helpful for multiline matches when all - branches start with ^ and also when all branches start with non-atomic .* for - non-DOTALL matches when *PRUNE and SKIP are not present. (There is an option - that disables this case.) */ - - else if ((re->overall_options & PCRE2_ANCHORED) == 0 && - is_startline(codestart, 0, &cb, 0, FALSE)) - re->flags |= PCRE2_STARTLINE; - - /* Handle the "required code unit", if one is set. In the UTF case we can - increment the minimum minimum length only if we are sure this really is a - different character and not a non-starting code unit of the first character, - because the minimum length count is in characters, not code units. */ - - if (reqcuflags >= 0) - { -#if PCRE2_CODE_UNIT_WIDTH == 16 - if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */ - firstcuflags < 0 || /* First not set */ - (firstcu & 0xf800) != 0xd800 || /* First not surrogate */ - (reqcu & 0xfc00) != 0xdc00) /* Req not low surrogate */ -#elif PCRE2_CODE_UNIT_WIDTH == 8 - if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */ - firstcuflags < 0 || /* First not set */ - (firstcu & 0x80) == 0 || /* First is ASCII */ - (reqcu & 0x80) == 0) /* Req is ASCII */ -#endif - { - minminlength++; - } - - /* In the case of an anchored pattern, set up the value only if it follows - a variable length item in the pattern. */ - - if ((re->overall_options & PCRE2_ANCHORED) == 0 || - (reqcuflags & REQ_VARY) != 0) - { - re->last_codeunit = reqcu; - re->flags |= PCRE2_LASTSET; - - /* Handle caseless required code units as for first code units (above). 
*/ - - if ((reqcuflags & REQ_CASELESS) != 0) - { - if (reqcu < 128 || (!utf && !ucp && reqcu < 255)) - { - if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS; - } -#ifdef SUPPORT_UNICODE -#if PCRE2_CODE_UNIT_WIDTH == 8 - else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu) - re->flags |= PCRE2_LASTCASELESS; -#else - else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT && - UCD_OTHERCASE(reqcu) != reqcu) - re->flags |= PCRE2_LASTCASELESS; -#endif -#endif /* SUPPORT_UNICODE */ - } - } - } - - /* Study the compiled pattern to set up information such as a bitmap of - starting code units and a minimum matching length. */ - - if (PRIV(study)(re) != 0) - { - errorcode = ERR31; - goto HAD_CB_ERROR; - } - - /* If study() set a bitmap of starting code units, it implies a minimum - length of at least one. */ - - if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0) - minminlength = 1; - - /* If the minimum length set (or not set) by study() is less than the minimum - implied by required code units, override it. */ - - if (re->minlength < minminlength) re->minlength = minminlength; - } /* End of start-of-match optimizations. */ - -/* Control ends up here in all cases. When running under valgrind, make a -pattern's terminating zero defined again. If memory was obtained for the parsed -version of the pattern, free it before returning. Also free the list of named -groups if a larger one had to be obtained, and likewise the group information -vector. 
*/ - -EXIT: -#ifdef SUPPORT_VALGRIND -if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1)); -#endif -if (cb.parsed_pattern != stack_parsed_pattern) - ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data); -if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE) - ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data); -if (cb.groupinfo != stack_groupinfo) - ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data); -return re; /* Will be NULL after an error */ - -/* Errors discovered in parse_regex() set the offset value in the compile -block. Errors discovered before it is called must compute it from the ptr -value. After parse_regex() is called, the offset in the compile block is set to -the end of the pattern, but certain errors in compile_regex() may reset it if -an offset is available in the parsed pattern. */ - -HAD_CB_ERROR: -ptr = pattern + cb.erroroffset; - -HAD_EARLY_ERROR: -*erroroffset = ptr - pattern; - -HAD_ERROR: -*errorptr = errorcode; -pcre2_code_free(re); -re = NULL; -goto EXIT; -} - -/* End of pcre2_compile.c */ diff --git a/pcre2/src/pcre2_config.c b/pcre2/src/pcre2_config.c deleted file mode 100644 index 5ef103caf..000000000 --- a/pcre2/src/pcre2_config.c +++ /dev/null @@ -1,252 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. 
- - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2020 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -/* Save the configured link size, which is in bytes. 
In 16-bit and 32-bit modes -its value gets changed by pcre2_intmodedep.h (included by pcre2_internal.h) to -be in code units. */ - -static int configured_link_size = LINK_SIZE; - -#include "pcre2_internal.h" - -/* These macros are the standard way of turning unquoted text into C strings. -They allow macros like PCRE2_MAJOR to be defined without quotes, which is -convenient for user programs that want to test their values. */ - -#define STRING(a) # a -#define XSTRING(s) STRING(s) - - -/************************************************* -* Return info about what features are configured * -*************************************************/ - -/* If where is NULL, the length of memory required is returned. - -Arguments: - what what information is required - where where to put the information - -Returns: 0 if a numerical value is returned - >= 0 if a string value - PCRE2_ERROR_BADOPTION if "where" not recognized - or JIT target requested when JIT not enabled -*/ - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_config(uint32_t what, void *where) -{ -if (where == NULL) /* Requests a length */ - { - switch(what) - { - default: - return PCRE2_ERROR_BADOPTION; - - case PCRE2_CONFIG_BSR: - case PCRE2_CONFIG_COMPILED_WIDTHS: - case PCRE2_CONFIG_DEPTHLIMIT: - case PCRE2_CONFIG_HEAPLIMIT: - case PCRE2_CONFIG_JIT: - case PCRE2_CONFIG_LINKSIZE: - case PCRE2_CONFIG_MATCHLIMIT: - case PCRE2_CONFIG_NEVER_BACKSLASH_C: - case PCRE2_CONFIG_NEWLINE: - case PCRE2_CONFIG_PARENSLIMIT: - case PCRE2_CONFIG_STACKRECURSE: /* Obsolete */ - case PCRE2_CONFIG_TABLES_LENGTH: - case PCRE2_CONFIG_UNICODE: - return sizeof(uint32_t); - - /* These are handled below */ - - case PCRE2_CONFIG_JITTARGET: - case PCRE2_CONFIG_UNICODE_VERSION: - case PCRE2_CONFIG_VERSION: - break; - } - } - -switch (what) - { - default: - return PCRE2_ERROR_BADOPTION; - - case PCRE2_CONFIG_BSR: -#ifdef BSR_ANYCRLF - *((uint32_t *)where) = PCRE2_BSR_ANYCRLF; -#else - *((uint32_t *)where) = PCRE2_BSR_UNICODE; -#endif - break; - 
- case PCRE2_CONFIG_COMPILED_WIDTHS: - *((uint32_t *)where) = 0 -#ifdef SUPPORT_PCRE2_8 - + 1 -#endif -#ifdef SUPPORT_PCRE2_16 - + 2 -#endif -#ifdef SUPPORT_PCRE2_32 - + 4 -#endif - ; - break; - - case PCRE2_CONFIG_DEPTHLIMIT: - *((uint32_t *)where) = MATCH_LIMIT_DEPTH; - break; - - case PCRE2_CONFIG_HEAPLIMIT: - *((uint32_t *)where) = HEAP_LIMIT; - break; - - case PCRE2_CONFIG_JIT: -#ifdef SUPPORT_JIT - *((uint32_t *)where) = 1; -#else - *((uint32_t *)where) = 0; -#endif - break; - - case PCRE2_CONFIG_JITTARGET: -#ifdef SUPPORT_JIT - { - const char *v = PRIV(jit_get_target)(); - return (int)(1 + ((where == NULL)? - strlen(v) : PRIV(strcpy_c8)((PCRE2_UCHAR *)where, v))); - } -#else - return PCRE2_ERROR_BADOPTION; -#endif - - case PCRE2_CONFIG_LINKSIZE: - *((uint32_t *)where) = (uint32_t)configured_link_size; - break; - - case PCRE2_CONFIG_MATCHLIMIT: - *((uint32_t *)where) = MATCH_LIMIT; - break; - - case PCRE2_CONFIG_NEWLINE: - *((uint32_t *)where) = NEWLINE_DEFAULT; - break; - - case PCRE2_CONFIG_NEVER_BACKSLASH_C: -#ifdef NEVER_BACKSLASH_C - *((uint32_t *)where) = 1; -#else - *((uint32_t *)where) = 0; -#endif - break; - - case PCRE2_CONFIG_PARENSLIMIT: - *((uint32_t *)where) = PARENS_NEST_LIMIT; - break; - - /* This is now obsolete. The stack is no longer used via recursion for - handling backtracking in pcre2_match(). */ - - case PCRE2_CONFIG_STACKRECURSE: - *((uint32_t *)where) = 0; - break; - - case PCRE2_CONFIG_TABLES_LENGTH: - *((uint32_t *)where) = TABLES_LENGTH; - break; - - case PCRE2_CONFIG_UNICODE_VERSION: - { -#if defined SUPPORT_UNICODE - const char *v = PRIV(unicode_version); -#else - const char *v = "Unicode not supported"; -#endif - return (int)(1 + ((where == NULL)? 
- strlen(v) : PRIV(strcpy_c8)((PCRE2_UCHAR *)where, v))); - } - break; - - case PCRE2_CONFIG_UNICODE: -#if defined SUPPORT_UNICODE - *((uint32_t *)where) = 1; -#else - *((uint32_t *)where) = 0; -#endif - break; - - /* The hackery in setting "v" below is to cope with the case when - PCRE2_PRERELEASE is set to an empty string (which it is for real releases). - If the second alternative is used in this case, it does not leave a space - before the date. On the other hand, if all four macros are put into a single - XSTRING when PCRE2_PRERELEASE is not empty, an unwanted space is inserted. - There are problems using an "obvious" approach like this: - - XSTRING(PCRE2_MAJOR) "." XSTRING(PCRE_MINOR) - XSTRING(PCRE2_PRERELEASE) " " XSTRING(PCRE_DATE) - - because, when PCRE2_PRERELEASE is empty, this leads to an attempted expansion - of STRING(). The C standard states: "If (before argument substitution) any - argument consists of no preprocessing tokens, the behavior is undefined." It - turns out the gcc treats this case as a single empty string - which is what - we really want - but Visual C grumbles about the lack of an argument for the - macro. Unfortunately, both are within their rights. As there seems to be no - way to test for a macro's value being empty at compile time, we have to - resort to a runtime test. */ - - case PCRE2_CONFIG_VERSION: - { - const char *v = (XSTRING(Z PCRE2_PRERELEASE)[1] == 0)? - XSTRING(PCRE2_MAJOR.PCRE2_MINOR PCRE2_DATE) : - XSTRING(PCRE2_MAJOR.PCRE2_MINOR) XSTRING(PCRE2_PRERELEASE PCRE2_DATE); - return (int)(1 + ((where == NULL)? 
- strlen(v) : PRIV(strcpy_c8)((PCRE2_UCHAR *)where, v))); - } - } - -return 0; -} - -/* End of pcre2_config.c */ diff --git a/pcre2/src/pcre2_context.c b/pcre2/src/pcre2_context.c deleted file mode 100644 index f904a494a..000000000 --- a/pcre2/src/pcre2_context.c +++ /dev/null @@ -1,488 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2018 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include "pcre2_internal.h" - - - -/************************************************* -* Default malloc/free functions * -*************************************************/ - -/* Ignore the "user data" argument in each case. */ - -static void *default_malloc(size_t size, void *data) -{ -(void)data; -return malloc(size); -} - - -static void default_free(void *block, void *data) -{ -(void)data; -free(block); -} - - - -/************************************************* -* Get a block and save memory control * -*************************************************/ - -/* This internal function is called to get a block of memory in which the -memory control data is to be stored at the start for future use. - -Arguments: - size amount of memory required - memctl pointer to a memctl block or NULL - -Returns: pointer to memory or NULL on failure -*/ - -extern void * -PRIV(memctl_malloc)(size_t size, pcre2_memctl *memctl) -{ -pcre2_memctl *newmemctl; -void *yield = (memctl == NULL)? 
malloc(size) : - memctl->malloc(size, memctl->memory_data); -if (yield == NULL) return NULL; -newmemctl = (pcre2_memctl *)yield; -if (memctl == NULL) - { - newmemctl->malloc = default_malloc; - newmemctl->free = default_free; - newmemctl->memory_data = NULL; - } -else *newmemctl = *memctl; -return yield; -} - - - -/************************************************* -* Create and initialize contexts * -*************************************************/ - -/* Initializing for compile and match contexts is done in separate, private -functions so that these can be called from functions such as pcre2_compile() -when an external context is not supplied. The initializing functions have an -option to set up default memory management. */ - -PCRE2_EXP_DEFN pcre2_general_context * PCRE2_CALL_CONVENTION -pcre2_general_context_create(void *(*private_malloc)(size_t, void *), - void (*private_free)(void *, void *), void *memory_data) -{ -pcre2_general_context *gcontext; -if (private_malloc == NULL) private_malloc = default_malloc; -if (private_free == NULL) private_free = default_free; -gcontext = private_malloc(sizeof(pcre2_real_general_context), memory_data); -if (gcontext == NULL) return NULL; -gcontext->memctl.malloc = private_malloc; -gcontext->memctl.free = private_free; -gcontext->memctl.memory_data = memory_data; -return gcontext; -} - - -/* A default compile context is set up to save having to initialize at run time -when no context is supplied to the compile function. 
*/ - -const pcre2_compile_context PRIV(default_compile_context) = { - { default_malloc, default_free, NULL }, /* Default memory handling */ - NULL, /* Stack guard */ - NULL, /* Stack guard data */ - PRIV(default_tables), /* Character tables */ - PCRE2_UNSET, /* Max pattern length */ - BSR_DEFAULT, /* Backslash R default */ - NEWLINE_DEFAULT, /* Newline convention */ - PARENS_NEST_LIMIT, /* As it says */ - 0 }; /* Extra options */ - -/* The create function copies the default into the new memory, but must -override the default memory handling functions if a gcontext was provided. */ - -PCRE2_EXP_DEFN pcre2_compile_context * PCRE2_CALL_CONVENTION -pcre2_compile_context_create(pcre2_general_context *gcontext) -{ -pcre2_compile_context *ccontext = PRIV(memctl_malloc)( - sizeof(pcre2_real_compile_context), (pcre2_memctl *)gcontext); -if (ccontext == NULL) return NULL; -*ccontext = PRIV(default_compile_context); -if (gcontext != NULL) - *((pcre2_memctl *)ccontext) = *((pcre2_memctl *)gcontext); -return ccontext; -} - - -/* A default match context is set up to save having to initialize at run time -when no context is supplied to a match function. */ - -const pcre2_match_context PRIV(default_match_context) = { - { default_malloc, default_free, NULL }, -#ifdef SUPPORT_JIT - NULL, /* JIT callback */ - NULL, /* JIT callback data */ -#endif - NULL, /* Callout function */ - NULL, /* Callout data */ - NULL, /* Substitute callout function */ - NULL, /* Substitute callout data */ - PCRE2_UNSET, /* Offset limit */ - HEAP_LIMIT, - MATCH_LIMIT, - MATCH_LIMIT_DEPTH }; - -/* The create function copies the default into the new memory, but must -override the default memory handling functions if a gcontext was provided. 
*/ - -PCRE2_EXP_DEFN pcre2_match_context * PCRE2_CALL_CONVENTION -pcre2_match_context_create(pcre2_general_context *gcontext) -{ -pcre2_match_context *mcontext = PRIV(memctl_malloc)( - sizeof(pcre2_real_match_context), (pcre2_memctl *)gcontext); -if (mcontext == NULL) return NULL; -*mcontext = PRIV(default_match_context); -if (gcontext != NULL) - *((pcre2_memctl *)mcontext) = *((pcre2_memctl *)gcontext); -return mcontext; -} - - -/* A default convert context is set up to save having to initialize at run time -when no context is supplied to the convert function. */ - -const pcre2_convert_context PRIV(default_convert_context) = { - { default_malloc, default_free, NULL }, /* Default memory handling */ -#ifdef _WIN32 - CHAR_BACKSLASH, /* Default path separator */ - CHAR_GRAVE_ACCENT /* Default escape character */ -#else /* Not Windows */ - CHAR_SLASH, /* Default path separator */ - CHAR_BACKSLASH /* Default escape character */ -#endif - }; - -/* The create function copies the default into the new memory, but must -override the default memory handling functions if a gcontext was provided. 
*/ - -PCRE2_EXP_DEFN pcre2_convert_context * PCRE2_CALL_CONVENTION -pcre2_convert_context_create(pcre2_general_context *gcontext) -{ -pcre2_convert_context *ccontext = PRIV(memctl_malloc)( - sizeof(pcre2_real_convert_context), (pcre2_memctl *)gcontext); -if (ccontext == NULL) return NULL; -*ccontext = PRIV(default_convert_context); -if (gcontext != NULL) - *((pcre2_memctl *)ccontext) = *((pcre2_memctl *)gcontext); -return ccontext; -} - - -/************************************************* -* Context copy functions * -*************************************************/ - -PCRE2_EXP_DEFN pcre2_general_context * PCRE2_CALL_CONVENTION -pcre2_general_context_copy(pcre2_general_context *gcontext) -{ -pcre2_general_context *new = - gcontext->memctl.malloc(sizeof(pcre2_real_general_context), - gcontext->memctl.memory_data); -if (new == NULL) return NULL; -memcpy(new, gcontext, sizeof(pcre2_real_general_context)); -return new; -} - - -PCRE2_EXP_DEFN pcre2_compile_context * PCRE2_CALL_CONVENTION -pcre2_compile_context_copy(pcre2_compile_context *ccontext) -{ -pcre2_compile_context *new = - ccontext->memctl.malloc(sizeof(pcre2_real_compile_context), - ccontext->memctl.memory_data); -if (new == NULL) return NULL; -memcpy(new, ccontext, sizeof(pcre2_real_compile_context)); -return new; -} - - -PCRE2_EXP_DEFN pcre2_match_context * PCRE2_CALL_CONVENTION -pcre2_match_context_copy(pcre2_match_context *mcontext) -{ -pcre2_match_context *new = - mcontext->memctl.malloc(sizeof(pcre2_real_match_context), - mcontext->memctl.memory_data); -if (new == NULL) return NULL; -memcpy(new, mcontext, sizeof(pcre2_real_match_context)); -return new; -} - - - -PCRE2_EXP_DEFN pcre2_convert_context * PCRE2_CALL_CONVENTION -pcre2_convert_context_copy(pcre2_convert_context *ccontext) -{ -pcre2_convert_context *new = - ccontext->memctl.malloc(sizeof(pcre2_real_convert_context), - ccontext->memctl.memory_data); -if (new == NULL) return NULL; -memcpy(new, ccontext, sizeof(pcre2_real_convert_context)); 
-return new; -} - - -/************************************************* -* Context free functions * -*************************************************/ - -PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION -pcre2_general_context_free(pcre2_general_context *gcontext) -{ -if (gcontext != NULL) - gcontext->memctl.free(gcontext, gcontext->memctl.memory_data); -} - - -PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION -pcre2_compile_context_free(pcre2_compile_context *ccontext) -{ -if (ccontext != NULL) - ccontext->memctl.free(ccontext, ccontext->memctl.memory_data); -} - - -PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION -pcre2_match_context_free(pcre2_match_context *mcontext) -{ -if (mcontext != NULL) - mcontext->memctl.free(mcontext, mcontext->memctl.memory_data); -} - - -PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION -pcre2_convert_context_free(pcre2_convert_context *ccontext) -{ -if (ccontext != NULL) - ccontext->memctl.free(ccontext, ccontext->memctl.memory_data); -} - - -/************************************************* -* Set values in contexts * -*************************************************/ - -/* All these functions return 0 for success or PCRE2_ERROR_BADDATA if invalid -data is given. Only some of the functions are able to test the validity of the -data. 
*/ - - -/* ------------ Compile context ------------ */ - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_set_character_tables(pcre2_compile_context *ccontext, - const uint8_t *tables) -{ -ccontext->tables = tables; -return 0; -} - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_set_bsr(pcre2_compile_context *ccontext, uint32_t value) -{ -switch(value) - { - case PCRE2_BSR_ANYCRLF: - case PCRE2_BSR_UNICODE: - ccontext->bsr_convention = value; - return 0; - - default: - return PCRE2_ERROR_BADDATA; - } -} - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_set_max_pattern_length(pcre2_compile_context *ccontext, PCRE2_SIZE length) -{ -ccontext->max_pattern_length = length; -return 0; -} - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_set_newline(pcre2_compile_context *ccontext, uint32_t newline) -{ -switch(newline) - { - case PCRE2_NEWLINE_CR: - case PCRE2_NEWLINE_LF: - case PCRE2_NEWLINE_CRLF: - case PCRE2_NEWLINE_ANY: - case PCRE2_NEWLINE_ANYCRLF: - case PCRE2_NEWLINE_NUL: - ccontext->newline_convention = newline; - return 0; - - default: - return PCRE2_ERROR_BADDATA; - } -} - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext, uint32_t limit) -{ -ccontext->parens_nest_limit = limit; -return 0; -} - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_set_compile_extra_options(pcre2_compile_context *ccontext, uint32_t options) -{ -ccontext->extra_options = options; -return 0; -} - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, - int (*guard)(uint32_t, void *), void *user_data) -{ -ccontext->stack_guard = guard; -ccontext->stack_guard_data = user_data; -return 0; -} - - -/* ------------ Match context ------------ */ - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_set_callout(pcre2_match_context *mcontext, - int (*callout)(pcre2_callout_block *, void *), void *callout_data) -{ -mcontext->callout = callout; -mcontext->callout_data = callout_data; 
-return 0; -} - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_set_substitute_callout(pcre2_match_context *mcontext, - int (*substitute_callout)(pcre2_substitute_callout_block *, void *), - void *substitute_callout_data) -{ -mcontext->substitute_callout = substitute_callout; -mcontext->substitute_callout_data = substitute_callout_data; -return 0; -} - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_set_heap_limit(pcre2_match_context *mcontext, uint32_t limit) -{ -mcontext->heap_limit = limit; -return 0; -} - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_set_match_limit(pcre2_match_context *mcontext, uint32_t limit) -{ -mcontext->match_limit = limit; -return 0; -} - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_set_depth_limit(pcre2_match_context *mcontext, uint32_t limit) -{ -mcontext->depth_limit = limit; -return 0; -} - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_set_offset_limit(pcre2_match_context *mcontext, PCRE2_SIZE limit) -{ -mcontext->offset_limit = limit; -return 0; -} - -/* This function became obsolete at release 10.30. It is kept as a synonym for -backwards compatibility. 
*/ - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_set_recursion_limit(pcre2_match_context *mcontext, uint32_t limit) -{ -return pcre2_set_depth_limit(mcontext, limit); -} - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_set_recursion_memory_management(pcre2_match_context *mcontext, - void *(*mymalloc)(size_t, void *), void (*myfree)(void *, void *), - void *mydata) -{ -(void)mcontext; -(void)mymalloc; -(void)myfree; -(void)mydata; -return 0; -} - -/* ------------ Convert context ------------ */ - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_set_glob_separator(pcre2_convert_context *ccontext, uint32_t separator) -{ -if (separator != CHAR_SLASH && separator != CHAR_BACKSLASH && - separator != CHAR_DOT) return PCRE2_ERROR_BADDATA; -ccontext->glob_separator = separator; -return 0; -} - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_set_glob_escape(pcre2_convert_context *ccontext, uint32_t escape) -{ -if (escape > 255 || (escape != 0 && !ispunct(escape))) - return PCRE2_ERROR_BADDATA; -ccontext->glob_escape = escape; -return 0; -} - -/* End of pcre2_context.c */ - diff --git a/pcre2/src/pcre2_convert.c b/pcre2/src/pcre2_convert.c deleted file mode 100644 index d45b6fee9..000000000 --- a/pcre2/src/pcre2_convert.c +++ /dev/null @@ -1,1182 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. 
- - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2018 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
------------------------------------------------------------------------------ -*/ - - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include "pcre2_internal.h" - -#define TYPE_OPTIONS (PCRE2_CONVERT_GLOB| \ - PCRE2_CONVERT_POSIX_BASIC|PCRE2_CONVERT_POSIX_EXTENDED) - -#define ALL_OPTIONS (PCRE2_CONVERT_UTF|PCRE2_CONVERT_NO_UTF_CHECK| \ - PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR| \ - PCRE2_CONVERT_GLOB_NO_STARSTAR| \ - TYPE_OPTIONS) - -#define DUMMY_BUFFER_SIZE 100 - -/* Generated pattern fragments */ - -#define STR_BACKSLASH_A STR_BACKSLASH STR_A -#define STR_BACKSLASH_z STR_BACKSLASH STR_z -#define STR_COLON_RIGHT_SQUARE_BRACKET STR_COLON STR_RIGHT_SQUARE_BRACKET -#define STR_DOT_STAR_LOOKBEHIND STR_DOT STR_ASTERISK STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_LESS_THAN_SIGN STR_EQUALS_SIGN -#define STR_LOOKAHEAD_NOT_DOT STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_EXCLAMATION_MARK STR_BACKSLASH STR_DOT STR_RIGHT_PARENTHESIS -#define STR_QUERY_s STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_s STR_RIGHT_PARENTHESIS -#define STR_STAR_NUL STR_LEFT_PARENTHESIS STR_ASTERISK STR_N STR_U STR_L STR_RIGHT_PARENTHESIS - -/* States for range and POSIX processing */ - -enum { RANGE_NOT_STARTED, RANGE_STARTING, RANGE_STARTED }; -enum { POSIX_START_REGEX, POSIX_ANCHORED, POSIX_NOT_BRACKET, - POSIX_CLASS_NOT_STARTED, POSIX_CLASS_STARTING, POSIX_CLASS_STARTED }; - -/* Macro to add a character string to the output buffer, checking for overflow. */ - -#define PUTCHARS(string) \ - { \ - for (s = (char *)(string); *s != 0; s++) \ - { \ - if (p >= endp) return PCRE2_ERROR_NOMEMORY; \ - *p++ = *s; \ - } \ - } - -/* Literals that must be escaped: \ ? * + | . 
^ $ { } [ ] ( ) */ - -static const char *pcre2_escaped_literals = - STR_BACKSLASH STR_QUESTION_MARK STR_ASTERISK STR_PLUS - STR_VERTICAL_LINE STR_DOT STR_CIRCUMFLEX_ACCENT STR_DOLLAR_SIGN - STR_LEFT_CURLY_BRACKET STR_RIGHT_CURLY_BRACKET - STR_LEFT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET - STR_LEFT_PARENTHESIS STR_RIGHT_PARENTHESIS; - -/* Recognized escaped metacharacters in POSIX basic patterns. */ - -static const char *posix_meta_escapes = - STR_LEFT_PARENTHESIS STR_RIGHT_PARENTHESIS - STR_LEFT_CURLY_BRACKET STR_RIGHT_CURLY_BRACKET - STR_1 STR_2 STR_3 STR_4 STR_5 STR_6 STR_7 STR_8 STR_9; - - - -/************************************************* -* Convert a POSIX pattern * -*************************************************/ - -/* This function handles both basic and extended POSIX patterns. - -Arguments: - pattype the pattern type - pattern the pattern - plength length in code units - utf TRUE if UTF - use_buffer where to put the output - use_length length of use_buffer - bufflenptr where to put the used length - dummyrun TRUE if a dummy run - ccontext the convert context - -Returns: 0 => success - !0 => error code -*/ - -static int -convert_posix(uint32_t pattype, PCRE2_SPTR pattern, PCRE2_SIZE plength, - BOOL utf, PCRE2_UCHAR *use_buffer, PCRE2_SIZE use_length, - PCRE2_SIZE *bufflenptr, BOOL dummyrun, pcre2_convert_context *ccontext) -{ -char *s; -PCRE2_SPTR posix = pattern; -PCRE2_UCHAR *p = use_buffer; -PCRE2_UCHAR *pp = p; -PCRE2_UCHAR *endp = p + use_length - 1; /* Allow for trailing zero */ -PCRE2_SIZE convlength = 0; - -uint32_t bracount = 0; -uint32_t posix_state = POSIX_START_REGEX; -uint32_t lastspecial = 0; -BOOL extended = (pattype & PCRE2_CONVERT_POSIX_EXTENDED) != 0; -BOOL nextisliteral = FALSE; - -(void)utf; /* Not used when Unicode not supported */ -(void)ccontext; /* Not currently used */ - -/* Initialize default for error offset as end of input. */ - -*bufflenptr = plength; -PUTCHARS(STR_STAR_NUL); - -/* Now scan the input. 
*/ - -while (plength > 0) - { - uint32_t c, sc; - int clength = 1; - - /* Add in the length of the last item, then, if in the dummy run, pull the - pointer back to the start of the (temporary) buffer and then remember the - start of the next item. */ - - convlength += p - pp; - if (dummyrun) p = use_buffer; - pp = p; - - /* Pick up the next character */ - -#ifndef SUPPORT_UNICODE - c = *posix; -#else - GETCHARLENTEST(c, posix, clength); -#endif - posix += clength; - plength -= clength; - - sc = nextisliteral? 0 : c; - nextisliteral = FALSE; - - /* Handle a character within a class. */ - - if (posix_state >= POSIX_CLASS_NOT_STARTED) - { - if (c == CHAR_RIGHT_SQUARE_BRACKET) - { - PUTCHARS(STR_RIGHT_SQUARE_BRACKET); - posix_state = POSIX_NOT_BRACKET; - } - - /* Not the end of the class */ - - else - { - switch (posix_state) - { - case POSIX_CLASS_STARTED: - if (c <= 127 && islower(c)) break; /* Remain in started state */ - posix_state = POSIX_CLASS_NOT_STARTED; - if (c == CHAR_COLON && plength > 0 && - *posix == CHAR_RIGHT_SQUARE_BRACKET) - { - PUTCHARS(STR_COLON_RIGHT_SQUARE_BRACKET); - plength--; - posix++; - continue; /* With next character after :] */ - } - /* Fall through */ - - case POSIX_CLASS_NOT_STARTED: - if (c == CHAR_LEFT_SQUARE_BRACKET) - posix_state = POSIX_CLASS_STARTING; - break; - - case POSIX_CLASS_STARTING: - if (c == CHAR_COLON) posix_state = POSIX_CLASS_STARTED; - break; - } - - if (c == CHAR_BACKSLASH) PUTCHARS(STR_BACKSLASH); - if (p + clength > endp) return PCRE2_ERROR_NOMEMORY; - memcpy(p, posix - clength, CU2BYTES(clength)); - p += clength; - } - } - - /* Handle a character not within a class. */ - - else switch(sc) - { - case CHAR_LEFT_SQUARE_BRACKET: - PUTCHARS(STR_LEFT_SQUARE_BRACKET); - -#ifdef NEVER - /* We could handle special cases [[:<:]] and [[:>:]] (which PCRE does - support) but they are not part of POSIX 1003.1. 
*/ - - if (plength >= 6) - { - if (posix[0] == CHAR_LEFT_SQUARE_BRACKET && - posix[1] == CHAR_COLON && - (posix[2] == CHAR_LESS_THAN_SIGN || - posix[2] == CHAR_GREATER_THAN_SIGN) && - posix[3] == CHAR_COLON && - posix[4] == CHAR_RIGHT_SQUARE_BRACKET && - posix[5] == CHAR_RIGHT_SQUARE_BRACKET) - { - if (p + 6 > endp) return PCRE2_ERROR_NOMEMORY; - memcpy(p, posix, CU2BYTES(6)); - p += 6; - posix += 6; - plength -= 6; - continue; /* With next character */ - } - } -#endif - - /* Handle start of "normal" character classes */ - - posix_state = POSIX_CLASS_NOT_STARTED; - - /* Handle ^ and ] as first characters */ - - if (plength > 0) - { - if (*posix == CHAR_CIRCUMFLEX_ACCENT) - { - posix++; - plength--; - PUTCHARS(STR_CIRCUMFLEX_ACCENT); - } - if (plength > 0 && *posix == CHAR_RIGHT_SQUARE_BRACKET) - { - posix++; - plength--; - PUTCHARS(STR_RIGHT_SQUARE_BRACKET); - } - } - break; - - case CHAR_BACKSLASH: - if (plength == 0) return PCRE2_ERROR_END_BACKSLASH; - if (extended) nextisliteral = TRUE; else - { - if (*posix < 127 && strchr(posix_meta_escapes, *posix) != NULL) - { - if (isdigit(*posix)) PUTCHARS(STR_BACKSLASH); - if (p + 1 > endp) return PCRE2_ERROR_NOMEMORY; - lastspecial = *p++ = *posix++; - plength--; - } - else nextisliteral = TRUE; - } - break; - - case CHAR_RIGHT_PARENTHESIS: - if (!extended || bracount == 0) goto ESCAPE_LITERAL; - bracount--; - goto COPY_SPECIAL; - - case CHAR_LEFT_PARENTHESIS: - bracount++; - /* Fall through */ - - case CHAR_QUESTION_MARK: - case CHAR_PLUS: - case CHAR_LEFT_CURLY_BRACKET: - case CHAR_RIGHT_CURLY_BRACKET: - case CHAR_VERTICAL_LINE: - if (!extended) goto ESCAPE_LITERAL; - /* Fall through */ - - case CHAR_DOT: - case CHAR_DOLLAR_SIGN: - posix_state = POSIX_NOT_BRACKET; - COPY_SPECIAL: - lastspecial = c; - if (p + 1 > endp) return PCRE2_ERROR_NOMEMORY; - *p++ = c; - break; - - case CHAR_ASTERISK: - if (lastspecial != CHAR_ASTERISK) - { - if (!extended && (posix_state < POSIX_NOT_BRACKET || - lastspecial == 
CHAR_LEFT_PARENTHESIS)) - goto ESCAPE_LITERAL; - goto COPY_SPECIAL; - } - break; /* Ignore second and subsequent asterisks */ - - case CHAR_CIRCUMFLEX_ACCENT: - if (extended) goto COPY_SPECIAL; - if (posix_state == POSIX_START_REGEX || - lastspecial == CHAR_LEFT_PARENTHESIS) - { - posix_state = POSIX_ANCHORED; - goto COPY_SPECIAL; - } - /* Fall through */ - - default: - if (c < 128 && strchr(pcre2_escaped_literals, c) != NULL) - { - ESCAPE_LITERAL: - PUTCHARS(STR_BACKSLASH); - } - lastspecial = 0xff; /* Indicates nothing special */ - if (p + clength > endp) return PCRE2_ERROR_NOMEMORY; - memcpy(p, posix - clength, CU2BYTES(clength)); - p += clength; - posix_state = POSIX_NOT_BRACKET; - break; - } - } - -if (posix_state >= POSIX_CLASS_NOT_STARTED) - return PCRE2_ERROR_MISSING_SQUARE_BRACKET; -convlength += p - pp; /* Final segment */ -*bufflenptr = convlength; -*p++ = 0; -return 0; -} - - -/************************************************* -* Convert a glob pattern * -*************************************************/ - -/* Context for writing the output into a buffer. */ - -typedef struct pcre2_output_context { - PCRE2_UCHAR *output; /* current output position */ - PCRE2_SPTR output_end; /* output end */ - PCRE2_SIZE output_size; /* size of the output */ - uint8_t out_str[8]; /* string copied to the output */ -} pcre2_output_context; - - -/* Write a character into the output. - -Arguments: - out output context - chr the next character -*/ - -static void -convert_glob_write(pcre2_output_context *out, PCRE2_UCHAR chr) -{ -out->output_size++; - -if (out->output < out->output_end) - *out->output++ = chr; -} - - -/* Write a string into the output. 
- -Arguments: - out output context - length length of out->out_str -*/ - -static void -convert_glob_write_str(pcre2_output_context *out, PCRE2_SIZE length) -{ -uint8_t *out_str = out->out_str; -PCRE2_UCHAR *output = out->output; -PCRE2_SPTR output_end = out->output_end; -PCRE2_SIZE output_size = out->output_size; - -do - { - output_size++; - - if (output < output_end) - *output++ = *out_str++; - } -while (--length != 0); - -out->output = output; -out->output_size = output_size; -} - - -/* Prints the separator into the output. - -Arguments: - out output context - separator glob separator - with_escape backslash is needed before separator -*/ - -static void -convert_glob_print_separator(pcre2_output_context *out, - PCRE2_UCHAR separator, BOOL with_escape) -{ -if (with_escape) - convert_glob_write(out, CHAR_BACKSLASH); - -convert_glob_write(out, separator); -} - - -/* Prints a wildcard into the output. - -Arguments: - out output context - separator glob separator - with_escape backslash is needed before separator -*/ - -static void -convert_glob_print_wildcard(pcre2_output_context *out, - PCRE2_UCHAR separator, BOOL with_escape) -{ -out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET; -out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT; -convert_glob_write_str(out, 2); - -convert_glob_print_separator(out, separator, with_escape); - -convert_glob_write(out, CHAR_RIGHT_SQUARE_BRACKET); -} - - -/* Parse a posix class. 
- -Arguments: - from starting point of scanning the range - pattern_end end of pattern - out output context - -Returns: >0 => class index - 0 => malformed class -*/ - -static int -convert_glob_parse_class(PCRE2_SPTR *from, PCRE2_SPTR pattern_end, - pcre2_output_context *out) -{ -static const char *posix_classes = "alnum:alpha:ascii:blank:cntrl:digit:" - "graph:lower:print:punct:space:upper:word:xdigit:"; -PCRE2_SPTR start = *from + 1; -PCRE2_SPTR pattern = start; -const char *class_ptr; -PCRE2_UCHAR c; -int class_index; - -while (TRUE) - { - if (pattern >= pattern_end) return 0; - - c = *pattern++; - - if (c < CHAR_a || c > CHAR_z) break; - } - -if (c != CHAR_COLON || pattern >= pattern_end || - *pattern != CHAR_RIGHT_SQUARE_BRACKET) - return 0; - -class_ptr = posix_classes; -class_index = 1; - -while (TRUE) - { - if (*class_ptr == CHAR_NUL) return 0; - - pattern = start; - - while (*pattern == (PCRE2_UCHAR) *class_ptr) - { - if (*pattern == CHAR_COLON) - { - pattern += 2; - start -= 2; - - do convert_glob_write(out, *start++); while (start < pattern); - - *from = pattern; - return class_index; - } - pattern++; - class_ptr++; - } - - while (*class_ptr != CHAR_COLON) class_ptr++; - class_ptr++; - class_index++; - } -} - -/* Checks whether the character is in the class. 
- -Arguments: - class_index class index - c character - -Returns: !0 => character is found in the class - 0 => otherwise -*/ - -static BOOL -convert_glob_char_in_class(int class_index, PCRE2_UCHAR c) -{ -switch (class_index) - { - case 1: return isalnum(c); - case 2: return isalpha(c); - case 3: return 1; - case 4: return c == CHAR_HT || c == CHAR_SPACE; - case 5: return iscntrl(c); - case 6: return isdigit(c); - case 7: return isgraph(c); - case 8: return islower(c); - case 9: return isprint(c); - case 10: return ispunct(c); - case 11: return isspace(c); - case 12: return isupper(c); - case 13: return isalnum(c) || c == CHAR_UNDERSCORE; - default: return isxdigit(c); - } -} - -/* Parse a range of characters. - -Arguments: - from starting point of scanning the range - pattern_end end of pattern - out output context - separator glob separator - with_escape backslash is needed before separator - -Returns: 0 => success - !0 => error code -*/ - -static int -convert_glob_parse_range(PCRE2_SPTR *from, PCRE2_SPTR pattern_end, - pcre2_output_context *out, BOOL utf, PCRE2_UCHAR separator, - BOOL with_escape, PCRE2_UCHAR escape, BOOL no_wildsep) -{ -BOOL is_negative = FALSE; -BOOL separator_seen = FALSE; -BOOL has_prev_c; -PCRE2_SPTR pattern = *from; -PCRE2_SPTR char_start = NULL; -uint32_t c, prev_c; -int len, class_index; - -(void)utf; /* Avoid compiler warning. 
*/ - -if (pattern >= pattern_end) - { - *from = pattern; - return PCRE2_ERROR_MISSING_SQUARE_BRACKET; - } - -if (*pattern == CHAR_EXCLAMATION_MARK - || *pattern == CHAR_CIRCUMFLEX_ACCENT) - { - pattern++; - - if (pattern >= pattern_end) - { - *from = pattern; - return PCRE2_ERROR_MISSING_SQUARE_BRACKET; - } - - is_negative = TRUE; - - out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET; - out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT; - len = 2; - - if (!no_wildsep) - { - if (with_escape) - { - out->out_str[len] = CHAR_BACKSLASH; - len++; - } - out->out_str[len] = (uint8_t) separator; - } - - convert_glob_write_str(out, len + 1); - } -else - convert_glob_write(out, CHAR_LEFT_SQUARE_BRACKET); - -has_prev_c = FALSE; -prev_c = 0; - -if (*pattern == CHAR_RIGHT_SQUARE_BRACKET) - { - out->out_str[0] = CHAR_BACKSLASH; - out->out_str[1] = CHAR_RIGHT_SQUARE_BRACKET; - convert_glob_write_str(out, 2); - has_prev_c = TRUE; - prev_c = CHAR_RIGHT_SQUARE_BRACKET; - pattern++; - } - -while (pattern < pattern_end) - { - char_start = pattern; - GETCHARINCTEST(c, pattern); - - if (c == CHAR_RIGHT_SQUARE_BRACKET) - { - convert_glob_write(out, c); - - if (!is_negative && !no_wildsep && separator_seen) - { - out->out_str[0] = CHAR_LEFT_PARENTHESIS; - out->out_str[1] = CHAR_QUESTION_MARK; - out->out_str[2] = CHAR_LESS_THAN_SIGN; - out->out_str[3] = CHAR_EXCLAMATION_MARK; - convert_glob_write_str(out, 4); - - convert_glob_print_separator(out, separator, with_escape); - convert_glob_write(out, CHAR_RIGHT_PARENTHESIS); - } - - *from = pattern; - return 0; - } - - if (pattern >= pattern_end) break; - - if (c == CHAR_LEFT_SQUARE_BRACKET && *pattern == CHAR_COLON) - { - *from = pattern; - class_index = convert_glob_parse_class(from, pattern_end, out); - - if (class_index != 0) - { - pattern = *from; - - has_prev_c = FALSE; - prev_c = 0; - - if (!is_negative && - convert_glob_char_in_class (class_index, separator)) - separator_seen = TRUE; - continue; - } - } - else if (c == CHAR_MINUS && has_prev_c && - 
*pattern != CHAR_RIGHT_SQUARE_BRACKET) - { - convert_glob_write(out, CHAR_MINUS); - - char_start = pattern; - GETCHARINCTEST(c, pattern); - - if (pattern >= pattern_end) break; - - if (escape != 0 && c == escape) - { - char_start = pattern; - GETCHARINCTEST(c, pattern); - } - else if (c == CHAR_LEFT_SQUARE_BRACKET && *pattern == CHAR_COLON) - { - *from = pattern; - return PCRE2_ERROR_CONVERT_SYNTAX; - } - - if (prev_c > c) - { - *from = pattern; - return PCRE2_ERROR_CONVERT_SYNTAX; - } - - if (prev_c < separator && separator < c) separator_seen = TRUE; - - has_prev_c = FALSE; - prev_c = 0; - } - else - { - if (escape != 0 && c == escape) - { - char_start = pattern; - GETCHARINCTEST(c, pattern); - - if (pattern >= pattern_end) break; - } - - has_prev_c = TRUE; - prev_c = c; - } - - if (c == CHAR_LEFT_SQUARE_BRACKET || c == CHAR_RIGHT_SQUARE_BRACKET || - c == CHAR_BACKSLASH || c == CHAR_MINUS) - convert_glob_write(out, CHAR_BACKSLASH); - - if (c == separator) separator_seen = TRUE; - - do convert_glob_write(out, *char_start++); while (char_start < pattern); - } - -*from = pattern; -return PCRE2_ERROR_MISSING_SQUARE_BRACKET; -} - - -/* Prints a (*COMMIT) into the output. - -Arguments: - out output context -*/ - -static void -convert_glob_print_commit(pcre2_output_context *out) -{ -out->out_str[0] = CHAR_LEFT_PARENTHESIS; -out->out_str[1] = CHAR_ASTERISK; -out->out_str[2] = CHAR_C; -out->out_str[3] = CHAR_O; -out->out_str[4] = CHAR_M; -out->out_str[5] = CHAR_M; -out->out_str[6] = CHAR_I; -out->out_str[7] = CHAR_T; -convert_glob_write_str(out, 8); -convert_glob_write(out, CHAR_RIGHT_PARENTHESIS); -} - - -/* Bash glob converter. 
- -Arguments: - pattype the pattern type - pattern the pattern - plength length in code units - utf TRUE if UTF - use_buffer where to put the output - use_length length of use_buffer - bufflenptr where to put the used length - dummyrun TRUE if a dummy run - ccontext the convert context - -Returns: 0 => success - !0 => error code -*/ - -static int -convert_glob(uint32_t options, PCRE2_SPTR pattern, PCRE2_SIZE plength, - BOOL utf, PCRE2_UCHAR *use_buffer, PCRE2_SIZE use_length, - PCRE2_SIZE *bufflenptr, BOOL dummyrun, pcre2_convert_context *ccontext) -{ -pcre2_output_context out; -PCRE2_SPTR pattern_start = pattern; -PCRE2_SPTR pattern_end = pattern + plength; -PCRE2_UCHAR separator = ccontext->glob_separator; -PCRE2_UCHAR escape = ccontext->glob_escape; -PCRE2_UCHAR c; -BOOL no_wildsep = (options & PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR) != 0; -BOOL no_starstar = (options & PCRE2_CONVERT_GLOB_NO_STARSTAR) != 0; -BOOL in_atomic = FALSE; -BOOL after_starstar = FALSE; -BOOL no_slash_z = FALSE; -BOOL with_escape, is_start, after_separator; -int result = 0; - -(void)utf; /* Avoid compiler warning. */ - -#ifdef SUPPORT_UNICODE -if (utf && (separator >= 128 || escape >= 128)) - { - /* Currently only ASCII characters are supported. */ - *bufflenptr = 0; - return PCRE2_ERROR_CONVERT_SYNTAX; - } -#endif - -with_escape = strchr(pcre2_escaped_literals, separator) != NULL; - -/* Initialize default for error offset as end of input. 
*/ -out.output = use_buffer; -out.output_end = use_buffer + use_length; -out.output_size = 0; - -out.out_str[0] = CHAR_LEFT_PARENTHESIS; -out.out_str[1] = CHAR_QUESTION_MARK; -out.out_str[2] = CHAR_s; -out.out_str[3] = CHAR_RIGHT_PARENTHESIS; -convert_glob_write_str(&out, 4); - -is_start = TRUE; - -if (pattern < pattern_end && pattern[0] == CHAR_ASTERISK) - { - if (no_wildsep) - is_start = FALSE; - else if (!no_starstar && pattern + 1 < pattern_end && - pattern[1] == CHAR_ASTERISK) - is_start = FALSE; - } - -if (is_start) - { - out.out_str[0] = CHAR_BACKSLASH; - out.out_str[1] = CHAR_A; - convert_glob_write_str(&out, 2); - } - -while (pattern < pattern_end) - { - c = *pattern++; - - if (c == CHAR_ASTERISK) - { - is_start = pattern == pattern_start + 1; - - if (in_atomic) - { - convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS); - in_atomic = FALSE; - } - - if (!no_starstar && pattern < pattern_end && *pattern == CHAR_ASTERISK) - { - after_separator = is_start || (pattern[-2] == separator); - - do pattern++; while (pattern < pattern_end && - *pattern == CHAR_ASTERISK); - - if (pattern >= pattern_end) - { - no_slash_z = TRUE; - break; - } - - after_starstar = TRUE; - - if (after_separator && escape != 0 && *pattern == escape && - pattern + 1 < pattern_end && pattern[1] == separator) - pattern++; - - if (is_start) - { - if (*pattern != separator) continue; - - out.out_str[0] = CHAR_LEFT_PARENTHESIS; - out.out_str[1] = CHAR_QUESTION_MARK; - out.out_str[2] = CHAR_COLON; - out.out_str[3] = CHAR_BACKSLASH; - out.out_str[4] = CHAR_A; - out.out_str[5] = CHAR_VERTICAL_LINE; - convert_glob_write_str(&out, 6); - - convert_glob_print_separator(&out, separator, with_escape); - convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS); - - pattern++; - continue; - } - - convert_glob_print_commit(&out); - - if (!after_separator || *pattern != separator) - { - out.out_str[0] = CHAR_DOT; - out.out_str[1] = CHAR_ASTERISK; - out.out_str[2] = CHAR_QUESTION_MARK; - convert_glob_write_str(&out, 3); 
- continue; - } - - out.out_str[0] = CHAR_LEFT_PARENTHESIS; - out.out_str[1] = CHAR_QUESTION_MARK; - out.out_str[2] = CHAR_COLON; - out.out_str[3] = CHAR_DOT; - out.out_str[4] = CHAR_ASTERISK; - out.out_str[5] = CHAR_QUESTION_MARK; - - convert_glob_write_str(&out, 6); - - convert_glob_print_separator(&out, separator, with_escape); - - out.out_str[0] = CHAR_RIGHT_PARENTHESIS; - out.out_str[1] = CHAR_QUESTION_MARK; - out.out_str[2] = CHAR_QUESTION_MARK; - convert_glob_write_str(&out, 3); - - pattern++; - continue; - } - - if (pattern < pattern_end && *pattern == CHAR_ASTERISK) - { - do pattern++; while (pattern < pattern_end && - *pattern == CHAR_ASTERISK); - } - - if (no_wildsep) - { - if (pattern >= pattern_end) - { - no_slash_z = TRUE; - break; - } - - /* Start check must be after the end check. */ - if (is_start) continue; - } - - if (!is_start) - { - if (after_starstar) - { - out.out_str[0] = CHAR_LEFT_PARENTHESIS; - out.out_str[1] = CHAR_QUESTION_MARK; - out.out_str[2] = CHAR_GREATER_THAN_SIGN; - convert_glob_write_str(&out, 3); - in_atomic = TRUE; - } - else - convert_glob_print_commit(&out); - } - - if (no_wildsep) - convert_glob_write(&out, CHAR_DOT); - else - convert_glob_print_wildcard(&out, separator, with_escape); - - out.out_str[0] = CHAR_ASTERISK; - out.out_str[1] = CHAR_QUESTION_MARK; - if (pattern >= pattern_end) - out.out_str[1] = CHAR_PLUS; - convert_glob_write_str(&out, 2); - continue; - } - - if (c == CHAR_QUESTION_MARK) - { - if (no_wildsep) - convert_glob_write(&out, CHAR_DOT); - else - convert_glob_print_wildcard(&out, separator, with_escape); - continue; - } - - if (c == CHAR_LEFT_SQUARE_BRACKET) - { - result = convert_glob_parse_range(&pattern, pattern_end, - &out, utf, separator, with_escape, escape, no_wildsep); - if (result != 0) break; - continue; - } - - if (escape != 0 && c == escape) - { - if (pattern >= pattern_end) - { - result = PCRE2_ERROR_CONVERT_SYNTAX; - break; - } - c = *pattern++; - } - - if (c < 128 && 
strchr(pcre2_escaped_literals, c) != NULL) - convert_glob_write(&out, CHAR_BACKSLASH); - - convert_glob_write(&out, c); - } - -if (result == 0) - { - if (!no_slash_z) - { - out.out_str[0] = CHAR_BACKSLASH; - out.out_str[1] = CHAR_z; - convert_glob_write_str(&out, 2); - } - - if (in_atomic) - convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS); - - convert_glob_write(&out, CHAR_NUL); - - if (!dummyrun && out.output_size != (PCRE2_SIZE) (out.output - use_buffer)) - result = PCRE2_ERROR_NOMEMORY; - } - -if (result != 0) - { - *bufflenptr = pattern - pattern_start; - return result; - } - -*bufflenptr = out.output_size - 1; -return 0; -} - - -/************************************************* -* Convert pattern * -*************************************************/ - -/* This is the external-facing function for converting other forms of pattern -into PCRE2 regular expression patterns. On error, the bufflenptr argument is -used to return an offset in the original pattern. - -Arguments: - pattern the input pattern - plength length of input, or PCRE2_ZERO_TERMINATED - options options bits - buffptr pointer to pointer to output buffer - bufflenptr pointer to length of output buffer - ccontext convert context or NULL - -Returns: 0 for success, else an error code (+ve or -ve) -*/ - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_pattern_convert(PCRE2_SPTR pattern, PCRE2_SIZE plength, uint32_t options, - PCRE2_UCHAR **buffptr, PCRE2_SIZE *bufflenptr, - pcre2_convert_context *ccontext) -{ -int i, rc; -PCRE2_UCHAR dummy_buffer[DUMMY_BUFFER_SIZE]; -PCRE2_UCHAR *use_buffer = dummy_buffer; -PCRE2_SIZE use_length = DUMMY_BUFFER_SIZE; -BOOL utf = (options & PCRE2_CONVERT_UTF) != 0; -uint32_t pattype = options & TYPE_OPTIONS; - -if (pattern == NULL || bufflenptr == NULL) return PCRE2_ERROR_NULL; - -if ((options & ~ALL_OPTIONS) != 0 || /* Undefined bit set */ - (pattype & (~pattype+1)) != pattype || /* More than one type set */ - pattype == 0) /* No type set */ - { - *bufflenptr = 0; /* 
Error offset */ - return PCRE2_ERROR_BADOPTION; - } - -if (plength == PCRE2_ZERO_TERMINATED) plength = PRIV(strlen)(pattern); -if (ccontext == NULL) ccontext = - (pcre2_convert_context *)(&PRIV(default_convert_context)); - -/* Check UTF if required. */ - -#ifndef SUPPORT_UNICODE -if (utf) - { - *bufflenptr = 0; /* Error offset */ - return PCRE2_ERROR_UNICODE_NOT_SUPPORTED; - } -#else -if (utf && (options & PCRE2_CONVERT_NO_UTF_CHECK) == 0) - { - PCRE2_SIZE erroroffset; - rc = PRIV(valid_utf)(pattern, plength, &erroroffset); - if (rc != 0) - { - *bufflenptr = erroroffset; - return rc; - } - } -#endif - -/* If buffptr is not NULL, and what it points to is not NULL, we are being -provided with a buffer and a length, so set them as the buffer to use. */ - -if (buffptr != NULL && *buffptr != NULL) - { - use_buffer = *buffptr; - use_length = *bufflenptr; - } - -/* Call an individual converter, either just once (if a buffer was provided or -just the length is needed), or twice (if a memory allocation is required). */ - -for (i = 0; i < 2; i++) - { - PCRE2_UCHAR *allocated; - BOOL dummyrun = buffptr == NULL || *buffptr == NULL; - - switch(pattype) - { - case PCRE2_CONVERT_GLOB: - rc = convert_glob(options & ~PCRE2_CONVERT_GLOB, pattern, plength, utf, - use_buffer, use_length, bufflenptr, dummyrun, ccontext); - break; - - case PCRE2_CONVERT_POSIX_BASIC: - case PCRE2_CONVERT_POSIX_EXTENDED: - rc = convert_posix(pattype, pattern, plength, utf, use_buffer, use_length, - bufflenptr, dummyrun, ccontext); - break; - - default: - *bufflenptr = 0; /* Error offset */ - return PCRE2_ERROR_INTERNAL; - } - - if (rc != 0 || /* Error */ - buffptr == NULL || /* Just the length is required */ - *buffptr != NULL) /* Buffer was provided or allocated */ - return rc; - - /* Allocate memory for the buffer, with hidden space for an allocator at - the start. The next time round the loop runs the conversion for real. 
*/ - - allocated = PRIV(memctl_malloc)(sizeof(pcre2_memctl) + - (*bufflenptr + 1)*PCRE2_CODE_UNIT_WIDTH, (pcre2_memctl *)ccontext); - if (allocated == NULL) return PCRE2_ERROR_NOMEMORY; - *buffptr = (PCRE2_UCHAR *)(((char *)allocated) + sizeof(pcre2_memctl)); - - use_buffer = *buffptr; - use_length = *bufflenptr + 1; - } - -/* Control should never get here. */ - -return PCRE2_ERROR_INTERNAL; -} - - -/************************************************* -* Free converted pattern * -*************************************************/ - -/* This frees a converted pattern that was put in newly-allocated memory. - -Argument: the converted pattern -Returns: nothing -*/ - -PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION -pcre2_converted_pattern_free(PCRE2_UCHAR *converted) -{ -if (converted != NULL) - { - pcre2_memctl *memctl = - (pcre2_memctl *)((char *)converted - sizeof(pcre2_memctl)); - memctl->free(memctl, memctl->memory_data); - } -} - -/* End of pcre2_convert.c */ diff --git a/pcre2/src/pcre2_dfa_match.c b/pcre2/src/pcre2_dfa_match.c deleted file mode 100644 index 625695b7c..000000000 --- a/pcre2/src/pcre2_dfa_match.c +++ /dev/null @@ -1,3982 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2020 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. 
- - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - - -/* This module contains the external function pcre2_dfa_match(), which is an -alternative matching function that uses a sort of DFA algorithm (not a true -FSM). This is NOT Perl-compatible, but it has advantages in certain -applications. */ - - -/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved -the performance of his patterns greatly. I could not use it as it stood, as it -was not thread safe, and made assumptions about pattern sizes. Also, it caused -test 7 to loop, and test 9 to crash with a segfault. - -The issue is the check for duplicate states, which is done by a simple linear -search up the state list. (Grep for "duplicate" below to find the code.) 
For -many patterns, there will never be many states active at one time, so a simple -linear search is fine. In patterns that have many active states, it might be a -bottleneck. The suggested code used an indexing scheme to remember which states -had previously been used for each character, and avoided the linear search when -it knew there was no chance of a duplicate. This was implemented when adding -states to the state lists. - -I wrote some thread-safe, not-limited code to try something similar at the time -of checking for duplicates (instead of when adding states), using index vectors -on the stack. It did give a 13% improvement with one specially constructed -pattern for certain subject strings, but on other strings and on many of the -simpler patterns in the test suite it did worse. The major problem, I think, -was the extra time to initialize the index. This had to be done for each call -of internal_dfa_match(). (The supplied patch used a static vector, initialized -only once - I suspect this was the cause of the problems with the tests.) - -Overall, I concluded that the gains in some cases did not outweigh the losses -in others, so I abandoned this code. 
*/ - - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#define NLBLOCK mb /* Block containing newline information */ -#define PSSTART start_subject /* Field containing processed string start */ -#define PSEND end_subject /* Field containing processed string end */ - -#include "pcre2_internal.h" - -#define PUBLIC_DFA_MATCH_OPTIONS \ - (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \ - PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \ - PCRE2_PARTIAL_SOFT|PCRE2_DFA_SHORTEST|PCRE2_DFA_RESTART| \ - PCRE2_COPY_MATCHED_SUBJECT) - - -/************************************************* -* Code parameters and static tables * -*************************************************/ - -/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes -into others, under special conditions. A gap of 20 between the blocks should be -enough. The resulting opcodes don't have to be less than 256 because they are -never stored, so we push them well clear of the normal opcodes. */ - -#define OP_PROP_EXTRA 300 -#define OP_EXTUNI_EXTRA 320 -#define OP_ANYNL_EXTRA 340 -#define OP_HSPACE_EXTRA 360 -#define OP_VSPACE_EXTRA 380 - - -/* This table identifies those opcodes that are followed immediately by a -character that is to be tested in some way. This makes it possible to -centralize the loading of these characters. In the case of Type * etc, the -"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a -small value. Non-zero values in the table are the offsets from the opcode where -the character is to be found. ***NOTE*** If the start of this table is -modified, the three tables that follow must also be modified. 
*/ - -static const uint8_t coptable[] = { - 0, /* End */ - 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */ - 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */ - 0, 0, 0, /* Any, AllAny, Anybyte */ - 0, 0, /* \P, \p */ - 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */ - 0, /* \X */ - 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */ - 1, /* Char */ - 1, /* Chari */ - 1, /* not */ - 1, /* noti */ - /* Positive single-char repeats */ - 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ - 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */ - 1+IMM2_SIZE, /* exact */ - 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */ - 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */ - 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */ - 1+IMM2_SIZE, /* exact I */ - 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */ - /* Negative single-char repeats - only for chars < 256 */ - 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */ - 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */ - 1+IMM2_SIZE, /* NOT exact */ - 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */ - 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */ - 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */ - 1+IMM2_SIZE, /* NOT exact I */ - 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */ - /* Positive type repeats */ - 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */ - 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */ - 1+IMM2_SIZE, /* Type exact */ - 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */ - /* Character class & ref repeats */ - 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? 
*/ - 0, 0, /* CRRANGE, CRMINRANGE */ - 0, 0, 0, 0, /* Possessive *+, ++, ?+, CRPOSRANGE */ - 0, /* CLASS */ - 0, /* NCLASS */ - 0, /* XCLASS - variable length */ - 0, /* REF */ - 0, /* REFI */ - 0, /* DNREF */ - 0, /* DNREFI */ - 0, /* RECURSE */ - 0, /* CALLOUT */ - 0, /* CALLOUT_STR */ - 0, /* Alt */ - 0, /* Ket */ - 0, /* KetRmax */ - 0, /* KetRmin */ - 0, /* KetRpos */ - 0, /* Reverse */ - 0, /* Assert */ - 0, /* Assert not */ - 0, /* Assert behind */ - 0, /* Assert behind not */ - 0, /* NA assert */ - 0, /* NA assert behind */ - 0, /* ONCE */ - 0, /* SCRIPT_RUN */ - 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */ - 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */ - 0, 0, /* CREF, DNCREF */ - 0, 0, /* RREF, DNRREF */ - 0, 0, /* FALSE, TRUE */ - 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ - 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */ - 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */ - 0, 0, /* COMMIT, COMMIT_ARG */ - 0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */ - 0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */ -}; - -/* This table identifies those opcodes that inspect a character. It is used to -remember the fact that a character could have been inspected when the end of -the subject is reached. ***NOTE*** If the start of this table is modified, the -two tables that follow must also be modified. */ - -static const uint8_t poptable[] = { - 0, /* End */ - 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */ - 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ - 1, 1, 1, /* Any, AllAny, Anybyte */ - 1, 1, /* \P, \p */ - 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ - 1, /* \X */ - 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */ - 1, /* Char */ - 1, /* Chari */ - 1, /* not */ - 1, /* noti */ - /* Positive single-char repeats */ - 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? 
*/ - 1, 1, 1, /* upto, minupto, exact */ - 1, 1, 1, 1, /* *+, ++, ?+, upto+ */ - 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */ - 1, 1, 1, /* upto I, minupto I, exact I */ - 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */ - /* Negative single-char repeats - only for chars < 256 */ - 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */ - 1, 1, 1, /* NOT upto, minupto, exact */ - 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */ - 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */ - 1, 1, 1, /* NOT upto I, minupto I, exact I */ - 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */ - /* Positive type repeats */ - 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */ - 1, 1, 1, /* Type upto, minupto, exact */ - 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */ - /* Character class & ref repeats */ - 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ - 1, 1, /* CRRANGE, CRMINRANGE */ - 1, 1, 1, 1, /* Possessive *+, ++, ?+, CRPOSRANGE */ - 1, /* CLASS */ - 1, /* NCLASS */ - 1, /* XCLASS - variable length */ - 0, /* REF */ - 0, /* REFI */ - 0, /* DNREF */ - 0, /* DNREFI */ - 0, /* RECURSE */ - 0, /* CALLOUT */ - 0, /* CALLOUT_STR */ - 0, /* Alt */ - 0, /* Ket */ - 0, /* KetRmax */ - 0, /* KetRmin */ - 0, /* KetRpos */ - 0, /* Reverse */ - 0, /* Assert */ - 0, /* Assert not */ - 0, /* Assert behind */ - 0, /* Assert behind not */ - 0, /* NA assert */ - 0, /* NA assert behind */ - 0, /* ONCE */ - 0, /* SCRIPT_RUN */ - 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */ - 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */ - 0, 0, /* CREF, DNCREF */ - 0, 0, /* RREF, DNRREF */ - 0, 0, /* FALSE, TRUE */ - 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ - 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */ - 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */ - 0, 0, /* COMMIT, COMMIT_ARG */ - 0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */ - 0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */ -}; - -/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W, -and \w */ - -static const uint8_t toptable1[] = { - 0, 0, 0, 0, 0, 0, - 
ctype_digit, ctype_digit, - ctype_space, ctype_space, - ctype_word, ctype_word, - 0, 0 /* OP_ANY, OP_ALLANY */ -}; - -static const uint8_t toptable2[] = { - 0, 0, 0, 0, 0, 0, - ctype_digit, 0, - ctype_space, 0, - ctype_word, 0, - 1, 1 /* OP_ANY, OP_ALLANY */ -}; - - -/* Structure for holding data about a particular state, which is in effect the -current data for an active path through the match tree. It must consist -entirely of ints because the working vector we are passed, and which we put -these structures in, is a vector of ints. */ - -typedef struct stateblock { - int offset; /* Offset to opcode (-ve has meaning) */ - int count; /* Count for repeats */ - int data; /* Some use extra data */ -} stateblock; - -#define INTS_PER_STATEBLOCK (int)(sizeof(stateblock)/sizeof(int)) - - -/* Before version 10.32 the recursive calls of internal_dfa_match() were passed -local working space and output vectors that were created on the stack. This has -caused issues for some patterns, especially in small-stack environments such as -Windows. A new scheme is now in use which sets up a vector on the stack, but if -this is too small, heap memory is used, up to the heap_limit. The main -parameters are all numbers of ints because the workspace is a vector of ints. - -The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is -defined in pcre2_internal.h so as to be available to pcre2test when it is -finding the minimum heap requirement for a match. */ - -#define OVEC_UNIT (sizeof(PCRE2_SIZE)/sizeof(int)) - -#define RWS_BASE_SIZE (DFA_START_RWS_SIZE/sizeof(int)) /* Stack vector */ -#define RWS_RSIZE 1000 /* Work size for recursion */ -#define RWS_OVEC_RSIZE (1000*OVEC_UNIT) /* Ovector for recursion */ -#define RWS_OVEC_OSIZE (2*OVEC_UNIT) /* Ovector in other cases */ - -/* This structure is at the start of each workspace block. 
*/ - -typedef struct RWS_anchor { - struct RWS_anchor *next; - uint32_t size; /* Number of ints */ - uint32_t free; /* Number of ints */ -} RWS_anchor; - -#define RWS_ANCHOR_SIZE (sizeof(RWS_anchor)/sizeof(int)) - - - -/************************************************* -* Process a callout * -*************************************************/ - -/* This function is called to perform a callout. - -Arguments: - code current code pointer - offsets points to current capture offsets - current_subject start of current subject match - ptr current position in subject - mb the match block - extracode extra code offset when called from condition - lengthptr where to return the callout length - -Returns: the return from the callout -*/ - -static int -do_callout(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject, - PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode, - PCRE2_SIZE *lengthptr) -{ -pcre2_callout_block *cb = mb->cb; - -*lengthptr = (code[extracode] == OP_CALLOUT)? - (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] : - (PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode); - -if (mb->callout == NULL) return 0; /* No callout provided */ - -/* Fixed fields in the callout block are set once and for all at the start of -matching. 
*/ - -cb->offset_vector = offsets; -cb->start_match = (PCRE2_SIZE)(current_subject - mb->start_subject); -cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject); -cb->pattern_position = GET(code, 1 + extracode); -cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode); - -if (code[extracode] == OP_CALLOUT) - { - cb->callout_number = code[1 + 2*LINK_SIZE + extracode]; - cb->callout_string_offset = 0; - cb->callout_string = NULL; - cb->callout_string_length = 0; - } -else - { - cb->callout_number = 0; - cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode); - cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1; - cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2; - } - -return (mb->callout)(cb, mb->callout_data); -} - - - -/************************************************* -* Expand local workspace memory * -*************************************************/ - -/* This function is called when internal_dfa_match() is about to be called -recursively and there is insufficient working space left in the current -workspace block. If there's an existing next block, use it; otherwise get a new -block unless the heap limit is reached. - -Arguments: - rwsptr pointer to block pointer (updated) - ovecsize space needed for an ovector - mb the match block - -Returns: 0 rwsptr has been updated - !0 an error code -*/ - -static int -more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb) -{ -RWS_anchor *rws = *rwsptr; -RWS_anchor *new; - -if (rws->next != NULL) - { - new = rws->next; - } - -/* Sizes in the RWS_anchor blocks are in units of sizeof(int), but -mb->heap_limit and mb->heap_used are in kibibytes. Play carefully, to avoid -overflow. */ - -else - { - uint32_t newsize = (rws->size >= UINT32_MAX/2)? 
UINT32_MAX/2 : rws->size * 2; - uint32_t newsizeK = newsize/(1024/sizeof(int)); - - if (newsizeK + mb->heap_used > mb->heap_limit) - newsizeK = (uint32_t)(mb->heap_limit - mb->heap_used); - newsize = newsizeK*(1024/sizeof(int)); - - if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE) - return PCRE2_ERROR_HEAPLIMIT; - new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data); - if (new == NULL) return PCRE2_ERROR_NOMEMORY; - mb->heap_used += newsizeK; - new->next = NULL; - new->size = newsize; - rws->next = new; - } - -new->free = new->size - RWS_ANCHOR_SIZE; -*rwsptr = new; -return 0; -} - - - -/************************************************* -* Match a Regular Expression - DFA engine * -*************************************************/ - -/* This internal function applies a compiled pattern to a subject string, -starting at a given point, using a DFA engine. This function is called from the -external one, possibly multiple times if the pattern is not anchored. The -function calls itself recursively for some kinds of subpattern. - -Arguments: - mb the match_data block with fixed information - this_start_code the opening bracket of this subexpression's code - current_subject where we currently are in the subject string - start_offset start offset in the subject string - offsets vector to contain the matching string offsets - offsetcount size of same - workspace vector of workspace - wscount size of same - rlevel function call recursion level - -Returns: > 0 => number of match offset pairs placed in offsets - = 0 => offsets overflowed; longest matches are present - -1 => failed to match - < -1 => some kind of unexpected problem - -The following macros are used for adding states to the two state vectors (one -for the current character, one for the following character). 
*/ - -#define ADD_ACTIVE(x,y) \ - if (active_count++ < wscount) \ - { \ - next_active_state->offset = (x); \ - next_active_state->count = (y); \ - next_active_state++; \ - } \ - else return PCRE2_ERROR_DFA_WSSIZE - -#define ADD_ACTIVE_DATA(x,y,z) \ - if (active_count++ < wscount) \ - { \ - next_active_state->offset = (x); \ - next_active_state->count = (y); \ - next_active_state->data = (z); \ - next_active_state++; \ - } \ - else return PCRE2_ERROR_DFA_WSSIZE - -#define ADD_NEW(x,y) \ - if (new_count++ < wscount) \ - { \ - next_new_state->offset = (x); \ - next_new_state->count = (y); \ - next_new_state++; \ - } \ - else return PCRE2_ERROR_DFA_WSSIZE - -#define ADD_NEW_DATA(x,y,z) \ - if (new_count++ < wscount) \ - { \ - next_new_state->offset = (x); \ - next_new_state->count = (y); \ - next_new_state->data = (z); \ - next_new_state++; \ - } \ - else return PCRE2_ERROR_DFA_WSSIZE - -/* And now, here is the code */ - -static int -internal_dfa_match( - dfa_match_block *mb, - PCRE2_SPTR this_start_code, - PCRE2_SPTR current_subject, - PCRE2_SIZE start_offset, - PCRE2_SIZE *offsets, - uint32_t offsetcount, - int *workspace, - int wscount, - uint32_t rlevel, - int *RWS) -{ -stateblock *active_states, *new_states, *temp_states; -stateblock *next_active_state, *next_new_state; -const uint8_t *ctypes, *lcc, *fcc; -PCRE2_SPTR ptr; -PCRE2_SPTR end_code; -dfa_recursion_info new_recursive; -int active_count, new_count, match_count; - -/* Some fields in the mb block are frequently referenced, so we load them into -independent variables in the hope that this will perform better. 
*/ - -PCRE2_SPTR start_subject = mb->start_subject; -PCRE2_SPTR end_subject = mb->end_subject; -PCRE2_SPTR start_code = mb->start_code; - -#ifdef SUPPORT_UNICODE -BOOL utf = (mb->poptions & PCRE2_UTF) != 0; -BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0; -#else -BOOL utf = FALSE; -#endif - -BOOL reset_could_continue = FALSE; - -if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT; -if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT; -offsetcount &= (uint32_t)(-2); /* Round down */ - -wscount -= 2; -wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) / - (2 * INTS_PER_STATEBLOCK); - -ctypes = mb->tables + ctypes_offset; -lcc = mb->tables + lcc_offset; -fcc = mb->tables + fcc_offset; - -match_count = PCRE2_ERROR_NOMATCH; /* A negative number */ - -active_states = (stateblock *)(workspace + 2); -next_new_state = new_states = active_states + wscount; -new_count = 0; - -/* The first thing in any (sub) pattern is a bracket of some sort. Push all -the alternative states onto the list, and find out where the end is. This -makes is possible to use this function recursively, when we want to stop at a -matching internal ket rather than at the end. - -If we are dealing with a backward assertion we have to find out the maximum -amount to move back, and set up each alternative appropriately. */ - -if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT) - { - size_t max_back = 0; - size_t gone_back; - - end_code = this_start_code; - do - { - size_t back = (size_t)GET(end_code, 2+LINK_SIZE); - if (back > max_back) max_back = back; - end_code += GET(end_code, 1); - } - while (*end_code == OP_ALT); - - /* If we can't go back the amount required for the longest lookbehind - pattern, go back as far as we can; some alternatives may still be viable. 
*/ - -#ifdef SUPPORT_UNICODE - /* In character mode we have to step back character by character */ - - if (utf) - { - for (gone_back = 0; gone_back < max_back; gone_back++) - { - if (current_subject <= start_subject) break; - current_subject--; - ACROSSCHAR(current_subject > start_subject, current_subject, - current_subject--); - } - } - else -#endif - - /* In byte-mode we can do this quickly. */ - - { - size_t current_offset = (size_t)(current_subject - start_subject); - gone_back = (current_offset < max_back)? current_offset : max_back; - current_subject -= gone_back; - } - - /* Save the earliest consulted character */ - - if (current_subject < mb->start_used_ptr) - mb->start_used_ptr = current_subject; - - /* Now we can process the individual branches. There will be an OP_REVERSE at - the start of each branch, except when the length of the branch is zero. */ - - end_code = this_start_code; - do - { - uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + LINK_SIZE : 0; - size_t back = (revlen == 0)? 0 : (size_t)GET(end_code, 2+LINK_SIZE); - if (back <= gone_back) - { - int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen); - ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back)); - } - end_code += GET(end_code, 1); - } - while (*end_code == OP_ALT); - } - -/* This is the code for a "normal" subpattern (not a backward assertion). The -start of a whole pattern is always one of these. If we are at the top level, -we may be asked to restart matching from the same point that we reached for a -previous partial match. We still have to scan through the top-level branches to -find the end state. 
*/ - -else - { - end_code = this_start_code; - - /* Restarting */ - - if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0) - { - do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT); - new_count = workspace[1]; - if (!workspace[0]) - memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock)); - } - - /* Not restarting */ - - else - { - int length = 1 + LINK_SIZE + - ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA || - *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS) - ? IMM2_SIZE:0); - do - { - ADD_NEW((int)(end_code - start_code + length), 0); - end_code += GET(end_code, 1); - length = 1 + LINK_SIZE; - } - while (*end_code == OP_ALT); - } - } - -workspace[0] = 0; /* Bit indicating which vector is current */ - -/* Loop for scanning the subject */ - -ptr = current_subject; -for (;;) - { - int i, j; - int clen, dlen; - uint32_t c, d; - int forced_fail = 0; - BOOL partial_newline = FALSE; - BOOL could_continue = reset_could_continue; - reset_could_continue = FALSE; - - if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr; - - /* Make the new state list into the active state list and empty the - new state list. */ - - temp_states = active_states; - active_states = new_states; - new_states = temp_states; - active_count = new_count; - new_count = 0; - - workspace[0] ^= 1; /* Remember for the restarting feature */ - workspace[1] = active_count; - - /* Set the pointers for adding new states */ - - next_active_state = active_states + active_count; - next_new_state = new_states; - - /* Load the current character from the subject outside the loop, as many - different states may want to look at it, and we assume that at least one - will. 
*/ - - if (ptr < end_subject) - { - clen = 1; /* Number of data items in the character */ -#ifdef SUPPORT_UNICODE - GETCHARLENTEST(c, ptr, clen); -#else - c = *ptr; -#endif /* SUPPORT_UNICODE */ - } - else - { - clen = 0; /* This indicates the end of the subject */ - c = NOTACHAR; /* This value should never actually be used */ - } - - /* Scan up the active states and act on each one. The result of an action - may be to add more states to the currently active list (e.g. on hitting a - parenthesis) or it may be to put states on the new list, for considering - when we move the character pointer on. */ - - for (i = 0; i < active_count; i++) - { - stateblock *current_state = active_states + i; - BOOL caseless = FALSE; - PCRE2_SPTR code; - uint32_t codevalue; - int state_offset = current_state->offset; - int rrc; - int count; - - /* A negative offset is a special case meaning "hold off going to this - (negated) state until the number of characters in the data field have - been skipped". If the could_continue flag was passed over from a previous - state, arrange for it to passed on. */ - - if (state_offset < 0) - { - if (current_state->data > 0) - { - ADD_NEW_DATA(state_offset, current_state->count, - current_state->data - 1); - if (could_continue) reset_could_continue = TRUE; - continue; - } - else - { - current_state->offset = state_offset = -state_offset; - } - } - - /* Check for a duplicate state with the same count, and skip if found. - See the note at the head of this module about the possibility of improving - performance here. */ - - for (j = 0; j < i; j++) - { - if (active_states[j].offset == state_offset && - active_states[j].count == current_state->count) - goto NEXT_ACTIVE_STATE; - } - - /* The state offset is the offset to the opcode */ - - code = start_code + state_offset; - codevalue = *code; - - /* If this opcode inspects a character, but we are at the end of the - subject, remember the fact for use when testing for a partial match. 
*/ - - if (clen == 0 && poptable[codevalue] != 0) - could_continue = TRUE; - - /* If this opcode is followed by an inline character, load it. It is - tempting to test for the presence of a subject character here, but that - is wrong, because sometimes zero repetitions of the subject are - permitted. - - We also use this mechanism for opcodes such as OP_TYPEPLUS that take an - argument that is not a data character - but is always one byte long because - the values are small. We have to take special action to deal with \P, \p, - \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert - these ones to new opcodes. */ - - if (coptable[codevalue] > 0) - { - dlen = 1; -#ifdef SUPPORT_UNICODE - if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else -#endif /* SUPPORT_UNICODE */ - d = code[coptable[codevalue]]; - if (codevalue >= OP_TYPESTAR) - { - switch(d) - { - case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM; - case OP_NOTPROP: - case OP_PROP: codevalue += OP_PROP_EXTRA; break; - case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break; - case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break; - case OP_NOT_HSPACE: - case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break; - case OP_NOT_VSPACE: - case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break; - default: break; - } - } - } - else - { - dlen = 0; /* Not strictly necessary, but compilers moan */ - d = NOTACHAR; /* if these variables are not set. */ - } - - - /* Now process the individual opcodes */ - - switch (codevalue) - { -/* ========================================================================== */ - /* These cases are never obeyed. This is a fudge that causes a compile- - time error if the vectors coptable or poptable, which are indexed by - opcode, are not the correct length. It seems to be the only way to do - such a check at compile time, as the sizeof() operator does not work - in the C preprocessor. 
*/ - - case OP_TABLE_LENGTH: - case OP_TABLE_LENGTH + - ((sizeof(coptable) == OP_TABLE_LENGTH) && - (sizeof(poptable) == OP_TABLE_LENGTH)): - return 0; - -/* ========================================================================== */ - /* Reached a closing bracket. If not at the end of the pattern, carry - on with the next opcode. For repeating opcodes, also add the repeat - state. Note that KETRPOS will always be encountered at the end of the - subpattern, because the possessive subpattern repeats are always handled - using recursive calls. Thus, it never adds any new states. - - At the end of the (sub)pattern, unless we have an empty string and - PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the - start of the subject, save the match data, shifting up all previous - matches so we always have the longest first. */ - - case OP_KET: - case OP_KETRMIN: - case OP_KETRMAX: - case OP_KETRPOS: - if (code != end_code) - { - ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0); - if (codevalue != OP_KET) - { - ADD_ACTIVE(state_offset - (int)GET(code, 1), 0); - } - } - else - { - if (ptr > current_subject || - ((mb->moptions & PCRE2_NOTEMPTY) == 0 && - ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 || - current_subject > start_subject + mb->start_offset))) - { - if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0; - else if (match_count > 0 && ++match_count * 2 > (int)offsetcount) - match_count = 0; - count = ((match_count == 0)? 
(int)offsetcount : match_count * 2) - 2; - if (count > 0) (void)memmove(offsets + 2, offsets, - (size_t)count * sizeof(PCRE2_SIZE)); - if (offsetcount >= 2) - { - offsets[0] = (PCRE2_SIZE)(current_subject - start_subject); - offsets[1] = (PCRE2_SIZE)(ptr - start_subject); - } - if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count; - } - } - break; - -/* ========================================================================== */ - /* These opcodes add to the current list of states without looking - at the current character. */ - - /*-----------------------------------------------------------------*/ - case OP_ALT: - do { code += GET(code, 1); } while (*code == OP_ALT); - ADD_ACTIVE((int)(code - start_code), 0); - break; - - /*-----------------------------------------------------------------*/ - case OP_BRA: - case OP_SBRA: - do - { - ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); - code += GET(code, 1); - } - while (*code == OP_ALT); - break; - - /*-----------------------------------------------------------------*/ - case OP_CBRA: - case OP_SCBRA: - ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0); - code += GET(code, 1); - while (*code == OP_ALT) - { - ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); - code += GET(code, 1); - } - break; - - /*-----------------------------------------------------------------*/ - case OP_BRAZERO: - case OP_BRAMINZERO: - ADD_ACTIVE(state_offset + 1, 0); - code += 1 + GET(code, 2); - while (*code == OP_ALT) code += GET(code, 1); - ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); - break; - - /*-----------------------------------------------------------------*/ - case OP_SKIPZERO: - code += 1 + GET(code, 2); - while (*code == OP_ALT) code += GET(code, 1); - ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); - break; - - /*-----------------------------------------------------------------*/ - case OP_CIRC: - if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) - { 
ADD_ACTIVE(state_offset + 1, 0); } - break; - - /*-----------------------------------------------------------------*/ - case OP_CIRCM: - if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) || - ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 ) - && WAS_NEWLINE(ptr))) - { ADD_ACTIVE(state_offset + 1, 0); } - break; - - /*-----------------------------------------------------------------*/ - case OP_EOD: - if (ptr >= end_subject) - { - if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0) - return PCRE2_ERROR_PARTIAL; - else { ADD_ACTIVE(state_offset + 1, 0); } - } - break; - - /*-----------------------------------------------------------------*/ - case OP_SOD: - if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); } - break; - - /*-----------------------------------------------------------------*/ - case OP_SOM: - if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); } - break; - - -/* ========================================================================== */ - /* These opcodes inspect the next subject character, and sometimes - the previous one as well, but do not have an argument. The variable - clen contains the length of the current character and is zero if we are - at the end of the subject. 
*/ - - /*-----------------------------------------------------------------*/ - case OP_ANY: - if (clen > 0 && !IS_NEWLINE(ptr)) - { - if (ptr + 1 >= mb->end_subject && - (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 && - NLBLOCK->nltype == NLTYPE_FIXED && - NLBLOCK->nllen == 2 && - c == NLBLOCK->nl[0]) - { - could_continue = partial_newline = TRUE; - } - else - { - ADD_NEW(state_offset + 1, 0); - } - } - break; - - /*-----------------------------------------------------------------*/ - case OP_ALLANY: - if (clen > 0) - { ADD_NEW(state_offset + 1, 0); } - break; - - /*-----------------------------------------------------------------*/ - case OP_EODN: - if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen)) - { - if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0) - return PCRE2_ERROR_PARTIAL; - ADD_ACTIVE(state_offset + 1, 0); - } - break; - - /*-----------------------------------------------------------------*/ - case OP_DOLL: - if ((mb->moptions & PCRE2_NOTEOL) == 0) - { - if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0) - could_continue = TRUE; - else if (clen == 0 || - ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) && - (ptr == end_subject - mb->nllen) - )) - { ADD_ACTIVE(state_offset + 1, 0); } - else if (ptr + 1 >= mb->end_subject && - (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 && - NLBLOCK->nltype == NLTYPE_FIXED && - NLBLOCK->nllen == 2 && - c == NLBLOCK->nl[0]) - { - if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0) - { - reset_could_continue = TRUE; - ADD_NEW_DATA(-(state_offset + 1), 0, 1); - } - else could_continue = partial_newline = TRUE; - } - } - break; - - /*-----------------------------------------------------------------*/ - case OP_DOLLM: - if ((mb->moptions & PCRE2_NOTEOL) == 0) - { - if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0) - could_continue = TRUE; - else if (clen == 0 || - ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr))) - { ADD_ACTIVE(state_offset + 1, 0); } - 
else if (ptr + 1 >= mb->end_subject && - (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 && - NLBLOCK->nltype == NLTYPE_FIXED && - NLBLOCK->nllen == 2 && - c == NLBLOCK->nl[0]) - { - if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0) - { - reset_could_continue = TRUE; - ADD_NEW_DATA(-(state_offset + 1), 0, 1); - } - else could_continue = partial_newline = TRUE; - } - } - else if (IS_NEWLINE(ptr)) - { ADD_ACTIVE(state_offset + 1, 0); } - break; - - /*-----------------------------------------------------------------*/ - - case OP_DIGIT: - case OP_WHITESPACE: - case OP_WORDCHAR: - if (clen > 0 && c < 256 && - ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0) - { ADD_NEW(state_offset + 1, 0); } - break; - - /*-----------------------------------------------------------------*/ - case OP_NOT_DIGIT: - case OP_NOT_WHITESPACE: - case OP_NOT_WORDCHAR: - if (clen > 0 && (c >= 256 || - ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)) - { ADD_NEW(state_offset + 1, 0); } - break; - - /*-----------------------------------------------------------------*/ - case OP_WORD_BOUNDARY: - case OP_NOT_WORD_BOUNDARY: - { - int left_word, right_word; - - if (ptr > start_subject) - { - PCRE2_SPTR temp = ptr - 1; - if (temp < mb->start_used_ptr) mb->start_used_ptr = temp; -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 - if (utf) { BACKCHAR(temp); } -#endif - GETCHARTEST(d, temp); -#ifdef SUPPORT_UNICODE - if ((mb->poptions & PCRE2_UCP) != 0) - { - if (d == '_') left_word = TRUE; else - { - uint32_t cat = UCD_CATEGORY(d); - left_word = (cat == ucp_L || cat == ucp_N); - } - } - else -#endif - left_word = d < 256 && (ctypes[d] & ctype_word) != 0; - } - else left_word = FALSE; - - if (clen > 0) - { - if (ptr >= mb->last_used_ptr) - { - PCRE2_SPTR temp = ptr + 1; -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 - if (utf) { FORWARDCHARTEST(temp, mb->end_subject); } -#endif - mb->last_used_ptr = temp; - } -#ifdef SUPPORT_UNICODE - if 
((mb->poptions & PCRE2_UCP) != 0) - { - if (c == '_') right_word = TRUE; else - { - uint32_t cat = UCD_CATEGORY(c); - right_word = (cat == ucp_L || cat == ucp_N); - } - } - else -#endif - right_word = c < 256 && (ctypes[c] & ctype_word) != 0; - } - else right_word = FALSE; - - if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY)) - { ADD_ACTIVE(state_offset + 1, 0); } - } - break; - - - /*-----------------------------------------------------------------*/ - /* Check the next character by Unicode property. We will get here only - if the support is in the binary; otherwise a compile-time error occurs. - */ - -#ifdef SUPPORT_UNICODE - case OP_PROP: - case OP_NOTPROP: - if (clen > 0) - { - BOOL OK; - const uint32_t *cp; - const ucd_record * prop = GET_UCD(c); - switch(code[1]) - { - case PT_ANY: - OK = TRUE; - break; - - case PT_LAMP: - OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || - prop->chartype == ucp_Lt; - break; - - case PT_GC: - OK = PRIV(ucp_gentype)[prop->chartype] == code[2]; - break; - - case PT_PC: - OK = prop->chartype == code[2]; - break; - - case PT_SC: - OK = prop->script == code[2]; - break; - - /* These are specials for combination cases. */ - - case PT_ALNUM: - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N; - break; - - /* Perl space used to exclude VT, but from Perl 5.18 it is included, - which means that Perl space and POSIX space are now identical. PCRE - was changed at release 8.34. 
*/ - - case PT_SPACE: /* Perl space */ - case PT_PXSPACE: /* POSIX space */ - switch(c) - { - HSPACE_CASES: - VSPACE_CASES: - OK = TRUE; - break; - - default: - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; - break; - } - break; - - case PT_WORD: - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N || - c == CHAR_UNDERSCORE; - break; - - case PT_CLIST: - cp = PRIV(ucd_caseless_sets) + code[2]; - for (;;) - { - if (c < *cp) { OK = FALSE; break; } - if (c == *cp++) { OK = TRUE; break; } - } - break; - - case PT_UCNC: - OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || - c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || - c >= 0xe000; - break; - - /* Should never occur, but keep compilers from grumbling. */ - - default: - OK = codevalue != OP_PROP; - break; - } - - if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); } - } - break; -#endif - - - -/* ========================================================================== */ - /* These opcodes likewise inspect the subject character, but have an - argument that is not a data character. It is one of these opcodes: - OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, - OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. 
*/ - - case OP_TYPEPLUS: - case OP_TYPEMINPLUS: - case OP_TYPEPOSPLUS: - count = current_state->count; /* Already matched */ - if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } - if (clen > 0) - { - if (d == OP_ANY && ptr + 1 >= mb->end_subject && - (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 && - NLBLOCK->nltype == NLTYPE_FIXED && - NLBLOCK->nllen == 2 && - c == NLBLOCK->nl[0]) - { - could_continue = partial_newline = TRUE; - } - else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || - (c < 256 && - (d != OP_ANY || !IS_NEWLINE(ptr)) && - ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) - { - if (count > 0 && codevalue == OP_TYPEPOSPLUS) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - count++; - ADD_NEW(state_offset, count); - } - } - break; - - /*-----------------------------------------------------------------*/ - case OP_TYPEQUERY: - case OP_TYPEMINQUERY: - case OP_TYPEPOSQUERY: - ADD_ACTIVE(state_offset + 2, 0); - if (clen > 0) - { - if (d == OP_ANY && ptr + 1 >= mb->end_subject && - (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 && - NLBLOCK->nltype == NLTYPE_FIXED && - NLBLOCK->nllen == 2 && - c == NLBLOCK->nl[0]) - { - could_continue = partial_newline = TRUE; - } - else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || - (c < 256 && - (d != OP_ANY || !IS_NEWLINE(ptr)) && - ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) - { - if (codevalue == OP_TYPEPOSQUERY) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - ADD_NEW(state_offset + 2, 0); - } - } - break; - - /*-----------------------------------------------------------------*/ - case OP_TYPESTAR: - case OP_TYPEMINSTAR: - case OP_TYPEPOSSTAR: - ADD_ACTIVE(state_offset + 2, 0); - if (clen > 0) - { - if (d == OP_ANY && ptr + 1 >= mb->end_subject && - (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 && - NLBLOCK->nltype == NLTYPE_FIXED && - NLBLOCK->nllen == 2 && - c == NLBLOCK->nl[0]) - { 
- could_continue = partial_newline = TRUE; - } - else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || - (c < 256 && - (d != OP_ANY || !IS_NEWLINE(ptr)) && - ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) - { - if (codevalue == OP_TYPEPOSSTAR) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - ADD_NEW(state_offset, 0); - } - } - break; - - /*-----------------------------------------------------------------*/ - case OP_TYPEEXACT: - count = current_state->count; /* Number already matched */ - if (clen > 0) - { - if (d == OP_ANY && ptr + 1 >= mb->end_subject && - (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 && - NLBLOCK->nltype == NLTYPE_FIXED && - NLBLOCK->nllen == 2 && - c == NLBLOCK->nl[0]) - { - could_continue = partial_newline = TRUE; - } - else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || - (c < 256 && - (d != OP_ANY || !IS_NEWLINE(ptr)) && - ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) - { - if (++count >= (int)GET2(code, 1)) - { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); } - else - { ADD_NEW(state_offset, count); } - } - } - break; - - /*-----------------------------------------------------------------*/ - case OP_TYPEUPTO: - case OP_TYPEMINUPTO: - case OP_TYPEPOSUPTO: - ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); - count = current_state->count; /* Number already matched */ - if (clen > 0) - { - if (d == OP_ANY && ptr + 1 >= mb->end_subject && - (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 && - NLBLOCK->nltype == NLTYPE_FIXED && - NLBLOCK->nllen == 2 && - c == NLBLOCK->nl[0]) - { - could_continue = partial_newline = TRUE; - } - else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || - (c < 256 && - (d != OP_ANY || !IS_NEWLINE(ptr)) && - ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) - { - if (codevalue == OP_TYPEPOSUPTO) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - if (++count >= 
(int)GET2(code, 1)) - { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); } - else - { ADD_NEW(state_offset, count); } - } - } - break; - -/* ========================================================================== */ - /* These are virtual opcodes that are used when something like - OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its - argument. It keeps the code above fast for the other cases. The argument - is in the d variable. */ - -#ifdef SUPPORT_UNICODE - case OP_PROP_EXTRA + OP_TYPEPLUS: - case OP_PROP_EXTRA + OP_TYPEMINPLUS: - case OP_PROP_EXTRA + OP_TYPEPOSPLUS: - count = current_state->count; /* Already matched */ - if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); } - if (clen > 0) - { - BOOL OK; - const uint32_t *cp; - const ucd_record * prop = GET_UCD(c); - switch(code[2]) - { - case PT_ANY: - OK = TRUE; - break; - - case PT_LAMP: - OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || - prop->chartype == ucp_Lt; - break; - - case PT_GC: - OK = PRIV(ucp_gentype)[prop->chartype] == code[3]; - break; - - case PT_PC: - OK = prop->chartype == code[3]; - break; - - case PT_SC: - OK = prop->script == code[3]; - break; - - /* These are specials for combination cases. */ - - case PT_ALNUM: - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N; - break; - - /* Perl space used to exclude VT, but from Perl 5.18 it is included, - which means that Perl space and POSIX space are now identical. PCRE - was changed at release 8.34. 
*/ - - case PT_SPACE: /* Perl space */ - case PT_PXSPACE: /* POSIX space */ - switch(c) - { - HSPACE_CASES: - VSPACE_CASES: - OK = TRUE; - break; - - default: - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; - break; - } - break; - - case PT_WORD: - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N || - c == CHAR_UNDERSCORE; - break; - - case PT_CLIST: - cp = PRIV(ucd_caseless_sets) + code[3]; - for (;;) - { - if (c < *cp) { OK = FALSE; break; } - if (c == *cp++) { OK = TRUE; break; } - } - break; - - case PT_UCNC: - OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || - c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || - c >= 0xe000; - break; - - /* Should never occur, but keep compilers from grumbling. */ - - default: - OK = codevalue != OP_PROP; - break; - } - - if (OK == (d == OP_PROP)) - { - if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - count++; - ADD_NEW(state_offset, count); - } - } - break; - - /*-----------------------------------------------------------------*/ - case OP_EXTUNI_EXTRA + OP_TYPEPLUS: - case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS: - case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS: - count = current_state->count; /* Already matched */ - if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } - if (clen > 0) - { - int ncount = 0; - if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf, - &ncount); - count++; - ADD_NEW_DATA(-state_offset, count, ncount); - } - break; -#endif - - /*-----------------------------------------------------------------*/ - case OP_ANYNL_EXTRA + OP_TYPEPLUS: - case OP_ANYNL_EXTRA + OP_TYPEMINPLUS: - case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS: - count = current_state->count; /* Already matched */ - if (count > 0) { 
ADD_ACTIVE(state_offset + 2, 0); } - if (clen > 0) - { - int ncount = 0; - switch (c) - { - case CHAR_VT: - case CHAR_FF: - case CHAR_NEL: -#ifndef EBCDIC - case 0x2028: - case 0x2029: -#endif /* Not EBCDIC */ - if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break; - goto ANYNL01; - - case CHAR_CR: - if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1; - /* Fall through */ - - ANYNL01: - case CHAR_LF: - if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - count++; - ADD_NEW_DATA(-state_offset, count, ncount); - break; - - default: - break; - } - } - break; - - /*-----------------------------------------------------------------*/ - case OP_VSPACE_EXTRA + OP_TYPEPLUS: - case OP_VSPACE_EXTRA + OP_TYPEMINPLUS: - case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS: - count = current_state->count; /* Already matched */ - if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } - if (clen > 0) - { - BOOL OK; - switch (c) - { - VSPACE_CASES: - OK = TRUE; - break; - - default: - OK = FALSE; - break; - } - - if (OK == (d == OP_VSPACE)) - { - if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - count++; - ADD_NEW_DATA(-state_offset, count, 0); - } - } - break; - - /*-----------------------------------------------------------------*/ - case OP_HSPACE_EXTRA + OP_TYPEPLUS: - case OP_HSPACE_EXTRA + OP_TYPEMINPLUS: - case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS: - count = current_state->count; /* Already matched */ - if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } - if (clen > 0) - { - BOOL OK; - switch (c) - { - HSPACE_CASES: - OK = TRUE; - break; - - default: - OK = FALSE; - break; - } - - if (OK == (d == OP_HSPACE)) - { - if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - count++; - 
ADD_NEW_DATA(-state_offset, count, 0); - } - } - break; - - /*-----------------------------------------------------------------*/ -#ifdef SUPPORT_UNICODE - case OP_PROP_EXTRA + OP_TYPEQUERY: - case OP_PROP_EXTRA + OP_TYPEMINQUERY: - case OP_PROP_EXTRA + OP_TYPEPOSQUERY: - count = 4; - goto QS1; - - case OP_PROP_EXTRA + OP_TYPESTAR: - case OP_PROP_EXTRA + OP_TYPEMINSTAR: - case OP_PROP_EXTRA + OP_TYPEPOSSTAR: - count = 0; - - QS1: - - ADD_ACTIVE(state_offset + 4, 0); - if (clen > 0) - { - BOOL OK; - const uint32_t *cp; - const ucd_record * prop = GET_UCD(c); - switch(code[2]) - { - case PT_ANY: - OK = TRUE; - break; - - case PT_LAMP: - OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || - prop->chartype == ucp_Lt; - break; - - case PT_GC: - OK = PRIV(ucp_gentype)[prop->chartype] == code[3]; - break; - - case PT_PC: - OK = prop->chartype == code[3]; - break; - - case PT_SC: - OK = prop->script == code[3]; - break; - - /* These are specials for combination cases. */ - - case PT_ALNUM: - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N; - break; - - /* Perl space used to exclude VT, but from Perl 5.18 it is included, - which means that Perl space and POSIX space are now identical. PCRE - was changed at release 8.34. 
*/ - - case PT_SPACE: /* Perl space */ - case PT_PXSPACE: /* POSIX space */ - switch(c) - { - HSPACE_CASES: - VSPACE_CASES: - OK = TRUE; - break; - - default: - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; - break; - } - break; - - case PT_WORD: - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N || - c == CHAR_UNDERSCORE; - break; - - case PT_CLIST: - cp = PRIV(ucd_caseless_sets) + code[3]; - for (;;) - { - if (c < *cp) { OK = FALSE; break; } - if (c == *cp++) { OK = TRUE; break; } - } - break; - - case PT_UCNC: - OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || - c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || - c >= 0xe000; - break; - - /* Should never occur, but keep compilers from grumbling. */ - - default: - OK = codevalue != OP_PROP; - break; - } - - if (OK == (d == OP_PROP)) - { - if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR || - codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - ADD_NEW(state_offset + count, 0); - } - } - break; - - /*-----------------------------------------------------------------*/ - case OP_EXTUNI_EXTRA + OP_TYPEQUERY: - case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY: - case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY: - count = 2; - goto QS2; - - case OP_EXTUNI_EXTRA + OP_TYPESTAR: - case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR: - case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR: - count = 0; - - QS2: - - ADD_ACTIVE(state_offset + 2, 0); - if (clen > 0) - { - int ncount = 0; - if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR || - codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf, - &ncount); - ADD_NEW_DATA(-(state_offset + count), 0, ncount); - } - break; -#endif - - /*-----------------------------------------------------------------*/ - case OP_ANYNL_EXTRA + 
OP_TYPEQUERY: - case OP_ANYNL_EXTRA + OP_TYPEMINQUERY: - case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY: - count = 2; - goto QS3; - - case OP_ANYNL_EXTRA + OP_TYPESTAR: - case OP_ANYNL_EXTRA + OP_TYPEMINSTAR: - case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR: - count = 0; - - QS3: - ADD_ACTIVE(state_offset + 2, 0); - if (clen > 0) - { - int ncount = 0; - switch (c) - { - case CHAR_VT: - case CHAR_FF: - case CHAR_NEL: -#ifndef EBCDIC - case 0x2028: - case 0x2029: -#endif /* Not EBCDIC */ - if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break; - goto ANYNL02; - - case CHAR_CR: - if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1; - /* Fall through */ - - ANYNL02: - case CHAR_LF: - if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR || - codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount); - break; - - default: - break; - } - } - break; - - /*-----------------------------------------------------------------*/ - case OP_VSPACE_EXTRA + OP_TYPEQUERY: - case OP_VSPACE_EXTRA + OP_TYPEMINQUERY: - case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY: - count = 2; - goto QS4; - - case OP_VSPACE_EXTRA + OP_TYPESTAR: - case OP_VSPACE_EXTRA + OP_TYPEMINSTAR: - case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR: - count = 0; - - QS4: - ADD_ACTIVE(state_offset + 2, 0); - if (clen > 0) - { - BOOL OK; - switch (c) - { - VSPACE_CASES: - OK = TRUE; - break; - - default: - OK = FALSE; - break; - } - if (OK == (d == OP_VSPACE)) - { - if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR || - codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - ADD_NEW_DATA(-(state_offset + (int)count), 0, 0); - } - } - break; - - /*-----------------------------------------------------------------*/ - case OP_HSPACE_EXTRA + OP_TYPEQUERY: - case OP_HSPACE_EXTRA + OP_TYPEMINQUERY: - case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY: - 
count = 2; - goto QS5; - - case OP_HSPACE_EXTRA + OP_TYPESTAR: - case OP_HSPACE_EXTRA + OP_TYPEMINSTAR: - case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR: - count = 0; - - QS5: - ADD_ACTIVE(state_offset + 2, 0); - if (clen > 0) - { - BOOL OK; - switch (c) - { - HSPACE_CASES: - OK = TRUE; - break; - - default: - OK = FALSE; - break; - } - - if (OK == (d == OP_HSPACE)) - { - if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR || - codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - ADD_NEW_DATA(-(state_offset + (int)count), 0, 0); - } - } - break; - - /*-----------------------------------------------------------------*/ -#ifdef SUPPORT_UNICODE - case OP_PROP_EXTRA + OP_TYPEEXACT: - case OP_PROP_EXTRA + OP_TYPEUPTO: - case OP_PROP_EXTRA + OP_TYPEMINUPTO: - case OP_PROP_EXTRA + OP_TYPEPOSUPTO: - if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT) - { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); } - count = current_state->count; /* Number already matched */ - if (clen > 0) - { - BOOL OK; - const uint32_t *cp; - const ucd_record * prop = GET_UCD(c); - switch(code[1 + IMM2_SIZE + 1]) - { - case PT_ANY: - OK = TRUE; - break; - - case PT_LAMP: - OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || - prop->chartype == ucp_Lt; - break; - - case PT_GC: - OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2]; - break; - - case PT_PC: - OK = prop->chartype == code[1 + IMM2_SIZE + 2]; - break; - - case PT_SC: - OK = prop->script == code[1 + IMM2_SIZE + 2]; - break; - - /* These are specials for combination cases. */ - - case PT_ALNUM: - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N; - break; - - /* Perl space used to exclude VT, but from Perl 5.18 it is included, - which means that Perl space and POSIX space are now identical. PCRE - was changed at release 8.34. 
*/ - - case PT_SPACE: /* Perl space */ - case PT_PXSPACE: /* POSIX space */ - switch(c) - { - HSPACE_CASES: - VSPACE_CASES: - OK = TRUE; - break; - - default: - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; - break; - } - break; - - case PT_WORD: - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N || - c == CHAR_UNDERSCORE; - break; - - case PT_CLIST: - cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2]; - for (;;) - { - if (c < *cp) { OK = FALSE; break; } - if (c == *cp++) { OK = TRUE; break; } - } - break; - - case PT_UCNC: - OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || - c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || - c >= 0xe000; - break; - - /* Should never occur, but keep compilers from grumbling. */ - - default: - OK = codevalue != OP_PROP; - break; - } - - if (OK == (d == OP_PROP)) - { - if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - if (++count >= (int)GET2(code, 1)) - { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); } - else - { ADD_NEW(state_offset, count); } - } - } - break; - - /*-----------------------------------------------------------------*/ - case OP_EXTUNI_EXTRA + OP_TYPEEXACT: - case OP_EXTUNI_EXTRA + OP_TYPEUPTO: - case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO: - case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO: - if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT) - { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } - count = current_state->count; /* Number already matched */ - if (clen > 0) - { - PCRE2_SPTR nptr; - int ncount = 0; - if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf, - &ncount); - if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0) - reset_could_continue = TRUE; - if (++count >= (int)GET2(code, 1)) - { 
ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); } - else - { ADD_NEW_DATA(-state_offset, count, ncount); } - } - break; -#endif - - /*-----------------------------------------------------------------*/ - case OP_ANYNL_EXTRA + OP_TYPEEXACT: - case OP_ANYNL_EXTRA + OP_TYPEUPTO: - case OP_ANYNL_EXTRA + OP_TYPEMINUPTO: - case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO: - if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT) - { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } - count = current_state->count; /* Number already matched */ - if (clen > 0) - { - int ncount = 0; - switch (c) - { - case CHAR_VT: - case CHAR_FF: - case CHAR_NEL: -#ifndef EBCDIC - case 0x2028: - case 0x2029: -#endif /* Not EBCDIC */ - if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break; - goto ANYNL03; - - case CHAR_CR: - if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1; - /* Fall through */ - - ANYNL03: - case CHAR_LF: - if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - if (++count >= (int)GET2(code, 1)) - { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); } - else - { ADD_NEW_DATA(-state_offset, count, ncount); } - break; - - default: - break; - } - } - break; - - /*-----------------------------------------------------------------*/ - case OP_VSPACE_EXTRA + OP_TYPEEXACT: - case OP_VSPACE_EXTRA + OP_TYPEUPTO: - case OP_VSPACE_EXTRA + OP_TYPEMINUPTO: - case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO: - if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT) - { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } - count = current_state->count; /* Number already matched */ - if (clen > 0) - { - BOOL OK; - switch (c) - { - VSPACE_CASES: - OK = TRUE; - break; - - default: - OK = FALSE; - } - - if (OK == (d == OP_VSPACE)) - { - if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - if (++count >= (int)GET2(code, 1)) - { 
ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); } - else - { ADD_NEW_DATA(-state_offset, count, 0); } - } - } - break; - - /*-----------------------------------------------------------------*/ - case OP_HSPACE_EXTRA + OP_TYPEEXACT: - case OP_HSPACE_EXTRA + OP_TYPEUPTO: - case OP_HSPACE_EXTRA + OP_TYPEMINUPTO: - case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO: - if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT) - { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } - count = current_state->count; /* Number already matched */ - if (clen > 0) - { - BOOL OK; - switch (c) - { - HSPACE_CASES: - OK = TRUE; - break; - - default: - OK = FALSE; - break; - } - - if (OK == (d == OP_HSPACE)) - { - if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - if (++count >= (int)GET2(code, 1)) - { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); } - else - { ADD_NEW_DATA(-state_offset, count, 0); } - } - } - break; - -/* ========================================================================== */ - /* These opcodes are followed by a character that is usually compared - to the current subject character; it is loaded into d. We still get - here even if there is no subject character, because in some cases zero - repetitions are permitted. 
*/ - - /*-----------------------------------------------------------------*/ - case OP_CHAR: - if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); } - break; - - /*-----------------------------------------------------------------*/ - case OP_CHARI: - if (clen == 0) break; - -#ifdef SUPPORT_UNICODE - if (utf_or_ucp) - { - if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else - { - unsigned int othercase; - if (c < 128) - othercase = fcc[c]; - else - othercase = UCD_OTHERCASE(c); - if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); } - } - } - else -#endif /* SUPPORT_UNICODE */ - /* Not UTF or UCP mode */ - { - if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d)) - { ADD_NEW(state_offset + 2, 0); } - } - break; - - -#ifdef SUPPORT_UNICODE - /*-----------------------------------------------------------------*/ - /* This is a tricky one because it can match more than one character. - Find out how many characters to skip, and then set up a negative state - to wait for them to pass before continuing. */ - - case OP_EXTUNI: - if (clen > 0) - { - int ncount = 0; - PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, - end_subject, utf, &ncount); - if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0) - reset_could_continue = TRUE; - ADD_NEW_DATA(-(state_offset + 1), 0, ncount); - } - break; -#endif - - /*-----------------------------------------------------------------*/ - /* This is a tricky like EXTUNI because it too can match more than one - character (when CR is followed by LF). In this case, set up a negative - state to wait for one character to pass before continuing. 
*/ - - case OP_ANYNL: - if (clen > 0) switch(c) - { - case CHAR_VT: - case CHAR_FF: - case CHAR_NEL: -#ifndef EBCDIC - case 0x2028: - case 0x2029: -#endif /* Not EBCDIC */ - if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break; - /* Fall through */ - - case CHAR_LF: - ADD_NEW(state_offset + 1, 0); - break; - - case CHAR_CR: - if (ptr + 1 >= end_subject) - { - ADD_NEW(state_offset + 1, 0); - if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0) - reset_could_continue = TRUE; - } - else if (UCHAR21TEST(ptr + 1) == CHAR_LF) - { - ADD_NEW_DATA(-(state_offset + 1), 0, 1); - } - else - { - ADD_NEW(state_offset + 1, 0); - } - break; - } - break; - - /*-----------------------------------------------------------------*/ - case OP_NOT_VSPACE: - if (clen > 0) switch(c) - { - VSPACE_CASES: - break; - - default: - ADD_NEW(state_offset + 1, 0); - break; - } - break; - - /*-----------------------------------------------------------------*/ - case OP_VSPACE: - if (clen > 0) switch(c) - { - VSPACE_CASES: - ADD_NEW(state_offset + 1, 0); - break; - - default: - break; - } - break; - - /*-----------------------------------------------------------------*/ - case OP_NOT_HSPACE: - if (clen > 0) switch(c) - { - HSPACE_CASES: - break; - - default: - ADD_NEW(state_offset + 1, 0); - break; - } - break; - - /*-----------------------------------------------------------------*/ - case OP_HSPACE: - if (clen > 0) switch(c) - { - HSPACE_CASES: - ADD_NEW(state_offset + 1, 0); - break; - - default: - break; - } - break; - - /*-----------------------------------------------------------------*/ - /* Match a negated single character casefully. */ - - case OP_NOT: - if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); } - break; - - /*-----------------------------------------------------------------*/ - /* Match a negated single character caselessly. 
*/ - - case OP_NOTI: - if (clen > 0) - { - uint32_t otherd; -#ifdef SUPPORT_UNICODE - if (utf_or_ucp && d >= 128) - otherd = UCD_OTHERCASE(d); - else -#endif /* SUPPORT_UNICODE */ - otherd = TABLE_GET(d, fcc, d); - if (c != d && c != otherd) - { ADD_NEW(state_offset + dlen + 1, 0); } - } - break; - - /*-----------------------------------------------------------------*/ - case OP_PLUSI: - case OP_MINPLUSI: - case OP_POSPLUSI: - case OP_NOTPLUSI: - case OP_NOTMINPLUSI: - case OP_NOTPOSPLUSI: - caseless = TRUE; - codevalue -= OP_STARI - OP_STAR; - - /* Fall through */ - case OP_PLUS: - case OP_MINPLUS: - case OP_POSPLUS: - case OP_NOTPLUS: - case OP_NOTMINPLUS: - case OP_NOTPOSPLUS: - count = current_state->count; /* Already matched */ - if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); } - if (clen > 0) - { - uint32_t otherd = NOTACHAR; - if (caseless) - { -#ifdef SUPPORT_UNICODE - if (utf_or_ucp && d >= 128) - otherd = UCD_OTHERCASE(d); - else -#endif /* SUPPORT_UNICODE */ - otherd = TABLE_GET(d, fcc, d); - } - if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) - { - if (count > 0 && - (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS)) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - count++; - ADD_NEW(state_offset, count); - } - } - break; - - /*-----------------------------------------------------------------*/ - case OP_QUERYI: - case OP_MINQUERYI: - case OP_POSQUERYI: - case OP_NOTQUERYI: - case OP_NOTMINQUERYI: - case OP_NOTPOSQUERYI: - caseless = TRUE; - codevalue -= OP_STARI - OP_STAR; - /* Fall through */ - case OP_QUERY: - case OP_MINQUERY: - case OP_POSQUERY: - case OP_NOTQUERY: - case OP_NOTMINQUERY: - case OP_NOTPOSQUERY: - ADD_ACTIVE(state_offset + dlen + 1, 0); - if (clen > 0) - { - uint32_t otherd = NOTACHAR; - if (caseless) - { -#ifdef SUPPORT_UNICODE - if (utf_or_ucp && d >= 128) - otherd = UCD_OTHERCASE(d); - else -#endif /* SUPPORT_UNICODE */ - otherd = TABLE_GET(d, fcc, d); - } - if ((c == d 
|| c == otherd) == (codevalue < OP_NOTSTAR)) - { - if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - ADD_NEW(state_offset + dlen + 1, 0); - } - } - break; - - /*-----------------------------------------------------------------*/ - case OP_STARI: - case OP_MINSTARI: - case OP_POSSTARI: - case OP_NOTSTARI: - case OP_NOTMINSTARI: - case OP_NOTPOSSTARI: - caseless = TRUE; - codevalue -= OP_STARI - OP_STAR; - /* Fall through */ - case OP_STAR: - case OP_MINSTAR: - case OP_POSSTAR: - case OP_NOTSTAR: - case OP_NOTMINSTAR: - case OP_NOTPOSSTAR: - ADD_ACTIVE(state_offset + dlen + 1, 0); - if (clen > 0) - { - uint32_t otherd = NOTACHAR; - if (caseless) - { -#ifdef SUPPORT_UNICODE - if (utf_or_ucp && d >= 128) - otherd = UCD_OTHERCASE(d); - else -#endif /* SUPPORT_UNICODE */ - otherd = TABLE_GET(d, fcc, d); - } - if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) - { - if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - ADD_NEW(state_offset, 0); - } - } - break; - - /*-----------------------------------------------------------------*/ - case OP_EXACTI: - case OP_NOTEXACTI: - caseless = TRUE; - codevalue -= OP_STARI - OP_STAR; - /* Fall through */ - case OP_EXACT: - case OP_NOTEXACT: - count = current_state->count; /* Number already matched */ - if (clen > 0) - { - uint32_t otherd = NOTACHAR; - if (caseless) - { -#ifdef SUPPORT_UNICODE - if (utf_or_ucp && d >= 128) - otherd = UCD_OTHERCASE(d); - else -#endif /* SUPPORT_UNICODE */ - otherd = TABLE_GET(d, fcc, d); - } - if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) - { - if (++count >= (int)GET2(code, 1)) - { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); } - else - { ADD_NEW(state_offset, count); } - } - } - break; - - /*-----------------------------------------------------------------*/ - case OP_UPTOI: - case 
OP_MINUPTOI: - case OP_POSUPTOI: - case OP_NOTUPTOI: - case OP_NOTMINUPTOI: - case OP_NOTPOSUPTOI: - caseless = TRUE; - codevalue -= OP_STARI - OP_STAR; - /* Fall through */ - case OP_UPTO: - case OP_MINUPTO: - case OP_POSUPTO: - case OP_NOTUPTO: - case OP_NOTMINUPTO: - case OP_NOTPOSUPTO: - ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0); - count = current_state->count; /* Number already matched */ - if (clen > 0) - { - uint32_t otherd = NOTACHAR; - if (caseless) - { -#ifdef SUPPORT_UNICODE - if (utf_or_ucp && d >= 128) - otherd = UCD_OTHERCASE(d); - else -#endif /* SUPPORT_UNICODE */ - otherd = TABLE_GET(d, fcc, d); - } - if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) - { - if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - if (++count >= (int)GET2(code, 1)) - { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); } - else - { ADD_NEW(state_offset, count); } - } - } - break; - - -/* ========================================================================== */ - /* These are the class-handling opcodes */ - - case OP_CLASS: - case OP_NCLASS: - case OP_XCLASS: - { - BOOL isinclass = FALSE; - int next_state_offset; - PCRE2_SPTR ecode; - - /* For a simple class, there is always just a 32-byte table, and we - can set isinclass from it. */ - - if (codevalue != OP_XCLASS) - { - ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR)); - if (clen > 0) - { - isinclass = (c > 255)? (codevalue == OP_NCLASS) : - ((((uint8_t *)(code + 1))[c/8] & (1u << (c&7))) != 0); - } - } - - /* An extended class may have a table or a list of single characters, - ranges, or both, and it may be positive or negative. There's a - function that sorts all this out. */ - - else - { - ecode = code + GET(code, 1); - if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf); - } - - /* At this point, isinclass is set for all kinds of class, and ecode - points to the byte after the end of the class. 
If there is a - quantifier, this is where it will be. */ - - next_state_offset = (int)(ecode - start_code); - - switch (*ecode) - { - case OP_CRSTAR: - case OP_CRMINSTAR: - case OP_CRPOSSTAR: - ADD_ACTIVE(next_state_offset + 1, 0); - if (isinclass) - { - if (*ecode == OP_CRPOSSTAR) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - ADD_NEW(state_offset, 0); - } - break; - - case OP_CRPLUS: - case OP_CRMINPLUS: - case OP_CRPOSPLUS: - count = current_state->count; /* Already matched */ - if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); } - if (isinclass) - { - if (count > 0 && *ecode == OP_CRPOSPLUS) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - count++; - ADD_NEW(state_offset, count); - } - break; - - case OP_CRQUERY: - case OP_CRMINQUERY: - case OP_CRPOSQUERY: - ADD_ACTIVE(next_state_offset + 1, 0); - if (isinclass) - { - if (*ecode == OP_CRPOSQUERY) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - ADD_NEW(next_state_offset + 1, 0); - } - break; - - case OP_CRRANGE: - case OP_CRMINRANGE: - case OP_CRPOSRANGE: - count = current_state->count; /* Already matched */ - if (count >= (int)GET2(ecode, 1)) - { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); } - if (isinclass) - { - int max = (int)GET2(ecode, 1 + IMM2_SIZE); - - if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1)) - { - active_count--; /* Remove non-match possibility */ - next_active_state--; - } - - if (++count >= max && max != 0) /* Max 0 => no limit */ - { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); } - else - { ADD_NEW(state_offset, count); } - } - break; - - default: - if (isinclass) { ADD_NEW(next_state_offset, 0); } - break; - } - } - break; - -/* ========================================================================== */ - /* These are the opcodes for fancy brackets of various kinds. We have - to use recursion in order to handle them. 
The "always failing" assertion - (?!) is optimised to OP_FAIL when compiling, so we have to support that, - though the other "backtracking verbs" are not supported. */ - - case OP_FAIL: - forced_fail++; /* Count FAILs for multiple states */ - break; - - case OP_ASSERT: - case OP_ASSERT_NOT: - case OP_ASSERTBACK: - case OP_ASSERTBACK_NOT: - { - int rc; - int *local_workspace; - PCRE2_SIZE *local_offsets; - PCRE2_SPTR endasscode = code + GET(code, 1); - RWS_anchor *rws = (RWS_anchor *)RWS; - - if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE) - { - rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb); - if (rc != 0) return rc; - RWS = (int *)rws; - } - - local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free); - local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE; - rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE; - - while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); - - rc = internal_dfa_match( - mb, /* static match data */ - code, /* this subexpression's code */ - ptr, /* where we currently are */ - (PCRE2_SIZE)(ptr - start_subject), /* start offset */ - local_offsets, /* offset vector */ - RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */ - local_workspace, /* workspace vector */ - RWS_RSIZE, /* size of same */ - rlevel, /* function recursion level */ - RWS); /* recursion workspace */ - - rws->free += RWS_RSIZE + RWS_OVEC_OSIZE; - - if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc; - if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK)) - { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); } - } - break; - - /*-----------------------------------------------------------------*/ - case OP_COND: - case OP_SCOND: - { - int codelink = (int)GET(code, 1); - PCRE2_UCHAR condcode; - - /* Because of the way auto-callout works during compile, a callout item - is inserted between OP_COND and an assertion condition. This does not - happen for the other conditions. 
*/ - - if (code[LINK_SIZE + 1] == OP_CALLOUT - || code[LINK_SIZE + 1] == OP_CALLOUT_STR) - { - PCRE2_SIZE callout_length; - rrc = do_callout(code, offsets, current_subject, ptr, mb, - 1 + LINK_SIZE, &callout_length); - if (rrc < 0) return rrc; /* Abandon */ - if (rrc > 0) break; /* Fail this thread */ - code += callout_length; /* Skip callout data */ - } - - condcode = code[LINK_SIZE+1]; - - /* Back reference conditions and duplicate named recursion conditions - are not supported */ - - if (condcode == OP_CREF || condcode == OP_DNCREF || - condcode == OP_DNRREF) - return PCRE2_ERROR_DFA_UCOND; - - /* The DEFINE condition is always false, and the assertion (?!) is - converted to OP_FAIL. */ - - if (condcode == OP_FALSE || condcode == OP_FAIL) - { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } - - /* There is also an always-true condition */ - - else if (condcode == OP_TRUE) - { ADD_ACTIVE(state_offset + LINK_SIZE + 2, 0); } - - /* The only supported version of OP_RREF is for the value RREF_ANY, - which means "test if in any recursion". We can't test for specifically - recursed groups. 
*/ - - else if (condcode == OP_RREF) - { - unsigned int value = GET2(code, LINK_SIZE + 2); - if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND; - if (mb->recursive != NULL) - { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); } - else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } - } - - /* Otherwise, the condition is an assertion */ - - else - { - int rc; - int *local_workspace; - PCRE2_SIZE *local_offsets; - PCRE2_SPTR asscode = code + LINK_SIZE + 1; - PCRE2_SPTR endasscode = asscode + GET(asscode, 1); - RWS_anchor *rws = (RWS_anchor *)RWS; - - if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE) - { - rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb); - if (rc != 0) return rc; - RWS = (int *)rws; - } - - local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free); - local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE; - rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE; - - while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); - - rc = internal_dfa_match( - mb, /* fixed match data */ - asscode, /* this subexpression's code */ - ptr, /* where we currently are */ - (PCRE2_SIZE)(ptr - start_subject), /* start offset */ - local_offsets, /* offset vector */ - RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */ - local_workspace, /* workspace vector */ - RWS_RSIZE, /* size of same */ - rlevel, /* function recursion level */ - RWS); /* recursion workspace */ - - rws->free += RWS_RSIZE + RWS_OVEC_OSIZE; - - if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc; - if ((rc >= 0) == - (condcode == OP_ASSERT || condcode == OP_ASSERTBACK)) - { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); } - else - { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } - } - } - break; - - /*-----------------------------------------------------------------*/ - case OP_RECURSE: - { - int rc; - int *local_workspace; - PCRE2_SIZE *local_offsets; - RWS_anchor *rws = (RWS_anchor *)RWS; - dfa_recursion_info *ri; - PCRE2_SPTR callpat = start_code + GET(code, 1); - 
uint32_t recno = (callpat == mb->start_code)? 0 : - GET2(callpat, 1 + LINK_SIZE); - - if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE) - { - rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb); - if (rc != 0) return rc; - RWS = (int *)rws; - } - - local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free); - local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE; - rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE; - - /* Check for repeating a recursion without advancing the subject - pointer. This should catch convoluted mutual recursions. (Some simple - cases are caught at compile time.) */ - - for (ri = mb->recursive; ri != NULL; ri = ri->prevrec) - if (recno == ri->group_num && ptr == ri->subject_position) - return PCRE2_ERROR_RECURSELOOP; - - /* Remember this recursion and where we started it so as to - catch infinite loops. */ - - new_recursive.group_num = recno; - new_recursive.subject_position = ptr; - new_recursive.prevrec = mb->recursive; - mb->recursive = &new_recursive; - - rc = internal_dfa_match( - mb, /* fixed match data */ - callpat, /* this subexpression's code */ - ptr, /* where we currently are */ - (PCRE2_SIZE)(ptr - start_subject), /* start offset */ - local_offsets, /* offset vector */ - RWS_OVEC_RSIZE/OVEC_UNIT, /* size of same */ - local_workspace, /* workspace vector */ - RWS_RSIZE, /* size of same */ - rlevel, /* function recursion level */ - RWS); /* recursion workspace */ - - rws->free += RWS_RSIZE + RWS_OVEC_RSIZE; - mb->recursive = new_recursive.prevrec; /* Done this recursion */ - - /* Ran out of internal offsets */ - - if (rc == 0) return PCRE2_ERROR_DFA_RECURSE; - - /* For each successful matched substring, set up the next state with a - count of characters to skip before trying it. Note that the count is in - characters, not bytes. 
*/ - - if (rc > 0) - { - for (rc = rc*2 - 2; rc >= 0; rc -= 2) - { - PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc]; -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 - if (utf) - { - PCRE2_SPTR p = start_subject + local_offsets[rc]; - PCRE2_SPTR pp = start_subject + local_offsets[rc+1]; - while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--; - } -#endif - if (charcount > 0) - { - ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, - (int)(charcount - 1)); - } - else - { - ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0); - } - } - } - else if (rc != PCRE2_ERROR_NOMATCH) return rc; - } - break; - - /*-----------------------------------------------------------------*/ - case OP_BRAPOS: - case OP_SBRAPOS: - case OP_CBRAPOS: - case OP_SCBRAPOS: - case OP_BRAPOSZERO: - { - int rc; - int *local_workspace; - PCRE2_SIZE *local_offsets; - PCRE2_SIZE charcount, matched_count; - PCRE2_SPTR local_ptr = ptr; - RWS_anchor *rws = (RWS_anchor *)RWS; - BOOL allow_zero; - - if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE) - { - rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb); - if (rc != 0) return rc; - RWS = (int *)rws; - } - - local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free); - local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE; - rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE; - - if (codevalue == OP_BRAPOSZERO) - { - allow_zero = TRUE; - codevalue = *(++code); /* Codevalue will be one of above BRAs */ - } - else allow_zero = FALSE; - - /* Loop to match the subpattern as many times as possible as if it were - a complete pattern. 
*/ - - for (matched_count = 0;; matched_count++) - { - rc = internal_dfa_match( - mb, /* fixed match data */ - code, /* this subexpression's code */ - local_ptr, /* where we currently are */ - (PCRE2_SIZE)(ptr - start_subject), /* start offset */ - local_offsets, /* offset vector */ - RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */ - local_workspace, /* workspace vector */ - RWS_RSIZE, /* size of same */ - rlevel, /* function recursion level */ - RWS); /* recursion workspace */ - - /* Failed to match */ - - if (rc < 0) - { - if (rc != PCRE2_ERROR_NOMATCH) return rc; - break; - } - - /* Matched: break the loop if zero characters matched. */ - - charcount = local_offsets[1] - local_offsets[0]; - if (charcount == 0) break; - local_ptr += charcount; /* Advance temporary position ptr */ - } - - rws->free += RWS_RSIZE + RWS_OVEC_OSIZE; - - /* At this point we have matched the subpattern matched_count - times, and local_ptr is pointing to the character after the end of the - last match. */ - - if (matched_count > 0 || allow_zero) - { - PCRE2_SPTR end_subpattern = code; - int next_state_offset; - - do { end_subpattern += GET(end_subpattern, 1); } - while (*end_subpattern == OP_ALT); - next_state_offset = - (int)(end_subpattern - start_code + LINK_SIZE + 1); - - /* Optimization: if there are no more active states, and there - are no new states yet set up, then skip over the subject string - right here, to save looping. Otherwise, set up the new state to swing - into action when the end of the matched substring is reached. 
*/ - - if (i + 1 >= active_count && new_count == 0) - { - ptr = local_ptr; - clen = 0; - ADD_NEW(next_state_offset, 0); - } - else - { - PCRE2_SPTR p = ptr; - PCRE2_SPTR pp = local_ptr; - charcount = (PCRE2_SIZE)(pp - p); -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 - if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--; -#endif - ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1)); - } - } - } - break; - - /*-----------------------------------------------------------------*/ - case OP_ONCE: - { - int rc; - int *local_workspace; - PCRE2_SIZE *local_offsets; - RWS_anchor *rws = (RWS_anchor *)RWS; - - if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE) - { - rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb); - if (rc != 0) return rc; - RWS = (int *)rws; - } - - local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free); - local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE; - rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE; - - rc = internal_dfa_match( - mb, /* fixed match data */ - code, /* this subexpression's code */ - ptr, /* where we currently are */ - (PCRE2_SIZE)(ptr - start_subject), /* start offset */ - local_offsets, /* offset vector */ - RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */ - local_workspace, /* workspace vector */ - RWS_RSIZE, /* size of same */ - rlevel, /* function recursion level */ - RWS); /* recursion workspace */ - - rws->free += RWS_RSIZE + RWS_OVEC_OSIZE; - - if (rc >= 0) - { - PCRE2_SPTR end_subpattern = code; - PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0]; - int next_state_offset, repeat_state_offset; - - do { end_subpattern += GET(end_subpattern, 1); } - while (*end_subpattern == OP_ALT); - next_state_offset = - (int)(end_subpattern - start_code + LINK_SIZE + 1); - - /* If the end of this subpattern is KETRMAX or KETRMIN, we must - arrange for the repeat state also to be added to the relevant list. - Calculate the offset, or set -1 for no repeat. 
*/ - - repeat_state_offset = (*end_subpattern == OP_KETRMAX || - *end_subpattern == OP_KETRMIN)? - (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1; - - /* If we have matched an empty string, add the next state at the - current character pointer. This is important so that the duplicate - checking kicks in, which is what breaks infinite loops that match an - empty string. */ - - if (charcount == 0) - { - ADD_ACTIVE(next_state_offset, 0); - } - - /* Optimization: if there are no more active states, and there - are no new states yet set up, then skip over the subject string - right here, to save looping. Otherwise, set up the new state to swing - into action when the end of the matched substring is reached. */ - - else if (i + 1 >= active_count && new_count == 0) - { - ptr += charcount; - clen = 0; - ADD_NEW(next_state_offset, 0); - - /* If we are adding a repeat state at the new character position, - we must fudge things so that it is the only current state. - Otherwise, it might be a duplicate of one we processed before, and - that would cause it to be skipped. 
*/ - - if (repeat_state_offset >= 0) - { - next_active_state = active_states; - active_count = 0; - i = -1; - ADD_ACTIVE(repeat_state_offset, 0); - } - } - else - { -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 - if (utf) - { - PCRE2_SPTR p = start_subject + local_offsets[0]; - PCRE2_SPTR pp = start_subject + local_offsets[1]; - while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--; - } -#endif - ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1)); - if (repeat_state_offset >= 0) - { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); } - } - } - else if (rc != PCRE2_ERROR_NOMATCH) return rc; - } - break; - - -/* ========================================================================== */ - /* Handle callouts */ - - case OP_CALLOUT: - case OP_CALLOUT_STR: - { - PCRE2_SIZE callout_length; - rrc = do_callout(code, offsets, current_subject, ptr, mb, 0, - &callout_length); - if (rrc < 0) return rrc; /* Abandon */ - if (rrc == 0) - { ADD_ACTIVE(state_offset + (int)callout_length, 0); } - } - break; - - -/* ========================================================================== */ - default: /* Unsupported opcode */ - return PCRE2_ERROR_DFA_UITEM; - } - - NEXT_ACTIVE_STATE: continue; - - } /* End of loop scanning active states */ - - /* We have finished the processing at the current subject character. If no - new states have been set for the next character, we have found all the - matches that we are going to find. If partial matching has been requested, - check for appropriate conditions. - - The "forced_ fail" variable counts the number of (*F) encountered for the - character. If it is equal to the original active_count (saved in - workspace[1]) it means that (*F) was found on every active state. In this - case we don't want to give a partial match. - - The "could_continue" variable is true if a state could have continued but - for the fact that the end of the subject was reached. 
*/ - - if (new_count <= 0) - { - if (could_continue && /* Some could go on, and */ - forced_fail != workspace[1] && /* Not all forced fail & */ - ( /* either... */ - (mb->moptions & PCRE2_PARTIAL_HARD) != 0 /* Hard partial */ - || /* or... */ - ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 && /* Soft partial and */ - match_count < 0) /* no matches */ - ) && /* And... */ - ( - partial_newline || /* Either partial NL */ - ( /* or ... */ - ptr >= end_subject && /* End of subject and */ - ( /* either */ - ptr > mb->start_used_ptr || /* Inspected non-empty string */ - mb->allowemptypartial /* or pattern has lookbehind */ - ) /* or could match empty */ - ) - )) - match_count = PCRE2_ERROR_PARTIAL; - break; /* Exit from loop along the subject string */ - } - - /* One or more states are active for the next character. */ - - ptr += clen; /* Advance to next subject character */ - } /* Loop to move along the subject string */ - -/* Control gets here from "break" a few lines above. If we have a match and -PCRE2_ENDANCHORED is set, the match fails. */ - -if (match_count >= 0 && - ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 && - ptr < end_subject) - match_count = PCRE2_ERROR_NOMATCH; - -return match_count; -} - - - -/************************************************* -* Match a pattern using the DFA algorithm * -*************************************************/ - -/* This function matches a compiled pattern to a subject string, using the -alternate matching algorithm that finds all matches at once. 
- -Arguments: - code points to the compiled pattern - subject subject string - length length of subject string - startoffset where to start matching in the subject - options option bits - match_data points to a match data structure - gcontext points to a match context - workspace pointer to workspace - wscount size of workspace - -Returns: > 0 => number of match offset pairs placed in offsets - = 0 => offsets overflowed; longest matches are present - -1 => failed to match - < -1 => some kind of unexpected problem -*/ - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, - PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, - pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount) -{ -int rc; -int was_zero_terminated = 0; - -const pcre2_real_code *re = (const pcre2_real_code *)code; - -PCRE2_SPTR start_match; -PCRE2_SPTR end_subject; -PCRE2_SPTR bumpalong_limit; -PCRE2_SPTR req_cu_ptr; - -BOOL utf, anchored, startline, firstline; -BOOL has_first_cu = FALSE; -BOOL has_req_cu = FALSE; - -#if PCRE2_CODE_UNIT_WIDTH == 8 -BOOL memchr_not_found_first_cu = FALSE; -BOOL memchr_not_found_first_cu2 = FALSE; -#endif - -PCRE2_UCHAR first_cu = 0; -PCRE2_UCHAR first_cu2 = 0; -PCRE2_UCHAR req_cu = 0; -PCRE2_UCHAR req_cu2 = 0; - -const uint8_t *start_bits = NULL; - -/* We need to have mb pointing to a match block, because the IS_NEWLINE macro -is used below, and it expects NLBLOCK to be defined as a pointer. */ - -pcre2_callout_block cb; -dfa_match_block actual_match_block; -dfa_match_block *mb = &actual_match_block; - -/* Set up a starting block of memory for use during recursive calls to -internal_dfa_match(). By putting this on the stack, it minimizes resource use -in the case when it is not needed. If this is too small, more memory is -obtained from the heap. 
At the start of each block is an anchor structure.*/ - -int base_recursion_workspace[RWS_BASE_SIZE]; -RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace; -rws->next = NULL; -rws->size = RWS_BASE_SIZE; -rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE; - -/* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated -subject string. */ - -if (length == PCRE2_ZERO_TERMINATED) - { - length = PRIV(strlen)(subject); - was_zero_terminated = 1; - } - -/* Plausibility checks */ - -if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION; -if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL) - return PCRE2_ERROR_NULL; -if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE; -if (start_offset > length) return PCRE2_ERROR_BADOFFSET; - -/* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same -time. */ - -if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 && - ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0) - return PCRE2_ERROR_BADOPTION; - -/* Invalid UTF support is not available for DFA matching. */ - -if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0) - return PCRE2_ERROR_DFA_UINVALID_UTF; - -/* Check that the first field in the block is the magic number. If it is not, -return with PCRE2_ERROR_BADMAGIC. */ - -if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC; - -/* Check the code unit width. */ - -if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8) - return PCRE2_ERROR_BADMODE; - -/* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the -options variable for this function. Users of PCRE2 who are not calling the -function directly would like to have a way of setting these flags, in the same -way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with -constructions like (*NO_AUTOPOSSESS). 
To enable this, (*NOTEMPTY) and -(*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be -transferred to the options for this function. The bits are guaranteed to be -adjacent, but do not have the same values. This bit of Boolean trickery assumes -that the match-time bits are not more significant than the flag bits. If by -accident this is not the case, a compile-time division by zero error will -occur. */ - -#define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET) -#define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART) -options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1))); -#undef FF -#undef OO - -/* If restarting after a partial match, do some sanity checks on the contents -of the workspace. */ - -if ((options & PCRE2_DFA_RESTART) != 0) - { - if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 || - workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK)) - return PCRE2_ERROR_DFA_BADRESTART; - } - -/* Set some local values */ - -utf = (re->overall_options & PCRE2_UTF) != 0; -start_match = subject + start_offset; -end_subject = subject + length; -req_cu_ptr = start_match - 1; -anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 || - (re->overall_options & PCRE2_ANCHORED) != 0; - -/* The "must be at the start of a line" flags are used in a loop when finding -where to start. */ - -startline = (re->flags & PCRE2_STARTLINE) != 0; -firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0; -bumpalong_limit = end_subject; - -/* Initialize and set up the fixed fields in the callout block, with a pointer -in the match block. */ - -mb->cb = &cb; -cb.version = 2; -cb.subject = subject; -cb.subject_length = (PCRE2_SIZE)(end_subject - subject); -cb.callout_flags = 0; -cb.capture_top = 1; /* No capture support */ -cb.capture_last = 0; -cb.mark = NULL; /* No (*MARK) support */ - -/* Get data from the match context, if present, and fill in the remaining -fields in the match block. 
It is an error to set an offset limit without -setting the flag at compile time. */ - -if (mcontext == NULL) - { - mb->callout = NULL; - mb->memctl = re->memctl; - mb->match_limit = PRIV(default_match_context).match_limit; - mb->match_limit_depth = PRIV(default_match_context).depth_limit; - mb->heap_limit = PRIV(default_match_context).heap_limit; - } -else - { - if (mcontext->offset_limit != PCRE2_UNSET) - { - if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0) - return PCRE2_ERROR_BADOFFSETLIMIT; - bumpalong_limit = subject + mcontext->offset_limit; - } - mb->callout = mcontext->callout; - mb->callout_data = mcontext->callout_data; - mb->memctl = mcontext->memctl; - mb->match_limit = mcontext->match_limit; - mb->match_limit_depth = mcontext->depth_limit; - mb->heap_limit = mcontext->heap_limit; - } - -if (mb->match_limit > re->limit_match) - mb->match_limit = re->limit_match; - -if (mb->match_limit_depth > re->limit_depth) - mb->match_limit_depth = re->limit_depth; - -if (mb->heap_limit > re->limit_heap) - mb->heap_limit = re->limit_heap; - -mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) + - re->name_count * re->name_entry_size; -mb->tables = re->tables; -mb->start_subject = subject; -mb->end_subject = end_subject; -mb->start_offset = start_offset; -mb->allowemptypartial = (re->max_lookbehind > 0) || - (re->flags & PCRE2_MATCH_EMPTY) != 0; -mb->moptions = options; -mb->poptions = re->overall_options; -mb->match_call_count = 0; -mb->heap_used = 0; - -/* Process the \R and newline settings. 
*/ - -mb->bsr_convention = re->bsr_convention; -mb->nltype = NLTYPE_FIXED; -switch(re->newline_convention) - { - case PCRE2_NEWLINE_CR: - mb->nllen = 1; - mb->nl[0] = CHAR_CR; - break; - - case PCRE2_NEWLINE_LF: - mb->nllen = 1; - mb->nl[0] = CHAR_NL; - break; - - case PCRE2_NEWLINE_NUL: - mb->nllen = 1; - mb->nl[0] = CHAR_NUL; - break; - - case PCRE2_NEWLINE_CRLF: - mb->nllen = 2; - mb->nl[0] = CHAR_CR; - mb->nl[1] = CHAR_NL; - break; - - case PCRE2_NEWLINE_ANY: - mb->nltype = NLTYPE_ANY; - break; - - case PCRE2_NEWLINE_ANYCRLF: - mb->nltype = NLTYPE_ANYCRLF; - break; - - default: return PCRE2_ERROR_INTERNAL; - } - -/* Check a UTF string for validity if required. For 8-bit and 16-bit strings, -we must also check that a starting offset does not point into the middle of a -multiunit character. We check only the portion of the subject that is going to -be inspected during matching - from the offset minus the maximum back reference -to the given length. This saves time when a small part of a large subject is -being matched by the use of a starting offset. Note that the maximum lookbehind -is a number of characters, not code units. */ - -#ifdef SUPPORT_UNICODE -if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) - { - PCRE2_SPTR check_subject = start_match; /* start_match includes offset */ - - if (start_offset > 0) - { -#if PCRE2_CODE_UNIT_WIDTH != 32 - unsigned int i; - if (start_match < end_subject && NOT_FIRSTCU(*start_match)) - return PCRE2_ERROR_BADUTFOFFSET; - for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--) - { - check_subject--; - while (check_subject > subject && -#if PCRE2_CODE_UNIT_WIDTH == 8 - (*check_subject & 0xc0) == 0x80) -#else /* 16-bit */ - (*check_subject & 0xfc00) == 0xdc00) -#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ - check_subject--; - } -#else /* In the 32-bit library, one code unit equals one character. 
*/ - check_subject -= re->max_lookbehind; - if (check_subject < subject) check_subject = subject; -#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ - } - - /* Validate the relevant portion of the subject. After an error, adjust the - offset to be an absolute offset in the whole string. */ - - match_data->rc = PRIV(valid_utf)(check_subject, - length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar)); - if (match_data->rc != 0) - { - match_data->startchar += (PCRE2_SIZE)(check_subject - subject); - return match_data->rc; - } - } -#endif /* SUPPORT_UNICODE */ - -/* Set up the first code unit to match, if available. If there's no first code -unit there may be a bitmap of possible first characters. */ - -if ((re->flags & PCRE2_FIRSTSET) != 0) - { - has_first_cu = TRUE; - first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit); - if ((re->flags & PCRE2_FIRSTCASELESS) != 0) - { - first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu); -#ifdef SUPPORT_UNICODE -#if PCRE2_CODE_UNIT_WIDTH == 8 - if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0) - first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu); -#else - if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0)) - first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu); -#endif -#endif /* SUPPORT_UNICODE */ - } - } -else - if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0) - start_bits = re->start_bitmap; - -/* There may be a "last known required code unit" set. 
*/ - -if ((re->flags & PCRE2_LASTSET) != 0) - { - has_req_cu = TRUE; - req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit); - if ((re->flags & PCRE2_LASTCASELESS) != 0) - { - req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu); -#ifdef SUPPORT_UNICODE -#if PCRE2_CODE_UNIT_WIDTH == 8 - if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0) - req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu); -#else - if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0)) - req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu); -#endif -#endif /* SUPPORT_UNICODE */ - } - } - -/* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT, -free the memory that was obtained. */ - -if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0) - { - match_data->memctl.free((void *)match_data->subject, - match_data->memctl.memory_data); - match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT; - } - -/* Fill in fields that are always returned in the match data. */ - -match_data->code = re; -match_data->subject = NULL; /* Default for no match */ -match_data->mark = NULL; -match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER; - -/* Call the main matching function, looping for a non-anchored regex after a -failed match. If not restarting, perform certain optimizations at the start of -a match. */ - -for (;;) - { - /* ----------------- Start of match optimizations ---------------- */ - - /* There are some optimizations that avoid running the match if a known - starting point is not found, or if a known later code unit is not present. - However, there is an option (settable at compile time) that disables - these, for testing and for ensuring that all callouts do actually occur. - The optimizations must also be avoided when restarting a DFA match. */ - - if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 && - (options & PCRE2_DFA_RESTART) == 0) - { - /* If firstline is TRUE, the start of the match is constrained to the first - line of a multiline string. 
That is, the match must be before or at the - first newline following the start of matching. Temporarily adjust - end_subject so that we stop the optimization scans for a first code unit - immediately after the first character of a newline (the first code unit can - legitimately be a newline). If the match fails at the newline, later code - breaks this loop. */ - - if (firstline) - { - PCRE2_SPTR t = start_match; -#ifdef SUPPORT_UNICODE - if (utf) - { - while (t < end_subject && !IS_NEWLINE(t)) - { - t++; - ACROSSCHAR(t < end_subject, t, t++); - } - } - else -#endif - while (t < end_subject && !IS_NEWLINE(t)) t++; - end_subject = t; - } - - /* Anchored: check the first code unit if one is recorded. This may seem - pointless but it can help in detecting a no match case without scanning for - the required code unit. */ - - if (anchored) - { - if (has_first_cu || start_bits != NULL) - { - BOOL ok = start_match < end_subject; - if (ok) - { - PCRE2_UCHAR c = UCHAR21TEST(start_match); - ok = has_first_cu && (c == first_cu || c == first_cu2); - if (!ok && start_bits != NULL) - { -#if PCRE2_CODE_UNIT_WIDTH != 8 - if (c > 255) c = 255; -#endif - ok = (start_bits[c/8] & (1u << (c&7))) != 0; - } - } - if (!ok) break; - } - } - - /* Not anchored. Advance to a unique first code unit if there is one. In - 8-bit mode, the use of memchr() gives a big speed up, even though we have - to call it twice in caseless mode, in order to find the earliest occurrence - of the character in either of its cases. If a call to memchr() that - searches the rest of the subject fails to find one case, remember that in - order not to keep on repeating the search. This can make a huge difference - when the strings are very long and only one case is present. 
*/ - - else - { - if (has_first_cu) - { - if (first_cu != first_cu2) /* Caseless */ - { -#if PCRE2_CODE_UNIT_WIDTH != 8 - PCRE2_UCHAR smc; - while (start_match < end_subject && - (smc = UCHAR21TEST(start_match)) != first_cu && - smc != first_cu2) - start_match++; - -#else /* 8-bit code units */ - PCRE2_SPTR pp1 = NULL; - PCRE2_SPTR pp2 = NULL; - PCRE2_SIZE cu2size = end_subject - start_match; - - if (!memchr_not_found_first_cu) - { - pp1 = memchr(start_match, first_cu, end_subject - start_match); - if (pp1 == NULL) memchr_not_found_first_cu = TRUE; - else cu2size = pp1 - start_match; - } - - /* If pp1 is not NULL, we have arranged to search only as far as pp1, - to see if the other case is earlier, so we can set "not found" only - when both searches have returned NULL. */ - - if (!memchr_not_found_first_cu2) - { - pp2 = memchr(start_match, first_cu2, cu2size); - memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL); - } - - if (pp1 == NULL) - start_match = (pp2 == NULL)? end_subject : pp2; - else - start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2; -#endif - } - - /* The caseful case */ - - else - { -#if PCRE2_CODE_UNIT_WIDTH != 8 - while (start_match < end_subject && UCHAR21TEST(start_match) != - first_cu) - start_match++; -#else /* 8-bit code units */ - start_match = memchr(start_match, first_cu, end_subject - start_match); - if (start_match == NULL) start_match = end_subject; -#endif - } - - /* If we can't find the required code unit, having reached the true end - of the subject, break the bumpalong loop, to force a match failure, - except when doing partial matching, when we let the next cycle run at - the end of the subject. To see why, consider the pattern /(?<=abc)def/, - which partially matches "abc", even though the string does not contain - the starting character "d". 
If we have not reached the true end of the - subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified) - we also let the cycle run, because the matching string is legitimately - allowed to start with the first code unit of a newline. */ - - if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 && - start_match >= mb->end_subject) - break; - } - - /* If there's no first code unit, advance to just after a linebreak for a - multiline match if required. */ - - else if (startline) - { - if (start_match > mb->start_subject + start_offset) - { -#ifdef SUPPORT_UNICODE - if (utf) - { - while (start_match < end_subject && !WAS_NEWLINE(start_match)) - { - start_match++; - ACROSSCHAR(start_match < end_subject, start_match, start_match++); - } - } - else -#endif - while (start_match < end_subject && !WAS_NEWLINE(start_match)) - start_match++; - - /* If we have just passed a CR and the newline option is ANY or - ANYCRLF, and we are now at a LF, advance the match position by one - more code unit. */ - - if (start_match[-1] == CHAR_CR && - (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) && - start_match < end_subject && - UCHAR21TEST(start_match) == CHAR_NL) - start_match++; - } - } - - /* If there's no first code unit or a requirement for a multiline line - start, advance to a non-unique first code unit if any have been - identified. The bitmap contains only 256 bits. When code units are 16 or - 32 bits wide, all code units greater than 254 set the 255 bit. */ - - else if (start_bits != NULL) - { - while (start_match < end_subject) - { - uint32_t c = UCHAR21TEST(start_match); -#if PCRE2_CODE_UNIT_WIDTH != 8 - if (c > 255) c = 255; -#endif - if ((start_bits[c/8] & (1u << (c&7))) != 0) break; - start_match++; - } - - /* See comment above in first_cu checking about the next line. 
*/ - - if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 && - start_match >= mb->end_subject) - break; - } - } /* End of first code unit handling */ - - /* Restore fudged end_subject */ - - end_subject = mb->end_subject; - - /* The following two optimizations are disabled for partial matching. */ - - if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0) - { - PCRE2_SPTR p; - - /* The minimum matching length is a lower bound; no actual string of that - length may actually match the pattern. Although the value is, strictly, - in characters, we treat it as code units to avoid spending too much time - in this optimization. */ - - if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT; - - /* If req_cu is set, we know that that code unit must appear in the - subject for the match to succeed. If the first code unit is set, req_cu - must be later in the subject; otherwise the test starts at the match - point. This optimization can save a huge amount of backtracking in - patterns with nested unlimited repeats that aren't going to match. - Writing separate code for cased/caseless versions makes it go faster, as - does using an autoincrement and backing off on a match. As in the case of - the first code unit, using memchr() in the 8-bit library gives a big - speed up. Unlike the first_cu check above, we do not need to call - memchr() twice in the caseless case because we only need to check for the - presence of the character in either case, not find the first occurrence. - - The search can be skipped if the code unit was found later than the - current starting point in a previous iteration of the bumpalong loop. - - HOWEVER: when the subject string is very, very long, searching to its end - can take a long time, and give bad performance on quite ordinary - patterns. This showed up when somebody was matching something like - /^\d+C/ on a 32-megabyte string... 
so we don't do this when the string is - sufficiently long, but it's worth searching a lot more for unanchored - patterns. */ - - p = start_match + (has_first_cu? 1:0); - if (has_req_cu && p > req_cu_ptr) - { - PCRE2_SIZE check_length = end_subject - start_match; - - if (check_length < REQ_CU_MAX || - (!anchored && check_length < REQ_CU_MAX * 1000)) - { - if (req_cu != req_cu2) /* Caseless */ - { -#if PCRE2_CODE_UNIT_WIDTH != 8 - while (p < end_subject) - { - uint32_t pp = UCHAR21INCTEST(p); - if (pp == req_cu || pp == req_cu2) { p--; break; } - } -#else /* 8-bit code units */ - PCRE2_SPTR pp = p; - p = memchr(pp, req_cu, end_subject - pp); - if (p == NULL) - { - p = memchr(pp, req_cu2, end_subject - pp); - if (p == NULL) p = end_subject; - } -#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */ - } - - /* The caseful case */ - - else - { -#if PCRE2_CODE_UNIT_WIDTH != 8 - while (p < end_subject) - { - if (UCHAR21INCTEST(p) == req_cu) { p--; break; } - } - -#else /* 8-bit code units */ - p = memchr(p, req_cu, end_subject - p); - if (p == NULL) p = end_subject; -#endif - } - - /* If we can't find the required code unit, break the matching loop, - forcing a match failure. */ - - if (p >= end_subject) break; - - /* If we have found the required code unit, save the point where we - found it, so that we don't search again next time round the loop if - the start hasn't passed this code unit yet. */ - - req_cu_ptr = p; - } - } - } - } - - /* ------------ End of start of match optimizations ------------ */ - - /* Give no match if we have passed the bumpalong limit. 
*/ - - if (start_match > bumpalong_limit) break; - - /* OK, now we can do the business */ - - mb->start_used_ptr = start_match; - mb->last_used_ptr = start_match; - mb->recursive = NULL; - - rc = internal_dfa_match( - mb, /* fixed match data */ - mb->start_code, /* this subexpression's code */ - start_match, /* where we currently are */ - start_offset, /* start offset in subject */ - match_data->ovector, /* offset vector */ - (uint32_t)match_data->oveccount * 2, /* actual size of same */ - workspace, /* workspace vector */ - (int)wscount, /* size of same */ - 0, /* function recurse level */ - base_recursion_workspace); /* initial workspace for recursion */ - - /* Anything other than "no match" means we are done, always; otherwise, carry - on only if not anchored. */ - - if (rc != PCRE2_ERROR_NOMATCH || anchored) - { - if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0) - { - match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject); - match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject); - } - match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject); - match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject); - match_data->startchar = (PCRE2_SIZE)(start_match - subject); - match_data->rc = rc; - - if (rc >= 0 &&(options & PCRE2_COPY_MATCHED_SUBJECT) != 0) - { - length = CU2BYTES(length + was_zero_terminated); - match_data->subject = match_data->memctl.malloc(length, - match_data->memctl.memory_data); - if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY; - memcpy((void *)match_data->subject, subject, length); - match_data->flags |= PCRE2_MD_COPIED_SUBJECT; - } - else - { - if (rc >= 0 || rc == PCRE2_ERROR_PARTIAL) match_data->subject = subject; - } - goto EXIT; - } - - /* Advance to the next subject character unless we are at the end of a line - and firstline is set. 
*/ - - if (firstline && IS_NEWLINE(start_match)) break; - start_match++; -#ifdef SUPPORT_UNICODE - if (utf) - { - ACROSSCHAR(start_match < end_subject, start_match, start_match++); - } -#endif - if (start_match > end_subject) break; - - /* If we have just passed a CR and we are now at a LF, and the pattern does - not contain any explicit matches for \r or \n, and the newline option is CRLF - or ANY or ANYCRLF, advance the match position by one more character. */ - - if (UCHAR21TEST(start_match - 1) == CHAR_CR && - start_match < end_subject && - UCHAR21TEST(start_match) == CHAR_NL && - (re->flags & PCRE2_HASCRORLF) == 0 && - (mb->nltype == NLTYPE_ANY || - mb->nltype == NLTYPE_ANYCRLF || - mb->nllen == 2)) - start_match++; - - } /* "Bumpalong" loop */ - -NOMATCH_EXIT: -rc = PCRE2_ERROR_NOMATCH; - -EXIT: -while (rws->next != NULL) - { - RWS_anchor *next = rws->next; - rws->next = next->next; - mb->memctl.free(next, mb->memctl.memory_data); - } - -return rc; -} - -/* End of pcre2_dfa_match.c */ diff --git a/pcre2/src/pcre2_dftables.c b/pcre2/src/pcre2_dftables.c deleted file mode 100644 index 71b90ce83..000000000 --- a/pcre2/src/pcre2_dftables.c +++ /dev/null @@ -1,303 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. 
- - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2020 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - - -/* This is a freestanding support program to generate a file containing -character tables for PCRE2. The tables are built using the pcre2_maketables() -function, which is part of the PCRE2 API. 
By default, the system's "C" locale -is used rather than what the building user happens to have set, but the -L -option can be used to select the current locale from the LC_ALL environment -variable. By default, the tables are written in source form, but if -b is -given, they are written in binary. */ - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include -#include -#include -#include - -#define PCRE2_CODE_UNIT_WIDTH 0 /* Must be set, but not relevant here */ -#include "pcre2_internal.h" - -#define PCRE2_DFTABLES /* pcre2_maketables.c notices this */ -#include "pcre2_maketables.c" - - -static const char *classlist[] = - { - "space", "xdigit", "digit", "upper", "lower", - "word", "graph", "print", "punct", "cntrl" - }; - - - -/************************************************* -* Usage * -*************************************************/ - -static void -usage(void) -{ -(void)fprintf(stderr, - "Usage: pcre2_dftables [options] \n" - " -b Write output in binary (default is source code)\n" - " -L Use locale from LC_ALL (default is \"C\" locale)\n" - ); -} - - - -/************************************************* -* Entry point * -*************************************************/ - -int main(int argc, char **argv) -{ -FILE *f; -int i; -int nclass = 0; -BOOL binary = FALSE; -char *env = (char *)"C"; -const unsigned char *tables; -const unsigned char *base_of_tables; - -/* Process options */ - -for (i = 1; i < argc; i++) - { - char *arg = argv[i]; - if (*arg != '-') break; - - if (strcmp(arg, "-help") == 0 || strcmp(arg, "--help") == 0) - { - usage(); - return 0; - } - - else if (strcmp(arg, "-L") == 0) - { - if (setlocale(LC_ALL, "") == NULL) - { - (void)fprintf(stderr, "pcre2_dftables: setlocale() failed\n"); - return 1; - } - env = getenv("LC_ALL"); - } - - else if (strcmp(arg, "-b") == 0) - binary = TRUE; - - else - { - (void)fprintf(stderr, "pcre2_dftables: unrecognized option %s\n", arg); - return 1; - } - } - -if (i != argc - 1) - { - 
(void)fprintf(stderr, "pcre2_dftables: one filename argument is required\n"); - return 1; - } - -/* Make the tables */ - -tables = maketables(); -base_of_tables = tables; - -f = fopen(argv[i], "wb"); -if (f == NULL) - { - fprintf(stderr, "pcre2_dftables: failed to open %s for writing\n", argv[1]); - return 1; - } - -/* If -b was specified, we write the tables in binary. */ - -if (binary) - { - int yield = 0; - size_t len = fwrite(tables, 1, TABLES_LENGTH, f); - if (len != TABLES_LENGTH) - { - (void)fprintf(stderr, "pcre2_dftables: fwrite() returned wrong length %d " - "instead of %d\n", (int)len, TABLES_LENGTH); - yield = 1; - } - fclose(f); - free((void *)base_of_tables); - return yield; - } - -/* Write the tables as source code for inclusion in the PCRE2 library. There -are several fprintf() calls here, because gcc in pedantic mode complains about -the very long string otherwise. */ - -(void)fprintf(f, - "/*************************************************\n" - "* Perl-Compatible Regular Expressions *\n" - "*************************************************/\n\n" - "/* This file was automatically written by the pcre2_dftables auxiliary\n" - "program. It contains character tables that are used when no external\n" - "tables are passed to PCRE2 by the application that calls it. The tables\n" - "are used only for characters whose code values are less than 256. */\n\n"); - -(void)fprintf(f, - "/* This set of tables was written in the %s locale. */\n\n", env); - -(void)fprintf(f, - "/* The pcre2_ftables program (which is distributed with PCRE2) can be used\n" - "to build alternative versions of this file. This is necessary if you are\n" - "running in an EBCDIC environment, or if you want to default to a different\n" - "encoding, for example ISO-8859-1. When pcre2_dftables is run, it creates\n" - "these tables in the \"C\" locale by default. This happens automatically if\n" - "PCRE2 is configured with --enable-rebuild-chartables. 
However, you can run\n" - "pcre2_dftables manually with the -L option to build tables using the LC_ALL\n" - "locale. */\n\n"); - -/* Force config.h in z/OS */ - -#if defined NATIVE_ZOS -(void)fprintf(f, - "/* For z/OS, config.h is forced */\n" - "#ifndef HAVE_CONFIG_H\n" - "#define HAVE_CONFIG_H 1\n" - "#endif\n\n"); -#endif - -(void)fprintf(f, - "/* The following #include is present because without it gcc 4.x may remove\n" - "the array definition from the final binary if PCRE2 is built into a static\n" - "library and dead code stripping is activated. This leads to link errors.\n" - "Pulling in the header ensures that the array gets flagged as \"someone\n" - "outside this compilation unit might reference this\" and so it will always\n" - "be supplied to the linker. */\n\n"); - -(void)fprintf(f, - "#ifdef HAVE_CONFIG_H\n" - "#include \"config.h\"\n" - "#endif\n\n" - "#include \"pcre2_internal.h\"\n\n"); - -(void)fprintf(f, - "const uint8_t PRIV(default_tables)[] = {\n\n" - "/* This table is a lower casing table. */\n\n"); - -(void)fprintf(f, " "); -for (i = 0; i < 256; i++) - { - if ((i & 7) == 0 && i != 0) fprintf(f, "\n "); - fprintf(f, "%3d", *tables++); - if (i != 255) fprintf(f, ","); - } -(void)fprintf(f, ",\n\n"); - -(void)fprintf(f, "/* This table is a case flipping table. */\n\n"); - -(void)fprintf(f, " "); -for (i = 0; i < 256; i++) - { - if ((i & 7) == 0 && i != 0) fprintf(f, "\n "); - fprintf(f, "%3d", *tables++); - if (i != 255) fprintf(f, ","); - } -(void)fprintf(f, ",\n\n"); - -(void)fprintf(f, - "/* This table contains bit maps for various character classes. Each map is 32\n" - "bytes long and the bits run from the least significant end of each byte. The\n" - "classes that have their own maps are: space, xdigit, digit, upper, lower, word,\n" - "graph, print, punct, and cntrl. Other classes are built from combinations. 
*/\n\n"); - -(void)fprintf(f, " "); -for (i = 0; i < cbit_length; i++) - { - if ((i & 7) == 0 && i != 0) - { - if ((i & 31) == 0) (void)fprintf(f, "\n"); - if ((i & 24) == 8) (void)fprintf(f, " /* %s */", classlist[nclass++]); - (void)fprintf(f, "\n "); - } - (void)fprintf(f, "0x%02x", *tables++); - if (i != cbit_length - 1) (void)fprintf(f, ","); - } -(void)fprintf(f, ",\n\n"); - -(void)fprintf(f, - "/* This table identifies various classes of character by individual bits:\n" - " 0x%02x white space character\n" - " 0x%02x letter\n" - " 0x%02x lower case letter\n" - " 0x%02x decimal digit\n" - " 0x%02x alphanumeric or '_'\n*/\n\n", - ctype_space, ctype_letter, ctype_lcletter, ctype_digit, ctype_word); - -(void)fprintf(f, " "); -for (i = 0; i < 256; i++) - { - if ((i & 7) == 0 && i != 0) - { - (void)fprintf(f, " /* "); - if (isprint(i-8)) (void)fprintf(f, " %c -", i-8); - else (void)fprintf(f, "%3d-", i-8); - if (isprint(i-1)) (void)fprintf(f, " %c ", i-1); - else (void)fprintf(f, "%3d", i-1); - (void)fprintf(f, " */\n "); - } - (void)fprintf(f, "0x%02x", *tables++); - if (i != 255) (void)fprintf(f, ","); - } - -(void)fprintf(f, "};/* "); -if (isprint(i-8)) (void)fprintf(f, " %c -", i-8); - else (void)fprintf(f, "%3d-", i-8); -if (isprint(i-1)) (void)fprintf(f, " %c ", i-1); - else (void)fprintf(f, "%3d", i-1); -(void)fprintf(f, " */\n\n/* End of pcre2_chartables.c */\n"); - -fclose(f); -free((void *)base_of_tables); -return 0; -} - -/* End of pcre2_dftables.c */ diff --git a/pcre2/src/pcre2_error.c b/pcre2/src/pcre2_error.c deleted file mode 100644 index c61648cb7..000000000 --- a/pcre2/src/pcre2_error.c +++ /dev/null @@ -1,340 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. 
- - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2019 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include "pcre2_internal.h" - -#define STRING(a) # a -#define XSTRING(s) STRING(s) - -/* The texts of compile-time error messages. 
Compile-time error numbers start -at COMPILE_ERROR_BASE (100). - -This used to be a table of strings, but in order to reduce the number of -relocations needed when a shared library is loaded dynamically, it is now one -long string. We cannot use a table of offsets, because the lengths of inserts -such as XSTRING(MAX_NAME_SIZE) are not known. Instead, -pcre2_get_error_message() counts through to the one it wants - this isn't a -performance issue because these strings are used only when there is an error. - -Each substring ends with \0 to insert a null character. This includes the final -substring, so that the whole string ends with \0\0, which can be detected when -counting through. */ - -static const unsigned char compile_error_texts[] = - "no error\0" - "\\ at end of pattern\0" - "\\c at end of pattern\0" - "unrecognized character follows \\\0" - "numbers out of order in {} quantifier\0" - /* 5 */ - "number too big in {} quantifier\0" - "missing terminating ] for character class\0" - "escape sequence is invalid in character class\0" - "range out of order in character class\0" - "quantifier does not follow a repeatable item\0" - /* 10 */ - "internal error: unexpected repeat\0" - "unrecognized character after (? 
or (?-\0" - "POSIX named classes are supported only within a class\0" - "POSIX collating elements are not supported\0" - "missing closing parenthesis\0" - /* 15 */ - "reference to non-existent subpattern\0" - "pattern passed as NULL\0" - "unrecognised compile-time option bit(s)\0" - "missing ) after (?# comment\0" - "parentheses are too deeply nested\0" - /* 20 */ - "regular expression is too large\0" - "failed to allocate heap memory\0" - "unmatched closing parenthesis\0" - "internal error: code overflow\0" - "missing closing parenthesis for condition\0" - /* 25 */ - "lookbehind assertion is not fixed length\0" - "a relative value of zero is not allowed\0" - "conditional subpattern contains more than two branches\0" - "assertion expected after (?( or (?(?C)\0" - "digit expected after (?+ or (?-\0" - /* 30 */ - "unknown POSIX class name\0" - "internal error in pcre2_study(): should not occur\0" - "this version of PCRE2 does not have Unicode support\0" - "parentheses are too deeply nested (stack check)\0" - "character code point value in \\x{} or \\o{} is too large\0" - /* 35 */ - "lookbehind is too complicated\0" - "\\C is not allowed in a lookbehind assertion in UTF-" XSTRING(PCRE2_CODE_UNIT_WIDTH) " mode\0" - "PCRE2 does not support \\F, \\L, \\l, \\N{name}, \\U, or \\u\0" - "number after (?C is greater than 255\0" - "closing parenthesis for (?C expected\0" - /* 40 */ - "invalid escape sequence in (*VERB) name\0" - "unrecognized character after (?P\0" - "syntax error in subpattern name (missing terminator?)\0" - "two named subpatterns have the same name (PCRE2_DUPNAMES not set)\0" - "subpattern name must start with a non-digit\0" - /* 45 */ - "this version of PCRE2 does not have support for \\P, \\p, or \\X\0" - "malformed \\P or \\p sequence\0" - "unknown property name after \\P or \\p\0" - "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " code units)\0" - "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0" - /* 50 */ - "invalid 
range in character class\0" - "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0" - "internal error: overran compiling workspace\0" - "internal error: previously-checked referenced subpattern not found\0" - "DEFINE subpattern contains more than one branch\0" - /* 55 */ - "missing opening brace after \\o\0" - "internal error: unknown newline setting\0" - "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0" - "(?R (recursive pattern call) must be followed by a closing parenthesis\0" - /* "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0" */ - "obsolete error (should not occur)\0" /* Was the above */ - /* 60 */ - "(*VERB) not recognized or malformed\0" - "subpattern number is too big\0" - "subpattern name expected\0" - "internal error: parsed pattern overflow\0" - "non-octal character in \\o{} (closing brace missing?)\0" - /* 65 */ - "different names for subpatterns of the same number are not allowed\0" - "(*MARK) must have an argument\0" - "non-hex character in \\x{} (closing brace missing?)\0" -#ifndef EBCDIC - "\\c must be followed by a printable ASCII character\0" -#else - "\\c must be followed by a letter or one of [\\]^_?\0" -#endif - "\\k is not followed by a braced, angle-bracketed, or quoted name\0" - /* 70 */ - "internal error: unknown meta code in check_lookbehinds()\0" - "\\N is not supported in a class\0" - "callout string is too long\0" - "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0" - "using UTF is disabled by the application\0" - /* 75 */ - "using UCP is disabled by the application\0" - "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0" - "character code point value in \\u.... 
sequence is too large\0" - "digits missing in \\x{} or \\o{} or \\N{U+}\0" - "syntax error or number too big in (?(VERSION condition\0" - /* 80 */ - "internal error: unknown opcode in auto_possessify()\0" - "missing terminating delimiter for callout with string argument\0" - "unrecognized string delimiter follows (?C\0" - "using \\C is disabled by the application\0" - "(?| and/or (?J: or (?x: parentheses are too deeply nested\0" - /* 85 */ - "using \\C is disabled in this PCRE2 library\0" - "regular expression is too complicated\0" - "lookbehind assertion is too long\0" - "pattern string is longer than the limit set by the application\0" - "internal error: unknown code in parsed pattern\0" - /* 90 */ - "internal error: bad code value in parsed_skip()\0" - "PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode\0" - "invalid option bits with PCRE2_LITERAL\0" - "\\N{U+dddd} is supported only in Unicode (UTF) mode\0" - "invalid hyphen in option setting\0" - /* 95 */ - "(*alpha_assertion) not recognized\0" - "script runs require Unicode support, which this version of PCRE2 does not have\0" - "too many capturing groups (maximum 65535)\0" - "atomic assertion expected after (?( or (?(?C)\0" - ; - -/* Match-time and UTF error texts are in the same format. 
*/ - -static const unsigned char match_error_texts[] = - "no error\0" - "no match\0" - "partial match\0" - "UTF-8 error: 1 byte missing at end\0" - "UTF-8 error: 2 bytes missing at end\0" - /* 5 */ - "UTF-8 error: 3 bytes missing at end\0" - "UTF-8 error: 4 bytes missing at end\0" - "UTF-8 error: 5 bytes missing at end\0" - "UTF-8 error: byte 2 top bits not 0x80\0" - "UTF-8 error: byte 3 top bits not 0x80\0" - /* 10 */ - "UTF-8 error: byte 4 top bits not 0x80\0" - "UTF-8 error: byte 5 top bits not 0x80\0" - "UTF-8 error: byte 6 top bits not 0x80\0" - "UTF-8 error: 5-byte character is not allowed (RFC 3629)\0" - "UTF-8 error: 6-byte character is not allowed (RFC 3629)\0" - /* 15 */ - "UTF-8 error: code points greater than 0x10ffff are not defined\0" - "UTF-8 error: code points 0xd800-0xdfff are not defined\0" - "UTF-8 error: overlong 2-byte sequence\0" - "UTF-8 error: overlong 3-byte sequence\0" - "UTF-8 error: overlong 4-byte sequence\0" - /* 20 */ - "UTF-8 error: overlong 5-byte sequence\0" - "UTF-8 error: overlong 6-byte sequence\0" - "UTF-8 error: isolated byte with 0x80 bit set\0" - "UTF-8 error: illegal byte (0xfe or 0xff)\0" - "UTF-16 error: missing low surrogate at end\0" - /* 25 */ - "UTF-16 error: invalid low surrogate\0" - "UTF-16 error: isolated low surrogate\0" - "UTF-32 error: code points 0xd800-0xdfff are not defined\0" - "UTF-32 error: code points greater than 0x10ffff are not defined\0" - "bad data value\0" - /* 30 */ - "patterns do not all use the same character tables\0" - "magic number missing\0" - "pattern compiled in wrong mode: 8/16/32-bit error\0" - "bad offset value\0" - "bad option value\0" - /* 35 */ - "invalid replacement string\0" - "bad offset into UTF string\0" - "callout error code\0" /* Never returned by PCRE2 itself */ - "invalid data in workspace for DFA restart\0" - "too much recursion for DFA matching\0" - /* 40 */ - "backreference condition or recursion test is not supported for DFA matching\0" - "function is not supported for 
DFA matching\0" - "pattern contains an item that is not supported for DFA matching\0" - "workspace size exceeded in DFA matching\0" - "internal error - pattern overwritten?\0" - /* 45 */ - "bad JIT option\0" - "JIT stack limit reached\0" - "match limit exceeded\0" - "no more memory\0" - "unknown substring\0" - /* 50 */ - "non-unique substring name\0" - "NULL argument passed\0" - "nested recursion at the same subject position\0" - "matching depth limit exceeded\0" - "requested value is not available\0" - /* 55 */ - "requested value is not set\0" - "offset limit set without PCRE2_USE_OFFSET_LIMIT\0" - "bad escape sequence in replacement string\0" - "expected closing curly bracket in replacement string\0" - "bad substitution in replacement string\0" - /* 60 */ - "match with end before start or start moved backwards is not supported\0" - "too many replacements (more than INT_MAX)\0" - "bad serialized data\0" - "heap limit exceeded\0" - "invalid syntax\0" - /* 65 */ - "internal error - duplicate substitution match\0" - "PCRE2_MATCH_INVALID_UTF is not supported for DFA matching\0" - ; - - -/************************************************* -* Return error message * -*************************************************/ - -/* This function copies an error message into a buffer whose units are of an -appropriate width. Error numbers are positive for compile-time errors, and -negative for match-time errors (except for UTF errors), but the numbers are all -distinct. 
- -Arguments: - enumber error number - buffer where to put the message (zero terminated) - size size of the buffer in code units - -Returns: length of message if all is well - negative on error -*/ - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_get_error_message(int enumber, PCRE2_UCHAR *buffer, PCRE2_SIZE size) -{ -const unsigned char *message; -PCRE2_SIZE i; -int n; - -if (size == 0) return PCRE2_ERROR_NOMEMORY; - -if (enumber >= COMPILE_ERROR_BASE) /* Compile error */ - { - message = compile_error_texts; - n = enumber - COMPILE_ERROR_BASE; - } -else if (enumber < 0) /* Match or UTF error */ - { - message = match_error_texts; - n = -enumber; - } -else /* Invalid error number */ - { - message = (unsigned char *)"\0"; /* Empty message list */ - n = 1; - } - -for (; n > 0; n--) - { - while (*message++ != CHAR_NUL) {}; - if (*message == CHAR_NUL) return PCRE2_ERROR_BADDATA; - } - -for (i = 0; *message != 0; i++) - { - if (i >= size - 1) - { - buffer[i] = 0; /* Terminate partial message */ - return PCRE2_ERROR_NOMEMORY; - } - buffer[i] = *message++; - } - -buffer[i] = 0; -return (int)i; -} - -/* End of pcre2_error.c */ diff --git a/pcre2/src/pcre2_extuni.c b/pcre2/src/pcre2_extuni.c deleted file mode 100644 index 5a719e9cb..000000000 --- a/pcre2/src/pcre2_extuni.c +++ /dev/null @@ -1,148 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. 
- - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2019 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - -/* This module contains an internal function that is used to match a Unicode -extended grapheme sequence. It is used by both pcre2_match() and -pcre2_def_match(). 
However, it is called only when Unicode support is being -compiled. Nevertheless, we provide a dummy function when there is no Unicode -support, because some compilers do not like functionless source files. */ - - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - - -#include "pcre2_internal.h" - - -/* Dummy function */ - -#ifndef SUPPORT_UNICODE -PCRE2_SPTR -PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject, - PCRE2_SPTR end_subject, BOOL utf, int *xcount) -{ -(void)c; -(void)eptr; -(void)start_subject; -(void)end_subject; -(void)utf; -(void)xcount; -return NULL; -} -#else - - -/************************************************* -* Match an extended grapheme sequence * -*************************************************/ - -/* -Arguments: - c the first character - eptr pointer to next character - start_subject pointer to start of subject - end_subject pointer to end of subject - utf TRUE if in UTF mode - xcount pointer to count of additional characters, - or NULL if count not needed - -Returns: pointer after the end of the sequence -*/ - -PCRE2_SPTR -PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject, - PCRE2_SPTR end_subject, BOOL utf, int *xcount) -{ -int lgb = UCD_GRAPHBREAK(c); - -while (eptr < end_subject) - { - int rgb; - int len = 1; - if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } - rgb = UCD_GRAPHBREAK(c); - if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; - - /* Not breaking between Regional Indicators is allowed only if there - are an even number of preceding RIs. 
*/ - - if (lgb == ucp_gbRegionalIndicator && rgb == ucp_gbRegionalIndicator) - { - int ricount = 0; - PCRE2_SPTR bptr = eptr - 1; - if (utf) BACKCHAR(bptr); - - /* bptr is pointing to the left-hand character */ - - while (bptr > start_subject) - { - bptr--; - if (utf) - { - BACKCHAR(bptr); - GETCHAR(c, bptr); - } - else - c = *bptr; - if (UCD_GRAPHBREAK(c) != ucp_gbRegionalIndicator) break; - ricount++; - } - if ((ricount & 1) != 0) break; /* Grapheme break required */ - } - - /* If Extend or ZWJ follows Extended_Pictographic, do not update lgb; this - allows any number of them before a following Extended_Pictographic. */ - - if ((rgb != ucp_gbExtend && rgb != ucp_gbZWJ) || - lgb != ucp_gbExtended_Pictographic) - lgb = rgb; - - eptr += len; - if (xcount != NULL) *xcount += 1; - } - -return eptr; -} - -#endif /* SUPPORT_UNICODE */ - -/* End of pcre2_extuni.c */ diff --git a/pcre2/src/pcre2_find_bracket.c b/pcre2/src/pcre2_find_bracket.c deleted file mode 100644 index 70baa1394..000000000 --- a/pcre2/src/pcre2_find_bracket.c +++ /dev/null @@ -1,219 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2018 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. 
- - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - - -/* This module contains a single function that scans through a compiled pattern -until it finds a capturing bracket with the given number, or, if the number is -negative, an instance of OP_REVERSE for a lookbehind. The function is called -from pcre2_compile.c and also from pcre2_study.c when finding the minimum -matching length. 
*/ - - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include "pcre2_internal.h" - - -/************************************************* -* Scan compiled regex for specific bracket * -*************************************************/ - -/* -Arguments: - code points to start of expression - utf TRUE in UTF mode - number the required bracket number or negative to find a lookbehind - -Returns: pointer to the opcode for the bracket, or NULL if not found -*/ - -PCRE2_SPTR -PRIV(find_bracket)(PCRE2_SPTR code, BOOL utf, int number) -{ -for (;;) - { - PCRE2_UCHAR c = *code; - - if (c == OP_END) return NULL; - - /* XCLASS is used for classes that cannot be represented just by a bit map. - This includes negated single high-valued characters. CALLOUT_STR is used for - callouts with string arguments. In both cases the length in the table is - zero; the actual length is stored in the compiled code. */ - - if (c == OP_XCLASS) code += GET(code, 1); - else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE); - - /* Handle lookbehind */ - - else if (c == OP_REVERSE) - { - if (number < 0) return (PCRE2_UCHAR *)code; - code += PRIV(OP_lengths)[c]; - } - - /* Handle capturing bracket */ - - else if (c == OP_CBRA || c == OP_SCBRA || - c == OP_CBRAPOS || c == OP_SCBRAPOS) - { - int n = (int)GET2(code, 1+LINK_SIZE); - if (n == number) return (PCRE2_UCHAR *)code; - code += PRIV(OP_lengths)[c]; - } - - /* Otherwise, we can get the item's length from the table, except that for - repeated character types, we have to test for \p and \P, which have an extra - two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we - must add in its length. 
*/ - - else - { - switch(c) - { - case OP_TYPESTAR: - case OP_TYPEMINSTAR: - case OP_TYPEPLUS: - case OP_TYPEMINPLUS: - case OP_TYPEQUERY: - case OP_TYPEMINQUERY: - case OP_TYPEPOSSTAR: - case OP_TYPEPOSPLUS: - case OP_TYPEPOSQUERY: - if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; - break; - - case OP_TYPEUPTO: - case OP_TYPEMINUPTO: - case OP_TYPEEXACT: - case OP_TYPEPOSUPTO: - if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) - code += 2; - break; - - case OP_MARK: - case OP_COMMIT_ARG: - case OP_PRUNE_ARG: - case OP_SKIP_ARG: - case OP_THEN_ARG: - code += code[1]; - break; - } - - /* Add in the fixed length from the table */ - - code += PRIV(OP_lengths)[c]; - - /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be - followed by a multi-byte character. The length in the table is a minimum, so - we have to arrange to skip the extra bytes. */ - -#ifdef MAYBE_UTF_MULTI - if (utf) switch(c) - { - case OP_CHAR: - case OP_CHARI: - case OP_NOT: - case OP_NOTI: - case OP_EXACT: - case OP_EXACTI: - case OP_NOTEXACT: - case OP_NOTEXACTI: - case OP_UPTO: - case OP_UPTOI: - case OP_NOTUPTO: - case OP_NOTUPTOI: - case OP_MINUPTO: - case OP_MINUPTOI: - case OP_NOTMINUPTO: - case OP_NOTMINUPTOI: - case OP_POSUPTO: - case OP_POSUPTOI: - case OP_NOTPOSUPTO: - case OP_NOTPOSUPTOI: - case OP_STAR: - case OP_STARI: - case OP_NOTSTAR: - case OP_NOTSTARI: - case OP_MINSTAR: - case OP_MINSTARI: - case OP_NOTMINSTAR: - case OP_NOTMINSTARI: - case OP_POSSTAR: - case OP_POSSTARI: - case OP_NOTPOSSTAR: - case OP_NOTPOSSTARI: - case OP_PLUS: - case OP_PLUSI: - case OP_NOTPLUS: - case OP_NOTPLUSI: - case OP_MINPLUS: - case OP_MINPLUSI: - case OP_NOTMINPLUS: - case OP_NOTMINPLUSI: - case OP_POSPLUS: - case OP_POSPLUSI: - case OP_NOTPOSPLUS: - case OP_NOTPOSPLUSI: - case OP_QUERY: - case OP_QUERYI: - case OP_NOTQUERY: - case OP_NOTQUERYI: - case OP_MINQUERY: - case OP_MINQUERYI: - case OP_NOTMINQUERY: - case OP_NOTMINQUERYI: - case 
OP_POSQUERY: - case OP_POSQUERYI: - case OP_NOTPOSQUERY: - case OP_NOTPOSQUERYI: - if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); - break; - } -#else - (void)(utf); /* Keep compiler happy by referencing function argument */ -#endif /* MAYBE_UTF_MULTI */ - } - } -} - -/* End of pcre2_find_bracket.c */ diff --git a/pcre2/src/pcre2_fuzzsupport.c b/pcre2/src/pcre2_fuzzsupport.c deleted file mode 100644 index 48781ffc0..000000000 --- a/pcre2/src/pcre2_fuzzsupport.c +++ /dev/null @@ -1,365 +0,0 @@ -/*************************************************************************** -Fuzzer driver for PCRE2. Given an arbitrary string of bytes and a length, it -tries to compile and match it, deriving options from the string itself. If -STANDALONE is defined, a main program that calls the driver with the contents -of specified files is compiled, and commentary on what is happening is output. -If an argument starts with '=' the rest of it it is taken as a literal string -rather than a file name. This allows easy testing of short strings. 
- -Written by Philip Hazel, October 2016 -***************************************************************************/ - -#include -#include -#include -#include - -#define PCRE2_CODE_UNIT_WIDTH 8 -#include "pcre2.h" - -#define MAX_MATCH_SIZE 1000 - -#define DFA_WORKSPACE_COUNT 100 - -#define ALLOWED_COMPILE_OPTIONS \ - (PCRE2_ANCHORED|PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \ - PCRE2_ALT_VERBNAMES|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY| \ - PCRE2_DOTALL|PCRE2_DUPNAMES|PCRE2_ENDANCHORED|PCRE2_EXTENDED|PCRE2_FIRSTLINE| \ - PCRE2_MATCH_UNSET_BACKREF|PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C| \ - PCRE2_NO_AUTO_CAPTURE| \ - PCRE2_NO_AUTO_POSSESS|PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE| \ - PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_USE_OFFSET_LIMIT| \ - PCRE2_UTF) - -#define ALLOWED_MATCH_OPTIONS \ - (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \ - PCRE2_NOTEMPTY_ATSTART|PCRE2_PARTIAL_HARD| \ - PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT) - -/* This is the callout function. Its only purpose is to halt matching if there -are more than 100 callouts, as one way of stopping too much time being spent on -fruitless matches. The callout data is a pointer to the counter. */ - -static int callout_function(pcre2_callout_block *cb, void *callout_data) -{ -(void)cb; /* Avoid unused parameter warning */ -*((uint32_t *)callout_data) += 1; -return (*((uint32_t *)callout_data) > 100)? PCRE2_ERROR_CALLOUT : 0; -} - -/* Putting in this apparently unnecessary prototype prevents gcc from giving a -"no previous prototype" warning when compiling at high warning level. */ - -int LLVMFuzzerTestOneInput(const unsigned char *, size_t); - -/* Here's the driving function. 
*/ - -int LLVMFuzzerTestOneInput(const unsigned char *data, size_t size) -{ -uint32_t compile_options; -uint32_t match_options; -pcre2_match_data *match_data = NULL; -pcre2_match_context *match_context = NULL; -size_t match_size; -int dfa_workspace[DFA_WORKSPACE_COUNT]; -int r1, r2; -int i; - -if (size < 1) return 0; - -/* Limiting the length of the subject for matching stops fruitless searches -in large trees taking too much time. */ - -match_size = (size > MAX_MATCH_SIZE)? MAX_MATCH_SIZE : size; - -/* Figure out some options to use. Initialize the random number to ensure -repeatability. Ensure that we get a 32-bit unsigned random number for testing -options. (RAND_MAX is required to be at least 32767, but is commonly -2147483647, which excludes the top bit.) */ - -srand((unsigned int)(data[size/2])); -r1 = rand(); -r2 = rand(); - -/* Ensure that all undefined option bits are zero (waste of time trying them) -and also that PCRE2_NO_UTF_CHECK is unset, as there is no guarantee that the -input is UTF-8. Also unset PCRE2_NEVER_UTF and PCRE2_NEVER_UCP as there is no -reason to disallow UTF and UCP. Force PCRE2_NEVER_BACKSLASH_C to be set because -\C in random patterns is highly likely to cause a crash. */ - -compile_options = - ((((uint32_t)r1 << 16) | ((uint32_t)r2 & 0xffff)) & ALLOWED_COMPILE_OPTIONS) | - PCRE2_NEVER_BACKSLASH_C; - -match_options = - ((((uint32_t)r1 << 16) | ((uint32_t)r2 & 0xffff)) & ALLOWED_MATCH_OPTIONS); - -/* Discard partial matching if PCRE2_ENDANCHORED is set, because they are not -allowed together and just give an immediate error return. */ - -if (((compile_options|match_options) & PCRE2_ENDANCHORED) != 0) - match_options &= ~(PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT); - -/* Do the compile with and without the options, and after a successful compile, -likewise do the match with and without the options. 
*/ - -for (i = 0; i < 2; i++) - { - uint32_t callout_count; - int errorcode; - PCRE2_SIZE erroroffset; - pcre2_code *code; - -#ifdef STANDALONE - printf("Compile options %.8x never_backslash_c", compile_options); - printf("%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", - ((compile_options & PCRE2_ALT_BSUX) != 0)? ",alt_bsux" : "", - ((compile_options & PCRE2_ALT_CIRCUMFLEX) != 0)? ",alt_circumflex" : "", - ((compile_options & PCRE2_ALT_VERBNAMES) != 0)? ",alt_verbnames" : "", - ((compile_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)? ",allow_empty_class" : "", - ((compile_options & PCRE2_ANCHORED) != 0)? ",anchored" : "", - ((compile_options & PCRE2_AUTO_CALLOUT) != 0)? ",auto_callout" : "", - ((compile_options & PCRE2_CASELESS) != 0)? ",caseless" : "", - ((compile_options & PCRE2_DOLLAR_ENDONLY) != 0)? ",dollar_endonly" : "", - ((compile_options & PCRE2_DOTALL) != 0)? ",dotall" : "", - ((compile_options & PCRE2_DUPNAMES) != 0)? ",dupnames" : "", - ((compile_options & PCRE2_ENDANCHORED) != 0)? ",endanchored" : "", - ((compile_options & PCRE2_EXTENDED) != 0)? ",extended" : "", - ((compile_options & PCRE2_FIRSTLINE) != 0)? ",firstline" : "", - ((compile_options & PCRE2_MATCH_UNSET_BACKREF) != 0)? ",match_unset_backref" : "", - ((compile_options & PCRE2_MULTILINE) != 0)? ",multiline" : "", - ((compile_options & PCRE2_NEVER_UCP) != 0)? ",never_ucp" : "", - ((compile_options & PCRE2_NEVER_UTF) != 0)? ",never_utf" : "", - ((compile_options & PCRE2_NO_AUTO_CAPTURE) != 0)? ",no_auto_capture" : "", - ((compile_options & PCRE2_NO_AUTO_POSSESS) != 0)? ",no_auto_possess" : "", - ((compile_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)? ",no_dotstar_anchor" : "", - ((compile_options & PCRE2_NO_UTF_CHECK) != 0)? ",no_utf_check" : "", - ((compile_options & PCRE2_NO_START_OPTIMIZE) != 0)? ",no_start_optimize" : "", - ((compile_options & PCRE2_UCP) != 0)? ",ucp" : "", - ((compile_options & PCRE2_UNGREEDY) != 0)? ",ungreedy" : "", - ((compile_options & PCRE2_USE_OFFSET_LIMIT) != 0)? 
",use_offset_limit" : "", - ((compile_options & PCRE2_UTF) != 0)? ",utf" : ""); -#endif - - code = pcre2_compile((PCRE2_SPTR)data, (PCRE2_SIZE)size, compile_options, - &errorcode, &erroroffset, NULL); - - /* Compilation succeeded */ - - if (code != NULL) - { - int j; - uint32_t save_match_options = match_options; - - /* Create match data and context blocks only when we first need them. Set - low match and depth limits to avoid wasting too much searching large - pattern trees. Almost all matches are going to fail. */ - - if (match_data == NULL) - { - match_data = pcre2_match_data_create(32, NULL); - if (match_data == NULL) - { -#ifdef STANDALONE - printf("** Failed to create match data block\n"); -#endif - return 0; - } - } - - if (match_context == NULL) - { - match_context = pcre2_match_context_create(NULL); - if (match_context == NULL) - { -#ifdef STANDALONE - printf("** Failed to create match context block\n"); -#endif - return 0; - } - (void)pcre2_set_match_limit(match_context, 100); - (void)pcre2_set_depth_limit(match_context, 100); - (void)pcre2_set_callout(match_context, callout_function, &callout_count); - } - - /* Match twice, with and without options. */ - - for (j = 0; j < 2; j++) - { -#ifdef STANDALONE - printf("Match options %.8x", match_options); - printf("%s%s%s%s%s%s%s%s%s%s\n", - ((match_options & PCRE2_ANCHORED) != 0)? ",anchored" : "", - ((match_options & PCRE2_ENDANCHORED) != 0)? ",endanchored" : "", - ((match_options & PCRE2_NO_JIT) != 0)? ",no_jit" : "", - ((match_options & PCRE2_NO_UTF_CHECK) != 0)? ",no_utf_check" : "", - ((match_options & PCRE2_NOTBOL) != 0)? ",notbol" : "", - ((match_options & PCRE2_NOTEMPTY) != 0)? ",notempty" : "", - ((match_options & PCRE2_NOTEMPTY_ATSTART) != 0)? ",notempty_atstart" : "", - ((match_options & PCRE2_NOTEOL) != 0)? ",noteol" : "", - ((match_options & PCRE2_PARTIAL_HARD) != 0)? ",partial_hard" : "", - ((match_options & PCRE2_PARTIAL_SOFT) != 0)? 
",partial_soft" : ""); -#endif - - callout_count = 0; - errorcode = pcre2_match(code, (PCRE2_SPTR)data, (PCRE2_SIZE)match_size, 0, - match_options, match_data, match_context); - -#ifdef STANDALONE - if (errorcode >= 0) printf("Match returned %d\n", errorcode); else - { - unsigned char buffer[256]; - pcre2_get_error_message(errorcode, buffer, 256); - printf("Match failed: error %d: %s\n", errorcode, buffer); - } -#endif - - match_options = 0; /* For second time */ - } - - /* Match with DFA twice, with and without options. */ - - match_options = save_match_options & ~PCRE2_NO_JIT; /* Not valid for DFA */ - - for (j = 0; j < 2; j++) - { -#ifdef STANDALONE - printf("DFA match options %.8x", match_options); - printf("%s%s%s%s%s%s%s%s%s\n", - ((match_options & PCRE2_ANCHORED) != 0)? ",anchored" : "", - ((match_options & PCRE2_ENDANCHORED) != 0)? ",endanchored" : "", - ((match_options & PCRE2_NO_UTF_CHECK) != 0)? ",no_utf_check" : "", - ((match_options & PCRE2_NOTBOL) != 0)? ",notbol" : "", - ((match_options & PCRE2_NOTEMPTY) != 0)? ",notempty" : "", - ((match_options & PCRE2_NOTEMPTY_ATSTART) != 0)? ",notempty_atstart" : "", - ((match_options & PCRE2_NOTEOL) != 0)? ",noteol" : "", - ((match_options & PCRE2_PARTIAL_HARD) != 0)? ",partial_hard" : "", - ((match_options & PCRE2_PARTIAL_SOFT) != 0)? 
",partial_soft" : ""); -#endif - - callout_count = 0; - errorcode = pcre2_dfa_match(code, (PCRE2_SPTR)data, - (PCRE2_SIZE)match_size, 0, match_options, match_data, match_context, - dfa_workspace, DFA_WORKSPACE_COUNT); - -#ifdef STANDALONE - if (errorcode >= 0) printf("Match returned %d\n", errorcode); else - { - unsigned char buffer[256]; - pcre2_get_error_message(errorcode, buffer, 256); - printf("Match failed: error %d: %s\n", errorcode, buffer); - } -#endif - - match_options = 0; /* For second time */ - } - - match_options = save_match_options; /* Reset for the second compile */ - pcre2_code_free(code); - } - - /* Compilation failed */ - - else - { - unsigned char buffer[256]; - pcre2_get_error_message(errorcode, buffer, 256); -#ifdef STANDALONE - printf("Error %d at offset %lu: %s\n", errorcode, erroroffset, buffer); -#else - if (strstr((const char *)buffer, "internal error") != NULL) abort(); -#endif - } - - compile_options = PCRE2_NEVER_BACKSLASH_C; /* For second time */ - } - -if (match_data != NULL) pcre2_match_data_free(match_data); -if (match_context != NULL) pcre2_match_context_free(match_context); - -return 0; -} - - -/* Optional main program. */ - -#ifdef STANDALONE -int main(int argc, char **argv) -{ -int i; - -if (argc < 2) - { - printf("** No arguments given\n"); - return 0; - } - -for (i = 1; i < argc; i++) - { - size_t filelen; - size_t readsize; - unsigned char *buffer; - FILE *f; - - /* Handle a literal string. Copy to an exact size buffer so that checks for - overrunning work. 
*/ - - if (argv[i][0] == '=') - { - readsize = strlen(argv[i]) - 1; - printf("------ ------\n"); - printf("Length = %lu\n", readsize); - printf("%.*s\n", (int)readsize, argv[i]+1); - buffer = (unsigned char *)malloc(readsize); - if (buffer == NULL) - printf("** Failed to allocate %lu bytes of memory\n", readsize); - else - { - memcpy(buffer, argv[i]+1, readsize); - LLVMFuzzerTestOneInput(buffer, readsize); - free(buffer); - } - continue; - } - - /* Handle a string given in a file */ - - f = fopen(argv[i], "rb"); - if (f == NULL) - { - printf("** Failed to open %s: %s\n", argv[i], strerror(errno)); - continue; - } - - printf("------ %s ------\n", argv[i]); - - fseek(f, 0, SEEK_END); - filelen = ftell(f); - fseek(f, 0, SEEK_SET); - - buffer = (unsigned char *)malloc(filelen); - if (buffer == NULL) - { - printf("** Failed to allocate %lu bytes of memory\n", filelen); - fclose(f); - continue; - } - - readsize = fread(buffer, 1, filelen, f); - fclose(f); - - if (readsize != filelen) - printf("** File size is %lu but fread() returned %lu\n", filelen, readsize); - else - { - printf("Length = %lu\n", filelen); - LLVMFuzzerTestOneInput(buffer, filelen); - } - free(buffer); - } - -return 0; -} -#endif /* STANDALONE */ - -/* End */ diff --git a/pcre2/src/pcre2_internal.h b/pcre2/src/pcre2_internal.h deleted file mode 100644 index d8fad1e93..000000000 --- a/pcre2/src/pcre2_internal.h +++ /dev/null @@ -1,2004 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE2 is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. 
- - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2020 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - -#ifndef PCRE2_INTERNAL_H_IDEMPOTENT_GUARD -#define PCRE2_INTERNAL_H_IDEMPOTENT_GUARD - -/* We do not support both EBCDIC and Unicode at the same time. 
The "configure" -script prevents both being selected, but not everybody uses "configure". EBCDIC -is only supported for the 8-bit library, but the check for this has to be later -in this file, because the first part is not width-dependent, and is included by -pcre2test.c with CODE_UNIT_WIDTH == 0. */ - -#if defined EBCDIC && defined SUPPORT_UNICODE -#error The use of both EBCDIC and SUPPORT_UNICODE is not supported. -#endif - -/* Standard C headers */ - -#include -#include -#include -#include -#include -#include - -/* Macros to make boolean values more obvious. The #ifndef is to pacify -compiler warnings in environments where these macros are defined elsewhere. -Unfortunately, there is no way to do the same for the typedef. */ - -typedef int BOOL; -#ifndef FALSE -#define FALSE 0 -#define TRUE 1 -#endif - -/* Valgrind (memcheck) support */ - -#ifdef SUPPORT_VALGRIND -#include -#endif - -/* -ftrivial-auto-var-init support supports initializing all local variables -to avoid some classes of bug, but this can cause an unacceptable slowdown -for large on-stack arrays in hot functions. This macro lets us annotate -such arrays. */ - -#ifdef HAVE_ATTRIBUTE_UNINITIALIZED -#define PCRE2_KEEP_UNINITIALIZED __attribute__((uninitialized)) -#else -#define PCRE2_KEEP_UNINITIALIZED -#endif - -/* Older versions of MSVC lack snprintf(). This define allows for -warning/error-free compilation and testing with MSVC compilers back to at least -MSVC 10/2010. Except for VC6 (which is missing some fundamentals and fails). */ - -#if defined(_MSC_VER) && (_MSC_VER < 1900) -#define snprintf _snprintf -#endif - -/* When compiling a DLL for Windows, the exported symbols have to be declared -using some MS magic. I found some useful information on this web page: -http://msdn2.microsoft.com/en-us/library/y4h7bcy6(VS.80).aspx. According to the -information there, using __declspec(dllexport) without "extern" we have a -definition; with "extern" we have a declaration. 
The settings here override the -setting in pcre2.h (which is included below); it defines only PCRE2_EXP_DECL, -which is all that is needed for applications (they just import the symbols). We -use: - - PCRE2_EXP_DECL for declarations - PCRE2_EXP_DEFN for definitions - -The reason for wrapping this in #ifndef PCRE2_EXP_DECL is so that pcre2test, -which is an application, but needs to import this file in order to "peek" at -internals, can #include pcre2.h first to get an application's-eye view. - -In principle, people compiling for non-Windows, non-Unix-like (i.e. uncommon, -special-purpose environments) might want to stick other stuff in front of -exported symbols. That's why, in the non-Windows case, we set PCRE2_EXP_DEFN -only if it is not already set. */ - -#ifndef PCRE2_EXP_DECL -# ifdef _WIN32 -# ifndef PCRE2_STATIC -# define PCRE2_EXP_DECL extern __declspec(dllexport) -# define PCRE2_EXP_DEFN __declspec(dllexport) -# else -# define PCRE2_EXP_DECL extern -# define PCRE2_EXP_DEFN -# endif -# else -# ifdef __cplusplus -# define PCRE2_EXP_DECL extern "C" -# else -# define PCRE2_EXP_DECL extern -# endif -# ifndef PCRE2_EXP_DEFN -# define PCRE2_EXP_DEFN PCRE2_EXP_DECL -# endif -# endif -#endif - -/* Include the public PCRE2 header and the definitions of UCP character -property values. This must follow the setting of PCRE2_EXP_DECL above. */ - -#include "pcre2.h" -#include "pcre2_ucp.h" - -/* When PCRE2 is compiled as a C++ library, the subject pointer can be replaced -with a custom type. This makes it possible, for example, to allow pcre2_match() -to process subject strings that are discontinuous by using a smart pointer -class. It must always be possible to inspect all of the subject string in -pcre2_match() because of the way it backtracks. */ - -/* WARNING: This is as yet untested for PCRE2. 
*/ - -#ifdef CUSTOM_SUBJECT_PTR -#undef PCRE2_SPTR -#define PCRE2_SPTR CUSTOM_SUBJECT_PTR -#endif - -/* When checking for integer overflow in pcre2_compile(), we need to handle -large integers. If a 64-bit integer type is available, we can use that. -Otherwise we have to cast to double, which of course requires floating point -arithmetic. Handle this by defining a macro for the appropriate type. */ - -#if defined INT64_MAX || defined int64_t -#define INT64_OR_DOUBLE int64_t -#else -#define INT64_OR_DOUBLE double -#endif - -/* External (in the C sense) functions and tables that are private to the -libraries are always referenced using the PRIV macro. This makes it possible -for pcre2test.c to include some of the source files from the libraries using a -different PRIV definition to avoid name clashes. It also makes it clear in the -code that a non-static object is being referenced. */ - -#ifndef PRIV -#define PRIV(name) _pcre2_##name -#endif - -/* When compiling for use with the Virtual Pascal compiler, these functions -need to have their names changed. PCRE2 must be compiled with the -DVPCOMPAT -option on the command line. */ - -#ifdef VPCOMPAT -#define strlen(s) _strlen(s) -#define strncmp(s1,s2,m) _strncmp(s1,s2,m) -#define memcmp(s,c,n) _memcmp(s,c,n) -#define memcpy(d,s,n) _memcpy(d,s,n) -#define memmove(d,s,n) _memmove(d,s,n) -#define memset(s,c,n) _memset(s,c,n) -#else /* VPCOMPAT */ - -/* Otherwise, to cope with SunOS4 and other systems that lack memmove(), define -a macro that calls an emulating function. */ - -#ifndef HAVE_MEMMOVE -#undef memmove /* Some systems may have a macro */ -#define memmove(a, b, c) PRIV(memmove)(a, b, c) -#endif /* not HAVE_MEMMOVE */ -#endif /* not VPCOMPAT */ - -/* This is an unsigned int value that no UTF character can ever have, as -Unicode doesn't go beyond 0x0010ffff. */ - -#define NOTACHAR 0xffffffff - -/* This is the largest valid UTF/Unicode code point. 
*/ - -#define MAX_UTF_CODE_POINT 0x10ffff - -/* Compile-time positive error numbers (all except UTF errors, which are -negative) start at this value. It should probably never be changed, in case -some application is checking for specific numbers. There is a copy of this -#define in pcre2posix.c (which now no longer includes this file). Ideally, a -way of having a single definition should be found, but as the number is -unlikely to change, this is not a pressing issue. The original reason for -having a base other than 0 was to keep the absolute values of compile-time and -run-time error numbers numerically different, but in the event the code does -not rely on this. */ - -#define COMPILE_ERROR_BASE 100 - -/* The initial frames vector for remembering backtracking points in -pcre2_match() is allocated on the system stack, of this size (bytes). The size -must be a multiple of sizeof(PCRE2_SPTR) in all environments, so making it a -multiple of 8 is best. Typical frame sizes are a few hundred bytes (it depends -on the number of capturing parentheses) so 20KiB handles quite a few frames. A -larger vector on the heap is obtained for patterns that need more frames. The -maximum size of this can be limited. */ - -#define START_FRAMES_SIZE 20480 - -/* Similarly, for DFA matching, an initial internal workspace vector is -allocated on the stack. */ - -#define DFA_START_RWS_SIZE 30720 - -/* Define the default BSR convention. */ - -#ifdef BSR_ANYCRLF -#define BSR_DEFAULT PCRE2_BSR_ANYCRLF -#else -#define BSR_DEFAULT PCRE2_BSR_UNICODE -#endif - - -/* ---------------- Basic UTF-8 macros ---------------- */ - -/* These UTF-8 macros are always defined because they are used in pcre2test for -handling wide characters in 16-bit and 32-bit modes, even if an 8-bit library -is not supported. */ - -/* Tests whether a UTF-8 code point needs extra bytes to decode. 
*/ - -#define HASUTF8EXTRALEN(c) ((c) >= 0xc0) - -/* The following macros were originally written in the form of loops that used -data from the tables whose names start with PRIV(utf8_table). They were -rewritten by a user so as not to use loops, because in some environments this -gives a significant performance advantage, and it seems never to do any harm. -*/ - -/* Base macro to pick up the remaining bytes of a UTF-8 character, not -advancing the pointer. */ - -#define GETUTF8(c, eptr) \ - { \ - if ((c & 0x20u) == 0) \ - c = ((c & 0x1fu) << 6) | (eptr[1] & 0x3fu); \ - else if ((c & 0x10u) == 0) \ - c = ((c & 0x0fu) << 12) | ((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \ - else if ((c & 0x08u) == 0) \ - c = ((c & 0x07u) << 18) | ((eptr[1] & 0x3fu) << 12) | \ - ((eptr[2] & 0x3fu) << 6) | (eptr[3] & 0x3fu); \ - else if ((c & 0x04u) == 0) \ - c = ((c & 0x03u) << 24) | ((eptr[1] & 0x3fu) << 18) | \ - ((eptr[2] & 0x3fu) << 12) | ((eptr[3] & 0x3fu) << 6) | \ - (eptr[4] & 0x3fu); \ - else \ - c = ((c & 0x01u) << 30) | ((eptr[1] & 0x3fu) << 24) | \ - ((eptr[2] & 0x3fu) << 18) | ((eptr[3] & 0x3fu) << 12) | \ - ((eptr[4] & 0x3fu) << 6) | (eptr[5] & 0x3fu); \ - } - -/* Base macro to pick up the remaining bytes of a UTF-8 character, advancing -the pointer. 
*/ - -#define GETUTF8INC(c, eptr) \ - { \ - if ((c & 0x20u) == 0) \ - c = ((c & 0x1fu) << 6) | (*eptr++ & 0x3fu); \ - else if ((c & 0x10u) == 0) \ - { \ - c = ((c & 0x0fu) << 12) | ((*eptr & 0x3fu) << 6) | (eptr[1] & 0x3fu); \ - eptr += 2; \ - } \ - else if ((c & 0x08u) == 0) \ - { \ - c = ((c & 0x07u) << 18) | ((*eptr & 0x3fu) << 12) | \ - ((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \ - eptr += 3; \ - } \ - else if ((c & 0x04u) == 0) \ - { \ - c = ((c & 0x03u) << 24) | ((*eptr & 0x3fu) << 18) | \ - ((eptr[1] & 0x3fu) << 12) | ((eptr[2] & 0x3fu) << 6) | \ - (eptr[3] & 0x3fu); \ - eptr += 4; \ - } \ - else \ - { \ - c = ((c & 0x01u) << 30) | ((*eptr & 0x3fu) << 24) | \ - ((eptr[1] & 0x3fu) << 18) | ((eptr[2] & 0x3fu) << 12) | \ - ((eptr[3] & 0x3fu) << 6) | (eptr[4] & 0x3fu); \ - eptr += 5; \ - } \ - } - -/* Base macro to pick up the remaining bytes of a UTF-8 character, not -advancing the pointer, incrementing the length. */ - -#define GETUTF8LEN(c, eptr, len) \ - { \ - if ((c & 0x20u) == 0) \ - { \ - c = ((c & 0x1fu) << 6) | (eptr[1] & 0x3fu); \ - len++; \ - } \ - else if ((c & 0x10u) == 0) \ - { \ - c = ((c & 0x0fu) << 12) | ((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \ - len += 2; \ - } \ - else if ((c & 0x08u) == 0) \ - {\ - c = ((c & 0x07u) << 18) | ((eptr[1] & 0x3fu) << 12) | \ - ((eptr[2] & 0x3fu) << 6) | (eptr[3] & 0x3fu); \ - len += 3; \ - } \ - else if ((c & 0x04u) == 0) \ - { \ - c = ((c & 0x03u) << 24) | ((eptr[1] & 0x3fu) << 18) | \ - ((eptr[2] & 0x3fu) << 12) | ((eptr[3] & 0x3fu) << 6) | \ - (eptr[4] & 0x3fu); \ - len += 4; \ - } \ - else \ - {\ - c = ((c & 0x01u) << 30) | ((eptr[1] & 0x3fu) << 24) | \ - ((eptr[2] & 0x3fu) << 18) | ((eptr[3] & 0x3fu) << 12) | \ - ((eptr[4] & 0x3fu) << 6) | (eptr[5] & 0x3fu); \ - len += 5; \ - } \ - } - -/* --------------- Whitespace macros ---------------- */ - -/* Tests for Unicode horizontal and vertical whitespace characters must check a -number of different values. 
Using a switch statement for this generates the -fastest code (no loop, no memory access), and there are several places in the -interpreter code where this happens. In order to ensure that all the case lists -remain in step, we use macros so that there is only one place where the lists -are defined. - -These values are also required as lists in pcre2_compile.c when processing \h, -\H, \v and \V in a character class. The lists are defined in pcre2_tables.c, -but macros that define the values are here so that all the definitions are -together. The lists must be in ascending character order, terminated by -NOTACHAR (which is 0xffffffff). - -Any changes should ensure that the various macros are kept in step with each -other. NOTE: The values also appear in pcre2_jit_compile.c. */ - -/* -------------- ASCII/Unicode environments -------------- */ - -#ifndef EBCDIC - -/* Character U+180E (Mongolian Vowel Separator) is not included in the list of -spaces in the Unicode file PropList.txt, and Perl does not recognize it as a -space. However, in many other sources it is listed as a space and has been in -PCRE (both APIs) for a long time. 
*/ - -#define HSPACE_LIST \ - CHAR_HT, CHAR_SPACE, CHAR_NBSP, \ - 0x1680, 0x180e, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, \ - 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202f, 0x205f, 0x3000, \ - NOTACHAR - -#define HSPACE_MULTIBYTE_CASES \ - case 0x1680: /* OGHAM SPACE MARK */ \ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ \ - case 0x2000: /* EN QUAD */ \ - case 0x2001: /* EM QUAD */ \ - case 0x2002: /* EN SPACE */ \ - case 0x2003: /* EM SPACE */ \ - case 0x2004: /* THREE-PER-EM SPACE */ \ - case 0x2005: /* FOUR-PER-EM SPACE */ \ - case 0x2006: /* SIX-PER-EM SPACE */ \ - case 0x2007: /* FIGURE SPACE */ \ - case 0x2008: /* PUNCTUATION SPACE */ \ - case 0x2009: /* THIN SPACE */ \ - case 0x200A: /* HAIR SPACE */ \ - case 0x202f: /* NARROW NO-BREAK SPACE */ \ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ \ - case 0x3000 /* IDEOGRAPHIC SPACE */ - -#define HSPACE_BYTE_CASES \ - case CHAR_HT: \ - case CHAR_SPACE: \ - case CHAR_NBSP - -#define HSPACE_CASES \ - HSPACE_BYTE_CASES: \ - HSPACE_MULTIBYTE_CASES - -#define VSPACE_LIST \ - CHAR_LF, CHAR_VT, CHAR_FF, CHAR_CR, CHAR_NEL, 0x2028, 0x2029, NOTACHAR - -#define VSPACE_MULTIBYTE_CASES \ - case 0x2028: /* LINE SEPARATOR */ \ - case 0x2029 /* PARAGRAPH SEPARATOR */ - -#define VSPACE_BYTE_CASES \ - case CHAR_LF: \ - case CHAR_VT: \ - case CHAR_FF: \ - case CHAR_CR: \ - case CHAR_NEL - -#define VSPACE_CASES \ - VSPACE_BYTE_CASES: \ - VSPACE_MULTIBYTE_CASES - -/* -------------- EBCDIC environments -------------- */ - -#else -#define HSPACE_LIST CHAR_HT, CHAR_SPACE, CHAR_NBSP, NOTACHAR - -#define HSPACE_BYTE_CASES \ - case CHAR_HT: \ - case CHAR_SPACE: \ - case CHAR_NBSP - -#define HSPACE_CASES HSPACE_BYTE_CASES - -#ifdef EBCDIC_NL25 -#define VSPACE_LIST \ - CHAR_VT, CHAR_FF, CHAR_CR, CHAR_NEL, CHAR_LF, NOTACHAR -#else -#define VSPACE_LIST \ - CHAR_VT, CHAR_FF, CHAR_CR, CHAR_LF, CHAR_NEL, NOTACHAR -#endif - -#define VSPACE_BYTE_CASES \ - case CHAR_LF: \ - case CHAR_VT: \ - case CHAR_FF: \ - case CHAR_CR: \ - case 
CHAR_NEL - -#define VSPACE_CASES VSPACE_BYTE_CASES -#endif /* EBCDIC */ - -/* -------------- End of whitespace macros -------------- */ - - -/* PCRE2 is able to support several different kinds of newline (CR, LF, CRLF, -"any" and "anycrlf" at present). The following macros are used to package up -testing for newlines. NLBLOCK, PSSTART, and PSEND are defined in the various -modules to indicate in which datablock the parameters exist, and what the -start/end of string field names are. */ - -#define NLTYPE_FIXED 0 /* Newline is a fixed length string */ -#define NLTYPE_ANY 1 /* Newline is any Unicode line ending */ -#define NLTYPE_ANYCRLF 2 /* Newline is CR, LF, or CRLF */ - -/* This macro checks for a newline at the given position */ - -#define IS_NEWLINE(p) \ - ((NLBLOCK->nltype != NLTYPE_FIXED)? \ - ((p) < NLBLOCK->PSEND && \ - PRIV(is_newline)((p), NLBLOCK->nltype, NLBLOCK->PSEND, \ - &(NLBLOCK->nllen), utf)) \ - : \ - ((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \ - UCHAR21TEST(p) == NLBLOCK->nl[0] && \ - (NLBLOCK->nllen == 1 || UCHAR21TEST(p+1) == NLBLOCK->nl[1]) \ - ) \ - ) - -/* This macro checks for a newline immediately preceding the given position */ - -#define WAS_NEWLINE(p) \ - ((NLBLOCK->nltype != NLTYPE_FIXED)? \ - ((p) > NLBLOCK->PSSTART && \ - PRIV(was_newline)((p), NLBLOCK->nltype, NLBLOCK->PSSTART, \ - &(NLBLOCK->nllen), utf)) \ - : \ - ((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \ - UCHAR21TEST(p - NLBLOCK->nllen) == NLBLOCK->nl[0] && \ - (NLBLOCK->nllen == 1 || UCHAR21TEST(p - NLBLOCK->nllen + 1) == NLBLOCK->nl[1]) \ - ) \ - ) - -/* Private flags containing information about the compiled pattern. The first -three must not be changed, because whichever is set is actually the number of -bytes in a code unit in that mode. 
*/ - -#define PCRE2_MODE8 0x00000001 /* compiled in 8 bit mode */ -#define PCRE2_MODE16 0x00000002 /* compiled in 16 bit mode */ -#define PCRE2_MODE32 0x00000004 /* compiled in 32 bit mode */ -#define PCRE2_FIRSTSET 0x00000010 /* first_code unit is set */ -#define PCRE2_FIRSTCASELESS 0x00000020 /* caseless first code unit */ -#define PCRE2_FIRSTMAPSET 0x00000040 /* bitmap of first code units is set */ -#define PCRE2_LASTSET 0x00000080 /* last code unit is set */ -#define PCRE2_LASTCASELESS 0x00000100 /* caseless last code unit */ -#define PCRE2_STARTLINE 0x00000200 /* start after \n for multiline */ -#define PCRE2_JCHANGED 0x00000400 /* j option used in pattern */ -#define PCRE2_HASCRORLF 0x00000800 /* explicit \r or \n in pattern */ -#define PCRE2_HASTHEN 0x00001000 /* pattern contains (*THEN) */ -#define PCRE2_MATCH_EMPTY 0x00002000 /* pattern can match empty string */ -#define PCRE2_BSR_SET 0x00004000 /* BSR was set in the pattern */ -#define PCRE2_NL_SET 0x00008000 /* newline was set in the pattern */ -#define PCRE2_NOTEMPTY_SET 0x00010000 /* (*NOTEMPTY) used ) keep */ -#define PCRE2_NE_ATST_SET 0x00020000 /* (*NOTEMPTY_ATSTART) used) together */ -#define PCRE2_DEREF_TABLES 0x00040000 /* release character tables */ -#define PCRE2_NOJIT 0x00080000 /* (*NOJIT) used */ -#define PCRE2_HASBKPORX 0x00100000 /* contains \P, \p, or \X */ -#define PCRE2_DUPCAPUSED 0x00200000 /* contains (?| */ -#define PCRE2_HASBKC 0x00400000 /* contains \C */ -#define PCRE2_HASACCEPT 0x00800000 /* contains (*ACCEPT) */ - -#define PCRE2_MODE_MASK (PCRE2_MODE8 | PCRE2_MODE16 | PCRE2_MODE32) - -/* Values for the matchedby field in a match data block. */ - -enum { PCRE2_MATCHEDBY_INTERPRETER, /* pcre2_match() */ - PCRE2_MATCHEDBY_DFA_INTERPRETER, /* pcre2_dfa_match() */ - PCRE2_MATCHEDBY_JIT }; /* pcre2_jit_match() */ - -/* Values for the flags field in a match data block. */ - -#define PCRE2_MD_COPIED_SUBJECT 0x01u - -/* Magic number to provide a small check against being handed junk. 
*/ - -#define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */ - -/* The maximum remaining length of subject we are prepared to search for a -req_unit match from an anchored pattern. In 8-bit mode, memchr() is used and is -much faster than the search loop that has to be used in 16-bit and 32-bit -modes. */ - -#if PCRE2_CODE_UNIT_WIDTH == 8 -#define REQ_CU_MAX 5000 -#else -#define REQ_CU_MAX 2000 -#endif - -/* Offsets for the bitmap tables in the cbits set of tables. Each table -contains a set of bits for a class map. Some classes are built by combining -these tables. */ - -#define cbit_space 0 /* [:space:] or \s */ -#define cbit_xdigit 32 /* [:xdigit:] */ -#define cbit_digit 64 /* [:digit:] or \d */ -#define cbit_upper 96 /* [:upper:] */ -#define cbit_lower 128 /* [:lower:] */ -#define cbit_word 160 /* [:word:] or \w */ -#define cbit_graph 192 /* [:graph:] */ -#define cbit_print 224 /* [:print:] */ -#define cbit_punct 256 /* [:punct:] */ -#define cbit_cntrl 288 /* [:cntrl:] */ -#define cbit_length 320 /* Length of the cbits table */ - -/* Bit definitions for entries in the ctypes table. Do not change these values -without checking pcre2_jit_compile.c, which has an assertion to ensure that -ctype_word has the value 16. */ - -#define ctype_space 0x01 -#define ctype_letter 0x02 -#define ctype_lcletter 0x04 -#define ctype_digit 0x08 -#define ctype_word 0x10 /* alphanumeric or '_' */ - -/* Offsets of the various tables from the base tables pointer, and -total length of the tables. 
*/ - -#define lcc_offset 0 /* Lower case */ -#define fcc_offset 256 /* Flip case */ -#define cbits_offset 512 /* Character classes */ -#define ctypes_offset (cbits_offset + cbit_length) /* Character types */ -#define TABLES_LENGTH (ctypes_offset + 256) - - -/* -------------------- Character and string names ------------------------ */ - -/* If PCRE2 is to support UTF-8 on EBCDIC platforms, we cannot use normal -character constants like '*' because the compiler would emit their EBCDIC code, -which is different from their ASCII/UTF-8 code. Instead we define macros for -the characters so that they always use the ASCII/UTF-8 code when UTF-8 support -is enabled. When UTF-8 support is not enabled, the definitions use character -literals. Both character and string versions of each character are needed, and -there are some longer strings as well. - -This means that, on EBCDIC platforms, the PCRE2 library can handle either -EBCDIC, or UTF-8, but not both. To support both in the same compiled library -would need different lookups depending on whether PCRE2_UTF was set or not. -This would make it impossible to use characters in switch/case statements, -which would reduce performance. For a theoretical use (which nobody has asked -for) in a minority area (EBCDIC platforms), this is not sensible. Any -application that did need both could compile two versions of the library, using -macros to give the functions distinct names. */ - -#ifndef SUPPORT_UNICODE - -/* UTF-8 support is not enabled; use the platform-dependent character literals -so that PCRE2 works in both ASCII and EBCDIC environments, but only in non-UTF -mode. Newline characters are problematic in EBCDIC. Though it has CR and LF -characters, a common practice has been to use its NL (0x15) character as the -line terminator in C-like processing environments. 
However, sometimes the LF -(0x25) character is used instead, according to this Unicode document: - -http://unicode.org/standard/reports/tr13/tr13-5.html - -PCRE2 defaults EBCDIC NL to 0x15, but has a build-time option to select 0x25 -instead. Whichever is *not* chosen is defined as NEL. - -In both ASCII and EBCDIC environments, CHAR_NL and CHAR_LF are synonyms for the -same code point. */ - -#ifdef EBCDIC - -#ifndef EBCDIC_NL25 -#define CHAR_NL '\x15' -#define CHAR_NEL '\x25' -#define STR_NL "\x15" -#define STR_NEL "\x25" -#else -#define CHAR_NL '\x25' -#define CHAR_NEL '\x15' -#define STR_NL "\x25" -#define STR_NEL "\x15" -#endif - -#define CHAR_LF CHAR_NL -#define STR_LF STR_NL - -#define CHAR_ESC '\047' -#define CHAR_DEL '\007' -#define CHAR_NBSP ((unsigned char)'\x41') -#define STR_ESC "\047" -#define STR_DEL "\007" - -#else /* Not EBCDIC */ - -/* In ASCII/Unicode, linefeed is '\n' and we equate this to NL for -compatibility. NEL is the Unicode newline character; make sure it is -a positive value. */ - -#define CHAR_LF '\n' -#define CHAR_NL CHAR_LF -#define CHAR_NEL ((unsigned char)'\x85') -#define CHAR_ESC '\033' -#define CHAR_DEL '\177' -#define CHAR_NBSP ((unsigned char)'\xa0') - -#define STR_LF "\n" -#define STR_NL STR_LF -#define STR_NEL "\x85" -#define STR_ESC "\033" -#define STR_DEL "\177" - -#endif /* EBCDIC */ - -/* The remaining definitions work in both environments. */ - -#define CHAR_NUL '\0' -#define CHAR_HT '\t' -#define CHAR_VT '\v' -#define CHAR_FF '\f' -#define CHAR_CR '\r' -#define CHAR_BS '\b' -#define CHAR_BEL '\a' - -#define CHAR_SPACE ' ' -#define CHAR_EXCLAMATION_MARK '!' -#define CHAR_QUOTATION_MARK '"' -#define CHAR_NUMBER_SIGN '#' -#define CHAR_DOLLAR_SIGN '$' -#define CHAR_PERCENT_SIGN '%' -#define CHAR_AMPERSAND '&' -#define CHAR_APOSTROPHE '\'' -#define CHAR_LEFT_PARENTHESIS '(' -#define CHAR_RIGHT_PARENTHESIS ')' -#define CHAR_ASTERISK '*' -#define CHAR_PLUS '+' -#define CHAR_COMMA ',' -#define CHAR_MINUS '-' -#define CHAR_DOT '.' 
-#define CHAR_SLASH '/' -#define CHAR_0 '0' -#define CHAR_1 '1' -#define CHAR_2 '2' -#define CHAR_3 '3' -#define CHAR_4 '4' -#define CHAR_5 '5' -#define CHAR_6 '6' -#define CHAR_7 '7' -#define CHAR_8 '8' -#define CHAR_9 '9' -#define CHAR_COLON ':' -#define CHAR_SEMICOLON ';' -#define CHAR_LESS_THAN_SIGN '<' -#define CHAR_EQUALS_SIGN '=' -#define CHAR_GREATER_THAN_SIGN '>' -#define CHAR_QUESTION_MARK '?' -#define CHAR_COMMERCIAL_AT '@' -#define CHAR_A 'A' -#define CHAR_B 'B' -#define CHAR_C 'C' -#define CHAR_D 'D' -#define CHAR_E 'E' -#define CHAR_F 'F' -#define CHAR_G 'G' -#define CHAR_H 'H' -#define CHAR_I 'I' -#define CHAR_J 'J' -#define CHAR_K 'K' -#define CHAR_L 'L' -#define CHAR_M 'M' -#define CHAR_N 'N' -#define CHAR_O 'O' -#define CHAR_P 'P' -#define CHAR_Q 'Q' -#define CHAR_R 'R' -#define CHAR_S 'S' -#define CHAR_T 'T' -#define CHAR_U 'U' -#define CHAR_V 'V' -#define CHAR_W 'W' -#define CHAR_X 'X' -#define CHAR_Y 'Y' -#define CHAR_Z 'Z' -#define CHAR_LEFT_SQUARE_BRACKET '[' -#define CHAR_BACKSLASH '\\' -#define CHAR_RIGHT_SQUARE_BRACKET ']' -#define CHAR_CIRCUMFLEX_ACCENT '^' -#define CHAR_UNDERSCORE '_' -#define CHAR_GRAVE_ACCENT '`' -#define CHAR_a 'a' -#define CHAR_b 'b' -#define CHAR_c 'c' -#define CHAR_d 'd' -#define CHAR_e 'e' -#define CHAR_f 'f' -#define CHAR_g 'g' -#define CHAR_h 'h' -#define CHAR_i 'i' -#define CHAR_j 'j' -#define CHAR_k 'k' -#define CHAR_l 'l' -#define CHAR_m 'm' -#define CHAR_n 'n' -#define CHAR_o 'o' -#define CHAR_p 'p' -#define CHAR_q 'q' -#define CHAR_r 'r' -#define CHAR_s 's' -#define CHAR_t 't' -#define CHAR_u 'u' -#define CHAR_v 'v' -#define CHAR_w 'w' -#define CHAR_x 'x' -#define CHAR_y 'y' -#define CHAR_z 'z' -#define CHAR_LEFT_CURLY_BRACKET '{' -#define CHAR_VERTICAL_LINE '|' -#define CHAR_RIGHT_CURLY_BRACKET '}' -#define CHAR_TILDE '~' - -#define STR_HT "\t" -#define STR_VT "\v" -#define STR_FF "\f" -#define STR_CR "\r" -#define STR_BS "\b" -#define STR_BEL "\a" - -#define STR_SPACE " " -#define STR_EXCLAMATION_MARK "!" 
-#define STR_QUOTATION_MARK "\"" -#define STR_NUMBER_SIGN "#" -#define STR_DOLLAR_SIGN "$" -#define STR_PERCENT_SIGN "%" -#define STR_AMPERSAND "&" -#define STR_APOSTROPHE "'" -#define STR_LEFT_PARENTHESIS "(" -#define STR_RIGHT_PARENTHESIS ")" -#define STR_ASTERISK "*" -#define STR_PLUS "+" -#define STR_COMMA "," -#define STR_MINUS "-" -#define STR_DOT "." -#define STR_SLASH "/" -#define STR_0 "0" -#define STR_1 "1" -#define STR_2 "2" -#define STR_3 "3" -#define STR_4 "4" -#define STR_5 "5" -#define STR_6 "6" -#define STR_7 "7" -#define STR_8 "8" -#define STR_9 "9" -#define STR_COLON ":" -#define STR_SEMICOLON ";" -#define STR_LESS_THAN_SIGN "<" -#define STR_EQUALS_SIGN "=" -#define STR_GREATER_THAN_SIGN ">" -#define STR_QUESTION_MARK "?" -#define STR_COMMERCIAL_AT "@" -#define STR_A "A" -#define STR_B "B" -#define STR_C "C" -#define STR_D "D" -#define STR_E "E" -#define STR_F "F" -#define STR_G "G" -#define STR_H "H" -#define STR_I "I" -#define STR_J "J" -#define STR_K "K" -#define STR_L "L" -#define STR_M "M" -#define STR_N "N" -#define STR_O "O" -#define STR_P "P" -#define STR_Q "Q" -#define STR_R "R" -#define STR_S "S" -#define STR_T "T" -#define STR_U "U" -#define STR_V "V" -#define STR_W "W" -#define STR_X "X" -#define STR_Y "Y" -#define STR_Z "Z" -#define STR_LEFT_SQUARE_BRACKET "[" -#define STR_BACKSLASH "\\" -#define STR_RIGHT_SQUARE_BRACKET "]" -#define STR_CIRCUMFLEX_ACCENT "^" -#define STR_UNDERSCORE "_" -#define STR_GRAVE_ACCENT "`" -#define STR_a "a" -#define STR_b "b" -#define STR_c "c" -#define STR_d "d" -#define STR_e "e" -#define STR_f "f" -#define STR_g "g" -#define STR_h "h" -#define STR_i "i" -#define STR_j "j" -#define STR_k "k" -#define STR_l "l" -#define STR_m "m" -#define STR_n "n" -#define STR_o "o" -#define STR_p "p" -#define STR_q "q" -#define STR_r "r" -#define STR_s "s" -#define STR_t "t" -#define STR_u "u" -#define STR_v "v" -#define STR_w "w" -#define STR_x "x" -#define STR_y "y" -#define STR_z "z" -#define STR_LEFT_CURLY_BRACKET 
"{" -#define STR_VERTICAL_LINE "|" -#define STR_RIGHT_CURLY_BRACKET "}" -#define STR_TILDE "~" - -#define STRING_ACCEPT0 "ACCEPT\0" -#define STRING_COMMIT0 "COMMIT\0" -#define STRING_F0 "F\0" -#define STRING_FAIL0 "FAIL\0" -#define STRING_MARK0 "MARK\0" -#define STRING_PRUNE0 "PRUNE\0" -#define STRING_SKIP0 "SKIP\0" -#define STRING_THEN "THEN" - -#define STRING_atomic0 "atomic\0" -#define STRING_pla0 "pla\0" -#define STRING_plb0 "plb\0" -#define STRING_napla0 "napla\0" -#define STRING_naplb0 "naplb\0" -#define STRING_nla0 "nla\0" -#define STRING_nlb0 "nlb\0" -#define STRING_sr0 "sr\0" -#define STRING_asr0 "asr\0" -#define STRING_positive_lookahead0 "positive_lookahead\0" -#define STRING_positive_lookbehind0 "positive_lookbehind\0" -#define STRING_non_atomic_positive_lookahead0 "non_atomic_positive_lookahead\0" -#define STRING_non_atomic_positive_lookbehind0 "non_atomic_positive_lookbehind\0" -#define STRING_negative_lookahead0 "negative_lookahead\0" -#define STRING_negative_lookbehind0 "negative_lookbehind\0" -#define STRING_script_run0 "script_run\0" -#define STRING_atomic_script_run "atomic_script_run" - -#define STRING_alpha0 "alpha\0" -#define STRING_lower0 "lower\0" -#define STRING_upper0 "upper\0" -#define STRING_alnum0 "alnum\0" -#define STRING_ascii0 "ascii\0" -#define STRING_blank0 "blank\0" -#define STRING_cntrl0 "cntrl\0" -#define STRING_digit0 "digit\0" -#define STRING_graph0 "graph\0" -#define STRING_print0 "print\0" -#define STRING_punct0 "punct\0" -#define STRING_space0 "space\0" -#define STRING_word0 "word\0" -#define STRING_xdigit "xdigit" - -#define STRING_DEFINE "DEFINE" -#define STRING_VERSION "VERSION" -#define STRING_WEIRD_STARTWORD "[:<:]]" -#define STRING_WEIRD_ENDWORD "[:>:]]" - -#define STRING_CR_RIGHTPAR "CR)" -#define STRING_LF_RIGHTPAR "LF)" -#define STRING_CRLF_RIGHTPAR "CRLF)" -#define STRING_ANY_RIGHTPAR "ANY)" -#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)" -#define STRING_NUL_RIGHTPAR "NUL)" -#define STRING_BSR_ANYCRLF_RIGHTPAR 
"BSR_ANYCRLF)" -#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)" -#define STRING_UTF8_RIGHTPAR "UTF8)" -#define STRING_UTF16_RIGHTPAR "UTF16)" -#define STRING_UTF32_RIGHTPAR "UTF32)" -#define STRING_UTF_RIGHTPAR "UTF)" -#define STRING_UCP_RIGHTPAR "UCP)" -#define STRING_NO_AUTO_POSSESS_RIGHTPAR "NO_AUTO_POSSESS)" -#define STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR "NO_DOTSTAR_ANCHOR)" -#define STRING_NO_JIT_RIGHTPAR "NO_JIT)" -#define STRING_NO_START_OPT_RIGHTPAR "NO_START_OPT)" -#define STRING_NOTEMPTY_RIGHTPAR "NOTEMPTY)" -#define STRING_NOTEMPTY_ATSTART_RIGHTPAR "NOTEMPTY_ATSTART)" -#define STRING_LIMIT_HEAP_EQ "LIMIT_HEAP=" -#define STRING_LIMIT_MATCH_EQ "LIMIT_MATCH=" -#define STRING_LIMIT_DEPTH_EQ "LIMIT_DEPTH=" -#define STRING_LIMIT_RECURSION_EQ "LIMIT_RECURSION=" -#define STRING_MARK "MARK" - -#else /* SUPPORT_UNICODE */ - -/* UTF-8 support is enabled; always use UTF-8 (=ASCII) character codes. This -works in both modes non-EBCDIC platforms, and on EBCDIC platforms in UTF-8 mode -only. 
*/ - -#define CHAR_HT '\011' -#define CHAR_VT '\013' -#define CHAR_FF '\014' -#define CHAR_CR '\015' -#define CHAR_LF '\012' -#define CHAR_NL CHAR_LF -#define CHAR_NEL ((unsigned char)'\x85') -#define CHAR_BS '\010' -#define CHAR_BEL '\007' -#define CHAR_ESC '\033' -#define CHAR_DEL '\177' - -#define CHAR_NUL '\0' -#define CHAR_SPACE '\040' -#define CHAR_EXCLAMATION_MARK '\041' -#define CHAR_QUOTATION_MARK '\042' -#define CHAR_NUMBER_SIGN '\043' -#define CHAR_DOLLAR_SIGN '\044' -#define CHAR_PERCENT_SIGN '\045' -#define CHAR_AMPERSAND '\046' -#define CHAR_APOSTROPHE '\047' -#define CHAR_LEFT_PARENTHESIS '\050' -#define CHAR_RIGHT_PARENTHESIS '\051' -#define CHAR_ASTERISK '\052' -#define CHAR_PLUS '\053' -#define CHAR_COMMA '\054' -#define CHAR_MINUS '\055' -#define CHAR_DOT '\056' -#define CHAR_SLASH '\057' -#define CHAR_0 '\060' -#define CHAR_1 '\061' -#define CHAR_2 '\062' -#define CHAR_3 '\063' -#define CHAR_4 '\064' -#define CHAR_5 '\065' -#define CHAR_6 '\066' -#define CHAR_7 '\067' -#define CHAR_8 '\070' -#define CHAR_9 '\071' -#define CHAR_COLON '\072' -#define CHAR_SEMICOLON '\073' -#define CHAR_LESS_THAN_SIGN '\074' -#define CHAR_EQUALS_SIGN '\075' -#define CHAR_GREATER_THAN_SIGN '\076' -#define CHAR_QUESTION_MARK '\077' -#define CHAR_COMMERCIAL_AT '\100' -#define CHAR_A '\101' -#define CHAR_B '\102' -#define CHAR_C '\103' -#define CHAR_D '\104' -#define CHAR_E '\105' -#define CHAR_F '\106' -#define CHAR_G '\107' -#define CHAR_H '\110' -#define CHAR_I '\111' -#define CHAR_J '\112' -#define CHAR_K '\113' -#define CHAR_L '\114' -#define CHAR_M '\115' -#define CHAR_N '\116' -#define CHAR_O '\117' -#define CHAR_P '\120' -#define CHAR_Q '\121' -#define CHAR_R '\122' -#define CHAR_S '\123' -#define CHAR_T '\124' -#define CHAR_U '\125' -#define CHAR_V '\126' -#define CHAR_W '\127' -#define CHAR_X '\130' -#define CHAR_Y '\131' -#define CHAR_Z '\132' -#define CHAR_LEFT_SQUARE_BRACKET '\133' -#define CHAR_BACKSLASH '\134' -#define CHAR_RIGHT_SQUARE_BRACKET '\135' 
-#define CHAR_CIRCUMFLEX_ACCENT '\136' -#define CHAR_UNDERSCORE '\137' -#define CHAR_GRAVE_ACCENT '\140' -#define CHAR_a '\141' -#define CHAR_b '\142' -#define CHAR_c '\143' -#define CHAR_d '\144' -#define CHAR_e '\145' -#define CHAR_f '\146' -#define CHAR_g '\147' -#define CHAR_h '\150' -#define CHAR_i '\151' -#define CHAR_j '\152' -#define CHAR_k '\153' -#define CHAR_l '\154' -#define CHAR_m '\155' -#define CHAR_n '\156' -#define CHAR_o '\157' -#define CHAR_p '\160' -#define CHAR_q '\161' -#define CHAR_r '\162' -#define CHAR_s '\163' -#define CHAR_t '\164' -#define CHAR_u '\165' -#define CHAR_v '\166' -#define CHAR_w '\167' -#define CHAR_x '\170' -#define CHAR_y '\171' -#define CHAR_z '\172' -#define CHAR_LEFT_CURLY_BRACKET '\173' -#define CHAR_VERTICAL_LINE '\174' -#define CHAR_RIGHT_CURLY_BRACKET '\175' -#define CHAR_TILDE '\176' -#define CHAR_NBSP ((unsigned char)'\xa0') - -#define STR_HT "\011" -#define STR_VT "\013" -#define STR_FF "\014" -#define STR_CR "\015" -#define STR_NL "\012" -#define STR_BS "\010" -#define STR_BEL "\007" -#define STR_ESC "\033" -#define STR_DEL "\177" - -#define STR_SPACE "\040" -#define STR_EXCLAMATION_MARK "\041" -#define STR_QUOTATION_MARK "\042" -#define STR_NUMBER_SIGN "\043" -#define STR_DOLLAR_SIGN "\044" -#define STR_PERCENT_SIGN "\045" -#define STR_AMPERSAND "\046" -#define STR_APOSTROPHE "\047" -#define STR_LEFT_PARENTHESIS "\050" -#define STR_RIGHT_PARENTHESIS "\051" -#define STR_ASTERISK "\052" -#define STR_PLUS "\053" -#define STR_COMMA "\054" -#define STR_MINUS "\055" -#define STR_DOT "\056" -#define STR_SLASH "\057" -#define STR_0 "\060" -#define STR_1 "\061" -#define STR_2 "\062" -#define STR_3 "\063" -#define STR_4 "\064" -#define STR_5 "\065" -#define STR_6 "\066" -#define STR_7 "\067" -#define STR_8 "\070" -#define STR_9 "\071" -#define STR_COLON "\072" -#define STR_SEMICOLON "\073" -#define STR_LESS_THAN_SIGN "\074" -#define STR_EQUALS_SIGN "\075" -#define STR_GREATER_THAN_SIGN "\076" -#define STR_QUESTION_MARK 
"\077" -#define STR_COMMERCIAL_AT "\100" -#define STR_A "\101" -#define STR_B "\102" -#define STR_C "\103" -#define STR_D "\104" -#define STR_E "\105" -#define STR_F "\106" -#define STR_G "\107" -#define STR_H "\110" -#define STR_I "\111" -#define STR_J "\112" -#define STR_K "\113" -#define STR_L "\114" -#define STR_M "\115" -#define STR_N "\116" -#define STR_O "\117" -#define STR_P "\120" -#define STR_Q "\121" -#define STR_R "\122" -#define STR_S "\123" -#define STR_T "\124" -#define STR_U "\125" -#define STR_V "\126" -#define STR_W "\127" -#define STR_X "\130" -#define STR_Y "\131" -#define STR_Z "\132" -#define STR_LEFT_SQUARE_BRACKET "\133" -#define STR_BACKSLASH "\134" -#define STR_RIGHT_SQUARE_BRACKET "\135" -#define STR_CIRCUMFLEX_ACCENT "\136" -#define STR_UNDERSCORE "\137" -#define STR_GRAVE_ACCENT "\140" -#define STR_a "\141" -#define STR_b "\142" -#define STR_c "\143" -#define STR_d "\144" -#define STR_e "\145" -#define STR_f "\146" -#define STR_g "\147" -#define STR_h "\150" -#define STR_i "\151" -#define STR_j "\152" -#define STR_k "\153" -#define STR_l "\154" -#define STR_m "\155" -#define STR_n "\156" -#define STR_o "\157" -#define STR_p "\160" -#define STR_q "\161" -#define STR_r "\162" -#define STR_s "\163" -#define STR_t "\164" -#define STR_u "\165" -#define STR_v "\166" -#define STR_w "\167" -#define STR_x "\170" -#define STR_y "\171" -#define STR_z "\172" -#define STR_LEFT_CURLY_BRACKET "\173" -#define STR_VERTICAL_LINE "\174" -#define STR_RIGHT_CURLY_BRACKET "\175" -#define STR_TILDE "\176" - -#define STRING_ACCEPT0 STR_A STR_C STR_C STR_E STR_P STR_T "\0" -#define STRING_COMMIT0 STR_C STR_O STR_M STR_M STR_I STR_T "\0" -#define STRING_F0 STR_F "\0" -#define STRING_FAIL0 STR_F STR_A STR_I STR_L "\0" -#define STRING_MARK0 STR_M STR_A STR_R STR_K "\0" -#define STRING_PRUNE0 STR_P STR_R STR_U STR_N STR_E "\0" -#define STRING_SKIP0 STR_S STR_K STR_I STR_P "\0" -#define STRING_THEN STR_T STR_H STR_E STR_N - -#define STRING_atomic0 STR_a STR_t STR_o 
STR_m STR_i STR_c "\0" -#define STRING_pla0 STR_p STR_l STR_a "\0" -#define STRING_plb0 STR_p STR_l STR_b "\0" -#define STRING_napla0 STR_n STR_a STR_p STR_l STR_a "\0" -#define STRING_naplb0 STR_n STR_a STR_p STR_l STR_b "\0" -#define STRING_nla0 STR_n STR_l STR_a "\0" -#define STRING_nlb0 STR_n STR_l STR_b "\0" -#define STRING_sr0 STR_s STR_r "\0" -#define STRING_asr0 STR_a STR_s STR_r "\0" -#define STRING_positive_lookahead0 STR_p STR_o STR_s STR_i STR_t STR_i STR_v STR_e STR_UNDERSCORE STR_l STR_o STR_o STR_k STR_a STR_h STR_e STR_a STR_d "\0" -#define STRING_positive_lookbehind0 STR_p STR_o STR_s STR_i STR_t STR_i STR_v STR_e STR_UNDERSCORE STR_l STR_o STR_o STR_k STR_b STR_e STR_h STR_i STR_n STR_d "\0" -#define STRING_non_atomic_positive_lookahead0 STR_n STR_o STR_n STR_UNDERSCORE STR_a STR_t STR_o STR_m STR_i STR_c STR_UNDERSCORE STR_p STR_o STR_s STR_i STR_t STR_i STR_v STR_e STR_UNDERSCORE STR_l STR_o STR_o STR_k STR_a STR_h STR_e STR_a STR_d "\0" -#define STRING_non_atomic_positive_lookbehind0 STR_n STR_o STR_n STR_UNDERSCORE STR_a STR_t STR_o STR_m STR_i STR_c STR_UNDERSCORE STR_p STR_o STR_s STR_i STR_t STR_i STR_v STR_e STR_UNDERSCORE STR_l STR_o STR_o STR_k STR_b STR_e STR_h STR_i STR_n STR_d "\0" -#define STRING_negative_lookahead0 STR_n STR_e STR_g STR_a STR_t STR_i STR_v STR_e STR_UNDERSCORE STR_l STR_o STR_o STR_k STR_a STR_h STR_e STR_a STR_d "\0" -#define STRING_negative_lookbehind0 STR_n STR_e STR_g STR_a STR_t STR_i STR_v STR_e STR_UNDERSCORE STR_l STR_o STR_o STR_k STR_b STR_e STR_h STR_i STR_n STR_d "\0" -#define STRING_script_run0 STR_s STR_c STR_r STR_i STR_p STR_t STR_UNDERSCORE STR_r STR_u STR_n "\0" -#define STRING_atomic_script_run STR_a STR_t STR_o STR_m STR_i STR_c STR_UNDERSCORE STR_s STR_c STR_r STR_i STR_p STR_t STR_UNDERSCORE STR_r STR_u STR_n - -#define STRING_alpha0 STR_a STR_l STR_p STR_h STR_a "\0" -#define STRING_lower0 STR_l STR_o STR_w STR_e STR_r "\0" -#define STRING_upper0 STR_u STR_p STR_p STR_e STR_r "\0" -#define 
STRING_alnum0 STR_a STR_l STR_n STR_u STR_m "\0" -#define STRING_ascii0 STR_a STR_s STR_c STR_i STR_i "\0" -#define STRING_blank0 STR_b STR_l STR_a STR_n STR_k "\0" -#define STRING_cntrl0 STR_c STR_n STR_t STR_r STR_l "\0" -#define STRING_digit0 STR_d STR_i STR_g STR_i STR_t "\0" -#define STRING_graph0 STR_g STR_r STR_a STR_p STR_h "\0" -#define STRING_print0 STR_p STR_r STR_i STR_n STR_t "\0" -#define STRING_punct0 STR_p STR_u STR_n STR_c STR_t "\0" -#define STRING_space0 STR_s STR_p STR_a STR_c STR_e "\0" -#define STRING_word0 STR_w STR_o STR_r STR_d "\0" -#define STRING_xdigit STR_x STR_d STR_i STR_g STR_i STR_t - -#define STRING_DEFINE STR_D STR_E STR_F STR_I STR_N STR_E -#define STRING_VERSION STR_V STR_E STR_R STR_S STR_I STR_O STR_N -#define STRING_WEIRD_STARTWORD STR_LEFT_SQUARE_BRACKET STR_COLON STR_LESS_THAN_SIGN STR_COLON STR_RIGHT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET -#define STRING_WEIRD_ENDWORD STR_LEFT_SQUARE_BRACKET STR_COLON STR_GREATER_THAN_SIGN STR_COLON STR_RIGHT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET - -#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS -#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS -#define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS -#define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS -#define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS -#define STRING_NUL_RIGHTPAR STR_N STR_U STR_L STR_RIGHT_PARENTHESIS -#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS -#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS -#define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS -#define STRING_UTF16_RIGHTPAR STR_U STR_T STR_F STR_1 STR_6 STR_RIGHT_PARENTHESIS -#define STRING_UTF32_RIGHTPAR STR_U STR_T STR_F STR_3 STR_2 STR_RIGHT_PARENTHESIS -#define 
STRING_UTF_RIGHTPAR STR_U STR_T STR_F STR_RIGHT_PARENTHESIS -#define STRING_UCP_RIGHTPAR STR_U STR_C STR_P STR_RIGHT_PARENTHESIS -#define STRING_NO_AUTO_POSSESS_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_A STR_U STR_T STR_O STR_UNDERSCORE STR_P STR_O STR_S STR_S STR_E STR_S STR_S STR_RIGHT_PARENTHESIS -#define STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_D STR_O STR_T STR_S STR_T STR_A STR_R STR_UNDERSCORE STR_A STR_N STR_C STR_H STR_O STR_R STR_RIGHT_PARENTHESIS -#define STRING_NO_JIT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_J STR_I STR_T STR_RIGHT_PARENTHESIS -#define STRING_NO_START_OPT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS -#define STRING_NOTEMPTY_RIGHTPAR STR_N STR_O STR_T STR_E STR_M STR_P STR_T STR_Y STR_RIGHT_PARENTHESIS -#define STRING_NOTEMPTY_ATSTART_RIGHTPAR STR_N STR_O STR_T STR_E STR_M STR_P STR_T STR_Y STR_UNDERSCORE STR_A STR_T STR_S STR_T STR_A STR_R STR_T STR_RIGHT_PARENTHESIS -#define STRING_LIMIT_HEAP_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_H STR_E STR_A STR_P STR_EQUALS_SIGN -#define STRING_LIMIT_MATCH_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_M STR_A STR_T STR_C STR_H STR_EQUALS_SIGN -#define STRING_LIMIT_DEPTH_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_D STR_E STR_P STR_T STR_H STR_EQUALS_SIGN -#define STRING_LIMIT_RECURSION_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_R STR_E STR_C STR_U STR_R STR_S STR_I STR_O STR_N STR_EQUALS_SIGN -#define STRING_MARK STR_M STR_A STR_R STR_K - -#endif /* SUPPORT_UNICODE */ - -/* -------------------- End of character and string names -------------------*/ - -/* -------------------- Definitions for compiled patterns -------------------*/ - -/* Codes for different types of Unicode property */ - -#define PT_ANY 0 /* Any property - matches all chars */ -#define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */ -#define PT_GC 2 /* Specified general characteristic (e.g. 
L) */ -#define PT_PC 3 /* Specified particular characteristic (e.g. Lu) */ -#define PT_SC 4 /* Script (e.g. Han) */ -#define PT_ALNUM 5 /* Alphanumeric - the union of L and N */ -#define PT_SPACE 6 /* Perl space - Z plus 9,10,12,13 */ -#define PT_PXSPACE 7 /* POSIX space - Z plus 9,10,11,12,13 */ -#define PT_WORD 8 /* Word - L plus N plus underscore */ -#define PT_CLIST 9 /* Pseudo-property: match character list */ -#define PT_UCNC 10 /* Universal Character nameable character */ -#define PT_TABSIZE 11 /* Size of square table for autopossessify tests */ - -/* The following special properties are used only in XCLASS items, when POSIX -classes are specified and PCRE2_UCP is set - in other words, for Unicode -handling of these classes. They are not available via the \p or \P escapes like -those in the above list, and so they do not take part in the autopossessifying -table. */ - -#define PT_PXGRAPH 11 /* [:graph:] - characters that mark the paper */ -#define PT_PXPRINT 12 /* [:print:] - [:graph:] plus non-control spaces */ -#define PT_PXPUNCT 13 /* [:punct:] - punctuation characters */ - -/* Flag bits and data types for the extended class (OP_XCLASS) for classes that -contain characters with values greater than 255. */ - -#define XCL_NOT 0x01 /* Flag: this is a negative class */ -#define XCL_MAP 0x02 /* Flag: a 32-byte map is present */ -#define XCL_HASPROP 0x04 /* Flag: property checks are present. */ - -#define XCL_END 0 /* Marks end of individual items */ -#define XCL_SINGLE 1 /* Single item (one multibyte char) follows */ -#define XCL_RANGE 2 /* A range (two multibyte chars) follows */ -#define XCL_PROP 3 /* Unicode property (2-byte property code follows) */ -#define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */ - -/* These are escaped items that aren't just an encoding of a particular data -value such as \n. They must have non-zero values, as check_escape() returns 0 -for a data character. 
In the escapes[] table in pcre2_compile.c their values -are negated in order to distinguish them from data values. - -They must appear here in the same order as in the opcode definitions below, up -to ESC_z. There's a dummy for OP_ALLANY because it corresponds to "." in DOTALL -mode rather than an escape sequence. It is also used for [^] in JavaScript -compatibility mode, and for \C in non-utf mode. In non-DOTALL mode, "." behaves -like \N. - -Negative numbers are used to encode a backreference (\1, \2, \3, etc.) in -check_escape(). There are tests in the code for an escape greater than ESC_b -and less than ESC_Z to detect the types that may be repeated. These are the -types that consume characters. If any new escapes are put in between that don't -consume a character, that code will have to change. */ - -enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, - ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H, - ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, - ESC_E, ESC_Q, ESC_g, ESC_k }; - - -/********************** Opcode definitions ******************/ - -/****** NOTE NOTE NOTE ****** - -Starting from 1 (i.e. after OP_END), the values up to OP_EOD must correspond in -order to the list of escapes immediately above. Furthermore, values up to -OP_DOLLM must not be changed without adjusting the table called autoposstab in -pcre2_auto_possess.c. - -Whenever this list is updated, the two macro definitions that follow must be -updated to match. The possessification table called "opcode_possessify" in -pcre2_compile.c must also be updated, and also the tables called "coptable" -and "poptable" in pcre2_dfa_match.c. - -****** NOTE NOTE NOTE ******/ - - -/* The values between FIRST_AUTOTAB_OP and LAST_AUTOTAB_RIGHT_OP, inclusive, -are used in a table for deciding whether a repeated character type can be -auto-possessified. 
*/ - -#define FIRST_AUTOTAB_OP OP_NOT_DIGIT -#define LAST_AUTOTAB_LEFT_OP OP_EXTUNI -#define LAST_AUTOTAB_RIGHT_OP OP_DOLLM - -enum { - OP_END, /* 0 End of pattern */ - - /* Values corresponding to backslashed metacharacters */ - - OP_SOD, /* 1 Start of data: \A */ - OP_SOM, /* 2 Start of match (subject + offset): \G */ - OP_SET_SOM, /* 3 Set start of match (\K) */ - OP_NOT_WORD_BOUNDARY, /* 4 \B */ - OP_WORD_BOUNDARY, /* 5 \b */ - OP_NOT_DIGIT, /* 6 \D */ - OP_DIGIT, /* 7 \d */ - OP_NOT_WHITESPACE, /* 8 \S */ - OP_WHITESPACE, /* 9 \s */ - OP_NOT_WORDCHAR, /* 10 \W */ - OP_WORDCHAR, /* 11 \w */ - - OP_ANY, /* 12 Match any character except newline (\N) */ - OP_ALLANY, /* 13 Match any character */ - OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */ - OP_NOTPROP, /* 15 \P (not Unicode property) */ - OP_PROP, /* 16 \p (Unicode property) */ - OP_ANYNL, /* 17 \R (any newline sequence) */ - OP_NOT_HSPACE, /* 18 \H (not horizontal whitespace) */ - OP_HSPACE, /* 19 \h (horizontal whitespace) */ - OP_NOT_VSPACE, /* 20 \V (not vertical whitespace) */ - OP_VSPACE, /* 21 \v (vertical whitespace) */ - OP_EXTUNI, /* 22 \X (extended Unicode sequence */ - OP_EODN, /* 23 End of data or \n at end of data (\Z) */ - OP_EOD, /* 24 End of data (\z) */ - - /* Line end assertions */ - - OP_DOLL, /* 25 End of line - not multiline */ - OP_DOLLM, /* 26 End of line - multiline */ - OP_CIRC, /* 27 Start of line - not multiline */ - OP_CIRCM, /* 28 Start of line - multiline */ - - /* Single characters; caseful must precede the caseless ones, and these - must remain in this order, and adjacent. 
*/ - - OP_CHAR, /* 29 Match one character, casefully */ - OP_CHARI, /* 30 Match one character, caselessly */ - OP_NOT, /* 31 Match one character, not the given one, casefully */ - OP_NOTI, /* 32 Match one character, not the given one, caselessly */ - - /* The following sets of 13 opcodes must always be kept in step because - the offset from the first one is used to generate the others. */ - - /* Repeated characters; caseful must precede the caseless ones */ - - OP_STAR, /* 33 The maximizing and minimizing versions of */ - OP_MINSTAR, /* 34 these six opcodes must come in pairs, with */ - OP_PLUS, /* 35 the minimizing one second. */ - OP_MINPLUS, /* 36 */ - OP_QUERY, /* 37 */ - OP_MINQUERY, /* 38 */ - - OP_UPTO, /* 39 From 0 to n matches of one character, caseful*/ - OP_MINUPTO, /* 40 */ - OP_EXACT, /* 41 Exactly n matches */ - - OP_POSSTAR, /* 42 Possessified star, caseful */ - OP_POSPLUS, /* 43 Possessified plus, caseful */ - OP_POSQUERY, /* 44 Posesssified query, caseful */ - OP_POSUPTO, /* 45 Possessified upto, caseful */ - - /* Repeated characters; caseless must follow the caseful ones */ - - OP_STARI, /* 46 */ - OP_MINSTARI, /* 47 */ - OP_PLUSI, /* 48 */ - OP_MINPLUSI, /* 49 */ - OP_QUERYI, /* 50 */ - OP_MINQUERYI, /* 51 */ - - OP_UPTOI, /* 52 From 0 to n matches of one character, caseless */ - OP_MINUPTOI, /* 53 */ - OP_EXACTI, /* 54 */ - - OP_POSSTARI, /* 55 Possessified star, caseless */ - OP_POSPLUSI, /* 56 Possessified plus, caseless */ - OP_POSQUERYI, /* 57 Posesssified query, caseless */ - OP_POSUPTOI, /* 58 Possessified upto, caseless */ - - /* The negated ones must follow the non-negated ones, and match them */ - /* Negated repeated character, caseful; must precede the caseless ones */ - - OP_NOTSTAR, /* 59 The maximizing and minimizing versions of */ - OP_NOTMINSTAR, /* 60 these six opcodes must come in pairs, with */ - OP_NOTPLUS, /* 61 the minimizing one second. They must be in */ - OP_NOTMINPLUS, /* 62 exactly the same order as those above. 
*/ - OP_NOTQUERY, /* 63 */ - OP_NOTMINQUERY, /* 64 */ - - OP_NOTUPTO, /* 65 From 0 to n matches, caseful */ - OP_NOTMINUPTO, /* 66 */ - OP_NOTEXACT, /* 67 Exactly n matches */ - - OP_NOTPOSSTAR, /* 68 Possessified versions, caseful */ - OP_NOTPOSPLUS, /* 69 */ - OP_NOTPOSQUERY, /* 70 */ - OP_NOTPOSUPTO, /* 71 */ - - /* Negated repeated character, caseless; must follow the caseful ones */ - - OP_NOTSTARI, /* 72 */ - OP_NOTMINSTARI, /* 73 */ - OP_NOTPLUSI, /* 74 */ - OP_NOTMINPLUSI, /* 75 */ - OP_NOTQUERYI, /* 76 */ - OP_NOTMINQUERYI, /* 77 */ - - OP_NOTUPTOI, /* 78 From 0 to n matches, caseless */ - OP_NOTMINUPTOI, /* 79 */ - OP_NOTEXACTI, /* 80 Exactly n matches */ - - OP_NOTPOSSTARI, /* 81 Possessified versions, caseless */ - OP_NOTPOSPLUSI, /* 82 */ - OP_NOTPOSQUERYI, /* 83 */ - OP_NOTPOSUPTOI, /* 84 */ - - /* Character types */ - - OP_TYPESTAR, /* 85 The maximizing and minimizing versions of */ - OP_TYPEMINSTAR, /* 86 these six opcodes must come in pairs, with */ - OP_TYPEPLUS, /* 87 the minimizing one second. These codes must */ - OP_TYPEMINPLUS, /* 88 be in exactly the same order as those above. */ - OP_TYPEQUERY, /* 89 */ - OP_TYPEMINQUERY, /* 90 */ - - OP_TYPEUPTO, /* 91 From 0 to n matches */ - OP_TYPEMINUPTO, /* 92 */ - OP_TYPEEXACT, /* 93 Exactly n matches */ - - OP_TYPEPOSSTAR, /* 94 Possessified versions */ - OP_TYPEPOSPLUS, /* 95 */ - OP_TYPEPOSQUERY, /* 96 */ - OP_TYPEPOSUPTO, /* 97 */ - - /* These are used for character classes and back references; only the - first six are the same as the sets above. */ - - OP_CRSTAR, /* 98 The maximizing and minimizing versions of */ - OP_CRMINSTAR, /* 99 all these opcodes must come in pairs, with */ - OP_CRPLUS, /* 100 the minimizing one second. These codes must */ - OP_CRMINPLUS, /* 101 be in exactly the same order as those above. */ - OP_CRQUERY, /* 102 */ - OP_CRMINQUERY, /* 103 */ - - OP_CRRANGE, /* 104 These are different to the three sets above. 
*/ - OP_CRMINRANGE, /* 105 */ - - OP_CRPOSSTAR, /* 106 Possessified versions */ - OP_CRPOSPLUS, /* 107 */ - OP_CRPOSQUERY, /* 108 */ - OP_CRPOSRANGE, /* 109 */ - - /* End of quantifier opcodes */ - - OP_CLASS, /* 110 Match a character class, chars < 256 only */ - OP_NCLASS, /* 111 Same, but the bitmap was created from a negative - class - the difference is relevant only when a - character > 255 is encountered. */ - OP_XCLASS, /* 112 Extended class for handling > 255 chars within the - class. This does both positive and negative. */ - OP_REF, /* 113 Match a back reference, casefully */ - OP_REFI, /* 114 Match a back reference, caselessly */ - OP_DNREF, /* 115 Match a duplicate name backref, casefully */ - OP_DNREFI, /* 116 Match a duplicate name backref, caselessly */ - OP_RECURSE, /* 117 Match a numbered subpattern (possibly recursive) */ - OP_CALLOUT, /* 118 Call out to external function if provided */ - OP_CALLOUT_STR, /* 119 Call out with string argument */ - - OP_ALT, /* 120 Start of alternation */ - OP_KET, /* 121 End of group that doesn't have an unbounded repeat */ - OP_KETRMAX, /* 122 These two must remain together and in this */ - OP_KETRMIN, /* 123 order. They are for groups the repeat for ever. */ - OP_KETRPOS, /* 124 Possessive unlimited repeat. */ - - /* The assertions must come before BRA, CBRA, ONCE, and COND. */ - - OP_REVERSE, /* 125 Move pointer back - used in lookbehind assertions */ - OP_ASSERT, /* 126 Positive lookahead */ - OP_ASSERT_NOT, /* 127 Negative lookahead */ - OP_ASSERTBACK, /* 128 Positive lookbehind */ - OP_ASSERTBACK_NOT, /* 129 Negative lookbehind */ - OP_ASSERT_NA, /* 130 Positive non-atomic lookahead */ - OP_ASSERTBACK_NA, /* 131 Positive non-atomic lookbehind */ - - /* ONCE, SCRIPT_RUN, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come - immediately after the assertions, with ONCE first, as there's a test for >= - ONCE for a subpattern that isn't an assertion. 
The POS versions must - immediately follow the non-POS versions in each case. */ - - OP_ONCE, /* 132 Atomic group, contains captures */ - OP_SCRIPT_RUN, /* 133 Non-capture, but check characters' scripts */ - OP_BRA, /* 134 Start of non-capturing bracket */ - OP_BRAPOS, /* 135 Ditto, with unlimited, possessive repeat */ - OP_CBRA, /* 136 Start of capturing bracket */ - OP_CBRAPOS, /* 137 Ditto, with unlimited, possessive repeat */ - OP_COND, /* 138 Conditional group */ - - /* These five must follow the previous five, in the same order. There's a - check for >= SBRA to distinguish the two sets. */ - - OP_SBRA, /* 139 Start of non-capturing bracket, check empty */ - OP_SBRAPOS, /* 149 Ditto, with unlimited, possessive repeat */ - OP_SCBRA, /* 141 Start of capturing bracket, check empty */ - OP_SCBRAPOS, /* 142 Ditto, with unlimited, possessive repeat */ - OP_SCOND, /* 143 Conditional group, check empty */ - - /* The next two pairs must (respectively) be kept together. */ - - OP_CREF, /* 144 Used to hold a capture number as condition */ - OP_DNCREF, /* 145 Used to point to duplicate names as a condition */ - OP_RREF, /* 146 Used to hold a recursion number as condition */ - OP_DNRREF, /* 147 Used to point to duplicate names as a condition */ - OP_FALSE, /* 148 Always false (used by DEFINE and VERSION) */ - OP_TRUE, /* 149 Always true (used by VERSION) */ - - OP_BRAZERO, /* 150 These two must remain together and in this */ - OP_BRAMINZERO, /* 151 order. */ - OP_BRAPOSZERO, /* 152 */ - - /* These are backtracking control verbs */ - - OP_MARK, /* 153 always has an argument */ - OP_PRUNE, /* 154 */ - OP_PRUNE_ARG, /* 155 same, but with argument */ - OP_SKIP, /* 156 */ - OP_SKIP_ARG, /* 157 same, but with argument */ - OP_THEN, /* 158 */ - OP_THEN_ARG, /* 159 same, but with argument */ - OP_COMMIT, /* 160 */ - OP_COMMIT_ARG, /* 161 same, but with argument */ - - /* These are forced failure and success verbs. 
FAIL and ACCEPT do accept an - argument, but these cases can be compiled as, for example, (*MARK:X)(*FAIL) - without the need for a special opcode. */ - - OP_FAIL, /* 162 */ - OP_ACCEPT, /* 163 */ - OP_ASSERT_ACCEPT, /* 164 Used inside assertions */ - OP_CLOSE, /* 165 Used before OP_ACCEPT to close open captures */ - - /* This is used to skip a subpattern with a {0} quantifier */ - - OP_SKIPZERO, /* 166 */ - - /* This is used to identify a DEFINE group during compilation so that it can - be checked for having only one branch. It is changed to OP_FALSE before - compilation finishes. */ - - OP_DEFINE, /* 167 */ - - /* This is not an opcode, but is used to check that tables indexed by opcode - are the correct length, in order to catch updating errors - there have been - some in the past. */ - - OP_TABLE_LENGTH - -}; - -/* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro -definitions that follow must also be updated to match. There are also tables -called "opcode_possessify" in pcre2_compile.c and "coptable" and "poptable" in -pcre2_dfa_match.c that must be updated. */ - - -/* This macro defines textual names for all the opcodes. These are used only -for debugging, and some of them are only partial names. The macro is referenced -only in pcre2_printint.c, which fills out the full names in many cases (and in -some cases doesn't actually use these names at all). 
*/ - -#define OP_NAME_LIST \ - "End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d", \ - "\\S", "\\s", "\\W", "\\w", "Any", "AllAny", "Anybyte", \ - "notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v", \ - "extuni", "\\Z", "\\z", \ - "$", "$", "^", "^", "char", "chari", "not", "noti", \ - "*", "*?", "+", "+?", "?", "??", \ - "{", "{", "{", \ - "*+","++", "?+", "{", \ - "*", "*?", "+", "+?", "?", "??", \ - "{", "{", "{", \ - "*+","++", "?+", "{", \ - "*", "*?", "+", "+?", "?", "??", \ - "{", "{", "{", \ - "*+","++", "?+", "{", \ - "*", "*?", "+", "+?", "?", "??", \ - "{", "{", "{", \ - "*+","++", "?+", "{", \ - "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ - "*+","++", "?+", "{", \ - "*", "*?", "+", "+?", "?", "??", "{", "{", \ - "*+","++", "?+", "{", \ - "class", "nclass", "xclass", "Ref", "Refi", "DnRef", "DnRefi", \ - "Recurse", "Callout", "CalloutStr", \ - "Alt", "Ket", "KetRmax", "KetRmin", "KetRpos", \ - "Reverse", "Assert", "Assert not", \ - "Assert back", "Assert back not", \ - "Non-atomic assert", "Non-atomic assert back", \ - "Once", \ - "Script run", \ - "Bra", "BraPos", "CBra", "CBraPos", \ - "Cond", \ - "SBra", "SBraPos", "SCBra", "SCBraPos", \ - "SCond", \ - "Cond ref", "Cond dnref", "Cond rec", "Cond dnrec", \ - "Cond false", "Cond true", \ - "Brazero", "Braminzero", "Braposzero", \ - "*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \ - "*THEN", "*THEN", "*COMMIT", "*COMMIT", "*FAIL", \ - "*ACCEPT", "*ASSERT_ACCEPT", \ - "Close", "Skip zero", "Define" - - -/* This macro defines the length of fixed length operations in the compiled -regex. The lengths are used when searching for specific things, and also in the -debugging printing of a compiled regex. We use a macro so that it can be -defined close to the definitions of the opcodes themselves. - -As things have been extended, some of these are no longer fixed lenths, but are -minima instead. For example, the length of a single-character repeat may vary -in UTF-8 mode. 
The code that uses this table must know about such things. */ - -#define OP_LENGTHS \ - 1, /* End */ \ - 1, 1, 1, 1, 1, /* \A, \G, \K, \B, \b */ \ - 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ \ - 1, 1, 1, /* Any, AllAny, Anybyte */ \ - 3, 3, /* \P, \p */ \ - 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ \ - 1, /* \X */ \ - 1, 1, 1, 1, 1, 1, /* \Z, \z, $, $M ^, ^M */ \ - 2, /* Char - the minimum length */ \ - 2, /* Chari - the minimum length */ \ - 2, /* not */ \ - 2, /* noti */ \ - /* Positive single-char repeats ** These are */ \ - 2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \ - 2+IMM2_SIZE, 2+IMM2_SIZE, /* upto, minupto ** mode */ \ - 2+IMM2_SIZE, /* exact */ \ - 2, 2, 2, 2+IMM2_SIZE, /* *+, ++, ?+, upto+ */ \ - 2, 2, 2, 2, 2, 2, /* *I, *?I, +I, +?I, ?I, ??I ** UTF-8 */ \ - 2+IMM2_SIZE, 2+IMM2_SIZE, /* upto I, minupto I */ \ - 2+IMM2_SIZE, /* exact I */ \ - 2, 2, 2, 2+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */ \ - /* Negative single-char repeats - only for chars < 256 */ \ - 2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \ - 2+IMM2_SIZE, 2+IMM2_SIZE, /* NOT upto, minupto */ \ - 2+IMM2_SIZE, /* NOT exact */ \ - 2, 2, 2, 2+IMM2_SIZE, /* Possessive NOT *, +, ?, upto */ \ - 2, 2, 2, 2, 2, 2, /* NOT *I, *?I, +I, +?I, ?I, ??I */ \ - 2+IMM2_SIZE, 2+IMM2_SIZE, /* NOT upto I, minupto I */ \ - 2+IMM2_SIZE, /* NOT exact I */ \ - 2, 2, 2, 2+IMM2_SIZE, /* Possessive NOT *I, +I, ?I, upto I */ \ - /* Positive type repeats */ \ - 2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \ - 2+IMM2_SIZE, 2+IMM2_SIZE, /* Type upto, minupto */ \ - 2+IMM2_SIZE, /* Type exact */ \ - 2, 2, 2, 2+IMM2_SIZE, /* Possessive *+, ++, ?+, upto+ */ \ - /* Character class & ref repeats */ \ - 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? 
*/ \ - 1+2*IMM2_SIZE, 1+2*IMM2_SIZE, /* CRRANGE, CRMINRANGE */ \ - 1, 1, 1, 1+2*IMM2_SIZE, /* Possessive *+, ++, ?+, CRPOSRANGE */ \ - 1+(32/sizeof(PCRE2_UCHAR)), /* CLASS */ \ - 1+(32/sizeof(PCRE2_UCHAR)), /* NCLASS */ \ - 0, /* XCLASS - variable length */ \ - 1+IMM2_SIZE, /* REF */ \ - 1+IMM2_SIZE, /* REFI */ \ - 1+2*IMM2_SIZE, /* DNREF */ \ - 1+2*IMM2_SIZE, /* DNREFI */ \ - 1+LINK_SIZE, /* RECURSE */ \ - 1+2*LINK_SIZE+1, /* CALLOUT */ \ - 0, /* CALLOUT_STR - variable length */ \ - 1+LINK_SIZE, /* Alt */ \ - 1+LINK_SIZE, /* Ket */ \ - 1+LINK_SIZE, /* KetRmax */ \ - 1+LINK_SIZE, /* KetRmin */ \ - 1+LINK_SIZE, /* KetRpos */ \ - 1+LINK_SIZE, /* Reverse */ \ - 1+LINK_SIZE, /* Assert */ \ - 1+LINK_SIZE, /* Assert not */ \ - 1+LINK_SIZE, /* Assert behind */ \ - 1+LINK_SIZE, /* Assert behind not */ \ - 1+LINK_SIZE, /* NA Assert */ \ - 1+LINK_SIZE, /* NA Assert behind */ \ - 1+LINK_SIZE, /* ONCE */ \ - 1+LINK_SIZE, /* SCRIPT_RUN */ \ - 1+LINK_SIZE, /* BRA */ \ - 1+LINK_SIZE, /* BRAPOS */ \ - 1+LINK_SIZE+IMM2_SIZE, /* CBRA */ \ - 1+LINK_SIZE+IMM2_SIZE, /* CBRAPOS */ \ - 1+LINK_SIZE, /* COND */ \ - 1+LINK_SIZE, /* SBRA */ \ - 1+LINK_SIZE, /* SBRAPOS */ \ - 1+LINK_SIZE+IMM2_SIZE, /* SCBRA */ \ - 1+LINK_SIZE+IMM2_SIZE, /* SCBRAPOS */ \ - 1+LINK_SIZE, /* SCOND */ \ - 1+IMM2_SIZE, 1+2*IMM2_SIZE, /* CREF, DNCREF */ \ - 1+IMM2_SIZE, 1+2*IMM2_SIZE, /* RREF, DNRREF */ \ - 1, 1, /* FALSE, TRUE */ \ - 1, 1, 1, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ \ - 3, 1, 3, /* MARK, PRUNE, PRUNE_ARG */ \ - 1, 3, /* SKIP, SKIP_ARG */ \ - 1, 3, /* THEN, THEN_ARG */ \ - 1, 3, /* COMMIT, COMMIT_ARG */ \ - 1, 1, 1, /* FAIL, ACCEPT, ASSERT_ACCEPT */ \ - 1+IMM2_SIZE, 1, /* CLOSE, SKIPZERO */ \ - 1 /* DEFINE */ - -/* A magic value for OP_RREF to indicate the "any recursion" condition. */ - -#define RREF_ANY 0xffff - - -/* ---------- Private structures that are mode-independent. ---------- */ - -/* Structure to hold data for custom memory management. 
*/ - -typedef struct pcre2_memctl { - void * (*malloc)(size_t, void *); - void (*free)(void *, void *); - void *memory_data; -} pcre2_memctl; - -/* Structure for building a chain of open capturing subpatterns during -compiling, so that instructions to close them can be compiled when (*ACCEPT) is -encountered. */ - -typedef struct open_capitem { - struct open_capitem *next; /* Chain link */ - uint16_t number; /* Capture number */ - uint16_t assert_depth; /* Assertion depth when opened */ -} open_capitem; - -/* Layout of the UCP type table that translates property names into types and -codes. Each entry used to point directly to a name, but to reduce the number of -relocations in shared libraries, it now has an offset into a single string -instead. */ - -typedef struct { - uint16_t name_offset; - uint16_t type; - uint16_t value; -} ucp_type_table; - -/* Unicode character database (UCD) record format */ - -typedef struct { - uint8_t script; /* ucp_Arabic, etc. */ - uint8_t chartype; /* ucp_Cc, etc. (general categories) */ - uint8_t gbprop; /* ucp_gbControl, etc. (grapheme break property) */ - uint8_t caseset; /* offset to multichar other cases or zero */ - int32_t other_case; /* offset to other case, or zero if none */ - int16_t scriptx; /* script extension value */ - int16_t dummy; /* spare - to round to multiple of 4 bytes */ -} ucd_record; - -/* UCD access macros */ - -#define UCD_BLOCK_SIZE 128 -#define REAL_GET_UCD(ch) (PRIV(ucd_records) + \ - PRIV(ucd_stage2)[PRIV(ucd_stage1)[(int)(ch) / UCD_BLOCK_SIZE] * \ - UCD_BLOCK_SIZE + (int)(ch) % UCD_BLOCK_SIZE]) - -#if PCRE2_CODE_UNIT_WIDTH == 32 -#define GET_UCD(ch) ((ch > MAX_UTF_CODE_POINT)? 
\ - PRIV(dummy_ucd_record) : REAL_GET_UCD(ch)) -#else -#define GET_UCD(ch) REAL_GET_UCD(ch) -#endif - -#define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype -#define UCD_SCRIPT(ch) GET_UCD(ch)->script -#define UCD_CATEGORY(ch) PRIV(ucp_gentype)[UCD_CHARTYPE(ch)] -#define UCD_GRAPHBREAK(ch) GET_UCD(ch)->gbprop -#define UCD_CASESET(ch) GET_UCD(ch)->caseset -#define UCD_OTHERCASE(ch) ((uint32_t)((int)ch + (int)(GET_UCD(ch)->other_case))) -#define UCD_SCRIPTX(ch) GET_UCD(ch)->scriptx - -/* Header for serialized pcre2 codes. */ - -typedef struct pcre2_serialized_data { - uint32_t magic; - uint32_t version; - uint32_t config; - int32_t number_of_codes; -} pcre2_serialized_data; - - - -/* ----------------- Items that need PCRE2_CODE_UNIT_WIDTH ----------------- */ - -/* When this file is included by pcre2test, PCRE2_CODE_UNIT_WIDTH is defined as -0, so the following items are omitted. */ - -#if defined PCRE2_CODE_UNIT_WIDTH && PCRE2_CODE_UNIT_WIDTH != 0 - -/* EBCDIC is supported only for the 8-bit library. */ - -#if defined EBCDIC && PCRE2_CODE_UNIT_WIDTH != 8 -#error EBCDIC is not supported for the 16-bit or 32-bit libraries -#endif - -/* This is the largest non-UTF code point. */ - -#define MAX_NON_UTF_CHAR (0xffffffffU >> (32 - PCRE2_CODE_UNIT_WIDTH)) - -/* Internal shared data tables and variables. These are used by more than one -of the exported public functions. They have to be "external" in the C sense, -but are not part of the PCRE2 public API. Although the data for some of them is -identical in all libraries, they must have different names so that multiple -libraries can be simultaneously linked to a single application. However, UTF-8 -tables are needed only when compiling the 8-bit library. 
*/ - -#if PCRE2_CODE_UNIT_WIDTH == 8 -extern const int PRIV(utf8_table1)[]; -extern const int PRIV(utf8_table1_size); -extern const int PRIV(utf8_table2)[]; -extern const int PRIV(utf8_table3)[]; -extern const uint8_t PRIV(utf8_table4)[]; -#endif - -#define _pcre2_OP_lengths PCRE2_SUFFIX(_pcre2_OP_lengths_) -#define _pcre2_callout_end_delims PCRE2_SUFFIX(_pcre2_callout_end_delims_) -#define _pcre2_callout_start_delims PCRE2_SUFFIX(_pcre2_callout_start_delims_) -#define _pcre2_default_compile_context PCRE2_SUFFIX(_pcre2_default_compile_context_) -#define _pcre2_default_convert_context PCRE2_SUFFIX(_pcre2_default_convert_context_) -#define _pcre2_default_match_context PCRE2_SUFFIX(_pcre2_default_match_context_) -#define _pcre2_default_tables PCRE2_SUFFIX(_pcre2_default_tables_) -#if PCRE2_CODE_UNIT_WIDTH == 32 -#define _pcre2_dummy_ucd_record PCRE2_SUFFIX(_pcre2_dummy_ucd_record_) -#endif -#define _pcre2_hspace_list PCRE2_SUFFIX(_pcre2_hspace_list_) -#define _pcre2_vspace_list PCRE2_SUFFIX(_pcre2_vspace_list_) -#define _pcre2_ucd_caseless_sets PCRE2_SUFFIX(_pcre2_ucd_caseless_sets_) -#define _pcre2_ucd_digit_sets PCRE2_SUFFIX(_pcre2_ucd_digit_sets_) -#define _pcre2_ucd_script_sets PCRE2_SUFFIX(_pcre2_ucd_script_sets_) -#define _pcre2_ucd_records PCRE2_SUFFIX(_pcre2_ucd_records_) -#define _pcre2_ucd_stage1 PCRE2_SUFFIX(_pcre2_ucd_stage1_) -#define _pcre2_ucd_stage2 PCRE2_SUFFIX(_pcre2_ucd_stage2_) -#define _pcre2_ucp_gbtable PCRE2_SUFFIX(_pcre2_ucp_gbtable_) -#define _pcre2_ucp_gentype PCRE2_SUFFIX(_pcre2_ucp_gentype_) -#define _pcre2_ucp_typerange PCRE2_SUFFIX(_pcre2_ucp_typerange_) -#define _pcre2_unicode_version PCRE2_SUFFIX(_pcre2_unicode_version_) -#define _pcre2_utt PCRE2_SUFFIX(_pcre2_utt_) -#define _pcre2_utt_names PCRE2_SUFFIX(_pcre2_utt_names_) -#define _pcre2_utt_size PCRE2_SUFFIX(_pcre2_utt_size_) - -extern const uint8_t PRIV(OP_lengths)[]; -extern const uint32_t PRIV(callout_end_delims)[]; -extern const uint32_t PRIV(callout_start_delims)[]; -extern const 
pcre2_compile_context PRIV(default_compile_context); -extern const pcre2_convert_context PRIV(default_convert_context); -extern const pcre2_match_context PRIV(default_match_context); -extern const uint8_t PRIV(default_tables)[]; -extern const uint32_t PRIV(hspace_list)[]; -extern const uint32_t PRIV(vspace_list)[]; -extern const uint32_t PRIV(ucd_caseless_sets)[]; -extern const uint32_t PRIV(ucd_digit_sets)[]; -extern const uint8_t PRIV(ucd_script_sets)[]; -extern const ucd_record PRIV(ucd_records)[]; -#if PCRE2_CODE_UNIT_WIDTH == 32 -extern const ucd_record PRIV(dummy_ucd_record)[]; -#endif -extern const uint16_t PRIV(ucd_stage1)[]; -extern const uint16_t PRIV(ucd_stage2)[]; -extern const uint32_t PRIV(ucp_gbtable)[]; -extern const uint32_t PRIV(ucp_gentype)[]; -#ifdef SUPPORT_JIT -extern const int PRIV(ucp_typerange)[]; -#endif -extern const char *PRIV(unicode_version); -extern const ucp_type_table PRIV(utt)[]; -extern const char PRIV(utt_names)[]; -extern const size_t PRIV(utt_size); - -/* Mode-dependent macros and hidden and private structures are defined in a -separate file so that pcre2test can include them at all supported widths. When -compiling the library, PCRE2_CODE_UNIT_WIDTH will be defined, and we can -include them at the appropriate width, after setting up suffix macros for the -private structures. */ - -#define branch_chain PCRE2_SUFFIX(branch_chain_) -#define compile_block PCRE2_SUFFIX(compile_block_) -#define dfa_match_block PCRE2_SUFFIX(dfa_match_block_) -#define match_block PCRE2_SUFFIX(match_block_) -#define named_group PCRE2_SUFFIX(named_group_) - -#include "pcre2_intmodedep.h" - -/* Private "external" functions. These are internal functions that are called -from modules other than the one in which they are defined. They have to be -"external" in the C sense, but are not part of the PCRE2 public API. They are -not referenced from pcre2test, and must not be defined when no code unit width -is available. 
*/ - -#define _pcre2_auto_possessify PCRE2_SUFFIX(_pcre2_auto_possessify_) -#define _pcre2_check_escape PCRE2_SUFFIX(_pcre2_check_escape_) -#define _pcre2_extuni PCRE2_SUFFIX(_pcre2_extuni_) -#define _pcre2_find_bracket PCRE2_SUFFIX(_pcre2_find_bracket_) -#define _pcre2_is_newline PCRE2_SUFFIX(_pcre2_is_newline_) -#define _pcre2_jit_free_rodata PCRE2_SUFFIX(_pcre2_jit_free_rodata_) -#define _pcre2_jit_free PCRE2_SUFFIX(_pcre2_jit_free_) -#define _pcre2_jit_get_size PCRE2_SUFFIX(_pcre2_jit_get_size_) -#define _pcre2_jit_get_target PCRE2_SUFFIX(_pcre2_jit_get_target_) -#define _pcre2_memctl_malloc PCRE2_SUFFIX(_pcre2_memctl_malloc_) -#define _pcre2_ord2utf PCRE2_SUFFIX(_pcre2_ord2utf_) -#define _pcre2_script_run PCRE2_SUFFIX(_pcre2_script_run_) -#define _pcre2_strcmp PCRE2_SUFFIX(_pcre2_strcmp_) -#define _pcre2_strcmp_c8 PCRE2_SUFFIX(_pcre2_strcmp_c8_) -#define _pcre2_strcpy_c8 PCRE2_SUFFIX(_pcre2_strcpy_c8_) -#define _pcre2_strlen PCRE2_SUFFIX(_pcre2_strlen_) -#define _pcre2_strncmp PCRE2_SUFFIX(_pcre2_strncmp_) -#define _pcre2_strncmp_c8 PCRE2_SUFFIX(_pcre2_strncmp_c8_) -#define _pcre2_study PCRE2_SUFFIX(_pcre2_study_) -#define _pcre2_valid_utf PCRE2_SUFFIX(_pcre2_valid_utf_) -#define _pcre2_was_newline PCRE2_SUFFIX(_pcre2_was_newline_) -#define _pcre2_xclass PCRE2_SUFFIX(_pcre2_xclass_) - -extern int _pcre2_auto_possessify(PCRE2_UCHAR *, - const compile_block *); -extern int _pcre2_check_escape(PCRE2_SPTR *, PCRE2_SPTR, uint32_t *, - int *, uint32_t, uint32_t, BOOL, compile_block *); -extern PCRE2_SPTR _pcre2_extuni(uint32_t, PCRE2_SPTR, PCRE2_SPTR, PCRE2_SPTR, - BOOL, int *); -extern PCRE2_SPTR _pcre2_find_bracket(PCRE2_SPTR, BOOL, int); -extern BOOL _pcre2_is_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR, - uint32_t *, BOOL); -extern void _pcre2_jit_free_rodata(void *, void *); -extern void _pcre2_jit_free(void *, pcre2_memctl *); -extern size_t _pcre2_jit_get_size(void *); -const char * _pcre2_jit_get_target(void); -extern void * _pcre2_memctl_malloc(size_t, 
pcre2_memctl *); -extern unsigned int _pcre2_ord2utf(uint32_t, PCRE2_UCHAR *); -extern BOOL _pcre2_script_run(PCRE2_SPTR, PCRE2_SPTR, BOOL); -extern int _pcre2_strcmp(PCRE2_SPTR, PCRE2_SPTR); -extern int _pcre2_strcmp_c8(PCRE2_SPTR, const char *); -extern PCRE2_SIZE _pcre2_strcpy_c8(PCRE2_UCHAR *, const char *); -extern PCRE2_SIZE _pcre2_strlen(PCRE2_SPTR); -extern int _pcre2_strncmp(PCRE2_SPTR, PCRE2_SPTR, size_t); -extern int _pcre2_strncmp_c8(PCRE2_SPTR, const char *, size_t); -extern int _pcre2_study(pcre2_real_code *); -extern int _pcre2_valid_utf(PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE *); -extern BOOL _pcre2_was_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR, - uint32_t *, BOOL); -extern BOOL _pcre2_xclass(uint32_t, PCRE2_SPTR, BOOL); - -/* This function is needed only when memmove() is not available. */ - -#if !defined(VPCOMPAT) && !defined(HAVE_MEMMOVE) -#define _pcre2_memmove PCRE2_SUFFIX(_pcre2_memmove) -extern void * _pcre2_memmove(void *, const void *, size_t); -#endif - -#endif /* PCRE2_CODE_UNIT_WIDTH */ -#endif /* PCRE2_INTERNAL_H_IDEMPOTENT_GUARD */ - -/* End of pcre2_internal.h */ diff --git a/pcre2/src/pcre2_intmodedep.h b/pcre2/src/pcre2_intmodedep.h deleted file mode 100644 index ea3b3ec69..000000000 --- a/pcre2/src/pcre2_intmodedep.h +++ /dev/null @@ -1,923 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. 
- - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2018 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - - -/* This module contains mode-dependent macro and structure definitions. The -file is #included by pcre2_internal.h if PCRE2_CODE_UNIT_WIDTH is defined. 
-These mode-dependent items are kept in a separate file so that they can also be -#included multiple times for different code unit widths by pcre2test in order -to have access to the hidden structures at all supported widths. - -Some of the mode-dependent macros are required at different widths for -different parts of the pcre2test code (in particular, the included -pcre_printint.c file). We undefine them here so that they can be re-defined for -multiple inclusions. Not all of these are used in pcre2test, but it's easier -just to undefine them all. */ - -#undef ACROSSCHAR -#undef BACKCHAR -#undef BYTES2CU -#undef CHMAX_255 -#undef CU2BYTES -#undef FORWARDCHAR -#undef FORWARDCHARTEST -#undef GET -#undef GET2 -#undef GETCHAR -#undef GETCHARINC -#undef GETCHARINCTEST -#undef GETCHARLEN -#undef GETCHARLENTEST -#undef GETCHARTEST -#undef GET_EXTRALEN -#undef HAS_EXTRALEN -#undef IMM2_SIZE -#undef MAX_255 -#undef MAX_MARK -#undef MAX_PATTERN_SIZE -#undef MAX_UTF_SINGLE_CU -#undef NOT_FIRSTCU -#undef PUT -#undef PUT2 -#undef PUT2INC -#undef PUTCHAR -#undef PUTINC -#undef TABLE_GET - - - -/* -------------------------- MACROS ----------------------------- */ - -/* PCRE keeps offsets in its compiled code as at least 16-bit quantities -(always stored in big-endian order in 8-bit mode) by default. These are used, -for example, to link from the start of a subpattern to its alternatives and its -end. The use of 16 bits per offset limits the size of an 8-bit compiled regex -to around 64K, which is big enough for almost everybody. However, I received a -request for an even bigger limit. For this reason, and also to make the code -easier to maintain, the storing and loading of offsets from the compiled code -unit string is now handled by the macros that are defined here. - -The macros are controlled by the value of LINK_SIZE. This defaults to 2, but -values of 3 or 4 are also supported. 
*/ - -/* ------------------- 8-bit support ------------------ */ - -#if PCRE2_CODE_UNIT_WIDTH == 8 - -#if LINK_SIZE == 2 -#define PUT(a,n,d) \ - (a[n] = (PCRE2_UCHAR)((d) >> 8)), \ - (a[(n)+1] = (PCRE2_UCHAR)((d) & 255)) -#define GET(a,n) \ - (unsigned int)(((a)[n] << 8) | (a)[(n)+1]) -#define MAX_PATTERN_SIZE (1 << 16) - -#elif LINK_SIZE == 3 -#define PUT(a,n,d) \ - (a[n] = (PCRE2_UCHAR)((d) >> 16)), \ - (a[(n)+1] = (PCRE2_UCHAR)((d) >> 8)), \ - (a[(n)+2] = (PCRE2_UCHAR)((d) & 255)) -#define GET(a,n) \ - (unsigned int)(((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2]) -#define MAX_PATTERN_SIZE (1 << 24) - -#elif LINK_SIZE == 4 -#define PUT(a,n,d) \ - (a[n] = (PCRE2_UCHAR)((d) >> 24)), \ - (a[(n)+1] = (PCRE2_UCHAR)((d) >> 16)), \ - (a[(n)+2] = (PCRE2_UCHAR)((d) >> 8)), \ - (a[(n)+3] = (PCRE2_UCHAR)((d) & 255)) -#define GET(a,n) \ - (unsigned int)(((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3]) -#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */ - -#else -#error LINK_SIZE must be 2, 3, or 4 -#endif - - -/* ------------------- 16-bit support ------------------ */ - -#elif PCRE2_CODE_UNIT_WIDTH == 16 - -#if LINK_SIZE == 2 -#undef LINK_SIZE -#define LINK_SIZE 1 -#define PUT(a,n,d) \ - (a[n] = (PCRE2_UCHAR)(d)) -#define GET(a,n) \ - (a[n]) -#define MAX_PATTERN_SIZE (1 << 16) - -#elif LINK_SIZE == 3 || LINK_SIZE == 4 -#undef LINK_SIZE -#define LINK_SIZE 2 -#define PUT(a,n,d) \ - (a[n] = (PCRE2_UCHAR)((d) >> 16)), \ - (a[(n)+1] = (PCRE2_UCHAR)((d) & 65535)) -#define GET(a,n) \ - (unsigned int)(((a)[n] << 16) | (a)[(n)+1]) -#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */ - -#else -#error LINK_SIZE must be 2, 3, or 4 -#endif - - -/* ------------------- 32-bit support ------------------ */ - -#elif PCRE2_CODE_UNIT_WIDTH == 32 -#undef LINK_SIZE -#define LINK_SIZE 1 -#define PUT(a,n,d) \ - (a[n] = (d)) -#define GET(a,n) \ - (a[n]) -#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */ - -#else -#error Unsupported compiling mode -#endif 
- - -/* --------------- Other mode-specific macros ----------------- */ - -/* PCRE uses some other (at least) 16-bit quantities that do not change when -the size of offsets changes. There are used for repeat counts and for other -things such as capturing parenthesis numbers in back references. - -Define the number of code units required to hold a 16-bit count/offset, and -macros to load and store such a value. For reasons that I do not understand, -the expression in the 8-bit GET2 macro is treated by gcc as a signed -expression, even when a is declared as unsigned. It seems that any kind of -arithmetic results in a signed value. Hence the cast. */ - -#if PCRE2_CODE_UNIT_WIDTH == 8 -#define IMM2_SIZE 2 -#define GET2(a,n) (unsigned int)(((a)[n] << 8) | (a)[(n)+1]) -#define PUT2(a,n,d) a[n] = (d) >> 8, a[(n)+1] = (d) & 255 - -#else /* Code units are 16 or 32 bits */ -#define IMM2_SIZE 1 -#define GET2(a,n) a[n] -#define PUT2(a,n,d) a[n] = d -#endif - -/* Other macros that are different for 8-bit mode. The MAX_255 macro checks -whether its argument, which is assumed to be one code unit, is less than 256. -The CHMAX_255 macro does not assume one code unit. The maximum length of a MARK -name must fit in one code unit; currently it is set to 255 or 65535. The -TABLE_GET macro is used to access elements of tables containing exactly 256 -items. Its argument is a code unit. When code points can be greater than 255, a -check is needed before accessing these tables. 
*/ - -#if PCRE2_CODE_UNIT_WIDTH == 8 -#define MAX_255(c) TRUE -#define MAX_MARK ((1u << 8) - 1) -#define TABLE_GET(c, table, default) ((table)[c]) -#ifdef SUPPORT_UNICODE -#define SUPPORT_WIDE_CHARS -#define CHMAX_255(c) ((c) <= 255u) -#else -#define CHMAX_255(c) TRUE -#endif /* SUPPORT_UNICODE */ - -#else /* Code units are 16 or 32 bits */ -#define CHMAX_255(c) ((c) <= 255u) -#define MAX_255(c) ((c) <= 255u) -#define MAX_MARK ((1u << 16) - 1) -#define SUPPORT_WIDE_CHARS -#define TABLE_GET(c, table, default) (MAX_255(c)? ((table)[c]):(default)) -#endif - - -/* ----------------- Character-handling macros ----------------- */ - -/* There is a proposed future special "UTF-21" mode, in which only the lowest -21 bits of a 32-bit character are interpreted as UTF, with the remaining 11 -high-order bits available to the application for other uses. In preparation for -the future implementation of this mode, there are macros that load a data item -and, if in this special mode, mask it to 21 bits. These macros all have names -starting with UCHAR21. In all other modes, including the normal 32-bit -library, the macros all have the same simple definitions. When the new mode is -implemented, it is expected that these definitions will be varied appropriately -using #ifdef when compiling the library that supports the special mode. */ - -#define UCHAR21(eptr) (*(eptr)) -#define UCHAR21TEST(eptr) (*(eptr)) -#define UCHAR21INC(eptr) (*(eptr)++) -#define UCHAR21INCTEST(eptr) (*(eptr)++) - -/* When UTF encoding is being used, a character is no longer just a single -byte in 8-bit mode or a single short in 16-bit mode. The macros for character -handling generate simple sequences when used in the basic mode, and more -complicated ones for UTF characters. GETCHARLENTEST and other macros are not -used when UTF is not supported. To make sure they can never even appear when -UTF support is omitted, we don't even define them. 
*/ - -#ifndef SUPPORT_UNICODE - -/* #define MAX_UTF_SINGLE_CU */ -/* #define HAS_EXTRALEN(c) */ -/* #define GET_EXTRALEN(c) */ -/* #define NOT_FIRSTCU(c) */ -#define GETCHAR(c, eptr) c = *eptr; -#define GETCHARTEST(c, eptr) c = *eptr; -#define GETCHARINC(c, eptr) c = *eptr++; -#define GETCHARINCTEST(c, eptr) c = *eptr++; -#define GETCHARLEN(c, eptr, len) c = *eptr; -#define PUTCHAR(c, p) (*p = c, 1) -/* #define GETCHARLENTEST(c, eptr, len) */ -/* #define BACKCHAR(eptr) */ -/* #define FORWARDCHAR(eptr) */ -/* #define FORWARCCHARTEST(eptr,end) */ -/* #define ACROSSCHAR(condition, eptr, action) */ - -#else /* SUPPORT_UNICODE */ - -/* ------------------- 8-bit support ------------------ */ - -#if PCRE2_CODE_UNIT_WIDTH == 8 -#define MAYBE_UTF_MULTI /* UTF chars may use multiple code units */ - -/* The largest UTF code point that can be encoded as a single code unit. */ - -#define MAX_UTF_SINGLE_CU 127 - -/* Tests whether the code point needs extra characters to decode. */ - -#define HAS_EXTRALEN(c) HASUTF8EXTRALEN(c) - -/* Returns with the additional number of characters if IS_MULTICHAR(c) is TRUE. -Otherwise it has an undefined behaviour. */ - -#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3fu]) - -/* Returns TRUE, if the given value is not the first code unit of a UTF -sequence. */ - -#define NOT_FIRSTCU(c) (((c) & 0xc0u) == 0x80u) - -/* Get the next UTF-8 character, not advancing the pointer. This is called when -we know we are in UTF-8 mode. */ - -#define GETCHAR(c, eptr) \ - c = *eptr; \ - if (c >= 0xc0u) GETUTF8(c, eptr); - -/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the -pointer. */ - -#define GETCHARTEST(c, eptr) \ - c = *eptr; \ - if (utf && c >= 0xc0u) GETUTF8(c, eptr); - -/* Get the next UTF-8 character, advancing the pointer. This is called when we -know we are in UTF-8 mode. 
*/ - -#define GETCHARINC(c, eptr) \ - c = *eptr++; \ - if (c >= 0xc0u) GETUTF8INC(c, eptr); - -/* Get the next character, testing for UTF-8 mode, and advancing the pointer. -This is called when we don't know if we are in UTF-8 mode. */ - -#define GETCHARINCTEST(c, eptr) \ - c = *eptr++; \ - if (utf && c >= 0xc0u) GETUTF8INC(c, eptr); - -/* Get the next UTF-8 character, not advancing the pointer, incrementing length -if there are extra bytes. This is called when we know we are in UTF-8 mode. */ - -#define GETCHARLEN(c, eptr, len) \ - c = *eptr; \ - if (c >= 0xc0u) GETUTF8LEN(c, eptr, len); - -/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the -pointer, incrementing length if there are extra bytes. This is called when we -do not know if we are in UTF-8 mode. */ - -#define GETCHARLENTEST(c, eptr, len) \ - c = *eptr; \ - if (utf && c >= 0xc0u) GETUTF8LEN(c, eptr, len); - -/* If the pointer is not at the start of a character, move it back until -it is. This is called only in UTF-8 mode - we don't put a test within the macro -because almost all calls are already within a block of UTF-8 only code. */ - -#define BACKCHAR(eptr) while((*eptr & 0xc0u) == 0x80u) eptr-- - -/* Same as above, just in the other direction. */ -#define FORWARDCHAR(eptr) while((*eptr & 0xc0u) == 0x80u) eptr++ -#define FORWARDCHARTEST(eptr,end) while(eptr < end && (*eptr & 0xc0u) == 0x80u) eptr++ - -/* Same as above, but it allows a fully customizable form. */ -#define ACROSSCHAR(condition, eptr, action) \ - while((condition) && ((*eptr) & 0xc0u) == 0x80u) action - -/* Deposit a character into memory, returning the number of code units. */ - -#define PUTCHAR(c, p) ((utf && c > MAX_UTF_SINGLE_CU)? \ - PRIV(ord2utf)(c,p) : (*p = c, 1)) - - -/* ------------------- 16-bit support ------------------ */ - -#elif PCRE2_CODE_UNIT_WIDTH == 16 -#define MAYBE_UTF_MULTI /* UTF chars may use multiple code units */ - -/* The largest UTF code point that can be encoded as a single code unit. 
*/ - -#define MAX_UTF_SINGLE_CU 65535 - -/* Tests whether the code point needs extra characters to decode. */ - -#define HAS_EXTRALEN(c) (((c) & 0xfc00u) == 0xd800u) - -/* Returns with the additional number of characters if IS_MULTICHAR(c) is TRUE. -Otherwise it has an undefined behaviour. */ - -#define GET_EXTRALEN(c) 1 - -/* Returns TRUE, if the given value is not the first code unit of a UTF -sequence. */ - -#define NOT_FIRSTCU(c) (((c) & 0xfc00u) == 0xdc00u) - -/* Base macro to pick up the low surrogate of a UTF-16 character, not -advancing the pointer. */ - -#define GETUTF16(c, eptr) \ - { c = (((c & 0x3ffu) << 10) | (eptr[1] & 0x3ffu)) + 0x10000u; } - -/* Get the next UTF-16 character, not advancing the pointer. This is called when -we know we are in UTF-16 mode. */ - -#define GETCHAR(c, eptr) \ - c = *eptr; \ - if ((c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr); - -/* Get the next UTF-16 character, testing for UTF-16 mode, and not advancing the -pointer. */ - -#define GETCHARTEST(c, eptr) \ - c = *eptr; \ - if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr); - -/* Base macro to pick up the low surrogate of a UTF-16 character, advancing -the pointer. */ - -#define GETUTF16INC(c, eptr) \ - { c = (((c & 0x3ffu) << 10) | (*eptr++ & 0x3ffu)) + 0x10000u; } - -/* Get the next UTF-16 character, advancing the pointer. This is called when we -know we are in UTF-16 mode. */ - -#define GETCHARINC(c, eptr) \ - c = *eptr++; \ - if ((c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr); - -/* Get the next character, testing for UTF-16 mode, and advancing the pointer. -This is called when we don't know if we are in UTF-16 mode. */ - -#define GETCHARINCTEST(c, eptr) \ - c = *eptr++; \ - if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr); - -/* Base macro to pick up the low surrogate of a UTF-16 character, not -advancing the pointer, incrementing the length. 
*/ - -#define GETUTF16LEN(c, eptr, len) \ - { c = (((c & 0x3ffu) << 10) | (eptr[1] & 0x3ffu)) + 0x10000u; len++; } - -/* Get the next UTF-16 character, not advancing the pointer, incrementing -length if there is a low surrogate. This is called when we know we are in -UTF-16 mode. */ - -#define GETCHARLEN(c, eptr, len) \ - c = *eptr; \ - if ((c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len); - -/* Get the next UTF-816character, testing for UTF-16 mode, not advancing the -pointer, incrementing length if there is a low surrogate. This is called when -we do not know if we are in UTF-16 mode. */ - -#define GETCHARLENTEST(c, eptr, len) \ - c = *eptr; \ - if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len); - -/* If the pointer is not at the start of a character, move it back until -it is. This is called only in UTF-16 mode - we don't put a test within the -macro because almost all calls are already within a block of UTF-16 only -code. */ - -#define BACKCHAR(eptr) if ((*eptr & 0xfc00u) == 0xdc00u) eptr-- - -/* Same as above, just in the other direction. */ -#define FORWARDCHAR(eptr) if ((*eptr & 0xfc00u) == 0xdc00u) eptr++ -#define FORWARDCHARTEST(eptr,end) if (eptr < end && (*eptr & 0xfc00u) == 0xdc00u) eptr++ - -/* Same as above, but it allows a fully customizable form. */ -#define ACROSSCHAR(condition, eptr, action) \ - if ((condition) && ((*eptr) & 0xfc00u) == 0xdc00u) action - -/* Deposit a character into memory, returning the number of code units. */ - -#define PUTCHAR(c, p) ((utf && c > MAX_UTF_SINGLE_CU)? \ - PRIV(ord2utf)(c,p) : (*p = c, 1)) - - -/* ------------------- 32-bit support ------------------ */ - -#else - -/* These are trivial for the 32-bit library, since all UTF-32 characters fit -into one PCRE2_UCHAR unit. */ - -#define MAX_UTF_SINGLE_CU (0x10ffffu) -#define HAS_EXTRALEN(c) (0) -#define GET_EXTRALEN(c) (0) -#define NOT_FIRSTCU(c) (0) - -/* Get the next UTF-32 character, not advancing the pointer. 
This is called when -we know we are in UTF-32 mode. */ - -#define GETCHAR(c, eptr) \ - c = *(eptr); - -/* Get the next UTF-32 character, testing for UTF-32 mode, and not advancing the -pointer. */ - -#define GETCHARTEST(c, eptr) \ - c = *(eptr); - -/* Get the next UTF-32 character, advancing the pointer. This is called when we -know we are in UTF-32 mode. */ - -#define GETCHARINC(c, eptr) \ - c = *((eptr)++); - -/* Get the next character, testing for UTF-32 mode, and advancing the pointer. -This is called when we don't know if we are in UTF-32 mode. */ - -#define GETCHARINCTEST(c, eptr) \ - c = *((eptr)++); - -/* Get the next UTF-32 character, not advancing the pointer, not incrementing -length (since all UTF-32 is of length 1). This is called when we know we are in -UTF-32 mode. */ - -#define GETCHARLEN(c, eptr, len) \ - GETCHAR(c, eptr) - -/* Get the next UTF-32character, testing for UTF-32 mode, not advancing the -pointer, not incrementing the length (since all UTF-32 is of length 1). -This is called when we do not know if we are in UTF-32 mode. */ - -#define GETCHARLENTEST(c, eptr, len) \ - GETCHARTEST(c, eptr) - -/* If the pointer is not at the start of a character, move it back until -it is. This is called only in UTF-32 mode - we don't put a test within the -macro because almost all calls are already within a block of UTF-32 only -code. - -These are all no-ops since all UTF-32 characters fit into one pcre_uchar. */ - -#define BACKCHAR(eptr) do { } while (0) - -/* Same as above, just in the other direction. */ - -#define FORWARDCHAR(eptr) do { } while (0) -#define FORWARDCHARTEST(eptr,end) do { } while (0) - -/* Same as above, but it allows a fully customizable form. */ - -#define ACROSSCHAR(condition, eptr, action) do { } while (0) - -/* Deposit a character into memory, returning the number of code units. 
*/ - -#define PUTCHAR(c, p) (*p = c, 1) - -#endif /* UTF-32 character handling */ -#endif /* SUPPORT_UNICODE */ - - -/* Mode-dependent macros that have the same definition in all modes. */ - -#define CU2BYTES(x) ((x)*((PCRE2_CODE_UNIT_WIDTH/8))) -#define BYTES2CU(x) ((x)/((PCRE2_CODE_UNIT_WIDTH/8))) -#define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE -#define PUT2INC(a,n,d) PUT2(a,n,d), a += IMM2_SIZE - - -/* ----------------------- HIDDEN STRUCTURES ----------------------------- */ - -/* NOTE: All these structures *must* start with a pcre2_memctl structure. The -code that uses them is simpler because it assumes this. */ - -/* The real general context structure. At present it holds only data for custom -memory control. */ - -typedef struct pcre2_real_general_context { - pcre2_memctl memctl; -} pcre2_real_general_context; - -/* The real compile context structure */ - -typedef struct pcre2_real_compile_context { - pcre2_memctl memctl; - int (*stack_guard)(uint32_t, void *); - void *stack_guard_data; - const uint8_t *tables; - PCRE2_SIZE max_pattern_length; - uint16_t bsr_convention; - uint16_t newline_convention; - uint32_t parens_nest_limit; - uint32_t extra_options; -} pcre2_real_compile_context; - -/* The real match context structure. */ - -typedef struct pcre2_real_match_context { - pcre2_memctl memctl; -#ifdef SUPPORT_JIT - pcre2_jit_callback jit_callback; - void *jit_callback_data; -#endif - int (*callout)(pcre2_callout_block *, void *); - void *callout_data; - int (*substitute_callout)(pcre2_substitute_callout_block *, void *); - void *substitute_callout_data; - PCRE2_SIZE offset_limit; - uint32_t heap_limit; - uint32_t match_limit; - uint32_t depth_limit; -} pcre2_real_match_context; - -/* The real convert context structure. */ - -typedef struct pcre2_real_convert_context { - pcre2_memctl memctl; - uint32_t glob_separator; - uint32_t glob_escape; -} pcre2_real_convert_context; - -/* The real compiled code structure. 
The type for the blocksize field is -defined specially because it is required in pcre2_serialize_decode() when -copying the size from possibly unaligned memory into a variable of the same -type. Use a macro rather than a typedef to avoid compiler warnings when this -file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the -largest lookbehind that is supported. (OP_REVERSE in a pattern has a 16-bit -argument in 8-bit and 16-bit modes, so we need no more than a 16-bit field -here.) */ - -#undef CODE_BLOCKSIZE_TYPE -#define CODE_BLOCKSIZE_TYPE size_t - -#undef LOOKBEHIND_MAX -#define LOOKBEHIND_MAX UINT16_MAX - -typedef struct pcre2_real_code { - pcre2_memctl memctl; /* Memory control fields */ - const uint8_t *tables; /* The character tables */ - void *executable_jit; /* Pointer to JIT code */ - uint8_t start_bitmap[32]; /* Bitmap for starting code unit < 256 */ - CODE_BLOCKSIZE_TYPE blocksize; /* Total (bytes) that was malloc-ed */ - uint32_t magic_number; /* Paranoid and endianness check */ - uint32_t compile_options; /* Options passed to pcre2_compile() */ - uint32_t overall_options; /* Options after processing the pattern */ - uint32_t extra_options; /* Taken from compile_context */ - uint32_t flags; /* Various state flags */ - uint32_t limit_heap; /* Limit set in the pattern */ - uint32_t limit_match; /* Limit set in the pattern */ - uint32_t limit_depth; /* Limit set in the pattern */ - uint32_t first_codeunit; /* Starting code unit */ - uint32_t last_codeunit; /* This codeunit must be seen */ - uint16_t bsr_convention; /* What \R matches */ - uint16_t newline_convention; /* What is a newline? 
*/ - uint16_t max_lookbehind; /* Longest lookbehind (characters) */ - uint16_t minlength; /* Minimum length of match */ - uint16_t top_bracket; /* Highest numbered group */ - uint16_t top_backref; /* Highest numbered back reference */ - uint16_t name_entry_size; /* Size (code units) of table entries */ - uint16_t name_count; /* Number of name entries in the table */ -} pcre2_real_code; - -/* The real match data structure. Define ovector as large as it can ever -actually be so that array bound checkers don't grumble. Memory for this -structure is obtained by calling pcre2_match_data_create(), which sets the size -as the offset of ovector plus a pair of elements for each capturable string, so -the size varies from call to call. As the maximum number of capturing -subpatterns is 65535 we must allow for 65536 strings to include the overall -match. (See also the heapframe structure below.) */ - -typedef struct pcre2_real_match_data { - pcre2_memctl memctl; - const pcre2_real_code *code; /* The pattern used for the match */ - PCRE2_SPTR subject; /* The subject that was matched */ - PCRE2_SPTR mark; /* Pointer to last mark */ - PCRE2_SIZE leftchar; /* Offset to leftmost code unit */ - PCRE2_SIZE rightchar; /* Offset to rightmost code unit */ - PCRE2_SIZE startchar; /* Offset to starting code unit */ - uint8_t matchedby; /* Type of match (normal, JIT, DFA) */ - uint8_t flags; /* Various flags */ - uint16_t oveccount; /* Number of pairs */ - int rc; /* The return code from the match */ - PCRE2_SIZE ovector[131072]; /* Must be last in the structure */ -} pcre2_real_match_data; - - -/* ----------------------- PRIVATE STRUCTURES ----------------------------- */ - -/* These structures are not needed for pcre2test. */ - -#ifndef PCRE2_PCRE2TEST - -/* Structures for checking for mutual recursion when scanning compiled or -parsed code. 
*/ - -typedef struct recurse_check { - struct recurse_check *prev; - PCRE2_SPTR group; -} recurse_check; - -typedef struct parsed_recurse_check { - struct parsed_recurse_check *prev; - uint32_t *groupptr; -} parsed_recurse_check; - -/* Structure for building a cache when filling in recursion offsets. */ - -typedef struct recurse_cache { - PCRE2_SPTR group; - int groupnumber; -} recurse_cache; - -/* Structure for maintaining a chain of pointers to the currently incomplete -branches, for testing for left recursion while compiling. */ - -typedef struct branch_chain { - struct branch_chain *outer; - PCRE2_UCHAR *current_branch; -} branch_chain; - -/* Structure for building a list of named groups during the first pass of -compiling. */ - -typedef struct named_group { - PCRE2_SPTR name; /* Points to the name in the pattern */ - uint32_t number; /* Group number */ - uint16_t length; /* Length of the name */ - uint16_t isdup; /* TRUE if a duplicate */ -} named_group; - -/* Structure for passing "static" information around between the functions -doing the compiling, so that they are thread-safe. 
*/ - -typedef struct compile_block { - pcre2_real_compile_context *cx; /* Points to the compile context */ - const uint8_t *lcc; /* Points to lower casing table */ - const uint8_t *fcc; /* Points to case-flipping table */ - const uint8_t *cbits; /* Points to character type table */ - const uint8_t *ctypes; /* Points to table of type maps */ - PCRE2_SPTR start_workspace; /* The start of working space */ - PCRE2_SPTR start_code; /* The start of the compiled code */ - PCRE2_SPTR start_pattern; /* The start of the pattern */ - PCRE2_SPTR end_pattern; /* The end of the pattern */ - PCRE2_UCHAR *name_table; /* The name/number table */ - PCRE2_SIZE workspace_size; /* Size of workspace */ - PCRE2_SIZE small_ref_offset[10]; /* Offsets for \1 to \9 */ - PCRE2_SIZE erroroffset; /* Offset of error in pattern */ - uint16_t names_found; /* Number of entries so far */ - uint16_t name_entry_size; /* Size of each entry */ - uint16_t parens_depth; /* Depth of nested parentheses */ - uint16_t assert_depth; /* Depth of nested assertions */ - open_capitem *open_caps; /* Chain of open capture items */ - named_group *named_groups; /* Points to vector in pre-compile */ - uint32_t named_group_list_size; /* Number of entries in the list */ - uint32_t external_options; /* External (initial) options */ - uint32_t external_flags; /* External flag bits to be set */ - uint32_t bracount; /* Count of capturing parentheses */ - uint32_t lastcapture; /* Last capture encountered */ - uint32_t *parsed_pattern; /* Parsed pattern buffer */ - uint32_t *parsed_pattern_end; /* Parsed pattern should not get here */ - uint32_t *groupinfo; /* Group info vector */ - uint32_t top_backref; /* Maximum back reference */ - uint32_t backref_map; /* Bitmap of low back refs */ - uint32_t nltype; /* Newline type */ - uint32_t nllen; /* Newline string length */ - uint32_t class_range_start; /* Overall class range start */ - uint32_t class_range_end; /* Overall class range end */ - PCRE2_UCHAR nl[4]; /* Newline string 
when fixed length */ - int max_lookbehind; /* Maximum lookbehind (characters) */ - int req_varyopt; /* "After variable item" flag for reqbyte */ - BOOL had_accept; /* (*ACCEPT) encountered */ - BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */ - BOOL had_recurse; /* Had a recursion or subroutine call */ - BOOL dupnames; /* Duplicate names exist */ -} compile_block; - -/* Structure for keeping the properties of the in-memory stack used -by the JIT matcher. */ - -typedef struct pcre2_real_jit_stack { - pcre2_memctl memctl; - void* stack; -} pcre2_real_jit_stack; - -/* Structure for items in a linked list that represents an explicit recursive -call within the pattern when running pcre_dfa_match(). */ - -typedef struct dfa_recursion_info { - struct dfa_recursion_info *prevrec; - PCRE2_SPTR subject_position; - uint32_t group_num; -} dfa_recursion_info; - -/* Structure for "stack" frames that are used for remembering backtracking -positions during matching. As these are used in a vector, with the ovector item -being extended, the size of the structure must be a multiple of PCRE2_SIZE. The -only way to check this at compile time is to force an error by generating an -array with a negative size. By putting this in a typedef (which is never used), -we don't generate any code when all is well. */ - -typedef struct heapframe { - - /* The first set of fields are variables that have to be preserved over calls - to RRMATCH(), but which do not need to be copied to new frames. 
*/ - - PCRE2_SPTR ecode; /* The current position in the pattern */ - PCRE2_SPTR temp_sptr[2]; /* Used for short-term PCRE_SPTR values */ - PCRE2_SIZE length; /* Used for character, string, or code lengths */ - PCRE2_SIZE back_frame; /* Amount to subtract on RRETURN */ - PCRE2_SIZE temp_size; /* Used for short-term PCRE2_SIZE values */ - uint32_t rdepth; /* "Recursion" depth */ - uint32_t group_frame_type; /* Type information for group frames */ - uint32_t temp_32[4]; /* Used for short-term 32-bit or BOOL values */ - uint8_t return_id; /* Where to go on in internal "return" */ - uint8_t op; /* Processing opcode */ - - /* At this point, the structure is 16-bit aligned. On most architectures - the alignment requirement for a pointer will ensure that the eptr field below - is 32-bit or 64-bit aligned. However, on m68k it is fine to have a pointer - that is 16-bit aligned. We must therefore ensure that what comes between here - and eptr is an odd multiple of 16 bits so as to get back into 32-bit - alignment. This happens naturally when PCRE2_UCHAR is 8 bits wide, but needs - fudges in the other cases. In the 32-bit case the padding comes first so that - the occu field itself is 32-bit aligned. Without the padding, this structure - is no longer a multiple of PCRE2_SIZE on m68k, and the check below fails. */ - -#if PCRE2_CODE_UNIT_WIDTH == 8 - PCRE2_UCHAR occu[6]; /* Used for other case code units */ -#elif PCRE2_CODE_UNIT_WIDTH == 16 - PCRE2_UCHAR occu[2]; /* Used for other case code units */ - uint8_t unused[2]; /* Ensure 32-bit alignment (see above) */ -#else - uint8_t unused[2]; /* Ensure 32-bit alignment (see above) */ - PCRE2_UCHAR occu[1]; /* Used for other case code units */ -#endif - - /* The rest have to be copied from the previous frame whenever a new frame - becomes current. The final field is specified as a large vector so that - runtime array bound checks don't catch references to it. 
However, for any - specific call to pcre2_match() the memory allocated for each frame structure - allows for exactly the right size ovector for the number of capturing - parentheses. (See also the comment for pcre2_real_match_data above.) */ - - PCRE2_SPTR eptr; /* MUST BE FIRST */ - PCRE2_SPTR start_match; /* Can be adjusted by \K */ - PCRE2_SPTR mark; /* Most recent mark on the success path */ - uint32_t current_recurse; /* Current (deepest) recursion number */ - uint32_t capture_last; /* Most recent capture */ - PCRE2_SIZE last_group_offset; /* Saved offset to most recent group frame */ - PCRE2_SIZE offset_top; /* Offset after highest capture */ - PCRE2_SIZE ovector[131072]; /* Must be last in the structure */ -} heapframe; - -/* This typedef is a check that the size of the heapframe structure is a -multiple of PCRE2_SIZE. See various comments above. */ - -typedef char check_heapframe_size[ - ((sizeof(heapframe) % sizeof(PCRE2_SIZE)) == 0)? (+1):(-1)]; - -/* Structure for passing "static" information around between the functions -doing traditional NFA matching (pcre2_match() and friends). 
*/ - -typedef struct match_block { - pcre2_memctl memctl; /* For general use */ - PCRE2_SIZE frame_vector_size; /* Size of a backtracking frame */ - heapframe *match_frames; /* Points to vector of frames */ - heapframe *match_frames_top; /* Points after the end of the vector */ - heapframe *stack_frames; /* The original vector on the stack */ - PCRE2_SIZE heap_limit; /* As it says */ - uint32_t match_limit; /* As it says */ - uint32_t match_limit_depth; /* As it says */ - uint32_t match_call_count; /* Number of times a new frame is created */ - BOOL hitend; /* Hit the end of the subject at some point */ - BOOL hasthen; /* Pattern contains (*THEN) */ - BOOL allowemptypartial; /* Allow empty hard partial */ - const uint8_t *lcc; /* Points to lower casing table */ - const uint8_t *fcc; /* Points to case-flipping table */ - const uint8_t *ctypes; /* Points to table of type maps */ - PCRE2_SIZE start_offset; /* The start offset value */ - PCRE2_SIZE end_offset_top; /* Highwater mark at end of match */ - uint16_t partial; /* PARTIAL options */ - uint16_t bsr_convention; /* \R interpretation */ - uint16_t name_count; /* Number of names in name table */ - uint16_t name_entry_size; /* Size of entry in names table */ - PCRE2_SPTR name_table; /* Table of group names */ - PCRE2_SPTR start_code; /* For use when recursing */ - PCRE2_SPTR start_subject; /* Start of the subject string */ - PCRE2_SPTR check_subject; /* Where UTF-checked from */ - PCRE2_SPTR end_subject; /* End of the subject string */ - PCRE2_SPTR end_match_ptr; /* Subject position at end match */ - PCRE2_SPTR start_used_ptr; /* Earliest consulted character */ - PCRE2_SPTR last_used_ptr; /* Latest consulted character */ - PCRE2_SPTR mark; /* Mark pointer to pass back on success */ - PCRE2_SPTR nomatch_mark; /* Mark pointer to pass back on failure */ - PCRE2_SPTR verb_ecode_ptr; /* For passing back info */ - PCRE2_SPTR verb_skip_ptr; /* For passing back a (*SKIP) name */ - uint32_t verb_current_recurse; /* Current 
recurse when (*VERB) happens */ - uint32_t moptions; /* Match options */ - uint32_t poptions; /* Pattern options */ - uint32_t skip_arg_count; /* For counting SKIP_ARGs */ - uint32_t ignore_skip_arg; /* For re-run when SKIP arg name not found */ - uint32_t nltype; /* Newline type */ - uint32_t nllen; /* Newline string length */ - PCRE2_UCHAR nl[4]; /* Newline string when fixed */ - pcre2_callout_block *cb; /* Points to a callout block */ - void *callout_data; /* To pass back to callouts */ - int (*callout)(pcre2_callout_block *,void *); /* Callout function or NULL */ -} match_block; - -/* A similar structure is used for the same purpose by the DFA matching -functions. */ - -typedef struct dfa_match_block { - pcre2_memctl memctl; /* For general use */ - PCRE2_SPTR start_code; /* Start of the compiled pattern */ - PCRE2_SPTR start_subject ; /* Start of the subject string */ - PCRE2_SPTR end_subject; /* End of subject string */ - PCRE2_SPTR start_used_ptr; /* Earliest consulted character */ - PCRE2_SPTR last_used_ptr; /* Latest consulted character */ - const uint8_t *tables; /* Character tables */ - PCRE2_SIZE start_offset; /* The start offset value */ - PCRE2_SIZE heap_limit; /* As it says */ - PCRE2_SIZE heap_used; /* As it says */ - uint32_t match_limit; /* As it says */ - uint32_t match_limit_depth; /* As it says */ - uint32_t match_call_count; /* Number of calls of internal function */ - uint32_t moptions; /* Match options */ - uint32_t poptions; /* Pattern options */ - uint32_t nltype; /* Newline type */ - uint32_t nllen; /* Newline string length */ - BOOL allowemptypartial; /* Allow empty hard partial */ - PCRE2_UCHAR nl[4]; /* Newline string when fixed */ - uint16_t bsr_convention; /* \R interpretation */ - pcre2_callout_block *cb; /* Points to a callout block */ - void *callout_data; /* To pass back to callouts */ - int (*callout)(pcre2_callout_block *,void *); /* Callout function or NULL */ - dfa_recursion_info *recursive; /* Linked list of recursion data */ 
-} dfa_match_block; - -#endif /* PCRE2_PCRE2TEST */ - -/* End of pcre2_intmodedep.h */ diff --git a/pcre2/src/pcre2_jit_compile.c b/pcre2/src/pcre2_jit_compile.c deleted file mode 100644 index 1977d28aa..000000000 --- a/pcre2/src/pcre2_jit_compile.c +++ /dev/null @@ -1,14254 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel - This module by Zoltan Herczeg - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2019 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include "pcre2_internal.h" - -#ifdef SUPPORT_JIT - -/* All-in-one: Since we use the JIT compiler only from here, -we just include it. This way we don't need to touch the build -system files. */ - -#define SLJIT_CONFIG_AUTO 1 -#define SLJIT_CONFIG_STATIC 1 -#define SLJIT_VERBOSE 0 - -#ifdef PCRE2_DEBUG -#define SLJIT_DEBUG 1 -#else -#define SLJIT_DEBUG 0 -#endif - -#define SLJIT_MALLOC(size, allocator_data) pcre2_jit_malloc(size, allocator_data) -#define SLJIT_FREE(ptr, allocator_data) pcre2_jit_free(ptr, allocator_data) - -static void * pcre2_jit_malloc(size_t size, void *allocator_data) -{ -pcre2_memctl *allocator = ((pcre2_memctl*)allocator_data); -return allocator->malloc(size, allocator->memory_data); -} - -static void pcre2_jit_free(void *ptr, void *allocator_data) -{ -pcre2_memctl *allocator = ((pcre2_memctl*)allocator_data); -allocator->free(ptr, allocator->memory_data); -} - -#include "sljit/sljitLir.c" - -#if defined SLJIT_CONFIG_UNSUPPORTED && SLJIT_CONFIG_UNSUPPORTED -#error Unsupported architecture -#endif - -/* Defines for debugging purposes. */ - -/* 1 - Use unoptimized capturing brackets. - 2 - Enable capture_last_ptr (includes option 1). */ -/* #define DEBUG_FORCE_UNOPTIMIZED_CBRAS 2 */ - -/* 1 - Always have a control head. 
*/ -/* #define DEBUG_FORCE_CONTROL_HEAD 1 */ - -/* Allocate memory for the regex stack on the real machine stack. -Fast, but limited size. */ -#define MACHINE_STACK_SIZE 32768 - -/* Growth rate for stack allocated by the OS. Should be the multiply -of page size. */ -#define STACK_GROWTH_RATE 8192 - -/* Enable to check that the allocation could destroy temporaries. */ -#if defined SLJIT_DEBUG && SLJIT_DEBUG -#define DESTROY_REGISTERS 1 -#endif - -/* -Short summary about the backtracking mechanism empolyed by the jit code generator: - -The code generator follows the recursive nature of the PERL compatible regular -expressions. The basic blocks of regular expressions are condition checkers -whose execute different commands depending on the result of the condition check. -The relationship between the operators can be horizontal (concatenation) and -vertical (sub-expression) (See struct backtrack_common for more details). - - 'ab' - 'a' and 'b' regexps are concatenated - 'a+' - 'a' is the sub-expression of the '+' operator - -The condition checkers are boolean (true/false) checkers. Machine code is generated -for the checker itself and for the actions depending on the result of the checker. -The 'true' case is called as the matching path (expected path), and the other is called as -the 'backtrack' path. Branch instructions are expesive for all CPUs, so we avoid taken -branches on the matching path. - - Greedy star operator (*) : - Matching path: match happens. - Backtrack path: match failed. - Non-greedy star operator (*?) : - Matching path: no need to perform a match. - Backtrack path: match is required. - -The following example shows how the code generated for a capturing bracket -with two alternatives. 
Let A, B, C, D are arbirary regular expressions, and -we have the following regular expression: - - A(B|C)D - -The generated code will be the following: - - A matching path - '(' matching path (pushing arguments to the stack) - B matching path - ')' matching path (pushing arguments to the stack) - D matching path - return with successful match - - D backtrack path - ')' backtrack path (If we arrived from "C" jump to the backtrack of "C") - B backtrack path - C expected path - jump to D matching path - C backtrack path - A backtrack path - - Notice, that the order of backtrack code paths are the opposite of the fast - code paths. In this way the topmost value on the stack is always belong - to the current backtrack code path. The backtrack path must check - whether there is a next alternative. If so, it needs to jump back to - the matching path eventually. Otherwise it needs to clear out its own stack - frame and continue the execution on the backtrack code paths. -*/ - -/* -Saved stack frames: - -Atomic blocks and asserts require reloading the values of private data -when the backtrack mechanism performed. Because of OP_RECURSE, the data -are not necessarly known in compile time, thus we need a dynamic restore -mechanism. - -The stack frames are stored in a chain list, and have the following format: -([ capturing bracket offset ][ start value ][ end value ])+ ... [ 0 ] [ previous head ] - -Thus we can restore the private data to a particular point in the stack. -*/ - -typedef struct jit_arguments { - /* Pointers first. */ - struct sljit_stack *stack; - PCRE2_SPTR str; - PCRE2_SPTR begin; - PCRE2_SPTR end; - pcre2_match_data *match_data; - PCRE2_SPTR startchar_ptr; - PCRE2_UCHAR *mark_ptr; - int (*callout)(pcre2_callout_block *, void *); - void *callout_data; - /* Everything else after. 
*/ - sljit_uw offset_limit; - sljit_u32 limit_match; - sljit_u32 oveccount; - sljit_u32 options; -} jit_arguments; - -#define JIT_NUMBER_OF_COMPILE_MODES 3 - -typedef struct executable_functions { - void *executable_funcs[JIT_NUMBER_OF_COMPILE_MODES]; - void *read_only_data_heads[JIT_NUMBER_OF_COMPILE_MODES]; - sljit_uw executable_sizes[JIT_NUMBER_OF_COMPILE_MODES]; - sljit_u32 top_bracket; - sljit_u32 limit_match; -} executable_functions; - -typedef struct jump_list { - struct sljit_jump *jump; - struct jump_list *next; -} jump_list; - -typedef struct stub_list { - struct sljit_jump *start; - struct sljit_label *quit; - struct stub_list *next; -} stub_list; - -enum frame_types { - no_frame = -1, - no_stack = -2 -}; - -enum control_types { - type_mark = 0, - type_then_trap = 1 -}; - -enum early_fail_types { - type_skip = 0, - type_fail = 1, - type_fail_range = 2 -}; - -typedef int (SLJIT_FUNC *jit_function)(jit_arguments *args); - -/* The following structure is the key data type for the recursive -code generator. It is allocated by compile_matchingpath, and contains -the arguments for compile_backtrackingpath. Must be the first member -of its descendants. */ -typedef struct backtrack_common { - /* Concatenation stack. */ - struct backtrack_common *prev; - jump_list *nextbacktracks; - /* Internal stack (for component operators). */ - struct backtrack_common *top; - jump_list *topbacktracks; - /* Opcode pointer. */ - PCRE2_SPTR cc; -} backtrack_common; - -typedef struct assert_backtrack { - backtrack_common common; - jump_list *condfailed; - /* Less than 0 if a frame is not needed. */ - int framesize; - /* Points to our private memory word on the stack. */ - int private_data_ptr; - /* For iterators. */ - struct sljit_label *matchingpath; -} assert_backtrack; - -typedef struct bracket_backtrack { - backtrack_common common; - /* Where to coninue if an alternative is successfully matched. 
*/ - struct sljit_label *alternative_matchingpath; - /* For rmin and rmax iterators. */ - struct sljit_label *recursive_matchingpath; - /* For greedy ? operator. */ - struct sljit_label *zero_matchingpath; - /* Contains the branches of a failed condition. */ - union { - /* Both for OP_COND, OP_SCOND. */ - jump_list *condfailed; - assert_backtrack *assert; - /* For OP_ONCE. Less than 0 if not needed. */ - int framesize; - /* For brackets with >3 alternatives. */ - struct sljit_put_label *matching_put_label; - } u; - /* Points to our private memory word on the stack. */ - int private_data_ptr; -} bracket_backtrack; - -typedef struct bracketpos_backtrack { - backtrack_common common; - /* Points to our private memory word on the stack. */ - int private_data_ptr; - /* Reverting stack is needed. */ - int framesize; - /* Allocated stack size. */ - int stacksize; -} bracketpos_backtrack; - -typedef struct braminzero_backtrack { - backtrack_common common; - struct sljit_label *matchingpath; -} braminzero_backtrack; - -typedef struct char_iterator_backtrack { - backtrack_common common; - /* Next iteration. */ - struct sljit_label *matchingpath; - union { - jump_list *backtracks; - struct { - unsigned int othercasebit; - PCRE2_UCHAR chr; - BOOL enabled; - } charpos; - } u; -} char_iterator_backtrack; - -typedef struct ref_iterator_backtrack { - backtrack_common common; - /* Next iteration. */ - struct sljit_label *matchingpath; -} ref_iterator_backtrack; - -typedef struct recurse_entry { - struct recurse_entry *next; - /* Contains the function entry label. */ - struct sljit_label *entry_label; - /* Contains the function entry label. */ - struct sljit_label *backtrack_label; - /* Collects the entry calls until the function is not created. */ - jump_list *entry_calls; - /* Collects the backtrack calls until the function is not created. */ - jump_list *backtrack_calls; - /* Points to the starting opcode. 
*/ - sljit_sw start; -} recurse_entry; - -typedef struct recurse_backtrack { - backtrack_common common; - /* Return to the matching path. */ - struct sljit_label *matchingpath; - /* Recursive pattern. */ - recurse_entry *entry; - /* Pattern is inlined. */ - BOOL inlined_pattern; -} recurse_backtrack; - -#define OP_THEN_TRAP OP_TABLE_LENGTH - -typedef struct then_trap_backtrack { - backtrack_common common; - /* If then_trap is not NULL, this structure contains the real - then_trap for the backtracking path. */ - struct then_trap_backtrack *then_trap; - /* Points to the starting opcode. */ - sljit_sw start; - /* Exit point for the then opcodes of this alternative. */ - jump_list *quit; - /* Frame size of the current alternative. */ - int framesize; -} then_trap_backtrack; - -#define MAX_N_CHARS 12 -#define MAX_DIFF_CHARS 5 - -typedef struct fast_forward_char_data { - /* Number of characters in the chars array, 255 for any character. */ - sljit_u8 count; - /* Number of last UTF-8 characters in the chars array. */ - sljit_u8 last_count; - /* Available characters in the current position. */ - PCRE2_UCHAR chars[MAX_DIFF_CHARS]; -} fast_forward_char_data; - -#define MAX_CLASS_RANGE_SIZE 4 -#define MAX_CLASS_CHARS_SIZE 3 - -typedef struct compiler_common { - /* The sljit ceneric compiler. */ - struct sljit_compiler *compiler; - /* Compiled regular expression. */ - pcre2_real_code *re; - /* First byte code. */ - PCRE2_SPTR start; - /* Maps private data offset to each opcode. */ - sljit_s32 *private_data_ptrs; - /* Chain list of read-only data ptrs. */ - void *read_only_data_head; - /* Tells whether the capturing bracket is optimized. */ - sljit_u8 *optimized_cbracket; - /* Tells whether the starting offset is a target of then. */ - sljit_u8 *then_offsets; - /* Current position where a THEN must jump. */ - then_trap_backtrack *then_trap; - /* Starting offset of private data for capturing brackets. */ - sljit_s32 cbra_ptr; - /* Output vector starting point. 
Must be divisible by 2. */ - sljit_s32 ovector_start; - /* Points to the starting character of the current match. */ - sljit_s32 start_ptr; - /* Last known position of the requested byte. */ - sljit_s32 req_char_ptr; - /* Head of the last recursion. */ - sljit_s32 recursive_head_ptr; - /* First inspected character for partial matching. - (Needed for avoiding zero length partial matches.) */ - sljit_s32 start_used_ptr; - /* Starting pointer for partial soft matches. */ - sljit_s32 hit_start; - /* Pointer of the match end position. */ - sljit_s32 match_end_ptr; - /* Points to the marked string. */ - sljit_s32 mark_ptr; - /* Recursive control verb management chain. */ - sljit_s32 control_head_ptr; - /* Points to the last matched capture block index. */ - sljit_s32 capture_last_ptr; - /* Fast forward skipping byte code pointer. */ - PCRE2_SPTR fast_forward_bc_ptr; - /* Locals used by fast fail optimization. */ - sljit_s32 early_fail_start_ptr; - sljit_s32 early_fail_end_ptr; - - /* Flipped and lower case tables. */ - const sljit_u8 *fcc; - sljit_sw lcc; - /* Mode can be PCRE2_JIT_COMPLETE and others. */ - int mode; - /* TRUE, when empty match is accepted for partial matching. */ - BOOL allow_empty_partial; - /* TRUE, when minlength is greater than 0. */ - BOOL might_be_empty; - /* \K is found in the pattern. */ - BOOL has_set_som; - /* (*SKIP:arg) is found in the pattern. */ - BOOL has_skip_arg; - /* (*THEN) is found in the pattern. */ - BOOL has_then; - /* (*SKIP) or (*SKIP:arg) is found in lookbehind assertion. */ - BOOL has_skip_in_assert_back; - /* Quit is redirected by recurse, negative assertion, or positive assertion in conditional block. */ - BOOL local_quit_available; - /* Currently in a positive assertion. */ - BOOL in_positive_assertion; - /* Newline control. */ - int nltype; - sljit_u32 nlmax; - sljit_u32 nlmin; - int newline; - int bsr_nltype; - sljit_u32 bsr_nlmax; - sljit_u32 bsr_nlmin; - /* Dollar endonly. */ - int endonly; - /* Tables. 
*/ - sljit_sw ctypes; - /* Named capturing brackets. */ - PCRE2_SPTR name_table; - sljit_sw name_count; - sljit_sw name_entry_size; - - /* Labels and jump lists. */ - struct sljit_label *partialmatchlabel; - struct sljit_label *quit_label; - struct sljit_label *abort_label; - struct sljit_label *accept_label; - struct sljit_label *ff_newline_shortcut; - stub_list *stubs; - recurse_entry *entries; - recurse_entry *currententry; - jump_list *partialmatch; - jump_list *quit; - jump_list *positive_assertion_quit; - jump_list *abort; - jump_list *failed_match; - jump_list *accept; - jump_list *calllimit; - jump_list *stackalloc; - jump_list *revertframes; - jump_list *wordboundary; - jump_list *anynewline; - jump_list *hspace; - jump_list *vspace; - jump_list *casefulcmp; - jump_list *caselesscmp; - jump_list *reset_match; - BOOL unset_backref; - BOOL alt_circumflex; -#ifdef SUPPORT_UNICODE - BOOL utf; - BOOL invalid_utf; - BOOL ucp; - /* Points to saving area for iref. */ - sljit_s32 iref_ptr; - jump_list *getucd; - jump_list *getucdtype; -#if PCRE2_CODE_UNIT_WIDTH == 8 - jump_list *utfreadchar; - jump_list *utfreadtype8; - jump_list *utfpeakcharback; -#endif -#if PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16 - jump_list *utfreadchar_invalid; - jump_list *utfreadnewline_invalid; - jump_list *utfmoveback_invalid; - jump_list *utfpeakcharback_invalid; -#endif -#endif /* SUPPORT_UNICODE */ -} compiler_common; - -/* For byte_sequence_compare. 
*/ - -typedef struct compare_context { - int length; - int sourcereg; -#if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED - int ucharptr; - union { - sljit_s32 asint; - sljit_u16 asushort; -#if PCRE2_CODE_UNIT_WIDTH == 8 - sljit_u8 asbyte; - sljit_u8 asuchars[4]; -#elif PCRE2_CODE_UNIT_WIDTH == 16 - sljit_u16 asuchars[2]; -#elif PCRE2_CODE_UNIT_WIDTH == 32 - sljit_u32 asuchars[1]; -#endif - } c; - union { - sljit_s32 asint; - sljit_u16 asushort; -#if PCRE2_CODE_UNIT_WIDTH == 8 - sljit_u8 asbyte; - sljit_u8 asuchars[4]; -#elif PCRE2_CODE_UNIT_WIDTH == 16 - sljit_u16 asuchars[2]; -#elif PCRE2_CODE_UNIT_WIDTH == 32 - sljit_u32 asuchars[1]; -#endif - } oc; -#endif -} compare_context; - -/* Undefine sljit macros. */ -#undef CMP - -/* Used for accessing the elements of the stack. */ -#define STACK(i) ((i) * (int)sizeof(sljit_sw)) - -#ifdef SLJIT_PREF_SHIFT_REG -#if SLJIT_PREF_SHIFT_REG == SLJIT_R2 -/* Nothing. */ -#elif SLJIT_PREF_SHIFT_REG == SLJIT_R3 -#define SHIFT_REG_IS_R3 -#else -#error "Unsupported shift register" -#endif -#endif - -#define TMP1 SLJIT_R0 -#ifdef SHIFT_REG_IS_R3 -#define TMP2 SLJIT_R3 -#define TMP3 SLJIT_R2 -#else -#define TMP2 SLJIT_R2 -#define TMP3 SLJIT_R3 -#endif -#define STR_PTR SLJIT_R1 -#define STR_END SLJIT_S0 -#define STACK_TOP SLJIT_S1 -#define STACK_LIMIT SLJIT_S2 -#define COUNT_MATCH SLJIT_S3 -#define ARGUMENTS SLJIT_S4 -#define RETURN_ADDR SLJIT_R4 - -#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) -#define HAS_VIRTUAL_REGISTERS 1 -#else -#define HAS_VIRTUAL_REGISTERS 0 -#endif - -/* Local space layout. */ -/* These two locals can be used by the current opcode. */ -#define LOCALS0 (0 * sizeof(sljit_sw)) -#define LOCALS1 (1 * sizeof(sljit_sw)) -/* Two local variables for possessive quantifiers (char1 cannot use them). */ -#define POSSESSIVE0 (2 * sizeof(sljit_sw)) -#define POSSESSIVE1 (3 * sizeof(sljit_sw)) -/* Max limit of recursions. 
*/ -#define LIMIT_MATCH (4 * sizeof(sljit_sw)) -/* The output vector is stored on the stack, and contains pointers -to characters. The vector data is divided into two groups: the first -group contains the start / end character pointers, and the second is -the start pointers when the end of the capturing group has not yet reached. */ -#define OVECTOR_START (common->ovector_start) -#define OVECTOR(i) (OVECTOR_START + (i) * (sljit_sw)sizeof(sljit_sw)) -#define OVECTOR_PRIV(i) (common->cbra_ptr + (i) * (sljit_sw)sizeof(sljit_sw)) -#define PRIVATE_DATA(cc) (common->private_data_ptrs[(cc) - common->start]) - -#if PCRE2_CODE_UNIT_WIDTH == 8 -#define MOV_UCHAR SLJIT_MOV_U8 -#define IN_UCHARS(x) (x) -#elif PCRE2_CODE_UNIT_WIDTH == 16 -#define MOV_UCHAR SLJIT_MOV_U16 -#define UCHAR_SHIFT (1) -#define IN_UCHARS(x) ((x) * 2) -#elif PCRE2_CODE_UNIT_WIDTH == 32 -#define MOV_UCHAR SLJIT_MOV_U32 -#define UCHAR_SHIFT (2) -#define IN_UCHARS(x) ((x) * 4) -#else -#error Unsupported compiling mode -#endif - -/* Shortcuts. 
*/ -#define DEFINE_COMPILER \ - struct sljit_compiler *compiler = common->compiler -#define OP1(op, dst, dstw, src, srcw) \ - sljit_emit_op1(compiler, (op), (dst), (dstw), (src), (srcw)) -#define OP2(op, dst, dstw, src1, src1w, src2, src2w) \ - sljit_emit_op2(compiler, (op), (dst), (dstw), (src1), (src1w), (src2), (src2w)) -#define OP_SRC(op, src, srcw) \ - sljit_emit_op_src(compiler, (op), (src), (srcw)) -#define LABEL() \ - sljit_emit_label(compiler) -#define JUMP(type) \ - sljit_emit_jump(compiler, (type)) -#define JUMPTO(type, label) \ - sljit_set_label(sljit_emit_jump(compiler, (type)), (label)) -#define JUMPHERE(jump) \ - sljit_set_label((jump), sljit_emit_label(compiler)) -#define SET_LABEL(jump, label) \ - sljit_set_label((jump), (label)) -#define CMP(type, src1, src1w, src2, src2w) \ - sljit_emit_cmp(compiler, (type), (src1), (src1w), (src2), (src2w)) -#define CMPTO(type, src1, src1w, src2, src2w, label) \ - sljit_set_label(sljit_emit_cmp(compiler, (type), (src1), (src1w), (src2), (src2w)), (label)) -#define OP_FLAGS(op, dst, dstw, type) \ - sljit_emit_op_flags(compiler, (op), (dst), (dstw), (type)) -#define CMOV(type, dst_reg, src, srcw) \ - sljit_emit_cmov(compiler, (type), (dst_reg), (src), (srcw)) -#define GET_LOCAL_BASE(dst, dstw, offset) \ - sljit_get_local_base(compiler, (dst), (dstw), (offset)) - -#define READ_CHAR_MAX 0x7fffffff - -#define INVALID_UTF_CHAR -1 -#define UNASSIGNED_UTF_CHAR 888 - -#if defined SUPPORT_UNICODE -#if PCRE2_CODE_UNIT_WIDTH == 8 - -#define GETCHARINC_INVALID(c, ptr, end, invalid_action) \ - { \ - if (ptr[0] <= 0x7f) \ - c = *ptr++; \ - else if (ptr + 1 < end && ptr[1] >= 0x80 && ptr[1] < 0xc0) \ - { \ - c = ptr[1] - 0x80; \ - \ - if (ptr[0] >= 0xc2 && ptr[0] <= 0xdf) \ - { \ - c |= (ptr[0] - 0xc0) << 6; \ - ptr += 2; \ - } \ - else if (ptr + 2 < end && ptr[2] >= 0x80 && ptr[2] < 0xc0) \ - { \ - c = c << 6 | (ptr[2] - 0x80); \ - \ - if (ptr[0] >= 0xe0 && ptr[0] <= 0xef) \ - { \ - c |= (ptr[0] - 0xe0) << 12; \ - ptr += 3; \ 
- \ - if (c < 0x800 || (c >= 0xd800 && c < 0xe000)) \ - { \ - invalid_action; \ - } \ - } \ - else if (ptr + 3 < end && ptr[3] >= 0x80 && ptr[3] < 0xc0) \ - { \ - c = c << 6 | (ptr[3] - 0x80); \ - \ - if (ptr[0] >= 0xf0 && ptr[0] <= 0xf4) \ - { \ - c |= (ptr[0] - 0xf0) << 18; \ - ptr += 4; \ - \ - if (c >= 0x110000 || c < 0x10000) \ - { \ - invalid_action; \ - } \ - } \ - else \ - { \ - invalid_action; \ - } \ - } \ - else \ - { \ - invalid_action; \ - } \ - } \ - else \ - { \ - invalid_action; \ - } \ - } \ - else \ - { \ - invalid_action; \ - } \ - } - -#define GETCHARBACK_INVALID(c, ptr, start, invalid_action) \ - { \ - c = ptr[-1]; \ - if (c <= 0x7f) \ - ptr--; \ - else if (ptr - 1 > start && ptr[-1] >= 0x80 && ptr[-1] < 0xc0) \ - { \ - c -= 0x80; \ - \ - if (ptr[-2] >= 0xc2 && ptr[-2] <= 0xdf) \ - { \ - c |= (ptr[-2] - 0xc0) << 6; \ - ptr -= 2; \ - } \ - else if (ptr - 2 > start && ptr[-2] >= 0x80 && ptr[-2] < 0xc0) \ - { \ - c = c << 6 | (ptr[-2] - 0x80); \ - \ - if (ptr[-3] >= 0xe0 && ptr[-3] <= 0xef) \ - { \ - c |= (ptr[-3] - 0xe0) << 12; \ - ptr -= 3; \ - \ - if (c < 0x800 || (c >= 0xd800 && c < 0xe000)) \ - { \ - invalid_action; \ - } \ - } \ - else if (ptr - 3 > start && ptr[-3] >= 0x80 && ptr[-3] < 0xc0) \ - { \ - c = c << 6 | (ptr[-3] - 0x80); \ - \ - if (ptr[-4] >= 0xf0 && ptr[-4] <= 0xf4) \ - { \ - c |= (ptr[-4] - 0xf0) << 18; \ - ptr -= 4; \ - \ - if (c >= 0x110000 || c < 0x10000) \ - { \ - invalid_action; \ - } \ - } \ - else \ - { \ - invalid_action; \ - } \ - } \ - else \ - { \ - invalid_action; \ - } \ - } \ - else \ - { \ - invalid_action; \ - } \ - } \ - else \ - { \ - invalid_action; \ - } \ - } - -#elif PCRE2_CODE_UNIT_WIDTH == 16 - -#define GETCHARINC_INVALID(c, ptr, end, invalid_action) \ - { \ - if (ptr[0] < 0xd800 || ptr[0] >= 0xe000) \ - c = *ptr++; \ - else if (ptr[0] < 0xdc00 && ptr + 1 < end && ptr[1] >= 0xdc00 && ptr[1] < 0xe000) \ - { \ - c = (((ptr[0] - 0xd800) << 10) | (ptr[1] - 0xdc00)) + 0x10000; \ - ptr += 2; \ - } \ - else \ 
- { \ - invalid_action; \ - } \ - } - -#define GETCHARBACK_INVALID(c, ptr, start, invalid_action) \ - { \ - c = ptr[-1]; \ - if (c < 0xd800 || c >= 0xe000) \ - ptr--; \ - else if (c >= 0xdc00 && ptr - 1 > start && ptr[-2] >= 0xd800 && ptr[-2] < 0xdc00) \ - { \ - c = (((ptr[-2] - 0xd800) << 10) | (c - 0xdc00)) + 0x10000; \ - ptr -= 2; \ - } \ - else \ - { \ - invalid_action; \ - } \ - } - - -#elif PCRE2_CODE_UNIT_WIDTH == 32 - -#define GETCHARINC_INVALID(c, ptr, end, invalid_action) \ - { \ - if (ptr[0] < 0xd800 || (ptr[0] >= 0xe000 && ptr[0] < 0x110000)) \ - c = *ptr++; \ - else \ - { \ - invalid_action; \ - } \ - } - -#define GETCHARBACK_INVALID(c, ptr, start, invalid_action) \ - { \ - c = ptr[-1]; \ - if (ptr[-1] < 0xd800 || (ptr[-1] >= 0xe000 && ptr[-1] < 0x110000)) \ - ptr--; \ - else \ - { \ - invalid_action; \ - } \ - } - -#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ -#endif /* SUPPORT_UNICODE */ - -static PCRE2_SPTR bracketend(PCRE2_SPTR cc) -{ -SLJIT_ASSERT((*cc >= OP_ASSERT && *cc <= OP_ASSERTBACK_NA) || (*cc >= OP_ONCE && *cc <= OP_SCOND)); -do cc += GET(cc, 1); while (*cc == OP_ALT); -SLJIT_ASSERT(*cc >= OP_KET && *cc <= OP_KETRPOS); -cc += 1 + LINK_SIZE; -return cc; -} - -static int no_alternatives(PCRE2_SPTR cc) -{ -int count = 0; -SLJIT_ASSERT((*cc >= OP_ASSERT && *cc <= OP_ASSERTBACK_NA) || (*cc >= OP_ONCE && *cc <= OP_SCOND)); -do - { - cc += GET(cc, 1); - count++; - } -while (*cc == OP_ALT); -SLJIT_ASSERT(*cc >= OP_KET && *cc <= OP_KETRPOS); -return count; -} - -/* Functions whose might need modification for all new supported opcodes: - next_opcode - check_opcode_types - set_private_data_ptrs - get_framesize - init_frame - get_recurse_data_length - copy_recurse_data - compile_matchingpath - compile_backtrackingpath -*/ - -static PCRE2_SPTR next_opcode(compiler_common *common, PCRE2_SPTR cc) -{ -SLJIT_UNUSED_ARG(common); -switch(*cc) - { - case OP_SOD: - case OP_SOM: - case OP_SET_SOM: - case OP_NOT_WORD_BOUNDARY: - case OP_WORD_BOUNDARY: - case 
OP_NOT_DIGIT: - case OP_DIGIT: - case OP_NOT_WHITESPACE: - case OP_WHITESPACE: - case OP_NOT_WORDCHAR: - case OP_WORDCHAR: - case OP_ANY: - case OP_ALLANY: - case OP_NOTPROP: - case OP_PROP: - case OP_ANYNL: - case OP_NOT_HSPACE: - case OP_HSPACE: - case OP_NOT_VSPACE: - case OP_VSPACE: - case OP_EXTUNI: - case OP_EODN: - case OP_EOD: - case OP_CIRC: - case OP_CIRCM: - case OP_DOLL: - case OP_DOLLM: - case OP_CRSTAR: - case OP_CRMINSTAR: - case OP_CRPLUS: - case OP_CRMINPLUS: - case OP_CRQUERY: - case OP_CRMINQUERY: - case OP_CRRANGE: - case OP_CRMINRANGE: - case OP_CRPOSSTAR: - case OP_CRPOSPLUS: - case OP_CRPOSQUERY: - case OP_CRPOSRANGE: - case OP_CLASS: - case OP_NCLASS: - case OP_REF: - case OP_REFI: - case OP_DNREF: - case OP_DNREFI: - case OP_RECURSE: - case OP_CALLOUT: - case OP_ALT: - case OP_KET: - case OP_KETRMAX: - case OP_KETRMIN: - case OP_KETRPOS: - case OP_REVERSE: - case OP_ASSERT: - case OP_ASSERT_NOT: - case OP_ASSERTBACK: - case OP_ASSERTBACK_NOT: - case OP_ASSERT_NA: - case OP_ASSERTBACK_NA: - case OP_ONCE: - case OP_SCRIPT_RUN: - case OP_BRA: - case OP_BRAPOS: - case OP_CBRA: - case OP_CBRAPOS: - case OP_COND: - case OP_SBRA: - case OP_SBRAPOS: - case OP_SCBRA: - case OP_SCBRAPOS: - case OP_SCOND: - case OP_CREF: - case OP_DNCREF: - case OP_RREF: - case OP_DNRREF: - case OP_FALSE: - case OP_TRUE: - case OP_BRAZERO: - case OP_BRAMINZERO: - case OP_BRAPOSZERO: - case OP_PRUNE: - case OP_SKIP: - case OP_THEN: - case OP_COMMIT: - case OP_FAIL: - case OP_ACCEPT: - case OP_ASSERT_ACCEPT: - case OP_CLOSE: - case OP_SKIPZERO: - return cc + PRIV(OP_lengths)[*cc]; - - case OP_CHAR: - case OP_CHARI: - case OP_NOT: - case OP_NOTI: - case OP_STAR: - case OP_MINSTAR: - case OP_PLUS: - case OP_MINPLUS: - case OP_QUERY: - case OP_MINQUERY: - case OP_UPTO: - case OP_MINUPTO: - case OP_EXACT: - case OP_POSSTAR: - case OP_POSPLUS: - case OP_POSQUERY: - case OP_POSUPTO: - case OP_STARI: - case OP_MINSTARI: - case OP_PLUSI: - case OP_MINPLUSI: - case OP_QUERYI: - 
case OP_MINQUERYI: - case OP_UPTOI: - case OP_MINUPTOI: - case OP_EXACTI: - case OP_POSSTARI: - case OP_POSPLUSI: - case OP_POSQUERYI: - case OP_POSUPTOI: - case OP_NOTSTAR: - case OP_NOTMINSTAR: - case OP_NOTPLUS: - case OP_NOTMINPLUS: - case OP_NOTQUERY: - case OP_NOTMINQUERY: - case OP_NOTUPTO: - case OP_NOTMINUPTO: - case OP_NOTEXACT: - case OP_NOTPOSSTAR: - case OP_NOTPOSPLUS: - case OP_NOTPOSQUERY: - case OP_NOTPOSUPTO: - case OP_NOTSTARI: - case OP_NOTMINSTARI: - case OP_NOTPLUSI: - case OP_NOTMINPLUSI: - case OP_NOTQUERYI: - case OP_NOTMINQUERYI: - case OP_NOTUPTOI: - case OP_NOTMINUPTOI: - case OP_NOTEXACTI: - case OP_NOTPOSSTARI: - case OP_NOTPOSPLUSI: - case OP_NOTPOSQUERYI: - case OP_NOTPOSUPTOI: - cc += PRIV(OP_lengths)[*cc]; -#ifdef SUPPORT_UNICODE - if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); -#endif - return cc; - - /* Special cases. */ - case OP_TYPESTAR: - case OP_TYPEMINSTAR: - case OP_TYPEPLUS: - case OP_TYPEMINPLUS: - case OP_TYPEQUERY: - case OP_TYPEMINQUERY: - case OP_TYPEUPTO: - case OP_TYPEMINUPTO: - case OP_TYPEEXACT: - case OP_TYPEPOSSTAR: - case OP_TYPEPOSPLUS: - case OP_TYPEPOSQUERY: - case OP_TYPEPOSUPTO: - return cc + PRIV(OP_lengths)[*cc] - 1; - - case OP_ANYBYTE: -#ifdef SUPPORT_UNICODE - if (common->utf) return NULL; -#endif - return cc + 1; - - case OP_CALLOUT_STR: - return cc + GET(cc, 1 + 2*LINK_SIZE); - -#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8 - case OP_XCLASS: - return cc + GET(cc, 1); -#endif - - case OP_MARK: - case OP_COMMIT_ARG: - case OP_PRUNE_ARG: - case OP_SKIP_ARG: - case OP_THEN_ARG: - return cc + 1 + 2 + cc[1]; - - default: - SLJIT_UNREACHABLE(); - return NULL; - } -} - -static BOOL check_opcode_types(compiler_common *common, PCRE2_SPTR cc, PCRE2_SPTR ccend) -{ -int count; -PCRE2_SPTR slot; -PCRE2_SPTR assert_back_end = cc - 1; -PCRE2_SPTR assert_na_end = cc - 1; - -/* Calculate important variables (like stack size) and checks whether all opcodes are supported. 
*/ -while (cc < ccend) - { - switch(*cc) - { - case OP_SET_SOM: - common->has_set_som = TRUE; - common->might_be_empty = TRUE; - cc += 1; - break; - - case OP_REFI: -#ifdef SUPPORT_UNICODE - if (common->iref_ptr == 0) - { - common->iref_ptr = common->ovector_start; - common->ovector_start += 3 * sizeof(sljit_sw); - } -#endif /* SUPPORT_UNICODE */ - /* Fall through. */ - case OP_REF: - common->optimized_cbracket[GET2(cc, 1)] = 0; - cc += 1 + IMM2_SIZE; - break; - - case OP_ASSERT_NA: - case OP_ASSERTBACK_NA: - slot = bracketend(cc); - if (slot > assert_na_end) - assert_na_end = slot; - cc += 1 + LINK_SIZE; - break; - - case OP_CBRAPOS: - case OP_SCBRAPOS: - common->optimized_cbracket[GET2(cc, 1 + LINK_SIZE)] = 0; - cc += 1 + LINK_SIZE + IMM2_SIZE; - break; - - case OP_COND: - case OP_SCOND: - /* Only AUTO_CALLOUT can insert this opcode. We do - not intend to support this case. */ - if (cc[1 + LINK_SIZE] == OP_CALLOUT || cc[1 + LINK_SIZE] == OP_CALLOUT_STR) - return FALSE; - cc += 1 + LINK_SIZE; - break; - - case OP_CREF: - common->optimized_cbracket[GET2(cc, 1)] = 0; - cc += 1 + IMM2_SIZE; - break; - - case OP_DNREF: - case OP_DNREFI: - case OP_DNCREF: - count = GET2(cc, 1 + IMM2_SIZE); - slot = common->name_table + GET2(cc, 1) * common->name_entry_size; - while (count-- > 0) - { - common->optimized_cbracket[GET2(slot, 0)] = 0; - slot += common->name_entry_size; - } - cc += 1 + 2 * IMM2_SIZE; - break; - - case OP_RECURSE: - /* Set its value only once. */ - if (common->recursive_head_ptr == 0) - { - common->recursive_head_ptr = common->ovector_start; - common->ovector_start += sizeof(sljit_sw); - } - cc += 1 + LINK_SIZE; - break; - - case OP_CALLOUT: - case OP_CALLOUT_STR: - if (common->capture_last_ptr == 0) - { - common->capture_last_ptr = common->ovector_start; - common->ovector_start += sizeof(sljit_sw); - } - cc += (*cc == OP_CALLOUT) ? 
PRIV(OP_lengths)[OP_CALLOUT] : GET(cc, 1 + 2*LINK_SIZE); - break; - - case OP_ASSERTBACK: - slot = bracketend(cc); - if (slot > assert_back_end) - assert_back_end = slot; - cc += 1 + LINK_SIZE; - break; - - case OP_THEN_ARG: - common->has_then = TRUE; - common->control_head_ptr = 1; - /* Fall through. */ - - case OP_COMMIT_ARG: - case OP_PRUNE_ARG: - if (cc < assert_na_end) - return FALSE; - /* Fall through */ - case OP_MARK: - if (common->mark_ptr == 0) - { - common->mark_ptr = common->ovector_start; - common->ovector_start += sizeof(sljit_sw); - } - cc += 1 + 2 + cc[1]; - break; - - case OP_THEN: - common->has_then = TRUE; - common->control_head_ptr = 1; - cc += 1; - break; - - case OP_SKIP: - if (cc < assert_back_end) - common->has_skip_in_assert_back = TRUE; - if (cc < assert_na_end) - return FALSE; - cc += 1; - break; - - case OP_SKIP_ARG: - common->control_head_ptr = 1; - common->has_skip_arg = TRUE; - if (cc < assert_back_end) - common->has_skip_in_assert_back = TRUE; - if (cc < assert_na_end) - return FALSE; - cc += 1 + 2 + cc[1]; - break; - - case OP_PRUNE: - case OP_COMMIT: - case OP_ASSERT_ACCEPT: - if (cc < assert_na_end) - return FALSE; - cc++; - break; - - default: - cc = next_opcode(common, cc); - if (cc == NULL) - return FALSE; - break; - } - } -return TRUE; -} - -#define EARLY_FAIL_ENHANCE_MAX (1 + 1) - -/* -start: - 0 - skip / early fail allowed - 1 - only early fail with range allowed - >1 - (start - 1) early fail is processed - -return: current number of iterators enhanced with fast fail -*/ -static int detect_early_fail(compiler_common *common, PCRE2_SPTR cc, int *private_data_start, sljit_s32 depth, int start) -{ -PCRE2_SPTR next_alt; -PCRE2_SPTR end; -PCRE2_SPTR accelerated_start; -int result = 0; -int count; -BOOL fast_forward_allowed = TRUE; - -SLJIT_ASSERT(*cc == OP_ONCE || *cc == OP_BRA || *cc == OP_CBRA); -SLJIT_ASSERT(*cc != OP_CBRA || common->optimized_cbracket[GET2(cc, 1 + LINK_SIZE)] != 0); -SLJIT_ASSERT(start < 
EARLY_FAIL_ENHANCE_MAX); - -do - { - count = start; - next_alt = cc + GET(cc, 1); - cc += 1 + LINK_SIZE + ((*cc == OP_CBRA) ? IMM2_SIZE : 0); - - while (TRUE) - { - accelerated_start = NULL; - - switch(*cc) - { - case OP_SOD: - case OP_SOM: - case OP_SET_SOM: - case OP_NOT_WORD_BOUNDARY: - case OP_WORD_BOUNDARY: - case OP_EODN: - case OP_EOD: - case OP_CIRC: - case OP_CIRCM: - case OP_DOLL: - case OP_DOLLM: - /* Zero width assertions. */ - cc++; - continue; - - case OP_NOT_DIGIT: - case OP_DIGIT: - case OP_NOT_WHITESPACE: - case OP_WHITESPACE: - case OP_NOT_WORDCHAR: - case OP_WORDCHAR: - case OP_ANY: - case OP_ALLANY: - case OP_ANYBYTE: - case OP_NOT_HSPACE: - case OP_HSPACE: - case OP_NOT_VSPACE: - case OP_VSPACE: - fast_forward_allowed = FALSE; - cc++; - continue; - - case OP_ANYNL: - case OP_EXTUNI: - fast_forward_allowed = FALSE; - if (count == 0) - count = 1; - cc++; - continue; - - case OP_NOTPROP: - case OP_PROP: - fast_forward_allowed = FALSE; - cc += 1 + 2; - continue; - - case OP_CHAR: - case OP_CHARI: - case OP_NOT: - case OP_NOTI: - fast_forward_allowed = FALSE; - cc += 2; -#ifdef SUPPORT_UNICODE - if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); -#endif - continue; - - case OP_TYPESTAR: - case OP_TYPEMINSTAR: - case OP_TYPEPLUS: - case OP_TYPEMINPLUS: - case OP_TYPEPOSSTAR: - case OP_TYPEPOSPLUS: - /* The type or prop opcode is skipped in the next iteration. */ - cc += 1; - - if (cc[0] != OP_ANYNL && cc[0] != OP_EXTUNI) - { - accelerated_start = cc - 1; - break; - } - - if (count == 0) - count = 1; - fast_forward_allowed = FALSE; - continue; - - case OP_TYPEUPTO: - case OP_TYPEMINUPTO: - case OP_TYPEEXACT: - case OP_TYPEPOSUPTO: - cc += IMM2_SIZE; - /* Fall through */ - - case OP_TYPEQUERY: - case OP_TYPEMINQUERY: - case OP_TYPEPOSQUERY: - /* The type or prop opcode is skipped in the next iteration. 
*/ - fast_forward_allowed = FALSE; - if (count == 0) - count = 1; - cc += 1; - continue; - - case OP_STAR: - case OP_MINSTAR: - case OP_PLUS: - case OP_MINPLUS: - case OP_POSSTAR: - case OP_POSPLUS: - - case OP_STARI: - case OP_MINSTARI: - case OP_PLUSI: - case OP_MINPLUSI: - case OP_POSSTARI: - case OP_POSPLUSI: - - case OP_NOTSTAR: - case OP_NOTMINSTAR: - case OP_NOTPLUS: - case OP_NOTMINPLUS: - case OP_NOTPOSSTAR: - case OP_NOTPOSPLUS: - - case OP_NOTSTARI: - case OP_NOTMINSTARI: - case OP_NOTPLUSI: - case OP_NOTMINPLUSI: - case OP_NOTPOSSTARI: - case OP_NOTPOSPLUSI: - accelerated_start = cc; - cc += 2; -#ifdef SUPPORT_UNICODE - if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); -#endif - break; - - case OP_UPTO: - case OP_MINUPTO: - case OP_EXACT: - case OP_POSUPTO: - case OP_UPTOI: - case OP_MINUPTOI: - case OP_EXACTI: - case OP_POSUPTOI: - case OP_NOTUPTO: - case OP_NOTMINUPTO: - case OP_NOTEXACT: - case OP_NOTPOSUPTO: - case OP_NOTUPTOI: - case OP_NOTMINUPTOI: - case OP_NOTEXACTI: - case OP_NOTPOSUPTOI: - cc += IMM2_SIZE; - /* Fall through */ - - case OP_QUERY: - case OP_MINQUERY: - case OP_POSQUERY: - case OP_QUERYI: - case OP_MINQUERYI: - case OP_POSQUERYI: - case OP_NOTQUERY: - case OP_NOTMINQUERY: - case OP_NOTPOSQUERY: - case OP_NOTQUERYI: - case OP_NOTMINQUERYI: - case OP_NOTPOSQUERYI: - fast_forward_allowed = FALSE; - if (count == 0) - count = 1; - cc += 2; -#ifdef SUPPORT_UNICODE - if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); -#endif - continue; - - case OP_CLASS: - case OP_NCLASS: -#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8 - case OP_XCLASS: - accelerated_start = cc; - cc += ((*cc == OP_XCLASS) ? 
GET(cc, 1) : (unsigned int)(1 + (32 / sizeof(PCRE2_UCHAR)))); -#else - accelerated_start = cc; - cc += (1 + (32 / sizeof(PCRE2_UCHAR))); -#endif - - switch (*cc) - { - case OP_CRSTAR: - case OP_CRMINSTAR: - case OP_CRPLUS: - case OP_CRMINPLUS: - case OP_CRPOSSTAR: - case OP_CRPOSPLUS: - cc++; - break; - - case OP_CRRANGE: - case OP_CRMINRANGE: - case OP_CRPOSRANGE: - cc += 2 * IMM2_SIZE; - /* Fall through */ - case OP_CRQUERY: - case OP_CRMINQUERY: - case OP_CRPOSQUERY: - cc++; - if (count == 0) - count = 1; - /* Fall through */ - default: - accelerated_start = NULL; - fast_forward_allowed = FALSE; - continue; - } - break; - - case OP_ONCE: - case OP_BRA: - case OP_CBRA: - end = cc + GET(cc, 1); - - if (*end == OP_KET && PRIVATE_DATA(end) == 0) - { - if (*cc == OP_CBRA) - { - if (common->optimized_cbracket[GET2(cc, 1 + LINK_SIZE)] == 0) - break; - cc += IMM2_SIZE; - } - - cc += 1 + LINK_SIZE; - continue; - } - - fast_forward_allowed = FALSE; - if (depth >= 4) - break; - - end = bracketend(cc) - (1 + LINK_SIZE); - if (*end != OP_KET || PRIVATE_DATA(end) != 0) - break; - - if (*cc == OP_CBRA && common->optimized_cbracket[GET2(cc, 1 + LINK_SIZE)] == 0) - break; - - count = detect_early_fail(common, cc, private_data_start, depth + 1, count); - if (count < EARLY_FAIL_ENHANCE_MAX) - { - cc = end + (1 + LINK_SIZE); - continue; - } - break; - - case OP_KET: - SLJIT_ASSERT(PRIVATE_DATA(cc) == 0); - if (cc >= next_alt) - break; - cc += 1 + LINK_SIZE; - continue; - } - - if (accelerated_start != NULL) - { - if (count == 0) - { - count++; - - if (fast_forward_allowed && *next_alt == OP_KET) - { - common->fast_forward_bc_ptr = accelerated_start; - common->private_data_ptrs[(accelerated_start + 1) - common->start] = ((*private_data_start) << 3) | type_skip; - *private_data_start += sizeof(sljit_sw); - } - else - { - common->private_data_ptrs[(accelerated_start + 1) - common->start] = ((*private_data_start) << 3) | type_fail; - - if (common->early_fail_start_ptr == 0) - 
common->early_fail_start_ptr = *private_data_start; - - *private_data_start += sizeof(sljit_sw); - common->early_fail_end_ptr = *private_data_start; - - if (*private_data_start > SLJIT_MAX_LOCAL_SIZE) - return EARLY_FAIL_ENHANCE_MAX; - } - } - else - { - common->private_data_ptrs[(accelerated_start + 1) - common->start] = ((*private_data_start) << 3) | type_fail_range; - - if (common->early_fail_start_ptr == 0) - common->early_fail_start_ptr = *private_data_start; - - *private_data_start += 2 * sizeof(sljit_sw); - common->early_fail_end_ptr = *private_data_start; - - if (*private_data_start > SLJIT_MAX_LOCAL_SIZE) - return EARLY_FAIL_ENHANCE_MAX; - } - - count++; - - if (count < EARLY_FAIL_ENHANCE_MAX) - continue; - } - - break; - } - - if (*cc != OP_ALT && *cc != OP_KET) - result = EARLY_FAIL_ENHANCE_MAX; - else if (result < count) - result = count; - - fast_forward_allowed = FALSE; - cc = next_alt; - } -while (*cc == OP_ALT); - -return result; -} - -static int get_class_iterator_size(PCRE2_SPTR cc) -{ -sljit_u32 min; -sljit_u32 max; -switch(*cc) - { - case OP_CRSTAR: - case OP_CRPLUS: - return 2; - - case OP_CRMINSTAR: - case OP_CRMINPLUS: - case OP_CRQUERY: - case OP_CRMINQUERY: - return 1; - - case OP_CRRANGE: - case OP_CRMINRANGE: - min = GET2(cc, 1); - max = GET2(cc, 1 + IMM2_SIZE); - if (max == 0) - return (*cc == OP_CRRANGE) ? 2 : 1; - max -= min; - if (max > 2) - max = 2; - return max; - - default: - return 0; - } -} - -static BOOL detect_repeat(compiler_common *common, PCRE2_SPTR begin) -{ -PCRE2_SPTR end = bracketend(begin); -PCRE2_SPTR next; -PCRE2_SPTR next_end; -PCRE2_SPTR max_end; -PCRE2_UCHAR type; -sljit_sw length = end - begin; -sljit_s32 min, max, i; - -/* Detect fixed iterations first. */ -if (end[-(1 + LINK_SIZE)] != OP_KET) - return FALSE; - -/* Already detected repeat. 
*/ -if (common->private_data_ptrs[end - common->start - LINK_SIZE] != 0) - return TRUE; - -next = end; -min = 1; -while (1) - { - if (*next != *begin) - break; - next_end = bracketend(next); - if (next_end - next != length || memcmp(begin, next, IN_UCHARS(length)) != 0) - break; - next = next_end; - min++; - } - -if (min == 2) - return FALSE; - -max = 0; -max_end = next; -if (*next == OP_BRAZERO || *next == OP_BRAMINZERO) - { - type = *next; - while (1) - { - if (next[0] != type || next[1] != OP_BRA || next[2 + LINK_SIZE] != *begin) - break; - next_end = bracketend(next + 2 + LINK_SIZE); - if (next_end - next != (length + 2 + LINK_SIZE) || memcmp(begin, next + 2 + LINK_SIZE, IN_UCHARS(length)) != 0) - break; - next = next_end; - max++; - } - - if (next[0] == type && next[1] == *begin && max >= 1) - { - next_end = bracketend(next + 1); - if (next_end - next == (length + 1) && memcmp(begin, next + 1, IN_UCHARS(length)) == 0) - { - for (i = 0; i < max; i++, next_end += 1 + LINK_SIZE) - if (*next_end != OP_KET) - break; - - if (i == max) - { - common->private_data_ptrs[max_end - common->start - LINK_SIZE] = next_end - max_end; - common->private_data_ptrs[max_end - common->start - LINK_SIZE + 1] = (type == OP_BRAZERO) ? OP_UPTO : OP_MINUPTO; - /* +2 the original and the last. 
*/ - common->private_data_ptrs[max_end - common->start - LINK_SIZE + 2] = max + 2; - if (min == 1) - return TRUE; - min--; - max_end -= (1 + LINK_SIZE) + GET(max_end, -LINK_SIZE); - } - } - } - } - -if (min >= 3) - { - common->private_data_ptrs[end - common->start - LINK_SIZE] = max_end - end; - common->private_data_ptrs[end - common->start - LINK_SIZE + 1] = OP_EXACT; - common->private_data_ptrs[end - common->start - LINK_SIZE + 2] = min; - return TRUE; - } - -return FALSE; -} - -#define CASE_ITERATOR_PRIVATE_DATA_1 \ - case OP_MINSTAR: \ - case OP_MINPLUS: \ - case OP_QUERY: \ - case OP_MINQUERY: \ - case OP_MINSTARI: \ - case OP_MINPLUSI: \ - case OP_QUERYI: \ - case OP_MINQUERYI: \ - case OP_NOTMINSTAR: \ - case OP_NOTMINPLUS: \ - case OP_NOTQUERY: \ - case OP_NOTMINQUERY: \ - case OP_NOTMINSTARI: \ - case OP_NOTMINPLUSI: \ - case OP_NOTQUERYI: \ - case OP_NOTMINQUERYI: - -#define CASE_ITERATOR_PRIVATE_DATA_2A \ - case OP_STAR: \ - case OP_PLUS: \ - case OP_STARI: \ - case OP_PLUSI: \ - case OP_NOTSTAR: \ - case OP_NOTPLUS: \ - case OP_NOTSTARI: \ - case OP_NOTPLUSI: - -#define CASE_ITERATOR_PRIVATE_DATA_2B \ - case OP_UPTO: \ - case OP_MINUPTO: \ - case OP_UPTOI: \ - case OP_MINUPTOI: \ - case OP_NOTUPTO: \ - case OP_NOTMINUPTO: \ - case OP_NOTUPTOI: \ - case OP_NOTMINUPTOI: - -#define CASE_ITERATOR_TYPE_PRIVATE_DATA_1 \ - case OP_TYPEMINSTAR: \ - case OP_TYPEMINPLUS: \ - case OP_TYPEQUERY: \ - case OP_TYPEMINQUERY: - -#define CASE_ITERATOR_TYPE_PRIVATE_DATA_2A \ - case OP_TYPESTAR: \ - case OP_TYPEPLUS: - -#define CASE_ITERATOR_TYPE_PRIVATE_DATA_2B \ - case OP_TYPEUPTO: \ - case OP_TYPEMINUPTO: - -static void set_private_data_ptrs(compiler_common *common, int *private_data_start, PCRE2_SPTR ccend) -{ -PCRE2_SPTR cc = common->start; -PCRE2_SPTR alternative; -PCRE2_SPTR end = NULL; -int private_data_ptr = *private_data_start; -int space, size, bracketlen; -BOOL repeat_check = TRUE; - -while (cc < ccend) - { - space = 0; - size = 0; - bracketlen = 0; - if 
(private_data_ptr > SLJIT_MAX_LOCAL_SIZE) - break; - - if (repeat_check && (*cc == OP_ONCE || *cc == OP_BRA || *cc == OP_CBRA || *cc == OP_COND)) - { - if (detect_repeat(common, cc)) - { - /* These brackets are converted to repeats, so no global - based single character repeat is allowed. */ - if (cc >= end) - end = bracketend(cc); - } - } - repeat_check = TRUE; - - switch(*cc) - { - case OP_KET: - if (common->private_data_ptrs[cc + 1 - common->start] != 0) - { - common->private_data_ptrs[cc - common->start] = private_data_ptr; - private_data_ptr += sizeof(sljit_sw); - cc += common->private_data_ptrs[cc + 1 - common->start]; - } - cc += 1 + LINK_SIZE; - break; - - case OP_ASSERT: - case OP_ASSERT_NOT: - case OP_ASSERTBACK: - case OP_ASSERTBACK_NOT: - case OP_ASSERT_NA: - case OP_ASSERTBACK_NA: - case OP_ONCE: - case OP_SCRIPT_RUN: - case OP_BRAPOS: - case OP_SBRA: - case OP_SBRAPOS: - case OP_SCOND: - common->private_data_ptrs[cc - common->start] = private_data_ptr; - private_data_ptr += sizeof(sljit_sw); - bracketlen = 1 + LINK_SIZE; - break; - - case OP_CBRAPOS: - case OP_SCBRAPOS: - common->private_data_ptrs[cc - common->start] = private_data_ptr; - private_data_ptr += sizeof(sljit_sw); - bracketlen = 1 + LINK_SIZE + IMM2_SIZE; - break; - - case OP_COND: - /* Might be a hidden SCOND. 
*/ - alternative = cc + GET(cc, 1); - if (*alternative == OP_KETRMAX || *alternative == OP_KETRMIN) - { - common->private_data_ptrs[cc - common->start] = private_data_ptr; - private_data_ptr += sizeof(sljit_sw); - } - bracketlen = 1 + LINK_SIZE; - break; - - case OP_BRA: - bracketlen = 1 + LINK_SIZE; - break; - - case OP_CBRA: - case OP_SCBRA: - bracketlen = 1 + LINK_SIZE + IMM2_SIZE; - break; - - case OP_BRAZERO: - case OP_BRAMINZERO: - case OP_BRAPOSZERO: - size = 1; - repeat_check = FALSE; - break; - - CASE_ITERATOR_PRIVATE_DATA_1 - size = -2; - space = 1; - break; - - CASE_ITERATOR_PRIVATE_DATA_2A - size = -2; - space = 2; - break; - - CASE_ITERATOR_PRIVATE_DATA_2B - size = -(2 + IMM2_SIZE); - space = 2; - break; - - CASE_ITERATOR_TYPE_PRIVATE_DATA_1 - size = 1; - space = 1; - break; - - CASE_ITERATOR_TYPE_PRIVATE_DATA_2A - size = 1; - if (cc[1] != OP_ANYNL && cc[1] != OP_EXTUNI) - space = 2; - break; - - case OP_TYPEUPTO: - size = 1 + IMM2_SIZE; - if (cc[1 + IMM2_SIZE] != OP_ANYNL && cc[1 + IMM2_SIZE] != OP_EXTUNI) - space = 2; - break; - - case OP_TYPEMINUPTO: - size = 1 + IMM2_SIZE; - space = 2; - break; - - case OP_CLASS: - case OP_NCLASS: - size = 1 + 32 / sizeof(PCRE2_UCHAR); - space = get_class_iterator_size(cc + size); - break; - -#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8 - case OP_XCLASS: - size = GET(cc, 1); - space = get_class_iterator_size(cc + size); - break; -#endif - - default: - cc = next_opcode(common, cc); - SLJIT_ASSERT(cc != NULL); - break; - } - - /* Character iterators, which are not inside a repeated bracket, - gets a private slot instead of allocating it on the stack. 
*/ - if (space > 0 && cc >= end) - { - common->private_data_ptrs[cc - common->start] = private_data_ptr; - private_data_ptr += sizeof(sljit_sw) * space; - } - - if (size != 0) - { - if (size < 0) - { - cc += -size; -#ifdef SUPPORT_UNICODE - if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); -#endif - } - else - cc += size; - } - - if (bracketlen > 0) - { - if (cc >= end) - { - end = bracketend(cc); - if (end[-1 - LINK_SIZE] == OP_KET) - end = NULL; - } - cc += bracketlen; - } - } -*private_data_start = private_data_ptr; -} - -/* Returns with a frame_types (always < 0) if no need for frame. */ -static int get_framesize(compiler_common *common, PCRE2_SPTR cc, PCRE2_SPTR ccend, BOOL recursive, BOOL *needs_control_head) -{ -int length = 0; -int possessive = 0; -BOOL stack_restore = FALSE; -BOOL setsom_found = recursive; -BOOL setmark_found = recursive; -/* The last capture is a local variable even for recursions. */ -BOOL capture_last_found = FALSE; - -#if defined DEBUG_FORCE_CONTROL_HEAD && DEBUG_FORCE_CONTROL_HEAD -SLJIT_ASSERT(common->control_head_ptr != 0); -*needs_control_head = TRUE; -#else -*needs_control_head = FALSE; -#endif - -if (ccend == NULL) - { - ccend = bracketend(cc) - (1 + LINK_SIZE); - if (!recursive && (*cc == OP_CBRAPOS || *cc == OP_SCBRAPOS)) - { - possessive = length = (common->capture_last_ptr != 0) ? 5 : 3; - /* This is correct regardless of common->capture_last_ptr. 
*/ - capture_last_found = TRUE; - } - cc = next_opcode(common, cc); - } - -SLJIT_ASSERT(cc != NULL); -while (cc < ccend) - switch(*cc) - { - case OP_SET_SOM: - SLJIT_ASSERT(common->has_set_som); - stack_restore = TRUE; - if (!setsom_found) - { - length += 2; - setsom_found = TRUE; - } - cc += 1; - break; - - case OP_MARK: - case OP_COMMIT_ARG: - case OP_PRUNE_ARG: - case OP_THEN_ARG: - SLJIT_ASSERT(common->mark_ptr != 0); - stack_restore = TRUE; - if (!setmark_found) - { - length += 2; - setmark_found = TRUE; - } - if (common->control_head_ptr != 0) - *needs_control_head = TRUE; - cc += 1 + 2 + cc[1]; - break; - - case OP_RECURSE: - stack_restore = TRUE; - if (common->has_set_som && !setsom_found) - { - length += 2; - setsom_found = TRUE; - } - if (common->mark_ptr != 0 && !setmark_found) - { - length += 2; - setmark_found = TRUE; - } - if (common->capture_last_ptr != 0 && !capture_last_found) - { - length += 2; - capture_last_found = TRUE; - } - cc += 1 + LINK_SIZE; - break; - - case OP_CBRA: - case OP_CBRAPOS: - case OP_SCBRA: - case OP_SCBRAPOS: - stack_restore = TRUE; - if (common->capture_last_ptr != 0 && !capture_last_found) - { - length += 2; - capture_last_found = TRUE; - } - length += 3; - cc += 1 + LINK_SIZE + IMM2_SIZE; - break; - - case OP_THEN: - stack_restore = TRUE; - if (common->control_head_ptr != 0) - *needs_control_head = TRUE; - cc ++; - break; - - default: - stack_restore = TRUE; - /* Fall through. 
*/ - - case OP_NOT_WORD_BOUNDARY: - case OP_WORD_BOUNDARY: - case OP_NOT_DIGIT: - case OP_DIGIT: - case OP_NOT_WHITESPACE: - case OP_WHITESPACE: - case OP_NOT_WORDCHAR: - case OP_WORDCHAR: - case OP_ANY: - case OP_ALLANY: - case OP_ANYBYTE: - case OP_NOTPROP: - case OP_PROP: - case OP_ANYNL: - case OP_NOT_HSPACE: - case OP_HSPACE: - case OP_NOT_VSPACE: - case OP_VSPACE: - case OP_EXTUNI: - case OP_EODN: - case OP_EOD: - case OP_CIRC: - case OP_CIRCM: - case OP_DOLL: - case OP_DOLLM: - case OP_CHAR: - case OP_CHARI: - case OP_NOT: - case OP_NOTI: - - case OP_EXACT: - case OP_POSSTAR: - case OP_POSPLUS: - case OP_POSQUERY: - case OP_POSUPTO: - - case OP_EXACTI: - case OP_POSSTARI: - case OP_POSPLUSI: - case OP_POSQUERYI: - case OP_POSUPTOI: - - case OP_NOTEXACT: - case OP_NOTPOSSTAR: - case OP_NOTPOSPLUS: - case OP_NOTPOSQUERY: - case OP_NOTPOSUPTO: - - case OP_NOTEXACTI: - case OP_NOTPOSSTARI: - case OP_NOTPOSPLUSI: - case OP_NOTPOSQUERYI: - case OP_NOTPOSUPTOI: - - case OP_TYPEEXACT: - case OP_TYPEPOSSTAR: - case OP_TYPEPOSPLUS: - case OP_TYPEPOSQUERY: - case OP_TYPEPOSUPTO: - - case OP_CLASS: - case OP_NCLASS: - case OP_XCLASS: - - case OP_CALLOUT: - case OP_CALLOUT_STR: - - cc = next_opcode(common, cc); - SLJIT_ASSERT(cc != NULL); - break; - } - -/* Possessive quantifiers can use a special case. */ -if (SLJIT_UNLIKELY(possessive == length)) - return stack_restore ? no_frame : no_stack; - -if (length > 0) - return length + 1; -return stack_restore ? no_frame : no_stack; -} - -static void init_frame(compiler_common *common, PCRE2_SPTR cc, PCRE2_SPTR ccend, int stackpos, int stacktop) -{ -DEFINE_COMPILER; -BOOL setsom_found = FALSE; -BOOL setmark_found = FALSE; -/* The last capture is a local variable even for recursions. 
*/ -BOOL capture_last_found = FALSE; -int offset; - -/* >= 1 + shortest item size (2) */ -SLJIT_UNUSED_ARG(stacktop); -SLJIT_ASSERT(stackpos >= stacktop + 2); - -stackpos = STACK(stackpos); -if (ccend == NULL) - { - ccend = bracketend(cc) - (1 + LINK_SIZE); - if (*cc != OP_CBRAPOS && *cc != OP_SCBRAPOS) - cc = next_opcode(common, cc); - } - -SLJIT_ASSERT(cc != NULL); -while (cc < ccend) - switch(*cc) - { - case OP_SET_SOM: - SLJIT_ASSERT(common->has_set_som); - if (!setsom_found) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(0)); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, SLJIT_IMM, -OVECTOR(0)); - stackpos -= (int)sizeof(sljit_sw); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, TMP1, 0); - stackpos -= (int)sizeof(sljit_sw); - setsom_found = TRUE; - } - cc += 1; - break; - - case OP_MARK: - case OP_COMMIT_ARG: - case OP_PRUNE_ARG: - case OP_THEN_ARG: - SLJIT_ASSERT(common->mark_ptr != 0); - if (!setmark_found) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->mark_ptr); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, SLJIT_IMM, -common->mark_ptr); - stackpos -= (int)sizeof(sljit_sw); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, TMP1, 0); - stackpos -= (int)sizeof(sljit_sw); - setmark_found = TRUE; - } - cc += 1 + 2 + cc[1]; - break; - - case OP_RECURSE: - if (common->has_set_som && !setsom_found) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(0)); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, SLJIT_IMM, -OVECTOR(0)); - stackpos -= (int)sizeof(sljit_sw); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, TMP1, 0); - stackpos -= (int)sizeof(sljit_sw); - setsom_found = TRUE; - } - if (common->mark_ptr != 0 && !setmark_found) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->mark_ptr); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, SLJIT_IMM, -common->mark_ptr); - stackpos -= (int)sizeof(sljit_sw); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, TMP1, 0); - stackpos -= (int)sizeof(sljit_sw); - 
setmark_found = TRUE; - } - if (common->capture_last_ptr != 0 && !capture_last_found) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->capture_last_ptr); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, SLJIT_IMM, -common->capture_last_ptr); - stackpos -= (int)sizeof(sljit_sw); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, TMP1, 0); - stackpos -= (int)sizeof(sljit_sw); - capture_last_found = TRUE; - } - cc += 1 + LINK_SIZE; - break; - - case OP_CBRA: - case OP_CBRAPOS: - case OP_SCBRA: - case OP_SCBRAPOS: - if (common->capture_last_ptr != 0 && !capture_last_found) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->capture_last_ptr); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, SLJIT_IMM, -common->capture_last_ptr); - stackpos -= (int)sizeof(sljit_sw); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, TMP1, 0); - stackpos -= (int)sizeof(sljit_sw); - capture_last_found = TRUE; - } - offset = (GET2(cc, 1 + LINK_SIZE)) << 1; - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, SLJIT_IMM, OVECTOR(offset)); - stackpos -= (int)sizeof(sljit_sw); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset)); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset + 1)); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, TMP1, 0); - stackpos -= (int)sizeof(sljit_sw); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, TMP2, 0); - stackpos -= (int)sizeof(sljit_sw); - - cc += 1 + LINK_SIZE + IMM2_SIZE; - break; - - default: - cc = next_opcode(common, cc); - SLJIT_ASSERT(cc != NULL); - break; - } - -OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, SLJIT_IMM, 0); -SLJIT_ASSERT(stackpos == STACK(stacktop)); -} - -#define RECURSE_TMP_REG_COUNT 3 - -typedef struct delayed_mem_copy_status { - struct sljit_compiler *compiler; - int store_bases[RECURSE_TMP_REG_COUNT]; - int store_offsets[RECURSE_TMP_REG_COUNT]; - int tmp_regs[RECURSE_TMP_REG_COUNT]; - int saved_tmp_regs[RECURSE_TMP_REG_COUNT]; - int next_tmp_reg; -} delayed_mem_copy_status; - -static 
void delayed_mem_copy_init(delayed_mem_copy_status *status, compiler_common *common) -{ -int i; - -for (i = 0; i < RECURSE_TMP_REG_COUNT; i++) - { - SLJIT_ASSERT(status->tmp_regs[i] >= 0); - SLJIT_ASSERT(sljit_get_register_index(status->saved_tmp_regs[i]) < 0 || status->tmp_regs[i] == status->saved_tmp_regs[i]); - - status->store_bases[i] = -1; - } -status->next_tmp_reg = 0; -status->compiler = common->compiler; -} - -static void delayed_mem_copy_move(delayed_mem_copy_status *status, int load_base, sljit_sw load_offset, - int store_base, sljit_sw store_offset) -{ -struct sljit_compiler *compiler = status->compiler; -int next_tmp_reg = status->next_tmp_reg; -int tmp_reg = status->tmp_regs[next_tmp_reg]; - -SLJIT_ASSERT(load_base > 0 && store_base > 0); - -if (status->store_bases[next_tmp_reg] == -1) - { - /* Preserve virtual registers. */ - if (sljit_get_register_index(status->saved_tmp_regs[next_tmp_reg]) < 0) - OP1(SLJIT_MOV, status->saved_tmp_regs[next_tmp_reg], 0, tmp_reg, 0); - } -else - OP1(SLJIT_MOV, SLJIT_MEM1(status->store_bases[next_tmp_reg]), status->store_offsets[next_tmp_reg], tmp_reg, 0); - -OP1(SLJIT_MOV, tmp_reg, 0, SLJIT_MEM1(load_base), load_offset); -status->store_bases[next_tmp_reg] = store_base; -status->store_offsets[next_tmp_reg] = store_offset; - -status->next_tmp_reg = (next_tmp_reg + 1) % RECURSE_TMP_REG_COUNT; -} - -static void delayed_mem_copy_finish(delayed_mem_copy_status *status) -{ -struct sljit_compiler *compiler = status->compiler; -int next_tmp_reg = status->next_tmp_reg; -int tmp_reg, saved_tmp_reg, i; - -for (i = 0; i < RECURSE_TMP_REG_COUNT; i++) - { - if (status->store_bases[next_tmp_reg] != -1) - { - tmp_reg = status->tmp_regs[next_tmp_reg]; - saved_tmp_reg = status->saved_tmp_regs[next_tmp_reg]; - - OP1(SLJIT_MOV, SLJIT_MEM1(status->store_bases[next_tmp_reg]), status->store_offsets[next_tmp_reg], tmp_reg, 0); - - /* Restore virtual registers. 
*/ - if (sljit_get_register_index(saved_tmp_reg) < 0) - OP1(SLJIT_MOV, tmp_reg, 0, saved_tmp_reg, 0); - } - - next_tmp_reg = (next_tmp_reg + 1) % RECURSE_TMP_REG_COUNT; - } -} - -#undef RECURSE_TMP_REG_COUNT - -static int get_recurse_data_length(compiler_common *common, PCRE2_SPTR cc, PCRE2_SPTR ccend, - BOOL *needs_control_head, BOOL *has_quit, BOOL *has_accept) -{ -int length = 1; -int size; -PCRE2_SPTR alternative; -BOOL quit_found = FALSE; -BOOL accept_found = FALSE; -BOOL setsom_found = FALSE; -BOOL setmark_found = FALSE; -BOOL capture_last_found = FALSE; -BOOL control_head_found = FALSE; - -#if defined DEBUG_FORCE_CONTROL_HEAD && DEBUG_FORCE_CONTROL_HEAD -SLJIT_ASSERT(common->control_head_ptr != 0); -control_head_found = TRUE; -#endif - -/* Calculate the sum of the private machine words. */ -while (cc < ccend) - { - size = 0; - switch(*cc) - { - case OP_SET_SOM: - SLJIT_ASSERT(common->has_set_som); - setsom_found = TRUE; - cc += 1; - break; - - case OP_RECURSE: - if (common->has_set_som) - setsom_found = TRUE; - if (common->mark_ptr != 0) - setmark_found = TRUE; - if (common->capture_last_ptr != 0) - capture_last_found = TRUE; - cc += 1 + LINK_SIZE; - break; - - case OP_KET: - if (PRIVATE_DATA(cc) != 0) - { - length++; - SLJIT_ASSERT(PRIVATE_DATA(cc + 1) != 0); - cc += PRIVATE_DATA(cc + 1); - } - cc += 1 + LINK_SIZE; - break; - - case OP_ASSERT: - case OP_ASSERT_NOT: - case OP_ASSERTBACK: - case OP_ASSERTBACK_NOT: - case OP_ASSERT_NA: - case OP_ASSERTBACK_NA: - case OP_ONCE: - case OP_SCRIPT_RUN: - case OP_BRAPOS: - case OP_SBRA: - case OP_SBRAPOS: - case OP_SCOND: - length++; - SLJIT_ASSERT(PRIVATE_DATA(cc) != 0); - cc += 1 + LINK_SIZE; - break; - - case OP_CBRA: - case OP_SCBRA: - length += 2; - if (common->capture_last_ptr != 0) - capture_last_found = TRUE; - if (common->optimized_cbracket[GET2(cc, 1 + LINK_SIZE)] == 0) - length++; - cc += 1 + LINK_SIZE + IMM2_SIZE; - break; - - case OP_CBRAPOS: - case OP_SCBRAPOS: - length += 2 + 2; - if 
(common->capture_last_ptr != 0) - capture_last_found = TRUE; - cc += 1 + LINK_SIZE + IMM2_SIZE; - break; - - case OP_COND: - /* Might be a hidden SCOND. */ - alternative = cc + GET(cc, 1); - if (*alternative == OP_KETRMAX || *alternative == OP_KETRMIN) - length++; - cc += 1 + LINK_SIZE; - break; - - CASE_ITERATOR_PRIVATE_DATA_1 - if (PRIVATE_DATA(cc) != 0) - length++; - cc += 2; -#ifdef SUPPORT_UNICODE - if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); -#endif - break; - - CASE_ITERATOR_PRIVATE_DATA_2A - if (PRIVATE_DATA(cc) != 0) - length += 2; - cc += 2; -#ifdef SUPPORT_UNICODE - if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); -#endif - break; - - CASE_ITERATOR_PRIVATE_DATA_2B - if (PRIVATE_DATA(cc) != 0) - length += 2; - cc += 2 + IMM2_SIZE; -#ifdef SUPPORT_UNICODE - if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); -#endif - break; - - CASE_ITERATOR_TYPE_PRIVATE_DATA_1 - if (PRIVATE_DATA(cc) != 0) - length++; - cc += 1; - break; - - CASE_ITERATOR_TYPE_PRIVATE_DATA_2A - if (PRIVATE_DATA(cc) != 0) - length += 2; - cc += 1; - break; - - CASE_ITERATOR_TYPE_PRIVATE_DATA_2B - if (PRIVATE_DATA(cc) != 0) - length += 2; - cc += 1 + IMM2_SIZE; - break; - - case OP_CLASS: - case OP_NCLASS: -#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8 - case OP_XCLASS: - size = (*cc == OP_XCLASS) ? 
GET(cc, 1) : 1 + 32 / (int)sizeof(PCRE2_UCHAR); -#else - size = 1 + 32 / (int)sizeof(PCRE2_UCHAR); -#endif - if (PRIVATE_DATA(cc) != 0) - length += get_class_iterator_size(cc + size); - cc += size; - break; - - case OP_MARK: - case OP_COMMIT_ARG: - case OP_PRUNE_ARG: - case OP_THEN_ARG: - SLJIT_ASSERT(common->mark_ptr != 0); - if (!setmark_found) - setmark_found = TRUE; - if (common->control_head_ptr != 0) - control_head_found = TRUE; - if (*cc != OP_MARK) - quit_found = TRUE; - - cc += 1 + 2 + cc[1]; - break; - - case OP_PRUNE: - case OP_SKIP: - case OP_COMMIT: - quit_found = TRUE; - cc++; - break; - - case OP_SKIP_ARG: - quit_found = TRUE; - cc += 1 + 2 + cc[1]; - break; - - case OP_THEN: - SLJIT_ASSERT(common->control_head_ptr != 0); - quit_found = TRUE; - if (!control_head_found) - control_head_found = TRUE; - cc++; - break; - - case OP_ACCEPT: - case OP_ASSERT_ACCEPT: - accept_found = TRUE; - cc++; - break; - - default: - cc = next_opcode(common, cc); - SLJIT_ASSERT(cc != NULL); - break; - } - } -SLJIT_ASSERT(cc == ccend); - -if (control_head_found) - length++; -if (capture_last_found) - length++; -if (quit_found) - { - if (setsom_found) - length++; - if (setmark_found) - length++; - } - -*needs_control_head = control_head_found; -*has_quit = quit_found; -*has_accept = accept_found; -return length; -} - -enum copy_recurse_data_types { - recurse_copy_from_global, - recurse_copy_private_to_global, - recurse_copy_shared_to_global, - recurse_copy_kept_shared_to_global, - recurse_swap_global -}; - -static void copy_recurse_data(compiler_common *common, PCRE2_SPTR cc, PCRE2_SPTR ccend, - int type, int stackptr, int stacktop, BOOL has_quit) -{ -delayed_mem_copy_status status; -PCRE2_SPTR alternative; -sljit_sw private_srcw[2]; -sljit_sw shared_srcw[3]; -sljit_sw kept_shared_srcw[2]; -int private_count, shared_count, kept_shared_count; -int from_sp, base_reg, offset, i; -BOOL setsom_found = FALSE; -BOOL setmark_found = FALSE; -BOOL capture_last_found = FALSE; -BOOL 
control_head_found = FALSE; - -#if defined DEBUG_FORCE_CONTROL_HEAD && DEBUG_FORCE_CONTROL_HEAD -SLJIT_ASSERT(common->control_head_ptr != 0); -control_head_found = TRUE; -#endif - -switch (type) - { - case recurse_copy_from_global: - from_sp = TRUE; - base_reg = STACK_TOP; - break; - - case recurse_copy_private_to_global: - case recurse_copy_shared_to_global: - case recurse_copy_kept_shared_to_global: - from_sp = FALSE; - base_reg = STACK_TOP; - break; - - default: - SLJIT_ASSERT(type == recurse_swap_global); - from_sp = FALSE; - base_reg = TMP2; - break; - } - -stackptr = STACK(stackptr); -stacktop = STACK(stacktop); - -status.tmp_regs[0] = TMP1; -status.saved_tmp_regs[0] = TMP1; - -if (base_reg != TMP2) - { - status.tmp_regs[1] = TMP2; - status.saved_tmp_regs[1] = TMP2; - } -else - { - status.saved_tmp_regs[1] = RETURN_ADDR; - if (HAS_VIRTUAL_REGISTERS) - status.tmp_regs[1] = STR_PTR; - else - status.tmp_regs[1] = RETURN_ADDR; - } - -status.saved_tmp_regs[2] = TMP3; -if (HAS_VIRTUAL_REGISTERS) - status.tmp_regs[2] = STR_END; -else - status.tmp_regs[2] = TMP3; - -delayed_mem_copy_init(&status, common); - -if (type != recurse_copy_shared_to_global && type != recurse_copy_kept_shared_to_global) - { - SLJIT_ASSERT(type == recurse_copy_from_global || type == recurse_copy_private_to_global || type == recurse_swap_global); - - if (!from_sp) - delayed_mem_copy_move(&status, base_reg, stackptr, SLJIT_SP, common->recursive_head_ptr); - - if (from_sp || type == recurse_swap_global) - delayed_mem_copy_move(&status, SLJIT_SP, common->recursive_head_ptr, base_reg, stackptr); - } - -stackptr += sizeof(sljit_sw); - -#if defined DEBUG_FORCE_CONTROL_HEAD && DEBUG_FORCE_CONTROL_HEAD -if (type != recurse_copy_shared_to_global) - { - if (!from_sp) - delayed_mem_copy_move(&status, base_reg, stackptr, SLJIT_SP, common->control_head_ptr); - - if (from_sp || type == recurse_swap_global) - delayed_mem_copy_move(&status, SLJIT_SP, common->control_head_ptr, base_reg, stackptr); - } - 
-stackptr += sizeof(sljit_sw); -#endif - -while (cc < ccend) - { - private_count = 0; - shared_count = 0; - kept_shared_count = 0; - - switch(*cc) - { - case OP_SET_SOM: - SLJIT_ASSERT(common->has_set_som); - if (has_quit && !setsom_found) - { - kept_shared_srcw[0] = OVECTOR(0); - kept_shared_count = 1; - setsom_found = TRUE; - } - cc += 1; - break; - - case OP_RECURSE: - if (has_quit) - { - if (common->has_set_som && !setsom_found) - { - kept_shared_srcw[0] = OVECTOR(0); - kept_shared_count = 1; - setsom_found = TRUE; - } - if (common->mark_ptr != 0 && !setmark_found) - { - kept_shared_srcw[kept_shared_count] = common->mark_ptr; - kept_shared_count++; - setmark_found = TRUE; - } - } - if (common->capture_last_ptr != 0 && !capture_last_found) - { - shared_srcw[0] = common->capture_last_ptr; - shared_count = 1; - capture_last_found = TRUE; - } - cc += 1 + LINK_SIZE; - break; - - case OP_KET: - if (PRIVATE_DATA(cc) != 0) - { - private_count = 1; - private_srcw[0] = PRIVATE_DATA(cc); - SLJIT_ASSERT(PRIVATE_DATA(cc + 1) != 0); - cc += PRIVATE_DATA(cc + 1); - } - cc += 1 + LINK_SIZE; - break; - - case OP_ASSERT: - case OP_ASSERT_NOT: - case OP_ASSERTBACK: - case OP_ASSERTBACK_NOT: - case OP_ASSERT_NA: - case OP_ASSERTBACK_NA: - case OP_ONCE: - case OP_SCRIPT_RUN: - case OP_BRAPOS: - case OP_SBRA: - case OP_SBRAPOS: - case OP_SCOND: - private_count = 1; - private_srcw[0] = PRIVATE_DATA(cc); - cc += 1 + LINK_SIZE; - break; - - case OP_CBRA: - case OP_SCBRA: - offset = (GET2(cc, 1 + LINK_SIZE)) << 1; - shared_srcw[0] = OVECTOR(offset); - shared_srcw[1] = OVECTOR(offset + 1); - shared_count = 2; - - if (common->capture_last_ptr != 0 && !capture_last_found) - { - shared_srcw[2] = common->capture_last_ptr; - shared_count = 3; - capture_last_found = TRUE; - } - - if (common->optimized_cbracket[GET2(cc, 1 + LINK_SIZE)] == 0) - { - private_count = 1; - private_srcw[0] = OVECTOR_PRIV(GET2(cc, 1 + LINK_SIZE)); - } - cc += 1 + LINK_SIZE + IMM2_SIZE; - break; - - case OP_CBRAPOS: - 
case OP_SCBRAPOS: - offset = (GET2(cc, 1 + LINK_SIZE)) << 1; - shared_srcw[0] = OVECTOR(offset); - shared_srcw[1] = OVECTOR(offset + 1); - shared_count = 2; - - if (common->capture_last_ptr != 0 && !capture_last_found) - { - shared_srcw[2] = common->capture_last_ptr; - shared_count = 3; - capture_last_found = TRUE; - } - - private_count = 2; - private_srcw[0] = PRIVATE_DATA(cc); - private_srcw[1] = OVECTOR_PRIV(GET2(cc, 1 + LINK_SIZE)); - cc += 1 + LINK_SIZE + IMM2_SIZE; - break; - - case OP_COND: - /* Might be a hidden SCOND. */ - alternative = cc + GET(cc, 1); - if (*alternative == OP_KETRMAX || *alternative == OP_KETRMIN) - { - private_count = 1; - private_srcw[0] = PRIVATE_DATA(cc); - } - cc += 1 + LINK_SIZE; - break; - - CASE_ITERATOR_PRIVATE_DATA_1 - if (PRIVATE_DATA(cc)) - { - private_count = 1; - private_srcw[0] = PRIVATE_DATA(cc); - } - cc += 2; -#ifdef SUPPORT_UNICODE - if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); -#endif - break; - - CASE_ITERATOR_PRIVATE_DATA_2A - if (PRIVATE_DATA(cc)) - { - private_count = 2; - private_srcw[0] = PRIVATE_DATA(cc); - private_srcw[1] = PRIVATE_DATA(cc) + sizeof(sljit_sw); - } - cc += 2; -#ifdef SUPPORT_UNICODE - if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); -#endif - break; - - CASE_ITERATOR_PRIVATE_DATA_2B - if (PRIVATE_DATA(cc)) - { - private_count = 2; - private_srcw[0] = PRIVATE_DATA(cc); - private_srcw[1] = PRIVATE_DATA(cc) + sizeof(sljit_sw); - } - cc += 2 + IMM2_SIZE; -#ifdef SUPPORT_UNICODE - if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); -#endif - break; - - CASE_ITERATOR_TYPE_PRIVATE_DATA_1 - if (PRIVATE_DATA(cc)) - { - private_count = 1; - private_srcw[0] = PRIVATE_DATA(cc); - } - cc += 1; - break; - - CASE_ITERATOR_TYPE_PRIVATE_DATA_2A - if (PRIVATE_DATA(cc)) - { - private_count = 2; - private_srcw[0] = PRIVATE_DATA(cc); - private_srcw[1] = private_srcw[0] + sizeof(sljit_sw); - } - cc += 1; - break; - - CASE_ITERATOR_TYPE_PRIVATE_DATA_2B - if 
(PRIVATE_DATA(cc)) - { - private_count = 2; - private_srcw[0] = PRIVATE_DATA(cc); - private_srcw[1] = private_srcw[0] + sizeof(sljit_sw); - } - cc += 1 + IMM2_SIZE; - break; - - case OP_CLASS: - case OP_NCLASS: -#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8 - case OP_XCLASS: - i = (*cc == OP_XCLASS) ? GET(cc, 1) : 1 + 32 / (int)sizeof(PCRE2_UCHAR); -#else - i = 1 + 32 / (int)sizeof(PCRE2_UCHAR); -#endif - if (PRIVATE_DATA(cc) != 0) - switch(get_class_iterator_size(cc + i)) - { - case 1: - private_count = 1; - private_srcw[0] = PRIVATE_DATA(cc); - break; - - case 2: - private_count = 2; - private_srcw[0] = PRIVATE_DATA(cc); - private_srcw[1] = private_srcw[0] + sizeof(sljit_sw); - break; - - default: - SLJIT_UNREACHABLE(); - break; - } - cc += i; - break; - - case OP_MARK: - case OP_COMMIT_ARG: - case OP_PRUNE_ARG: - case OP_THEN_ARG: - SLJIT_ASSERT(common->mark_ptr != 0); - if (has_quit && !setmark_found) - { - kept_shared_srcw[0] = common->mark_ptr; - kept_shared_count = 1; - setmark_found = TRUE; - } - if (common->control_head_ptr != 0 && !control_head_found) - { - private_srcw[0] = common->control_head_ptr; - private_count = 1; - control_head_found = TRUE; - } - cc += 1 + 2 + cc[1]; - break; - - case OP_THEN: - SLJIT_ASSERT(common->control_head_ptr != 0); - if (!control_head_found) - { - private_srcw[0] = common->control_head_ptr; - private_count = 1; - control_head_found = TRUE; - } - cc++; - break; - - default: - cc = next_opcode(common, cc); - SLJIT_ASSERT(cc != NULL); - break; - } - - if (type != recurse_copy_shared_to_global && type != recurse_copy_kept_shared_to_global) - { - SLJIT_ASSERT(type == recurse_copy_from_global || type == recurse_copy_private_to_global || type == recurse_swap_global); - - for (i = 0; i < private_count; i++) - { - SLJIT_ASSERT(private_srcw[i] != 0); - - if (!from_sp) - delayed_mem_copy_move(&status, base_reg, stackptr, SLJIT_SP, private_srcw[i]); - - if (from_sp || type == recurse_swap_global) - 
delayed_mem_copy_move(&status, SLJIT_SP, private_srcw[i], base_reg, stackptr); - - stackptr += sizeof(sljit_sw); - } - } - else - stackptr += sizeof(sljit_sw) * private_count; - - if (type != recurse_copy_private_to_global && type != recurse_copy_kept_shared_to_global) - { - SLJIT_ASSERT(type == recurse_copy_from_global || type == recurse_copy_shared_to_global || type == recurse_swap_global); - - for (i = 0; i < shared_count; i++) - { - SLJIT_ASSERT(shared_srcw[i] != 0); - - if (!from_sp) - delayed_mem_copy_move(&status, base_reg, stackptr, SLJIT_SP, shared_srcw[i]); - - if (from_sp || type == recurse_swap_global) - delayed_mem_copy_move(&status, SLJIT_SP, shared_srcw[i], base_reg, stackptr); - - stackptr += sizeof(sljit_sw); - } - } - else - stackptr += sizeof(sljit_sw) * shared_count; - - if (type != recurse_copy_private_to_global && type != recurse_swap_global) - { - SLJIT_ASSERT(type == recurse_copy_from_global || type == recurse_copy_shared_to_global || type == recurse_copy_kept_shared_to_global); - - for (i = 0; i < kept_shared_count; i++) - { - SLJIT_ASSERT(kept_shared_srcw[i] != 0); - - if (!from_sp) - delayed_mem_copy_move(&status, base_reg, stackptr, SLJIT_SP, kept_shared_srcw[i]); - - if (from_sp || type == recurse_swap_global) - delayed_mem_copy_move(&status, SLJIT_SP, kept_shared_srcw[i], base_reg, stackptr); - - stackptr += sizeof(sljit_sw); - } - } - else - stackptr += sizeof(sljit_sw) * kept_shared_count; - } - -SLJIT_ASSERT(cc == ccend && stackptr == stacktop); - -delayed_mem_copy_finish(&status); -} - -static SLJIT_INLINE PCRE2_SPTR set_then_offsets(compiler_common *common, PCRE2_SPTR cc, sljit_u8 *current_offset) -{ -PCRE2_SPTR end = bracketend(cc); -BOOL has_alternatives = cc[GET(cc, 1)] == OP_ALT; - -/* Assert captures then. */ -if (*cc >= OP_ASSERT && *cc <= OP_ASSERTBACK_NA) - current_offset = NULL; -/* Conditional block does not. 
*/ -if (*cc == OP_COND || *cc == OP_SCOND) - has_alternatives = FALSE; - -cc = next_opcode(common, cc); -if (has_alternatives) - current_offset = common->then_offsets + (cc - common->start); - -while (cc < end) - { - if ((*cc >= OP_ASSERT && *cc <= OP_ASSERTBACK_NA) || (*cc >= OP_ONCE && *cc <= OP_SCOND)) - cc = set_then_offsets(common, cc, current_offset); - else - { - if (*cc == OP_ALT && has_alternatives) - current_offset = common->then_offsets + (cc + 1 + LINK_SIZE - common->start); - if (*cc >= OP_THEN && *cc <= OP_THEN_ARG && current_offset != NULL) - *current_offset = 1; - cc = next_opcode(common, cc); - } - } - -return end; -} - -#undef CASE_ITERATOR_PRIVATE_DATA_1 -#undef CASE_ITERATOR_PRIVATE_DATA_2A -#undef CASE_ITERATOR_PRIVATE_DATA_2B -#undef CASE_ITERATOR_TYPE_PRIVATE_DATA_1 -#undef CASE_ITERATOR_TYPE_PRIVATE_DATA_2A -#undef CASE_ITERATOR_TYPE_PRIVATE_DATA_2B - -static SLJIT_INLINE BOOL is_powerof2(unsigned int value) -{ -return (value & (value - 1)) == 0; -} - -static SLJIT_INLINE void set_jumps(jump_list *list, struct sljit_label *label) -{ -while (list) - { - /* sljit_set_label is clever enough to do nothing - if either the jump or the label is NULL. 
*/ - SET_LABEL(list->jump, label); - list = list->next; - } -} - -static SLJIT_INLINE void add_jump(struct sljit_compiler *compiler, jump_list **list, struct sljit_jump *jump) -{ -jump_list *list_item = sljit_alloc_memory(compiler, sizeof(jump_list)); -if (list_item) - { - list_item->next = *list; - list_item->jump = jump; - *list = list_item; - } -} - -static void add_stub(compiler_common *common, struct sljit_jump *start) -{ -DEFINE_COMPILER; -stub_list *list_item = sljit_alloc_memory(compiler, sizeof(stub_list)); - -if (list_item) - { - list_item->start = start; - list_item->quit = LABEL(); - list_item->next = common->stubs; - common->stubs = list_item; - } -} - -static void flush_stubs(compiler_common *common) -{ -DEFINE_COMPILER; -stub_list *list_item = common->stubs; - -while (list_item) - { - JUMPHERE(list_item->start); - add_jump(compiler, &common->stackalloc, JUMP(SLJIT_FAST_CALL)); - JUMPTO(SLJIT_JUMP, list_item->quit); - list_item = list_item->next; - } -common->stubs = NULL; -} - -static SLJIT_INLINE void count_match(compiler_common *common) -{ -DEFINE_COMPILER; - -OP2(SLJIT_SUB | SLJIT_SET_Z, COUNT_MATCH, 0, COUNT_MATCH, 0, SLJIT_IMM, 1); -add_jump(compiler, &common->calllimit, JUMP(SLJIT_ZERO)); -} - -static SLJIT_INLINE void allocate_stack(compiler_common *common, int size) -{ -/* May destroy all locals and registers except TMP2. 
*/ -DEFINE_COMPILER; - -SLJIT_ASSERT(size > 0); -OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, size * sizeof(sljit_sw)); -#ifdef DESTROY_REGISTERS -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 12345); -OP1(SLJIT_MOV, TMP3, 0, TMP1, 0); -OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP1, 0); -OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, TMP1, 0); -OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, TMP1, 0); -#endif -add_stub(common, CMP(SLJIT_LESS, STACK_TOP, 0, STACK_LIMIT, 0)); -} - -static SLJIT_INLINE void free_stack(compiler_common *common, int size) -{ -DEFINE_COMPILER; - -SLJIT_ASSERT(size > 0); -OP2(SLJIT_ADD, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, size * sizeof(sljit_sw)); -} - -static sljit_uw * allocate_read_only_data(compiler_common *common, sljit_uw size) -{ -DEFINE_COMPILER; -sljit_uw *result; - -if (SLJIT_UNLIKELY(sljit_get_compiler_error(compiler))) - return NULL; - -result = (sljit_uw *)SLJIT_MALLOC(size + sizeof(sljit_uw), compiler->allocator_data); -if (SLJIT_UNLIKELY(result == NULL)) - { - sljit_set_compiler_memory_error(compiler); - return NULL; - } - -*(void**)result = common->read_only_data_head; -common->read_only_data_head = (void *)result; -return result + 1; -} - -static SLJIT_INLINE void reset_ovector(compiler_common *common, int length) -{ -DEFINE_COMPILER; -struct sljit_label *loop; -sljit_s32 i; - -/* At this point we can freely use all temporary registers. */ -SLJIT_ASSERT(length > 1); -/* TMP1 returns with begin - 1. 
*/ -OP2(SLJIT_SUB, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_S0), SLJIT_OFFSETOF(jit_arguments, begin), SLJIT_IMM, IN_UCHARS(1)); -if (length < 8) - { - for (i = 1; i < length; i++) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(i), SLJIT_R0, 0); - } -else - { - if (sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_STORE | SLJIT_MEM_PRE, SLJIT_R0, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw)) == SLJIT_SUCCESS) - { - GET_LOCAL_BASE(SLJIT_R1, 0, OVECTOR_START); - OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, length - 1); - loop = LABEL(); - sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_STORE | SLJIT_MEM_PRE, SLJIT_R0, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw)); - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_R2, 0, SLJIT_R2, 0, SLJIT_IMM, 1); - JUMPTO(SLJIT_NOT_ZERO, loop); - } - else - { - GET_LOCAL_BASE(SLJIT_R1, 0, OVECTOR_START + sizeof(sljit_sw)); - OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, length - 1); - loop = LABEL(); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 0, SLJIT_R0, 0); - OP2(SLJIT_ADD, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, sizeof(sljit_sw)); - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_R2, 0, SLJIT_R2, 0, SLJIT_IMM, 1); - JUMPTO(SLJIT_NOT_ZERO, loop); - } - } -} - -static SLJIT_INLINE void reset_early_fail(compiler_common *common) -{ -DEFINE_COMPILER; -sljit_u32 size = (sljit_u32)(common->early_fail_end_ptr - common->early_fail_start_ptr); -sljit_u32 uncleared_size; -sljit_s32 src = SLJIT_IMM; -sljit_s32 i; -struct sljit_label *loop; - -SLJIT_ASSERT(common->early_fail_start_ptr < common->early_fail_end_ptr); - -if (size == sizeof(sljit_sw)) - { - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->early_fail_start_ptr, SLJIT_IMM, 0); - return; - } - -if (sljit_get_register_index(TMP3) >= 0 && !sljit_has_cpu_feature(SLJIT_HAS_ZERO_REGISTER)) - { - OP1(SLJIT_MOV, TMP3, 0, SLJIT_IMM, 0); - src = TMP3; - } - -if (size <= 6 * sizeof(sljit_sw)) - { - for (i = common->early_fail_start_ptr; i < common->early_fail_end_ptr; i += sizeof(sljit_sw)) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), i, src, 0); - return; 
- } - -GET_LOCAL_BASE(TMP1, 0, common->early_fail_start_ptr); - -uncleared_size = ((size / sizeof(sljit_sw)) % 3) * sizeof(sljit_sw); - -OP2(SLJIT_ADD, TMP2, 0, TMP1, 0, SLJIT_IMM, size - uncleared_size); - -loop = LABEL(); -OP1(SLJIT_MOV, SLJIT_MEM1(TMP1), 0, src, 0); -OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 3 * sizeof(sljit_sw)); -OP1(SLJIT_MOV, SLJIT_MEM1(TMP1), -2 * (sljit_sw)sizeof(sljit_sw), src, 0); -OP1(SLJIT_MOV, SLJIT_MEM1(TMP1), -1 * (sljit_sw)sizeof(sljit_sw), src, 0); -CMPTO(SLJIT_LESS, TMP1, 0, TMP2, 0, loop); - -if (uncleared_size >= sizeof(sljit_sw)) - OP1(SLJIT_MOV, SLJIT_MEM1(TMP1), 0, src, 0); - -if (uncleared_size >= 2 * sizeof(sljit_sw)) - OP1(SLJIT_MOV, SLJIT_MEM1(TMP1), sizeof(sljit_sw), src, 0); -} - -static SLJIT_INLINE void do_reset_match(compiler_common *common, int length) -{ -DEFINE_COMPILER; -struct sljit_label *loop; -int i; - -SLJIT_ASSERT(length > 1); -/* OVECTOR(1) contains the "string begin - 1" constant. */ -if (length > 2) - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(1)); -if (length < 8) - { - for (i = 2; i < length; i++) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(i), TMP1, 0); - } -else - { - if (sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_STORE | SLJIT_MEM_PRE, TMP1, SLJIT_MEM1(TMP2), sizeof(sljit_sw)) == SLJIT_SUCCESS) - { - GET_LOCAL_BASE(TMP2, 0, OVECTOR_START + sizeof(sljit_sw)); - OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_IMM, length - 2); - loop = LABEL(); - sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_STORE | SLJIT_MEM_PRE, TMP1, SLJIT_MEM1(TMP2), sizeof(sljit_sw)); - OP2(SLJIT_SUB | SLJIT_SET_Z, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, 1); - JUMPTO(SLJIT_NOT_ZERO, loop); - } - else - { - GET_LOCAL_BASE(TMP2, 0, OVECTOR_START + 2 * sizeof(sljit_sw)); - OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_IMM, length - 2); - loop = LABEL(); - OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), 0, TMP1, 0); - OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, sizeof(sljit_sw)); - OP2(SLJIT_SUB | SLJIT_SET_Z, STACK_TOP, 0, STACK_TOP, 
0, SLJIT_IMM, 1); - JUMPTO(SLJIT_NOT_ZERO, loop); - } - } - -if (!HAS_VIRTUAL_REGISTERS) - OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, stack)); -else - OP1(SLJIT_MOV, STACK_TOP, 0, ARGUMENTS, 0); - -if (common->mark_ptr != 0) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->mark_ptr, SLJIT_IMM, 0); -if (common->control_head_ptr != 0) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, SLJIT_IMM, 0); -if (HAS_VIRTUAL_REGISTERS) - OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(STACK_TOP), SLJIT_OFFSETOF(jit_arguments, stack)); - -OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->start_ptr); -OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(STACK_TOP), SLJIT_OFFSETOF(struct sljit_stack, end)); -} - -static sljit_sw SLJIT_FUNC do_search_mark(sljit_sw *current, PCRE2_SPTR skip_arg) -{ -while (current != NULL) - { - switch (current[1]) - { - case type_then_trap: - break; - - case type_mark: - if (PRIV(strcmp)(skip_arg, (PCRE2_SPTR)current[2]) == 0) - return current[3]; - break; - - default: - SLJIT_UNREACHABLE(); - break; - } - SLJIT_ASSERT(current[0] == 0 || current < (sljit_sw*)current[0]); - current = (sljit_sw*)current[0]; - } -return 0; -} - -static SLJIT_INLINE void copy_ovector(compiler_common *common, int topbracket) -{ -DEFINE_COMPILER; -struct sljit_label *loop; -BOOL has_pre; - -/* At this point we can freely use all registers. 
*/ -OP1(SLJIT_MOV, SLJIT_S2, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(1)); -OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(1), STR_PTR, 0); - -if (HAS_VIRTUAL_REGISTERS) - { - OP1(SLJIT_MOV, SLJIT_R0, 0, ARGUMENTS, 0); - OP1(SLJIT_MOV, SLJIT_S0, 0, SLJIT_MEM1(SLJIT_SP), common->start_ptr); - if (common->mark_ptr != 0) - OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_SP), common->mark_ptr); - OP1(SLJIT_MOV_U32, SLJIT_R1, 0, SLJIT_MEM1(SLJIT_R0), SLJIT_OFFSETOF(jit_arguments, oveccount)); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), SLJIT_OFFSETOF(jit_arguments, startchar_ptr), SLJIT_S0, 0); - if (common->mark_ptr != 0) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), SLJIT_OFFSETOF(jit_arguments, mark_ptr), SLJIT_R2, 0); - OP2(SLJIT_ADD, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_R0), SLJIT_OFFSETOF(jit_arguments, match_data), - SLJIT_IMM, SLJIT_OFFSETOF(pcre2_match_data, ovector) - sizeof(PCRE2_SIZE)); - } -else - { - OP1(SLJIT_MOV, SLJIT_S0, 0, SLJIT_MEM1(SLJIT_SP), common->start_ptr); - OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, match_data)); - if (common->mark_ptr != 0) - OP1(SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_SP), common->mark_ptr); - OP1(SLJIT_MOV_U32, SLJIT_R1, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, oveccount)); - OP1(SLJIT_MOV, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, startchar_ptr), SLJIT_S0, 0); - if (common->mark_ptr != 0) - OP1(SLJIT_MOV, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, mark_ptr), SLJIT_R0, 0); - OP2(SLJIT_ADD, SLJIT_R2, 0, SLJIT_R2, 0, SLJIT_IMM, SLJIT_OFFSETOF(pcre2_match_data, ovector) - sizeof(PCRE2_SIZE)); - } - -has_pre = sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, SLJIT_S1, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw)) == SLJIT_SUCCESS; - -GET_LOCAL_BASE(SLJIT_S0, 0, OVECTOR_START - (has_pre ? sizeof(sljit_sw) : 0)); -OP1(SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(HAS_VIRTUAL_REGISTERS ? 
SLJIT_R0 : ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, begin)); - -loop = LABEL(); - -if (has_pre) - sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_PRE, SLJIT_S1, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw)); -else - { - OP1(SLJIT_MOV, SLJIT_S1, 0, SLJIT_MEM1(SLJIT_S0), 0); - OP2(SLJIT_ADD, SLJIT_S0, 0, SLJIT_S0, 0, SLJIT_IMM, sizeof(sljit_sw)); - } - -OP2(SLJIT_ADD, SLJIT_R2, 0, SLJIT_R2, 0, SLJIT_IMM, sizeof(PCRE2_SIZE)); -OP2(SLJIT_SUB, SLJIT_S1, 0, SLJIT_S1, 0, SLJIT_R0, 0); -/* Copy the integer value to the output buffer */ -#if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32 -OP2(SLJIT_ASHR, SLJIT_S1, 0, SLJIT_S1, 0, SLJIT_IMM, UCHAR_SHIFT); -#endif - -SLJIT_ASSERT(sizeof(PCRE2_SIZE) == 4 || sizeof(PCRE2_SIZE) == 8); -OP1(((sizeof(PCRE2_SIZE) == 4) ? SLJIT_MOV_U32 : SLJIT_MOV), SLJIT_MEM1(SLJIT_R2), 0, SLJIT_S1, 0); - -OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, 1); -JUMPTO(SLJIT_NOT_ZERO, loop); - -/* Calculate the return value, which is the maximum ovector value. */ -if (topbracket > 1) - { - if (sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, SLJIT_R2, SLJIT_MEM1(SLJIT_R0), -(2 * (sljit_sw)sizeof(sljit_sw))) == SLJIT_SUCCESS) - { - GET_LOCAL_BASE(SLJIT_R0, 0, OVECTOR_START + topbracket * 2 * sizeof(sljit_sw)); - OP1(SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, topbracket + 1); - - /* OVECTOR(0) is never equal to SLJIT_S2. */ - loop = LABEL(); - sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_PRE, SLJIT_R2, SLJIT_MEM1(SLJIT_R0), -(2 * (sljit_sw)sizeof(sljit_sw))); - OP2(SLJIT_SUB, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, 1); - CMPTO(SLJIT_EQUAL, SLJIT_R2, 0, SLJIT_S2, 0, loop); - OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_R1, 0); - } - else - { - GET_LOCAL_BASE(SLJIT_R0, 0, OVECTOR_START + (topbracket - 1) * 2 * sizeof(sljit_sw)); - OP1(SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, topbracket + 1); - - /* OVECTOR(0) is never equal to SLJIT_S2. 
*/ - loop = LABEL(); - OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_R0), 0); - OP2(SLJIT_SUB, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 2 * (sljit_sw)sizeof(sljit_sw)); - OP2(SLJIT_SUB, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, 1); - CMPTO(SLJIT_EQUAL, SLJIT_R2, 0, SLJIT_S2, 0, loop); - OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_R1, 0); - } - } -else - OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, 1); -} - -static SLJIT_INLINE void return_with_partial_match(compiler_common *common, struct sljit_label *quit) -{ -DEFINE_COMPILER; -sljit_s32 mov_opcode; -sljit_s32 arguments_reg = !HAS_VIRTUAL_REGISTERS ? ARGUMENTS : SLJIT_R1; - -SLJIT_COMPILE_ASSERT(STR_END == SLJIT_S0, str_end_must_be_saved_reg0); -SLJIT_ASSERT(common->start_used_ptr != 0 && common->start_ptr != 0 - && (common->mode == PCRE2_JIT_PARTIAL_SOFT ? common->hit_start != 0 : common->hit_start == 0)); - -if (arguments_reg != ARGUMENTS) - OP1(SLJIT_MOV, arguments_reg, 0, ARGUMENTS, 0); -OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_SP), - common->mode == PCRE2_JIT_PARTIAL_SOFT ? common->hit_start : common->start_ptr); -OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, PCRE2_ERROR_PARTIAL); - -/* Store match begin and end. */ -OP1(SLJIT_MOV, SLJIT_S1, 0, SLJIT_MEM1(arguments_reg), SLJIT_OFFSETOF(jit_arguments, begin)); -OP1(SLJIT_MOV, SLJIT_MEM1(arguments_reg), SLJIT_OFFSETOF(jit_arguments, startchar_ptr), SLJIT_R2, 0); -OP1(SLJIT_MOV, SLJIT_R1, 0, SLJIT_MEM1(arguments_reg), SLJIT_OFFSETOF(jit_arguments, match_data)); - -mov_opcode = (sizeof(PCRE2_SIZE) == 4) ? 
SLJIT_MOV_U32 : SLJIT_MOV; - -OP2(SLJIT_SUB, SLJIT_R2, 0, SLJIT_R2, 0, SLJIT_S1, 0); -#if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32 -OP2(SLJIT_ASHR, SLJIT_R2, 0, SLJIT_R2, 0, SLJIT_IMM, UCHAR_SHIFT); -#endif -OP1(mov_opcode, SLJIT_MEM1(SLJIT_R1), SLJIT_OFFSETOF(pcre2_match_data, ovector), SLJIT_R2, 0); - -OP2(SLJIT_SUB, STR_END, 0, STR_END, 0, SLJIT_S1, 0); -#if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32 -OP2(SLJIT_ASHR, STR_END, 0, STR_END, 0, SLJIT_IMM, UCHAR_SHIFT); -#endif -OP1(mov_opcode, SLJIT_MEM1(SLJIT_R1), SLJIT_OFFSETOF(pcre2_match_data, ovector) + sizeof(PCRE2_SIZE), STR_END, 0); - -JUMPTO(SLJIT_JUMP, quit); -} - -static SLJIT_INLINE void check_start_used_ptr(compiler_common *common) -{ -/* May destroy TMP1. */ -DEFINE_COMPILER; -struct sljit_jump *jump; - -if (common->mode == PCRE2_JIT_PARTIAL_SOFT) - { - /* The value of -1 must be kept for start_used_ptr! */ - OP2(SLJIT_ADD, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, SLJIT_IMM, 1); - /* Jumps if start_used_ptr < STR_PTR, or start_used_ptr == -1. Although overwriting - is not necessary if start_used_ptr == STR_PTR, it does not hurt as well. */ - jump = CMP(SLJIT_LESS_EQUAL, TMP1, 0, STR_PTR, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, STR_PTR, 0); - JUMPHERE(jump); - } -else if (common->mode == PCRE2_JIT_PARTIAL_HARD) - { - jump = CMP(SLJIT_LESS_EQUAL, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, STR_PTR, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, STR_PTR, 0); - JUMPHERE(jump); - } -} - -static SLJIT_INLINE BOOL char_has_othercase(compiler_common *common, PCRE2_SPTR cc) -{ -/* Detects if the character has an othercase. */ -unsigned int c; - -#ifdef SUPPORT_UNICODE -if (common->utf || common->ucp) - { - if (common->utf) - { - GETCHAR(c, cc); - } - else - c = *cc; - - if (c > 127) - return c != UCD_OTHERCASE(c); - - return common->fcc[c] != c; - } -else -#endif - c = *cc; -return MAX_255(c) ? 
common->fcc[c] != c : FALSE; -} - -static SLJIT_INLINE unsigned int char_othercase(compiler_common *common, unsigned int c) -{ -/* Returns with the othercase. */ -#ifdef SUPPORT_UNICODE -if ((common->utf || common->ucp) && c > 127) - return UCD_OTHERCASE(c); -#endif -return TABLE_GET(c, common->fcc, c); -} - -static unsigned int char_get_othercase_bit(compiler_common *common, PCRE2_SPTR cc) -{ -/* Detects if the character and its othercase has only 1 bit difference. */ -unsigned int c, oc, bit; -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 -int n; -#endif - -#ifdef SUPPORT_UNICODE -if (common->utf || common->ucp) - { - if (common->utf) - { - GETCHAR(c, cc); - } - else - c = *cc; - - if (c <= 127) - oc = common->fcc[c]; - else - oc = UCD_OTHERCASE(c); - } -else - { - c = *cc; - oc = TABLE_GET(c, common->fcc, c); - } -#else -c = *cc; -oc = TABLE_GET(c, common->fcc, c); -#endif - -SLJIT_ASSERT(c != oc); - -bit = c ^ oc; -/* Optimized for English alphabet. */ -if (c <= 127 && bit == 0x20) - return (0 << 8) | 0x20; - -/* Since c != oc, they must have at least 1 bit difference. */ -if (!is_powerof2(bit)) - return 0; - -#if PCRE2_CODE_UNIT_WIDTH == 8 - -#ifdef SUPPORT_UNICODE -if (common->utf && c > 127) - { - n = GET_EXTRALEN(*cc); - while ((bit & 0x3f) == 0) - { - n--; - bit >>= 6; - } - return (n << 8) | bit; - } -#endif /* SUPPORT_UNICODE */ -return (0 << 8) | bit; - -#elif PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32 - -#ifdef SUPPORT_UNICODE -if (common->utf && c > 65535) - { - if (bit >= (1u << 10)) - bit >>= 10; - else - return (bit < 256) ? ((2 << 8) | bit) : ((3 << 8) | (bit >> 8)); - } -#endif /* SUPPORT_UNICODE */ -return (bit < 256) ? ((0u << 8) | bit) : ((1u << 8) | (bit >> 8)); - -#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ -} - -static void check_partial(compiler_common *common, BOOL force) -{ -/* Checks whether a partial matching is occurred. Does not modify registers. 
*/ -DEFINE_COMPILER; -struct sljit_jump *jump = NULL; - -SLJIT_ASSERT(!force || common->mode != PCRE2_JIT_COMPLETE); - -if (common->mode == PCRE2_JIT_COMPLETE) - return; - -if (!force && !common->allow_empty_partial) - jump = CMP(SLJIT_GREATER_EQUAL, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, STR_PTR, 0); -else if (common->mode == PCRE2_JIT_PARTIAL_SOFT) - jump = CMP(SLJIT_EQUAL, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, SLJIT_IMM, -1); - -if (common->mode == PCRE2_JIT_PARTIAL_SOFT) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->hit_start, SLJIT_IMM, 0); -else - { - if (common->partialmatchlabel != NULL) - JUMPTO(SLJIT_JUMP, common->partialmatchlabel); - else - add_jump(compiler, &common->partialmatch, JUMP(SLJIT_JUMP)); - } - -if (jump != NULL) - JUMPHERE(jump); -} - -static void check_str_end(compiler_common *common, jump_list **end_reached) -{ -/* Does not affect registers. Usually used in a tight spot. */ -DEFINE_COMPILER; -struct sljit_jump *jump; - -if (common->mode == PCRE2_JIT_COMPLETE) - { - add_jump(compiler, end_reached, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0)); - return; - } - -jump = CMP(SLJIT_LESS, STR_PTR, 0, STR_END, 0); -if (common->mode == PCRE2_JIT_PARTIAL_SOFT) - { - add_jump(compiler, end_reached, CMP(SLJIT_GREATER_EQUAL, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, STR_PTR, 0)); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->hit_start, SLJIT_IMM, 0); - add_jump(compiler, end_reached, JUMP(SLJIT_JUMP)); - } -else - { - add_jump(compiler, end_reached, CMP(SLJIT_GREATER_EQUAL, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, STR_PTR, 0)); - if (common->partialmatchlabel != NULL) - JUMPTO(SLJIT_JUMP, common->partialmatchlabel); - else - add_jump(compiler, &common->partialmatch, JUMP(SLJIT_JUMP)); - } -JUMPHERE(jump); -} - -static void detect_partial_match(compiler_common *common, jump_list **backtracks) -{ -DEFINE_COMPILER; -struct sljit_jump *jump; - -if (common->mode == PCRE2_JIT_COMPLETE) - { - add_jump(compiler, backtracks, 
CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0)); - return; - } - -/* Partial matching mode. */ -jump = CMP(SLJIT_LESS, STR_PTR, 0, STR_END, 0); -if (!common->allow_empty_partial) - add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, STR_PTR, 0)); -else if (common->mode == PCRE2_JIT_PARTIAL_SOFT) - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, SLJIT_IMM, -1)); - -if (common->mode == PCRE2_JIT_PARTIAL_SOFT) - { - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->hit_start, SLJIT_IMM, 0); - add_jump(compiler, backtracks, JUMP(SLJIT_JUMP)); - } -else - { - if (common->partialmatchlabel != NULL) - JUMPTO(SLJIT_JUMP, common->partialmatchlabel); - else - add_jump(compiler, &common->partialmatch, JUMP(SLJIT_JUMP)); - } -JUMPHERE(jump); -} - -static void process_partial_match(compiler_common *common) -{ -DEFINE_COMPILER; -struct sljit_jump *jump; - -/* Partial matching mode. */ -if (common->mode == PCRE2_JIT_PARTIAL_SOFT) - { - jump = CMP(SLJIT_GREATER_EQUAL, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, STR_PTR, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->hit_start, SLJIT_IMM, 0); - JUMPHERE(jump); - } -else if (common->mode == PCRE2_JIT_PARTIAL_HARD) - { - if (common->partialmatchlabel != NULL) - CMPTO(SLJIT_LESS, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, STR_PTR, 0, common->partialmatchlabel); - else - add_jump(compiler, &common->partialmatch, CMP(SLJIT_LESS, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, STR_PTR, 0)); - } -} - -static void detect_partial_match_to(compiler_common *common, struct sljit_label *label) -{ -DEFINE_COMPILER; - -CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, label); -process_partial_match(common); -} - -static void peek_char(compiler_common *common, sljit_u32 max, sljit_s32 dst, sljit_sw dstw, jump_list **backtracks) -{ -/* Reads the character into TMP1, keeps STR_PTR. -Does not check STR_END. TMP2, dst, RETURN_ADDR Destroyed. 
*/ -DEFINE_COMPILER; -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -struct sljit_jump *jump; -#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */ - -SLJIT_UNUSED_ARG(max); -SLJIT_UNUSED_ARG(dst); -SLJIT_UNUSED_ARG(dstw); -SLJIT_UNUSED_ARG(backtracks); - -OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); - -#ifdef SUPPORT_UNICODE -#if PCRE2_CODE_UNIT_WIDTH == 8 -if (common->utf) - { - if (max < 128) return; - - jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x80); - OP1(SLJIT_MOV, dst, dstw, STR_PTR, 0); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - add_jump(compiler, common->invalid_utf ? &common->utfreadchar_invalid : &common->utfreadchar, JUMP(SLJIT_FAST_CALL)); - OP1(SLJIT_MOV, STR_PTR, 0, dst, dstw); - if (backtracks && common->invalid_utf) - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); - JUMPHERE(jump); - } -#elif PCRE2_CODE_UNIT_WIDTH == 16 -if (common->utf) - { - if (max < 0xd800) return; - - OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800); - - if (common->invalid_utf) - { - jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800); - OP1(SLJIT_MOV, dst, dstw, STR_PTR, 0); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL)); - OP1(SLJIT_MOV, STR_PTR, 0, dst, dstw); - if (backtracks && common->invalid_utf) - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); - } - else - { - /* TMP2 contains the high surrogate. 
*/ - jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800); - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); - OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10); - OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000 - 0xdc00); - OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); - } - - JUMPHERE(jump); - } -#elif PCRE2_CODE_UNIT_WIDTH == 32 -if (common->invalid_utf) - { - if (max < 0xd800) return; - - if (backtracks != NULL) - { - OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800); - add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000)); - add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800)); - } - else - { - OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800); - OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x110000); - CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, INVALID_UTF_CHAR); - OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800); - CMOV(SLJIT_LESS, TMP1, SLJIT_IMM, INVALID_UTF_CHAR); - } - } -#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ -#endif /* SUPPORT_UNICODE */ -} - -static void peek_char_back(compiler_common *common, sljit_u32 max, jump_list **backtracks) -{ -/* Reads one character back without moving STR_PTR. TMP2 must -contain the start of the subject buffer. Affects TMP1, TMP2, and RETURN_ADDR. 
*/ -DEFINE_COMPILER; - -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -struct sljit_jump *jump; -#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */ - -SLJIT_UNUSED_ARG(max); -SLJIT_UNUSED_ARG(backtracks); - -OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1)); - -#ifdef SUPPORT_UNICODE -#if PCRE2_CODE_UNIT_WIDTH == 8 -if (common->utf) - { - if (max < 128) return; - - jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x80); - if (common->invalid_utf) - { - add_jump(compiler, &common->utfpeakcharback_invalid, JUMP(SLJIT_FAST_CALL)); - if (backtracks != NULL) - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); - } - else - add_jump(compiler, &common->utfpeakcharback, JUMP(SLJIT_FAST_CALL)); - JUMPHERE(jump); - } -#elif PCRE2_CODE_UNIT_WIDTH == 16 -if (common->utf) - { - if (max < 0xd800) return; - - if (common->invalid_utf) - { - jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xd800); - add_jump(compiler, &common->utfpeakcharback_invalid, JUMP(SLJIT_FAST_CALL)); - if (backtracks != NULL) - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); - } - else - { - OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xdc00); - jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xdc00); - /* TMP2 contains the low surrogate. 
*/ - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2)); - OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x10000); - OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800); - OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 10); - OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); - } - JUMPHERE(jump); - } -#elif PCRE2_CODE_UNIT_WIDTH == 32 -if (common->invalid_utf) - { - OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800); - add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000)); - add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800)); - } -#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ -#endif /* SUPPORT_UNICODE */ -} - -#define READ_CHAR_UPDATE_STR_PTR 0x1 -#define READ_CHAR_UTF8_NEWLINE 0x2 -#define READ_CHAR_NEWLINE (READ_CHAR_UPDATE_STR_PTR | READ_CHAR_UTF8_NEWLINE) -#define READ_CHAR_VALID_UTF 0x4 - -static void read_char(compiler_common *common, sljit_u32 min, sljit_u32 max, - jump_list **backtracks, sljit_u32 options) -{ -/* Reads the precise value of a character into TMP1, if the character is -between min and max (c >= min && c <= max). Otherwise it returns with a value -outside the range. Does not check STR_END. 
*/ -DEFINE_COMPILER; -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -struct sljit_jump *jump; -#endif -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 -struct sljit_jump *jump2; -#endif - -SLJIT_UNUSED_ARG(min); -SLJIT_UNUSED_ARG(max); -SLJIT_UNUSED_ARG(backtracks); -SLJIT_UNUSED_ARG(options); -SLJIT_ASSERT(min <= max); - -OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - -#ifdef SUPPORT_UNICODE -#if PCRE2_CODE_UNIT_WIDTH == 8 -if (common->utf) - { - if (max < 128 && !(options & READ_CHAR_UPDATE_STR_PTR)) return; - - if (common->invalid_utf && !(options & READ_CHAR_VALID_UTF)) - { - jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x80); - - if (options & READ_CHAR_UTF8_NEWLINE) - add_jump(compiler, &common->utfreadnewline_invalid, JUMP(SLJIT_FAST_CALL)); - else - add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL)); - - if (backtracks != NULL) - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); - JUMPHERE(jump); - return; - } - - jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xc0); - if (min >= 0x10000) - { - OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xf0); - if (options & READ_CHAR_UPDATE_STR_PTR) - OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); - jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0x7); - OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); - OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f); - OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); - OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); - OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); - OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); - OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); - OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2)); - if (!(options & READ_CHAR_UPDATE_STR_PTR)) - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 
IN_UCHARS(3)); - OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); - OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); - OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); - JUMPHERE(jump2); - if (options & READ_CHAR_UPDATE_STR_PTR) - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0); - } - else if (min >= 0x800 && max <= 0xffff) - { - OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xe0); - if (options & READ_CHAR_UPDATE_STR_PTR) - OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); - jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0xf); - OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); - OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f); - OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); - OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); - if (!(options & READ_CHAR_UPDATE_STR_PTR)) - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); - OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); - OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); - OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); - JUMPHERE(jump2); - if (options & READ_CHAR_UPDATE_STR_PTR) - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0); - } - else if (max >= 0x800) - { - add_jump(compiler, &common->utfreadchar, JUMP(SLJIT_FAST_CALL)); - } - else if (max < 128) - { - OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0); - } - else - { - OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); - if (!(options & READ_CHAR_UPDATE_STR_PTR)) - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - else - OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); - OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f); - OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); - OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); - OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); - if (options & 
READ_CHAR_UPDATE_STR_PTR) - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0); - } - JUMPHERE(jump); - } -#elif PCRE2_CODE_UNIT_WIDTH == 16 -if (common->utf) - { - if (max < 0xd800 && !(options & READ_CHAR_UPDATE_STR_PTR)) return; - - if (common->invalid_utf && !(options & READ_CHAR_VALID_UTF)) - { - OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800); - jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800); - - if (options & READ_CHAR_UTF8_NEWLINE) - add_jump(compiler, &common->utfreadnewline_invalid, JUMP(SLJIT_FAST_CALL)); - else - add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL)); - - if (backtracks != NULL) - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); - JUMPHERE(jump); - return; - } - - if (max >= 0x10000) - { - OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800); - jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800); - /* TMP2 contains the high surrogate. */ - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); - OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000 - 0xdc00); - OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); - JUMPHERE(jump); - return; - } - - /* Skip low surrogate if necessary. 
*/ - OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800); - - if (sljit_has_cpu_feature(SLJIT_HAS_CMOV) && !HAS_VIRTUAL_REGISTERS) - { - if (options & READ_CHAR_UPDATE_STR_PTR) - OP2(SLJIT_ADD, RETURN_ADDR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0x400); - if (options & READ_CHAR_UPDATE_STR_PTR) - CMOV(SLJIT_LESS, STR_PTR, RETURN_ADDR, 0); - if (max >= 0xd800) - CMOV(SLJIT_LESS, TMP1, SLJIT_IMM, 0x10000); - } - else - { - jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x400); - if (options & READ_CHAR_UPDATE_STR_PTR) - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - if (max >= 0xd800) - OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0x10000); - JUMPHERE(jump); - } - } -#elif PCRE2_CODE_UNIT_WIDTH == 32 -if (common->invalid_utf) - { - if (backtracks != NULL) - { - OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800); - add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000)); - add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800)); - } - else - { - OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800); - OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x110000); - CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, INVALID_UTF_CHAR); - OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800); - CMOV(SLJIT_LESS, TMP1, SLJIT_IMM, INVALID_UTF_CHAR); - } - } -#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ -#endif /* SUPPORT_UNICODE */ -} - -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 - -static BOOL is_char7_bitset(const sljit_u8 *bitset, BOOL nclass) -{ -/* Tells whether the character codes below 128 are enough -to determine a match. */ -const sljit_u8 value = nclass ? 
0xff : 0; -const sljit_u8 *end = bitset + 32; - -bitset += 16; -do - { - if (*bitset++ != value) - return FALSE; - } -while (bitset < end); -return TRUE; -} - -static void read_char7_type(compiler_common *common, jump_list **backtracks, BOOL negated) -{ -/* Reads the precise character type of a character into TMP1, if the character -is less than 128. Otherwise it returns with zero. Does not check STR_END. The -full_read argument tells whether characters above max are accepted or not. */ -DEFINE_COMPILER; -struct sljit_jump *jump; - -SLJIT_ASSERT(common->utf); - -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - -/* All values > 127 are zero in ctypes. */ -OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); - -if (negated) - { - jump = CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0x80); - - if (common->invalid_utf) - { - add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL)); - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); - } - else - { - OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(utf8_table4) - 0xc0); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0); - } - JUMPHERE(jump); - } -} - -#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */ - -static void read_char8_type(compiler_common *common, jump_list **backtracks, BOOL negated) -{ -/* Reads the character type into TMP1, updates STR_PTR. Does not check STR_END. 
*/ -DEFINE_COMPILER; -#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8 -struct sljit_jump *jump; -#endif -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 -struct sljit_jump *jump2; -#endif - -SLJIT_UNUSED_ARG(backtracks); -SLJIT_UNUSED_ARG(negated); - -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 -if (common->utf) - { - /* The result of this read may be unused, but saves an "else" part. */ - OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); - jump = CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0x80); - - if (!negated) - { - if (common->invalid_utf) - add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0)); - - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2); - if (common->invalid_utf) - add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe0 - 0xc2)); - - OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); - OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0); - OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x80); - if (common->invalid_utf) - add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40)); - - OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); - jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 255); - OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); - JUMPHERE(jump2); - } - else if (common->invalid_utf) - { - add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL)); - OP1(SLJIT_MOV, TMP2, 0, TMP1, 0); - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); - - OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); - jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 255); - OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); - JUMPHERE(jump2); - } - else - add_jump(compiler, 
&common->utfreadtype8, JUMP(SLJIT_FAST_CALL)); - - JUMPHERE(jump); - return; - } -#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */ - -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 32 -if (common->invalid_utf && negated) - add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x110000)); -#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 32 */ - -#if PCRE2_CODE_UNIT_WIDTH != 8 -/* The ctypes array contains only 256 values. */ -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); -jump = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 255); -#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */ -OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); -#if PCRE2_CODE_UNIT_WIDTH != 8 -JUMPHERE(jump); -#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */ - -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 16 -if (common->utf && negated) - { - /* Skip low surrogate if necessary. */ - if (!common->invalid_utf) - { - OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xd800); - - if (sljit_has_cpu_feature(SLJIT_HAS_CMOV) && !HAS_VIRTUAL_REGISTERS) - { - OP2(SLJIT_ADD, RETURN_ADDR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0x400); - CMOV(SLJIT_LESS, STR_PTR, RETURN_ADDR, 0); - } - else - { - jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x400); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - JUMPHERE(jump); - } - return; - } - - OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xd800); - jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800); - add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x400)); - add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0)); - - OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xdc00); - add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, 
SLJIT_IMM, 0x400)); - - JUMPHERE(jump); - return; - } -#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 16 */ -} - -static void move_back(compiler_common *common, jump_list **backtracks, BOOL must_be_valid) -{ -/* Goes one character back. Affects STR_PTR and TMP1. If must_be_valid is TRUE, -TMP2 is not used. Otherwise TMP2 must contain the start of the subject buffer, -and it is destroyed. Does not modify STR_PTR for invalid character sequences. */ -DEFINE_COMPILER; - -SLJIT_UNUSED_ARG(backtracks); -SLJIT_UNUSED_ARG(must_be_valid); - -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -struct sljit_jump *jump; -#endif - -#ifdef SUPPORT_UNICODE -#if PCRE2_CODE_UNIT_WIDTH == 8 -struct sljit_label *label; - -if (common->utf) - { - if (!must_be_valid && common->invalid_utf) - { - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1)); - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x80); - add_jump(compiler, &common->utfmoveback_invalid, JUMP(SLJIT_FAST_CALL)); - if (backtracks != NULL) - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0)); - JUMPHERE(jump); - return; - } - - label = LABEL(); - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1)); - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0); - CMPTO(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0x80, label); - return; - } -#elif PCRE2_CODE_UNIT_WIDTH == 16 -if (common->utf) - { - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1)); - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - - if (!must_be_valid && common->invalid_utf) - { - OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800); - jump = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xe000 - 0xd800); - add_jump(compiler, &common->utfmoveback_invalid, JUMP(SLJIT_FAST_CALL)); - if (backtracks != NULL) - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 
0)); - JUMPHERE(jump); - return; - } - - /* Skip low surrogate if necessary. */ - OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00); - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xdc00); - OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_EQUAL); - OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, UCHAR_SHIFT); - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP1, 0); - return; - } -#elif PCRE2_CODE_UNIT_WIDTH == 32 -if (common->invalid_utf && !must_be_valid) - { - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1)); - if (backtracks != NULL) - { - add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000)); - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - return; - } - - OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x110000); - OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_LESS); - OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, UCHAR_SHIFT); - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP1, 0); - return; - } -#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ -#endif /* SUPPORT_UNICODE */ -OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -} - -static void check_newlinechar(compiler_common *common, int nltype, jump_list **backtracks, BOOL jumpifmatch) -{ -/* Character comes in TMP1. Checks if it is a newline. TMP2 may be destroyed. */ -DEFINE_COMPILER; -struct sljit_jump *jump; - -if (nltype == NLTYPE_ANY) - { - add_jump(compiler, &common->anynewline, JUMP(SLJIT_FAST_CALL)); - sljit_set_current_flags(compiler, SLJIT_SET_Z); - add_jump(compiler, backtracks, JUMP(jumpifmatch ? 
SLJIT_NOT_ZERO : SLJIT_ZERO)); - } -else if (nltype == NLTYPE_ANYCRLF) - { - if (jumpifmatch) - { - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR)); - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL)); - } - else - { - jump = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR); - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL)); - JUMPHERE(jump); - } - } -else - { - SLJIT_ASSERT(nltype == NLTYPE_FIXED && common->newline < 256); - add_jump(compiler, backtracks, CMP(jumpifmatch ? SLJIT_EQUAL : SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline)); - } -} - -#ifdef SUPPORT_UNICODE - -#if PCRE2_CODE_UNIT_WIDTH == 8 -static void do_utfreadchar(compiler_common *common) -{ -/* Fast decoding a UTF-8 character. TMP1 contains the first byte -of the character (>= 0xc0). Return char value in TMP1. */ -DEFINE_COMPILER; -struct sljit_jump *jump; - -sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); -OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); -OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); -OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); - -/* Searching for the first zero. */ -OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800); -jump = JUMP(SLJIT_NOT_ZERO); -/* Two byte sequence. */ -OP2(SLJIT_XOR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3000); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); - -JUMPHERE(jump); -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); -OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); -OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); -OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); - -OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x10000); -jump = JUMP(SLJIT_NOT_ZERO); -/* Three byte sequence. 
*/ -OP2(SLJIT_XOR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xe0000); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); - -/* Four byte sequence. */ -JUMPHERE(jump); -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2)); -OP2(SLJIT_XOR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xf0000); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3)); -OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); -OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); -OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); -} - -static void do_utfreadtype8(compiler_common *common) -{ -/* Fast decoding a UTF-8 character type. TMP2 contains the first byte -of the character (>= 0xc0). Return value in TMP1. */ -DEFINE_COMPILER; -struct sljit_jump *jump; -struct sljit_jump *compare; - -sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); - -OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0x20); -jump = JUMP(SLJIT_NOT_ZERO); -/* Two byte sequence. */ -OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x1f); -/* The upper 5 bits are known at this point. */ -compare = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0x3); -OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); -OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f); -OP2(SLJIT_OR, TMP2, 0, TMP2, 0, TMP1, 0); -OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); - -JUMPHERE(compare); -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); - -/* We only have types for characters less than 256. 
*/ -JUMPHERE(jump); -OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(utf8_table4) - 0xc0); -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); -} - -static void do_utfreadchar_invalid(compiler_common *common) -{ -/* Slow decoding a UTF-8 character. TMP1 contains the first byte -of the character (>= 0xc0). Return char value in TMP1. STR_PTR is -undefined for invalid characters. */ -DEFINE_COMPILER; -sljit_s32 i; -sljit_s32 has_cmov = sljit_has_cpu_feature(SLJIT_HAS_CMOV); -struct sljit_jump *jump; -struct sljit_jump *buffer_end_close; -struct sljit_label *three_byte_entry; -struct sljit_label *exit_invalid_label; -struct sljit_jump *exit_invalid[11]; - -sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); - -OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc2); - -/* Usually more than 3 characters remained in the subject buffer. */ -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3)); - -/* Not a valid start of a multi-byte sequence, no more bytes read. */ -exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xf5 - 0xc2); - -buffer_end_close = CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0); - -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-3)); -OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); -/* If TMP2 is in 0x80-0xbf range, TMP1 is also increased by (0x2 << 6). */ -OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); -OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80); -exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40); - -OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800); -jump = JUMP(SLJIT_NOT_ZERO); - -OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); - -JUMPHERE(jump); - -/* Three-byte sequence. 
*/ -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2)); -OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); -OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80); -OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -if (has_cmov) - { - OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0x40); - CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, 0x20000); - exit_invalid[2] = NULL; - } -else - exit_invalid[2] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40); - -OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x10000); -jump = JUMP(SLJIT_NOT_ZERO); - -three_byte_entry = LABEL(); - -OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x2d800); -if (has_cmov) - { - OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800); - CMOV(SLJIT_LESS, TMP1, SLJIT_IMM, INVALID_UTF_CHAR - 0xd800); - exit_invalid[3] = NULL; - } -else - exit_invalid[3] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800); -OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800); -OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - -if (has_cmov) - { - OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800); - CMOV(SLJIT_LESS, TMP1, SLJIT_IMM, INVALID_UTF_CHAR); - exit_invalid[4] = NULL; - } -else - exit_invalid[4] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); - -JUMPHERE(jump); - -/* Four-byte sequence. 
*/ -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1)); -OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); -OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80); -OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -if (has_cmov) - { - OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0x40); - CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, 0); - exit_invalid[5] = NULL; - } -else - exit_invalid[5] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40); - -OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc10000); -if (has_cmov) - { - OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x100000); - CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, INVALID_UTF_CHAR - 0x10000); - exit_invalid[6] = NULL; - } -else - exit_invalid[6] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x100000); - -OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); - -JUMPHERE(buffer_end_close); -OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); -exit_invalid[7] = CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0); - -/* Two-byte sequence. */ -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1)); -OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); -/* If TMP2 is in 0x80-0xbf range, TMP1 is also increased by (0x2 << 6). */ -OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); -OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80); -exit_invalid[8] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40); - -OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800); -jump = JUMP(SLJIT_NOT_ZERO); - -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); - -/* Three-byte sequence. 
*/ -JUMPHERE(jump); -exit_invalid[9] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); - -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); -OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); -OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80); -OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -if (has_cmov) - { - OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0x40); - CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, INVALID_UTF_CHAR); - exit_invalid[10] = NULL; - } -else - exit_invalid[10] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40); - -/* One will be substracted from STR_PTR later. */ -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); - -/* Four byte sequences are not possible. */ -CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x30000, three_byte_entry); - -exit_invalid_label = LABEL(); -for (i = 0; i < 11; i++) - sljit_set_label(exit_invalid[i], exit_invalid_label); - -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); -} - -static void do_utfreadnewline_invalid(compiler_common *common) -{ -/* Slow decoding a UTF-8 character, specialized for newlines. -TMP1 contains the first byte of the character (>= 0xc0). Return -char value in TMP1. */ -DEFINE_COMPILER; -struct sljit_label *loop; -struct sljit_label *skip_start; -struct sljit_label *three_byte_exit; -struct sljit_jump *jump[5]; - -sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); - -if (common->nltype != NLTYPE_ANY) - { - SLJIT_ASSERT(common->nltype != NLTYPE_FIXED || common->newline < 128); - - /* All newlines are ascii, just skip intermediate octets. 
*/ - jump[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); - loop = LABEL(); - if (sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_SUPP | SLJIT_MEM_POST, TMP2, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)) == SLJIT_SUCCESS) - sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_POST, TMP2, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); - else - { - OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - } - - OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc0); - CMPTO(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, 0x80, loop); - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - - JUMPHERE(jump[0]); - - OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR); - OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); - return; - } - -jump[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - -jump[1] = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0xc2); -jump[2] = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0xe2); - -skip_start = LABEL(); -OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc0); -jump[3] = CMP(SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, 0x80); - -/* Skip intermediate octets. */ -loop = LABEL(); -jump[4] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc0); -CMPTO(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, 0x80, loop); - -JUMPHERE(jump[3]); -OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - -three_byte_exit = LABEL(); -JUMPHERE(jump[0]); -JUMPHERE(jump[4]); - -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); - -/* Two byte long newline: 0x85. 
*/ -JUMPHERE(jump[1]); -CMPTO(SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, 0x85, skip_start); - -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0x85); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); - -/* Three byte long newlines: 0x2028 and 0x2029. */ -JUMPHERE(jump[2]); -CMPTO(SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, 0x80, skip_start); -CMPTO(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0, three_byte_exit); - -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - -OP2(SLJIT_SUB, TMP1, 0, TMP2, 0, SLJIT_IMM, 0x80); -CMPTO(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x40, skip_start); - -OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0x2000); -OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); -} - -static void do_utfmoveback_invalid(compiler_common *common) -{ -/* Goes one character back. */ -DEFINE_COMPILER; -sljit_s32 i; -struct sljit_jump *jump; -struct sljit_jump *buffer_start_close; -struct sljit_label *exit_ok_label; -struct sljit_label *exit_invalid_label; -struct sljit_jump *exit_invalid[7]; - -sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); - -OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3)); -exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xc0); - -/* Two-byte sequence. */ -buffer_start_close = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0); - -OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2)); - -OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0); -jump = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x20); - -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 1); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); - -/* Three-byte sequence. 
*/ -JUMPHERE(jump); -exit_invalid[1] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, -0x40); - -OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); - -OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xe0); -jump = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x10); - -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 1); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); - -/* Four-byte sequence. */ -JUMPHERE(jump); -OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xe0 - 0x80); -exit_invalid[2] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x40); - -OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); -OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xf0); -exit_invalid[3] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x05); - -exit_ok_label = LABEL(); -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 1); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); - -/* Two-byte sequence. */ -JUMPHERE(buffer_start_close); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); - -exit_invalid[4] = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0); - -OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); - -OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0); -CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x20, exit_ok_label); - -/* Three-byte sequence. */ -OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -exit_invalid[5] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, -0x40); -exit_invalid[6] = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0); - -OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); - -OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xe0); -CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x10, exit_ok_label); - -/* Four-byte sequences are not possible. 
*/ - -exit_invalid_label = LABEL(); -sljit_set_label(exit_invalid[5], exit_invalid_label); -sljit_set_label(exit_invalid[6], exit_invalid_label); -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3)); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); - -JUMPHERE(exit_invalid[4]); -/* -2 + 4 = 2 */ -OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); - -exit_invalid_label = LABEL(); -for (i = 0; i < 4; i++) - sljit_set_label(exit_invalid[i], exit_invalid_label); -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(4)); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); -} - -static void do_utfpeakcharback(compiler_common *common) -{ -/* Peak a character back. Does not modify STR_PTR. */ -DEFINE_COMPILER; -struct sljit_jump *jump[2]; - -sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); - -OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2)); -OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0); -jump[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x20); - -OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-3)); -OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xe0); -jump[1] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x10); - -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-4)); -OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xe0 - 0x80); -OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf0); -OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); -OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); - -JUMPHERE(jump[1]); -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2)); -OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); -OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80); -OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); - -JUMPHERE(jump[0]); -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1)); -OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); -OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80); -OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); - -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 
0); -} - -static void do_utfpeakcharback_invalid(compiler_common *common) -{ -/* Peak a character back. Does not modify STR_PTR. */ -DEFINE_COMPILER; -sljit_s32 i; -sljit_s32 has_cmov = sljit_has_cpu_feature(SLJIT_HAS_CMOV); -struct sljit_jump *jump[2]; -struct sljit_label *two_byte_entry; -struct sljit_label *three_byte_entry; -struct sljit_label *exit_invalid_label; -struct sljit_jump *exit_invalid[8]; - -sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); - -OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(3)); -exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xc0); -jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, STR_PTR, 0); - -/* Two-byte sequence. */ -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2)); -OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2); -jump[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x1e); - -two_byte_entry = LABEL(); -OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); -/* If TMP1 is in 0x80-0xbf range, TMP1 is also increased by (0x2 << 6). */ -OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); - -JUMPHERE(jump[1]); -OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2 - 0x80); -OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x80); -exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40); -OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); -OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); - -/* Three-byte sequence. 
*/ -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-3)); -OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xe0); -jump[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x10); - -three_byte_entry = LABEL(); -OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 12); -OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); - -OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800); -if (has_cmov) - { - OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800); - CMOV(SLJIT_LESS, TMP1, SLJIT_IMM, -0xd800); - exit_invalid[2] = NULL; - } -else - exit_invalid[2] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800); - -OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800); -if (has_cmov) - { - OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800); - CMOV(SLJIT_LESS, TMP1, SLJIT_IMM, INVALID_UTF_CHAR); - exit_invalid[3] = NULL; - } -else - exit_invalid[3] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800); - -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); - -JUMPHERE(jump[1]); -OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xe0 - 0x80); -exit_invalid[4] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40); -OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 12); -OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); - -/* Four-byte sequence. */ -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-4)); -OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000); -OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf0); -OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 18); -/* ADD is used instead of OR because of the SUB 0x10000 above. 
*/ -OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); - -if (has_cmov) - { - OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x100000); - CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, INVALID_UTF_CHAR - 0x10000); - exit_invalid[5] = NULL; - } -else - exit_invalid[5] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x100000); - -OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); - -JUMPHERE(jump[0]); -OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1)); -jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, STR_PTR, 0); - -/* Two-byte sequence. */ -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2)); -OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2); -CMPTO(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0x1e, two_byte_entry); - -OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2 - 0x80); -OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x80); -exit_invalid[6] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40); -OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); -OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); - -/* Three-byte sequence. */ -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-3)); -OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xe0); -CMPTO(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0x10, three_byte_entry); - -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); - -JUMPHERE(jump[0]); -exit_invalid[7] = CMP(SLJIT_GREATER, TMP2, 0, STR_PTR, 0); - -/* Two-byte sequence. 
*/ -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2)); -OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2); -CMPTO(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0x1e, two_byte_entry); - -exit_invalid_label = LABEL(); -for (i = 0; i < 8; i++) - sljit_set_label(exit_invalid[i], exit_invalid_label); - -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); -} - -#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ - -#if PCRE2_CODE_UNIT_WIDTH == 16 - -static void do_utfreadchar_invalid(compiler_common *common) -{ -/* Slow decoding a UTF-16 character. TMP1 contains the first half -of the character (>= 0xd800). Return char value in TMP1. STR_PTR is -undefined for invalid characters. */ -DEFINE_COMPILER; -struct sljit_jump *exit_invalid[3]; - -sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); - -/* TMP2 contains the high surrogate. */ -exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xdc00); -exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); - -OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); -OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - -OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xdc00); -OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x10000); -exit_invalid[2] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x400); - -OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); - -JUMPHERE(exit_invalid[0]); -JUMPHERE(exit_invalid[1]); -JUMPHERE(exit_invalid[2]); -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); -} - -static void do_utfreadnewline_invalid(compiler_common *common) -{ -/* Slow decoding a UTF-16 character, specialized for newlines. -TMP1 contains the first half of the character (>= 0xd800). Return -char value in TMP1. 
*/ - -DEFINE_COMPILER; -struct sljit_jump *exit_invalid[2]; - -sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); - -/* TMP2 contains the high surrogate. */ -exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); - -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); -exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xdc00); - -OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xdc00); -OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0x400); -OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS); -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0x10000); -OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, UCHAR_SHIFT); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0); - -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); - -JUMPHERE(exit_invalid[0]); -JUMPHERE(exit_invalid[1]); -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); -} - -static void do_utfmoveback_invalid(compiler_common *common) -{ -/* Goes one character back. */ -DEFINE_COMPILER; -struct sljit_jump *exit_invalid[3]; - -sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); - -exit_invalid[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x400); -exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, STR_PTR, 0); - -OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1)); -OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800); -exit_invalid[2] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x400); - -OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 1); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); - -JUMPHERE(exit_invalid[0]); -JUMPHERE(exit_invalid[1]); -JUMPHERE(exit_invalid[2]); - -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); -} - -static void do_utfpeakcharback_invalid(compiler_common *common) -{ -/* Peak a character back. Does not modify STR_PTR. 
*/ -DEFINE_COMPILER; -struct sljit_jump *jump; -struct sljit_jump *exit_invalid[3]; - -sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); - -jump = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xe000); -OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1)); -exit_invalid[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xdc00); -exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, STR_PTR, 0); - -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2)); -OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000 - 0xdc00); -OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xd800); -exit_invalid[2] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x400); -OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10); -OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); - -JUMPHERE(jump); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); - -JUMPHERE(exit_invalid[0]); -JUMPHERE(exit_invalid[1]); -JUMPHERE(exit_invalid[2]); - -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); -} - -#endif /* PCRE2_CODE_UNIT_WIDTH == 16 */ - -/* UCD_BLOCK_SIZE must be 128 (see the assert below). */ -#define UCD_BLOCK_MASK 127 -#define UCD_BLOCK_SHIFT 7 - -static void do_getucd(compiler_common *common) -{ -/* Search the UCD record for the character comes in TMP1. -Returns chartype in TMP1 and UCD offset in TMP2. 
*/ -DEFINE_COMPILER; -#if PCRE2_CODE_UNIT_WIDTH == 32 -struct sljit_jump *jump; -#endif - -#if defined SLJIT_DEBUG && SLJIT_DEBUG -/* dummy_ucd_record */ -const ucd_record *record = GET_UCD(UNASSIGNED_UTF_CHAR); -SLJIT_ASSERT(record->script == ucp_Unknown && record->chartype == ucp_Cn && record->gbprop == ucp_gbOther); -SLJIT_ASSERT(record->caseset == 0 && record->other_case == 0); -#endif - -SLJIT_ASSERT(UCD_BLOCK_SIZE == 128 && sizeof(ucd_record) == 12); - -sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); - -#if PCRE2_CODE_UNIT_WIDTH == 32 -if (!common->utf) - { - jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, MAX_UTF_CODE_POINT + 1); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, UNASSIGNED_UTF_CHAR); - JUMPHERE(jump); - } -#endif - -OP2(SLJIT_LSHR, TMP2, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_SHIFT); -OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 1); -OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_stage1)); -OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_MASK); -OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, UCD_BLOCK_SHIFT); -OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); -OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_stage2)); -OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM2(TMP2, TMP1), 1); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); -} - -static void do_getucdtype(compiler_common *common) -{ -/* Search the UCD record for the character comes in TMP1. -Returns chartype in TMP1 and UCD offset in TMP2. 
*/ -DEFINE_COMPILER; -#if PCRE2_CODE_UNIT_WIDTH == 32 -struct sljit_jump *jump; -#endif - -#if defined SLJIT_DEBUG && SLJIT_DEBUG -/* dummy_ucd_record */ -const ucd_record *record = GET_UCD(UNASSIGNED_UTF_CHAR); -SLJIT_ASSERT(record->script == ucp_Unknown && record->chartype == ucp_Cn && record->gbprop == ucp_gbOther); -SLJIT_ASSERT(record->caseset == 0 && record->other_case == 0); -#endif - -SLJIT_ASSERT(UCD_BLOCK_SIZE == 128 && sizeof(ucd_record) == 12); - -sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); - -#if PCRE2_CODE_UNIT_WIDTH == 32 -if (!common->utf) - { - jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, MAX_UTF_CODE_POINT + 1); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, UNASSIGNED_UTF_CHAR); - JUMPHERE(jump); - } -#endif - -OP2(SLJIT_LSHR, TMP2, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_SHIFT); -OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 1); -OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_stage1)); -OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_MASK); -OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, UCD_BLOCK_SHIFT); -OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); -OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_stage2)); -OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM2(TMP2, TMP1), 1); - -/* TMP2 is multiplied by 12. Same as (TMP2 << 2) + ((TMP2 << 2) << 1). 
*/ -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype)); -OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 2); -OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); -OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 1); - -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); -} - -#endif /* SUPPORT_UNICODE */ - -static SLJIT_INLINE struct sljit_label *mainloop_entry(compiler_common *common) -{ -DEFINE_COMPILER; -struct sljit_label *mainloop; -struct sljit_label *newlinelabel = NULL; -struct sljit_jump *start; -struct sljit_jump *end = NULL; -struct sljit_jump *end2 = NULL; -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -struct sljit_label *loop; -struct sljit_jump *jump; -#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */ -jump_list *newline = NULL; -sljit_u32 overall_options = common->re->overall_options; -BOOL hascrorlf = (common->re->flags & PCRE2_HASCRORLF) != 0; -BOOL newlinecheck = FALSE; -BOOL readuchar = FALSE; - -if (!(hascrorlf || (overall_options & PCRE2_FIRSTLINE) != 0) - && (common->nltype == NLTYPE_ANY || common->nltype == NLTYPE_ANYCRLF || common->newline > 255)) - newlinecheck = TRUE; - -SLJIT_ASSERT(common->abort_label == NULL); - -if ((overall_options & PCRE2_FIRSTLINE) != 0) - { - /* Search for the end of the first line. 
*/ - SLJIT_ASSERT(common->match_end_ptr != 0); - OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0); - - if (common->nltype == NLTYPE_FIXED && common->newline > 255) - { - mainloop = LABEL(); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - end = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1)); - OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); - CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff, mainloop); - CMPTO(SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff, mainloop); - JUMPHERE(end); - OP2(SLJIT_SUB, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - } - else - { - end = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); - mainloop = LABEL(); - /* Continual stores does not cause data dependency. */ - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr, STR_PTR, 0); - read_char(common, common->nlmin, common->nlmax, NULL, READ_CHAR_NEWLINE); - check_newlinechar(common, common->nltype, &newline, TRUE); - CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, mainloop); - JUMPHERE(end); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr, STR_PTR, 0); - set_jumps(newline, LABEL()); - } - - OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0); - } -else if ((overall_options & PCRE2_USE_OFFSET_LIMIT) != 0) - { - /* Check whether offset limit is set and valid. 
*/ - SLJIT_ASSERT(common->match_end_ptr != 0); - - if (HAS_VIRTUAL_REGISTERS) - { - OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, offset_limit)); - } - else - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, offset_limit)); - - OP1(SLJIT_MOV, TMP2, 0, STR_END, 0); - end = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, (sljit_sw) PCRE2_UNSET); - if (HAS_VIRTUAL_REGISTERS) - OP1(SLJIT_MOV, TMP2, 0, ARGUMENTS, 0); - else - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, begin)); - -#if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32 - OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, UCHAR_SHIFT); -#endif /* PCRE2_CODE_UNIT_WIDTH == [16|32] */ - if (HAS_VIRTUAL_REGISTERS) - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, begin)); - - OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0); - end2 = CMP(SLJIT_LESS_EQUAL, TMP2, 0, STR_END, 0); - OP1(SLJIT_MOV, TMP2, 0, STR_END, 0); - JUMPHERE(end2); - OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, PCRE2_ERROR_NOMATCH); - add_jump(compiler, &common->abort, CMP(SLJIT_LESS, TMP2, 0, STR_PTR, 0)); - JUMPHERE(end); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr, TMP2, 0); - } - -start = JUMP(SLJIT_JUMP); - -if (newlinecheck) - { - newlinelabel = LABEL(); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - end = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, common->newline & 0xff); - OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_EQUAL); -#if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32 - OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, UCHAR_SHIFT); -#endif /* PCRE2_CODE_UNIT_WIDTH == [16|32] */ - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); - end2 = JUMP(SLJIT_JUMP); - } - -mainloop = LABEL(); - -/* Increasing the STR_PTR here requires one less 
jump in the most common case. */ -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -if (common->utf && !common->invalid_utf) readuchar = TRUE; -#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */ -if (newlinecheck) readuchar = TRUE; - -if (readuchar) - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); - -if (newlinecheck) - CMPTO(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff, newlinelabel); - -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -#if PCRE2_CODE_UNIT_WIDTH == 8 -if (common->invalid_utf) - { - /* Skip continuation code units. */ - loop = LABEL(); - jump = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x80); - CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x40, loop); - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - JUMPHERE(jump); - } -else if (common->utf) - { - jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xc0); - OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); - JUMPHERE(jump); - } -#elif PCRE2_CODE_UNIT_WIDTH == 16 -if (common->invalid_utf) - { - /* Skip continuation code units. 
*/ - loop = LABEL(); - jump = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xdc00); - CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x400, loop); - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - JUMPHERE(jump); - } -else if (common->utf) - { - OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800); - - if (sljit_has_cpu_feature(SLJIT_HAS_CMOV)) - { - OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x400); - CMOV(SLJIT_LESS, STR_PTR, TMP2, 0); - } - else - { - OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x400); - OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_LESS); - OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, UCHAR_SHIFT); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); - } - } -#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16] */ -#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */ -JUMPHERE(start); - -if (newlinecheck) - { - JUMPHERE(end); - JUMPHERE(end2); - } - -return mainloop; -} - - -static SLJIT_INLINE void add_prefix_char(PCRE2_UCHAR chr, fast_forward_char_data *chars, BOOL last) -{ -sljit_u32 i, count = chars->count; - -if (count == 255) - return; - -if (count == 0) - { - chars->count = 1; - chars->chars[0] = chr; - - if (last) - chars->last_count = 1; - return; - } - -for (i = 0; i < count; i++) - if (chars->chars[i] == chr) - return; - -if (count >= MAX_DIFF_CHARS) - { - chars->count = 255; - return; - } - -chars->chars[count] = chr; -chars->count = count + 1; - -if (last) - chars->last_count++; -} - -static int scan_prefix(compiler_common *common, PCRE2_SPTR cc, fast_forward_char_data *chars, int max_chars, sljit_u32 *rec_count) -{ -/* Recursive function, which scans prefix literals. 
*/ -BOOL last, any, class, caseless; -int len, repeat, len_save, consumed = 0; -sljit_u32 chr; /* Any unicode character. */ -sljit_u8 *bytes, *bytes_end, byte; -PCRE2_SPTR alternative, cc_save, oc; -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 -PCRE2_UCHAR othercase[4]; -#elif defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 16 -PCRE2_UCHAR othercase[2]; -#else -PCRE2_UCHAR othercase[1]; -#endif - -repeat = 1; -while (TRUE) - { - if (*rec_count == 0) - return 0; - (*rec_count)--; - - last = TRUE; - any = FALSE; - class = FALSE; - caseless = FALSE; - - switch (*cc) - { - case OP_CHARI: - caseless = TRUE; - /* Fall through */ - case OP_CHAR: - last = FALSE; - cc++; - break; - - case OP_SOD: - case OP_SOM: - case OP_SET_SOM: - case OP_NOT_WORD_BOUNDARY: - case OP_WORD_BOUNDARY: - case OP_EODN: - case OP_EOD: - case OP_CIRC: - case OP_CIRCM: - case OP_DOLL: - case OP_DOLLM: - /* Zero width assertions. */ - cc++; - continue; - - case OP_ASSERT: - case OP_ASSERT_NOT: - case OP_ASSERTBACK: - case OP_ASSERTBACK_NOT: - case OP_ASSERT_NA: - case OP_ASSERTBACK_NA: - cc = bracketend(cc); - continue; - - case OP_PLUSI: - case OP_MINPLUSI: - case OP_POSPLUSI: - caseless = TRUE; - /* Fall through */ - case OP_PLUS: - case OP_MINPLUS: - case OP_POSPLUS: - cc++; - break; - - case OP_EXACTI: - caseless = TRUE; - /* Fall through */ - case OP_EXACT: - repeat = GET2(cc, 1); - last = FALSE; - cc += 1 + IMM2_SIZE; - break; - - case OP_QUERYI: - case OP_MINQUERYI: - case OP_POSQUERYI: - caseless = TRUE; - /* Fall through */ - case OP_QUERY: - case OP_MINQUERY: - case OP_POSQUERY: - len = 1; - cc++; -#ifdef SUPPORT_UNICODE - if (common->utf && HAS_EXTRALEN(*cc)) len += GET_EXTRALEN(*cc); -#endif - max_chars = scan_prefix(common, cc + len, chars, max_chars, rec_count); - if (max_chars == 0) - return consumed; - last = FALSE; - break; - - case OP_KET: - cc += 1 + LINK_SIZE; - continue; - - case OP_ALT: - cc += GET(cc, 1); - continue; - - case OP_ONCE: - case OP_BRA: - case 
OP_BRAPOS: - case OP_CBRA: - case OP_CBRAPOS: - alternative = cc + GET(cc, 1); - while (*alternative == OP_ALT) - { - max_chars = scan_prefix(common, alternative + 1 + LINK_SIZE, chars, max_chars, rec_count); - if (max_chars == 0) - return consumed; - alternative += GET(alternative, 1); - } - - if (*cc == OP_CBRA || *cc == OP_CBRAPOS) - cc += IMM2_SIZE; - cc += 1 + LINK_SIZE; - continue; - - case OP_CLASS: -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 - if (common->utf && !is_char7_bitset((const sljit_u8 *)(cc + 1), FALSE)) - return consumed; -#endif - class = TRUE; - break; - - case OP_NCLASS: -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 - if (common->utf) return consumed; -#endif - class = TRUE; - break; - -#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8 - case OP_XCLASS: -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 - if (common->utf) return consumed; -#endif - any = TRUE; - cc += GET(cc, 1); - break; -#endif - - case OP_DIGIT: -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 - if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_digit, FALSE)) - return consumed; -#endif - any = TRUE; - cc++; - break; - - case OP_WHITESPACE: -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 - if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_space, FALSE)) - return consumed; -#endif - any = TRUE; - cc++; - break; - - case OP_WORDCHAR: -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 - if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_word, FALSE)) - return consumed; -#endif - any = TRUE; - cc++; - break; - - case OP_NOT: - case OP_NOTI: - cc++; - /* Fall through. 
*/ - case OP_NOT_DIGIT: - case OP_NOT_WHITESPACE: - case OP_NOT_WORDCHAR: - case OP_ANY: - case OP_ALLANY: -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 - if (common->utf) return consumed; -#endif - any = TRUE; - cc++; - break; - -#ifdef SUPPORT_UNICODE - case OP_NOTPROP: - case OP_PROP: -#if PCRE2_CODE_UNIT_WIDTH != 32 - if (common->utf) return consumed; -#endif - any = TRUE; - cc += 1 + 2; - break; -#endif - - case OP_TYPEEXACT: - repeat = GET2(cc, 1); - cc += 1 + IMM2_SIZE; - continue; - - case OP_NOTEXACT: - case OP_NOTEXACTI: -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 - if (common->utf) return consumed; -#endif - any = TRUE; - repeat = GET2(cc, 1); - cc += 1 + IMM2_SIZE + 1; - break; - - default: - return consumed; - } - - if (any) - { - do - { - chars->count = 255; - - consumed++; - if (--max_chars == 0) - return consumed; - chars++; - } - while (--repeat > 0); - - repeat = 1; - continue; - } - - if (class) - { - bytes = (sljit_u8*) (cc + 1); - cc += 1 + 32 / sizeof(PCRE2_UCHAR); - - switch (*cc) - { - case OP_CRSTAR: - case OP_CRMINSTAR: - case OP_CRPOSSTAR: - case OP_CRQUERY: - case OP_CRMINQUERY: - case OP_CRPOSQUERY: - max_chars = scan_prefix(common, cc + 1, chars, max_chars, rec_count); - if (max_chars == 0) - return consumed; - break; - - default: - case OP_CRPLUS: - case OP_CRMINPLUS: - case OP_CRPOSPLUS: - break; - - case OP_CRRANGE: - case OP_CRMINRANGE: - case OP_CRPOSRANGE: - repeat = GET2(cc, 1); - if (repeat <= 0) - return consumed; - break; - } - - do - { - if (bytes[31] & 0x80) - chars->count = 255; - else if (chars->count != 255) - { - bytes_end = bytes + 32; - chr = 0; - do - { - byte = *bytes++; - SLJIT_ASSERT((chr & 0x7) == 0); - if (byte == 0) - chr += 8; - else - { - do - { - if ((byte & 0x1) != 0) - add_prefix_char(chr, chars, TRUE); - byte >>= 1; - chr++; - } - while (byte != 0); - chr = (chr + 7) & ~7; - } - } - while (chars->count != 255 && bytes < bytes_end); - bytes = bytes_end - 32; - } - - 
consumed++; - if (--max_chars == 0) - return consumed; - chars++; - } - while (--repeat > 0); - - switch (*cc) - { - case OP_CRSTAR: - case OP_CRMINSTAR: - case OP_CRPOSSTAR: - return consumed; - - case OP_CRQUERY: - case OP_CRMINQUERY: - case OP_CRPOSQUERY: - cc++; - break; - - case OP_CRRANGE: - case OP_CRMINRANGE: - case OP_CRPOSRANGE: - if (GET2(cc, 1) != GET2(cc, 1 + IMM2_SIZE)) - return consumed; - cc += 1 + 2 * IMM2_SIZE; - break; - } - - repeat = 1; - continue; - } - - len = 1; -#ifdef SUPPORT_UNICODE - if (common->utf && HAS_EXTRALEN(*cc)) len += GET_EXTRALEN(*cc); -#endif - - if (caseless && char_has_othercase(common, cc)) - { -#ifdef SUPPORT_UNICODE - if (common->utf) - { - GETCHAR(chr, cc); - if ((int)PRIV(ord2utf)(char_othercase(common, chr), othercase) != len) - return consumed; - } - else -#endif - { - chr = *cc; -#ifdef SUPPORT_UNICODE - if (common->ucp && chr > 127) - othercase[0] = UCD_OTHERCASE(chr); - else -#endif - othercase[0] = TABLE_GET(chr, common->fcc, chr); - } - } - else - { - caseless = FALSE; - othercase[0] = 0; /* Stops compiler warning - PH */ - } - - len_save = len; - cc_save = cc; - while (TRUE) - { - oc = othercase; - do - { - len--; - consumed++; - - chr = *cc; - add_prefix_char(*cc, chars, len == 0); - - if (caseless) - add_prefix_char(*oc, chars, len == 0); - - if (--max_chars == 0) - return consumed; - chars++; - cc++; - oc++; - } - while (len > 0); - - if (--repeat == 0) - break; - - len = len_save; - cc = cc_save; - } - - repeat = 1; - if (last) - return consumed; - } -} - -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -static void jumpto_if_not_utf_char_start(struct sljit_compiler *compiler, sljit_s32 reg, struct sljit_label *label) -{ -#if PCRE2_CODE_UNIT_WIDTH == 8 -OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xc0); -CMPTO(SLJIT_EQUAL, reg, 0, SLJIT_IMM, 0x80, label); -#elif PCRE2_CODE_UNIT_WIDTH == 16 -OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xfc00); -CMPTO(SLJIT_EQUAL, reg, 0, SLJIT_IMM, 0xdc00, label); 
-#else -#error "Unknown code width" -#endif -} -#endif - -#include "pcre2_jit_simd_inc.h" - -#ifdef JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD - -static BOOL check_fast_forward_char_pair_simd(compiler_common *common, fast_forward_char_data *chars, int max) -{ - sljit_s32 i, j, max_i = 0, max_j = 0; - sljit_u32 max_pri = 0; - PCRE2_UCHAR a1, a2, a_pri, b1, b2, b_pri; - - for (i = max - 1; i >= 1; i--) - { - if (chars[i].last_count > 2) - { - a1 = chars[i].chars[0]; - a2 = chars[i].chars[1]; - a_pri = chars[i].last_count; - - j = i - max_fast_forward_char_pair_offset(); - if (j < 0) - j = 0; - - while (j < i) - { - b_pri = chars[j].last_count; - if (b_pri > 2 && a_pri + b_pri >= max_pri) - { - b1 = chars[j].chars[0]; - b2 = chars[j].chars[1]; - - if (a1 != b1 && a1 != b2 && a2 != b1 && a2 != b2) - { - max_pri = a_pri + b_pri; - max_i = i; - max_j = j; - } - } - j++; - } - } - } - -if (max_pri == 0) - return FALSE; - -fast_forward_char_pair_simd(common, max_i, chars[max_i].chars[0], chars[max_i].chars[1], max_j, chars[max_j].chars[0], chars[max_j].chars[1]); -return TRUE; -} - -#endif /* JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD */ - -static void fast_forward_first_char2(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset) -{ -DEFINE_COMPILER; -struct sljit_label *start; -struct sljit_jump *match; -struct sljit_jump *partial_quit; -PCRE2_UCHAR mask; -BOOL has_match_end = (common->match_end_ptr != 0); - -SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE || offset == 0); - -if (has_match_end) - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr); - -if (offset > 0) - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offset)); - -if (has_match_end) - { - OP1(SLJIT_MOV, TMP3, 0, STR_END, 0); - - OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offset + 1)); - OP2(SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_UNUSED, 0, STR_END, 0, TMP1, 0); - CMOV(SLJIT_GREATER, STR_END, TMP1, 0); - } - -#ifdef JIT_HAS_FAST_FORWARD_CHAR_SIMD - -if 
(JIT_HAS_FAST_FORWARD_CHAR_SIMD) - { - fast_forward_char_simd(common, char1, char2, offset); - - if (offset > 0) - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offset)); - - if (has_match_end) - OP1(SLJIT_MOV, STR_END, 0, TMP3, 0); - return; - } - -#endif - -start = LABEL(); - -partial_quit = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); -if (common->mode == PCRE2_JIT_COMPLETE) - add_jump(compiler, &common->failed_match, partial_quit); - -OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - -if (char1 == char2) - CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, char1, start); -else - { - mask = char1 ^ char2; - if (is_powerof2(mask)) - { - OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, mask); - CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, char1 | mask, start); - } - else - { - match = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, char1); - CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, char2, start); - JUMPHERE(match); - } - } - -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -if (common->utf && offset > 0) - { - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-(offset + 1))); - jumpto_if_not_utf_char_start(compiler, TMP1, start); - } -#endif - -OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offset + 1)); - -if (common->mode != PCRE2_JIT_COMPLETE) - JUMPHERE(partial_quit); - -if (has_match_end) - OP1(SLJIT_MOV, STR_END, 0, TMP3, 0); -} - -static SLJIT_INLINE BOOL fast_forward_first_n_chars(compiler_common *common) -{ -DEFINE_COMPILER; -struct sljit_label *start; -struct sljit_jump *match; -fast_forward_char_data chars[MAX_N_CHARS]; -sljit_s32 offset; -PCRE2_UCHAR mask; -PCRE2_UCHAR *char_set, *char_set_end; -int i, max, from; -int range_right = -1, range_len; -sljit_u8 *update_table = NULL; -BOOL in_range; -sljit_u32 rec_count; - -for (i = 0; i < MAX_N_CHARS; i++) - { - chars[i].count = 0; - chars[i].last_count = 0; - } - -rec_count = 10000; -max = scan_prefix(common, 
common->start, chars, MAX_N_CHARS, &rec_count); - -if (max < 1) - return FALSE; - -/* Convert last_count to priority. */ -for (i = 0; i < max; i++) - { - SLJIT_ASSERT(chars[i].count > 0 && chars[i].last_count <= chars[i].count); - - if (chars[i].count == 1) - { - chars[i].last_count = (chars[i].last_count == 1) ? 7 : 5; - /* Simplifies algorithms later. */ - chars[i].chars[1] = chars[i].chars[0]; - } - else if (chars[i].count == 2) - { - SLJIT_ASSERT(chars[i].chars[0] != chars[i].chars[1]); - - if (is_powerof2(chars[i].chars[0] ^ chars[i].chars[1])) - chars[i].last_count = (chars[i].last_count == 2) ? 6 : 4; - else - chars[i].last_count = (chars[i].last_count == 2) ? 3 : 2; - } - else - chars[i].last_count = (chars[i].count == 255) ? 0 : 1; - } - -#ifdef JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD -if (JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD && check_fast_forward_char_pair_simd(common, chars, max)) - return TRUE; -#endif - -in_range = FALSE; -/* Prevent compiler "uninitialized" warning */ -from = 0; -range_len = 4 /* minimum length */ - 1; -for (i = 0; i <= max; i++) - { - if (in_range && (i - from) > range_len && (chars[i - 1].count < 255)) - { - range_len = i - from; - range_right = i - 1; - } - - if (i < max && chars[i].count < 255) - { - SLJIT_ASSERT(chars[i].count > 0); - if (!in_range) - { - in_range = TRUE; - from = i; - } - } - else - in_range = FALSE; - } - -if (range_right >= 0) - { - update_table = (sljit_u8 *)allocate_read_only_data(common, 256); - if (update_table == NULL) - return TRUE; - memset(update_table, IN_UCHARS(range_len), 256); - - for (i = 0; i < range_len; i++) - { - SLJIT_ASSERT(chars[range_right - i].count > 0 && chars[range_right - i].count < 255); - - char_set = chars[range_right - i].chars; - char_set_end = char_set + chars[range_right - i].count; - do - { - if (update_table[(*char_set) & 0xff] > IN_UCHARS(i)) - update_table[(*char_set) & 0xff] = IN_UCHARS(i); - char_set++; - } - while (char_set < char_set_end); - } - } - -offset = -1; -/* Scan 
forward. */ -for (i = 0; i < max; i++) - { - if (range_right == i) - continue; - - if (offset == -1) - { - if (chars[i].last_count >= 2) - offset = i; - } - else if (chars[offset].last_count < chars[i].last_count) - offset = i; - } - -SLJIT_ASSERT(offset == -1 || (chars[offset].count >= 1 && chars[offset].count <= 2)); - -if (range_right < 0) - { - if (offset < 0) - return FALSE; - /* Works regardless the value is 1 or 2. */ - fast_forward_first_char2(common, chars[offset].chars[0], chars[offset].chars[1], offset); - return TRUE; - } - -SLJIT_ASSERT(range_right != offset); - -if (common->match_end_ptr != 0) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr); - OP1(SLJIT_MOV, TMP3, 0, STR_END, 0); - OP2(SLJIT_SUB | SLJIT_SET_LESS, STR_END, 0, STR_END, 0, SLJIT_IMM, IN_UCHARS(max)); - add_jump(compiler, &common->failed_match, JUMP(SLJIT_LESS)); - OP2(SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_UNUSED, 0, STR_END, 0, TMP1, 0); - CMOV(SLJIT_GREATER, STR_END, TMP1, 0); - } -else - { - OP2(SLJIT_SUB | SLJIT_SET_LESS, STR_END, 0, STR_END, 0, SLJIT_IMM, IN_UCHARS(max)); - add_jump(compiler, &common->failed_match, JUMP(SLJIT_LESS)); - } - -SLJIT_ASSERT(range_right >= 0); - -if (!HAS_VIRTUAL_REGISTERS) - OP1(SLJIT_MOV, RETURN_ADDR, 0, SLJIT_IMM, (sljit_sw)update_table); - -start = LABEL(); -add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0)); - -#if PCRE2_CODE_UNIT_WIDTH == 8 || (defined SLJIT_LITTLE_ENDIAN && SLJIT_LITTLE_ENDIAN) -OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(range_right)); -#else -OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(range_right + 1) - 1); -#endif - -if (!HAS_VIRTUAL_REGISTERS) - OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(RETURN_ADDR, TMP1), 0); -else - OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)update_table); - -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); -CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, 0, start); - -if (offset >= 0) - { - OP1(MOV_UCHAR, TMP1, 0, 
SLJIT_MEM1(STR_PTR), IN_UCHARS(offset)); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - - if (chars[offset].count == 1) - CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, chars[offset].chars[0], start); - else - { - mask = chars[offset].chars[0] ^ chars[offset].chars[1]; - if (is_powerof2(mask)) - { - OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, mask); - CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, chars[offset].chars[0] | mask, start); - } - else - { - match = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, chars[offset].chars[0]); - CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, chars[offset].chars[1], start); - JUMPHERE(match); - } - } - } - -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -if (common->utf && offset != 0) - { - if (offset < 0) - { - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - } - else - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1)); - - jumpto_if_not_utf_char_start(compiler, TMP1, start); - - if (offset < 0) - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - } -#endif - -if (offset >= 0) - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - -if (common->match_end_ptr != 0) - OP1(SLJIT_MOV, STR_END, 0, TMP3, 0); -else - OP2(SLJIT_ADD, STR_END, 0, STR_END, 0, SLJIT_IMM, IN_UCHARS(max)); -return TRUE; -} - -static SLJIT_INLINE void fast_forward_first_char(compiler_common *common) -{ -PCRE2_UCHAR first_char = (PCRE2_UCHAR)(common->re->first_codeunit); -PCRE2_UCHAR oc; - -oc = first_char; -if ((common->re->flags & PCRE2_FIRSTCASELESS) != 0) - { - oc = TABLE_GET(first_char, common->fcc, first_char); -#if defined SUPPORT_UNICODE - if (first_char > 127 && (common->utf || common->ucp)) - oc = UCD_OTHERCASE(first_char); -#endif - } - -fast_forward_first_char2(common, first_char, oc, 0); -} - -static SLJIT_INLINE void fast_forward_newline(compiler_common *common) -{ -DEFINE_COMPILER; -struct sljit_label *loop; -struct sljit_jump 
*lastchar = NULL; -struct sljit_jump *firstchar; -struct sljit_jump *quit = NULL; -struct sljit_jump *foundcr = NULL; -struct sljit_jump *notfoundnl; -jump_list *newline = NULL; - -if (common->match_end_ptr != 0) - { - OP1(SLJIT_MOV, TMP3, 0, STR_END, 0); - OP1(SLJIT_MOV, STR_END, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr); - } - -if (common->nltype == NLTYPE_FIXED && common->newline > 255) - { -#ifdef JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD - if (JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD && common->mode == PCRE2_JIT_COMPLETE) - { - if (HAS_VIRTUAL_REGISTERS) - { - OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, str)); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin)); - } - else - { - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, str)); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, begin)); - } - firstchar = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, TMP2, 0); - - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, STR_PTR, 0, TMP1, 0); - OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_NOT_EQUAL); -#if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32 - OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, UCHAR_SHIFT); -#endif - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP1, 0); - - fast_forward_char_pair_simd(common, 1, common->newline & 0xff, common->newline & 0xff, 0, (common->newline >> 8) & 0xff, (common->newline >> 8) & 0xff); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); - } - else -#endif /* JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD */ - { - lastchar = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); - if (HAS_VIRTUAL_REGISTERS) - { - OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, str)); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin)); - } - else - { - 
OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, str)); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, begin)); - } - firstchar = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, TMP2, 0); - - OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(2)); - OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, STR_PTR, 0, TMP1, 0); - OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_GREATER_EQUAL); -#if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32 - OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, UCHAR_SHIFT); -#endif - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0); - - loop = LABEL(); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - quit = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2)); - OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1)); - CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff, loop); - CMPTO(SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff, loop); - - JUMPHERE(quit); - JUMPHERE(lastchar); - } - - JUMPHERE(firstchar); - - if (common->match_end_ptr != 0) - OP1(SLJIT_MOV, STR_END, 0, TMP3, 0); - return; - } - -if (HAS_VIRTUAL_REGISTERS) - { - OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, str)); - } -else - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, str)); - -/* Example: match /^/ to \r\n from offset 1. 
*/ -firstchar = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, TMP2, 0); - -if (common->nltype == NLTYPE_ANY) - move_back(common, NULL, FALSE); -else - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - -loop = LABEL(); -common->ff_newline_shortcut = loop; - -#ifdef JIT_HAS_FAST_FORWARD_CHAR_SIMD -if (JIT_HAS_FAST_FORWARD_CHAR_SIMD && (common->nltype == NLTYPE_FIXED || common->nltype == NLTYPE_ANYCRLF)) - { - if (common->nltype == NLTYPE_ANYCRLF) - { - fast_forward_char_simd(common, CHAR_CR, CHAR_LF, 0); - if (common->mode != PCRE2_JIT_COMPLETE) - lastchar = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); - - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - quit = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR); - } - else - { - fast_forward_char_simd(common, common->newline, common->newline, 0); - - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - if (common->mode != PCRE2_JIT_COMPLETE) - { - OP2(SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_UNUSED, 0, STR_PTR, 0, STR_END, 0); - CMOV(SLJIT_GREATER, STR_PTR, STR_END, 0); - } - } - } -else -#endif /* JIT_HAS_FAST_FORWARD_CHAR_SIMD */ - { - read_char(common, common->nlmin, common->nlmax, NULL, READ_CHAR_NEWLINE); - lastchar = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); - if (common->nltype == NLTYPE_ANY || common->nltype == NLTYPE_ANYCRLF) - foundcr = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR); - check_newlinechar(common, common->nltype, &newline, FALSE); - set_jumps(newline, loop); - } - -if (common->nltype == NLTYPE_ANY || common->nltype == NLTYPE_ANYCRLF) - { - if (quit == NULL) - { - quit = JUMP(SLJIT_JUMP); - JUMPHERE(foundcr); - } - - notfoundnl = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, CHAR_NL); - OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_EQUAL); -#if PCRE2_CODE_UNIT_WIDTH == 16 || 
PCRE2_CODE_UNIT_WIDTH == 32 - OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, UCHAR_SHIFT); -#endif - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); - JUMPHERE(notfoundnl); - JUMPHERE(quit); - } - -if (lastchar) - JUMPHERE(lastchar); -JUMPHERE(firstchar); - -if (common->match_end_ptr != 0) - OP1(SLJIT_MOV, STR_END, 0, TMP3, 0); -} - -static BOOL optimize_class(compiler_common *common, const sljit_u8 *bits, BOOL nclass, BOOL invert, jump_list **backtracks); - -static SLJIT_INLINE void fast_forward_start_bits(compiler_common *common) -{ -DEFINE_COMPILER; -const sljit_u8 *start_bits = common->re->start_bitmap; -struct sljit_label *start; -struct sljit_jump *partial_quit; -#if PCRE2_CODE_UNIT_WIDTH != 8 -struct sljit_jump *found = NULL; -#endif -jump_list *matches = NULL; - -if (common->match_end_ptr != 0) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr); - OP1(SLJIT_MOV, RETURN_ADDR, 0, STR_END, 0); - OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1)); - OP2(SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_UNUSED, 0, STR_END, 0, TMP1, 0); - CMOV(SLJIT_GREATER, STR_END, TMP1, 0); - } - -start = LABEL(); - -partial_quit = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); -if (common->mode == PCRE2_JIT_COMPLETE) - add_jump(compiler, &common->failed_match, partial_quit); - -OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - -if (!optimize_class(common, start_bits, (start_bits[31] & 0x80) != 0, FALSE, &matches)) - { -#if PCRE2_CODE_UNIT_WIDTH != 8 - if ((start_bits[31] & 0x80) != 0) - found = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 255); - else - CMPTO(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 255, start); -#elif defined SUPPORT_UNICODE - if (common->utf && is_char7_bitset(start_bits, FALSE)) - CMPTO(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 127, start); -#endif - OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7); - OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3); - OP1(SLJIT_MOV_U8, TMP1, 0, 
SLJIT_MEM1(TMP1), (sljit_sw)start_bits); - if (!HAS_VIRTUAL_REGISTERS) - { - OP2(SLJIT_SHL, TMP3, 0, SLJIT_IMM, 1, TMP2, 0); - OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, TMP3, 0); - } - else - { - OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0); - OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0); - } - JUMPTO(SLJIT_ZERO, start); - } -else - set_jumps(matches, start); - -#if PCRE2_CODE_UNIT_WIDTH != 8 -if (found != NULL) - JUMPHERE(found); -#endif - -OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - -if (common->mode != PCRE2_JIT_COMPLETE) - JUMPHERE(partial_quit); - -if (common->match_end_ptr != 0) - OP1(SLJIT_MOV, STR_END, 0, RETURN_ADDR, 0); -} - -static SLJIT_INLINE jump_list *search_requested_char(compiler_common *common, PCRE2_UCHAR req_char, BOOL caseless, BOOL has_firstchar) -{ -DEFINE_COMPILER; -struct sljit_label *loop; -struct sljit_jump *toolong; -struct sljit_jump *already_found; -struct sljit_jump *found; -struct sljit_jump *found_oc = NULL; -jump_list *not_found = NULL; -sljit_u32 oc, bit; - -SLJIT_ASSERT(common->req_char_ptr != 0); -OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(REQ_CU_MAX) * 100); -OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->req_char_ptr); -toolong = CMP(SLJIT_LESS, TMP2, 0, STR_END, 0); -already_found = CMP(SLJIT_LESS, STR_PTR, 0, TMP1, 0); - -if (has_firstchar) - OP2(SLJIT_ADD, TMP1, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -else - OP1(SLJIT_MOV, TMP1, 0, STR_PTR, 0); - -oc = req_char; -if (caseless) - { - oc = TABLE_GET(req_char, common->fcc, req_char); -#if defined SUPPORT_UNICODE - if (req_char > 127 && (common->utf || common->ucp)) - oc = UCD_OTHERCASE(req_char); -#endif - } - -#ifdef JIT_HAS_FAST_REQUESTED_CHAR_SIMD -if (JIT_HAS_FAST_REQUESTED_CHAR_SIMD) - { - not_found = fast_requested_char_simd(common, req_char, oc); - } -else -#endif - { - loop = LABEL(); - add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0)); - - OP1(MOV_UCHAR, TMP2, 0, 
SLJIT_MEM1(TMP1), 0); - - if (req_char == oc) - found = CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, req_char); - else - { - bit = req_char ^ oc; - if (is_powerof2(bit)) - { - OP2(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_IMM, bit); - found = CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, req_char | bit); - } - else - { - found = CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, req_char); - found_oc = CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, oc); - } - } - OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1)); - JUMPTO(SLJIT_JUMP, loop); - - JUMPHERE(found); - if (found_oc) - JUMPHERE(found_oc); - } - -OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->req_char_ptr, TMP1, 0); - -JUMPHERE(already_found); -JUMPHERE(toolong); -return not_found; -} - -static void do_revertframes(compiler_common *common) -{ -DEFINE_COMPILER; -struct sljit_jump *jump; -struct sljit_label *mainloop; - -sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); -GET_LOCAL_BASE(TMP1, 0, 0); - -/* Drop frames until we reach STACK_TOP. */ -mainloop = LABEL(); -OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(STACK_TOP), -sizeof(sljit_sw)); -jump = CMP(SLJIT_SIG_LESS_EQUAL, TMP2, 0, SLJIT_IMM, 0); - -OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0); -if (HAS_VIRTUAL_REGISTERS) - { - OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), 0, SLJIT_MEM1(STACK_TOP), -(2 * sizeof(sljit_sw))); - OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), sizeof(sljit_sw), SLJIT_MEM1(STACK_TOP), -(3 * sizeof(sljit_sw))); - OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, 3 * sizeof(sljit_sw)); - } -else - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), -(2 * sizeof(sljit_sw))); - OP1(SLJIT_MOV, TMP3, 0, SLJIT_MEM1(STACK_TOP), -(3 * sizeof(sljit_sw))); - OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, 3 * sizeof(sljit_sw)); - OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), 0, TMP1, 0); - GET_LOCAL_BASE(TMP1, 0, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), sizeof(sljit_sw), TMP3, 0); - } -JUMPTO(SLJIT_JUMP, mainloop); - -JUMPHERE(jump); -jump = CMP(SLJIT_NOT_ZERO /* SIG_LESS */, TMP2, 0, SLJIT_IMM, 0); -/* End of reverting values. 
*/ -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); - -JUMPHERE(jump); -OP1(SLJIT_NEG, TMP2, 0, TMP2, 0); -OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0); -if (HAS_VIRTUAL_REGISTERS) - { - OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), 0, SLJIT_MEM1(STACK_TOP), -(2 * sizeof(sljit_sw))); - OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, 2 * sizeof(sljit_sw)); - } -else - { - OP1(SLJIT_MOV, TMP3, 0, SLJIT_MEM1(STACK_TOP), -(2 * sizeof(sljit_sw))); - OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, 2 * sizeof(sljit_sw)); - OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), 0, TMP3, 0); - } -JUMPTO(SLJIT_JUMP, mainloop); -} - -static void check_wordboundary(compiler_common *common) -{ -DEFINE_COMPILER; -struct sljit_jump *skipread; -jump_list *skipread_list = NULL; -#ifdef SUPPORT_UNICODE -struct sljit_label *valid_utf; -jump_list *invalid_utf1 = NULL; -#endif /* SUPPORT_UNICODE */ -jump_list *invalid_utf2 = NULL; -#if PCRE2_CODE_UNIT_WIDTH != 8 || defined SUPPORT_UNICODE -struct sljit_jump *jump; -#endif /* PCRE2_CODE_UNIT_WIDTH != 8 || SUPPORT_UNICODE */ - -SLJIT_COMPILE_ASSERT(ctype_word == 0x10, ctype_word_must_be_16); - -sljit_emit_fast_enter(compiler, SLJIT_MEM1(SLJIT_SP), LOCALS0); -/* Get type of the previous char, and put it to TMP3. 
*/ -OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0); -OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin)); -OP1(SLJIT_MOV, TMP3, 0, SLJIT_IMM, 0); -skipread = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, TMP2, 0); - -#ifdef SUPPORT_UNICODE -if (common->invalid_utf) - { - peek_char_back(common, READ_CHAR_MAX, &invalid_utf1); - - if (common->mode != PCRE2_JIT_COMPLETE) - { - OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP1, 0); - OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0); - move_back(common, NULL, TRUE); - check_start_used_ptr(common); - OP1(SLJIT_MOV, TMP1, 0, RETURN_ADDR, 0); - OP1(SLJIT_MOV, STR_PTR, 0, TMP2, 0); - } - } -else -#endif /* SUPPORT_UNICODE */ - { - if (common->mode == PCRE2_JIT_COMPLETE) - peek_char_back(common, READ_CHAR_MAX, NULL); - else - { - move_back(common, NULL, TRUE); - check_start_used_ptr(common); - read_char(common, 0, READ_CHAR_MAX, NULL, READ_CHAR_UPDATE_STR_PTR); - } - } - -/* Testing char type. */ -#ifdef SUPPORT_UNICODE -if (common->ucp) - { - OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 1); - jump = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_UNDERSCORE); - add_jump(compiler, &common->getucdtype, JUMP(SLJIT_FAST_CALL)); - OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ucp_Ll); - OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, ucp_Lu - ucp_Ll); - OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL); - OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ucp_Nd - ucp_Ll); - OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, ucp_No - ucp_Nd); - OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL); - JUMPHERE(jump); - OP1(SLJIT_MOV, TMP3, 0, TMP2, 0); - } -else -#endif /* SUPPORT_UNICODE */ - { -#if PCRE2_CODE_UNIT_WIDTH != 8 - jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255); -#elif defined SUPPORT_UNICODE - /* Here TMP3 has already been zeroed. 
*/ - jump = NULL; - if (common->utf) - jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255); -#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ - OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), common->ctypes); - OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 4 /* ctype_word */); - OP2(SLJIT_AND, TMP3, 0, TMP1, 0, SLJIT_IMM, 1); -#if PCRE2_CODE_UNIT_WIDTH != 8 - JUMPHERE(jump); -#elif defined SUPPORT_UNICODE - if (jump != NULL) - JUMPHERE(jump); -#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ - } -JUMPHERE(skipread); - -OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0); -check_str_end(common, &skipread_list); -peek_char(common, READ_CHAR_MAX, SLJIT_MEM1(SLJIT_SP), LOCALS1, &invalid_utf2); - -/* Testing char type. This is a code duplication. */ -#ifdef SUPPORT_UNICODE - -valid_utf = LABEL(); - -if (common->ucp) - { - OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 1); - jump = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_UNDERSCORE); - add_jump(compiler, &common->getucdtype, JUMP(SLJIT_FAST_CALL)); - OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ucp_Ll); - OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, ucp_Lu - ucp_Ll); - OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL); - OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ucp_Nd - ucp_Ll); - OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, ucp_No - ucp_Nd); - OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL); - JUMPHERE(jump); - } -else -#endif /* SUPPORT_UNICODE */ - { -#if PCRE2_CODE_UNIT_WIDTH != 8 - /* TMP2 may be destroyed by peek_char. 
*/ - OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0); - jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255); -#elif defined SUPPORT_UNICODE - OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0); - jump = NULL; - if (common->utf) - jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255); -#endif - OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP1), common->ctypes); - OP2(SLJIT_LSHR, TMP2, 0, TMP2, 0, SLJIT_IMM, 4 /* ctype_word */); - OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 1); -#if PCRE2_CODE_UNIT_WIDTH != 8 - JUMPHERE(jump); -#elif defined SUPPORT_UNICODE - if (jump != NULL) - JUMPHERE(jump); -#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ - } -set_jumps(skipread_list, LABEL()); - -OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0); -OP2(SLJIT_XOR | SLJIT_SET_Z, TMP2, 0, TMP2, 0, TMP3, 0); -OP_SRC(SLJIT_FAST_RETURN, TMP1, 0); - -#ifdef SUPPORT_UNICODE -if (common->invalid_utf) - { - set_jumps(invalid_utf1, LABEL()); - - peek_char(common, READ_CHAR_MAX, SLJIT_MEM1(SLJIT_SP), LOCALS1, NULL); - CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR, valid_utf); - - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, -1); - OP_SRC(SLJIT_FAST_RETURN, TMP1, 0); - - set_jumps(invalid_utf2, LABEL()); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0); - OP1(SLJIT_MOV, TMP2, 0, TMP3, 0); - OP_SRC(SLJIT_FAST_RETURN, TMP1, 0); - } -#endif /* SUPPORT_UNICODE */ -} - -static BOOL optimize_class_ranges(compiler_common *common, const sljit_u8 *bits, BOOL nclass, BOOL invert, jump_list **backtracks) -{ -/* May destroy TMP1. */ -DEFINE_COMPILER; -int ranges[MAX_CLASS_RANGE_SIZE]; -sljit_u8 bit, cbit, all; -int i, byte, length = 0; - -bit = bits[0] & 0x1; -/* All bits will be zero or one (since bit is zero or one). 
*/ -all = -bit; - -for (i = 0; i < 256; ) - { - byte = i >> 3; - if ((i & 0x7) == 0 && bits[byte] == all) - i += 8; - else - { - cbit = (bits[byte] >> (i & 0x7)) & 0x1; - if (cbit != bit) - { - if (length >= MAX_CLASS_RANGE_SIZE) - return FALSE; - ranges[length] = i; - length++; - bit = cbit; - all = -cbit; - } - i++; - } - } - -if (((bit == 0) && nclass) || ((bit == 1) && !nclass)) - { - if (length >= MAX_CLASS_RANGE_SIZE) - return FALSE; - ranges[length] = 256; - length++; - } - -if (length < 0 || length > 4) - return FALSE; - -bit = bits[0] & 0x1; -if (invert) bit ^= 0x1; - -/* No character is accepted. */ -if (length == 0 && bit == 0) - add_jump(compiler, backtracks, JUMP(SLJIT_JUMP)); - -switch(length) - { - case 0: - /* When bit != 0, all characters are accepted. */ - return TRUE; - - case 1: - add_jump(compiler, backtracks, CMP(bit == 0 ? SLJIT_LESS : SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, ranges[0])); - return TRUE; - - case 2: - if (ranges[0] + 1 != ranges[1]) - { - OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[0]); - add_jump(compiler, backtracks, CMP(bit != 0 ? SLJIT_LESS : SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, ranges[1] - ranges[0])); - } - else - add_jump(compiler, backtracks, CMP(bit != 0 ? 
SLJIT_EQUAL : SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, ranges[0])); - return TRUE; - - case 3: - if (bit != 0) - { - add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, ranges[2])); - if (ranges[0] + 1 != ranges[1]) - { - OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[0]); - add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, ranges[1] - ranges[0])); - } - else - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, ranges[0])); - return TRUE; - } - - add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, ranges[0])); - if (ranges[1] + 1 != ranges[2]) - { - OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[1]); - add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, ranges[2] - ranges[1])); - } - else - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, ranges[1])); - return TRUE; - - case 4: - if ((ranges[1] - ranges[0]) == (ranges[3] - ranges[2]) - && (ranges[0] | (ranges[2] - ranges[0])) == ranges[2] - && (ranges[1] & (ranges[2] - ranges[0])) == 0 - && is_powerof2(ranges[2] - ranges[0])) - { - SLJIT_ASSERT((ranges[0] & (ranges[2] - ranges[0])) == 0 && (ranges[2] & ranges[3] & (ranges[2] - ranges[0])) != 0); - OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[2] - ranges[0]); - if (ranges[2] + 1 != ranges[3]) - { - OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[2]); - add_jump(compiler, backtracks, CMP(bit != 0 ? SLJIT_LESS : SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, ranges[3] - ranges[2])); - } - else - add_jump(compiler, backtracks, CMP(bit != 0 ? 
SLJIT_EQUAL : SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, ranges[2])); - return TRUE; - } - - if (bit != 0) - { - i = 0; - if (ranges[0] + 1 != ranges[1]) - { - OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[0]); - add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, ranges[1] - ranges[0])); - i = ranges[0]; - } - else - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, ranges[0])); - - if (ranges[2] + 1 != ranges[3]) - { - OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[2] - i); - add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, ranges[3] - ranges[2])); - } - else - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, ranges[2] - i)); - return TRUE; - } - - OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[0]); - add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, ranges[3] - ranges[0])); - if (ranges[1] + 1 != ranges[2]) - { - OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[1] - ranges[0]); - add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, ranges[2] - ranges[1])); - } - else - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, ranges[1] - ranges[0])); - return TRUE; - - default: - SLJIT_UNREACHABLE(); - return FALSE; - } -} - -static BOOL optimize_class_chars(compiler_common *common, const sljit_u8 *bits, BOOL nclass, BOOL invert, jump_list **backtracks) -{ -/* May destroy TMP1. 
*/ -DEFINE_COMPILER; -uint16_t char_list[MAX_CLASS_CHARS_SIZE]; -uint8_t byte; -sljit_s32 type; -int i, j, k, len, c; - -if (!sljit_has_cpu_feature(SLJIT_HAS_CMOV)) - return FALSE; - -len = 0; - -for (i = 0; i < 32; i++) - { - byte = bits[i]; - - if (nclass) - byte = ~byte; - - j = 0; - while (byte != 0) - { - if (byte & 0x1) - { - c = i * 8 + j; - - k = len; - - if ((c & 0x20) != 0) - { - for (k = 0; k < len; k++) - if (char_list[k] == c - 0x20) - { - char_list[k] |= 0x120; - break; - } - } - - if (k == len) - { - if (len >= MAX_CLASS_CHARS_SIZE) - return FALSE; - - char_list[len++] = (uint16_t) c; - } - } - - byte >>= 1; - j++; - } - } - -if (len == 0) return FALSE; /* Should never occur, but stops analyzers complaining. */ - -i = 0; -j = 0; - -if (char_list[0] == 0) - { - i++; - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0); - OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_ZERO); - } -else - OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0); - -while (i < len) - { - if ((char_list[i] & 0x100) != 0) - j++; - else - { - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, char_list[i]); - CMOV(SLJIT_ZERO, TMP2, TMP1, 0); - } - i++; - } - -if (j != 0) - { - OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x20); - - for (i = 0; i < len; i++) - if ((char_list[i] & 0x100) != 0) - { - j--; - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, char_list[i] & 0xff); - CMOV(SLJIT_ZERO, TMP2, TMP1, 0); - } - } - -if (invert) - nclass = !nclass; - -type = nclass ? SLJIT_NOT_EQUAL : SLJIT_EQUAL; -add_jump(compiler, backtracks, CMP(type, TMP2, 0, SLJIT_IMM, 0)); -return TRUE; -} - -static BOOL optimize_class(compiler_common *common, const sljit_u8 *bits, BOOL nclass, BOOL invert, jump_list **backtracks) -{ -/* May destroy TMP1. 
*/ -if (optimize_class_ranges(common, bits, nclass, invert, backtracks)) - return TRUE; -return optimize_class_chars(common, bits, nclass, invert, backtracks); -} - -static void check_anynewline(compiler_common *common) -{ -/* Check whether TMP1 contains a newline character. TMP2 destroyed. */ -DEFINE_COMPILER; - -sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); - -OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x0a); -OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x0d - 0x0a); -OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL); -OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x85 - 0x0a); -#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32 -#if PCRE2_CODE_UNIT_WIDTH == 8 -if (common->utf) - { -#endif - OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL); - OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x1); - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x2029 - 0x0a); -#if PCRE2_CODE_UNIT_WIDTH == 8 - } -#endif -#endif /* SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH == [16|32] */ -OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_EQUAL); -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); -} - -static void check_hspace(compiler_common *common) -{ -/* Check whether TMP1 contains a newline character. TMP2 destroyed. 
*/ -DEFINE_COMPILER; - -sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); - -OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x09); -OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL); -OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x20); -OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL); -OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xa0); -#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32 -#if PCRE2_CODE_UNIT_WIDTH == 8 -if (common->utf) - { -#endif - OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL); - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x1680); - OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL); - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x180e); - OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL); - OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x2000); - OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x200A - 0x2000); - OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL); - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x202f - 0x2000); - OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL); - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x205f - 0x2000); - OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL); - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x3000 - 0x2000); -#if PCRE2_CODE_UNIT_WIDTH == 8 - } -#endif -#endif /* SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH == [16|32] */ -OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_EQUAL); - -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); -} - -static void check_vspace(compiler_common *common) -{ -/* Check whether TMP1 contains a newline character. TMP2 destroyed. 
*/ -DEFINE_COMPILER; - -sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); - -OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x0a); -OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x0d - 0x0a); -OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL); -OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x85 - 0x0a); -#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32 -#if PCRE2_CODE_UNIT_WIDTH == 8 -if (common->utf) - { -#endif - OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL); - OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x1); - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x2029 - 0x0a); -#if PCRE2_CODE_UNIT_WIDTH == 8 - } -#endif -#endif /* SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH == [16|32] */ -OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_EQUAL); - -OP_SRC(SLJIT_FAST_RETURN, RETURN_ADDR, 0); -} - -static void do_casefulcmp(compiler_common *common) -{ -DEFINE_COMPILER; -struct sljit_jump *jump; -struct sljit_label *label; -int char1_reg; -int char2_reg; - -if (HAS_VIRTUAL_REGISTERS) - { - char1_reg = STR_END; - char2_reg = STACK_TOP; - } -else - { - char1_reg = TMP3; - char2_reg = RETURN_ADDR; - } - -sljit_emit_fast_enter(compiler, SLJIT_MEM1(SLJIT_SP), LOCALS0); -OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0); - -if (char1_reg == STR_END) - { - OP1(SLJIT_MOV, TMP3, 0, char1_reg, 0); - OP1(SLJIT_MOV, RETURN_ADDR, 0, char2_reg, 0); - } - -if (sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_SUPP | SLJIT_MEM_POST, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1)) == SLJIT_SUCCESS) - { - label = LABEL(); - sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_POST, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1)); - sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_POST, char2_reg, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); - jump = CMP(SLJIT_NOT_EQUAL, char1_reg, 0, char2_reg, 0); - OP2(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1)); - JUMPTO(SLJIT_NOT_ZERO, label); - - JUMPHERE(jump); - 
OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0); - } -else if (sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1)) == SLJIT_SUCCESS) - { - OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1)); - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - - label = LABEL(); - sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_PRE, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1)); - sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_PRE, char2_reg, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); - jump = CMP(SLJIT_NOT_EQUAL, char1_reg, 0, char2_reg, 0); - OP2(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1)); - JUMPTO(SLJIT_NOT_ZERO, label); - - JUMPHERE(jump); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - } -else - { - label = LABEL(); - OP1(MOV_UCHAR, char1_reg, 0, SLJIT_MEM1(TMP1), 0); - OP1(MOV_UCHAR, char2_reg, 0, SLJIT_MEM1(STR_PTR), 0); - OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1)); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - jump = CMP(SLJIT_NOT_EQUAL, char1_reg, 0, char2_reg, 0); - OP2(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1)); - JUMPTO(SLJIT_NOT_ZERO, label); - - JUMPHERE(jump); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0); - } - -if (char1_reg == STR_END) - { - OP1(SLJIT_MOV, char1_reg, 0, TMP3, 0); - OP1(SLJIT_MOV, char2_reg, 0, RETURN_ADDR, 0); - } - -OP_SRC(SLJIT_FAST_RETURN, TMP1, 0); -} - -static void do_caselesscmp(compiler_common *common) -{ -DEFINE_COMPILER; -struct sljit_jump *jump; -struct sljit_label *label; -int char1_reg = STR_END; -int char2_reg; -int lcc_table; -int opt_type = 0; - -if (HAS_VIRTUAL_REGISTERS) - { - char2_reg = STACK_TOP; - lcc_table = STACK_LIMIT; - } -else - { - char2_reg = RETURN_ADDR; - lcc_table = TMP3; - } - -if (sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_SUPP | SLJIT_MEM_POST, char1_reg, 
SLJIT_MEM1(TMP1), IN_UCHARS(1)) == SLJIT_SUCCESS) - opt_type = 1; -else if (sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1)) == SLJIT_SUCCESS) - opt_type = 2; - -sljit_emit_fast_enter(compiler, SLJIT_MEM1(SLJIT_SP), LOCALS0); -OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0); - -OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, char1_reg, 0); - -if (char2_reg == STACK_TOP) - { - OP1(SLJIT_MOV, TMP3, 0, char2_reg, 0); - OP1(SLJIT_MOV, RETURN_ADDR, 0, lcc_table, 0); - } - -OP1(SLJIT_MOV, lcc_table, 0, SLJIT_IMM, common->lcc); - -if (opt_type == 1) - { - label = LABEL(); - sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_POST, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1)); - sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_POST, char2_reg, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); - } -else if (opt_type == 2) - { - OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1)); - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - - label = LABEL(); - sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_PRE, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1)); - sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_PRE, char2_reg, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); - } -else - { - label = LABEL(); - OP1(MOV_UCHAR, char1_reg, 0, SLJIT_MEM1(TMP1), 0); - OP1(MOV_UCHAR, char2_reg, 0, SLJIT_MEM1(STR_PTR), 0); - OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1)); - } - -#if PCRE2_CODE_UNIT_WIDTH != 8 -jump = CMP(SLJIT_GREATER, char1_reg, 0, SLJIT_IMM, 255); -#endif -OP1(SLJIT_MOV_U8, char1_reg, 0, SLJIT_MEM2(lcc_table, char1_reg), 0); -#if PCRE2_CODE_UNIT_WIDTH != 8 -JUMPHERE(jump); -jump = CMP(SLJIT_GREATER, char2_reg, 0, SLJIT_IMM, 255); -#endif -OP1(SLJIT_MOV_U8, char2_reg, 0, SLJIT_MEM2(lcc_table, char2_reg), 0); -#if PCRE2_CODE_UNIT_WIDTH != 8 -JUMPHERE(jump); -#endif - -if (opt_type == 0) - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - -jump = CMP(SLJIT_NOT_EQUAL, char1_reg, 0, char2_reg, 0); -OP2(SLJIT_SUB 
| SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1)); -JUMPTO(SLJIT_NOT_ZERO, label); - -JUMPHERE(jump); -OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0); - -if (opt_type == 2) - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - -if (char2_reg == STACK_TOP) - { - OP1(SLJIT_MOV, char2_reg, 0, TMP3, 0); - OP1(SLJIT_MOV, lcc_table, 0, RETURN_ADDR, 0); - } - -OP1(SLJIT_MOV, char1_reg, 0, SLJIT_MEM1(SLJIT_SP), LOCALS1); -OP_SRC(SLJIT_FAST_RETURN, TMP1, 0); -} - -static PCRE2_SPTR byte_sequence_compare(compiler_common *common, BOOL caseless, PCRE2_SPTR cc, - compare_context *context, jump_list **backtracks) -{ -DEFINE_COMPILER; -unsigned int othercasebit = 0; -PCRE2_SPTR othercasechar = NULL; -#ifdef SUPPORT_UNICODE -int utflength; -#endif - -if (caseless && char_has_othercase(common, cc)) - { - othercasebit = char_get_othercase_bit(common, cc); - SLJIT_ASSERT(othercasebit); - /* Extracting bit difference info. */ -#if PCRE2_CODE_UNIT_WIDTH == 8 - othercasechar = cc + (othercasebit >> 8); - othercasebit &= 0xff; -#elif PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32 - /* Note that this code only handles characters in the BMP. If there - ever are characters outside the BMP whose othercase differs in only one - bit from itself (there currently are none), this code will need to be - revised for PCRE2_CODE_UNIT_WIDTH == 32. 
*/ - othercasechar = cc + (othercasebit >> 9); - if ((othercasebit & 0x100) != 0) - othercasebit = (othercasebit & 0xff) << 8; - else - othercasebit &= 0xff; -#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ - } - -if (context->sourcereg == -1) - { -#if PCRE2_CODE_UNIT_WIDTH == 8 -#if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED - if (context->length >= 4) - OP1(SLJIT_MOV_S32, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length); - else if (context->length >= 2) - OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length); - else -#endif - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length); -#elif PCRE2_CODE_UNIT_WIDTH == 16 -#if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED - if (context->length >= 4) - OP1(SLJIT_MOV_S32, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length); - else -#endif - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length); -#elif PCRE2_CODE_UNIT_WIDTH == 32 - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length); -#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ - context->sourcereg = TMP2; - } - -#ifdef SUPPORT_UNICODE -utflength = 1; -if (common->utf && HAS_EXTRALEN(*cc)) - utflength += GET_EXTRALEN(*cc); - -do - { -#endif - - context->length -= IN_UCHARS(1); -#if (defined SLJIT_UNALIGNED && SLJIT_UNALIGNED) && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16) - - /* Unaligned read is supported. 
*/ - if (othercasebit != 0 && othercasechar == cc) - { - context->c.asuchars[context->ucharptr] = *cc | othercasebit; - context->oc.asuchars[context->ucharptr] = othercasebit; - } - else - { - context->c.asuchars[context->ucharptr] = *cc; - context->oc.asuchars[context->ucharptr] = 0; - } - context->ucharptr++; - -#if PCRE2_CODE_UNIT_WIDTH == 8 - if (context->ucharptr >= 4 || context->length == 0 || (context->ucharptr == 2 && context->length == 1)) -#else - if (context->ucharptr >= 2 || context->length == 0) -#endif - { - if (context->length >= 4) - OP1(SLJIT_MOV_S32, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length); - else if (context->length >= 2) - OP1(SLJIT_MOV_U16, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length); -#if PCRE2_CODE_UNIT_WIDTH == 8 - else if (context->length >= 1) - OP1(SLJIT_MOV_U8, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length); -#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ - context->sourcereg = context->sourcereg == TMP1 ? TMP2 : TMP1; - - switch(context->ucharptr) - { - case 4 / sizeof(PCRE2_UCHAR): - if (context->oc.asint != 0) - OP2(SLJIT_OR, context->sourcereg, 0, context->sourcereg, 0, SLJIT_IMM, context->oc.asint); - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, context->c.asint | context->oc.asint)); - break; - - case 2 / sizeof(PCRE2_UCHAR): - if (context->oc.asushort != 0) - OP2(SLJIT_OR, context->sourcereg, 0, context->sourcereg, 0, SLJIT_IMM, context->oc.asushort); - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, context->c.asushort | context->oc.asushort)); - break; - -#if PCRE2_CODE_UNIT_WIDTH == 8 - case 1: - if (context->oc.asbyte != 0) - OP2(SLJIT_OR, context->sourcereg, 0, context->sourcereg, 0, SLJIT_IMM, context->oc.asbyte); - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, context->c.asbyte | context->oc.asbyte)); - break; -#endif - - default: - SLJIT_UNREACHABLE(); - 
break; - } - context->ucharptr = 0; - } - -#else - - /* Unaligned read is unsupported or in 32 bit mode. */ - if (context->length >= 1) - OP1(MOV_UCHAR, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length); - - context->sourcereg = context->sourcereg == TMP1 ? TMP2 : TMP1; - - if (othercasebit != 0 && othercasechar == cc) - { - OP2(SLJIT_OR, context->sourcereg, 0, context->sourcereg, 0, SLJIT_IMM, othercasebit); - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, *cc | othercasebit)); - } - else - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, *cc)); - -#endif - - cc++; -#ifdef SUPPORT_UNICODE - utflength--; - } -while (utflength > 0); -#endif - -return cc; -} - -#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8 - -#define SET_TYPE_OFFSET(value) \ - if ((value) != typeoffset) \ - { \ - if ((value) < typeoffset) \ - OP2(SLJIT_ADD, typereg, 0, typereg, 0, SLJIT_IMM, typeoffset - (value)); \ - else \ - OP2(SLJIT_SUB, typereg, 0, typereg, 0, SLJIT_IMM, (value) - typeoffset); \ - } \ - typeoffset = (value); - -#define SET_CHAR_OFFSET(value) \ - if ((value) != charoffset) \ - { \ - if ((value) < charoffset) \ - OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(charoffset - (value))); \ - else \ - OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)((value) - charoffset)); \ - } \ - charoffset = (value); - -static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks, BOOL check_str_ptr); - -static void compile_xclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks) -{ -DEFINE_COMPILER; -jump_list *found = NULL; -jump_list **list = (cc[0] & XCL_NOT) == 0 ? 
&found : backtracks; -sljit_uw c, charoffset, max = 256, min = READ_CHAR_MAX; -struct sljit_jump *jump = NULL; -PCRE2_SPTR ccbegin; -int compares, invertcmp, numberofcmps; -#if defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16) -BOOL utf = common->utf; -#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == [8|16] */ - -#ifdef SUPPORT_UNICODE -BOOL needstype = FALSE, needsscript = FALSE, needschar = FALSE; -BOOL charsaved = FALSE; -int typereg = TMP1; -const sljit_u32 *other_cases; -sljit_uw typeoffset; -#endif /* SUPPORT_UNICODE */ - -/* Scanning the necessary info. */ -cc++; -ccbegin = cc; -compares = 0; - -if (cc[-1] & XCL_MAP) - { - min = 0; - cc += 32 / sizeof(PCRE2_UCHAR); - } - -while (*cc != XCL_END) - { - compares++; - if (*cc == XCL_SINGLE) - { - cc ++; - GETCHARINCTEST(c, cc); - if (c > max) max = c; - if (c < min) min = c; -#ifdef SUPPORT_UNICODE - needschar = TRUE; -#endif /* SUPPORT_UNICODE */ - } - else if (*cc == XCL_RANGE) - { - cc ++; - GETCHARINCTEST(c, cc); - if (c < min) min = c; - GETCHARINCTEST(c, cc); - if (c > max) max = c; -#ifdef SUPPORT_UNICODE - needschar = TRUE; -#endif /* SUPPORT_UNICODE */ - } -#ifdef SUPPORT_UNICODE - else - { - SLJIT_ASSERT(*cc == XCL_PROP || *cc == XCL_NOTPROP); - cc++; - if (*cc == PT_CLIST) - { - other_cases = PRIV(ucd_caseless_sets) + cc[1]; - while (*other_cases != NOTACHAR) - { - if (*other_cases > max) max = *other_cases; - if (*other_cases < min) min = *other_cases; - other_cases++; - } - } - else - { - max = READ_CHAR_MAX; - min = 0; - } - - switch(*cc) - { - case PT_ANY: - /* Any either accepts everything or ignored. 
*/ - if (cc[-1] == XCL_PROP) - { - compile_char1_matchingpath(common, OP_ALLANY, cc, backtracks, FALSE); - if (list == backtracks) - add_jump(compiler, backtracks, JUMP(SLJIT_JUMP)); - return; - } - break; - - case PT_LAMP: - case PT_GC: - case PT_PC: - case PT_ALNUM: - needstype = TRUE; - break; - - case PT_SC: - needsscript = TRUE; - break; - - case PT_SPACE: - case PT_PXSPACE: - case PT_WORD: - case PT_PXGRAPH: - case PT_PXPRINT: - case PT_PXPUNCT: - needstype = TRUE; - needschar = TRUE; - break; - - case PT_CLIST: - case PT_UCNC: - needschar = TRUE; - break; - - default: - SLJIT_UNREACHABLE(); - break; - } - cc += 2; - } -#endif /* SUPPORT_UNICODE */ - } -SLJIT_ASSERT(compares > 0); - -/* We are not necessary in utf mode even in 8 bit mode. */ -cc = ccbegin; -if ((cc[-1] & XCL_NOT) != 0) - read_char(common, min, max, backtracks, READ_CHAR_UPDATE_STR_PTR); -else - { -#ifdef SUPPORT_UNICODE - read_char(common, min, max, (needstype || needsscript) ? backtracks : NULL, 0); -#else /* !SUPPORT_UNICODE */ - read_char(common, min, max, NULL, 0); -#endif /* SUPPORT_UNICODE */ - } - -if ((cc[-1] & XCL_HASPROP) == 0) - { - if ((cc[-1] & XCL_MAP) != 0) - { - jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255); - if (!optimize_class(common, (const sljit_u8 *)cc, (((const sljit_u8 *)cc)[31] & 0x80) != 0, TRUE, &found)) - { - OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7); - OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3); - OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)cc); - OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0); - OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0); - add_jump(compiler, &found, JUMP(SLJIT_NOT_ZERO)); - } - - add_jump(compiler, backtracks, JUMP(SLJIT_JUMP)); - JUMPHERE(jump); - - cc += 32 / sizeof(PCRE2_UCHAR); - } - else - { - OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, min); - add_jump(compiler, (cc[-1] & XCL_NOT) == 0 ? 
backtracks : &found, CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, max - min)); - } - } -else if ((cc[-1] & XCL_MAP) != 0) - { - OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP1, 0); -#ifdef SUPPORT_UNICODE - charsaved = TRUE; -#endif /* SUPPORT_UNICODE */ - if (!optimize_class(common, (const sljit_u8 *)cc, FALSE, TRUE, list)) - { -#if PCRE2_CODE_UNIT_WIDTH == 8 - jump = NULL; - if (common->utf) -#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ - jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255); - - OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7); - OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3); - OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)cc); - OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0); - OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0); - add_jump(compiler, list, JUMP(SLJIT_NOT_ZERO)); - -#if PCRE2_CODE_UNIT_WIDTH == 8 - if (common->utf) -#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ - JUMPHERE(jump); - } - - OP1(SLJIT_MOV, TMP1, 0, RETURN_ADDR, 0); - cc += 32 / sizeof(PCRE2_UCHAR); - } - -#ifdef SUPPORT_UNICODE -if (needstype || needsscript) - { - if (needschar && !charsaved) - OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP1, 0); - -#if PCRE2_CODE_UNIT_WIDTH == 32 - if (!common->utf) - { - jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, MAX_UTF_CODE_POINT + 1); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, UNASSIGNED_UTF_CHAR); - JUMPHERE(jump); - } -#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */ - - OP2(SLJIT_LSHR, TMP2, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_SHIFT); - OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 1); - OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_stage1)); - OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_MASK); - OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, UCD_BLOCK_SHIFT); - OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_stage2)); - OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM2(TMP2, TMP1), 1); - - /* Before anything else, we deal with scripts. 
*/ - if (needsscript) - { - OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 3); - OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 2); - OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); - - OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script)); - - ccbegin = cc; - - while (*cc != XCL_END) - { - if (*cc == XCL_SINGLE) - { - cc ++; - GETCHARINCTEST(c, cc); - } - else if (*cc == XCL_RANGE) - { - cc ++; - GETCHARINCTEST(c, cc); - GETCHARINCTEST(c, cc); - } - else - { - SLJIT_ASSERT(*cc == XCL_PROP || *cc == XCL_NOTPROP); - cc++; - if (*cc == PT_SC) - { - compares--; - invertcmp = (compares == 0 && list != backtracks); - if (cc[-1] == XCL_NOTPROP) - invertcmp ^= 0x1; - jump = CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (int)cc[1]); - add_jump(compiler, compares > 0 ? list : backtracks, jump); - } - cc += 2; - } - } - - cc = ccbegin; - - if (needstype) - { - /* TMP2 has already been shifted by 2 */ - if (!needschar) - { - OP2(SLJIT_ADD, TMP1, 0, TMP2, 0, TMP2, 0); - OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); - - OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype)); - } - else - { - OP2(SLJIT_ADD, TMP1, 0, TMP2, 0, TMP2, 0); - OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0); - - OP1(SLJIT_MOV, TMP1, 0, RETURN_ADDR, 0); - OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype)); - typereg = RETURN_ADDR; - } - } - else if (needschar) - OP1(SLJIT_MOV, TMP1, 0, RETURN_ADDR, 0); - } - else if (needstype) - { - OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 3); - OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 2); - - if (!needschar) - { - OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); - - OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype)); - } - else - { - OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0); - - OP1(SLJIT_MOV, TMP1, 0, RETURN_ADDR, 0); - OP1(SLJIT_MOV_U8, 
RETURN_ADDR, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype)); - typereg = RETURN_ADDR; - } - } - else if (needschar) - OP1(SLJIT_MOV, TMP1, 0, RETURN_ADDR, 0); - } -#endif /* SUPPORT_UNICODE */ - -/* Generating code. */ -charoffset = 0; -numberofcmps = 0; -#ifdef SUPPORT_UNICODE -typeoffset = 0; -#endif /* SUPPORT_UNICODE */ - -while (*cc != XCL_END) - { - compares--; - invertcmp = (compares == 0 && list != backtracks); - jump = NULL; - - if (*cc == XCL_SINGLE) - { - cc ++; - GETCHARINCTEST(c, cc); - - if (numberofcmps < 3 && (*cc == XCL_SINGLE || *cc == XCL_RANGE)) - { - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(c - charoffset)); - OP_FLAGS(numberofcmps == 0 ? SLJIT_MOV : SLJIT_OR, TMP2, 0, SLJIT_EQUAL); - numberofcmps++; - } - else if (numberofcmps > 0) - { - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(c - charoffset)); - OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_EQUAL); - jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp); - numberofcmps = 0; - } - else - { - jump = CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (sljit_sw)(c - charoffset)); - numberofcmps = 0; - } - } - else if (*cc == XCL_RANGE) - { - cc ++; - GETCHARINCTEST(c, cc); - SET_CHAR_OFFSET(c); - GETCHARINCTEST(c, cc); - - if (numberofcmps < 3 && (*cc == XCL_SINGLE || *cc == XCL_RANGE)) - { - OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(c - charoffset)); - OP_FLAGS(numberofcmps == 0 ? 
SLJIT_MOV : SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL); - numberofcmps++; - } - else if (numberofcmps > 0) - { - OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(c - charoffset)); - OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_LESS_EQUAL); - jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp); - numberofcmps = 0; - } - else - { - jump = CMP(SLJIT_LESS_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (sljit_sw)(c - charoffset)); - numberofcmps = 0; - } - } -#ifdef SUPPORT_UNICODE - else - { - SLJIT_ASSERT(*cc == XCL_PROP || *cc == XCL_NOTPROP); - if (*cc == XCL_NOTPROP) - invertcmp ^= 0x1; - cc++; - switch(*cc) - { - case PT_ANY: - if (!invertcmp) - jump = JUMP(SLJIT_JUMP); - break; - - case PT_LAMP: - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, typereg, 0, SLJIT_IMM, ucp_Lu - typeoffset); - OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL); - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, typereg, 0, SLJIT_IMM, ucp_Ll - typeoffset); - OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL); - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, typereg, 0, SLJIT_IMM, ucp_Lt - typeoffset); - OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_EQUAL); - jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp); - break; - - case PT_GC: - c = PRIV(ucp_typerange)[(int)cc[1] * 2]; - SET_TYPE_OFFSET(c); - jump = CMP(SLJIT_LESS_EQUAL ^ invertcmp, typereg, 0, SLJIT_IMM, PRIV(ucp_typerange)[(int)cc[1] * 2 + 1] - c); - break; - - case PT_PC: - jump = CMP(SLJIT_EQUAL ^ invertcmp, typereg, 0, SLJIT_IMM, (int)cc[1] - typeoffset); - break; - - case PT_SC: - compares++; - /* Do nothing. 
*/ - break; - - case PT_SPACE: - case PT_PXSPACE: - SET_CHAR_OFFSET(9); - OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd - 0x9); - OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL); - - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x85 - 0x9); - OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL); - - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x180e - 0x9); - OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL); - - SET_TYPE_OFFSET(ucp_Zl); - OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, typereg, 0, SLJIT_IMM, ucp_Zs - ucp_Zl); - OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_LESS_EQUAL); - jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp); - break; - - case PT_WORD: - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(CHAR_UNDERSCORE - charoffset)); - OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL); - /* Fall through. */ - - case PT_ALNUM: - SET_TYPE_OFFSET(ucp_Ll); - OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, typereg, 0, SLJIT_IMM, ucp_Lu - ucp_Ll); - OP_FLAGS((*cc == PT_ALNUM) ? SLJIT_MOV : SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL); - SET_TYPE_OFFSET(ucp_Nd); - OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, typereg, 0, SLJIT_IMM, ucp_No - ucp_Nd); - OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_LESS_EQUAL); - jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp); - break; - - case PT_CLIST: - other_cases = PRIV(ucd_caseless_sets) + cc[1]; - - /* At least three characters are required. - Otherwise this case would be handled by the normal code path. */ - SLJIT_ASSERT(other_cases[0] != NOTACHAR && other_cases[1] != NOTACHAR && other_cases[2] != NOTACHAR); - SLJIT_ASSERT(other_cases[0] < other_cases[1] && other_cases[1] < other_cases[2]); - - /* Optimizing character pairs, if their difference is power of 2. 
*/ - if (is_powerof2(other_cases[1] ^ other_cases[0])) - { - if (charoffset == 0) - OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, other_cases[1] ^ other_cases[0]); - else - { - OP2(SLJIT_ADD, TMP2, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)charoffset); - OP2(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_IMM, other_cases[1] ^ other_cases[0]); - } - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, other_cases[1]); - OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL); - other_cases += 2; - } - else if (is_powerof2(other_cases[2] ^ other_cases[1])) - { - if (charoffset == 0) - OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, other_cases[2] ^ other_cases[1]); - else - { - OP2(SLJIT_ADD, TMP2, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)charoffset); - OP2(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_IMM, other_cases[1] ^ other_cases[0]); - } - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, other_cases[2]); - OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL); - - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(other_cases[0] - charoffset)); - OP_FLAGS(SLJIT_OR | ((other_cases[3] == NOTACHAR) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_EQUAL); - - other_cases += 3; - } - else - { - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(*other_cases++ - charoffset)); - OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL); - } - - while (*other_cases != NOTACHAR) - { - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(*other_cases++ - charoffset)); - OP_FLAGS(SLJIT_OR | ((*other_cases == NOTACHAR) ? 
SLJIT_SET_Z : 0), TMP2, 0, SLJIT_EQUAL); - } - jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp); - break; - - case PT_UCNC: - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(CHAR_DOLLAR_SIGN - charoffset)); - OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL); - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(CHAR_COMMERCIAL_AT - charoffset)); - OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL); - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(CHAR_GRAVE_ACCENT - charoffset)); - OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL); - - SET_CHAR_OFFSET(0xa0); - OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(0xd7ff - charoffset)); - OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL); - SET_CHAR_OFFSET(0); - OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xe000 - 0); - OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_GREATER_EQUAL); - jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp); - break; - - case PT_PXGRAPH: - /* C and Z groups are the farthest two groups. */ - SET_TYPE_OFFSET(ucp_Ll); - OP2(SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_UNUSED, 0, typereg, 0, SLJIT_IMM, ucp_So - ucp_Ll); - OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_GREATER); - - jump = CMP(SLJIT_NOT_EQUAL, typereg, 0, SLJIT_IMM, ucp_Cf - ucp_Ll); - - /* In case of ucp_Cf, we overwrite the result. */ - SET_CHAR_OFFSET(0x2066); - OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x2069 - 0x2066); - OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL); - - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x061c - 0x2066); - OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL); - - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x180e - 0x2066); - OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL); - - JUMPHERE(jump); - jump = CMP(SLJIT_ZERO ^ invertcmp, TMP2, 0, SLJIT_IMM, 0); - break; - - case PT_PXPRINT: - /* C and Z groups are the farthest two groups. 
*/ - SET_TYPE_OFFSET(ucp_Ll); - OP2(SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_UNUSED, 0, typereg, 0, SLJIT_IMM, ucp_So - ucp_Ll); - OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_GREATER); - - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, typereg, 0, SLJIT_IMM, ucp_Zs - ucp_Ll); - OP_FLAGS(SLJIT_AND, TMP2, 0, SLJIT_NOT_EQUAL); - - jump = CMP(SLJIT_NOT_EQUAL, typereg, 0, SLJIT_IMM, ucp_Cf - ucp_Ll); - - /* In case of ucp_Cf, we overwrite the result. */ - SET_CHAR_OFFSET(0x2066); - OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x2069 - 0x2066); - OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL); - - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x061c - 0x2066); - OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL); - - JUMPHERE(jump); - jump = CMP(SLJIT_ZERO ^ invertcmp, TMP2, 0, SLJIT_IMM, 0); - break; - - case PT_PXPUNCT: - SET_TYPE_OFFSET(ucp_Sc); - OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, typereg, 0, SLJIT_IMM, ucp_So - ucp_Sc); - OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL); - - SET_CHAR_OFFSET(0); - OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x7f); - OP_FLAGS(SLJIT_AND, TMP2, 0, SLJIT_LESS_EQUAL); - - SET_TYPE_OFFSET(ucp_Pc); - OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, typereg, 0, SLJIT_IMM, ucp_Ps - ucp_Pc); - OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_LESS_EQUAL); - jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp); - break; - - default: - SLJIT_UNREACHABLE(); - break; - } - cc += 2; - } -#endif /* SUPPORT_UNICODE */ - - if (jump != NULL) - add_jump(compiler, compares > 0 ? 
list : backtracks, jump); - } - -if (found != NULL) - set_jumps(found, LABEL()); -} - -#undef SET_TYPE_OFFSET -#undef SET_CHAR_OFFSET - -#endif - -static PCRE2_SPTR compile_simple_assertion_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks) -{ -DEFINE_COMPILER; -int length; -struct sljit_jump *jump[4]; -#ifdef SUPPORT_UNICODE -struct sljit_label *label; -#endif /* SUPPORT_UNICODE */ - -switch(type) - { - case OP_SOD: - if (HAS_VIRTUAL_REGISTERS) - { - OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin)); - } - else - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, begin)); - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, STR_PTR, 0, TMP1, 0)); - return cc; - - case OP_SOM: - if (HAS_VIRTUAL_REGISTERS) - { - OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, str)); - } - else - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, str)); - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, STR_PTR, 0, TMP1, 0)); - return cc; - - case OP_NOT_WORD_BOUNDARY: - case OP_WORD_BOUNDARY: - add_jump(compiler, &common->wordboundary, JUMP(SLJIT_FAST_CALL)); -#ifdef SUPPORT_UNICODE - if (common->invalid_utf) - { - add_jump(compiler, backtracks, CMP((type == OP_NOT_WORD_BOUNDARY) ? SLJIT_NOT_EQUAL : SLJIT_SIG_LESS_EQUAL, TMP2, 0, SLJIT_IMM, 0)); - return cc; - } -#endif /* SUPPORT_UNICODE */ - sljit_set_current_flags(compiler, SLJIT_SET_Z); - add_jump(compiler, backtracks, JUMP(type == OP_NOT_WORD_BOUNDARY ? SLJIT_NOT_ZERO : SLJIT_ZERO)); - return cc; - - case OP_EODN: - /* Requires rather complex checks. 
*/ - jump[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); - if (common->nltype == NLTYPE_FIXED && common->newline > 255) - { - OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); - if (common->mode == PCRE2_JIT_COMPLETE) - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP2, 0, STR_END, 0)); - else - { - jump[1] = CMP(SLJIT_EQUAL, TMP2, 0, STR_END, 0); - OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP2, 0, STR_END, 0); - OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS); - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff); - OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_NOT_EQUAL); - add_jump(compiler, backtracks, JUMP(SLJIT_NOT_EQUAL)); - check_partial(common, TRUE); - add_jump(compiler, backtracks, JUMP(SLJIT_JUMP)); - JUMPHERE(jump[1]); - } - OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff)); - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff)); - } - else if (common->nltype == NLTYPE_FIXED) - { - OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP2, 0, STR_END, 0)); - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline)); - } - else - { - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); - jump[1] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR); - OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); - OP2(SLJIT_SUB | SLJIT_SET_Z | SLJIT_SET_GREATER, SLJIT_UNUSED, 0, TMP2, 0, STR_END, 0); - jump[2] = JUMP(SLJIT_GREATER); - add_jump(compiler, backtracks, JUMP(SLJIT_NOT_EQUAL) /* LESS */); - /* Equal. 
*/ - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); - jump[3] = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL); - add_jump(compiler, backtracks, JUMP(SLJIT_JUMP)); - - JUMPHERE(jump[1]); - if (common->nltype == NLTYPE_ANYCRLF) - { - OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP2, 0, STR_END, 0)); - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL)); - } - else - { - OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0); - read_char(common, common->nlmin, common->nlmax, backtracks, READ_CHAR_UPDATE_STR_PTR); - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, STR_PTR, 0, STR_END, 0)); - add_jump(compiler, &common->anynewline, JUMP(SLJIT_FAST_CALL)); - sljit_set_current_flags(compiler, SLJIT_SET_Z); - add_jump(compiler, backtracks, JUMP(SLJIT_ZERO)); - OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0); - } - JUMPHERE(jump[2]); - JUMPHERE(jump[3]); - } - JUMPHERE(jump[0]); - if (common->mode != PCRE2_JIT_COMPLETE) - check_partial(common, TRUE); - return cc; - - case OP_EOD: - add_jump(compiler, backtracks, CMP(SLJIT_LESS, STR_PTR, 0, STR_END, 0)); - if (common->mode != PCRE2_JIT_COMPLETE) - check_partial(common, TRUE); - return cc; - - case OP_DOLL: - if (HAS_VIRTUAL_REGISTERS) - { - OP1(SLJIT_MOV, TMP2, 0, ARGUMENTS, 0); - OP2(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_UNUSED, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTEOL); - } - else - OP2(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_UNUSED, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTEOL); - add_jump(compiler, backtracks, JUMP(SLJIT_NOT_ZERO32)); - - if (!common->endonly) - compile_simple_assertion_matchingpath(common, OP_EODN, cc, backtracks); - else - { - add_jump(compiler, backtracks, CMP(SLJIT_LESS, STR_PTR, 0, STR_END, 0)); - check_partial(common, FALSE); - } - return cc; - - case OP_DOLLM: - jump[1] = CMP(SLJIT_LESS, STR_PTR, 0, STR_END, 0); - if (HAS_VIRTUAL_REGISTERS) 
- { - OP1(SLJIT_MOV, TMP2, 0, ARGUMENTS, 0); - OP2(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_UNUSED, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTEOL); - } - else - OP2(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_UNUSED, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTEOL); - add_jump(compiler, backtracks, JUMP(SLJIT_NOT_ZERO32)); - check_partial(common, FALSE); - jump[0] = JUMP(SLJIT_JUMP); - JUMPHERE(jump[1]); - - if (common->nltype == NLTYPE_FIXED && common->newline > 255) - { - OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); - if (common->mode == PCRE2_JIT_COMPLETE) - add_jump(compiler, backtracks, CMP(SLJIT_GREATER, TMP2, 0, STR_END, 0)); - else - { - jump[1] = CMP(SLJIT_LESS_EQUAL, TMP2, 0, STR_END, 0); - /* STR_PTR = STR_END - IN_UCHARS(1) */ - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff)); - check_partial(common, TRUE); - add_jump(compiler, backtracks, JUMP(SLJIT_JUMP)); - JUMPHERE(jump[1]); - } - - OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff)); - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff)); - } - else - { - peek_char(common, common->nlmax, TMP3, 0, NULL); - check_newlinechar(common, common->nltype, backtracks, FALSE); - } - JUMPHERE(jump[0]); - return cc; - - case OP_CIRC: - if (HAS_VIRTUAL_REGISTERS) - { - OP1(SLJIT_MOV, TMP2, 0, ARGUMENTS, 0); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, begin)); - add_jump(compiler, backtracks, CMP(SLJIT_GREATER, STR_PTR, 0, TMP1, 0)); - OP2(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_UNUSED, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTBOL); - add_jump(compiler, backtracks, JUMP(SLJIT_NOT_ZERO32)); - } - else - 
{ - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, begin)); - add_jump(compiler, backtracks, CMP(SLJIT_GREATER, STR_PTR, 0, TMP1, 0)); - OP2(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_UNUSED, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTBOL); - add_jump(compiler, backtracks, JUMP(SLJIT_NOT_ZERO32)); - } - return cc; - - case OP_CIRCM: - /* TMP2 might be used by peek_char_back. */ - if (HAS_VIRTUAL_REGISTERS) - { - OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin)); - jump[1] = CMP(SLJIT_GREATER, STR_PTR, 0, TMP2, 0); - OP2(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_UNUSED, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTBOL); - } - else - { - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, begin)); - jump[1] = CMP(SLJIT_GREATER, STR_PTR, 0, TMP2, 0); - OP2(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_UNUSED, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTBOL); - } - add_jump(compiler, backtracks, JUMP(SLJIT_NOT_ZERO32)); - jump[0] = JUMP(SLJIT_JUMP); - JUMPHERE(jump[1]); - - if (!common->alt_circumflex) - add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0)); - - if (common->nltype == NLTYPE_FIXED && common->newline > 255) - { - OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); - add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP1, 0, TMP2, 0)); - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2)); - OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1)); - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff)); - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff)); - } - else - { - peek_char_back(common, common->nlmax, backtracks); - check_newlinechar(common, common->nltype, backtracks, FALSE); - } - 
JUMPHERE(jump[0]); - return cc; - - case OP_REVERSE: - length = GET(cc, 0); - if (length == 0) - return cc + LINK_SIZE; - if (HAS_VIRTUAL_REGISTERS) - { - OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin)); - } - else - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, begin)); -#ifdef SUPPORT_UNICODE - if (common->utf) - { - OP1(SLJIT_MOV, TMP3, 0, SLJIT_IMM, length); - label = LABEL(); - add_jump(compiler, backtracks, CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, TMP2, 0)); - move_back(common, backtracks, FALSE); - OP2(SLJIT_SUB | SLJIT_SET_Z, TMP3, 0, TMP3, 0, SLJIT_IMM, 1); - JUMPTO(SLJIT_NOT_ZERO, label); - } - else -#endif - { - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(length)); - add_jump(compiler, backtracks, CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0)); - } - check_start_used_ptr(common); - return cc + LINK_SIZE; - } -SLJIT_UNREACHABLE(); -return cc; -} - -#ifdef SUPPORT_UNICODE - -#if PCRE2_CODE_UNIT_WIDTH != 32 - -static PCRE2_SPTR SLJIT_FUNC do_extuni_utf(jit_arguments *args, PCRE2_SPTR cc) -{ -PCRE2_SPTR start_subject = args->begin; -PCRE2_SPTR end_subject = args->end; -int lgb, rgb, ricount; -PCRE2_SPTR prevcc, endcc, bptr; -BOOL first = TRUE; -uint32_t c; - -prevcc = cc; -endcc = NULL; -do - { - GETCHARINC(c, cc); - rgb = UCD_GRAPHBREAK(c); - - if (first) - { - lgb = rgb; - endcc = cc; - first = FALSE; - continue; - } - - if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) - break; - - /* Not breaking between Regional Indicators is allowed only if there - are an even number of preceding RIs. 
*/ - - if (lgb == ucp_gbRegionalIndicator && rgb == ucp_gbRegionalIndicator) - { - ricount = 0; - bptr = prevcc; - - /* bptr is pointing to the left-hand character */ - while (bptr > start_subject) - { - bptr--; - BACKCHAR(bptr); - GETCHAR(c, bptr); - - if (UCD_GRAPHBREAK(c) != ucp_gbRegionalIndicator) - break; - - ricount++; - } - - if ((ricount & 1) != 0) break; /* Grapheme break required */ - } - - /* If Extend or ZWJ follows Extended_Pictographic, do not update lgb; this - allows any number of them before a following Extended_Pictographic. */ - - if ((rgb != ucp_gbExtend && rgb != ucp_gbZWJ) || - lgb != ucp_gbExtended_Pictographic) - lgb = rgb; - - prevcc = endcc; - endcc = cc; - } -while (cc < end_subject); - -return endcc; -} - -#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ - -static PCRE2_SPTR SLJIT_FUNC do_extuni_utf_invalid(jit_arguments *args, PCRE2_SPTR cc) -{ -PCRE2_SPTR start_subject = args->begin; -PCRE2_SPTR end_subject = args->end; -int lgb, rgb, ricount; -PCRE2_SPTR prevcc, endcc, bptr; -BOOL first = TRUE; -uint32_t c; - -prevcc = cc; -endcc = NULL; -do - { - GETCHARINC_INVALID(c, cc, end_subject, break); - rgb = UCD_GRAPHBREAK(c); - - if (first) - { - lgb = rgb; - endcc = cc; - first = FALSE; - continue; - } - - if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) - break; - - /* Not breaking between Regional Indicators is allowed only if there - are an even number of preceding RIs. */ - - if (lgb == ucp_gbRegionalIndicator && rgb == ucp_gbRegionalIndicator) - { - ricount = 0; - bptr = prevcc; - - /* bptr is pointing to the left-hand character */ - while (bptr > start_subject) - { - GETCHARBACK_INVALID(c, bptr, start_subject, break); - - if (UCD_GRAPHBREAK(c) != ucp_gbRegionalIndicator) - break; - - ricount++; - } - - if ((ricount & 1) != 0) - break; /* Grapheme break required */ - } - - /* If Extend or ZWJ follows Extended_Pictographic, do not update lgb; this - allows any number of them before a following Extended_Pictographic. 
*/ - - if ((rgb != ucp_gbExtend && rgb != ucp_gbZWJ) || - lgb != ucp_gbExtended_Pictographic) - lgb = rgb; - - prevcc = endcc; - endcc = cc; - } -while (cc < end_subject); - -return endcc; -} - -static PCRE2_SPTR SLJIT_FUNC do_extuni_no_utf(jit_arguments *args, PCRE2_SPTR cc) -{ -PCRE2_SPTR start_subject = args->begin; -PCRE2_SPTR end_subject = args->end; -int lgb, rgb, ricount; -PCRE2_SPTR bptr; -uint32_t c; - -/* Patch by PH */ -/* GETCHARINC(c, cc); */ -c = *cc++; - -#if PCRE2_CODE_UNIT_WIDTH == 32 -if (c >= 0x110000) - return NULL; -#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */ -lgb = UCD_GRAPHBREAK(c); - -while (cc < end_subject) - { - c = *cc; -#if PCRE2_CODE_UNIT_WIDTH == 32 - if (c >= 0x110000) - break; -#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */ - rgb = UCD_GRAPHBREAK(c); - - if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) - break; - - /* Not breaking between Regional Indicators is allowed only if there - are an even number of preceding RIs. */ - - if (lgb == ucp_gbRegionalIndicator && rgb == ucp_gbRegionalIndicator) - { - ricount = 0; - bptr = cc - 1; - - /* bptr is pointing to the left-hand character */ - while (bptr > start_subject) - { - bptr--; - c = *bptr; -#if PCRE2_CODE_UNIT_WIDTH == 32 - if (c >= 0x110000) - break; -#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */ - - if (UCD_GRAPHBREAK(c) != ucp_gbRegionalIndicator) break; - - ricount++; - } - - if ((ricount & 1) != 0) - break; /* Grapheme break required */ - } - - /* If Extend or ZWJ follows Extended_Pictographic, do not update lgb; this - allows any number of them before a following Extended_Pictographic. 
*/ - - if ((rgb != ucp_gbExtend && rgb != ucp_gbZWJ) || - lgb != ucp_gbExtended_Pictographic) - lgb = rgb; - - cc++; - } - -return cc; -} - -#endif /* SUPPORT_UNICODE */ - -static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks, BOOL check_str_ptr) -{ -DEFINE_COMPILER; -int length; -unsigned int c, oc, bit; -compare_context context; -struct sljit_jump *jump[3]; -jump_list *end_list; -#ifdef SUPPORT_UNICODE -PCRE2_UCHAR propdata[5]; -#endif /* SUPPORT_UNICODE */ - -switch(type) - { - case OP_NOT_DIGIT: - case OP_DIGIT: - /* Digits are usually 0-9, so it is worth to optimize them. */ - if (check_str_ptr) - detect_partial_match(common, backtracks); -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 - if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_digit, FALSE)) - read_char7_type(common, backtracks, type == OP_NOT_DIGIT); - else -#endif - read_char8_type(common, backtracks, type == OP_NOT_DIGIT); - /* Flip the starting bit in the negative case. */ - OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, ctype_digit); - add_jump(compiler, backtracks, JUMP(type == OP_DIGIT ? SLJIT_ZERO : SLJIT_NOT_ZERO)); - return cc; - - case OP_NOT_WHITESPACE: - case OP_WHITESPACE: - if (check_str_ptr) - detect_partial_match(common, backtracks); -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 - if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_space, FALSE)) - read_char7_type(common, backtracks, type == OP_NOT_WHITESPACE); - else -#endif - read_char8_type(common, backtracks, type == OP_NOT_WHITESPACE); - OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, ctype_space); - add_jump(compiler, backtracks, JUMP(type == OP_WHITESPACE ? 
SLJIT_ZERO : SLJIT_NOT_ZERO)); - return cc; - - case OP_NOT_WORDCHAR: - case OP_WORDCHAR: - if (check_str_ptr) - detect_partial_match(common, backtracks); -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 - if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_word, FALSE)) - read_char7_type(common, backtracks, type == OP_NOT_WORDCHAR); - else -#endif - read_char8_type(common, backtracks, type == OP_NOT_WORDCHAR); - OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, ctype_word); - add_jump(compiler, backtracks, JUMP(type == OP_WORDCHAR ? SLJIT_ZERO : SLJIT_NOT_ZERO)); - return cc; - - case OP_ANY: - if (check_str_ptr) - detect_partial_match(common, backtracks); - read_char(common, common->nlmin, common->nlmax, backtracks, READ_CHAR_UPDATE_STR_PTR); - if (common->nltype == NLTYPE_FIXED && common->newline > 255) - { - jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff); - end_list = NULL; - if (common->mode != PCRE2_JIT_PARTIAL_HARD) - add_jump(compiler, &end_list, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0)); - else - check_str_end(common, &end_list); - - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline & 0xff)); - set_jumps(end_list, LABEL()); - JUMPHERE(jump[0]); - } - else - check_newlinechar(common, common->nltype, backtracks, TRUE); - return cc; - - case OP_ALLANY: - if (check_str_ptr) - detect_partial_match(common, backtracks); -#ifdef SUPPORT_UNICODE - if (common->utf) - { - if (common->invalid_utf) - { - read_char(common, 0, READ_CHAR_MAX, backtracks, READ_CHAR_UPDATE_STR_PTR); - return cc; - } - -#if PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16 - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -#if PCRE2_CODE_UNIT_WIDTH == 8 - jump[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xc0); - OP1(SLJIT_MOV_U8, 
TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); -#elif PCRE2_CODE_UNIT_WIDTH == 16 - jump[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xd800); - OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00); - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd800); - OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_EQUAL); - OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); -#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ - JUMPHERE(jump[0]); - return cc; -#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16] */ - } -#endif /* SUPPORT_UNICODE */ - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - return cc; - - case OP_ANYBYTE: - if (check_str_ptr) - detect_partial_match(common, backtracks); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - return cc; - -#ifdef SUPPORT_UNICODE - case OP_NOTPROP: - case OP_PROP: - propdata[0] = XCL_HASPROP; - propdata[1] = type == OP_NOTPROP ? XCL_NOTPROP : XCL_PROP; - propdata[2] = cc[0]; - propdata[3] = cc[1]; - propdata[4] = XCL_END; - if (check_str_ptr) - detect_partial_match(common, backtracks); - compile_xclass_matchingpath(common, propdata, backtracks); - return cc + 2; -#endif - - case OP_ANYNL: - if (check_str_ptr) - detect_partial_match(common, backtracks); - read_char(common, common->bsr_nlmin, common->bsr_nlmax, NULL, 0); - jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR); - /* We don't need to handle soft partial matching case. 
*/ - end_list = NULL; - if (common->mode != PCRE2_JIT_PARTIAL_HARD) - add_jump(compiler, &end_list, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0)); - else - check_str_end(common, &end_list); - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); - jump[1] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - jump[2] = JUMP(SLJIT_JUMP); - JUMPHERE(jump[0]); - check_newlinechar(common, common->bsr_nltype, backtracks, FALSE); - set_jumps(end_list, LABEL()); - JUMPHERE(jump[1]); - JUMPHERE(jump[2]); - return cc; - - case OP_NOT_HSPACE: - case OP_HSPACE: - if (check_str_ptr) - detect_partial_match(common, backtracks); - - if (type == OP_NOT_HSPACE) - read_char(common, 0x9, 0x3000, backtracks, READ_CHAR_UPDATE_STR_PTR); - else - read_char(common, 0x9, 0x3000, NULL, 0); - - add_jump(compiler, &common->hspace, JUMP(SLJIT_FAST_CALL)); - sljit_set_current_flags(compiler, SLJIT_SET_Z); - add_jump(compiler, backtracks, JUMP(type == OP_NOT_HSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO)); - return cc; - - case OP_NOT_VSPACE: - case OP_VSPACE: - if (check_str_ptr) - detect_partial_match(common, backtracks); - - if (type == OP_NOT_VSPACE) - read_char(common, 0xa, 0x2029, backtracks, READ_CHAR_UPDATE_STR_PTR); - else - read_char(common, 0xa, 0x2029, NULL, 0); - - add_jump(compiler, &common->vspace, JUMP(SLJIT_FAST_CALL)); - sljit_set_current_flags(compiler, SLJIT_SET_Z); - add_jump(compiler, backtracks, JUMP(type == OP_NOT_VSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO)); - return cc; - -#ifdef SUPPORT_UNICODE - case OP_EXTUNI: - if (check_str_ptr) - detect_partial_match(common, backtracks); - - SLJIT_ASSERT(TMP1 == SLJIT_R0 && STR_PTR == SLJIT_R1); - OP1(SLJIT_MOV, SLJIT_R0, 0, ARGUMENTS, 0); - -#if PCRE2_CODE_UNIT_WIDTH != 32 - sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW), SLJIT_IMM, - common->utf ? (common->invalid_utf ? 
SLJIT_FUNC_OFFSET(do_extuni_utf_invalid) : SLJIT_FUNC_OFFSET(do_extuni_utf)) : SLJIT_FUNC_OFFSET(do_extuni_no_utf)); - if (common->invalid_utf) - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0)); -#else - sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW), SLJIT_IMM, - common->invalid_utf ? SLJIT_FUNC_OFFSET(do_extuni_utf_invalid) : SLJIT_FUNC_OFFSET(do_extuni_no_utf)); - if (!common->utf || common->invalid_utf) - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0)); -#endif - - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0); - - if (common->mode == PCRE2_JIT_PARTIAL_HARD) - { - jump[0] = CMP(SLJIT_LESS, SLJIT_RETURN_REG, 0, STR_END, 0); - /* Since we successfully read a char above, partial matching must occure. */ - check_partial(common, TRUE); - JUMPHERE(jump[0]); - } - return cc; -#endif - - case OP_CHAR: - case OP_CHARI: - length = 1; -#ifdef SUPPORT_UNICODE - if (common->utf && HAS_EXTRALEN(*cc)) length += GET_EXTRALEN(*cc); -#endif - - if (check_str_ptr && common->mode != PCRE2_JIT_COMPLETE) - detect_partial_match(common, backtracks); - - if (type == OP_CHAR || !char_has_othercase(common, cc) || char_get_othercase_bit(common, cc) != 0) - { - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(length)); - if (length > 1 || (check_str_ptr && common->mode == PCRE2_JIT_COMPLETE)) - add_jump(compiler, backtracks, CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0)); - - context.length = IN_UCHARS(length); - context.sourcereg = -1; -#if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED - context.ucharptr = 0; -#endif - return byte_sequence_compare(common, type == OP_CHARI, cc, &context, backtracks); - } - -#ifdef SUPPORT_UNICODE - if (common->utf) - { - GETCHAR(c, cc); - } - else -#endif - c = *cc; - - SLJIT_ASSERT(type == OP_CHARI && char_has_othercase(common, cc)); - - if (check_str_ptr && common->mode == PCRE2_JIT_COMPLETE) - add_jump(compiler, backtracks, 
CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0)); - - oc = char_othercase(common, c); - read_char(common, c < oc ? c : oc, c > oc ? c : oc, NULL, 0); - - SLJIT_ASSERT(!is_powerof2(c ^ oc)); - - if (sljit_has_cpu_feature(SLJIT_HAS_CMOV)) - { - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, oc); - CMOV(SLJIT_EQUAL, TMP1, SLJIT_IMM, c); - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c)); - } - else - { - jump[0] = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c); - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, oc)); - JUMPHERE(jump[0]); - } - return cc + length; - - case OP_NOT: - case OP_NOTI: - if (check_str_ptr) - detect_partial_match(common, backtracks); - - length = 1; -#ifdef SUPPORT_UNICODE - if (common->utf) - { -#if PCRE2_CODE_UNIT_WIDTH == 8 - c = *cc; - if (c < 128 && !common->invalid_utf) - { - OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); - if (type == OP_NOT || !char_has_othercase(common, cc)) - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c)); - else - { - /* Since UTF8 code page is fixed, we know that c is in [a-z] or [A-Z] range. */ - OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x20); - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, c | 0x20)); - } - /* Skip the variable-length character. 
*/ - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - jump[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xc0); - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); - JUMPHERE(jump[0]); - return cc + 1; - } - else -#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ - { - GETCHARLEN(c, cc, length); - } - } - else -#endif /* SUPPORT_UNICODE */ - c = *cc; - - if (type == OP_NOT || !char_has_othercase(common, cc)) - { - read_char(common, c, c, backtracks, READ_CHAR_UPDATE_STR_PTR); - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c)); - } - else - { - oc = char_othercase(common, c); - read_char(common, c < oc ? c : oc, c > oc ? c : oc, backtracks, READ_CHAR_UPDATE_STR_PTR); - bit = c ^ oc; - if (is_powerof2(bit)) - { - OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, bit); - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c | bit)); - } - else - { - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c)); - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, oc)); - } - } - return cc + length; - - case OP_CLASS: - case OP_NCLASS: - if (check_str_ptr) - detect_partial_match(common, backtracks); - -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 - bit = (common->utf && is_char7_bitset((const sljit_u8 *)cc, type == OP_NCLASS)) ? 
127 : 255; - if (type == OP_NCLASS) - read_char(common, 0, bit, backtracks, READ_CHAR_UPDATE_STR_PTR); - else - read_char(common, 0, bit, NULL, 0); -#else - if (type == OP_NCLASS) - read_char(common, 0, 255, backtracks, READ_CHAR_UPDATE_STR_PTR); - else - read_char(common, 0, 255, NULL, 0); -#endif - - if (optimize_class(common, (const sljit_u8 *)cc, type == OP_NCLASS, FALSE, backtracks)) - return cc + 32 / sizeof(PCRE2_UCHAR); - -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 - jump[0] = NULL; - if (common->utf) - { - jump[0] = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, bit); - if (type == OP_CLASS) - { - add_jump(compiler, backtracks, jump[0]); - jump[0] = NULL; - } - } -#elif PCRE2_CODE_UNIT_WIDTH != 8 - jump[0] = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255); - if (type == OP_CLASS) - { - add_jump(compiler, backtracks, jump[0]); - jump[0] = NULL; - } -#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */ - - OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7); - OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3); - OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)cc); - OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0); - OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0); - add_jump(compiler, backtracks, JUMP(SLJIT_ZERO)); - -#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8 - if (jump[0] != NULL) - JUMPHERE(jump[0]); -#endif - return cc + 32 / sizeof(PCRE2_UCHAR); - -#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32 - case OP_XCLASS: - if (check_str_ptr) - detect_partial_match(common, backtracks); - compile_xclass_matchingpath(common, cc + LINK_SIZE, backtracks); - return cc + GET(cc, 0) - 1; -#endif - } -SLJIT_UNREACHABLE(); -return cc; -} - -static SLJIT_INLINE PCRE2_SPTR compile_charn_matchingpath(compiler_common *common, PCRE2_SPTR cc, PCRE2_SPTR ccend, jump_list **backtracks) -{ -/* This function consumes at least one input character. 
*/ -/* To decrease the number of length checks, we try to concatenate the fixed length character sequences. */ -DEFINE_COMPILER; -PCRE2_SPTR ccbegin = cc; -compare_context context; -int size; - -context.length = 0; -do - { - if (cc >= ccend) - break; - - if (*cc == OP_CHAR) - { - size = 1; -#ifdef SUPPORT_UNICODE - if (common->utf && HAS_EXTRALEN(cc[1])) - size += GET_EXTRALEN(cc[1]); -#endif - } - else if (*cc == OP_CHARI) - { - size = 1; -#ifdef SUPPORT_UNICODE - if (common->utf) - { - if (char_has_othercase(common, cc + 1) && char_get_othercase_bit(common, cc + 1) == 0) - size = 0; - else if (HAS_EXTRALEN(cc[1])) - size += GET_EXTRALEN(cc[1]); - } - else -#endif - if (char_has_othercase(common, cc + 1) && char_get_othercase_bit(common, cc + 1) == 0) - size = 0; - } - else - size = 0; - - cc += 1 + size; - context.length += IN_UCHARS(size); - } -while (size > 0 && context.length <= 128); - -cc = ccbegin; -if (context.length > 0) - { - /* We have a fixed-length byte sequence. */ - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, context.length); - add_jump(compiler, backtracks, CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0)); - - context.sourcereg = -1; -#if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED - context.ucharptr = 0; -#endif - do cc = byte_sequence_compare(common, *cc == OP_CHARI, cc + 1, &context, backtracks); while (context.length > 0); - return cc; - } - -/* A non-fixed length character will be checked if length == 0. */ -return compile_char1_matchingpath(common, *cc, cc + 1, backtracks, TRUE); -} - -/* Forward definitions. 
*/ -static void compile_matchingpath(compiler_common *, PCRE2_SPTR, PCRE2_SPTR, backtrack_common *); -static void compile_backtrackingpath(compiler_common *, struct backtrack_common *); - -#define PUSH_BACKTRACK(size, ccstart, error) \ - do \ - { \ - backtrack = sljit_alloc_memory(compiler, (size)); \ - if (SLJIT_UNLIKELY(sljit_get_compiler_error(compiler))) \ - return error; \ - memset(backtrack, 0, size); \ - backtrack->prev = parent->top; \ - backtrack->cc = (ccstart); \ - parent->top = backtrack; \ - } \ - while (0) - -#define PUSH_BACKTRACK_NOVALUE(size, ccstart) \ - do \ - { \ - backtrack = sljit_alloc_memory(compiler, (size)); \ - if (SLJIT_UNLIKELY(sljit_get_compiler_error(compiler))) \ - return; \ - memset(backtrack, 0, size); \ - backtrack->prev = parent->top; \ - backtrack->cc = (ccstart); \ - parent->top = backtrack; \ - } \ - while (0) - -#define BACKTRACK_AS(type) ((type *)backtrack) - -static void compile_dnref_search(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks) -{ -/* The OVECTOR offset goes to TMP2. 
*/ -DEFINE_COMPILER; -int count = GET2(cc, 1 + IMM2_SIZE); -PCRE2_SPTR slot = common->name_table + GET2(cc, 1) * common->name_entry_size; -unsigned int offset; -jump_list *found = NULL; - -SLJIT_ASSERT(*cc == OP_DNREF || *cc == OP_DNREFI); - -OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(1)); - -count--; -while (count-- > 0) - { - offset = GET2(slot, 0) << 1; - GET_LOCAL_BASE(TMP2, 0, OVECTOR(offset)); - add_jump(compiler, &found, CMP(SLJIT_NOT_EQUAL, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset), TMP1, 0)); - slot += common->name_entry_size; - } - -offset = GET2(slot, 0) << 1; -GET_LOCAL_BASE(TMP2, 0, OVECTOR(offset)); -if (backtracks != NULL && !common->unset_backref) - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset), TMP1, 0)); - -set_jumps(found, LABEL()); -} - -static void compile_ref_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks, BOOL withchecks, BOOL emptyfail) -{ -DEFINE_COMPILER; -BOOL ref = (*cc == OP_REF || *cc == OP_REFI); -int offset = 0; -struct sljit_jump *jump = NULL; -struct sljit_jump *partial; -struct sljit_jump *nopartial; -#if defined SUPPORT_UNICODE -struct sljit_label *loop; -struct sljit_label *caseless_loop; -jump_list *no_match = NULL; -int source_reg = COUNT_MATCH; -int source_end_reg = ARGUMENTS; -int char1_reg = STACK_LIMIT; -#endif /* SUPPORT_UNICODE */ - -if (ref) - { - offset = GET2(cc, 1) << 1; - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset)); - /* OVECTOR(1) contains the "string begin - 1" constant. 
*/ - if (withchecks && !common->unset_backref) - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(1))); - } -else - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP2), 0); - -#if defined SUPPORT_UNICODE -if (common->utf && *cc == OP_REFI) - { - SLJIT_ASSERT(common->iref_ptr != 0); - - if (ref) - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset + 1)); - else - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP2), sizeof(sljit_sw)); - - if (withchecks && emptyfail) - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, TMP2, 0)); - - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->iref_ptr, source_reg, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->iref_ptr + sizeof(sljit_sw), source_end_reg, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->iref_ptr + sizeof(sljit_sw) * 2, char1_reg, 0); - - OP1(SLJIT_MOV, source_reg, 0, TMP1, 0); - OP1(SLJIT_MOV, source_end_reg, 0, TMP2, 0); - - loop = LABEL(); - jump = CMP(SLJIT_GREATER_EQUAL, source_reg, 0, source_end_reg, 0); - partial = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); - - /* Read original character. It must be a valid UTF character. */ - OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0); - OP1(SLJIT_MOV, STR_PTR, 0, source_reg, 0); - - read_char(common, 0, READ_CHAR_MAX, NULL, READ_CHAR_UPDATE_STR_PTR | READ_CHAR_VALID_UTF); - - OP1(SLJIT_MOV, source_reg, 0, STR_PTR, 0); - OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0); - OP1(SLJIT_MOV, char1_reg, 0, TMP1, 0); - - /* Read second character. 
*/ - read_char(common, 0, READ_CHAR_MAX, &no_match, READ_CHAR_UPDATE_STR_PTR); - - CMPTO(SLJIT_EQUAL, TMP1, 0, char1_reg, 0, loop); - - OP1(SLJIT_MOV, TMP3, 0, TMP1, 0); - - add_jump(compiler, &common->getucd, JUMP(SLJIT_FAST_CALL)); - - OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2); - OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3); - OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0); - - OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records)); - - OP1(SLJIT_MOV_S32, TMP1, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(ucd_record, other_case)); - OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(ucd_record, caseset)); - OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0); - CMPTO(SLJIT_EQUAL, TMP1, 0, char1_reg, 0, loop); - - add_jump(compiler, &no_match, CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, 0)); - OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 2); - OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_caseless_sets)); - - caseless_loop = LABEL(); - OP1(SLJIT_MOV_U32, TMP1, 0, SLJIT_MEM1(TMP2), 0); - OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, sizeof(uint32_t)); - OP2(SLJIT_SUB | SLJIT_SET_Z | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, char1_reg, 0); - JUMPTO(SLJIT_EQUAL, loop); - JUMPTO(SLJIT_LESS, caseless_loop); - - set_jumps(no_match, LABEL()); - if (common->mode == PCRE2_JIT_COMPLETE) - JUMPHERE(partial); - - OP1(SLJIT_MOV, source_reg, 0, SLJIT_MEM1(SLJIT_SP), common->iref_ptr); - OP1(SLJIT_MOV, source_end_reg, 0, SLJIT_MEM1(SLJIT_SP), common->iref_ptr + sizeof(sljit_sw)); - OP1(SLJIT_MOV, char1_reg, 0, SLJIT_MEM1(SLJIT_SP), common->iref_ptr + sizeof(sljit_sw) * 2); - add_jump(compiler, backtracks, JUMP(SLJIT_JUMP)); - - if (common->mode != PCRE2_JIT_COMPLETE) - { - JUMPHERE(partial); - OP1(SLJIT_MOV, source_reg, 0, SLJIT_MEM1(SLJIT_SP), common->iref_ptr); - OP1(SLJIT_MOV, source_end_reg, 0, SLJIT_MEM1(SLJIT_SP), common->iref_ptr + sizeof(sljit_sw)); - OP1(SLJIT_MOV, char1_reg, 0, SLJIT_MEM1(SLJIT_SP), common->iref_ptr + sizeof(sljit_sw) * 2); - - 
check_partial(common, FALSE); - add_jump(compiler, backtracks, JUMP(SLJIT_JUMP)); - } - - JUMPHERE(jump); - OP1(SLJIT_MOV, source_reg, 0, SLJIT_MEM1(SLJIT_SP), common->iref_ptr); - OP1(SLJIT_MOV, source_end_reg, 0, SLJIT_MEM1(SLJIT_SP), common->iref_ptr + sizeof(sljit_sw)); - OP1(SLJIT_MOV, char1_reg, 0, SLJIT_MEM1(SLJIT_SP), common->iref_ptr + sizeof(sljit_sw) * 2); - return; - } -else -#endif /* SUPPORT_UNICODE */ - { - if (ref) - OP2(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset + 1), TMP1, 0); - else - OP2(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, SLJIT_MEM1(TMP2), sizeof(sljit_sw), TMP1, 0); - - if (withchecks) - jump = JUMP(SLJIT_ZERO); - - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0); - partial = CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0); - if (common->mode == PCRE2_JIT_COMPLETE) - add_jump(compiler, backtracks, partial); - - add_jump(compiler, *cc == OP_REF ? &common->casefulcmp : &common->caselesscmp, JUMP(SLJIT_FAST_CALL)); - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, 0)); - - if (common->mode != PCRE2_JIT_COMPLETE) - { - nopartial = JUMP(SLJIT_JUMP); - JUMPHERE(partial); - /* TMP2 -= STR_END - STR_PTR */ - OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, STR_PTR, 0); - OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, STR_END, 0); - partial = CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, 0); - OP1(SLJIT_MOV, STR_PTR, 0, STR_END, 0); - add_jump(compiler, *cc == OP_REF ? 
&common->casefulcmp : &common->caselesscmp, JUMP(SLJIT_FAST_CALL)); - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, 0)); - JUMPHERE(partial); - check_partial(common, FALSE); - add_jump(compiler, backtracks, JUMP(SLJIT_JUMP)); - JUMPHERE(nopartial); - } - } - -if (jump != NULL) - { - if (emptyfail) - add_jump(compiler, backtracks, jump); - else - JUMPHERE(jump); - } -} - -static SLJIT_INLINE PCRE2_SPTR compile_ref_iterator_matchingpath(compiler_common *common, PCRE2_SPTR cc, backtrack_common *parent) -{ -DEFINE_COMPILER; -BOOL ref = (*cc == OP_REF || *cc == OP_REFI); -backtrack_common *backtrack; -PCRE2_UCHAR type; -int offset = 0; -struct sljit_label *label; -struct sljit_jump *zerolength; -struct sljit_jump *jump = NULL; -PCRE2_SPTR ccbegin = cc; -int min = 0, max = 0; -BOOL minimize; - -PUSH_BACKTRACK(sizeof(ref_iterator_backtrack), cc, NULL); - -if (ref) - offset = GET2(cc, 1) << 1; -else - cc += IMM2_SIZE; -type = cc[1 + IMM2_SIZE]; - -SLJIT_COMPILE_ASSERT((OP_CRSTAR & 0x1) == 0, crstar_opcode_must_be_even); -minimize = (type & 0x1) != 0; -switch(type) - { - case OP_CRSTAR: - case OP_CRMINSTAR: - min = 0; - max = 0; - cc += 1 + IMM2_SIZE + 1; - break; - case OP_CRPLUS: - case OP_CRMINPLUS: - min = 1; - max = 0; - cc += 1 + IMM2_SIZE + 1; - break; - case OP_CRQUERY: - case OP_CRMINQUERY: - min = 0; - max = 1; - cc += 1 + IMM2_SIZE + 1; - break; - case OP_CRRANGE: - case OP_CRMINRANGE: - min = GET2(cc, 1 + IMM2_SIZE + 1); - max = GET2(cc, 1 + IMM2_SIZE + 1 + IMM2_SIZE); - cc += 1 + IMM2_SIZE + 1 + 2 * IMM2_SIZE; - break; - default: - SLJIT_UNREACHABLE(); - break; - } - -if (!minimize) - { - if (min == 0) - { - allocate_stack(common, 2); - if (ref) - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset)); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(1), SLJIT_IMM, 0); - /* Temporary release of STR_PTR. 
*/ - OP2(SLJIT_ADD, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, sizeof(sljit_sw)); - /* Handles both invalid and empty cases. Since the minimum repeat, - is zero the invalid case is basically the same as an empty case. */ - if (ref) - zerolength = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset + 1)); - else - { - compile_dnref_search(common, ccbegin, NULL); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP2), 0); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), POSSESSIVE1, TMP2, 0); - zerolength = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_MEM1(TMP2), sizeof(sljit_sw)); - } - /* Restore if not zero length. */ - OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, sizeof(sljit_sw)); - } - else - { - allocate_stack(common, 1); - if (ref) - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset)); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), SLJIT_IMM, 0); - if (ref) - { - add_jump(compiler, &backtrack->topbacktracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(1))); - zerolength = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset + 1)); - } - else - { - compile_dnref_search(common, ccbegin, &backtrack->topbacktracks); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP2), 0); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), POSSESSIVE1, TMP2, 0); - zerolength = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_MEM1(TMP2), sizeof(sljit_sw)); - } - } - - if (min > 1 || max > 1) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), POSSESSIVE0, SLJIT_IMM, 0); - - label = LABEL(); - if (!ref) - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), POSSESSIVE1); - compile_ref_matchingpath(common, ccbegin, &backtrack->topbacktracks, FALSE, FALSE); - - if (min > 1 || max > 1) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), POSSESSIVE0); - OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), POSSESSIVE0, TMP1, 0); - if (min > 1) - CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, min, label); - if (max > 1) - { - jump = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, max); - 
allocate_stack(common, 1); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0); - JUMPTO(SLJIT_JUMP, label); - JUMPHERE(jump); - } - } - - if (max == 0) - { - /* Includes min > 1 case as well. */ - allocate_stack(common, 1); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0); - JUMPTO(SLJIT_JUMP, label); - } - - JUMPHERE(zerolength); - BACKTRACK_AS(ref_iterator_backtrack)->matchingpath = LABEL(); - - count_match(common); - return cc; - } - -allocate_stack(common, ref ? 2 : 3); -if (ref) - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset)); -OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), SLJIT_IMM, 0); -if (type != OP_CRMINSTAR) - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(1), SLJIT_IMM, 0); - -if (min == 0) - { - /* Handles both invalid and empty cases. Since the minimum repeat, - is zero the invalid case is basically the same as an empty case. */ - if (ref) - zerolength = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset + 1)); - else - { - compile_dnref_search(common, ccbegin, NULL); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP2), 0); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(2), TMP2, 0); - zerolength = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_MEM1(TMP2), sizeof(sljit_sw)); - } - /* Length is non-zero, we can match real repeats. 
*/ - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0); - jump = JUMP(SLJIT_JUMP); - } -else - { - if (ref) - { - add_jump(compiler, &backtrack->topbacktracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(1))); - zerolength = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset + 1)); - } - else - { - compile_dnref_search(common, ccbegin, &backtrack->topbacktracks); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP2), 0); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(2), TMP2, 0); - zerolength = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_MEM1(TMP2), sizeof(sljit_sw)); - } - } - -BACKTRACK_AS(ref_iterator_backtrack)->matchingpath = LABEL(); -if (max > 0) - add_jump(compiler, &backtrack->topbacktracks, CMP(SLJIT_GREATER_EQUAL, SLJIT_MEM1(STACK_TOP), STACK(1), SLJIT_IMM, max)); - -if (!ref) - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(STACK_TOP), STACK(2)); -compile_ref_matchingpath(common, ccbegin, &backtrack->topbacktracks, TRUE, TRUE); -OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0); - -if (min > 1) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(1)); - OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(1), TMP1, 0); - CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, min, BACKTRACK_AS(ref_iterator_backtrack)->matchingpath); - } -else if (max > 0) - OP2(SLJIT_ADD, SLJIT_MEM1(STACK_TOP), STACK(1), SLJIT_MEM1(STACK_TOP), STACK(1), SLJIT_IMM, 1); - -if (jump != NULL) - JUMPHERE(jump); -JUMPHERE(zerolength); - -count_match(common); -return cc; -} - -static SLJIT_INLINE PCRE2_SPTR compile_recurse_matchingpath(compiler_common *common, PCRE2_SPTR cc, backtrack_common *parent) -{ -DEFINE_COMPILER; -backtrack_common *backtrack; -recurse_entry *entry = common->entries; -recurse_entry *prev = NULL; -sljit_sw start = GET(cc, 1); -PCRE2_SPTR start_cc; -BOOL needs_control_head; - -PUSH_BACKTRACK(sizeof(recurse_backtrack), cc, NULL); - -/* Inlining simple patterns. 
*/ -if (get_framesize(common, common->start + start, NULL, TRUE, &needs_control_head) == no_stack) - { - start_cc = common->start + start; - compile_matchingpath(common, next_opcode(common, start_cc), bracketend(start_cc) - (1 + LINK_SIZE), backtrack); - BACKTRACK_AS(recurse_backtrack)->inlined_pattern = TRUE; - return cc + 1 + LINK_SIZE; - } - -while (entry != NULL) - { - if (entry->start == start) - break; - prev = entry; - entry = entry->next; - } - -if (entry == NULL) - { - entry = sljit_alloc_memory(compiler, sizeof(recurse_entry)); - if (SLJIT_UNLIKELY(sljit_get_compiler_error(compiler))) - return NULL; - entry->next = NULL; - entry->entry_label = NULL; - entry->backtrack_label = NULL; - entry->entry_calls = NULL; - entry->backtrack_calls = NULL; - entry->start = start; - - if (prev != NULL) - prev->next = entry; - else - common->entries = entry; - } - -BACKTRACK_AS(recurse_backtrack)->entry = entry; - -if (entry->entry_label == NULL) - add_jump(compiler, &entry->entry_calls, JUMP(SLJIT_FAST_CALL)); -else - JUMPTO(SLJIT_FAST_CALL, entry->entry_label); -/* Leave if the match is failed. */ -add_jump(compiler, &backtrack->topbacktracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0)); -BACKTRACK_AS(recurse_backtrack)->matchingpath = LABEL(); -return cc + 1 + LINK_SIZE; -} - -static sljit_s32 SLJIT_FUNC do_callout(struct jit_arguments *arguments, pcre2_callout_block *callout_block, PCRE2_SPTR *jit_ovector) -{ -PCRE2_SPTR begin; -PCRE2_SIZE *ovector; -sljit_u32 oveccount, capture_top; - -if (arguments->callout == NULL) - return 0; - -SLJIT_COMPILE_ASSERT(sizeof (PCRE2_SIZE) <= sizeof (sljit_sw), pcre2_size_must_be_lower_than_sljit_sw_size); - -begin = arguments->begin; -ovector = (PCRE2_SIZE*)(callout_block + 1); -oveccount = callout_block->capture_top; - -SLJIT_ASSERT(oveccount >= 1); - -callout_block->version = 2; -callout_block->callout_flags = 0; - -/* Offsets in subject. 
*/ -callout_block->subject_length = arguments->end - arguments->begin; -callout_block->start_match = jit_ovector[0] - begin; -callout_block->current_position = (PCRE2_SPTR)callout_block->offset_vector - begin; -callout_block->subject = begin; - -/* Convert and copy the JIT offset vector to the ovector array. */ -callout_block->capture_top = 1; -callout_block->offset_vector = ovector; - -ovector[0] = PCRE2_UNSET; -ovector[1] = PCRE2_UNSET; -ovector += 2; -jit_ovector += 2; -capture_top = 1; - -/* Convert pointers to sizes. */ -while (--oveccount != 0) - { - capture_top++; - - ovector[0] = (PCRE2_SIZE)(jit_ovector[0] - begin); - ovector[1] = (PCRE2_SIZE)(jit_ovector[1] - begin); - - if (ovector[0] != PCRE2_UNSET) - callout_block->capture_top = capture_top; - - ovector += 2; - jit_ovector += 2; - } - -return (arguments->callout)(callout_block, arguments->callout_data); -} - -#define CALLOUT_ARG_OFFSET(arg) \ - SLJIT_OFFSETOF(pcre2_callout_block, arg) - -static SLJIT_INLINE PCRE2_SPTR compile_callout_matchingpath(compiler_common *common, PCRE2_SPTR cc, backtrack_common *parent) -{ -DEFINE_COMPILER; -backtrack_common *backtrack; -sljit_s32 mov_opcode; -unsigned int callout_length = (*cc == OP_CALLOUT) - ? PRIV(OP_lengths)[OP_CALLOUT] : GET(cc, 1 + 2 * LINK_SIZE); -sljit_sw value1; -sljit_sw value2; -sljit_sw value3; -sljit_uw callout_arg_size = (common->re->top_bracket + 1) * 2 * sizeof(sljit_sw); - -PUSH_BACKTRACK(sizeof(backtrack_common), cc, NULL); - -callout_arg_size = (sizeof(pcre2_callout_block) + callout_arg_size + sizeof(sljit_sw) - 1) / sizeof(sljit_sw); - -allocate_stack(common, callout_arg_size); - -SLJIT_ASSERT(common->capture_last_ptr != 0); -OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), common->capture_last_ptr); -OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0); -value1 = (*cc == OP_CALLOUT) ? 
cc[1 + 2 * LINK_SIZE] : 0; -OP1(SLJIT_MOV_U32, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(callout_number), SLJIT_IMM, value1); -OP1(SLJIT_MOV_U32, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(capture_last), TMP2, 0); -OP1(SLJIT_MOV_U32, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(capture_top), SLJIT_IMM, common->re->top_bracket + 1); - -/* These pointer sized fields temporarly stores internal variables. */ -OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(offset_vector), STR_PTR, 0); - -if (common->mark_ptr != 0) - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, mark_ptr)); -mov_opcode = (sizeof(PCRE2_SIZE) == 4) ? SLJIT_MOV_U32 : SLJIT_MOV; -OP1(mov_opcode, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(pattern_position), SLJIT_IMM, GET(cc, 1)); -OP1(mov_opcode, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(next_item_length), SLJIT_IMM, GET(cc, 1 + LINK_SIZE)); - -if (*cc == OP_CALLOUT) - { - value1 = 0; - value2 = 0; - value3 = 0; - } -else - { - value1 = (sljit_sw) (cc + (1 + 4*LINK_SIZE) + 1); - value2 = (callout_length - (1 + 4*LINK_SIZE + 2)); - value3 = (sljit_sw) (GET(cc, 1 + 3*LINK_SIZE)); - } - -OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(callout_string), SLJIT_IMM, value1); -OP1(mov_opcode, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(callout_string_length), SLJIT_IMM, value2); -OP1(mov_opcode, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(callout_string_offset), SLJIT_IMM, value3); -OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(mark), (common->mark_ptr != 0) ? TMP2 : SLJIT_IMM, 0); - -SLJIT_ASSERT(TMP1 == SLJIT_R0 && STR_PTR == SLJIT_R1); - -/* Needed to save important temporary registers. 
*/ -OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STR_PTR, 0); -/* SLJIT_R0 = arguments */ -OP1(SLJIT_MOV, SLJIT_R1, 0, STACK_TOP, 0); -GET_LOCAL_BASE(SLJIT_R2, 0, OVECTOR_START); -sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(S32) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW), SLJIT_IMM, SLJIT_FUNC_OFFSET(do_callout)); -OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0); -free_stack(common, callout_arg_size); - -/* Check return value. */ -OP2(SLJIT_SUB32 | SLJIT_SET_Z | SLJIT_SET_SIG_GREATER, SLJIT_UNUSED, 0, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0); -add_jump(compiler, &backtrack->topbacktracks, JUMP(SLJIT_SIG_GREATER32)); -if (common->abort_label == NULL) - add_jump(compiler, &common->abort, JUMP(SLJIT_NOT_EQUAL32) /* SIG_LESS */); -else - JUMPTO(SLJIT_NOT_EQUAL32 /* SIG_LESS */, common->abort_label); -return cc + callout_length; -} - -#undef CALLOUT_ARG_SIZE -#undef CALLOUT_ARG_OFFSET - -static SLJIT_INLINE BOOL assert_needs_str_ptr_saving(PCRE2_SPTR cc) -{ -while (TRUE) - { - switch (*cc) - { - case OP_CALLOUT_STR: - cc += GET(cc, 1 + 2*LINK_SIZE); - break; - - case OP_NOT_WORD_BOUNDARY: - case OP_WORD_BOUNDARY: - case OP_CIRC: - case OP_CIRCM: - case OP_DOLL: - case OP_DOLLM: - case OP_CALLOUT: - case OP_ALT: - cc += PRIV(OP_lengths)[*cc]; - break; - - case OP_KET: - return FALSE; - - default: - return TRUE; - } - } -} - -static PCRE2_SPTR compile_assert_matchingpath(compiler_common *common, PCRE2_SPTR cc, assert_backtrack *backtrack, BOOL conditional) -{ -DEFINE_COMPILER; -int framesize; -int extrasize; -BOOL local_quit_available = FALSE; -BOOL needs_control_head; -int private_data_ptr; -backtrack_common altbacktrack; -PCRE2_SPTR ccbegin; -PCRE2_UCHAR opcode; -PCRE2_UCHAR bra = OP_BRA; -jump_list *tmp = NULL; -jump_list **target = (conditional) ? &backtrack->condfailed : &backtrack->common.topbacktracks; -jump_list **found; -/* Saving previous accept variables. 
*/ -BOOL save_local_quit_available = common->local_quit_available; -BOOL save_in_positive_assertion = common->in_positive_assertion; -then_trap_backtrack *save_then_trap = common->then_trap; -struct sljit_label *save_quit_label = common->quit_label; -struct sljit_label *save_accept_label = common->accept_label; -jump_list *save_quit = common->quit; -jump_list *save_positive_assertion_quit = common->positive_assertion_quit; -jump_list *save_accept = common->accept; -struct sljit_jump *jump; -struct sljit_jump *brajump = NULL; - -/* Assert captures then. */ -common->then_trap = NULL; - -if (*cc == OP_BRAZERO || *cc == OP_BRAMINZERO) - { - SLJIT_ASSERT(!conditional); - bra = *cc; - cc++; - } -private_data_ptr = PRIVATE_DATA(cc); -SLJIT_ASSERT(private_data_ptr != 0); -framesize = get_framesize(common, cc, NULL, FALSE, &needs_control_head); -backtrack->framesize = framesize; -backtrack->private_data_ptr = private_data_ptr; -opcode = *cc; -SLJIT_ASSERT(opcode >= OP_ASSERT && opcode <= OP_ASSERTBACK_NOT); -found = (opcode == OP_ASSERT || opcode == OP_ASSERTBACK) ? &tmp : target; -ccbegin = cc; -cc += GET(cc, 1); - -if (bra == OP_BRAMINZERO) - { - /* This is a braminzero backtrack path. 
*/ - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - free_stack(common, 1); - brajump = CMP(SLJIT_EQUAL, STR_PTR, 0, SLJIT_IMM, 0); - } - -if (framesize < 0) - { - extrasize = 1; - if (bra == OP_BRA && !assert_needs_str_ptr_saving(ccbegin + 1 + LINK_SIZE)) - extrasize = 0; - - if (needs_control_head) - extrasize++; - - if (framesize == no_frame) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), private_data_ptr, STACK_TOP, 0); - - if (extrasize > 0) - allocate_stack(common, extrasize); - - if (needs_control_head) - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr); - - if (extrasize > 0) - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0); - - if (needs_control_head) - { - SLJIT_ASSERT(extrasize == 2); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, SLJIT_IMM, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(1), TMP1, 0); - } - } -else - { - extrasize = needs_control_head ? 3 : 2; - allocate_stack(common, framesize + extrasize); - - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - OP2(SLJIT_ADD, TMP2, 0, STACK_TOP, 0, SLJIT_IMM, (framesize + extrasize) * sizeof(sljit_sw)); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), private_data_ptr, TMP2, 0); - if (needs_control_head) - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0); - - if (needs_control_head) - { - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(2), TMP1, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(1), TMP2, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, SLJIT_IMM, 0); - } - else - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(1), TMP1, 0); - - init_frame(common, ccbegin, NULL, framesize + extrasize - 1, extrasize); - } - -memset(&altbacktrack, 0, sizeof(backtrack_common)); -if (conditional || (opcode == OP_ASSERT_NOT || opcode == OP_ASSERTBACK_NOT)) - { - /* Control verbs cannot escape from these asserts. 
*/ - local_quit_available = TRUE; - common->local_quit_available = TRUE; - common->quit_label = NULL; - common->quit = NULL; - } - -common->in_positive_assertion = (opcode == OP_ASSERT || opcode == OP_ASSERTBACK); -common->positive_assertion_quit = NULL; - -while (1) - { - common->accept_label = NULL; - common->accept = NULL; - altbacktrack.top = NULL; - altbacktrack.topbacktracks = NULL; - - if (*ccbegin == OP_ALT && extrasize > 0) - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - - altbacktrack.cc = ccbegin; - compile_matchingpath(common, ccbegin + 1 + LINK_SIZE, cc, &altbacktrack); - if (SLJIT_UNLIKELY(sljit_get_compiler_error(compiler))) - { - if (local_quit_available) - { - common->local_quit_available = save_local_quit_available; - common->quit_label = save_quit_label; - common->quit = save_quit; - } - common->in_positive_assertion = save_in_positive_assertion; - common->then_trap = save_then_trap; - common->accept_label = save_accept_label; - common->positive_assertion_quit = save_positive_assertion_quit; - common->accept = save_accept; - return NULL; - } - common->accept_label = LABEL(); - if (common->accept != NULL) - set_jumps(common->accept, common->accept_label); - - /* Reset stack. */ - if (framesize < 0) - { - if (framesize == no_frame) - OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - else if (extrasize > 0) - free_stack(common, extrasize); - - if (needs_control_head) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, SLJIT_MEM1(STACK_TOP), STACK(-1)); - } - else - { - if ((opcode != OP_ASSERT_NOT && opcode != OP_ASSERTBACK_NOT) || conditional) - { - /* We don't need to keep the STR_PTR, only the previous private_data_ptr. 
*/ - OP2(SLJIT_SUB, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr, SLJIT_IMM, (framesize + 1) * sizeof(sljit_sw)); - if (needs_control_head) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, SLJIT_MEM1(STACK_TOP), STACK(-1)); - } - else - { - OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - if (needs_control_head) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, SLJIT_MEM1(STACK_TOP), STACK(-framesize - 2)); - add_jump(compiler, &common->revertframes, JUMP(SLJIT_FAST_CALL)); - OP2(SLJIT_ADD, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, (framesize - 1) * sizeof(sljit_sw)); - } - } - - if (opcode == OP_ASSERT_NOT || opcode == OP_ASSERTBACK_NOT) - { - /* We know that STR_PTR was stored on the top of the stack. */ - if (conditional) - { - if (extrasize > 0) - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), needs_control_head ? STACK(-2) : STACK(-1)); - } - else if (bra == OP_BRAZERO) - { - if (framesize < 0) - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(-extrasize)); - else - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(-framesize - 1)); - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(-framesize - extrasize)); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), private_data_ptr, TMP1, 0); - } - OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, sizeof(sljit_sw)); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), SLJIT_IMM, 0); - } - else if (framesize >= 0) - { - /* For OP_BRA and OP_BRAMINZERO. 
*/ - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), private_data_ptr, SLJIT_MEM1(STACK_TOP), STACK(-framesize - 1)); - } - } - add_jump(compiler, found, JUMP(SLJIT_JUMP)); - - compile_backtrackingpath(common, altbacktrack.top); - if (SLJIT_UNLIKELY(sljit_get_compiler_error(compiler))) - { - if (local_quit_available) - { - common->local_quit_available = save_local_quit_available; - common->quit_label = save_quit_label; - common->quit = save_quit; - } - common->in_positive_assertion = save_in_positive_assertion; - common->then_trap = save_then_trap; - common->accept_label = save_accept_label; - common->positive_assertion_quit = save_positive_assertion_quit; - common->accept = save_accept; - return NULL; - } - set_jumps(altbacktrack.topbacktracks, LABEL()); - - if (*cc != OP_ALT) - break; - - ccbegin = cc; - cc += GET(cc, 1); - } - -if (local_quit_available) - { - SLJIT_ASSERT(common->positive_assertion_quit == NULL); - /* Makes the check less complicated below. */ - common->positive_assertion_quit = common->quit; - } - -/* None of them matched. */ -if (common->positive_assertion_quit != NULL) - { - jump = JUMP(SLJIT_JUMP); - set_jumps(common->positive_assertion_quit, LABEL()); - SLJIT_ASSERT(framesize != no_stack); - if (framesize < 0) - OP2(SLJIT_SUB, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr, SLJIT_IMM, extrasize * sizeof(sljit_sw)); - else - { - OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - add_jump(compiler, &common->revertframes, JUMP(SLJIT_FAST_CALL)); - OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, (extrasize + 1) * sizeof(sljit_sw)); - } - JUMPHERE(jump); - } - -if (needs_control_head) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, SLJIT_MEM1(STACK_TOP), STACK(1)); - -if (opcode == OP_ASSERT || opcode == OP_ASSERTBACK) - { - /* Assert is failed. 
*/ - if ((conditional && extrasize > 0) || bra == OP_BRAZERO) - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - - if (framesize < 0) - { - /* The topmost item should be 0. */ - if (bra == OP_BRAZERO) - { - if (extrasize == 2) - free_stack(common, 1); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), SLJIT_IMM, 0); - } - else if (extrasize > 0) - free_stack(common, extrasize); - } - else - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(extrasize - 1)); - /* The topmost item should be 0. */ - if (bra == OP_BRAZERO) - { - free_stack(common, framesize + extrasize - 1); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), SLJIT_IMM, 0); - } - else - free_stack(common, framesize + extrasize); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), private_data_ptr, TMP1, 0); - } - jump = JUMP(SLJIT_JUMP); - if (bra != OP_BRAZERO) - add_jump(compiler, target, jump); - - /* Assert is successful. */ - set_jumps(tmp, LABEL()); - if (framesize < 0) - { - /* We know that STR_PTR was stored on the top of the stack. */ - if (extrasize > 0) - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(-extrasize)); - - /* Keep the STR_PTR on the top of the stack. */ - if (bra == OP_BRAZERO) - { - OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, sizeof(sljit_sw)); - if (extrasize == 2) - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0); - } - else if (bra == OP_BRAMINZERO) - { - OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, sizeof(sljit_sw)); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), SLJIT_IMM, 0); - } - } - else - { - if (bra == OP_BRA) - { - /* We don't need to keep the STR_PTR, only the previous private_data_ptr. */ - OP2(SLJIT_SUB, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr, SLJIT_IMM, (framesize + 1) * sizeof(sljit_sw)); - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(-extrasize + 1)); - } - else - { - /* We don't need to keep the STR_PTR, only the previous private_data_ptr. 
*/ - OP2(SLJIT_SUB, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr, SLJIT_IMM, (framesize + 2) * sizeof(sljit_sw)); - if (extrasize == 2) - { - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - if (bra == OP_BRAMINZERO) - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), SLJIT_IMM, 0); - } - else - { - SLJIT_ASSERT(extrasize == 3); - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(-1)); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), bra == OP_BRAZERO ? STR_PTR : SLJIT_IMM, 0); - } - } - } - - if (bra == OP_BRAZERO) - { - backtrack->matchingpath = LABEL(); - SET_LABEL(jump, backtrack->matchingpath); - } - else if (bra == OP_BRAMINZERO) - { - JUMPTO(SLJIT_JUMP, backtrack->matchingpath); - JUMPHERE(brajump); - if (framesize >= 0) - { - OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - add_jump(compiler, &common->revertframes, JUMP(SLJIT_FAST_CALL)); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(-2)); - OP2(SLJIT_ADD, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, (framesize - 1) * sizeof(sljit_sw)); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), private_data_ptr, TMP1, 0); - } - set_jumps(backtrack->common.topbacktracks, LABEL()); - } - } -else - { - /* AssertNot is successful. */ - if (framesize < 0) - { - if (extrasize > 0) - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - - if (bra != OP_BRA) - { - if (extrasize == 2) - free_stack(common, 1); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), SLJIT_IMM, 0); - } - else if (extrasize > 0) - free_stack(common, extrasize); - } - else - { - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(extrasize - 1)); - /* The topmost item should be 0. 
*/ - if (bra != OP_BRA) - { - free_stack(common, framesize + extrasize - 1); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), SLJIT_IMM, 0); - } - else - free_stack(common, framesize + extrasize); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), private_data_ptr, TMP1, 0); - } - - if (bra == OP_BRAZERO) - backtrack->matchingpath = LABEL(); - else if (bra == OP_BRAMINZERO) - { - JUMPTO(SLJIT_JUMP, backtrack->matchingpath); - JUMPHERE(brajump); - } - - if (bra != OP_BRA) - { - SLJIT_ASSERT(found == &backtrack->common.topbacktracks); - set_jumps(backtrack->common.topbacktracks, LABEL()); - backtrack->common.topbacktracks = NULL; - } - } - -if (local_quit_available) - { - common->local_quit_available = save_local_quit_available; - common->quit_label = save_quit_label; - common->quit = save_quit; - } -common->in_positive_assertion = save_in_positive_assertion; -common->then_trap = save_then_trap; -common->accept_label = save_accept_label; -common->positive_assertion_quit = save_positive_assertion_quit; -common->accept = save_accept; -return cc + 1 + LINK_SIZE; -} - -static SLJIT_INLINE void match_once_common(compiler_common *common, PCRE2_UCHAR ket, int framesize, int private_data_ptr, BOOL has_alternatives, BOOL needs_control_head) -{ -DEFINE_COMPILER; -int stacksize; - -if (framesize < 0) - { - if (framesize == no_frame) - OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - else - { - stacksize = needs_control_head ? 1 : 0; - if (ket != OP_KET || has_alternatives) - stacksize++; - - if (stacksize > 0) - free_stack(common, stacksize); - } - - if (needs_control_head) - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), (ket != OP_KET || has_alternatives) ? STACK(-2) : STACK(-1)); - - /* TMP2 which is set here used by OP_KETRMAX below. */ - if (ket == OP_KETRMAX) - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(STACK_TOP), STACK(-1)); - else if (ket == OP_KETRMIN) - { - /* Move the STR_PTR to the private_data_ptr. 
*/ - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), private_data_ptr, SLJIT_MEM1(STACK_TOP), STACK(-1)); - } - } -else - { - stacksize = (ket != OP_KET || has_alternatives) ? 2 : 1; - OP2(SLJIT_SUB, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr, SLJIT_IMM, (framesize + stacksize) * sizeof(sljit_sw)); - if (needs_control_head) - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(-1)); - - if (ket == OP_KETRMAX) - { - /* TMP2 which is set here used by OP_KETRMAX below. */ - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - } - } -if (needs_control_head) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, TMP1, 0); -} - -static SLJIT_INLINE int match_capture_common(compiler_common *common, int stacksize, int offset, int private_data_ptr) -{ -DEFINE_COMPILER; - -if (common->capture_last_ptr != 0) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->capture_last_ptr); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->capture_last_ptr, SLJIT_IMM, offset >> 1); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(stacksize), TMP1, 0); - stacksize++; - } -if (common->optimized_cbracket[offset >> 1] == 0) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset)); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset + 1)); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(stacksize), TMP1, 0); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(stacksize + 1), TMP2, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset + 1), STR_PTR, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset), TMP1, 0); - stacksize += 2; - } -return stacksize; -} - -static PCRE2_SPTR SLJIT_FUNC do_script_run(PCRE2_SPTR ptr, PCRE2_SPTR endptr) -{ - if (PRIV(script_run)(ptr, endptr, FALSE)) - return endptr; - return NULL; -} - -#ifdef SUPPORT_UNICODE - -static PCRE2_SPTR SLJIT_FUNC do_script_run_utf(PCRE2_SPTR ptr, PCRE2_SPTR endptr) -{ - if (PRIV(script_run)(ptr, endptr, TRUE)) - return endptr; 
- return NULL; -} - -#endif /* SUPPORT_UNICODE */ - -static SLJIT_INLINE void match_script_run_common(compiler_common *common, int private_data_ptr, backtrack_common *parent) -{ -DEFINE_COMPILER; - -SLJIT_ASSERT(TMP1 == SLJIT_R0 && STR_PTR == SLJIT_R1); - -OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); -#ifdef SUPPORT_UNICODE -sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW), SLJIT_IMM, - common->utf ? SLJIT_FUNC_OFFSET(do_script_run_utf) : SLJIT_FUNC_OFFSET(do_script_run)); -#else -sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW), SLJIT_IMM, SLJIT_FUNC_OFFSET(do_script_run)); -#endif - -OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0); -add_jump(compiler, parent->top != NULL ? &parent->top->nextbacktracks : &parent->topbacktracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0)); -} - -/* - Handling bracketed expressions is probably the most complex part. - - Stack layout naming characters: - S - Push the current STR_PTR - 0 - Push a 0 (NULL) - A - Push the current STR_PTR. Needed for restoring the STR_PTR - before the next alternative. Not pushed if there are no alternatives. - M - Any values pushed by the current alternative. Can be empty, or anything. - C - Push the previous OVECTOR(i), OVECTOR(i+1) and OVECTOR_PRIV(i) to the stack. - L - Push the previous local (pointed by localptr) to the stack - () - opional values stored on the stack - ()* - optonal, can be stored multiple times - - The following list shows the regular expression templates, their PCRE byte codes - and stack layout supported by pcre-sljit. - - (?:) OP_BRA | OP_KET A M - () OP_CBRA | OP_KET C M - (?:)+ OP_BRA | OP_KETRMAX 0 A M S ( A M S )* - OP_SBRA | OP_KETRMAX 0 L M S ( L M S )* - (?:)+? OP_BRA | OP_KETRMIN 0 A M S ( A M S )* - OP_SBRA | OP_KETRMIN 0 L M S ( L M S )* - ()+ OP_CBRA | OP_KETRMAX 0 C M S ( C M S )* - OP_SCBRA | OP_KETRMAX 0 C M S ( C M S )* - ()+? 
OP_CBRA | OP_KETRMIN 0 C M S ( C M S )* - OP_SCBRA | OP_KETRMIN 0 C M S ( C M S )* - (?:)? OP_BRAZERO | OP_BRA | OP_KET S ( A M 0 ) - (?:)?? OP_BRAMINZERO | OP_BRA | OP_KET S ( A M 0 ) - ()? OP_BRAZERO | OP_CBRA | OP_KET S ( C M 0 ) - ()?? OP_BRAMINZERO | OP_CBRA | OP_KET S ( C M 0 ) - (?:)* OP_BRAZERO | OP_BRA | OP_KETRMAX S 0 ( A M S )* - OP_BRAZERO | OP_SBRA | OP_KETRMAX S 0 ( L M S )* - (?:)*? OP_BRAMINZERO | OP_BRA | OP_KETRMIN S 0 ( A M S )* - OP_BRAMINZERO | OP_SBRA | OP_KETRMIN S 0 ( L M S )* - ()* OP_BRAZERO | OP_CBRA | OP_KETRMAX S 0 ( C M S )* - OP_BRAZERO | OP_SCBRA | OP_KETRMAX S 0 ( C M S )* - ()*? OP_BRAMINZERO | OP_CBRA | OP_KETRMIN S 0 ( C M S )* - OP_BRAMINZERO | OP_SCBRA | OP_KETRMIN S 0 ( C M S )* - - - Stack layout naming characters: - A - Push the alternative index (starting from 0) on the stack. - Not pushed if there is no alternatives. - M - Any values pushed by the current alternative. Can be empty, or anything. - - The next list shows the possible content of a bracket: - (|) OP_*BRA | OP_ALT ... M A - (?()|) OP_*COND | OP_ALT M A - (?>|) OP_ONCE | OP_ALT ... 
[stack trace] M A - Or nothing, if trace is unnecessary -*/ - -static PCRE2_SPTR compile_bracket_matchingpath(compiler_common *common, PCRE2_SPTR cc, backtrack_common *parent) -{ -DEFINE_COMPILER; -backtrack_common *backtrack; -PCRE2_UCHAR opcode; -int private_data_ptr = 0; -int offset = 0; -int i, stacksize; -int repeat_ptr = 0, repeat_length = 0; -int repeat_type = 0, repeat_count = 0; -PCRE2_SPTR ccbegin; -PCRE2_SPTR matchingpath; -PCRE2_SPTR slot; -PCRE2_UCHAR bra = OP_BRA; -PCRE2_UCHAR ket; -assert_backtrack *assert; -BOOL has_alternatives; -BOOL needs_control_head = FALSE; -struct sljit_jump *jump; -struct sljit_jump *skip; -struct sljit_label *rmax_label = NULL; -struct sljit_jump *braminzero = NULL; - -PUSH_BACKTRACK(sizeof(bracket_backtrack), cc, NULL); - -if (*cc == OP_BRAZERO || *cc == OP_BRAMINZERO) - { - bra = *cc; - cc++; - opcode = *cc; - } - -opcode = *cc; -ccbegin = cc; -matchingpath = bracketend(cc) - 1 - LINK_SIZE; -ket = *matchingpath; -if (ket == OP_KET && PRIVATE_DATA(matchingpath) != 0) - { - repeat_ptr = PRIVATE_DATA(matchingpath); - repeat_length = PRIVATE_DATA(matchingpath + 1); - repeat_type = PRIVATE_DATA(matchingpath + 2); - repeat_count = PRIVATE_DATA(matchingpath + 3); - SLJIT_ASSERT(repeat_length != 0 && repeat_type != 0 && repeat_count != 0); - if (repeat_type == OP_UPTO) - ket = OP_KETRMAX; - if (repeat_type == OP_MINUPTO) - ket = OP_KETRMIN; - } - -matchingpath = ccbegin + 1 + LINK_SIZE; -SLJIT_ASSERT(ket == OP_KET || ket == OP_KETRMAX || ket == OP_KETRMIN); -SLJIT_ASSERT(!((bra == OP_BRAZERO && ket == OP_KETRMIN) || (bra == OP_BRAMINZERO && ket == OP_KETRMAX))); -cc += GET(cc, 1); - -has_alternatives = *cc == OP_ALT; -if (SLJIT_UNLIKELY(opcode == OP_COND || opcode == OP_SCOND)) - { - SLJIT_COMPILE_ASSERT(OP_DNRREF == OP_RREF + 1 && OP_FALSE == OP_RREF + 2 && OP_TRUE == OP_RREF + 3, - compile_time_checks_must_be_grouped_together); - has_alternatives = ((*matchingpath >= OP_RREF && *matchingpath <= OP_TRUE) || *matchingpath == 
OP_FAIL) ? FALSE : TRUE; - } - -if (SLJIT_UNLIKELY(opcode == OP_COND) && (*cc == OP_KETRMAX || *cc == OP_KETRMIN)) - opcode = OP_SCOND; - -if (opcode == OP_CBRA || opcode == OP_SCBRA) - { - /* Capturing brackets has a pre-allocated space. */ - offset = GET2(ccbegin, 1 + LINK_SIZE); - if (common->optimized_cbracket[offset] == 0) - { - private_data_ptr = OVECTOR_PRIV(offset); - offset <<= 1; - } - else - { - offset <<= 1; - private_data_ptr = OVECTOR(offset); - } - BACKTRACK_AS(bracket_backtrack)->private_data_ptr = private_data_ptr; - matchingpath += IMM2_SIZE; - } -else if (opcode == OP_ASSERT_NA || opcode == OP_ASSERTBACK_NA || opcode == OP_ONCE || opcode == OP_SCRIPT_RUN || opcode == OP_SBRA || opcode == OP_SCOND) - { - /* Other brackets simply allocate the next entry. */ - private_data_ptr = PRIVATE_DATA(ccbegin); - SLJIT_ASSERT(private_data_ptr != 0); - BACKTRACK_AS(bracket_backtrack)->private_data_ptr = private_data_ptr; - if (opcode == OP_ONCE) - BACKTRACK_AS(bracket_backtrack)->u.framesize = get_framesize(common, ccbegin, NULL, FALSE, &needs_control_head); - } - -/* Instructions before the first alternative. */ -stacksize = 0; -if (ket == OP_KETRMAX || (ket == OP_KETRMIN && bra != OP_BRAMINZERO)) - stacksize++; -if (bra == OP_BRAZERO) - stacksize++; - -if (stacksize > 0) - allocate_stack(common, stacksize); - -stacksize = 0; -if (ket == OP_KETRMAX || (ket == OP_KETRMIN && bra != OP_BRAMINZERO)) - { - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(stacksize), SLJIT_IMM, 0); - stacksize++; - } - -if (bra == OP_BRAZERO) - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(stacksize), STR_PTR, 0); - -if (bra == OP_BRAMINZERO) - { - /* This is a backtrack path! 
(Since the try-path of OP_BRAMINZERO matches to the empty string) */ - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - if (ket != OP_KETRMIN) - { - free_stack(common, 1); - braminzero = CMP(SLJIT_EQUAL, STR_PTR, 0, SLJIT_IMM, 0); - } - else if (opcode == OP_ONCE || opcode >= OP_SBRA) - { - jump = CMP(SLJIT_NOT_EQUAL, STR_PTR, 0, SLJIT_IMM, 0); - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(1)); - /* Nothing stored during the first run. */ - skip = JUMP(SLJIT_JUMP); - JUMPHERE(jump); - /* Checking zero-length iteration. */ - if (opcode != OP_ONCE || BACKTRACK_AS(bracket_backtrack)->u.framesize < 0) - { - /* When we come from outside, private_data_ptr contains the previous STR_PTR. */ - braminzero = CMP(SLJIT_EQUAL, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - } - else - { - /* Except when the whole stack frame must be saved. */ - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - braminzero = CMP(SLJIT_EQUAL, STR_PTR, 0, SLJIT_MEM1(TMP1), STACK(-BACKTRACK_AS(bracket_backtrack)->u.framesize - 2)); - } - JUMPHERE(skip); - } - else - { - jump = CMP(SLJIT_NOT_EQUAL, STR_PTR, 0, SLJIT_IMM, 0); - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(1)); - JUMPHERE(jump); - } - } - -if (repeat_type != 0) - { - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), repeat_ptr, SLJIT_IMM, repeat_count); - if (repeat_type == OP_EXACT) - rmax_label = LABEL(); - } - -if (ket == OP_KETRMIN) - BACKTRACK_AS(bracket_backtrack)->recursive_matchingpath = LABEL(); - -if (ket == OP_KETRMAX) - { - rmax_label = LABEL(); - if (has_alternatives && opcode >= OP_BRA && opcode < OP_SBRA && repeat_type == 0) - BACKTRACK_AS(bracket_backtrack)->alternative_matchingpath = rmax_label; - } - -/* Handling capturing brackets and alternatives. 
*/ -if (opcode == OP_ONCE) - { - stacksize = 0; - if (needs_control_head) - { - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr); - stacksize++; - } - - if (BACKTRACK_AS(bracket_backtrack)->u.framesize < 0) - { - /* Neither capturing brackets nor recursions are found in the block. */ - if (ket == OP_KETRMIN) - { - stacksize += 2; - if (!needs_control_head) - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - } - else - { - if (BACKTRACK_AS(bracket_backtrack)->u.framesize == no_frame) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), private_data_ptr, STACK_TOP, 0); - if (ket == OP_KETRMAX || has_alternatives) - stacksize++; - } - - if (stacksize > 0) - allocate_stack(common, stacksize); - - stacksize = 0; - if (needs_control_head) - { - stacksize++; - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), TMP2, 0); - } - - if (ket == OP_KETRMIN) - { - if (needs_control_head) - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(stacksize), STR_PTR, 0); - if (BACKTRACK_AS(bracket_backtrack)->u.framesize == no_frame) - OP2(SLJIT_ADD, SLJIT_MEM1(SLJIT_SP), private_data_ptr, STACK_TOP, 0, SLJIT_IMM, needs_control_head ? (2 * sizeof(sljit_sw)) : sizeof(sljit_sw)); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(stacksize + 1), TMP2, 0); - } - else if (ket == OP_KETRMAX || has_alternatives) - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(stacksize), STR_PTR, 0); - } - else - { - if (ket != OP_KET || has_alternatives) - stacksize++; - - stacksize += BACKTRACK_AS(bracket_backtrack)->u.framesize + 1; - allocate_stack(common, stacksize); - - if (needs_control_head) - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), TMP2, 0); - - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - OP2(SLJIT_ADD, TMP2, 0, STACK_TOP, 0, SLJIT_IMM, stacksize * sizeof(sljit_sw)); - - stacksize = needs_control_head ? 
1 : 0; - if (ket != OP_KET || has_alternatives) - { - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(stacksize), STR_PTR, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), private_data_ptr, TMP2, 0); - stacksize++; - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(stacksize), TMP1, 0); - } - else - { - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), private_data_ptr, TMP2, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(stacksize), TMP1, 0); - } - init_frame(common, ccbegin, NULL, BACKTRACK_AS(bracket_backtrack)->u.framesize + stacksize, stacksize + 1); - } - } -else if (opcode == OP_CBRA || opcode == OP_SCBRA) - { - /* Saving the previous values. */ - if (common->optimized_cbracket[offset >> 1] != 0) - { - SLJIT_ASSERT(private_data_ptr == OVECTOR(offset)); - allocate_stack(common, 2); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr + sizeof(sljit_sw)); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), private_data_ptr, STR_PTR, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), TMP1, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(1), TMP2, 0); - } - else - { - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - allocate_stack(common, 1); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), private_data_ptr, STR_PTR, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), TMP2, 0); - } - } -else if (opcode == OP_ASSERT_NA || opcode == OP_ASSERTBACK_NA || opcode == OP_SCRIPT_RUN || opcode == OP_SBRA || opcode == OP_SCOND) - { - /* Saving the previous value. */ - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - allocate_stack(common, 1); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), private_data_ptr, STR_PTR, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), TMP2, 0); - } -else if (has_alternatives) - { - /* Pushing the starting string pointer. */ - allocate_stack(common, 1); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0); - } - -/* Generating code for the first alternative. 
*/ -if (opcode == OP_COND || opcode == OP_SCOND) - { - if (*matchingpath == OP_CREF) - { - SLJIT_ASSERT(has_alternatives); - add_jump(compiler, &(BACKTRACK_AS(bracket_backtrack)->u.condfailed), - CMP(SLJIT_EQUAL, SLJIT_MEM1(SLJIT_SP), OVECTOR(GET2(matchingpath, 1) << 1), SLJIT_MEM1(SLJIT_SP), OVECTOR(1))); - matchingpath += 1 + IMM2_SIZE; - } - else if (*matchingpath == OP_DNCREF) - { - SLJIT_ASSERT(has_alternatives); - - i = GET2(matchingpath, 1 + IMM2_SIZE); - slot = common->name_table + GET2(matchingpath, 1) * common->name_entry_size; - OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(1)); - OP2(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(GET2(slot, 0) << 1), TMP1, 0); - slot += common->name_entry_size; - i--; - while (i-- > 0) - { - OP2(SLJIT_SUB, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(GET2(slot, 0) << 1), TMP1, 0); - OP2(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, TMP2, 0, STR_PTR, 0); - slot += common->name_entry_size; - } - OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0); - add_jump(compiler, &(BACKTRACK_AS(bracket_backtrack)->u.condfailed), JUMP(SLJIT_ZERO)); - matchingpath += 1 + 2 * IMM2_SIZE; - } - else if ((*matchingpath >= OP_RREF && *matchingpath <= OP_TRUE) || *matchingpath == OP_FAIL) - { - /* Never has other case. 
*/ - BACKTRACK_AS(bracket_backtrack)->u.condfailed = NULL; - SLJIT_ASSERT(!has_alternatives); - - if (*matchingpath == OP_TRUE) - { - stacksize = 1; - matchingpath++; - } - else if (*matchingpath == OP_FALSE || *matchingpath == OP_FAIL) - stacksize = 0; - else if (*matchingpath == OP_RREF) - { - stacksize = GET2(matchingpath, 1); - if (common->currententry == NULL) - stacksize = 0; - else if (stacksize == RREF_ANY) - stacksize = 1; - else if (common->currententry->start == 0) - stacksize = stacksize == 0; - else - stacksize = stacksize == (int)GET2(common->start, common->currententry->start + 1 + LINK_SIZE); - - if (stacksize != 0) - matchingpath += 1 + IMM2_SIZE; - } - else - { - if (common->currententry == NULL || common->currententry->start == 0) - stacksize = 0; - else - { - stacksize = GET2(matchingpath, 1 + IMM2_SIZE); - slot = common->name_table + GET2(matchingpath, 1) * common->name_entry_size; - i = (int)GET2(common->start, common->currententry->start + 1 + LINK_SIZE); - while (stacksize > 0) - { - if ((int)GET2(slot, 0) == i) - break; - slot += common->name_entry_size; - stacksize--; - } - } - - if (stacksize != 0) - matchingpath += 1 + 2 * IMM2_SIZE; - } - - /* The stacksize == 0 is a common "else" case. */ - if (stacksize == 0) - { - if (*cc == OP_ALT) - { - matchingpath = cc + 1 + LINK_SIZE; - cc += GET(cc, 1); - } - else - matchingpath = cc; - } - } - else - { - SLJIT_ASSERT(has_alternatives && *matchingpath >= OP_ASSERT && *matchingpath <= OP_ASSERTBACK_NOT); - /* Similar code as PUSH_BACKTRACK macro. 
*/ - assert = sljit_alloc_memory(compiler, sizeof(assert_backtrack)); - if (SLJIT_UNLIKELY(sljit_get_compiler_error(compiler))) - return NULL; - memset(assert, 0, sizeof(assert_backtrack)); - assert->common.cc = matchingpath; - BACKTRACK_AS(bracket_backtrack)->u.assert = assert; - matchingpath = compile_assert_matchingpath(common, matchingpath, assert, TRUE); - } - } - -compile_matchingpath(common, matchingpath, cc, backtrack); -if (SLJIT_UNLIKELY(sljit_get_compiler_error(compiler))) - return NULL; - -if (opcode == OP_ASSERT_NA || opcode == OP_ASSERTBACK_NA) - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - -if (opcode == OP_ONCE) - match_once_common(common, ket, BACKTRACK_AS(bracket_backtrack)->u.framesize, private_data_ptr, has_alternatives, needs_control_head); - -if (opcode == OP_SCRIPT_RUN) - match_script_run_common(common, private_data_ptr, backtrack); - -stacksize = 0; -if (repeat_type == OP_MINUPTO) - { - /* We need to preserve the counter. TMP2 will be used below. */ - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), repeat_ptr); - stacksize++; - } -if (ket != OP_KET || bra != OP_BRA) - stacksize++; -if (offset != 0) - { - if (common->capture_last_ptr != 0) - stacksize++; - if (common->optimized_cbracket[offset >> 1] == 0) - stacksize += 2; - } -if (has_alternatives && opcode != OP_ONCE) - stacksize++; - -if (stacksize > 0) - allocate_stack(common, stacksize); - -stacksize = 0; -if (repeat_type == OP_MINUPTO) - { - /* TMP2 was set above. */ - OP2(SLJIT_SUB, SLJIT_MEM1(STACK_TOP), STACK(stacksize), TMP2, 0, SLJIT_IMM, 1); - stacksize++; - } - -if (ket != OP_KET || bra != OP_BRA) - { - if (ket != OP_KET) - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(stacksize), STR_PTR, 0); - else - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(stacksize), SLJIT_IMM, 0); - stacksize++; - } - -if (offset != 0) - stacksize = match_capture_common(common, stacksize, offset, private_data_ptr); - -/* Skip and count the other alternatives. 
*/ -i = 1; -while (*cc == OP_ALT) - { - cc += GET(cc, 1); - i++; - } - -if (has_alternatives) - { - if (opcode != OP_ONCE) - { - if (i <= 3) - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(stacksize), SLJIT_IMM, 0); - else - BACKTRACK_AS(bracket_backtrack)->u.matching_put_label = sljit_emit_put_label(compiler, SLJIT_MEM1(STACK_TOP), STACK(stacksize)); - } - if (ket != OP_KETRMAX) - BACKTRACK_AS(bracket_backtrack)->alternative_matchingpath = LABEL(); - } - -/* Must be after the matchingpath label. */ -if (offset != 0 && common->optimized_cbracket[offset >> 1] != 0) - { - SLJIT_ASSERT(private_data_ptr == OVECTOR(offset + 0)); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset + 1), STR_PTR, 0); - } - -if (ket == OP_KETRMAX) - { - if (repeat_type != 0) - { - if (has_alternatives) - BACKTRACK_AS(bracket_backtrack)->alternative_matchingpath = LABEL(); - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_MEM1(SLJIT_SP), repeat_ptr, SLJIT_MEM1(SLJIT_SP), repeat_ptr, SLJIT_IMM, 1); - JUMPTO(SLJIT_NOT_ZERO, rmax_label); - /* Drop STR_PTR for greedy plus quantifier. */ - if (opcode != OP_ONCE) - free_stack(common, 1); - } - else if (opcode < OP_BRA || opcode >= OP_SBRA) - { - if (has_alternatives) - BACKTRACK_AS(bracket_backtrack)->alternative_matchingpath = LABEL(); - - /* Checking zero-length iteration. */ - if (opcode != OP_ONCE) - { - /* This case includes opcodes such as OP_SCRIPT_RUN. */ - CMPTO(SLJIT_NOT_EQUAL, SLJIT_MEM1(SLJIT_SP), private_data_ptr, STR_PTR, 0, rmax_label); - /* Drop STR_PTR for greedy plus quantifier. */ - if (bra != OP_BRAZERO) - free_stack(common, 1); - } - else - /* TMP2 must contain the starting STR_PTR. 
*/ - CMPTO(SLJIT_NOT_EQUAL, TMP2, 0, STR_PTR, 0, rmax_label); - } - else - JUMPTO(SLJIT_JUMP, rmax_label); - BACKTRACK_AS(bracket_backtrack)->recursive_matchingpath = LABEL(); - } - -if (repeat_type == OP_EXACT) - { - count_match(common); - OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_MEM1(SLJIT_SP), repeat_ptr, SLJIT_MEM1(SLJIT_SP), repeat_ptr, SLJIT_IMM, 1); - JUMPTO(SLJIT_NOT_ZERO, rmax_label); - } -else if (repeat_type == OP_UPTO) - { - /* We need to preserve the counter. */ - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), repeat_ptr); - allocate_stack(common, 1); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), TMP2, 0); - } - -if (bra == OP_BRAZERO) - BACKTRACK_AS(bracket_backtrack)->zero_matchingpath = LABEL(); - -if (bra == OP_BRAMINZERO) - { - /* This is a backtrack path! (From the viewpoint of OP_BRAMINZERO) */ - JUMPTO(SLJIT_JUMP, ((braminzero_backtrack *)parent)->matchingpath); - if (braminzero != NULL) - { - JUMPHERE(braminzero); - /* We need to release the end pointer to perform the - backtrack for the zero-length iteration. When - framesize is < 0, OP_ONCE will do the release itself. */ - if (opcode == OP_ONCE && BACKTRACK_AS(bracket_backtrack)->u.framesize >= 0) - { - OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - add_jump(compiler, &common->revertframes, JUMP(SLJIT_FAST_CALL)); - OP2(SLJIT_ADD, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, (BACKTRACK_AS(bracket_backtrack)->u.framesize - 1) * sizeof(sljit_sw)); - } - else if (ket == OP_KETRMIN && opcode != OP_ONCE) - free_stack(common, 1); - } - /* Continue to the normal backtrack. */ - } - -if ((ket != OP_KET && bra != OP_BRAMINZERO) || bra == OP_BRAZERO) - count_match(common); - -cc += 1 + LINK_SIZE; - -if (opcode == OP_ONCE) - { - /* We temporarily encode the needs_control_head in the lowest bit. - Note: on the target architectures of SLJIT the ((x << 1) >> 1) returns - the same value for small signed numbers (including negative numbers). 
*/ - BACKTRACK_AS(bracket_backtrack)->u.framesize = (int)((unsigned)BACKTRACK_AS(bracket_backtrack)->u.framesize << 1) | (needs_control_head ? 1 : 0); - } -return cc + repeat_length; -} - -static PCRE2_SPTR compile_bracketpos_matchingpath(compiler_common *common, PCRE2_SPTR cc, backtrack_common *parent) -{ -DEFINE_COMPILER; -backtrack_common *backtrack; -PCRE2_UCHAR opcode; -int private_data_ptr; -int cbraprivptr = 0; -BOOL needs_control_head; -int framesize; -int stacksize; -int offset = 0; -BOOL zero = FALSE; -PCRE2_SPTR ccbegin = NULL; -int stack; /* Also contains the offset of control head. */ -struct sljit_label *loop = NULL; -struct jump_list *emptymatch = NULL; - -PUSH_BACKTRACK(sizeof(bracketpos_backtrack), cc, NULL); -if (*cc == OP_BRAPOSZERO) - { - zero = TRUE; - cc++; - } - -opcode = *cc; -private_data_ptr = PRIVATE_DATA(cc); -SLJIT_ASSERT(private_data_ptr != 0); -BACKTRACK_AS(bracketpos_backtrack)->private_data_ptr = private_data_ptr; -switch(opcode) - { - case OP_BRAPOS: - case OP_SBRAPOS: - ccbegin = cc + 1 + LINK_SIZE; - break; - - case OP_CBRAPOS: - case OP_SCBRAPOS: - offset = GET2(cc, 1 + LINK_SIZE); - /* This case cannot be optimized in the same was as - normal capturing brackets. 
*/ - SLJIT_ASSERT(common->optimized_cbracket[offset] == 0); - cbraprivptr = OVECTOR_PRIV(offset); - offset <<= 1; - ccbegin = cc + 1 + LINK_SIZE + IMM2_SIZE; - break; - - default: - SLJIT_UNREACHABLE(); - break; - } - -framesize = get_framesize(common, cc, NULL, FALSE, &needs_control_head); -BACKTRACK_AS(bracketpos_backtrack)->framesize = framesize; -if (framesize < 0) - { - if (offset != 0) - { - stacksize = 2; - if (common->capture_last_ptr != 0) - stacksize++; - } - else - stacksize = 1; - - if (needs_control_head) - stacksize++; - if (!zero) - stacksize++; - - BACKTRACK_AS(bracketpos_backtrack)->stacksize = stacksize; - allocate_stack(common, stacksize); - if (framesize == no_frame) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), private_data_ptr, STACK_TOP, 0); - - stack = 0; - if (offset != 0) - { - stack = 2; - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset)); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset + 1)); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), TMP1, 0); - if (common->capture_last_ptr != 0) - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->capture_last_ptr); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(1), TMP2, 0); - if (needs_control_head) - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr); - if (common->capture_last_ptr != 0) - { - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(2), TMP1, 0); - stack = 3; - } - } - else - { - if (needs_control_head) - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0); - stack = 1; - } - - if (needs_control_head) - stack++; - if (!zero) - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(stack), SLJIT_IMM, 1); - if (needs_control_head) - { - stack--; - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(stack), TMP2, 0); - } - } -else - { - stacksize = framesize + 1; - if (!zero) - stacksize++; - if (needs_control_head) - stacksize++; - if (offset == 0) - stacksize++; - 
BACKTRACK_AS(bracketpos_backtrack)->stacksize = stacksize; - - allocate_stack(common, stacksize); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - if (needs_control_head) - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr); - OP2(SLJIT_ADD, SLJIT_MEM1(SLJIT_SP), private_data_ptr, STACK_TOP, 0, SLJIT_IMM, stacksize * sizeof(sljit_sw)); - - stack = 0; - if (!zero) - { - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), SLJIT_IMM, 1); - stack = 1; - } - if (needs_control_head) - { - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(stack), TMP2, 0); - stack++; - } - if (offset == 0) - { - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(stack), STR_PTR, 0); - stack++; - } - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(stack), TMP1, 0); - init_frame(common, cc, NULL, stacksize - 1, stacksize - framesize); - stack -= 1 + (offset == 0); - } - -if (offset != 0) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), cbraprivptr, STR_PTR, 0); - -loop = LABEL(); -while (*cc != OP_KETRPOS) - { - backtrack->top = NULL; - backtrack->topbacktracks = NULL; - cc += GET(cc, 1); - - compile_matchingpath(common, ccbegin, cc, backtrack); - if (SLJIT_UNLIKELY(sljit_get_compiler_error(compiler))) - return NULL; - - if (framesize < 0) - { - if (framesize == no_frame) - OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - - if (offset != 0) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), cbraprivptr); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset + 1), STR_PTR, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), cbraprivptr, STR_PTR, 0); - if (common->capture_last_ptr != 0) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->capture_last_ptr, SLJIT_IMM, offset >> 1); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset), TMP1, 0); - } - else - { - if (opcode == OP_SBRAPOS) - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0); - } - - /* Even if the match is empty, we need to reset the 
control head. */ - if (needs_control_head) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, SLJIT_MEM1(STACK_TOP), STACK(stack)); - - if (opcode == OP_SBRAPOS || opcode == OP_SCBRAPOS) - add_jump(compiler, &emptymatch, CMP(SLJIT_EQUAL, TMP1, 0, STR_PTR, 0)); - - if (!zero) - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(stacksize - 1), SLJIT_IMM, 0); - } - else - { - if (offset != 0) - { - OP2(SLJIT_SUB, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr, SLJIT_IMM, stacksize * sizeof(sljit_sw)); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), cbraprivptr); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset + 1), STR_PTR, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), cbraprivptr, STR_PTR, 0); - if (common->capture_last_ptr != 0) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->capture_last_ptr, SLJIT_IMM, offset >> 1); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset), TMP1, 0); - } - else - { - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - OP2(SLJIT_SUB, STACK_TOP, 0, TMP2, 0, SLJIT_IMM, stacksize * sizeof(sljit_sw)); - if (opcode == OP_SBRAPOS) - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP2), STACK(-framesize - 2)); - OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), STACK(-framesize - 2), STR_PTR, 0); - } - - /* Even if the match is empty, we need to reset the control head. 
*/ - if (needs_control_head) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, SLJIT_MEM1(STACK_TOP), STACK(stack)); - - if (opcode == OP_SBRAPOS || opcode == OP_SCBRAPOS) - add_jump(compiler, &emptymatch, CMP(SLJIT_EQUAL, TMP1, 0, STR_PTR, 0)); - - if (!zero) - { - if (framesize < 0) - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(stacksize - 1), SLJIT_IMM, 0); - else - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), SLJIT_IMM, 0); - } - } - - JUMPTO(SLJIT_JUMP, loop); - flush_stubs(common); - - compile_backtrackingpath(common, backtrack->top); - if (SLJIT_UNLIKELY(sljit_get_compiler_error(compiler))) - return NULL; - set_jumps(backtrack->topbacktracks, LABEL()); - - if (framesize < 0) - { - if (offset != 0) - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), cbraprivptr); - else - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - } - else - { - if (offset != 0) - { - /* Last alternative. */ - if (*cc == OP_KETRPOS) - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), cbraprivptr); - } - else - { - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(TMP2), STACK(-framesize - 2)); - } - } - - if (*cc == OP_KETRPOS) - break; - ccbegin = cc + 1 + LINK_SIZE; - } - -/* We don't have to restore the control head in case of a failed match. */ - -backtrack->topbacktracks = NULL; -if (!zero) - { - if (framesize < 0) - add_jump(compiler, &backtrack->topbacktracks, CMP(SLJIT_NOT_EQUAL, SLJIT_MEM1(STACK_TOP), STACK(stacksize - 1), SLJIT_IMM, 0)); - else /* TMP2 is set to [private_data_ptr] above. */ - add_jump(compiler, &backtrack->topbacktracks, CMP(SLJIT_NOT_EQUAL, SLJIT_MEM1(TMP2), STACK(-stacksize), SLJIT_IMM, 0)); - } - -/* None of them matched. 
*/ -set_jumps(emptymatch, LABEL()); -count_match(common); -return cc + 1 + LINK_SIZE; -} - -static SLJIT_INLINE PCRE2_SPTR get_iterator_parameters(compiler_common *common, PCRE2_SPTR cc, PCRE2_UCHAR *opcode, PCRE2_UCHAR *type, sljit_u32 *max, sljit_u32 *exact, PCRE2_SPTR *end) -{ -int class_len; - -*opcode = *cc; -*exact = 0; - -if (*opcode >= OP_STAR && *opcode <= OP_POSUPTO) - { - cc++; - *type = OP_CHAR; - } -else if (*opcode >= OP_STARI && *opcode <= OP_POSUPTOI) - { - cc++; - *type = OP_CHARI; - *opcode -= OP_STARI - OP_STAR; - } -else if (*opcode >= OP_NOTSTAR && *opcode <= OP_NOTPOSUPTO) - { - cc++; - *type = OP_NOT; - *opcode -= OP_NOTSTAR - OP_STAR; - } -else if (*opcode >= OP_NOTSTARI && *opcode <= OP_NOTPOSUPTOI) - { - cc++; - *type = OP_NOTI; - *opcode -= OP_NOTSTARI - OP_STAR; - } -else if (*opcode >= OP_TYPESTAR && *opcode <= OP_TYPEPOSUPTO) - { - cc++; - *opcode -= OP_TYPESTAR - OP_STAR; - *type = OP_END; - } -else - { - SLJIT_ASSERT(*opcode == OP_CLASS || *opcode == OP_NCLASS || *opcode == OP_XCLASS); - *type = *opcode; - cc++; - class_len = (*type < OP_XCLASS) ? 
(int)(1 + (32 / sizeof(PCRE2_UCHAR))) : GET(cc, 0); - *opcode = cc[class_len - 1]; - - if (*opcode >= OP_CRSTAR && *opcode <= OP_CRMINQUERY) - { - *opcode -= OP_CRSTAR - OP_STAR; - *end = cc + class_len; - - if (*opcode == OP_PLUS || *opcode == OP_MINPLUS) - { - *exact = 1; - *opcode -= OP_PLUS - OP_STAR; - } - } - else if (*opcode >= OP_CRPOSSTAR && *opcode <= OP_CRPOSQUERY) - { - *opcode -= OP_CRPOSSTAR - OP_POSSTAR; - *end = cc + class_len; - - if (*opcode == OP_POSPLUS) - { - *exact = 1; - *opcode = OP_POSSTAR; - } - } - else - { - SLJIT_ASSERT(*opcode == OP_CRRANGE || *opcode == OP_CRMINRANGE || *opcode == OP_CRPOSRANGE); - *max = GET2(cc, (class_len + IMM2_SIZE)); - *exact = GET2(cc, class_len); - - if (*max == 0) - { - if (*opcode == OP_CRPOSRANGE) - *opcode = OP_POSSTAR; - else - *opcode -= OP_CRRANGE - OP_STAR; - } - else - { - *max -= *exact; - if (*max == 0) - *opcode = OP_EXACT; - else if (*max == 1) - { - if (*opcode == OP_CRPOSRANGE) - *opcode = OP_POSQUERY; - else - *opcode -= OP_CRRANGE - OP_QUERY; - } - else - { - if (*opcode == OP_CRPOSRANGE) - *opcode = OP_POSUPTO; - else - *opcode -= OP_CRRANGE - OP_UPTO; - } - } - *end = cc + class_len + 2 * IMM2_SIZE; - } - return cc; - } - -switch(*opcode) - { - case OP_EXACT: - *exact = GET2(cc, 0); - cc += IMM2_SIZE; - break; - - case OP_PLUS: - case OP_MINPLUS: - *exact = 1; - *opcode -= OP_PLUS - OP_STAR; - break; - - case OP_POSPLUS: - *exact = 1; - *opcode = OP_POSSTAR; - break; - - case OP_UPTO: - case OP_MINUPTO: - case OP_POSUPTO: - *max = GET2(cc, 0); - cc += IMM2_SIZE; - break; - } - -if (*type == OP_END) - { - *type = *cc; - *end = next_opcode(common, cc); - cc++; - return cc; - } - -*end = cc + 1; -#ifdef SUPPORT_UNICODE -if (common->utf && HAS_EXTRALEN(*cc)) *end += GET_EXTRALEN(*cc); -#endif -return cc; -} - -static PCRE2_SPTR compile_iterator_matchingpath(compiler_common *common, PCRE2_SPTR cc, backtrack_common *parent) -{ -DEFINE_COMPILER; -backtrack_common *backtrack; -PCRE2_UCHAR opcode; 
-PCRE2_UCHAR type; -sljit_u32 max = 0, exact; -sljit_s32 early_fail_ptr = PRIVATE_DATA(cc + 1); -sljit_s32 early_fail_type; -BOOL charpos_enabled; -PCRE2_UCHAR charpos_char; -unsigned int charpos_othercasebit; -PCRE2_SPTR end; -jump_list *no_match = NULL; -jump_list *no_char1_match = NULL; -struct sljit_jump *jump = NULL; -struct sljit_label *label; -int private_data_ptr = PRIVATE_DATA(cc); -int base = (private_data_ptr == 0) ? SLJIT_MEM1(STACK_TOP) : SLJIT_MEM1(SLJIT_SP); -int offset0 = (private_data_ptr == 0) ? STACK(0) : private_data_ptr; -int offset1 = (private_data_ptr == 0) ? STACK(1) : private_data_ptr + (int)sizeof(sljit_sw); -int tmp_base, tmp_offset; -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -BOOL use_tmp; -#endif - -PUSH_BACKTRACK(sizeof(char_iterator_backtrack), cc, NULL); - -early_fail_type = (early_fail_ptr & 0x7); -early_fail_ptr >>= 3; - -/* During recursion, these optimizations are disabled. */ -if (common->early_fail_start_ptr == 0) - { - early_fail_ptr = 0; - early_fail_type = type_skip; - } - -SLJIT_ASSERT(common->fast_forward_bc_ptr != NULL || early_fail_ptr == 0 - || (early_fail_ptr >= common->early_fail_start_ptr && early_fail_ptr <= common->early_fail_end_ptr)); - -if (early_fail_type == type_fail) - add_jump(compiler, &backtrack->topbacktracks, CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), early_fail_ptr)); - -cc = get_iterator_parameters(common, cc, &opcode, &type, &max, &exact, &end); - -if (type != OP_EXTUNI) - { - tmp_base = TMP3; - tmp_offset = 0; - } -else - { - tmp_base = SLJIT_MEM1(SLJIT_SP); - tmp_offset = POSSESSIVE0; - } - -/* Handle fixed part first. 
*/ -if (exact > 1) - { - SLJIT_ASSERT(early_fail_ptr == 0); - - if (common->mode == PCRE2_JIT_COMPLETE -#ifdef SUPPORT_UNICODE - && !common->utf -#endif - && type != OP_ANYNL && type != OP_EXTUNI) - { - OP2(SLJIT_ADD, TMP1, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(exact)); - add_jump(compiler, &backtrack->topbacktracks, CMP(SLJIT_GREATER, TMP1, 0, STR_END, 0)); - OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, exact); - label = LABEL(); - compile_char1_matchingpath(common, type, cc, &backtrack->topbacktracks, FALSE); - OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1); - JUMPTO(SLJIT_NOT_ZERO, label); - } - else - { - OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, exact); - label = LABEL(); - compile_char1_matchingpath(common, type, cc, &backtrack->topbacktracks, TRUE); - OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1); - JUMPTO(SLJIT_NOT_ZERO, label); - } - } -else if (exact == 1) - { - compile_char1_matchingpath(common, type, cc, &backtrack->topbacktracks, TRUE); - - if (early_fail_type == type_fail_range) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), early_fail_ptr); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), early_fail_ptr + (int)sizeof(sljit_sw)); - OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, TMP2, 0); - OP2(SLJIT_SUB, TMP2, 0, STR_PTR, 0, TMP2, 0); - add_jump(compiler, &backtrack->topbacktracks, CMP(SLJIT_LESS_EQUAL, TMP2, 0, TMP1, 0)); - - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), early_fail_ptr + (int)sizeof(sljit_sw), STR_PTR, 0); - } - } - -switch(opcode) - { - case OP_STAR: - case OP_UPTO: - SLJIT_ASSERT(early_fail_ptr == 0 || opcode == OP_STAR); - - if (type == OP_ANYNL || type == OP_EXTUNI) - { - SLJIT_ASSERT(private_data_ptr == 0); - SLJIT_ASSERT(early_fail_ptr == 0); - - allocate_stack(common, 2); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(1), SLJIT_IMM, 0); - - if (opcode == OP_UPTO) - OP1(SLJIT_MOV, 
SLJIT_MEM1(SLJIT_SP), POSSESSIVE0, SLJIT_IMM, max); - - label = LABEL(); - compile_char1_matchingpath(common, type, cc, &BACKTRACK_AS(char_iterator_backtrack)->u.backtracks, TRUE); - if (opcode == OP_UPTO) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), POSSESSIVE0); - OP2(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); - jump = JUMP(SLJIT_ZERO); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), POSSESSIVE0, TMP1, 0); - } - - /* We cannot use TMP3 because of allocate_stack. */ - allocate_stack(common, 1); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0); - JUMPTO(SLJIT_JUMP, label); - if (jump != NULL) - JUMPHERE(jump); - BACKTRACK_AS(char_iterator_backtrack)->matchingpath = LABEL(); - break; - } -#ifdef SUPPORT_UNICODE - else if (type == OP_ALLANY && !common->invalid_utf) -#else - else if (type == OP_ALLANY) -#endif - { - if (opcode == OP_STAR) - { - if (private_data_ptr == 0) - allocate_stack(common, 2); - - OP1(SLJIT_MOV, base, offset0, STR_END, 0); - OP1(SLJIT_MOV, base, offset1, STR_PTR, 0); - - OP1(SLJIT_MOV, STR_PTR, 0, STR_END, 0); - process_partial_match(common); - - if (early_fail_ptr != 0) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), early_fail_ptr, STR_END, 0); - BACKTRACK_AS(char_iterator_backtrack)->matchingpath = LABEL(); - break; - } -#ifdef SUPPORT_UNICODE - else if (!common->utf) -#else - else -#endif - { - if (private_data_ptr == 0) - allocate_stack(common, 2); - - OP1(SLJIT_MOV, base, offset1, STR_PTR, 0); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(max)); - - if (common->mode == PCRE2_JIT_COMPLETE) - { - OP2(SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_UNUSED, 0, STR_PTR, 0, STR_END, 0); - CMOV(SLJIT_GREATER, STR_PTR, STR_END, 0); - } - else - { - jump = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, STR_END, 0); - process_partial_match(common); - JUMPHERE(jump); - } - - OP1(SLJIT_MOV, base, offset0, STR_PTR, 0); - - if (early_fail_ptr != 0) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), early_fail_ptr, STR_PTR, 0); - 
BACKTRACK_AS(char_iterator_backtrack)->matchingpath = LABEL(); - break; - } - } - - charpos_enabled = FALSE; - charpos_char = 0; - charpos_othercasebit = 0; - - if ((type != OP_CHAR && type != OP_CHARI) && (*end == OP_CHAR || *end == OP_CHARI)) - { -#ifdef SUPPORT_UNICODE - charpos_enabled = !common->utf || !HAS_EXTRALEN(end[1]); -#else - charpos_enabled = TRUE; -#endif - if (charpos_enabled && *end == OP_CHARI && char_has_othercase(common, end + 1)) - { - charpos_othercasebit = char_get_othercase_bit(common, end + 1); - if (charpos_othercasebit == 0) - charpos_enabled = FALSE; - } - - if (charpos_enabled) - { - charpos_char = end[1]; - /* Consume the OP_CHAR opcode. */ - end += 2; -#if PCRE2_CODE_UNIT_WIDTH == 8 - SLJIT_ASSERT((charpos_othercasebit >> 8) == 0); -#elif PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32 - SLJIT_ASSERT((charpos_othercasebit >> 9) == 0); - if ((charpos_othercasebit & 0x100) != 0) - charpos_othercasebit = (charpos_othercasebit & 0xff) << 8; -#endif - if (charpos_othercasebit != 0) - charpos_char |= charpos_othercasebit; - - BACKTRACK_AS(char_iterator_backtrack)->u.charpos.enabled = TRUE; - BACKTRACK_AS(char_iterator_backtrack)->u.charpos.chr = charpos_char; - BACKTRACK_AS(char_iterator_backtrack)->u.charpos.othercasebit = charpos_othercasebit; - } - } - - if (charpos_enabled) - { - if (opcode == OP_UPTO) - OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, max + 1); - - /* Search the first instance of charpos_char. 
*/ - jump = JUMP(SLJIT_JUMP); - label = LABEL(); - if (opcode == OP_UPTO) - { - OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1); - add_jump(compiler, &backtrack->topbacktracks, JUMP(SLJIT_ZERO)); - } - compile_char1_matchingpath(common, type, cc, &backtrack->topbacktracks, FALSE); - if (early_fail_ptr != 0) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), early_fail_ptr, STR_PTR, 0); - JUMPHERE(jump); - - detect_partial_match(common, &backtrack->topbacktracks); - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); - if (charpos_othercasebit != 0) - OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, charpos_othercasebit); - CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, charpos_char, label); - - if (private_data_ptr == 0) - allocate_stack(common, 2); - OP1(SLJIT_MOV, base, offset0, STR_PTR, 0); - OP1(SLJIT_MOV, base, offset1, STR_PTR, 0); - - if (opcode == OP_UPTO) - { - OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1); - add_jump(compiler, &no_match, JUMP(SLJIT_ZERO)); - } - - /* Search the last instance of charpos_char. 
*/ - label = LABEL(); - compile_char1_matchingpath(common, type, cc, &no_match, FALSE); - if (early_fail_ptr != 0) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), early_fail_ptr, STR_PTR, 0); - detect_partial_match(common, &no_match); - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); - if (charpos_othercasebit != 0) - OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, charpos_othercasebit); - - if (opcode == OP_STAR) - { - CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, charpos_char, label); - OP1(SLJIT_MOV, base, offset0, STR_PTR, 0); - JUMPTO(SLJIT_JUMP, label); - } - else - { - jump = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, charpos_char); - OP1(SLJIT_MOV, base, offset0, STR_PTR, 0); - JUMPHERE(jump); - OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1); - JUMPTO(SLJIT_NOT_ZERO, label); - } - - set_jumps(no_match, LABEL()); - OP2(SLJIT_ADD, STR_PTR, 0, base, offset0, SLJIT_IMM, IN_UCHARS(1)); - OP1(SLJIT_MOV, base, offset0, STR_PTR, 0); - } - else - { - if (private_data_ptr == 0) - allocate_stack(common, 2); - - OP1(SLJIT_MOV, base, offset1, STR_PTR, 0); -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 - use_tmp = (!HAS_VIRTUAL_REGISTERS && opcode == OP_STAR); - SLJIT_ASSERT(!use_tmp || tmp_base == TMP3); - - if (common->utf) - OP1(SLJIT_MOV, use_tmp ? TMP3 : base, use_tmp ? 0 : offset0, STR_PTR, 0); -#endif - if (opcode == OP_UPTO) - OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, max); - - detect_partial_match(common, &no_match); - label = LABEL(); - compile_char1_matchingpath(common, type, cc, &no_char1_match, FALSE); -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 - if (common->utf) - OP1(SLJIT_MOV, use_tmp ? TMP3 : base, use_tmp ? 
0 : offset0, STR_PTR, 0); -#endif - - if (opcode == OP_UPTO) - { - OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1); - add_jump(compiler, &no_match, JUMP(SLJIT_ZERO)); - } - - detect_partial_match_to(common, label); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - - set_jumps(no_char1_match, LABEL()); -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 - if (common->utf) - { - set_jumps(no_match, LABEL()); - if (use_tmp) - { - OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0); - OP1(SLJIT_MOV, base, offset0, TMP3, 0); - } - else - OP1(SLJIT_MOV, STR_PTR, 0, base, offset0); - } - else -#endif - { - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - set_jumps(no_match, LABEL()); - OP1(SLJIT_MOV, base, offset0, STR_PTR, 0); - } - - if (early_fail_ptr != 0) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), early_fail_ptr, STR_PTR, 0); - } - - BACKTRACK_AS(char_iterator_backtrack)->matchingpath = LABEL(); - break; - - case OP_MINSTAR: - if (private_data_ptr == 0) - allocate_stack(common, 1); - OP1(SLJIT_MOV, base, offset0, STR_PTR, 0); - BACKTRACK_AS(char_iterator_backtrack)->matchingpath = LABEL(); - if (early_fail_ptr != 0) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), early_fail_ptr, STR_PTR, 0); - break; - - case OP_MINUPTO: - SLJIT_ASSERT(early_fail_ptr == 0); - if (private_data_ptr == 0) - allocate_stack(common, 2); - OP1(SLJIT_MOV, base, offset0, STR_PTR, 0); - OP1(SLJIT_MOV, base, offset1, SLJIT_IMM, max + 1); - BACKTRACK_AS(char_iterator_backtrack)->matchingpath = LABEL(); - break; - - case OP_QUERY: - case OP_MINQUERY: - SLJIT_ASSERT(early_fail_ptr == 0); - if (private_data_ptr == 0) - allocate_stack(common, 1); - OP1(SLJIT_MOV, base, offset0, STR_PTR, 0); - if (opcode == OP_QUERY) - compile_char1_matchingpath(common, type, cc, &BACKTRACK_AS(char_iterator_backtrack)->u.backtracks, TRUE); - BACKTRACK_AS(char_iterator_backtrack)->matchingpath = LABEL(); - break; - - case OP_EXACT: - break; - - case OP_POSSTAR: 
-#if defined SUPPORT_UNICODE - if (type == OP_ALLANY && !common->invalid_utf) -#else - if (type == OP_ALLANY) -#endif - { - OP1(SLJIT_MOV, STR_PTR, 0, STR_END, 0); - process_partial_match(common); - if (early_fail_ptr != 0) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), early_fail_ptr, STR_END, 0); - break; - } - -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 - if (common->utf) - { - OP1(SLJIT_MOV, tmp_base, tmp_offset, STR_PTR, 0); - detect_partial_match(common, &no_match); - label = LABEL(); - compile_char1_matchingpath(common, type, cc, &no_match, FALSE); - OP1(SLJIT_MOV, tmp_base, tmp_offset, STR_PTR, 0); - detect_partial_match_to(common, label); - - set_jumps(no_match, LABEL()); - OP1(SLJIT_MOV, STR_PTR, 0, tmp_base, tmp_offset); - if (early_fail_ptr != 0) - { - if (!HAS_VIRTUAL_REGISTERS && tmp_base == TMP3) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), early_fail_ptr, TMP3, 0); - else - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), early_fail_ptr, STR_PTR, 0); - } - break; - } -#endif - - detect_partial_match(common, &no_match); - label = LABEL(); - compile_char1_matchingpath(common, type, cc, &no_char1_match, FALSE); - detect_partial_match_to(common, label); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - - set_jumps(no_char1_match, LABEL()); - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - set_jumps(no_match, LABEL()); - if (early_fail_ptr != 0) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), early_fail_ptr, STR_PTR, 0); - break; - - case OP_POSUPTO: - SLJIT_ASSERT(early_fail_ptr == 0); -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 - if (common->utf) - { - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), POSSESSIVE1, STR_PTR, 0); - OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, max); - - detect_partial_match(common, &no_match); - label = LABEL(); - compile_char1_matchingpath(common, type, cc, &no_match, FALSE); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), POSSESSIVE1, STR_PTR, 0); - OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, 
tmp_base, tmp_offset, SLJIT_IMM, 1); - add_jump(compiler, &no_match, JUMP(SLJIT_ZERO)); - detect_partial_match_to(common, label); - - set_jumps(no_match, LABEL()); - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), POSSESSIVE1); - break; - } -#endif - - if (type == OP_ALLANY) - { - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(max)); - - if (common->mode == PCRE2_JIT_COMPLETE) - { - OP2(SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_UNUSED, 0, STR_PTR, 0, STR_END, 0); - CMOV(SLJIT_GREATER, STR_PTR, STR_END, 0); - } - else - { - jump = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, STR_END, 0); - process_partial_match(common); - JUMPHERE(jump); - } - break; - } - - OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, max); - - detect_partial_match(common, &no_match); - label = LABEL(); - compile_char1_matchingpath(common, type, cc, &no_char1_match, FALSE); - OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1); - add_jump(compiler, &no_match, JUMP(SLJIT_ZERO)); - detect_partial_match_to(common, label); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - - set_jumps(no_char1_match, LABEL()); - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - set_jumps(no_match, LABEL()); - break; - - case OP_POSQUERY: - SLJIT_ASSERT(early_fail_ptr == 0); - OP1(SLJIT_MOV, tmp_base, tmp_offset, STR_PTR, 0); - compile_char1_matchingpath(common, type, cc, &no_match, TRUE); - OP1(SLJIT_MOV, tmp_base, tmp_offset, STR_PTR, 0); - set_jumps(no_match, LABEL()); - OP1(SLJIT_MOV, STR_PTR, 0, tmp_base, tmp_offset); - break; - - default: - SLJIT_UNREACHABLE(); - break; - } - -count_match(common); -return end; -} - -static SLJIT_INLINE PCRE2_SPTR compile_fail_accept_matchingpath(compiler_common *common, PCRE2_SPTR cc, backtrack_common *parent) -{ -DEFINE_COMPILER; -backtrack_common *backtrack; - -PUSH_BACKTRACK(sizeof(backtrack_common), cc, NULL); - -if (*cc == OP_FAIL) - { - add_jump(compiler, &backtrack->topbacktracks, JUMP(SLJIT_JUMP)); - 
return cc + 1; - } - -if (*cc == OP_ACCEPT && common->currententry == NULL && (common->re->overall_options & PCRE2_ENDANCHORED) != 0) - add_jump(compiler, &common->reset_match, CMP(SLJIT_NOT_EQUAL, STR_PTR, 0, STR_END, 0)); - -if (*cc == OP_ASSERT_ACCEPT || common->currententry != NULL || !common->might_be_empty) - { - /* No need to check notempty conditions. */ - if (common->accept_label == NULL) - add_jump(compiler, &common->accept, JUMP(SLJIT_JUMP)); - else - JUMPTO(SLJIT_JUMP, common->accept_label); - return cc + 1; - } - -if (common->accept_label == NULL) - add_jump(compiler, &common->accept, CMP(SLJIT_NOT_EQUAL, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(0))); -else - CMPTO(SLJIT_NOT_EQUAL, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(0), common->accept_label); - -if (HAS_VIRTUAL_REGISTERS) - { - OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0); - OP1(SLJIT_MOV_U32, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, options)); - } -else - OP1(SLJIT_MOV_U32, TMP2, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, options)); - -OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, PCRE2_NOTEMPTY); -add_jump(compiler, &backtrack->topbacktracks, JUMP(SLJIT_NOT_ZERO)); -OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, PCRE2_NOTEMPTY_ATSTART); -if (common->accept_label == NULL) - add_jump(compiler, &common->accept, JUMP(SLJIT_ZERO)); -else - JUMPTO(SLJIT_ZERO, common->accept_label); - -OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(HAS_VIRTUAL_REGISTERS ? 
TMP1 : ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, str)); -if (common->accept_label == NULL) - add_jump(compiler, &common->accept, CMP(SLJIT_NOT_EQUAL, TMP2, 0, STR_PTR, 0)); -else - CMPTO(SLJIT_NOT_EQUAL, TMP2, 0, STR_PTR, 0, common->accept_label); -add_jump(compiler, &backtrack->topbacktracks, JUMP(SLJIT_JUMP)); -return cc + 1; -} - -static SLJIT_INLINE PCRE2_SPTR compile_close_matchingpath(compiler_common *common, PCRE2_SPTR cc) -{ -DEFINE_COMPILER; -int offset = GET2(cc, 1); -BOOL optimized_cbracket = common->optimized_cbracket[offset] != 0; - -/* Data will be discarded anyway... */ -if (common->currententry != NULL) - return cc + 1 + IMM2_SIZE; - -if (!optimized_cbracket) - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR_PRIV(offset)); -offset <<= 1; -OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset + 1), STR_PTR, 0); -if (!optimized_cbracket) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset), TMP1, 0); -return cc + 1 + IMM2_SIZE; -} - -static SLJIT_INLINE PCRE2_SPTR compile_control_verb_matchingpath(compiler_common *common, PCRE2_SPTR cc, backtrack_common *parent) -{ -DEFINE_COMPILER; -backtrack_common *backtrack; -PCRE2_UCHAR opcode = *cc; -PCRE2_SPTR ccend = cc + 1; - -if (opcode == OP_COMMIT_ARG || opcode == OP_PRUNE_ARG || - opcode == OP_SKIP_ARG || opcode == OP_THEN_ARG) - ccend += 2 + cc[1]; - -PUSH_BACKTRACK(sizeof(backtrack_common), cc, NULL); - -if (opcode == OP_SKIP) - { - allocate_stack(common, 1); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0); - return ccend; - } - -if (opcode == OP_COMMIT_ARG || opcode == OP_PRUNE_ARG || opcode == OP_THEN_ARG) - { - if (HAS_VIRTUAL_REGISTERS) - OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, (sljit_sw)(cc + 2)); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->mark_ptr, TMP2, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(HAS_VIRTUAL_REGISTERS ? 
TMP1 : ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, mark_ptr), TMP2, 0); - } - -return ccend; -} - -static PCRE2_UCHAR then_trap_opcode[1] = { OP_THEN_TRAP }; - -static SLJIT_INLINE void compile_then_trap_matchingpath(compiler_common *common, PCRE2_SPTR cc, PCRE2_SPTR ccend, backtrack_common *parent) -{ -DEFINE_COMPILER; -backtrack_common *backtrack; -BOOL needs_control_head; -int size; - -PUSH_BACKTRACK_NOVALUE(sizeof(then_trap_backtrack), cc); -common->then_trap = BACKTRACK_AS(then_trap_backtrack); -BACKTRACK_AS(then_trap_backtrack)->common.cc = then_trap_opcode; -BACKTRACK_AS(then_trap_backtrack)->start = (sljit_sw)(cc - common->start); -BACKTRACK_AS(then_trap_backtrack)->framesize = get_framesize(common, cc, ccend, FALSE, &needs_control_head); - -size = BACKTRACK_AS(then_trap_backtrack)->framesize; -size = 3 + (size < 0 ? 0 : size); - -OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr); -allocate_stack(common, size); -if (size > 3) - OP2(SLJIT_ADD, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, STACK_TOP, 0, SLJIT_IMM, (size - 3) * sizeof(sljit_sw)); -else - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, STACK_TOP, 0); -OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(size - 1), SLJIT_IMM, BACKTRACK_AS(then_trap_backtrack)->start); -OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(size - 2), SLJIT_IMM, type_then_trap); -OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(size - 3), TMP2, 0); - -size = BACKTRACK_AS(then_trap_backtrack)->framesize; -if (size >= 0) - init_frame(common, cc, ccend, size - 1, 0); -} - -static void compile_matchingpath(compiler_common *common, PCRE2_SPTR cc, PCRE2_SPTR ccend, backtrack_common *parent) -{ -DEFINE_COMPILER; -backtrack_common *backtrack; -BOOL has_then_trap = FALSE; -then_trap_backtrack *save_then_trap = NULL; - -SLJIT_ASSERT(*ccend == OP_END || (*ccend >= OP_ALT && *ccend <= OP_KETRPOS)); - -if (common->has_then && common->then_offsets[cc - common->start] != 0) - { - SLJIT_ASSERT(*ccend != OP_END && 
common->control_head_ptr != 0); - has_then_trap = TRUE; - save_then_trap = common->then_trap; - /* Tail item on backtrack. */ - compile_then_trap_matchingpath(common, cc, ccend, parent); - } - -while (cc < ccend) - { - switch(*cc) - { - case OP_SOD: - case OP_SOM: - case OP_NOT_WORD_BOUNDARY: - case OP_WORD_BOUNDARY: - case OP_EODN: - case OP_EOD: - case OP_DOLL: - case OP_DOLLM: - case OP_CIRC: - case OP_CIRCM: - case OP_REVERSE: - cc = compile_simple_assertion_matchingpath(common, *cc, cc + 1, parent->top != NULL ? &parent->top->nextbacktracks : &parent->topbacktracks); - break; - - case OP_NOT_DIGIT: - case OP_DIGIT: - case OP_NOT_WHITESPACE: - case OP_WHITESPACE: - case OP_NOT_WORDCHAR: - case OP_WORDCHAR: - case OP_ANY: - case OP_ALLANY: - case OP_ANYBYTE: - case OP_NOTPROP: - case OP_PROP: - case OP_ANYNL: - case OP_NOT_HSPACE: - case OP_HSPACE: - case OP_NOT_VSPACE: - case OP_VSPACE: - case OP_EXTUNI: - case OP_NOT: - case OP_NOTI: - cc = compile_char1_matchingpath(common, *cc, cc + 1, parent->top != NULL ? &parent->top->nextbacktracks : &parent->topbacktracks, TRUE); - break; - - case OP_SET_SOM: - PUSH_BACKTRACK_NOVALUE(sizeof(backtrack_common), cc); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(0)); - allocate_stack(common, 1); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(0), STR_PTR, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), TMP2, 0); - cc++; - break; - - case OP_CHAR: - case OP_CHARI: - if (common->mode == PCRE2_JIT_COMPLETE) - cc = compile_charn_matchingpath(common, cc, ccend, parent->top != NULL ? &parent->top->nextbacktracks : &parent->topbacktracks); - else - cc = compile_char1_matchingpath(common, *cc, cc + 1, parent->top != NULL ? 
&parent->top->nextbacktracks : &parent->topbacktracks, TRUE); - break; - - case OP_STAR: - case OP_MINSTAR: - case OP_PLUS: - case OP_MINPLUS: - case OP_QUERY: - case OP_MINQUERY: - case OP_UPTO: - case OP_MINUPTO: - case OP_EXACT: - case OP_POSSTAR: - case OP_POSPLUS: - case OP_POSQUERY: - case OP_POSUPTO: - case OP_STARI: - case OP_MINSTARI: - case OP_PLUSI: - case OP_MINPLUSI: - case OP_QUERYI: - case OP_MINQUERYI: - case OP_UPTOI: - case OP_MINUPTOI: - case OP_EXACTI: - case OP_POSSTARI: - case OP_POSPLUSI: - case OP_POSQUERYI: - case OP_POSUPTOI: - case OP_NOTSTAR: - case OP_NOTMINSTAR: - case OP_NOTPLUS: - case OP_NOTMINPLUS: - case OP_NOTQUERY: - case OP_NOTMINQUERY: - case OP_NOTUPTO: - case OP_NOTMINUPTO: - case OP_NOTEXACT: - case OP_NOTPOSSTAR: - case OP_NOTPOSPLUS: - case OP_NOTPOSQUERY: - case OP_NOTPOSUPTO: - case OP_NOTSTARI: - case OP_NOTMINSTARI: - case OP_NOTPLUSI: - case OP_NOTMINPLUSI: - case OP_NOTQUERYI: - case OP_NOTMINQUERYI: - case OP_NOTUPTOI: - case OP_NOTMINUPTOI: - case OP_NOTEXACTI: - case OP_NOTPOSSTARI: - case OP_NOTPOSPLUSI: - case OP_NOTPOSQUERYI: - case OP_NOTPOSUPTOI: - case OP_TYPESTAR: - case OP_TYPEMINSTAR: - case OP_TYPEPLUS: - case OP_TYPEMINPLUS: - case OP_TYPEQUERY: - case OP_TYPEMINQUERY: - case OP_TYPEUPTO: - case OP_TYPEMINUPTO: - case OP_TYPEEXACT: - case OP_TYPEPOSSTAR: - case OP_TYPEPOSPLUS: - case OP_TYPEPOSQUERY: - case OP_TYPEPOSUPTO: - cc = compile_iterator_matchingpath(common, cc, parent); - break; - - case OP_CLASS: - case OP_NCLASS: - if (cc[1 + (32 / sizeof(PCRE2_UCHAR))] >= OP_CRSTAR && cc[1 + (32 / sizeof(PCRE2_UCHAR))] <= OP_CRPOSRANGE) - cc = compile_iterator_matchingpath(common, cc, parent); - else - cc = compile_char1_matchingpath(common, *cc, cc + 1, parent->top != NULL ? 
&parent->top->nextbacktracks : &parent->topbacktracks, TRUE); - break; - -#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32 - case OP_XCLASS: - if (*(cc + GET(cc, 1)) >= OP_CRSTAR && *(cc + GET(cc, 1)) <= OP_CRPOSRANGE) - cc = compile_iterator_matchingpath(common, cc, parent); - else - cc = compile_char1_matchingpath(common, *cc, cc + 1, parent->top != NULL ? &parent->top->nextbacktracks : &parent->topbacktracks, TRUE); - break; -#endif - - case OP_REF: - case OP_REFI: - if (cc[1 + IMM2_SIZE] >= OP_CRSTAR && cc[1 + IMM2_SIZE] <= OP_CRPOSRANGE) - cc = compile_ref_iterator_matchingpath(common, cc, parent); - else - { - compile_ref_matchingpath(common, cc, parent->top != NULL ? &parent->top->nextbacktracks : &parent->topbacktracks, TRUE, FALSE); - cc += 1 + IMM2_SIZE; - } - break; - - case OP_DNREF: - case OP_DNREFI: - if (cc[1 + 2 * IMM2_SIZE] >= OP_CRSTAR && cc[1 + 2 * IMM2_SIZE] <= OP_CRPOSRANGE) - cc = compile_ref_iterator_matchingpath(common, cc, parent); - else - { - compile_dnref_search(common, cc, parent->top != NULL ? &parent->top->nextbacktracks : &parent->topbacktracks); - compile_ref_matchingpath(common, cc, parent->top != NULL ? 
&parent->top->nextbacktracks : &parent->topbacktracks, TRUE, FALSE); - cc += 1 + 2 * IMM2_SIZE; - } - break; - - case OP_RECURSE: - cc = compile_recurse_matchingpath(common, cc, parent); - break; - - case OP_CALLOUT: - case OP_CALLOUT_STR: - cc = compile_callout_matchingpath(common, cc, parent); - break; - - case OP_ASSERT: - case OP_ASSERT_NOT: - case OP_ASSERTBACK: - case OP_ASSERTBACK_NOT: - PUSH_BACKTRACK_NOVALUE(sizeof(assert_backtrack), cc); - cc = compile_assert_matchingpath(common, cc, BACKTRACK_AS(assert_backtrack), FALSE); - break; - - case OP_BRAMINZERO: - PUSH_BACKTRACK_NOVALUE(sizeof(braminzero_backtrack), cc); - cc = bracketend(cc + 1); - if (*(cc - 1 - LINK_SIZE) != OP_KETRMIN) - { - allocate_stack(common, 1); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0); - } - else - { - allocate_stack(common, 2); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), SLJIT_IMM, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(1), STR_PTR, 0); - } - BACKTRACK_AS(braminzero_backtrack)->matchingpath = LABEL(); - count_match(common); - break; - - case OP_ASSERT_NA: - case OP_ASSERTBACK_NA: - case OP_ONCE: - case OP_SCRIPT_RUN: - case OP_BRA: - case OP_CBRA: - case OP_COND: - case OP_SBRA: - case OP_SCBRA: - case OP_SCOND: - cc = compile_bracket_matchingpath(common, cc, parent); - break; - - case OP_BRAZERO: - if (cc[1] > OP_ASSERTBACK_NOT) - cc = compile_bracket_matchingpath(common, cc, parent); - else - { - PUSH_BACKTRACK_NOVALUE(sizeof(assert_backtrack), cc); - cc = compile_assert_matchingpath(common, cc, BACKTRACK_AS(assert_backtrack), FALSE); - } - break; - - case OP_BRAPOS: - case OP_CBRAPOS: - case OP_SBRAPOS: - case OP_SCBRAPOS: - case OP_BRAPOSZERO: - cc = compile_bracketpos_matchingpath(common, cc, parent); - break; - - case OP_MARK: - PUSH_BACKTRACK_NOVALUE(sizeof(backtrack_common), cc); - SLJIT_ASSERT(common->mark_ptr != 0); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), common->mark_ptr); - allocate_stack(common, common->has_skip_arg ? 
5 : 1); - if (HAS_VIRTUAL_REGISTERS) - OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(common->has_skip_arg ? 4 : 0), TMP2, 0); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, (sljit_sw)(cc + 2)); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->mark_ptr, TMP2, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(HAS_VIRTUAL_REGISTERS ? TMP1 : ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, mark_ptr), TMP2, 0); - if (common->has_skip_arg) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, STACK_TOP, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(1), SLJIT_IMM, type_mark); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(2), SLJIT_IMM, (sljit_sw)(cc + 2)); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(3), STR_PTR, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), TMP1, 0); - } - cc += 1 + 2 + cc[1]; - break; - - case OP_PRUNE: - case OP_PRUNE_ARG: - case OP_SKIP: - case OP_SKIP_ARG: - case OP_THEN: - case OP_THEN_ARG: - case OP_COMMIT: - case OP_COMMIT_ARG: - cc = compile_control_verb_matchingpath(common, cc, parent); - break; - - case OP_FAIL: - case OP_ACCEPT: - case OP_ASSERT_ACCEPT: - cc = compile_fail_accept_matchingpath(common, cc, parent); - break; - - case OP_CLOSE: - cc = compile_close_matchingpath(common, cc); - break; - - case OP_SKIPZERO: - cc = bracketend(cc + 1); - break; - - default: - SLJIT_UNREACHABLE(); - return; - } - if (cc == NULL) - return; - } - -if (has_then_trap) - { - /* Head item on backtrack. 
*/ - PUSH_BACKTRACK_NOVALUE(sizeof(then_trap_backtrack), cc); - BACKTRACK_AS(then_trap_backtrack)->common.cc = then_trap_opcode; - BACKTRACK_AS(then_trap_backtrack)->then_trap = common->then_trap; - common->then_trap = save_then_trap; - } -SLJIT_ASSERT(cc == ccend); -} - -#undef PUSH_BACKTRACK -#undef PUSH_BACKTRACK_NOVALUE -#undef BACKTRACK_AS - -#define COMPILE_BACKTRACKINGPATH(current) \ - do \ - { \ - compile_backtrackingpath(common, (current)); \ - if (SLJIT_UNLIKELY(sljit_get_compiler_error(compiler))) \ - return; \ - } \ - while (0) - -#define CURRENT_AS(type) ((type *)current) - -static void compile_iterator_backtrackingpath(compiler_common *common, struct backtrack_common *current) -{ -DEFINE_COMPILER; -PCRE2_SPTR cc = current->cc; -PCRE2_UCHAR opcode; -PCRE2_UCHAR type; -sljit_u32 max = 0, exact; -struct sljit_label *label = NULL; -struct sljit_jump *jump = NULL; -jump_list *jumplist = NULL; -PCRE2_SPTR end; -int private_data_ptr = PRIVATE_DATA(cc); -int base = (private_data_ptr == 0) ? SLJIT_MEM1(STACK_TOP) : SLJIT_MEM1(SLJIT_SP); -int offset0 = (private_data_ptr == 0) ? STACK(0) : private_data_ptr; -int offset1 = (private_data_ptr == 0) ? 
STACK(1) : private_data_ptr + (int)sizeof(sljit_sw); - -cc = get_iterator_parameters(common, cc, &opcode, &type, &max, &exact, &end); - -switch(opcode) - { - case OP_STAR: - case OP_UPTO: - if (type == OP_ANYNL || type == OP_EXTUNI) - { - SLJIT_ASSERT(private_data_ptr == 0); - set_jumps(CURRENT_AS(char_iterator_backtrack)->u.backtracks, LABEL()); - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - free_stack(common, 1); - CMPTO(SLJIT_NOT_EQUAL, STR_PTR, 0, SLJIT_IMM, 0, CURRENT_AS(char_iterator_backtrack)->matchingpath); - } - else - { - if (CURRENT_AS(char_iterator_backtrack)->u.charpos.enabled) - { - OP1(SLJIT_MOV, STR_PTR, 0, base, offset0); - OP1(SLJIT_MOV, TMP2, 0, base, offset1); - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - - jump = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, TMP2, 0); - label = LABEL(); - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1)); - OP1(SLJIT_MOV, base, offset0, STR_PTR, 0); - if (CURRENT_AS(char_iterator_backtrack)->u.charpos.othercasebit != 0) - OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, CURRENT_AS(char_iterator_backtrack)->u.charpos.othercasebit); - CMPTO(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, CURRENT_AS(char_iterator_backtrack)->u.charpos.chr, CURRENT_AS(char_iterator_backtrack)->matchingpath); - move_back(common, NULL, TRUE); - CMPTO(SLJIT_GREATER, STR_PTR, 0, TMP2, 0, label); - } - else - { - OP1(SLJIT_MOV, STR_PTR, 0, base, offset0); - jump = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, base, offset1); - move_back(common, NULL, TRUE); - OP1(SLJIT_MOV, base, offset0, STR_PTR, 0); - JUMPTO(SLJIT_JUMP, CURRENT_AS(char_iterator_backtrack)->matchingpath); - } - JUMPHERE(jump); - if (private_data_ptr == 0) - free_stack(common, 2); - } - break; - - case OP_MINSTAR: - OP1(SLJIT_MOV, STR_PTR, 0, base, offset0); - compile_char1_matchingpath(common, type, cc, &jumplist, TRUE); - OP1(SLJIT_MOV, base, offset0, STR_PTR, 0); - JUMPTO(SLJIT_JUMP, CURRENT_AS(char_iterator_backtrack)->matchingpath); - set_jumps(jumplist, 
LABEL()); - if (private_data_ptr == 0) - free_stack(common, 1); - break; - - case OP_MINUPTO: - OP1(SLJIT_MOV, TMP1, 0, base, offset1); - OP1(SLJIT_MOV, STR_PTR, 0, base, offset0); - OP2(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); - add_jump(compiler, &jumplist, JUMP(SLJIT_ZERO)); - - OP1(SLJIT_MOV, base, offset1, TMP1, 0); - compile_char1_matchingpath(common, type, cc, &jumplist, TRUE); - OP1(SLJIT_MOV, base, offset0, STR_PTR, 0); - JUMPTO(SLJIT_JUMP, CURRENT_AS(char_iterator_backtrack)->matchingpath); - - set_jumps(jumplist, LABEL()); - if (private_data_ptr == 0) - free_stack(common, 2); - break; - - case OP_QUERY: - OP1(SLJIT_MOV, STR_PTR, 0, base, offset0); - OP1(SLJIT_MOV, base, offset0, SLJIT_IMM, 0); - CMPTO(SLJIT_NOT_EQUAL, STR_PTR, 0, SLJIT_IMM, 0, CURRENT_AS(char_iterator_backtrack)->matchingpath); - jump = JUMP(SLJIT_JUMP); - set_jumps(CURRENT_AS(char_iterator_backtrack)->u.backtracks, LABEL()); - OP1(SLJIT_MOV, STR_PTR, 0, base, offset0); - OP1(SLJIT_MOV, base, offset0, SLJIT_IMM, 0); - JUMPTO(SLJIT_JUMP, CURRENT_AS(char_iterator_backtrack)->matchingpath); - JUMPHERE(jump); - if (private_data_ptr == 0) - free_stack(common, 1); - break; - - case OP_MINQUERY: - OP1(SLJIT_MOV, STR_PTR, 0, base, offset0); - OP1(SLJIT_MOV, base, offset0, SLJIT_IMM, 0); - jump = CMP(SLJIT_EQUAL, STR_PTR, 0, SLJIT_IMM, 0); - compile_char1_matchingpath(common, type, cc, &jumplist, TRUE); - JUMPTO(SLJIT_JUMP, CURRENT_AS(char_iterator_backtrack)->matchingpath); - set_jumps(jumplist, LABEL()); - JUMPHERE(jump); - if (private_data_ptr == 0) - free_stack(common, 1); - break; - - case OP_EXACT: - case OP_POSSTAR: - case OP_POSQUERY: - case OP_POSUPTO: - break; - - default: - SLJIT_UNREACHABLE(); - break; - } - -set_jumps(current->topbacktracks, LABEL()); -} - -static SLJIT_INLINE void compile_ref_iterator_backtrackingpath(compiler_common *common, struct backtrack_common *current) -{ -DEFINE_COMPILER; -PCRE2_SPTR cc = current->cc; -BOOL ref = (*cc == OP_REF || *cc == 
OP_REFI); -PCRE2_UCHAR type; - -type = cc[ref ? 1 + IMM2_SIZE : 1 + 2 * IMM2_SIZE]; - -if ((type & 0x1) == 0) - { - /* Maximize case. */ - set_jumps(current->topbacktracks, LABEL()); - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - free_stack(common, 1); - CMPTO(SLJIT_NOT_EQUAL, STR_PTR, 0, SLJIT_IMM, 0, CURRENT_AS(ref_iterator_backtrack)->matchingpath); - return; - } - -OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); -CMPTO(SLJIT_NOT_EQUAL, STR_PTR, 0, SLJIT_IMM, 0, CURRENT_AS(ref_iterator_backtrack)->matchingpath); -set_jumps(current->topbacktracks, LABEL()); -free_stack(common, ref ? 2 : 3); -} - -static SLJIT_INLINE void compile_recurse_backtrackingpath(compiler_common *common, struct backtrack_common *current) -{ -DEFINE_COMPILER; -recurse_entry *entry; - -if (!CURRENT_AS(recurse_backtrack)->inlined_pattern) - { - entry = CURRENT_AS(recurse_backtrack)->entry; - if (entry->backtrack_label == NULL) - add_jump(compiler, &entry->backtrack_calls, JUMP(SLJIT_FAST_CALL)); - else - JUMPTO(SLJIT_FAST_CALL, entry->backtrack_label); - CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, 0, CURRENT_AS(recurse_backtrack)->matchingpath); - } -else - compile_backtrackingpath(common, current->top); - -set_jumps(current->topbacktracks, LABEL()); -} - -static void compile_assert_backtrackingpath(compiler_common *common, struct backtrack_common *current) -{ -DEFINE_COMPILER; -PCRE2_SPTR cc = current->cc; -PCRE2_UCHAR bra = OP_BRA; -struct sljit_jump *brajump = NULL; - -SLJIT_ASSERT(*cc != OP_BRAMINZERO); -if (*cc == OP_BRAZERO) - { - bra = *cc; - cc++; - } - -if (bra == OP_BRAZERO) - { - SLJIT_ASSERT(current->topbacktracks == NULL); - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - } - -if (CURRENT_AS(assert_backtrack)->framesize < 0) - { - set_jumps(current->topbacktracks, LABEL()); - - if (bra == OP_BRAZERO) - { - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), SLJIT_IMM, 0); - CMPTO(SLJIT_NOT_EQUAL, STR_PTR, 0, SLJIT_IMM, 0, 
CURRENT_AS(assert_backtrack)->matchingpath); - free_stack(common, 1); - } - return; - } - -if (bra == OP_BRAZERO) - { - if (*cc == OP_ASSERT_NOT || *cc == OP_ASSERTBACK_NOT) - { - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), SLJIT_IMM, 0); - CMPTO(SLJIT_NOT_EQUAL, STR_PTR, 0, SLJIT_IMM, 0, CURRENT_AS(assert_backtrack)->matchingpath); - free_stack(common, 1); - return; - } - free_stack(common, 1); - brajump = CMP(SLJIT_EQUAL, STR_PTR, 0, SLJIT_IMM, 0); - } - -if (*cc == OP_ASSERT || *cc == OP_ASSERTBACK) - { - OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), CURRENT_AS(assert_backtrack)->private_data_ptr); - add_jump(compiler, &common->revertframes, JUMP(SLJIT_FAST_CALL)); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(-2)); - OP2(SLJIT_ADD, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, (CURRENT_AS(assert_backtrack)->framesize - 1) * sizeof(sljit_sw)); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), CURRENT_AS(assert_backtrack)->private_data_ptr, TMP1, 0); - - set_jumps(current->topbacktracks, LABEL()); - } -else - set_jumps(current->topbacktracks, LABEL()); - -if (bra == OP_BRAZERO) - { - /* We know there is enough place on the stack. 
*/ - OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, sizeof(sljit_sw)); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), SLJIT_IMM, 0); - JUMPTO(SLJIT_JUMP, CURRENT_AS(assert_backtrack)->matchingpath); - JUMPHERE(brajump); - } -} - -static void compile_bracket_backtrackingpath(compiler_common *common, struct backtrack_common *current) -{ -DEFINE_COMPILER; -int opcode, stacksize, alt_count, alt_max; -int offset = 0; -int private_data_ptr = CURRENT_AS(bracket_backtrack)->private_data_ptr; -int repeat_ptr = 0, repeat_type = 0, repeat_count = 0; -PCRE2_SPTR cc = current->cc; -PCRE2_SPTR ccbegin; -PCRE2_SPTR ccprev; -PCRE2_UCHAR bra = OP_BRA; -PCRE2_UCHAR ket; -assert_backtrack *assert; -BOOL has_alternatives; -BOOL needs_control_head = FALSE; -struct sljit_jump *brazero = NULL; -struct sljit_jump *next_alt = NULL; -struct sljit_jump *once = NULL; -struct sljit_jump *cond = NULL; -struct sljit_label *rmin_label = NULL; -struct sljit_label *exact_label = NULL; -struct sljit_put_label *put_label = NULL; - -if (*cc == OP_BRAZERO || *cc == OP_BRAMINZERO) - { - bra = *cc; - cc++; - } - -opcode = *cc; -ccbegin = bracketend(cc) - 1 - LINK_SIZE; -ket = *ccbegin; -if (ket == OP_KET && PRIVATE_DATA(ccbegin) != 0) - { - repeat_ptr = PRIVATE_DATA(ccbegin); - repeat_type = PRIVATE_DATA(ccbegin + 2); - repeat_count = PRIVATE_DATA(ccbegin + 3); - SLJIT_ASSERT(repeat_type != 0 && repeat_count != 0); - if (repeat_type == OP_UPTO) - ket = OP_KETRMAX; - if (repeat_type == OP_MINUPTO) - ket = OP_KETRMIN; - } -ccbegin = cc; -cc += GET(cc, 1); -has_alternatives = *cc == OP_ALT; -if (SLJIT_UNLIKELY(opcode == OP_COND) || SLJIT_UNLIKELY(opcode == OP_SCOND)) - has_alternatives = (ccbegin[1 + LINK_SIZE] >= OP_ASSERT && ccbegin[1 + LINK_SIZE] <= OP_ASSERTBACK_NOT) || CURRENT_AS(bracket_backtrack)->u.condfailed != NULL; -if (opcode == OP_CBRA || opcode == OP_SCBRA) - offset = (GET2(ccbegin, 1 + LINK_SIZE)) << 1; -if (SLJIT_UNLIKELY(opcode == OP_COND) && (*cc == OP_KETRMAX || *cc == 
OP_KETRMIN)) - opcode = OP_SCOND; - -alt_max = has_alternatives ? no_alternatives(ccbegin) : 0; - -/* Decoding the needs_control_head in framesize. */ -if (opcode == OP_ONCE) - { - needs_control_head = (CURRENT_AS(bracket_backtrack)->u.framesize & 0x1) != 0; - CURRENT_AS(bracket_backtrack)->u.framesize >>= 1; - } - -if (ket != OP_KET && repeat_type != 0) - { - /* TMP1 is used in OP_KETRMIN below. */ - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - free_stack(common, 1); - if (repeat_type == OP_UPTO) - OP2(SLJIT_ADD, SLJIT_MEM1(SLJIT_SP), repeat_ptr, TMP1, 0, SLJIT_IMM, 1); - else - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), repeat_ptr, TMP1, 0); - } - -if (ket == OP_KETRMAX) - { - if (bra == OP_BRAZERO) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - free_stack(common, 1); - brazero = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0); - } - } -else if (ket == OP_KETRMIN) - { - if (bra != OP_BRAMINZERO) - { - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - if (repeat_type != 0) - { - /* TMP1 was set a few lines above. */ - CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, 0, CURRENT_AS(bracket_backtrack)->recursive_matchingpath); - /* Drop STR_PTR for non-greedy plus quantifier. */ - if (opcode != OP_ONCE) - free_stack(common, 1); - } - else if (opcode >= OP_SBRA || opcode == OP_ONCE) - { - /* Checking zero-length iteration. */ - if (opcode != OP_ONCE || CURRENT_AS(bracket_backtrack)->u.framesize < 0) - CMPTO(SLJIT_NOT_EQUAL, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr, CURRENT_AS(bracket_backtrack)->recursive_matchingpath); - else - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - CMPTO(SLJIT_NOT_EQUAL, STR_PTR, 0, SLJIT_MEM1(TMP1), STACK(-CURRENT_AS(bracket_backtrack)->u.framesize - 2), CURRENT_AS(bracket_backtrack)->recursive_matchingpath); - } - /* Drop STR_PTR for non-greedy plus quantifier. 
*/ - if (opcode != OP_ONCE) - free_stack(common, 1); - } - else - JUMPTO(SLJIT_JUMP, CURRENT_AS(bracket_backtrack)->recursive_matchingpath); - } - rmin_label = LABEL(); - if (repeat_type != 0) - OP2(SLJIT_ADD, SLJIT_MEM1(SLJIT_SP), repeat_ptr, SLJIT_MEM1(SLJIT_SP), repeat_ptr, SLJIT_IMM, 1); - } -else if (bra == OP_BRAZERO) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - free_stack(common, 1); - brazero = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, 0); - } -else if (repeat_type == OP_EXACT) - { - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), repeat_ptr, SLJIT_IMM, 1); - exact_label = LABEL(); - } - -if (offset != 0) - { - if (common->capture_last_ptr != 0) - { - SLJIT_ASSERT(common->optimized_cbracket[offset >> 1] == 0); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(STACK_TOP), STACK(1)); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->capture_last_ptr, TMP1, 0); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(2)); - free_stack(common, 3); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset), TMP2, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset + 1), TMP1, 0); - } - else if (common->optimized_cbracket[offset >> 1] == 0) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(STACK_TOP), STACK(1)); - free_stack(common, 2); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset), TMP1, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset + 1), TMP2, 0); - } - } - -if (SLJIT_UNLIKELY(opcode == OP_ONCE)) - { - if (CURRENT_AS(bracket_backtrack)->u.framesize >= 0) - { - OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - add_jump(compiler, &common->revertframes, JUMP(SLJIT_FAST_CALL)); - OP2(SLJIT_ADD, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, (CURRENT_AS(bracket_backtrack)->u.framesize - 1) * sizeof(sljit_sw)); - } - once = JUMP(SLJIT_JUMP); - } -else if (SLJIT_UNLIKELY(opcode == OP_COND) || SLJIT_UNLIKELY(opcode == OP_SCOND)) - { 
- if (has_alternatives) - { - /* Always exactly one alternative. */ - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - free_stack(common, 1); - - alt_max = 2; - next_alt = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, 0); - } - } -else if (has_alternatives) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - free_stack(common, 1); - - if (alt_max > 3) - { - sljit_emit_ijump(compiler, SLJIT_JUMP, TMP1, 0); - - SLJIT_ASSERT(CURRENT_AS(bracket_backtrack)->u.matching_put_label); - sljit_set_put_label(CURRENT_AS(bracket_backtrack)->u.matching_put_label, LABEL()); - sljit_emit_op0(compiler, SLJIT_ENDBR); - } - else - next_alt = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, 0); - } - -COMPILE_BACKTRACKINGPATH(current->top); -if (current->topbacktracks) - set_jumps(current->topbacktracks, LABEL()); - -if (SLJIT_UNLIKELY(opcode == OP_COND) || SLJIT_UNLIKELY(opcode == OP_SCOND)) - { - /* Conditional block always has at most one alternative. */ - if (ccbegin[1 + LINK_SIZE] >= OP_ASSERT && ccbegin[1 + LINK_SIZE] <= OP_ASSERTBACK_NOT) - { - SLJIT_ASSERT(has_alternatives); - assert = CURRENT_AS(bracket_backtrack)->u.assert; - if (assert->framesize >= 0 && (ccbegin[1 + LINK_SIZE] == OP_ASSERT || ccbegin[1 + LINK_SIZE] == OP_ASSERTBACK)) - { - OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), assert->private_data_ptr); - add_jump(compiler, &common->revertframes, JUMP(SLJIT_FAST_CALL)); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(-2)); - OP2(SLJIT_ADD, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, (assert->framesize - 1) * sizeof(sljit_sw)); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), assert->private_data_ptr, TMP1, 0); - } - cond = JUMP(SLJIT_JUMP); - set_jumps(CURRENT_AS(bracket_backtrack)->u.assert->condfailed, LABEL()); - } - else if (CURRENT_AS(bracket_backtrack)->u.condfailed != NULL) - { - SLJIT_ASSERT(has_alternatives); - cond = JUMP(SLJIT_JUMP); - set_jumps(CURRENT_AS(bracket_backtrack)->u.condfailed, LABEL()); - } - else - SLJIT_ASSERT(!has_alternatives); - 
} - -if (has_alternatives) - { - alt_count = 1; - do - { - current->top = NULL; - current->topbacktracks = NULL; - current->nextbacktracks = NULL; - /* Conditional blocks always have an additional alternative, even if it is empty. */ - if (*cc == OP_ALT) - { - ccprev = cc + 1 + LINK_SIZE; - cc += GET(cc, 1); - if (opcode != OP_COND && opcode != OP_SCOND) - { - if (opcode != OP_ONCE) - { - if (private_data_ptr != 0) - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - else - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - } - else - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(needs_control_head ? 1 : 0)); - } - compile_matchingpath(common, ccprev, cc, current); - if (SLJIT_UNLIKELY(sljit_get_compiler_error(compiler))) - return; - - if (opcode == OP_ASSERT_NA || opcode == OP_ASSERTBACK_NA) - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), private_data_ptr); - - if (opcode == OP_SCRIPT_RUN) - match_script_run_common(common, private_data_ptr, current); - } - - /* Instructions after the current alternative is successfully matched. */ - /* There is a similar code in compile_bracket_matchingpath. */ - if (opcode == OP_ONCE) - match_once_common(common, ket, CURRENT_AS(bracket_backtrack)->u.framesize, private_data_ptr, has_alternatives, needs_control_head); - - stacksize = 0; - if (repeat_type == OP_MINUPTO) - { - /* We need to preserve the counter. TMP2 will be used below. */ - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), repeat_ptr); - stacksize++; - } - if (ket != OP_KET || bra != OP_BRA) - stacksize++; - if (offset != 0) - { - if (common->capture_last_ptr != 0) - stacksize++; - if (common->optimized_cbracket[offset >> 1] == 0) - stacksize += 2; - } - if (opcode != OP_ONCE) - stacksize++; - - if (stacksize > 0) - allocate_stack(common, stacksize); - - stacksize = 0; - if (repeat_type == OP_MINUPTO) - { - /* TMP2 was set above. 
*/ - OP2(SLJIT_SUB, SLJIT_MEM1(STACK_TOP), STACK(stacksize), TMP2, 0, SLJIT_IMM, 1); - stacksize++; - } - - if (ket != OP_KET || bra != OP_BRA) - { - if (ket != OP_KET) - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(stacksize), STR_PTR, 0); - else - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(stacksize), SLJIT_IMM, 0); - stacksize++; - } - - if (offset != 0) - stacksize = match_capture_common(common, stacksize, offset, private_data_ptr); - - if (opcode != OP_ONCE) - { - if (alt_max <= 3) - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(stacksize), SLJIT_IMM, alt_count); - else - put_label = sljit_emit_put_label(compiler, SLJIT_MEM1(STACK_TOP), STACK(stacksize)); - } - - if (offset != 0 && ket == OP_KETRMAX && common->optimized_cbracket[offset >> 1] != 0) - { - /* If ket is not OP_KETRMAX, this code path is executed after the jump to alternative_matchingpath. */ - SLJIT_ASSERT(private_data_ptr == OVECTOR(offset + 0)); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset + 1), STR_PTR, 0); - } - - JUMPTO(SLJIT_JUMP, CURRENT_AS(bracket_backtrack)->alternative_matchingpath); - - if (opcode != OP_ONCE) - { - if (alt_max <= 3) - { - JUMPHERE(next_alt); - alt_count++; - if (alt_count < alt_max) - { - SLJIT_ASSERT(alt_count == 2 && alt_max == 3); - next_alt = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, 1); - } - } - else - { - sljit_set_put_label(put_label, LABEL()); - sljit_emit_op0(compiler, SLJIT_ENDBR); - } - } - - COMPILE_BACKTRACKINGPATH(current->top); - if (current->topbacktracks) - set_jumps(current->topbacktracks, LABEL()); - SLJIT_ASSERT(!current->nextbacktracks); - } - while (*cc == OP_ALT); - - if (cond != NULL) - { - SLJIT_ASSERT(opcode == OP_COND || opcode == OP_SCOND); - assert = CURRENT_AS(bracket_backtrack)->u.assert; - if ((ccbegin[1 + LINK_SIZE] == OP_ASSERT_NOT || ccbegin[1 + LINK_SIZE] == OP_ASSERTBACK_NOT) && assert->framesize >= 0) - { - OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), assert->private_data_ptr); - add_jump(compiler, 
&common->revertframes, JUMP(SLJIT_FAST_CALL)); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(-2)); - OP2(SLJIT_ADD, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, (assert->framesize - 1) * sizeof(sljit_sw)); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), assert->private_data_ptr, TMP1, 0); - } - JUMPHERE(cond); - } - - /* Free the STR_PTR. */ - if (private_data_ptr == 0) - free_stack(common, 1); - } - -if (offset != 0) - { - /* Using both tmp register is better for instruction scheduling. */ - if (common->optimized_cbracket[offset >> 1] != 0) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(STACK_TOP), STACK(1)); - free_stack(common, 2); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset), TMP1, 0); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset + 1), TMP2, 0); - } - else - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - free_stack(common, 1); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), private_data_ptr, TMP1, 0); - } - } -else if (opcode == OP_ASSERT_NA || opcode == OP_ASSERTBACK_NA || opcode == OP_SCRIPT_RUN || opcode == OP_SBRA || opcode == OP_SCOND) - { - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), private_data_ptr, SLJIT_MEM1(STACK_TOP), STACK(0)); - free_stack(common, 1); - } -else if (opcode == OP_ONCE) - { - cc = ccbegin + GET(ccbegin, 1); - stacksize = needs_control_head ? 1 : 0; - - if (CURRENT_AS(bracket_backtrack)->u.framesize >= 0) - { - /* Reset head and drop saved frame. */ - stacksize += CURRENT_AS(bracket_backtrack)->u.framesize + ((ket != OP_KET || *cc == OP_ALT) ? 2 : 1); - } - else if (ket == OP_KETRMAX || (*cc == OP_ALT && ket != OP_KETRMIN)) - { - /* The STR_PTR must be released. 
*/ - stacksize++; - } - - if (stacksize > 0) - free_stack(common, stacksize); - - JUMPHERE(once); - /* Restore previous private_data_ptr */ - if (CURRENT_AS(bracket_backtrack)->u.framesize >= 0) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), private_data_ptr, SLJIT_MEM1(STACK_TOP), STACK(-CURRENT_AS(bracket_backtrack)->u.framesize - 1)); - else if (ket == OP_KETRMIN) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(1)); - /* See the comment below. */ - free_stack(common, 2); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), private_data_ptr, TMP1, 0); - } - } - -if (repeat_type == OP_EXACT) - { - OP2(SLJIT_ADD, TMP1, 0, SLJIT_MEM1(SLJIT_SP), repeat_ptr, SLJIT_IMM, 1); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), repeat_ptr, TMP1, 0); - CMPTO(SLJIT_LESS_EQUAL, TMP1, 0, SLJIT_IMM, repeat_count, exact_label); - } -else if (ket == OP_KETRMAX) - { - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - if (bra != OP_BRAZERO) - free_stack(common, 1); - - CMPTO(SLJIT_NOT_EQUAL, STR_PTR, 0, SLJIT_IMM, 0, CURRENT_AS(bracket_backtrack)->recursive_matchingpath); - if (bra == OP_BRAZERO) - { - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(1)); - JUMPTO(SLJIT_JUMP, CURRENT_AS(bracket_backtrack)->zero_matchingpath); - JUMPHERE(brazero); - free_stack(common, 1); - } - } -else if (ket == OP_KETRMIN) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - - /* OP_ONCE removes everything in case of a backtrack, so we don't - need to explicitly release the STR_PTR. The extra release would - affect badly the free_stack(2) above. */ - if (opcode != OP_ONCE) - free_stack(common, 1); - CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, 0, rmin_label); - if (opcode == OP_ONCE) - free_stack(common, bra == OP_BRAMINZERO ? 
2 : 1); - else if (bra == OP_BRAMINZERO) - free_stack(common, 1); - } -else if (bra == OP_BRAZERO) - { - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - JUMPTO(SLJIT_JUMP, CURRENT_AS(bracket_backtrack)->zero_matchingpath); - JUMPHERE(brazero); - } -} - -static SLJIT_INLINE void compile_bracketpos_backtrackingpath(compiler_common *common, struct backtrack_common *current) -{ -DEFINE_COMPILER; -int offset; -struct sljit_jump *jump; - -if (CURRENT_AS(bracketpos_backtrack)->framesize < 0) - { - if (*current->cc == OP_CBRAPOS || *current->cc == OP_SCBRAPOS) - { - offset = (GET2(current->cc, 1 + LINK_SIZE)) << 1; - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(STACK_TOP), STACK(1)); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset), TMP1, 0); - if (common->capture_last_ptr != 0) - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(2)); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset + 1), TMP2, 0); - if (common->capture_last_ptr != 0) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->capture_last_ptr, TMP1, 0); - } - set_jumps(current->topbacktracks, LABEL()); - free_stack(common, CURRENT_AS(bracketpos_backtrack)->stacksize); - return; - } - -OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), CURRENT_AS(bracketpos_backtrack)->private_data_ptr); -add_jump(compiler, &common->revertframes, JUMP(SLJIT_FAST_CALL)); -OP2(SLJIT_ADD, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, (CURRENT_AS(bracketpos_backtrack)->framesize - 1) * sizeof(sljit_sw)); - -if (current->topbacktracks) - { - jump = JUMP(SLJIT_JUMP); - set_jumps(current->topbacktracks, LABEL()); - /* Drop the stack frame. 
*/ - free_stack(common, CURRENT_AS(bracketpos_backtrack)->stacksize); - JUMPHERE(jump); - } -OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), CURRENT_AS(bracketpos_backtrack)->private_data_ptr, SLJIT_MEM1(STACK_TOP), STACK(-CURRENT_AS(bracketpos_backtrack)->framesize - 1)); -} - -static SLJIT_INLINE void compile_braminzero_backtrackingpath(compiler_common *common, struct backtrack_common *current) -{ -assert_backtrack backtrack; - -current->top = NULL; -current->topbacktracks = NULL; -current->nextbacktracks = NULL; -if (current->cc[1] > OP_ASSERTBACK_NOT) - { - /* Manual call of compile_bracket_matchingpath and compile_bracket_backtrackingpath. */ - compile_bracket_matchingpath(common, current->cc, current); - compile_bracket_backtrackingpath(common, current->top); - } -else - { - memset(&backtrack, 0, sizeof(backtrack)); - backtrack.common.cc = current->cc; - backtrack.matchingpath = CURRENT_AS(braminzero_backtrack)->matchingpath; - /* Manual call of compile_assert_matchingpath. */ - compile_assert_matchingpath(common, current->cc, &backtrack, FALSE); - } -SLJIT_ASSERT(!current->nextbacktracks && !current->topbacktracks); -} - -static SLJIT_INLINE void compile_control_verb_backtrackingpath(compiler_common *common, struct backtrack_common *current) -{ -DEFINE_COMPILER; -PCRE2_UCHAR opcode = *current->cc; -struct sljit_label *loop; -struct sljit_jump *jump; - -if (opcode == OP_THEN || opcode == OP_THEN_ARG) - { - if (common->then_trap != NULL) - { - SLJIT_ASSERT(common->control_head_ptr != 0); - - OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, type_then_trap); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, common->then_trap->start); - jump = JUMP(SLJIT_JUMP); - - loop = LABEL(); - OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - JUMPHERE(jump); - CMPTO(SLJIT_NOT_EQUAL, SLJIT_MEM1(STACK_TOP), STACK(1), TMP1, 0, loop); - CMPTO(SLJIT_NOT_EQUAL, SLJIT_MEM1(STACK_TOP), STACK(2), TMP2, 0, loop); - 
add_jump(compiler, &common->then_trap->quit, JUMP(SLJIT_JUMP)); - return; - } - else if (!common->local_quit_available && common->in_positive_assertion) - { - add_jump(compiler, &common->positive_assertion_quit, JUMP(SLJIT_JUMP)); - return; - } - } - -if (common->local_quit_available) - { - /* Abort match with a fail. */ - if (common->quit_label == NULL) - add_jump(compiler, &common->quit, JUMP(SLJIT_JUMP)); - else - JUMPTO(SLJIT_JUMP, common->quit_label); - return; - } - -if (opcode == OP_SKIP_ARG) - { - SLJIT_ASSERT(common->control_head_ptr != 0 && TMP1 == SLJIT_R0 && STR_PTR == SLJIT_R1); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr); - OP1(SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)(current->cc + 2)); - sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW), SLJIT_IMM, SLJIT_FUNC_OFFSET(do_search_mark)); - - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_R0, 0); - add_jump(compiler, &common->reset_match, CMP(SLJIT_NOT_EQUAL, SLJIT_R0, 0, SLJIT_IMM, 0)); - return; - } - -if (opcode == OP_SKIP) - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); -else - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_IMM, 0); -add_jump(compiler, &common->reset_match, JUMP(SLJIT_JUMP)); -} - -static SLJIT_INLINE void compile_then_trap_backtrackingpath(compiler_common *common, struct backtrack_common *current) -{ -DEFINE_COMPILER; -struct sljit_jump *jump; -int size; - -if (CURRENT_AS(then_trap_backtrack)->then_trap) - { - common->then_trap = CURRENT_AS(then_trap_backtrack)->then_trap; - return; - } - -size = CURRENT_AS(then_trap_backtrack)->framesize; -size = 3 + (size < 0 ? 0 : size); - -OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(size - 3)); -free_stack(common, size); -jump = JUMP(SLJIT_JUMP); - -set_jumps(CURRENT_AS(then_trap_backtrack)->quit, LABEL()); -/* STACK_TOP is set by THEN. 
*/ -if (CURRENT_AS(then_trap_backtrack)->framesize >= 0) - { - add_jump(compiler, &common->revertframes, JUMP(SLJIT_FAST_CALL)); - OP2(SLJIT_ADD, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, (CURRENT_AS(then_trap_backtrack)->framesize - 1) * sizeof(sljit_sw)); - } -OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); -free_stack(common, 3); - -JUMPHERE(jump); -OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, TMP1, 0); -} - -static void compile_backtrackingpath(compiler_common *common, struct backtrack_common *current) -{ -DEFINE_COMPILER; -then_trap_backtrack *save_then_trap = common->then_trap; - -while (current) - { - if (current->nextbacktracks != NULL) - set_jumps(current->nextbacktracks, LABEL()); - switch(*current->cc) - { - case OP_SET_SOM: - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - free_stack(common, 1); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(0), TMP1, 0); - break; - - case OP_STAR: - case OP_MINSTAR: - case OP_PLUS: - case OP_MINPLUS: - case OP_QUERY: - case OP_MINQUERY: - case OP_UPTO: - case OP_MINUPTO: - case OP_EXACT: - case OP_POSSTAR: - case OP_POSPLUS: - case OP_POSQUERY: - case OP_POSUPTO: - case OP_STARI: - case OP_MINSTARI: - case OP_PLUSI: - case OP_MINPLUSI: - case OP_QUERYI: - case OP_MINQUERYI: - case OP_UPTOI: - case OP_MINUPTOI: - case OP_EXACTI: - case OP_POSSTARI: - case OP_POSPLUSI: - case OP_POSQUERYI: - case OP_POSUPTOI: - case OP_NOTSTAR: - case OP_NOTMINSTAR: - case OP_NOTPLUS: - case OP_NOTMINPLUS: - case OP_NOTQUERY: - case OP_NOTMINQUERY: - case OP_NOTUPTO: - case OP_NOTMINUPTO: - case OP_NOTEXACT: - case OP_NOTPOSSTAR: - case OP_NOTPOSPLUS: - case OP_NOTPOSQUERY: - case OP_NOTPOSUPTO: - case OP_NOTSTARI: - case OP_NOTMINSTARI: - case OP_NOTPLUSI: - case OP_NOTMINPLUSI: - case OP_NOTQUERYI: - case OP_NOTMINQUERYI: - case OP_NOTUPTOI: - case OP_NOTMINUPTOI: - case OP_NOTEXACTI: - case OP_NOTPOSSTARI: - case OP_NOTPOSPLUSI: - case OP_NOTPOSQUERYI: - case OP_NOTPOSUPTOI: - case OP_TYPESTAR: - 
case OP_TYPEMINSTAR: - case OP_TYPEPLUS: - case OP_TYPEMINPLUS: - case OP_TYPEQUERY: - case OP_TYPEMINQUERY: - case OP_TYPEUPTO: - case OP_TYPEMINUPTO: - case OP_TYPEEXACT: - case OP_TYPEPOSSTAR: - case OP_TYPEPOSPLUS: - case OP_TYPEPOSQUERY: - case OP_TYPEPOSUPTO: - case OP_CLASS: - case OP_NCLASS: -#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8 - case OP_XCLASS: -#endif - compile_iterator_backtrackingpath(common, current); - break; - - case OP_REF: - case OP_REFI: - case OP_DNREF: - case OP_DNREFI: - compile_ref_iterator_backtrackingpath(common, current); - break; - - case OP_RECURSE: - compile_recurse_backtrackingpath(common, current); - break; - - case OP_ASSERT: - case OP_ASSERT_NOT: - case OP_ASSERTBACK: - case OP_ASSERTBACK_NOT: - compile_assert_backtrackingpath(common, current); - break; - - case OP_ASSERT_NA: - case OP_ASSERTBACK_NA: - case OP_ONCE: - case OP_SCRIPT_RUN: - case OP_BRA: - case OP_CBRA: - case OP_COND: - case OP_SBRA: - case OP_SCBRA: - case OP_SCOND: - compile_bracket_backtrackingpath(common, current); - break; - - case OP_BRAZERO: - if (current->cc[1] > OP_ASSERTBACK_NOT) - compile_bracket_backtrackingpath(common, current); - else - compile_assert_backtrackingpath(common, current); - break; - - case OP_BRAPOS: - case OP_CBRAPOS: - case OP_SBRAPOS: - case OP_SCBRAPOS: - case OP_BRAPOSZERO: - compile_bracketpos_backtrackingpath(common, current); - break; - - case OP_BRAMINZERO: - compile_braminzero_backtrackingpath(common, current); - break; - - case OP_MARK: - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(common->has_skip_arg ? 4 : 0)); - if (common->has_skip_arg) - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - free_stack(common, common->has_skip_arg ? 
5 : 1); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->mark_ptr, TMP1, 0); - if (common->has_skip_arg) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, TMP2, 0); - break; - - case OP_THEN: - case OP_THEN_ARG: - case OP_PRUNE: - case OP_PRUNE_ARG: - case OP_SKIP: - case OP_SKIP_ARG: - compile_control_verb_backtrackingpath(common, current); - break; - - case OP_COMMIT: - case OP_COMMIT_ARG: - if (!common->local_quit_available) - OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, PCRE2_ERROR_NOMATCH); - if (common->quit_label == NULL) - add_jump(compiler, &common->quit, JUMP(SLJIT_JUMP)); - else - JUMPTO(SLJIT_JUMP, common->quit_label); - break; - - case OP_CALLOUT: - case OP_CALLOUT_STR: - case OP_FAIL: - case OP_ACCEPT: - case OP_ASSERT_ACCEPT: - set_jumps(current->topbacktracks, LABEL()); - break; - - case OP_THEN_TRAP: - /* A virtual opcode for then traps. */ - compile_then_trap_backtrackingpath(common, current); - break; - - default: - SLJIT_UNREACHABLE(); - break; - } - current = current->prev; - } -common->then_trap = save_then_trap; -} - -static SLJIT_INLINE void compile_recurse(compiler_common *common) -{ -DEFINE_COMPILER; -PCRE2_SPTR cc = common->start + common->currententry->start; -PCRE2_SPTR ccbegin = cc + 1 + LINK_SIZE + (*cc == OP_BRA ? 0 : IMM2_SIZE); -PCRE2_SPTR ccend = bracketend(cc) - (1 + LINK_SIZE); -BOOL needs_control_head; -BOOL has_quit; -BOOL has_accept; -int private_data_size = get_recurse_data_length(common, ccbegin, ccend, &needs_control_head, &has_quit, &has_accept); -int alt_count, alt_max, local_size; -backtrack_common altbacktrack; -jump_list *match = NULL; -struct sljit_jump *next_alt = NULL; -struct sljit_jump *accept_exit = NULL; -struct sljit_label *quit; -struct sljit_put_label *put_label = NULL; - -/* Recurse captures then. 
*/ -common->then_trap = NULL; - -SLJIT_ASSERT(*cc == OP_BRA || *cc == OP_CBRA || *cc == OP_CBRAPOS || *cc == OP_SCBRA || *cc == OP_SCBRAPOS); - -alt_max = no_alternatives(cc); -alt_count = 0; - -/* Matching path. */ -SLJIT_ASSERT(common->currententry->entry_label == NULL && common->recursive_head_ptr != 0); -common->currententry->entry_label = LABEL(); -set_jumps(common->currententry->entry_calls, common->currententry->entry_label); - -sljit_emit_fast_enter(compiler, TMP2, 0); -count_match(common); - -local_size = (alt_max > 1) ? 2 : 1; - -/* (Reversed) stack layout: - [private data][return address][optional: str ptr] ... [optional: alternative index][recursive_head_ptr] */ - -allocate_stack(common, private_data_size + local_size); -/* Save return address. */ -OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(local_size - 1), TMP2, 0); - -copy_recurse_data(common, ccbegin, ccend, recurse_copy_from_global, local_size, private_data_size + local_size, has_quit); - -/* This variable is saved and restored all time when we enter or exit from a recursive context. */ -OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->recursive_head_ptr, STACK_TOP, 0); - -if (needs_control_head) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, SLJIT_IMM, 0); - -if (alt_max > 1) - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0); - -memset(&altbacktrack, 0, sizeof(backtrack_common)); -common->quit_label = NULL; -common->accept_label = NULL; -common->quit = NULL; -common->accept = NULL; -altbacktrack.cc = ccbegin; -cc += GET(cc, 1); -while (1) - { - altbacktrack.top = NULL; - altbacktrack.topbacktracks = NULL; - - if (altbacktrack.cc != ccbegin) - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - - compile_matchingpath(common, altbacktrack.cc, cc, &altbacktrack); - if (SLJIT_UNLIKELY(sljit_get_compiler_error(compiler))) - return; - - allocate_stack(common, (alt_max > 1 || has_accept) ? 
2 : 1); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), common->recursive_head_ptr); - - if (alt_max > 1 || has_accept) - { - if (alt_max > 3) - put_label = sljit_emit_put_label(compiler, SLJIT_MEM1(STACK_TOP), STACK(1)); - else - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(1), SLJIT_IMM, alt_count); - } - - add_jump(compiler, &match, JUMP(SLJIT_JUMP)); - - if (alt_count == 0) - { - /* Backtracking path entry. */ - SLJIT_ASSERT(common->currententry->backtrack_label == NULL); - common->currententry->backtrack_label = LABEL(); - set_jumps(common->currententry->backtrack_calls, common->currententry->backtrack_label); - - sljit_emit_fast_enter(compiler, TMP1, 0); - - if (has_accept) - accept_exit = CMP(SLJIT_EQUAL, SLJIT_MEM1(STACK_TOP), STACK(1), SLJIT_IMM, -1); - - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - /* Save return address. */ - OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), STACK(local_size - 1), TMP1, 0); - - copy_recurse_data(common, ccbegin, ccend, recurse_swap_global, local_size, private_data_size + local_size, has_quit); - - if (alt_max > 1) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(1)); - free_stack(common, 2); - - if (alt_max > 3) - { - sljit_emit_ijump(compiler, SLJIT_JUMP, TMP1, 0); - sljit_set_put_label(put_label, LABEL()); - sljit_emit_op0(compiler, SLJIT_ENDBR); - } - else - next_alt = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, 0); - } - else - free_stack(common, has_accept ? 
2 : 1); - } - else if (alt_max > 3) - { - sljit_set_put_label(put_label, LABEL()); - sljit_emit_op0(compiler, SLJIT_ENDBR); - } - else - { - JUMPHERE(next_alt); - if (alt_count + 1 < alt_max) - { - SLJIT_ASSERT(alt_count == 1 && alt_max == 3); - next_alt = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, 1); - } - } - - alt_count++; - - compile_backtrackingpath(common, altbacktrack.top); - if (SLJIT_UNLIKELY(sljit_get_compiler_error(compiler))) - return; - set_jumps(altbacktrack.topbacktracks, LABEL()); - - if (*cc != OP_ALT) - break; - - altbacktrack.cc = cc + 1 + LINK_SIZE; - cc += GET(cc, 1); - } - -/* No alternative is matched. */ - -quit = LABEL(); - -copy_recurse_data(common, ccbegin, ccend, recurse_copy_private_to_global, local_size, private_data_size + local_size, has_quit); - -OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(STACK_TOP), STACK(local_size - 1)); -free_stack(common, private_data_size + local_size); -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); -OP_SRC(SLJIT_FAST_RETURN, TMP2, 0); - -if (common->quit != NULL) - { - SLJIT_ASSERT(has_quit); - - set_jumps(common->quit, LABEL()); - OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), common->recursive_head_ptr); - copy_recurse_data(common, ccbegin, ccend, recurse_copy_shared_to_global, local_size, private_data_size + local_size, has_quit); - JUMPTO(SLJIT_JUMP, quit); - } - -if (has_accept) - { - JUMPHERE(accept_exit); - free_stack(common, 2); - - /* Save return address. 
*/ - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(local_size - 1), TMP1, 0); - - copy_recurse_data(common, ccbegin, ccend, recurse_copy_kept_shared_to_global, local_size, private_data_size + local_size, has_quit); - - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(STACK_TOP), STACK(local_size - 1)); - free_stack(common, private_data_size + local_size); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); - OP_SRC(SLJIT_FAST_RETURN, TMP2, 0); - } - -if (common->accept != NULL) - { - SLJIT_ASSERT(has_accept); - - set_jumps(common->accept, LABEL()); - - OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), common->recursive_head_ptr); - OP1(SLJIT_MOV, TMP2, 0, STACK_TOP, 0); - - allocate_stack(common, 2); - OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(1), SLJIT_IMM, -1); - } - -set_jumps(match, LABEL()); - -OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), TMP2, 0); - -copy_recurse_data(common, ccbegin, ccend, recurse_swap_global, local_size, private_data_size + local_size, has_quit); - -OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP2), STACK(local_size - 1)); -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 1); -OP_SRC(SLJIT_FAST_RETURN, TMP2, 0); -} - -#undef COMPILE_BACKTRACKINGPATH -#undef CURRENT_AS - -#define PUBLIC_JIT_COMPILE_CONFIGURATION_OPTIONS \ - (PCRE2_JIT_INVALID_UTF) - -static int jit_compile(pcre2_code *code, sljit_u32 mode) -{ -pcre2_real_code *re = (pcre2_real_code *)code; -struct sljit_compiler *compiler; -backtrack_common rootbacktrack; -compiler_common common_data; -compiler_common *common = &common_data; -const sljit_u8 *tables = re->tables; -void *allocator_data = &re->memctl; -int private_data_size; -PCRE2_SPTR ccend; -executable_functions *functions; -void *executable_func; -sljit_uw executable_size; -sljit_uw total_length; -struct sljit_label *mainloop_label = NULL; -struct sljit_label *continue_match_label; -struct sljit_label *empty_match_found_label = NULL; -struct sljit_label *empty_match_backtrack_label = NULL; -struct sljit_label *reset_match_label; -struct sljit_label *quit_label; 
-struct sljit_jump *jump; -struct sljit_jump *minlength_check_failed = NULL; -struct sljit_jump *empty_match = NULL; -struct sljit_jump *end_anchor_failed = NULL; -jump_list *reqcu_not_found = NULL; - -SLJIT_ASSERT(tables); - -#if HAS_VIRTUAL_REGISTERS == 1 -SLJIT_ASSERT(sljit_get_register_index(TMP3) < 0 && sljit_get_register_index(ARGUMENTS) < 0 && sljit_get_register_index(RETURN_ADDR) < 0); -#elif HAS_VIRTUAL_REGISTERS == 0 -SLJIT_ASSERT(sljit_get_register_index(TMP3) >= 0 && sljit_get_register_index(ARGUMENTS) >= 0 && sljit_get_register_index(RETURN_ADDR) >= 0); -#else -#error "Invalid value for HAS_VIRTUAL_REGISTERS" -#endif - -memset(&rootbacktrack, 0, sizeof(backtrack_common)); -memset(common, 0, sizeof(compiler_common)); -common->re = re; -common->name_table = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)); -rootbacktrack.cc = common->name_table + re->name_count * re->name_entry_size; - -#ifdef SUPPORT_UNICODE -common->invalid_utf = (mode & PCRE2_JIT_INVALID_UTF) != 0; -#endif /* SUPPORT_UNICODE */ -mode &= ~PUBLIC_JIT_COMPILE_CONFIGURATION_OPTIONS; - -common->start = rootbacktrack.cc; -common->read_only_data_head = NULL; -common->fcc = tables + fcc_offset; -common->lcc = (sljit_sw)(tables + lcc_offset); -common->mode = mode; -common->might_be_empty = (re->minlength == 0) || (re->flags & PCRE2_MATCH_EMPTY); -common->allow_empty_partial = (re->max_lookbehind > 0) || (re->flags & PCRE2_MATCH_EMPTY); -common->nltype = NLTYPE_FIXED; -switch(re->newline_convention) - { - case PCRE2_NEWLINE_CR: common->newline = CHAR_CR; break; - case PCRE2_NEWLINE_LF: common->newline = CHAR_NL; break; - case PCRE2_NEWLINE_CRLF: common->newline = (CHAR_CR << 8) | CHAR_NL; break; - case PCRE2_NEWLINE_ANY: common->newline = (CHAR_CR << 8) | CHAR_NL; common->nltype = NLTYPE_ANY; break; - case PCRE2_NEWLINE_ANYCRLF: common->newline = (CHAR_CR << 8) | CHAR_NL; common->nltype = NLTYPE_ANYCRLF; break; - case PCRE2_NEWLINE_NUL: common->newline = CHAR_NUL; break; - default: return 
PCRE2_ERROR_INTERNAL; - } -common->nlmax = READ_CHAR_MAX; -common->nlmin = 0; -if (re->bsr_convention == PCRE2_BSR_UNICODE) - common->bsr_nltype = NLTYPE_ANY; -else if (re->bsr_convention == PCRE2_BSR_ANYCRLF) - common->bsr_nltype = NLTYPE_ANYCRLF; -else - { -#ifdef BSR_ANYCRLF - common->bsr_nltype = NLTYPE_ANYCRLF; -#else - common->bsr_nltype = NLTYPE_ANY; -#endif - } -common->bsr_nlmax = READ_CHAR_MAX; -common->bsr_nlmin = 0; -common->endonly = (re->overall_options & PCRE2_DOLLAR_ENDONLY) != 0; -common->ctypes = (sljit_sw)(tables + ctypes_offset); -common->name_count = re->name_count; -common->name_entry_size = re->name_entry_size; -common->unset_backref = (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) != 0; -common->alt_circumflex = (re->overall_options & PCRE2_ALT_CIRCUMFLEX) != 0; -#ifdef SUPPORT_UNICODE -/* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */ -common->utf = (re->overall_options & PCRE2_UTF) != 0; -common->ucp = (re->overall_options & PCRE2_UCP) != 0; -if (common->utf) - { - if (common->nltype == NLTYPE_ANY) - common->nlmax = 0x2029; - else if (common->nltype == NLTYPE_ANYCRLF) - common->nlmax = (CHAR_CR > CHAR_NL) ? CHAR_CR : CHAR_NL; - else - { - /* We only care about the first newline character. */ - common->nlmax = common->newline & 0xff; - } - - if (common->nltype == NLTYPE_FIXED) - common->nlmin = common->newline & 0xff; - else - common->nlmin = (CHAR_CR < CHAR_NL) ? CHAR_CR : CHAR_NL; - - if (common->bsr_nltype == NLTYPE_ANY) - common->bsr_nlmax = 0x2029; - else - common->bsr_nlmax = (CHAR_CR > CHAR_NL) ? CHAR_CR : CHAR_NL; - common->bsr_nlmin = (CHAR_CR < CHAR_NL) ? CHAR_CR : CHAR_NL; - } -else - common->invalid_utf = FALSE; -#endif /* SUPPORT_UNICODE */ -ccend = bracketend(common->start); - -/* Calculate the local space size on the stack. 
*/ -common->ovector_start = LIMIT_MATCH + sizeof(sljit_sw); -common->optimized_cbracket = (sljit_u8 *)SLJIT_MALLOC(re->top_bracket + 1, allocator_data); -if (!common->optimized_cbracket) - return PCRE2_ERROR_NOMEMORY; -#if defined DEBUG_FORCE_UNOPTIMIZED_CBRAS && DEBUG_FORCE_UNOPTIMIZED_CBRAS == 1 -memset(common->optimized_cbracket, 0, re->top_bracket + 1); -#else -memset(common->optimized_cbracket, 1, re->top_bracket + 1); -#endif - -SLJIT_ASSERT(*common->start == OP_BRA && ccend[-(1 + LINK_SIZE)] == OP_KET); -#if defined DEBUG_FORCE_UNOPTIMIZED_CBRAS && DEBUG_FORCE_UNOPTIMIZED_CBRAS == 2 -common->capture_last_ptr = common->ovector_start; -common->ovector_start += sizeof(sljit_sw); -#endif -if (!check_opcode_types(common, common->start, ccend)) - { - SLJIT_FREE(common->optimized_cbracket, allocator_data); - return PCRE2_ERROR_NOMEMORY; - } - -/* Checking flags and updating ovector_start. */ -if (mode == PCRE2_JIT_COMPLETE && (re->flags & PCRE2_LASTSET) != 0 && (re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) - { - common->req_char_ptr = common->ovector_start; - common->ovector_start += sizeof(sljit_sw); - } -if (mode != PCRE2_JIT_COMPLETE) - { - common->start_used_ptr = common->ovector_start; - common->ovector_start += sizeof(sljit_sw); - if (mode == PCRE2_JIT_PARTIAL_SOFT) - { - common->hit_start = common->ovector_start; - common->ovector_start += sizeof(sljit_sw); - } - } -if ((re->overall_options & (PCRE2_FIRSTLINE | PCRE2_USE_OFFSET_LIMIT)) != 0) - { - common->match_end_ptr = common->ovector_start; - common->ovector_start += sizeof(sljit_sw); - } -#if defined DEBUG_FORCE_CONTROL_HEAD && DEBUG_FORCE_CONTROL_HEAD -common->control_head_ptr = 1; -#endif -if (common->control_head_ptr != 0) - { - common->control_head_ptr = common->ovector_start; - common->ovector_start += sizeof(sljit_sw); - } -if (common->has_set_som) - { - /* Saving the real start pointer is necessary. 
*/ - common->start_ptr = common->ovector_start; - common->ovector_start += sizeof(sljit_sw); - } - -/* Aligning ovector to even number of sljit words. */ -if ((common->ovector_start & sizeof(sljit_sw)) != 0) - common->ovector_start += sizeof(sljit_sw); - -if (common->start_ptr == 0) - common->start_ptr = OVECTOR(0); - -/* Capturing brackets cannot be optimized if callouts are allowed. */ -if (common->capture_last_ptr != 0) - memset(common->optimized_cbracket, 0, re->top_bracket + 1); - -SLJIT_ASSERT(!(common->req_char_ptr != 0 && common->start_used_ptr != 0)); -common->cbra_ptr = OVECTOR_START + (re->top_bracket + 1) * 2 * sizeof(sljit_sw); - -total_length = ccend - common->start; -common->private_data_ptrs = (sljit_s32 *)SLJIT_MALLOC(total_length * (sizeof(sljit_s32) + (common->has_then ? 1 : 0)), allocator_data); -if (!common->private_data_ptrs) - { - SLJIT_FREE(common->optimized_cbracket, allocator_data); - return PCRE2_ERROR_NOMEMORY; - } -memset(common->private_data_ptrs, 0, total_length * sizeof(sljit_s32)); - -private_data_size = common->cbra_ptr + (re->top_bracket + 1) * sizeof(sljit_sw); -set_private_data_ptrs(common, &private_data_size, ccend); -if ((re->overall_options & PCRE2_ANCHORED) == 0 && (re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 && !common->has_skip_in_assert_back) - detect_early_fail(common, common->start, &private_data_size, 0, 0); - -SLJIT_ASSERT(common->early_fail_start_ptr <= common->early_fail_end_ptr); - -if (private_data_size > SLJIT_MAX_LOCAL_SIZE) - { - SLJIT_FREE(common->private_data_ptrs, allocator_data); - SLJIT_FREE(common->optimized_cbracket, allocator_data); - return PCRE2_ERROR_NOMEMORY; - } - -if (common->has_then) - { - common->then_offsets = (sljit_u8 *)(common->private_data_ptrs + total_length); - memset(common->then_offsets, 0, total_length); - set_then_offsets(common, common->start, NULL); - } - -compiler = sljit_create_compiler(allocator_data, NULL); -if (!compiler) - { - SLJIT_FREE(common->optimized_cbracket, 
allocator_data); - SLJIT_FREE(common->private_data_ptrs, allocator_data); - return PCRE2_ERROR_NOMEMORY; - } -common->compiler = compiler; - -/* Main pcre_jit_exec entry. */ -sljit_emit_enter(compiler, 0, SLJIT_ARG1(SW), 5, 5, 0, 0, private_data_size); - -/* Register init. */ -reset_ovector(common, (re->top_bracket + 1) * 2); -if (common->req_char_ptr != 0) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->req_char_ptr, SLJIT_R0, 0); - -OP1(SLJIT_MOV, ARGUMENTS, 0, SLJIT_S0, 0); -OP1(SLJIT_MOV, TMP1, 0, SLJIT_S0, 0); -OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, str)); -OP1(SLJIT_MOV, STR_END, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, end)); -OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, stack)); -OP1(SLJIT_MOV_U32, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, limit_match)); -OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(struct sljit_stack, end)); -OP1(SLJIT_MOV, STACK_LIMIT, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(struct sljit_stack, start)); -OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); -OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LIMIT_MATCH, TMP1, 0); - -if (common->early_fail_start_ptr < common->early_fail_end_ptr) - reset_early_fail(common); - -if (mode == PCRE2_JIT_PARTIAL_SOFT) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->hit_start, SLJIT_IMM, -1); -if (common->mark_ptr != 0) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->mark_ptr, SLJIT_IMM, 0); -if (common->control_head_ptr != 0) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, SLJIT_IMM, 0); - -/* Main part of the matching */ -if ((re->overall_options & PCRE2_ANCHORED) == 0) - { - mainloop_label = mainloop_entry(common); - continue_match_label = LABEL(); - /* Forward search if possible. 
*/ - if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) - { - if (mode == PCRE2_JIT_COMPLETE && fast_forward_first_n_chars(common)) - ; - else if ((re->flags & PCRE2_FIRSTSET) != 0) - fast_forward_first_char(common); - else if ((re->flags & PCRE2_STARTLINE) != 0) - fast_forward_newline(common); - else if ((re->flags & PCRE2_FIRSTMAPSET) != 0) - fast_forward_start_bits(common); - } - } -else - continue_match_label = LABEL(); - -if (mode == PCRE2_JIT_COMPLETE && re->minlength > 0 && (re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) - { - OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, PCRE2_ERROR_NOMATCH); - OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(re->minlength)); - minlength_check_failed = CMP(SLJIT_GREATER, TMP2, 0, STR_END, 0); - } -if (common->req_char_ptr != 0) - reqcu_not_found = search_requested_char(common, (PCRE2_UCHAR)(re->last_codeunit), (re->flags & PCRE2_LASTCASELESS) != 0, (re->flags & PCRE2_FIRSTSET) != 0); - -/* Store the current STR_PTR in OVECTOR(0). */ -OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(0), STR_PTR, 0); -/* Copy the limit of allowed recursions. */ -OP1(SLJIT_MOV, COUNT_MATCH, 0, SLJIT_MEM1(SLJIT_SP), LIMIT_MATCH); -if (common->capture_last_ptr != 0) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->capture_last_ptr, SLJIT_IMM, 0); -if (common->fast_forward_bc_ptr != NULL) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), PRIVATE_DATA(common->fast_forward_bc_ptr + 1) >> 3, STR_PTR, 0); - -if (common->start_ptr != OVECTOR(0)) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->start_ptr, STR_PTR, 0); - -/* Copy the beginning of the string. 
*/ -if (mode == PCRE2_JIT_PARTIAL_SOFT) - { - jump = CMP(SLJIT_NOT_EQUAL, SLJIT_MEM1(SLJIT_SP), common->hit_start, SLJIT_IMM, -1); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, STR_PTR, 0); - JUMPHERE(jump); - } -else if (mode == PCRE2_JIT_PARTIAL_HARD) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, STR_PTR, 0); - -compile_matchingpath(common, common->start, ccend, &rootbacktrack); -if (SLJIT_UNLIKELY(sljit_get_compiler_error(compiler))) - { - sljit_free_compiler(compiler); - SLJIT_FREE(common->optimized_cbracket, allocator_data); - SLJIT_FREE(common->private_data_ptrs, allocator_data); - PRIV(jit_free_rodata)(common->read_only_data_head, allocator_data); - return PCRE2_ERROR_NOMEMORY; - } - -if ((re->overall_options & PCRE2_ENDANCHORED) != 0) - end_anchor_failed = CMP(SLJIT_NOT_EQUAL, STR_PTR, 0, STR_END, 0); - -if (common->might_be_empty) - { - empty_match = CMP(SLJIT_EQUAL, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(0)); - empty_match_found_label = LABEL(); - } - -common->accept_label = LABEL(); -if (common->accept != NULL) - set_jumps(common->accept, common->accept_label); - -/* This means we have a match. Update the ovector. 
*/ -copy_ovector(common, re->top_bracket + 1); -common->quit_label = common->abort_label = LABEL(); -if (common->quit != NULL) - set_jumps(common->quit, common->quit_label); -if (common->abort != NULL) - set_jumps(common->abort, common->abort_label); -if (minlength_check_failed != NULL) - SET_LABEL(minlength_check_failed, common->abort_label); - -sljit_emit_op0(compiler, SLJIT_SKIP_FRAMES_BEFORE_RETURN); -sljit_emit_return(compiler, SLJIT_MOV, SLJIT_RETURN_REG, 0); - -if (common->failed_match != NULL) - { - SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE); - set_jumps(common->failed_match, LABEL()); - OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, PCRE2_ERROR_NOMATCH); - JUMPTO(SLJIT_JUMP, common->abort_label); - } - -if ((re->overall_options & PCRE2_ENDANCHORED) != 0) - JUMPHERE(end_anchor_failed); - -if (mode != PCRE2_JIT_COMPLETE) - { - common->partialmatchlabel = LABEL(); - set_jumps(common->partialmatch, common->partialmatchlabel); - return_with_partial_match(common, common->quit_label); - } - -if (common->might_be_empty) - empty_match_backtrack_label = LABEL(); -compile_backtrackingpath(common, rootbacktrack.top); -if (SLJIT_UNLIKELY(sljit_get_compiler_error(compiler))) - { - sljit_free_compiler(compiler); - SLJIT_FREE(common->optimized_cbracket, allocator_data); - SLJIT_FREE(common->private_data_ptrs, allocator_data); - PRIV(jit_free_rodata)(common->read_only_data_head, allocator_data); - return PCRE2_ERROR_NOMEMORY; - } - -SLJIT_ASSERT(rootbacktrack.prev == NULL); -reset_match_label = LABEL(); - -if (mode == PCRE2_JIT_PARTIAL_SOFT) - { - /* Update hit_start only in the first time. */ - jump = CMP(SLJIT_NOT_EQUAL, SLJIT_MEM1(SLJIT_SP), common->hit_start, SLJIT_IMM, 0); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->start_ptr); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, SLJIT_IMM, -1); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->hit_start, TMP1, 0); - JUMPHERE(jump); - } - -/* Check we have remaining characters. 
*/ -if ((re->overall_options & PCRE2_ANCHORED) == 0 && common->match_end_ptr != 0) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr); - } - -OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), - (common->fast_forward_bc_ptr != NULL) ? (PRIVATE_DATA(common->fast_forward_bc_ptr + 1) >> 3) : common->start_ptr); - -if ((re->overall_options & PCRE2_ANCHORED) == 0) - { - if (common->ff_newline_shortcut != NULL) - { - /* There cannot be more newlines if PCRE2_FIRSTLINE is set. */ - if ((re->overall_options & PCRE2_FIRSTLINE) == 0) - { - if (common->match_end_ptr != 0) - { - OP1(SLJIT_MOV, TMP3, 0, STR_END, 0); - OP1(SLJIT_MOV, STR_END, 0, TMP1, 0); - CMPTO(SLJIT_LESS, STR_PTR, 0, TMP1, 0, common->ff_newline_shortcut); - OP1(SLJIT_MOV, STR_END, 0, TMP3, 0); - } - else - CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, common->ff_newline_shortcut); - } - } - else - CMPTO(SLJIT_LESS, STR_PTR, 0, (common->match_end_ptr == 0) ? STR_END : TMP1, 0, mainloop_label); - } - -/* No more remaining characters. 
*/ -if (reqcu_not_found != NULL) - set_jumps(reqcu_not_found, LABEL()); - -if (mode == PCRE2_JIT_PARTIAL_SOFT) - CMPTO(SLJIT_NOT_EQUAL, SLJIT_MEM1(SLJIT_SP), common->hit_start, SLJIT_IMM, -1, common->partialmatchlabel); - -OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, PCRE2_ERROR_NOMATCH); -JUMPTO(SLJIT_JUMP, common->quit_label); - -flush_stubs(common); - -if (common->might_be_empty) - { - JUMPHERE(empty_match); - OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0); - OP1(SLJIT_MOV_U32, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, options)); - OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, PCRE2_NOTEMPTY); - JUMPTO(SLJIT_NOT_ZERO, empty_match_backtrack_label); - OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, PCRE2_NOTEMPTY_ATSTART); - JUMPTO(SLJIT_ZERO, empty_match_found_label); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, str)); - CMPTO(SLJIT_NOT_EQUAL, TMP2, 0, STR_PTR, 0, empty_match_found_label); - JUMPTO(SLJIT_JUMP, empty_match_backtrack_label); - } - -common->fast_forward_bc_ptr = NULL; -common->early_fail_start_ptr = 0; -common->early_fail_end_ptr = 0; -common->currententry = common->entries; -common->local_quit_available = TRUE; -quit_label = common->quit_label; -while (common->currententry != NULL) - { - /* Might add new entries. */ - compile_recurse(common); - if (SLJIT_UNLIKELY(sljit_get_compiler_error(compiler))) - { - sljit_free_compiler(compiler); - SLJIT_FREE(common->optimized_cbracket, allocator_data); - SLJIT_FREE(common->private_data_ptrs, allocator_data); - PRIV(jit_free_rodata)(common->read_only_data_head, allocator_data); - return PCRE2_ERROR_NOMEMORY; - } - flush_stubs(common); - common->currententry = common->currententry->next; - } -common->local_quit_available = FALSE; -common->quit_label = quit_label; - -/* Allocating stack, returns with PCRE_ERROR_JIT_STACKLIMIT if fails. */ -/* This is a (really) rare case. 
*/ -set_jumps(common->stackalloc, LABEL()); -/* RETURN_ADDR is not a saved register. */ -sljit_emit_fast_enter(compiler, SLJIT_MEM1(SLJIT_SP), LOCALS0); - -SLJIT_ASSERT(TMP1 == SLJIT_R0 && STR_PTR == SLJIT_R1); - -OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, STR_PTR, 0); -OP1(SLJIT_MOV, SLJIT_R0, 0, ARGUMENTS, 0); -OP2(SLJIT_SUB, SLJIT_R1, 0, STACK_LIMIT, 0, SLJIT_IMM, STACK_GROWTH_RATE); -OP1(SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_R0), SLJIT_OFFSETOF(jit_arguments, stack)); -OP1(SLJIT_MOV, STACK_LIMIT, 0, TMP2, 0); - -sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW), SLJIT_IMM, SLJIT_FUNC_OFFSET(sljit_stack_resize)); - -jump = CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0); -OP1(SLJIT_MOV, TMP2, 0, STACK_LIMIT, 0); -OP1(SLJIT_MOV, STACK_LIMIT, 0, SLJIT_RETURN_REG, 0); -OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0); -OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS1); -OP_SRC(SLJIT_FAST_RETURN, TMP1, 0); - -/* Allocation failed. */ -JUMPHERE(jump); -/* We break the return address cache here, but this is a really rare case. */ -OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, PCRE2_ERROR_JIT_STACKLIMIT); -JUMPTO(SLJIT_JUMP, common->quit_label); - -/* Call limit reached. 
*/ -set_jumps(common->calllimit, LABEL()); -OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, PCRE2_ERROR_MATCHLIMIT); -JUMPTO(SLJIT_JUMP, common->quit_label); - -if (common->revertframes != NULL) - { - set_jumps(common->revertframes, LABEL()); - do_revertframes(common); - } -if (common->wordboundary != NULL) - { - set_jumps(common->wordboundary, LABEL()); - check_wordboundary(common); - } -if (common->anynewline != NULL) - { - set_jumps(common->anynewline, LABEL()); - check_anynewline(common); - } -if (common->hspace != NULL) - { - set_jumps(common->hspace, LABEL()); - check_hspace(common); - } -if (common->vspace != NULL) - { - set_jumps(common->vspace, LABEL()); - check_vspace(common); - } -if (common->casefulcmp != NULL) - { - set_jumps(common->casefulcmp, LABEL()); - do_casefulcmp(common); - } -if (common->caselesscmp != NULL) - { - set_jumps(common->caselesscmp, LABEL()); - do_caselesscmp(common); - } -if (common->reset_match != NULL) - { - set_jumps(common->reset_match, LABEL()); - do_reset_match(common, (re->top_bracket + 1) * 2); - CMPTO(SLJIT_GREATER, STR_PTR, 0, TMP1, 0, continue_match_label); - OP1(SLJIT_MOV, STR_PTR, 0, TMP1, 0); - JUMPTO(SLJIT_JUMP, reset_match_label); - } -#ifdef SUPPORT_UNICODE -#if PCRE2_CODE_UNIT_WIDTH == 8 -if (common->utfreadchar != NULL) - { - set_jumps(common->utfreadchar, LABEL()); - do_utfreadchar(common); - } -if (common->utfreadtype8 != NULL) - { - set_jumps(common->utfreadtype8, LABEL()); - do_utfreadtype8(common); - } -if (common->utfpeakcharback != NULL) - { - set_jumps(common->utfpeakcharback, LABEL()); - do_utfpeakcharback(common); - } -#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ -#if PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16 -if (common->utfreadchar_invalid != NULL) - { - set_jumps(common->utfreadchar_invalid, LABEL()); - do_utfreadchar_invalid(common); - } -if (common->utfreadnewline_invalid != NULL) - { - set_jumps(common->utfreadnewline_invalid, LABEL()); - do_utfreadnewline_invalid(common); - } -if 
(common->utfmoveback_invalid) - { - set_jumps(common->utfmoveback_invalid, LABEL()); - do_utfmoveback_invalid(common); - } -if (common->utfpeakcharback_invalid) - { - set_jumps(common->utfpeakcharback_invalid, LABEL()); - do_utfpeakcharback_invalid(common); - } -#endif /* PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16 */ -if (common->getucd != NULL) - { - set_jumps(common->getucd, LABEL()); - do_getucd(common); - } -if (common->getucdtype != NULL) - { - set_jumps(common->getucdtype, LABEL()); - do_getucdtype(common); - } -#endif /* SUPPORT_UNICODE */ - -SLJIT_FREE(common->optimized_cbracket, allocator_data); -SLJIT_FREE(common->private_data_ptrs, allocator_data); - -executable_func = sljit_generate_code(compiler); -executable_size = sljit_get_generated_code_size(compiler); -sljit_free_compiler(compiler); - -if (executable_func == NULL) - { - PRIV(jit_free_rodata)(common->read_only_data_head, allocator_data); - return PCRE2_ERROR_NOMEMORY; - } - -/* Reuse the function descriptor if possible. */ -if (re->executable_jit != NULL) - functions = (executable_functions *)re->executable_jit; -else - { - functions = SLJIT_MALLOC(sizeof(executable_functions), allocator_data); - if (functions == NULL) - { - /* This case is highly unlikely since we just recently - freed a lot of memory. Not impossible though. */ - sljit_free_code(executable_func, NULL); - PRIV(jit_free_rodata)(common->read_only_data_head, allocator_data); - return PCRE2_ERROR_NOMEMORY; - } - memset(functions, 0, sizeof(executable_functions)); - functions->top_bracket = re->top_bracket + 1; - functions->limit_match = re->limit_match; - re->executable_jit = functions; - } - -/* Turn mode into an index. */ -if (mode == PCRE2_JIT_COMPLETE) - mode = 0; -else - mode = (mode == PCRE2_JIT_PARTIAL_SOFT) ? 
1 : 2; - -SLJIT_ASSERT(mode < JIT_NUMBER_OF_COMPILE_MODES); -functions->executable_funcs[mode] = executable_func; -functions->read_only_data_heads[mode] = common->read_only_data_head; -functions->executable_sizes[mode] = executable_size; -return 0; -} - -#endif - -/************************************************* -* JIT compile a Regular Expression * -*************************************************/ - -/* This function used JIT to convert a previously-compiled pattern into machine -code. - -Arguments: - code a compiled pattern - options JIT option bits - -Returns: 0: success or (*NOJIT) was used - <0: an error code -*/ - -#define PUBLIC_JIT_COMPILE_OPTIONS \ - (PCRE2_JIT_COMPLETE|PCRE2_JIT_PARTIAL_SOFT|PCRE2_JIT_PARTIAL_HARD|PCRE2_JIT_INVALID_UTF) - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_jit_compile(pcre2_code *code, uint32_t options) -{ -pcre2_real_code *re = (pcre2_real_code *)code; - -if (code == NULL) - return PCRE2_ERROR_NULL; - -if ((options & ~PUBLIC_JIT_COMPILE_OPTIONS) != 0) - return PCRE2_ERROR_JIT_BADOPTION; - -/* Support for invalid UTF was first introduced in JIT, with the option -PCRE2_JIT_INVALID_UTF. Later, support was added to the interpreter, and the -compile-time option PCRE2_MATCH_INVALID_UTF was created. This is now the -preferred feature, with the earlier option deprecated. However, for backward -compatibility, if the earlier option is set, it forces the new option so that -if JIT matching falls back to the interpreter, there is still support for -invalid UTF. However, if this function has already been successfully called -without PCRE2_JIT_INVALID_UTF and without PCRE2_MATCH_INVALID_UTF (meaning that -non-invalid-supporting JIT code was compiled), give an error. - -If in the future support for PCRE2_JIT_INVALID_UTF is withdrawn, the following -actions are needed: - - 1. Remove the definition from pcre2.h.in and from the list in - PUBLIC_JIT_COMPILE_OPTIONS above. - - 2. 
Replace PCRE2_JIT_INVALID_UTF with a local flag in this module. - - 3. Replace PCRE2_JIT_INVALID_UTF in pcre2_jit_test.c. - - 4. Delete the following short block of code. The setting of "re" and - "functions" can be moved into the JIT-only block below, but if that is - done, (void)re and (void)functions will be needed in the non-JIT case, to - avoid compiler warnings. -*/ - -#ifdef SUPPORT_JIT -executable_functions *functions = (executable_functions *)re->executable_jit; -static int executable_allocator_is_working = 0; -#endif - -if ((options & PCRE2_JIT_INVALID_UTF) != 0) - { - if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) == 0) - { -#ifdef SUPPORT_JIT - if (functions != NULL) return PCRE2_ERROR_JIT_BADOPTION; -#endif - re->overall_options |= PCRE2_MATCH_INVALID_UTF; - } - } - -/* The above tests are run with and without JIT support. This means that -PCRE2_JIT_INVALID_UTF propagates back into the regex options (ensuring -interpreter support) even in the absence of JIT. But now, if there is no JIT -support, give an error return. */ - -#ifndef SUPPORT_JIT -return PCRE2_ERROR_JIT_BADOPTION; -#else /* SUPPORT_JIT */ - -/* There is JIT support. Do the necessary. */ - -if ((re->flags & PCRE2_NOJIT) != 0) return 0; - -if (executable_allocator_is_working == 0) - { - /* Checks whether the executable allocator is working. This check - might run multiple times in multi-threaded environments, but the - result should not be affected by it. 
*/ - void *ptr = SLJIT_MALLOC_EXEC(32, NULL); - - executable_allocator_is_working = -1; - - if (ptr != NULL) - { - SLJIT_FREE_EXEC(((sljit_u8*)(ptr)) + SLJIT_EXEC_OFFSET(ptr), NULL); - executable_allocator_is_working = 1; - } - } - -if (executable_allocator_is_working < 0) - return PCRE2_ERROR_NOMEMORY; - -if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0) - options |= PCRE2_JIT_INVALID_UTF; - -if ((options & PCRE2_JIT_COMPLETE) != 0 && (functions == NULL - || functions->executable_funcs[0] == NULL)) { - uint32_t excluded_options = (PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_PARTIAL_HARD); - int result = jit_compile(code, options & ~excluded_options); - if (result != 0) - return result; - } - -if ((options & PCRE2_JIT_PARTIAL_SOFT) != 0 && (functions == NULL - || functions->executable_funcs[1] == NULL)) { - uint32_t excluded_options = (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_HARD); - int result = jit_compile(code, options & ~excluded_options); - if (result != 0) - return result; - } - -if ((options & PCRE2_JIT_PARTIAL_HARD) != 0 && (functions == NULL - || functions->executable_funcs[2] == NULL)) { - uint32_t excluded_options = (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_SOFT); - int result = jit_compile(code, options & ~excluded_options); - if (result != 0) - return result; - } - -return 0; - -#endif /* SUPPORT_JIT */ -} - -/* JIT compiler uses an all-in-one approach. This improves security, - since the code generator functions are not exported. 
*/ - -#define INCLUDED_FROM_PCRE2_JIT_COMPILE - -#include "pcre2_jit_match.c" -#include "pcre2_jit_misc.c" - -/* End of pcre2_jit_compile.c */ diff --git a/pcre2/src/pcre2_jit_match.c b/pcre2/src/pcre2_jit_match.c deleted file mode 100644 index 7e13b8cfe..000000000 --- a/pcre2/src/pcre2_jit_match.c +++ /dev/null @@ -1,186 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2018 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - -#ifndef INCLUDED_FROM_PCRE2_JIT_COMPILE -#error This file must be included from pcre2_jit_compile.c. -#endif - -#ifdef SUPPORT_JIT - -static SLJIT_NOINLINE int jit_machine_stack_exec(jit_arguments *arguments, jit_function executable_func) -{ -sljit_u8 local_space[MACHINE_STACK_SIZE]; -struct sljit_stack local_stack; - -local_stack.min_start = local_space; -local_stack.start = local_space; -local_stack.end = local_space + MACHINE_STACK_SIZE; -local_stack.top = local_space + MACHINE_STACK_SIZE; -arguments->stack = &local_stack; -return executable_func(arguments); -} - -#endif - - -/************************************************* -* Do a JIT pattern match * -*************************************************/ - -/* This function runs a JIT pattern match. 
- -Arguments: - code points to the compiled expression - subject points to the subject string - length length of subject string (may contain binary zeros) - start_offset where to start in the subject string - options option bits - match_data points to a match_data block - mcontext points to a match context - -Returns: > 0 => success; value is the number of ovector pairs filled - = 0 => success, but ovector is not big enough - -1 => failed to match (PCRE_ERROR_NOMATCH) - < -1 => some kind of unexpected problem -*/ - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, - PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, - pcre2_match_context *mcontext) -{ -#ifndef SUPPORT_JIT - -(void)code; -(void)subject; -(void)length; -(void)start_offset; -(void)options; -(void)match_data; -(void)mcontext; -return PCRE2_ERROR_JIT_BADOPTION; - -#else /* SUPPORT_JIT */ - -pcre2_real_code *re = (pcre2_real_code *)code; -executable_functions *functions = (executable_functions *)re->executable_jit; -pcre2_jit_stack *jit_stack; -uint32_t oveccount = match_data->oveccount; -uint32_t max_oveccount; -union { - void *executable_func; - jit_function call_executable_func; -} convert_executable_func; -jit_arguments arguments; -int rc; -int index = 0; - -if ((options & PCRE2_PARTIAL_HARD) != 0) - index = 2; -else if ((options & PCRE2_PARTIAL_SOFT) != 0) - index = 1; - -if (functions == NULL || functions->executable_funcs[index] == NULL) - return PCRE2_ERROR_JIT_BADOPTION; - -/* Sanity checks should be handled by pcre_exec. 
*/ -arguments.str = subject + start_offset; -arguments.begin = subject; -arguments.end = subject + length; -arguments.match_data = match_data; -arguments.startchar_ptr = subject; -arguments.mark_ptr = NULL; -arguments.options = options; - -if (mcontext != NULL) - { - arguments.callout = mcontext->callout; - arguments.callout_data = mcontext->callout_data; - arguments.offset_limit = mcontext->offset_limit; - arguments.limit_match = (mcontext->match_limit < re->limit_match)? - mcontext->match_limit : re->limit_match; - if (mcontext->jit_callback != NULL) - jit_stack = mcontext->jit_callback(mcontext->jit_callback_data); - else - jit_stack = (pcre2_jit_stack *)mcontext->jit_callback_data; - } -else - { - arguments.callout = NULL; - arguments.callout_data = NULL; - arguments.offset_limit = PCRE2_UNSET; - arguments.limit_match = (MATCH_LIMIT < re->limit_match)? - MATCH_LIMIT : re->limit_match; - jit_stack = NULL; - } - - -max_oveccount = functions->top_bracket; -if (oveccount > max_oveccount) - oveccount = max_oveccount; -arguments.oveccount = oveccount << 1; - - -convert_executable_func.executable_func = functions->executable_funcs[index]; -if (jit_stack != NULL) - { - arguments.stack = (struct sljit_stack *)(jit_stack->stack); - rc = convert_executable_func.call_executable_func(&arguments); - } -else - rc = jit_machine_stack_exec(&arguments, convert_executable_func.call_executable_func); - -if (rc > (int)oveccount) - rc = 0; -match_data->code = re; -match_data->subject = (rc >= 0 || rc == PCRE2_ERROR_PARTIAL)? 
subject : NULL; -match_data->rc = rc; -match_data->startchar = arguments.startchar_ptr - subject; -match_data->leftchar = 0; -match_data->rightchar = 0; -match_data->mark = arguments.mark_ptr; -match_data->matchedby = PCRE2_MATCHEDBY_JIT; - -return match_data->rc; - -#endif /* SUPPORT_JIT */ -} - -/* End of pcre2_jit_match.c */ diff --git a/pcre2/src/pcre2_jit_misc.c b/pcre2/src/pcre2_jit_misc.c deleted file mode 100644 index ec924e0f9..000000000 --- a/pcre2/src/pcre2_jit_misc.c +++ /dev/null @@ -1,232 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - - -#ifndef INCLUDED_FROM_PCRE2_JIT_COMPILE -#error This file must be included from pcre2_jit_compile.c. -#endif - - - -/************************************************* -* Free JIT read-only data * -*************************************************/ - -void -PRIV(jit_free_rodata)(void *current, void *allocator_data) -{ -#ifndef SUPPORT_JIT -(void)current; -(void)allocator_data; -#else /* SUPPORT_JIT */ -void *next; - -SLJIT_UNUSED_ARG(allocator_data); - -while (current != NULL) - { - next = *(void**)current; - SLJIT_FREE(current, allocator_data); - current = next; - } - -#endif /* SUPPORT_JIT */ -} - -/************************************************* -* Free JIT compiled code * -*************************************************/ - -void -PRIV(jit_free)(void *executable_jit, pcre2_memctl *memctl) -{ -#ifndef SUPPORT_JIT -(void)executable_jit; -(void)memctl; -#else /* SUPPORT_JIT */ - -executable_functions *functions = (executable_functions *)executable_jit; -void *allocator_data = memctl; -int i; - -for (i = 0; i < JIT_NUMBER_OF_COMPILE_MODES; i++) - { - if (functions->executable_funcs[i] != NULL) - sljit_free_code(functions->executable_funcs[i], NULL); - PRIV(jit_free_rodata)(functions->read_only_data_heads[i], allocator_data); - } - -SLJIT_FREE(functions, allocator_data); - -#endif /* SUPPORT_JIT */ -} - - -/************************************************* -* Free 
unused JIT memory * -*************************************************/ - -PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION -pcre2_jit_free_unused_memory(pcre2_general_context *gcontext) -{ -#ifndef SUPPORT_JIT -(void)gcontext; /* Suppress warning */ -#else /* SUPPORT_JIT */ -SLJIT_UNUSED_ARG(gcontext); -sljit_free_unused_memory_exec(); -#endif /* SUPPORT_JIT */ -} - - - -/************************************************* -* Allocate a JIT stack * -*************************************************/ - -PCRE2_EXP_DEFN pcre2_jit_stack * PCRE2_CALL_CONVENTION -pcre2_jit_stack_create(size_t startsize, size_t maxsize, - pcre2_general_context *gcontext) -{ -#ifndef SUPPORT_JIT - -(void)gcontext; -(void)startsize; -(void)maxsize; -return NULL; - -#else /* SUPPORT_JIT */ - -pcre2_jit_stack *jit_stack; - -if (startsize < 1 || maxsize < 1) - return NULL; -if (startsize > maxsize) - startsize = maxsize; -startsize = (startsize + STACK_GROWTH_RATE - 1) & ~(STACK_GROWTH_RATE - 1); -maxsize = (maxsize + STACK_GROWTH_RATE - 1) & ~(STACK_GROWTH_RATE - 1); - -jit_stack = PRIV(memctl_malloc)(sizeof(pcre2_real_jit_stack), (pcre2_memctl *)gcontext); -if (jit_stack == NULL) return NULL; -jit_stack->stack = sljit_allocate_stack(startsize, maxsize, &jit_stack->memctl); -if (jit_stack->stack == NULL) - { - jit_stack->memctl.free(jit_stack, jit_stack->memctl.memory_data); - return NULL; - } -return jit_stack; - -#endif -} - - -/************************************************* -* Assign a JIT stack to a pattern * -*************************************************/ - -PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION -pcre2_jit_stack_assign(pcre2_match_context *mcontext, pcre2_jit_callback callback, - void *callback_data) -{ -#ifndef SUPPORT_JIT -(void)mcontext; -(void)callback; -(void)callback_data; -#else /* SUPPORT_JIT */ - -if (mcontext == NULL) return; -mcontext->jit_callback = callback; -mcontext->jit_callback_data = callback_data; - -#endif /* SUPPORT_JIT */ -} - - 
-/************************************************* -* Free a JIT stack * -*************************************************/ - -PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION -pcre2_jit_stack_free(pcre2_jit_stack *jit_stack) -{ -#ifndef SUPPORT_JIT -(void)jit_stack; -#else /* SUPPORT_JIT */ -if (jit_stack != NULL) - { - sljit_free_stack((struct sljit_stack *)(jit_stack->stack), &jit_stack->memctl); - jit_stack->memctl.free(jit_stack, jit_stack->memctl.memory_data); - } -#endif /* SUPPORT_JIT */ -} - - -/************************************************* -* Get target CPU type * -*************************************************/ - -const char* -PRIV(jit_get_target)(void) -{ -#ifndef SUPPORT_JIT -return "JIT is not supported"; -#else /* SUPPORT_JIT */ -return sljit_get_platform_name(); -#endif /* SUPPORT_JIT */ -} - - -/************************************************* -* Get size of JIT code * -*************************************************/ - -size_t -PRIV(jit_get_size)(void *executable_jit) -{ -#ifndef SUPPORT_JIT -(void)executable_jit; -return 0; -#else /* SUPPORT_JIT */ -sljit_uw *executable_sizes = ((executable_functions *)executable_jit)->executable_sizes; -SLJIT_COMPILE_ASSERT(JIT_NUMBER_OF_COMPILE_MODES == 3, number_of_compile_modes_changed); -return executable_sizes[0] + executable_sizes[1] + executable_sizes[2]; -#endif -} - -/* End of pcre2_jit_misc.c */ diff --git a/pcre2/src/pcre2_jit_neon_inc.h b/pcre2/src/pcre2_jit_neon_inc.h deleted file mode 100644 index 150da29eb..000000000 --- a/pcre2/src/pcre2_jit_neon_inc.h +++ /dev/null @@ -1,347 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. 
- - Written by Philip Hazel - This module by Zoltan Herczeg and Sebastian Pop - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2019 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
------------------------------------------------------------------------------ -*/ - -# if defined(FFCS) -# if defined(FF_UTF) -# define FF_FUN ffcs_utf -# else -# define FF_FUN ffcs -# endif - -# elif defined(FFCS_2) -# if defined(FF_UTF) -# define FF_FUN ffcs_2_utf -# else -# define FF_FUN ffcs_2 -# endif - -# elif defined(FFCS_MASK) -# if defined(FF_UTF) -# define FF_FUN ffcs_mask_utf -# else -# define FF_FUN ffcs_mask -# endif - -# elif defined(FFCPS_0) -# if defined (FF_UTF) -# define FF_FUN ffcps_0_utf -# else -# define FF_FUN ffcps_0 -# endif - -# elif defined (FFCPS_1) -# if defined (FF_UTF) -# define FF_FUN ffcps_1_utf -# else -# define FF_FUN ffcps_1 -# endif - -# elif defined (FFCPS_DEFAULT) -# if defined (FF_UTF) -# define FF_FUN ffcps_default_utf -# else -# define FF_FUN ffcps_default -# endif -# endif - -static sljit_u8* SLJIT_FUNC FF_FUN(sljit_u8 *str_end, sljit_u8 *str_ptr, sljit_uw offs1, sljit_uw offs2, sljit_uw chars) -#undef FF_FUN -{ -quad_word qw; -int_char ic; - -SLJIT_UNUSED_ARG(offs1); -SLJIT_UNUSED_ARG(offs2); - -ic.x = chars; - -#if defined(FFCS) -sljit_u8 c1 = ic.c.c1; -vect_t vc1 = VDUPQ(c1); - -#elif defined(FFCS_2) -sljit_u8 c1 = ic.c.c1; -vect_t vc1 = VDUPQ(c1); -sljit_u8 c2 = ic.c.c2; -vect_t vc2 = VDUPQ(c2); - -#elif defined(FFCS_MASK) -sljit_u8 c1 = ic.c.c1; -vect_t vc1 = VDUPQ(c1); -sljit_u8 mask = ic.c.c2; -vect_t vmask = VDUPQ(mask); -#endif - -#if defined(FFCPS) -compare_type compare1_type = compare_match1; -compare_type compare2_type = compare_match1; -vect_t cmp1a, cmp1b, cmp2a, cmp2b; -const sljit_u32 diff = IN_UCHARS(offs1 - offs2); -PCRE2_UCHAR char1a = ic.c.c1; -PCRE2_UCHAR char2a = ic.c.c3; - -# ifdef FFCPS_CHAR1A2A -cmp1a = VDUPQ(char1a); -cmp2a = VDUPQ(char2a); -cmp1b = VDUPQ(0); /* to avoid errors on older compilers -Werror=maybe-uninitialized */ -cmp2b = VDUPQ(0); /* to avoid errors on older compilers -Werror=maybe-uninitialized */ -# else -PCRE2_UCHAR char1b = ic.c.c2; -PCRE2_UCHAR char2b = ic.c.c4; -if (char1a == 
char1b) - { - cmp1a = VDUPQ(char1a); - cmp1b = VDUPQ(0); /* to avoid errors on older compilers -Werror=maybe-uninitialized */ - } -else - { - sljit_u32 bit1 = char1a ^ char1b; - if (is_powerof2(bit1)) - { - compare1_type = compare_match1i; - cmp1a = VDUPQ(char1a | bit1); - cmp1b = VDUPQ(bit1); - } - else - { - compare1_type = compare_match2; - cmp1a = VDUPQ(char1a); - cmp1b = VDUPQ(char1b); - } - } - -if (char2a == char2b) - { - cmp2a = VDUPQ(char2a); - cmp2b = VDUPQ(0); /* to avoid errors on older compilers -Werror=maybe-uninitialized */ - } -else - { - sljit_u32 bit2 = char2a ^ char2b; - if (is_powerof2(bit2)) - { - compare2_type = compare_match1i; - cmp2a = VDUPQ(char2a | bit2); - cmp2b = VDUPQ(bit2); - } - else - { - compare2_type = compare_match2; - cmp2a = VDUPQ(char2a); - cmp2b = VDUPQ(char2b); - } - } -# endif - -str_ptr += IN_UCHARS(offs1); -#endif - -#if PCRE2_CODE_UNIT_WIDTH != 8 -vect_t char_mask = VDUPQ(0xff); -#endif - -#if defined(FF_UTF) -restart:; -#endif - -#if defined(FFCPS) -sljit_u8 *p1 = str_ptr - diff; -#endif -sljit_s32 align_offset = ((uint64_t)str_ptr & 0xf); -str_ptr = (sljit_u8 *) ((uint64_t)str_ptr & ~0xf); -vect_t data = VLD1Q(str_ptr); -#if PCRE2_CODE_UNIT_WIDTH != 8 -data = VANDQ(data, char_mask); -#endif - -#if defined(FFCS) -vect_t eq = VCEQQ(data, vc1); - -#elif defined(FFCS_2) -vect_t eq1 = VCEQQ(data, vc1); -vect_t eq2 = VCEQQ(data, vc2); -vect_t eq = VORRQ(eq1, eq2); - -#elif defined(FFCS_MASK) -vect_t eq = VORRQ(data, vmask); -eq = VCEQQ(eq, vc1); - -#elif defined(FFCPS) -# if defined(FFCPS_DIFF1) -vect_t prev_data = data; -# endif - -vect_t data2; -if (p1 < str_ptr) - { - data2 = VLD1Q(str_ptr - diff); -#if PCRE2_CODE_UNIT_WIDTH != 8 - data2 = VANDQ(data2, char_mask); -#endif - } -else - data2 = shift_left_n_lanes(data, offs1 - offs2); - -if (compare1_type == compare_match1) - data = VCEQQ(data, cmp1a); -else - data = fast_forward_char_pair_compare(compare1_type, data, cmp1a, cmp1b); - -if (compare2_type == compare_match1) - 
data2 = VCEQQ(data2, cmp2a); -else - data2 = fast_forward_char_pair_compare(compare2_type, data2, cmp2a, cmp2b); - -vect_t eq = VANDQ(data, data2); -#endif - -VST1Q(qw.mem, eq); -/* Ignore matches before the first STR_PTR. */ -if (align_offset < 8) - { - qw.dw[0] >>= align_offset * 8; - if (qw.dw[0]) - { - str_ptr += align_offset + __builtin_ctzll(qw.dw[0]) / 8; - goto match; - } - if (qw.dw[1]) - { - str_ptr += 8 + __builtin_ctzll(qw.dw[1]) / 8; - goto match; - } - } -else - { - qw.dw[1] >>= (align_offset - 8) * 8; - if (qw.dw[1]) - { - str_ptr += align_offset + __builtin_ctzll(qw.dw[1]) / 8; - goto match; - } - } -str_ptr += 16; - -while (str_ptr < str_end) - { - vect_t orig_data = VLD1Q(str_ptr); -#if PCRE2_CODE_UNIT_WIDTH != 8 - orig_data = VANDQ(orig_data, char_mask); -#endif - data = orig_data; - -#if defined(FFCS) - eq = VCEQQ(data, vc1); - -#elif defined(FFCS_2) - eq1 = VCEQQ(data, vc1); - eq2 = VCEQQ(data, vc2); - eq = VORRQ(eq1, eq2); - -#elif defined(FFCS_MASK) - eq = VORRQ(data, vmask); - eq = VCEQQ(eq, vc1); -#endif - -#if defined(FFCPS) -# if defined (FFCPS_DIFF1) - data2 = VEXTQ(prev_data, data, VECTOR_FACTOR - 1); -# else - data2 = VLD1Q(str_ptr - diff); -# if PCRE2_CODE_UNIT_WIDTH != 8 - data2 = VANDQ(data2, char_mask); -# endif -# endif - -# ifdef FFCPS_CHAR1A2A - data = VCEQQ(data, cmp1a); - data2 = VCEQQ(data2, cmp2a); -# else - if (compare1_type == compare_match1) - data = VCEQQ(data, cmp1a); - else - data = fast_forward_char_pair_compare(compare1_type, data, cmp1a, cmp1b); - if (compare2_type == compare_match1) - data2 = VCEQQ(data2, cmp2a); - else - data2 = fast_forward_char_pair_compare(compare2_type, data2, cmp2a, cmp2b); -# endif - - eq = VANDQ(data, data2); -#endif - - VST1Q(qw.mem, eq); - if (qw.dw[0]) - str_ptr += __builtin_ctzll(qw.dw[0]) / 8; - else if (qw.dw[1]) - str_ptr += 8 + __builtin_ctzll(qw.dw[1]) / 8; - else { - str_ptr += 16; -#if defined (FFCPS_DIFF1) - prev_data = orig_data; -#endif - continue; - } - -match:; - if (str_ptr 
>= str_end) - /* Failed match. */ - return NULL; - -#if defined(FF_UTF) - if (utf_continue(str_ptr + IN_UCHARS(-offs1))) - { - /* Not a match. */ - str_ptr += IN_UCHARS(1); - goto restart; - } -#endif - - /* Match. */ -#if defined (FFCPS) - str_ptr -= IN_UCHARS(offs1); -#endif - return str_ptr; - } - -/* Failed match. */ -return NULL; -} diff --git a/pcre2/src/pcre2_jit_simd_inc.h b/pcre2/src/pcre2_jit_simd_inc.h deleted file mode 100644 index 5673d338c..000000000 --- a/pcre2/src/pcre2_jit_simd_inc.h +++ /dev/null @@ -1,1123 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel - This module by Zoltan Herczeg - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2019 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - -#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) && !(defined SUPPORT_VALGRIND) - -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -static struct sljit_jump *jump_if_utf_char_start(struct sljit_compiler *compiler, sljit_s32 reg) -{ -#if PCRE2_CODE_UNIT_WIDTH == 8 -OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xc0); -return CMP(SLJIT_NOT_EQUAL, reg, 0, SLJIT_IMM, 0x80); -#elif PCRE2_CODE_UNIT_WIDTH == 16 -OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xfc00); -return CMP(SLJIT_NOT_EQUAL, reg, 0, SLJIT_IMM, 0xdc00); -#else -#error "Unknown code width" -#endif -} -#endif - -static sljit_s32 character_to_int32(PCRE2_UCHAR chr) -{ -sljit_u32 value = chr; -#if PCRE2_CODE_UNIT_WIDTH == 8 -#define SSE2_COMPARE_TYPE_INDEX 0 -return (sljit_s32)((value << 24) | (value << 16) | (value << 8) | value); -#elif PCRE2_CODE_UNIT_WIDTH == 16 -#define SSE2_COMPARE_TYPE_INDEX 1 -return (sljit_s32)((value << 16) | value); -#elif PCRE2_CODE_UNIT_WIDTH == 32 -#define SSE2_COMPARE_TYPE_INDEX 2 -return (sljit_s32)(value); -#else -#error "Unsupported unit width" -#endif -} - -static void load_from_mem_sse2(struct sljit_compiler *compiler, sljit_s32 
dst_xmm_reg, sljit_s32 src_general_reg, sljit_s8 offset) -{ -sljit_u8 instruction[5]; - -SLJIT_ASSERT(dst_xmm_reg < 8); -SLJIT_ASSERT(src_general_reg < 8); - -/* MOVDQA xmm1, xmm2/m128 */ -instruction[0] = ((sljit_u8)offset & 0xf) == 0 ? 0x66 : 0xf3; -instruction[1] = 0x0f; -instruction[2] = 0x6f; - -if (offset == 0) - { - instruction[3] = (dst_xmm_reg << 3) | src_general_reg; - sljit_emit_op_custom(compiler, instruction, 4); - return; - } - -instruction[3] = 0x40 | (dst_xmm_reg << 3) | src_general_reg; -instruction[4] = (sljit_u8)offset; -sljit_emit_op_custom(compiler, instruction, 5); -} - -typedef enum { - sse2_compare_match1, - sse2_compare_match1i, - sse2_compare_match2, -} sse2_compare_type; - -static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, sse2_compare_type compare_type, - int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind) -{ -sljit_u8 instruction[4]; -instruction[0] = 0x66; -instruction[1] = 0x0f; - -SLJIT_ASSERT(step >= 0 && step <= 3); - -if (compare_type != sse2_compare_match2) - { - if (step == 0) - { - if (compare_type == sse2_compare_match1i) - { - /* POR xmm1, xmm2/m128 */ - /* instruction[0] = 0x66; */ - /* instruction[1] = 0x0f; */ - instruction[2] = 0xeb; - instruction[3] = 0xc0 | (dst_ind << 3) | cmp2_ind; - sljit_emit_op_custom(compiler, instruction, 4); - } - return; - } - - if (step != 2) - return; - - /* PCMPEQB/W/D xmm1, xmm2/m128 */ - /* instruction[0] = 0x66; */ - /* instruction[1] = 0x0f; */ - instruction[2] = 0x74 + SSE2_COMPARE_TYPE_INDEX; - instruction[3] = 0xc0 | (dst_ind << 3) | cmp1_ind; - sljit_emit_op_custom(compiler, instruction, 4); - return; - } - -switch (step) - { - case 0: - /* MOVDQA xmm1, xmm2/m128 */ - /* instruction[0] = 0x66; */ - /* instruction[1] = 0x0f; */ - instruction[2] = 0x6f; - instruction[3] = 0xc0 | (tmp_ind << 3) | dst_ind; - sljit_emit_op_custom(compiler, instruction, 4); - return; - - case 1: - /* PCMPEQB/W/D xmm1, xmm2/m128 */ - /* 
instruction[0] = 0x66; */ - /* instruction[1] = 0x0f; */ - instruction[2] = 0x74 + SSE2_COMPARE_TYPE_INDEX; - instruction[3] = 0xc0 | (dst_ind << 3) | cmp1_ind; - sljit_emit_op_custom(compiler, instruction, 4); - return; - - case 2: - /* PCMPEQB/W/D xmm1, xmm2/m128 */ - /* instruction[0] = 0x66; */ - /* instruction[1] = 0x0f; */ - instruction[2] = 0x74 + SSE2_COMPARE_TYPE_INDEX; - instruction[3] = 0xc0 | (tmp_ind << 3) | cmp2_ind; - sljit_emit_op_custom(compiler, instruction, 4); - return; - - case 3: - /* POR xmm1, xmm2/m128 */ - /* instruction[0] = 0x66; */ - /* instruction[1] = 0x0f; */ - instruction[2] = 0xeb; - instruction[3] = 0xc0 | (dst_ind << 3) | tmp_ind; - sljit_emit_op_custom(compiler, instruction, 4); - return; - } -} - -#define JIT_HAS_FAST_FORWARD_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SSE2)) - -static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset) -{ -DEFINE_COMPILER; -struct sljit_label *start; -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -struct sljit_label *restart; -#endif -struct sljit_jump *quit; -struct sljit_jump *partial_quit[2]; -sse2_compare_type compare_type = sse2_compare_match1; -sljit_u8 instruction[8]; -sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1); -sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR); -sljit_s32 data_ind = 0; -sljit_s32 tmp_ind = 1; -sljit_s32 cmp1_ind = 2; -sljit_s32 cmp2_ind = 3; -sljit_u32 bit = 0; -int i; - -SLJIT_UNUSED_ARG(offset); - -if (char1 != char2) - { - bit = char1 ^ char2; - compare_type = sse2_compare_match1i; - - if (!is_powerof2(bit)) - { - bit = 0; - compare_type = sse2_compare_match2; - } - } - -partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); -if (common->mode == PCRE2_JIT_COMPLETE) - add_jump(compiler, &common->failed_match, partial_quit[0]); - -/* First part (unaligned start) */ - -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1 | bit)); - 
-SLJIT_ASSERT(tmp1_reg_ind < 8); - -/* MOVD xmm, r/m32 */ -instruction[0] = 0x66; -instruction[1] = 0x0f; -instruction[2] = 0x6e; -instruction[3] = 0xc0 | (cmp1_ind << 3) | tmp1_reg_ind; -sljit_emit_op_custom(compiler, instruction, 4); - -if (char1 != char2) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2)); - - /* MOVD xmm, r/m32 */ - instruction[3] = 0xc0 | (cmp2_ind << 3) | tmp1_reg_ind; - sljit_emit_op_custom(compiler, instruction, 4); - } - -OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0); - -/* PSHUFD xmm1, xmm2/m128, imm8 */ -/* instruction[0] = 0x66; */ -/* instruction[1] = 0x0f; */ -instruction[2] = 0x70; -instruction[3] = 0xc0 | (cmp1_ind << 3) | cmp1_ind; -instruction[4] = 0; -sljit_emit_op_custom(compiler, instruction, 5); - -if (char1 != char2) - { - /* PSHUFD xmm1, xmm2/m128, imm8 */ - instruction[3] = 0xc0 | (cmp2_ind << 3) | cmp2_ind; - sljit_emit_op_custom(compiler, instruction, 5); - } - -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -restart = LABEL(); -#endif -OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~0xf); -OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf); - -load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind, 0); -for (i = 0; i < 4; i++) - fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind); - -/* PMOVMSKB reg, xmm */ -/* instruction[0] = 0x66; */ -/* instruction[1] = 0x0f; */ -instruction[2] = 0xd7; -instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | data_ind; -sljit_emit_op_custom(compiler, instruction, 4); - -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0); -OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0); - -quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0); - -OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0); - -/* Second part (aligned) */ -start = LABEL(); - -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16); - -partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); -if (common->mode == PCRE2_JIT_COMPLETE) - add_jump(compiler, 
&common->failed_match, partial_quit[1]); - -load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind, 0); -for (i = 0; i < 4; i++) - fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind); - -/* PMOVMSKB reg, xmm */ -/* instruction[0] = 0x66; */ -/* instruction[1] = 0x0f; */ -instruction[2] = 0xd7; -instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | data_ind; -sljit_emit_op_custom(compiler, instruction, 4); - -CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start); - -JUMPHERE(quit); - -/* BSF r32, r/m32 */ -instruction[0] = 0x0f; -instruction[1] = 0xbc; -instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind; -sljit_emit_op_custom(compiler, instruction, 3); - -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); - -if (common->mode != PCRE2_JIT_COMPLETE) - { - JUMPHERE(partial_quit[0]); - JUMPHERE(partial_quit[1]); - OP2(SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_UNUSED, 0, STR_PTR, 0, STR_END, 0); - CMOV(SLJIT_GREATER, STR_PTR, STR_END, 0); - } -else - add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0)); - -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -if (common->utf && offset > 0) - { - SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE); - - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset)); - - quit = jump_if_utf_char_start(compiler, TMP1); - - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0)); - OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0); - JUMPTO(SLJIT_JUMP, restart); - - JUMPHERE(quit); - } -#endif -} - -#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SSE2)) - -static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2) -{ -DEFINE_COMPILER; -struct sljit_label *start; -struct sljit_jump *quit; -jump_list *not_found = NULL; -sse2_compare_type compare_type = sse2_compare_match1; -sljit_u8 
instruction[8]; -sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1); -sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR); -sljit_s32 data_ind = 0; -sljit_s32 tmp_ind = 1; -sljit_s32 cmp1_ind = 2; -sljit_s32 cmp2_ind = 3; -sljit_u32 bit = 0; -int i; - -if (char1 != char2) - { - bit = char1 ^ char2; - compare_type = sse2_compare_match1i; - - if (!is_powerof2(bit)) - { - bit = 0; - compare_type = sse2_compare_match2; - } - } - -add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0)); -OP1(SLJIT_MOV, TMP2, 0, TMP1, 0); -OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0); - -/* First part (unaligned start) */ - -OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1 | bit)); - -SLJIT_ASSERT(tmp1_reg_ind < 8); - -/* MOVD xmm, r/m32 */ -instruction[0] = 0x66; -instruction[1] = 0x0f; -instruction[2] = 0x6e; -instruction[3] = 0xc0 | (cmp1_ind << 3) | tmp1_reg_ind; -sljit_emit_op_custom(compiler, instruction, 4); - -if (char1 != char2) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(bit != 0 ? 
bit : char2)); - - /* MOVD xmm, r/m32 */ - instruction[3] = 0xc0 | (cmp2_ind << 3) | tmp1_reg_ind; - sljit_emit_op_custom(compiler, instruction, 4); - } - -OP1(SLJIT_MOV, STR_PTR, 0, TMP2, 0); - -/* PSHUFD xmm1, xmm2/m128, imm8 */ -/* instruction[0] = 0x66; */ -/* instruction[1] = 0x0f; */ -instruction[2] = 0x70; -instruction[3] = 0xc0 | (cmp1_ind << 3) | cmp1_ind; -instruction[4] = 0; -sljit_emit_op_custom(compiler, instruction, 5); - -if (char1 != char2) - { - /* PSHUFD xmm1, xmm2/m128, imm8 */ - instruction[3] = 0xc0 | (cmp2_ind << 3) | cmp2_ind; - sljit_emit_op_custom(compiler, instruction, 5); - } - -OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~0xf); -OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf); - -load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind, 0); -for (i = 0; i < 4; i++) - fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind); - -/* PMOVMSKB reg, xmm */ -/* instruction[0] = 0x66; */ -/* instruction[1] = 0x0f; */ -instruction[2] = 0xd7; -instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | data_ind; -sljit_emit_op_custom(compiler, instruction, 4); - -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0); -OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0); - -quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0); - -OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0); - -/* Second part (aligned) */ -start = LABEL(); - -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16); - -add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0)); - -load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind, 0); -for (i = 0; i < 4; i++) - fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind); - -/* PMOVMSKB reg, xmm */ -/* instruction[0] = 0x66; */ -/* instruction[1] = 0x0f; */ -instruction[2] = 0xd7; -instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | data_ind; -sljit_emit_op_custom(compiler, instruction, 4); - -CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start); - 
-JUMPHERE(quit); - -/* BSF r32, r/m32 */ -instruction[0] = 0x0f; -instruction[1] = 0xbc; -instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind; -sljit_emit_op_custom(compiler, instruction, 3); - -OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, STR_PTR, 0); -add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0)); - -OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0); -return not_found; -} - -#ifndef _WIN64 - -static SLJIT_INLINE sljit_u32 max_fast_forward_char_pair_offset(void) -{ -#if PCRE2_CODE_UNIT_WIDTH == 8 -return 15; -#elif PCRE2_CODE_UNIT_WIDTH == 16 -return 7; -#elif PCRE2_CODE_UNIT_WIDTH == 32 -return 3; -#else -#error "Unsupported unit width" -#endif -} - -#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SSE2)) - -static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1, - PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b) -{ -DEFINE_COMPILER; -sse2_compare_type compare1_type = sse2_compare_match1; -sse2_compare_type compare2_type = sse2_compare_match1; -sljit_u32 bit1 = 0; -sljit_u32 bit2 = 0; -sljit_u32 diff = IN_UCHARS(offs1 - offs2); -sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1); -sljit_s32 tmp2_reg_ind = sljit_get_register_index(TMP2); -sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR); -sljit_s32 data1_ind = 0; -sljit_s32 data2_ind = 1; -sljit_s32 tmp1_ind = 2; -sljit_s32 tmp2_ind = 3; -sljit_s32 cmp1a_ind = 4; -sljit_s32 cmp1b_ind = 5; -sljit_s32 cmp2a_ind = 6; -sljit_s32 cmp2b_ind = 7; -struct sljit_label *start; -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -struct sljit_label *restart; -#endif -struct sljit_jump *jump[2]; -sljit_u8 instruction[8]; -int i; - -SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2); -SLJIT_ASSERT(diff <= IN_UCHARS(max_fast_forward_char_pair_offset())); -SLJIT_ASSERT(tmp1_reg_ind < 8 && tmp2_reg_ind == 1); - -/* Initialize. 
*/ -if (common->match_end_ptr != 0) - { - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr); - OP1(SLJIT_MOV, TMP3, 0, STR_END, 0); - OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1)); - - OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, STR_END, 0); - CMOV(SLJIT_LESS, STR_END, TMP1, 0); - } - -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1)); -add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0)); - -/* MOVD xmm, r/m32 */ -instruction[0] = 0x66; -instruction[1] = 0x0f; -instruction[2] = 0x6e; - -if (char1a == char1b) - OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a)); -else - { - bit1 = char1a ^ char1b; - if (is_powerof2(bit1)) - { - compare1_type = sse2_compare_match1i; - OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a | bit1)); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit1)); - } - else - { - compare1_type = sse2_compare_match2; - bit1 = 0; - OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a)); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char1b)); - } - } - -instruction[3] = 0xc0 | (cmp1a_ind << 3) | tmp1_reg_ind; -sljit_emit_op_custom(compiler, instruction, 4); - -if (char1a != char1b) - { - instruction[3] = 0xc0 | (cmp1b_ind << 3) | tmp2_reg_ind; - sljit_emit_op_custom(compiler, instruction, 4); - } - -if (char2a == char2b) - OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a)); -else - { - bit2 = char2a ^ char2b; - if (is_powerof2(bit2)) - { - compare2_type = sse2_compare_match1i; - OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a | bit2)); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit2)); - } - else - { - compare2_type = sse2_compare_match2; - bit2 = 0; - OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a)); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char2b)); - } - } - -instruction[3] = 0xc0 | (cmp2a_ind << 3) | tmp1_reg_ind; 
-sljit_emit_op_custom(compiler, instruction, 4); - -if (char2a != char2b) - { - instruction[3] = 0xc0 | (cmp2b_ind << 3) | tmp2_reg_ind; - sljit_emit_op_custom(compiler, instruction, 4); - } - -/* PSHUFD xmm1, xmm2/m128, imm8 */ -/* instruction[0] = 0x66; */ -/* instruction[1] = 0x0f; */ -instruction[2] = 0x70; -instruction[4] = 0; - -instruction[3] = 0xc0 | (cmp1a_ind << 3) | cmp1a_ind; -sljit_emit_op_custom(compiler, instruction, 5); - -if (char1a != char1b) - { - instruction[3] = 0xc0 | (cmp1b_ind << 3) | cmp1b_ind; - sljit_emit_op_custom(compiler, instruction, 5); - } - -instruction[3] = 0xc0 | (cmp2a_ind << 3) | cmp2a_ind; -sljit_emit_op_custom(compiler, instruction, 5); - -if (char2a != char2b) - { - instruction[3] = 0xc0 | (cmp2b_ind << 3) | cmp2b_ind; - sljit_emit_op_custom(compiler, instruction, 5); - } - -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -restart = LABEL(); -#endif - -OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff); -OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0); -OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~0xf); - -load_from_mem_sse2(compiler, data1_ind, str_ptr_reg_ind, 0); - -jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0); - -load_from_mem_sse2(compiler, data2_ind, str_ptr_reg_ind, -(sljit_s8)diff); -jump[1] = JUMP(SLJIT_JUMP); - -JUMPHERE(jump[0]); - -/* MOVDQA xmm1, xmm2/m128 */ -/* instruction[0] = 0x66; */ -/* instruction[1] = 0x0f; */ -instruction[2] = 0x6f; -instruction[3] = 0xc0 | (data2_ind << 3) | data1_ind; -sljit_emit_op_custom(compiler, instruction, 4); - -/* PSLLDQ xmm1, imm8 */ -/* instruction[0] = 0x66; */ -/* instruction[1] = 0x0f; */ -instruction[2] = 0x73; -instruction[3] = 0xc0 | (7 << 3) | data2_ind; -instruction[4] = diff; -sljit_emit_op_custom(compiler, instruction, 5); - -JUMPHERE(jump[1]); - -OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf); - -for (i = 0; i < 4; i++) - { - fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind); - 
fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind); - } - -/* PAND xmm1, xmm2/m128 */ -/* instruction[0] = 0x66; */ -/* instruction[1] = 0x0f; */ -instruction[2] = 0xdb; -instruction[3] = 0xc0 | (data1_ind << 3) | data2_ind; -sljit_emit_op_custom(compiler, instruction, 4); - -/* PMOVMSKB reg, xmm */ -/* instruction[0] = 0x66; */ -/* instruction[1] = 0x0f; */ -instruction[2] = 0xd7; -instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | 0; -sljit_emit_op_custom(compiler, instruction, 4); - -/* Ignore matches before the first STR_PTR. */ -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0); -OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0); - -jump[0] = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0); - -OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0); - -/* Main loop. */ -start = LABEL(); - -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16); -add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0)); - -load_from_mem_sse2(compiler, data1_ind, str_ptr_reg_ind, 0); -load_from_mem_sse2(compiler, data2_ind, str_ptr_reg_ind, -(sljit_s8)diff); - -for (i = 0; i < 4; i++) - { - fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp2_ind); - fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp1_ind); - } - -/* PAND xmm1, xmm2/m128 */ -/* instruction[0] = 0x66; */ -/* instruction[1] = 0x0f; */ -instruction[2] = 0xdb; -instruction[3] = 0xc0 | (data1_ind << 3) | data2_ind; -sljit_emit_op_custom(compiler, instruction, 4); - -/* PMOVMSKB reg, xmm */ -/* instruction[0] = 0x66; */ -/* instruction[1] = 0x0f; */ -instruction[2] = 0xd7; -instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | 0; -sljit_emit_op_custom(compiler, instruction, 4); - -CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start); - -JUMPHERE(jump[0]); - -/* BSF r32, r/m32 */ -instruction[0] = 0x0f; -instruction[1] = 0xbc; -instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | 
tmp1_reg_ind; -sljit_emit_op_custom(compiler, instruction, 3); - -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); - -add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0)); - -if (common->match_end_ptr != 0) - OP1(SLJIT_MOV, STR_END, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr); - -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -if (common->utf) - { - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1)); - - jump[0] = jump_if_utf_char_start(compiler, TMP1); - - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, restart); - - add_jump(compiler, &common->failed_match, JUMP(SLJIT_JUMP)); - - JUMPHERE(jump[0]); - } -#endif - -OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1)); - -if (common->match_end_ptr != 0) - OP1(SLJIT_MOV, STR_END, 0, TMP3, 0); -} - -#endif /* !_WIN64 */ - -#undef SSE2_COMPARE_TYPE_INDEX - -#endif /* SLJIT_CONFIG_X86 && !SUPPORT_VALGRIND */ - -#if (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 && (defined __ARM_NEON || defined __ARM_NEON__)) - -#include - -typedef union { - unsigned int x; - struct { unsigned char c1, c2, c3, c4; } c; -} int_char; - -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -static SLJIT_INLINE int utf_continue(sljit_u8 *s) -{ -#if PCRE2_CODE_UNIT_WIDTH == 8 -return (*s & 0xc0) == 0x80; -#elif PCRE2_CODE_UNIT_WIDTH == 16 -return (*s & 0xfc00) == 0xdc00; -#else -#error "Unknown code width" -#endif -} -#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */ - -#if PCRE2_CODE_UNIT_WIDTH == 8 -# define VECTOR_FACTOR 16 -# define vect_t uint8x16_t -# define VLD1Q(X) vld1q_u8((sljit_u8 *)(X)) -# define VCEQQ vceqq_u8 -# define VORRQ vorrq_u8 -# define VST1Q vst1q_u8 -# define VDUPQ vdupq_n_u8 -# define VEXTQ vextq_u8 -# define VANDQ vandq_u8 -typedef union { - uint8_t mem[16]; - uint64_t dw[2]; -} quad_word; -#elif PCRE2_CODE_UNIT_WIDTH == 16 -# define VECTOR_FACTOR 8 -# 
define vect_t uint16x8_t -# define VLD1Q(X) vld1q_u16((sljit_u16 *)(X)) -# define VCEQQ vceqq_u16 -# define VORRQ vorrq_u16 -# define VST1Q vst1q_u16 -# define VDUPQ vdupq_n_u16 -# define VEXTQ vextq_u16 -# define VANDQ vandq_u16 -typedef union { - uint16_t mem[8]; - uint64_t dw[2]; -} quad_word; -#else -# define VECTOR_FACTOR 4 -# define vect_t uint32x4_t -# define VLD1Q(X) vld1q_u32((sljit_u32 *)(X)) -# define VCEQQ vceqq_u32 -# define VORRQ vorrq_u32 -# define VST1Q vst1q_u32 -# define VDUPQ vdupq_n_u32 -# define VEXTQ vextq_u32 -# define VANDQ vandq_u32 -typedef union { - uint32_t mem[4]; - uint64_t dw[2]; -} quad_word; -#endif - -#define FFCS -#include "pcre2_jit_neon_inc.h" -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -# define FF_UTF -# include "pcre2_jit_neon_inc.h" -# undef FF_UTF -#endif -#undef FFCS - -#define FFCS_2 -#include "pcre2_jit_neon_inc.h" -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -# define FF_UTF -# include "pcre2_jit_neon_inc.h" -# undef FF_UTF -#endif -#undef FFCS_2 - -#define FFCS_MASK -#include "pcre2_jit_neon_inc.h" -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -# define FF_UTF -# include "pcre2_jit_neon_inc.h" -# undef FF_UTF -#endif -#undef FFCS_MASK - -#define JIT_HAS_FAST_FORWARD_CHAR_SIMD 1 - -static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset) -{ -DEFINE_COMPILER; -int_char ic; -struct sljit_jump *partial_quit; -/* Save temporary registers. 
*/ -OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STR_PTR, 0); -OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, TMP3, 0); - -/* Prepare function arguments */ -OP1(SLJIT_MOV, SLJIT_R0, 0, STR_END, 0); -OP1(SLJIT_MOV, SLJIT_R1, 0, STR_PTR, 0); -OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, offset); - -if (char1 == char2) - { - ic.c.c1 = char1; - ic.c.c2 = char2; - OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x); - -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 - if (common->utf && offset > 0) - sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW), - SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_utf)); - else - sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW), - SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs)); -#else - sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW), - SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs)); -#endif - } -else - { - PCRE2_UCHAR mask = char1 ^ char2; - if (is_powerof2(mask)) - { - ic.c.c1 = char1 | mask; - ic.c.c2 = mask; - OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x); - -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 - if (common->utf && offset > 0) - sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW), - SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_mask_utf)); - else - sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW), - SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_mask)); -#else - sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW), - SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_mask)); -#endif - } - else - { - ic.c.c1 = char1; - ic.c.c2 = char2; - OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x); - -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 - if (common->utf && offset 
> 0) - sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW), - SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_2_utf)); - else - sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW), - SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_2)); -#else - sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW), - SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_2)); -#endif - } - } -/* Restore registers. */ -OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0); -OP1(SLJIT_MOV, TMP3, 0, SLJIT_MEM1(SLJIT_SP), LOCALS1); - -/* Check return value. */ -partial_quit = CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0); -if (common->mode == PCRE2_JIT_COMPLETE) - add_jump(compiler, &common->failed_match, partial_quit); - -/* Fast forward STR_PTR to the result of memchr. */ -OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0); - -if (common->mode != PCRE2_JIT_COMPLETE) - JUMPHERE(partial_quit); -} - -typedef enum { - compare_match1, - compare_match1i, - compare_match2, -} compare_type; - -static inline vect_t fast_forward_char_pair_compare(compare_type ctype, vect_t dst, vect_t cmp1, vect_t cmp2) -{ -if (ctype == compare_match2) - { - vect_t tmp = dst; - dst = VCEQQ(dst, cmp1); - tmp = VCEQQ(tmp, cmp2); - dst = VORRQ(dst, tmp); - return dst; - } - -if (ctype == compare_match1i) - dst = VORRQ(dst, cmp2); -dst = VCEQQ(dst, cmp1); -return dst; -} - -static SLJIT_INLINE sljit_u32 max_fast_forward_char_pair_offset(void) -{ -#if PCRE2_CODE_UNIT_WIDTH == 8 -return 15; -#elif PCRE2_CODE_UNIT_WIDTH == 16 -return 7; -#elif PCRE2_CODE_UNIT_WIDTH == 32 -return 3; -#else -#error "Unsupported unit width" -#endif -} - -/* ARM doesn't have a shift left across lanes. 
*/ -static SLJIT_INLINE vect_t shift_left_n_lanes(vect_t a, sljit_u8 n) -{ -vect_t zero = VDUPQ(0); -SLJIT_ASSERT(0 < n && n < VECTOR_FACTOR); -/* VEXTQ takes an immediate as last argument. */ -#define C(X) case X: return VEXTQ(zero, a, VECTOR_FACTOR - X); -switch (n) - { - C(1); C(2); C(3); -#if PCRE2_CODE_UNIT_WIDTH != 32 - C(4); C(5); C(6); C(7); -# if PCRE2_CODE_UNIT_WIDTH != 16 - C(8); C(9); C(10); C(11); C(12); C(13); C(14); C(15); -# endif -#endif - default: - /* Based on the ASSERT(0 < n && n < VECTOR_FACTOR) above, this won't - happen. The return is still here for compilers to not warn. */ - return a; - } -} - -#define FFCPS -#define FFCPS_DIFF1 -#define FFCPS_CHAR1A2A - -#define FFCPS_0 -#include "pcre2_jit_neon_inc.h" -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -# define FF_UTF -# include "pcre2_jit_neon_inc.h" -# undef FF_UTF -#endif -#undef FFCPS_0 - -#undef FFCPS_CHAR1A2A - -#define FFCPS_1 -#include "pcre2_jit_neon_inc.h" -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -# define FF_UTF -# include "pcre2_jit_neon_inc.h" -# undef FF_UTF -#endif -#undef FFCPS_1 - -#undef FFCPS_DIFF1 - -#define FFCPS_DEFAULT -#include "pcre2_jit_neon_inc.h" -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 -# define FF_UTF -# include "pcre2_jit_neon_inc.h" -# undef FF_UTF -#endif -#undef FFCPS - -#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 1 - -static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1, - PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b) -{ -DEFINE_COMPILER; -sljit_u32 diff = IN_UCHARS(offs1 - offs2); -struct sljit_jump *partial_quit; -int_char ic; -SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2); -SLJIT_ASSERT(diff <= IN_UCHARS(max_fast_forward_char_pair_offset())); -SLJIT_ASSERT(compiler->scratches == 5); - -/* Save temporary register STR_PTR. 
*/ -OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STR_PTR, 0); - -/* Prepare arguments for the function call. */ -if (common->match_end_ptr == 0) - OP1(SLJIT_MOV, SLJIT_R0, 0, STR_END, 0); -else - { - OP1(SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr); - OP2(SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1)); - - OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, STR_END, 0, SLJIT_R0, 0); - CMOV(SLJIT_LESS, SLJIT_R0, STR_END, 0); - } - -OP1(SLJIT_MOV, SLJIT_R1, 0, STR_PTR, 0); -OP1(SLJIT_MOV_S32, SLJIT_R2, 0, SLJIT_IMM, offs1); -OP1(SLJIT_MOV_S32, SLJIT_R3, 0, SLJIT_IMM, offs2); -ic.c.c1 = char1a; -ic.c.c2 = char1b; -ic.c.c3 = char2a; -ic.c.c4 = char2b; -OP1(SLJIT_MOV_U32, SLJIT_R4, 0, SLJIT_IMM, ic.x); - -if (diff == 1) { - if (char1a == char1b && char2a == char2b) { -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 - if (common->utf) - sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW), - SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_0_utf)); - else -#endif - sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW), - SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_0)); - } else { -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 - if (common->utf) - sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW), - SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_1_utf)); - else -#endif - sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW), - SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_1)); - } -} else { -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 - if (common->utf) - sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW), - SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_default_utf)); - else -#endif - 
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW), - SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_default)); -} - -/* Restore STR_PTR register. */ -OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0); - -/* Check return value. */ -partial_quit = CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0); -add_jump(compiler, &common->failed_match, partial_quit); - -/* Fast forward STR_PTR to the result of memchr. */ -OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0); - -JUMPHERE(partial_quit); -} - -#endif /* SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 */ diff --git a/pcre2/src/pcre2_maketables.c b/pcre2/src/pcre2_maketables.c deleted file mode 100644 index 56d249402..000000000 --- a/pcre2/src/pcre2_maketables.c +++ /dev/null @@ -1,163 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2020 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - - -/* This module contains the external function pcre2_maketables(), which builds -character tables for PCRE2 in the current locale. The file is compiled on its -own as part of the PCRE2 library. It is also included in the compilation of -pcre2_dftables.c as a freestanding program, in which case the macro -PCRE2_DFTABLES is defined. */ - -#ifndef PCRE2_DFTABLES /* Compiling the library */ -# ifdef HAVE_CONFIG_H -# include "config.h" -# endif -# include "pcre2_internal.h" -#endif - - - -/************************************************* -* Create PCRE2 character tables * -*************************************************/ - -/* This function builds a set of character tables for use by PCRE2 and returns -a pointer to them. They are build using the ctype functions, and consequently -their contents will depend upon the current locale setting. 
When compiled as -part of the library, the store is obtained via a general context malloc, if -supplied, but when PCRE2_DFTABLES is defined (when compiling the pcre2_dftables -freestanding auxiliary program) malloc() is used, and the function has a -different name so as not to clash with the prototype in pcre2.h. - -Arguments: none when PCRE2_DFTABLES is defined - else a PCRE2 general context or NULL -Returns: pointer to the contiguous block of data - else NULL if memory allocation failed -*/ - -#ifdef PCRE2_DFTABLES /* Included in freestanding pcre2_dftables program */ -static const uint8_t *maketables(void) -{ -uint8_t *yield = (uint8_t *)malloc(TABLES_LENGTH); - -#else /* Not PCRE2_DFTABLES, that is, compiling the library */ -PCRE2_EXP_DEFN const uint8_t * PCRE2_CALL_CONVENTION -pcre2_maketables(pcre2_general_context *gcontext) -{ -uint8_t *yield = (uint8_t *)((gcontext != NULL)? - gcontext->memctl.malloc(TABLES_LENGTH, gcontext->memctl.memory_data) : - malloc(TABLES_LENGTH)); -#endif /* PCRE2_DFTABLES */ - -int i; -uint8_t *p; - -if (yield == NULL) return NULL; -p = yield; - -/* First comes the lower casing table */ - -for (i = 0; i < 256; i++) *p++ = tolower(i); - -/* Next the case-flipping table */ - -for (i = 0; i < 256; i++) *p++ = islower(i)? toupper(i) : tolower(i); - -/* Then the character class tables. Don't try to be clever and save effort on -exclusive ones - in some locales things may be different. - -Note that the table for "space" includes everything "isspace" gives, including -VT in the default locale. This makes it work for the POSIX class [:space:]. -From PCRE1 release 8.34 and for all PCRE2 releases it is also correct for Perl -space, because Perl added VT at release 5.18. - -Note also that it is possible for a character to be alnum or alpha without -being lower or upper, such as "male and female ordinals" (\xAA and \xBA) in the -fr_FR locale (at least under Debian Linux's locales as of 12/2005). So we must -test for alnum specially. 
*/ - -memset(p, 0, cbit_length); -for (i = 0; i < 256; i++) - { - if (isdigit(i)) p[cbit_digit + i/8] |= 1u << (i&7); - if (isupper(i)) p[cbit_upper + i/8] |= 1u << (i&7); - if (islower(i)) p[cbit_lower + i/8] |= 1u << (i&7); - if (isalnum(i)) p[cbit_word + i/8] |= 1u << (i&7); - if (i == '_') p[cbit_word + i/8] |= 1u << (i&7); - if (isspace(i)) p[cbit_space + i/8] |= 1u << (i&7); - if (isxdigit(i)) p[cbit_xdigit + i/8] |= 1u << (i&7); - if (isgraph(i)) p[cbit_graph + i/8] |= 1u << (i&7); - if (isprint(i)) p[cbit_print + i/8] |= 1u << (i&7); - if (ispunct(i)) p[cbit_punct + i/8] |= 1u << (i&7); - if (iscntrl(i)) p[cbit_cntrl + i/8] |= 1u << (i&7); - } -p += cbit_length; - -/* Finally, the character type table. In this, we used to exclude VT from the -white space chars, because Perl didn't recognize it as such for \s and for -comments within regexes. However, Perl changed at release 5.18, so PCRE1 -changed at release 8.34 and it's always been this way for PCRE2. */ - -for (i = 0; i < 256; i++) - { - int x = 0; - if (isspace(i)) x += ctype_space; - if (isalpha(i)) x += ctype_letter; - if (islower(i)) x += ctype_lcletter; - if (isdigit(i)) x += ctype_digit; - if (isalnum(i) || i == '_') x += ctype_word; - *p++ = x; - } - -return yield; -} - -#ifndef PCRE2_DFTABLES /* Compiling the library */ -PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION -pcre2_maketables_free(pcre2_general_context *gcontext, const uint8_t *tables) -{ - if (gcontext) - gcontext->memctl.free((void *)tables, gcontext->memctl.memory_data); - else - free((void *)tables); -} -#endif - -/* End of pcre2_maketables.c */ diff --git a/pcre2/src/pcre2_match.c b/pcre2/src/pcre2_match.c deleted file mode 100644 index e3f78c2ca..000000000 --- a/pcre2/src/pcre2_match.c +++ /dev/null @@ -1,7311 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose 
syntax -and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2015-2020 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
------------------------------------------------------------------------------ -*/ - - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -/* These defines enable debugging code */ - -/* #define DEBUG_FRAMES_DISPLAY */ -/* #define DEBUG_SHOW_OPS */ -/* #define DEBUG_SHOW_RMATCH */ - -#ifdef DEBUG_FRAME_DISPLAY -#include -#endif - -/* These defines identify the name of the block containing "static" -information, and fields within it. */ - -#define NLBLOCK mb /* Block containing newline information */ -#define PSSTART start_subject /* Field containing processed string start */ -#define PSEND end_subject /* Field containing processed string end */ - -#include "pcre2_internal.h" - -#define RECURSE_UNSET 0xffffffffu /* Bigger than max group number */ - -/* Masks for identifying the public options that are permitted at match time. */ - -#define PUBLIC_MATCH_OPTIONS \ - (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \ - PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \ - PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT|PCRE2_COPY_MATCHED_SUBJECT) - -#define PUBLIC_JIT_MATCH_OPTIONS \ - (PCRE2_NO_UTF_CHECK|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY|\ - PCRE2_NOTEMPTY_ATSTART|PCRE2_PARTIAL_SOFT|PCRE2_PARTIAL_HARD|\ - PCRE2_COPY_MATCHED_SUBJECT) - -/* Non-error returns from and within the match() function. Error returns are -externally defined PCRE2_ERROR_xxx codes, which are all negative. */ - -#define MATCH_MATCH 1 -#define MATCH_NOMATCH 0 - -/* Special internal returns used in the match() function. Make them -sufficiently negative to avoid the external error codes. */ - -#define MATCH_ACCEPT (-999) -#define MATCH_KETRPOS (-998) -/* The next 5 must be kept together and in sequence so that a test that checks -for any one of them can use a range. 
*/ -#define MATCH_COMMIT (-997) -#define MATCH_PRUNE (-996) -#define MATCH_SKIP (-995) -#define MATCH_SKIP_ARG (-994) -#define MATCH_THEN (-993) -#define MATCH_BACKTRACK_MAX MATCH_THEN -#define MATCH_BACKTRACK_MIN MATCH_COMMIT - -/* Group frame type values. Zero means the frame is not a group frame. The -lower 16 bits are used for data (e.g. the capture number). Group frames are -used for most groups so that information about the start is easily available at -the end without having to scan back through intermediate frames (backtrack -points). */ - -#define GF_CAPTURE 0x00010000u -#define GF_NOCAPTURE 0x00020000u -#define GF_CONDASSERT 0x00030000u -#define GF_RECURSE 0x00040000u - -/* Masks for the identity and data parts of the group frame type. */ - -#define GF_IDMASK(a) ((a) & 0xffff0000u) -#define GF_DATAMASK(a) ((a) & 0x0000ffffu) - -/* Repetition types */ - -enum { REPTYPE_MIN, REPTYPE_MAX, REPTYPE_POS }; - -/* Min and max values for the common repeats; a maximum of UINT32_MAX => -infinity. */ - -static const uint32_t rep_min[] = { - 0, 0, /* * and *? */ - 1, 1, /* + and +? */ - 0, 0, /* ? and ?? */ - 0, 0, /* dummy placefillers for OP_CR[MIN]RANGE */ - 0, 1, 0 }; /* OP_CRPOS{STAR, PLUS, QUERY} */ - -static const uint32_t rep_max[] = { - UINT32_MAX, UINT32_MAX, /* * and *? */ - UINT32_MAX, UINT32_MAX, /* + and +? */ - 1, 1, /* ? and ?? */ - 0, 0, /* dummy placefillers for OP_CR[MIN]RANGE */ - UINT32_MAX, UINT32_MAX, 1 }; /* OP_CRPOS{STAR, PLUS, QUERY} */ - -/* Repetition types - must include OP_CRPOSRANGE (not needed above) */ - -static const uint32_t rep_typ[] = { - REPTYPE_MAX, REPTYPE_MIN, /* * and *? */ - REPTYPE_MAX, REPTYPE_MIN, /* + and +? */ - REPTYPE_MAX, REPTYPE_MIN, /* ? and ?? */ - REPTYPE_MAX, REPTYPE_MIN, /* OP_CRRANGE and OP_CRMINRANGE */ - REPTYPE_POS, REPTYPE_POS, /* OP_CRPOSSTAR, OP_CRPOSPLUS */ - REPTYPE_POS, REPTYPE_POS }; /* OP_CRPOSQUERY, OP_CRPOSRANGE */ - -/* Numbers for RMATCH calls at backtracking points. 
When these lists are -changed, the code at RETURN_SWITCH below must be updated in sync. */ - -enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10, - RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20, - RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30, - RM31, RM32, RM33, RM34, RM35, RM36 }; - -#ifdef SUPPORT_WIDE_CHARS -enum { RM100=100, RM101 }; -#endif - -#ifdef SUPPORT_UNICODE -enum { RM200=200, RM201, RM202, RM203, RM204, RM205, RM206, RM207, - RM208, RM209, RM210, RM211, RM212, RM213, RM214, RM215, - RM216, RM217, RM218, RM219, RM220, RM221, RM222 }; -#endif - -/* Define short names for general fields in the current backtrack frame, which -is always pointed to by the F variable. Occasional references to fields in -other frames are written out explicitly. There are also some fields in the -current frame whose names start with "temp" that are used for short-term, -localised backtracking memory. These are #defined with Lxxx names at the point -of use and undefined afterwards. */ - -#define Fback_frame F->back_frame -#define Fcapture_last F->capture_last -#define Fcurrent_recurse F->current_recurse -#define Fecode F->ecode -#define Feptr F->eptr -#define Fgroup_frame_type F->group_frame_type -#define Flast_group_offset F->last_group_offset -#define Flength F->length -#define Fmark F->mark -#define Frdepth F->rdepth -#define Fstart_match F->start_match -#define Foffset_top F->offset_top -#define Foccu F->occu -#define Fop F->op -#define Fovector F->ovector -#define Freturn_id F->return_id - - -#ifdef DEBUG_FRAMES_DISPLAY -/************************************************* -* Display current frames and contents * -*************************************************/ - -/* This debugging function displays the current set of frames and their -contents. It is not called automatically from anywhere, the intention being -that calls can be inserted where necessary when debugging frame-related -problems. 
- -Arguments: - f the file to write to - F the current top frame - P a previous frame of interest - frame_size the frame size - mb points to the match block - s identification text - -Returns: nothing -*/ - -static void -display_frames(FILE *f, heapframe *F, heapframe *P, PCRE2_SIZE frame_size, - match_block *mb, const char *s, ...) -{ -uint32_t i; -heapframe *Q; -va_list ap; -va_start(ap, s); - -fprintf(f, "FRAMES "); -vfprintf(f, s, ap); -va_end(ap); - -if (P != NULL) fprintf(f, " P=%lu", - ((char *)P - (char *)(mb->match_frames))/frame_size); -fprintf(f, "\n"); - -for (i = 0, Q = mb->match_frames; - Q <= F; - i++, Q = (heapframe *)((char *)Q + frame_size)) - { - fprintf(f, "Frame %d type=%x subj=%lu code=%d back=%lu id=%d", - i, Q->group_frame_type, Q->eptr - mb->start_subject, *(Q->ecode), - Q->back_frame, Q->return_id); - - if (Q->last_group_offset == PCRE2_UNSET) - fprintf(f, " lgoffset=unset\n"); - else - fprintf(f, " lgoffset=%lu\n", Q->last_group_offset/frame_size); - } -} - -#endif - - - -/************************************************* -* Process a callout * -*************************************************/ - -/* This function is called for all callouts, whether "standalone" or at the -start of a conditional group. Feptr will be pointing to either OP_CALLOUT or -OP_CALLOUT_STR. A callout block is allocated in pcre2_match() and initialized -with fixed values. - -Arguments: - F points to the current backtracking frame - mb points to the match block - lengthptr where to return the length of the callout item - -Returns: the return from the callout - or 0 if no callout function exists -*/ - -static int -do_callout(heapframe *F, match_block *mb, PCRE2_SIZE *lengthptr) -{ -int rc; -PCRE2_SIZE save0, save1; -PCRE2_SIZE *callout_ovector; -pcre2_callout_block *cb; - -*lengthptr = (*Fecode == OP_CALLOUT)? 
- PRIV(OP_lengths)[OP_CALLOUT] : GET(Fecode, 1 + 2*LINK_SIZE); - -if (mb->callout == NULL) return 0; /* No callout function provided */ - -/* The original matching code (pre 10.30) worked directly with the ovector -passed by the user, and this was passed to callouts. Now that the working -ovector is in the backtracking frame, it no longer needs to reserve space for -the overall match offsets (which would waste space in the frame). For backward -compatibility, however, we pass capture_top and offset_vector to the callout as -if for the extended ovector, and we ensure that the first two slots are unset -by preserving and restoring their current contents. Picky compilers complain if -references such as Fovector[-2] are use directly, so we set up a separate -pointer. */ - -callout_ovector = (PCRE2_SIZE *)(Fovector) - 2; - -/* The cb->version, cb->subject, cb->subject_length, and cb->start_match fields -are set externally. The first 3 never change; the last is updated for each -bumpalong. */ - -cb = mb->cb; -cb->capture_top = (uint32_t)Foffset_top/2 + 1; -cb->capture_last = Fcapture_last; -cb->offset_vector = callout_ovector; -cb->mark = mb->nomatch_mark; -cb->current_position = (PCRE2_SIZE)(Feptr - mb->start_subject); -cb->pattern_position = GET(Fecode, 1); -cb->next_item_length = GET(Fecode, 1 + LINK_SIZE); - -if (*Fecode == OP_CALLOUT) /* Numerical callout */ - { - cb->callout_number = Fecode[1 + 2*LINK_SIZE]; - cb->callout_string_offset = 0; - cb->callout_string = NULL; - cb->callout_string_length = 0; - } -else /* String callout */ - { - cb->callout_number = 0; - cb->callout_string_offset = GET(Fecode, 1 + 3*LINK_SIZE); - cb->callout_string = Fecode + (1 + 4*LINK_SIZE) + 1; - cb->callout_string_length = - *lengthptr - (1 + 4*LINK_SIZE) - 2; - } - -save0 = callout_ovector[0]; -save1 = callout_ovector[1]; -callout_ovector[0] = callout_ovector[1] = PCRE2_UNSET; -rc = mb->callout(cb, mb->callout_data); -callout_ovector[0] = save0; -callout_ovector[1] = save1; 
-cb->callout_flags = 0; -return rc; -} - - - -/************************************************* -* Match a back-reference * -*************************************************/ - -/* This function is called only when it is known that the offset lies within -the offsets that have so far been used in the match. Note that in caseless -UTF-8 mode, the number of subject bytes matched may be different to the number -of reference bytes. (In theory this could also happen in UTF-16 mode, but it -seems unlikely.) - -Arguments: - offset index into the offset vector - caseless TRUE if caseless - F the current backtracking frame pointer - mb points to match block - lengthptr pointer for returning the length matched - -Returns: = 0 sucessful match; number of code units matched is set - < 0 no match - > 0 partial match -*/ - -static int -match_ref(PCRE2_SIZE offset, BOOL caseless, heapframe *F, match_block *mb, - PCRE2_SIZE *lengthptr) -{ -PCRE2_SPTR p; -PCRE2_SIZE length; -PCRE2_SPTR eptr; -PCRE2_SPTR eptr_start; - -/* Deal with an unset group. The default is no match, but there is an option to -match an empty string. */ - -if (offset >= Foffset_top || Fovector[offset] == PCRE2_UNSET) - { - if ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0) - { - *lengthptr = 0; - return 0; /* Match */ - } - else return -1; /* No match */ - } - -/* Separate the caseless and UTF cases for speed. */ - -eptr = eptr_start = Feptr; -p = mb->start_subject + Fovector[offset]; -length = Fovector[offset+1] - Fovector[offset]; - -if (caseless) - { -#if defined SUPPORT_UNICODE - BOOL utf = (mb->poptions & PCRE2_UTF) != 0; - - if (utf || (mb->poptions & PCRE2_UCP) != 0) - { - PCRE2_SPTR endptr = p + length; - - /* Match characters up to the end of the reference. NOTE: the number of - code units matched may differ, because in UTF-8 there are some characters - whose upper and lower case codes have different numbers of bytes. 
For - example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 (3 - bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a - sequence of two of the latter. It is important, therefore, to check the - length along the reference, not along the subject (earlier code did this - wrong). UCP without uses Unicode properties but without UTF encoding. */ - - while (p < endptr) - { - uint32_t c, d; - const ucd_record *ur; - if (eptr >= mb->end_subject) return 1; /* Partial match */ - - if (utf) - { - GETCHARINC(c, eptr); - GETCHARINC(d, p); - } - else - { - c = *eptr++; - d = *p++; - } - - ur = GET_UCD(d); - if (c != d && c != (uint32_t)((int)d + ur->other_case)) - { - const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset; - for (;;) - { - if (c < *pp) return -1; /* No match */ - if (c == *pp++) break; - } - } - } - } - else -#endif - - /* Not in UTF or UCP mode */ - { - for (; length > 0; length--) - { - uint32_t cc, cp; - if (eptr >= mb->end_subject) return 1; /* Partial match */ - cc = UCHAR21TEST(eptr); - cp = UCHAR21TEST(p); - if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc)) - return -1; /* No match */ - p++; - eptr++; - } - } - } - -/* In the caseful case, we can just compare the code units, whether or not we -are in UTF and/or UCP mode. When partial matching, we have to do this unit by -unit. 
*/ - -else - { - if (mb->partial != 0) - { - for (; length > 0; length--) - { - if (eptr >= mb->end_subject) return 1; /* Partial match */ - if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /* No match */ - } - } - - /* Not partial matching */ - - else - { - if ((PCRE2_SIZE)(mb->end_subject - eptr) < length) return 1; /* Partial */ - if (memcmp(p, eptr, CU2BYTES(length)) != 0) return -1; /* No match */ - eptr += length; - } - } - -*lengthptr = eptr - eptr_start; -return 0; /* Match */ -} - - - -/****************************************************************************** -******************************************************************************* - "Recursion" in the match() function - -The original match() function was highly recursive, but this proved to be the -source of a number of problems over the years, mostly because of the relatively -small system stacks that are commonly found. As new features were added to -patterns, various kludges were invented to reduce the amount of stack used, -making the code hard to understand in places. - -A version did exist that used individual frames on the heap instead of calling -match() recursively, but this ran substantially slower. The current version is -a refactoring that uses a vector of frames to remember backtracking points. -This runs no slower, and possibly even a bit faster than the original recursive -implementation. An initial vector of size START_FRAMES_SIZE (enough for maybe -50 frames) is allocated on the system stack. If this is not big enough, the -heap is used for a larger vector. - -******************************************************************************* -******************************************************************************/ - - - - -/************************************************* -* Macros for the match() function * -*************************************************/ - -/* These macros pack up tests that are used for partial matching several times -in the code. 
The second one is used when we already know we are past the end of -the subject. We set the "hit end" flag if the pointer is at the end of the -subject and either (a) the pointer is past the earliest inspected character -(i.e. something has been matched, even if not part of the actual matched -string), or (b) the pattern contains a lookbehind. These are the conditions for -which adding more characters may allow the current match to continue. - -For hard partial matching, we immediately return a partial match. Otherwise, -carrying on means that a complete match on the current subject will be sought. -A partial match is returned only if no complete match can be found. */ - -#define CHECK_PARTIAL()\ - if (Feptr >= mb->end_subject) \ - { \ - SCHECK_PARTIAL(); \ - } - -#define SCHECK_PARTIAL()\ - if (mb->partial != 0 && \ - (Feptr > mb->start_used_ptr || mb->allowemptypartial)) \ - { \ - mb->hitend = TRUE; \ - if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \ - } - - -/* These macros are used to implement backtracking. They simulate a recursive -call to the match() function by means of a local vector of frames which -remember the backtracking points. */ - -#define RMATCH(ra,rb)\ - {\ - start_ecode = ra;\ - Freturn_id = rb;\ - goto MATCH_RECURSE;\ - L_##rb:;\ - } - -#define RRETURN(ra)\ - {\ - rrc = ra;\ - goto RETURN_SWITCH;\ - } - - - -/************************************************* -* Match from current position * -*************************************************/ - -/* This function is called to run one match attempt at a single starting point -in the subject. - -Performance note: It might be tempting to extract commonly used fields from the -mb structure (e.g. end_subject) into individual variables to improve -performance. Tests using gcc on a SPARC disproved this; in the first case, it -made performance worse. 
- -Arguments: - start_eptr starting character in subject - start_ecode starting position in compiled code - ovector pointer to the final output vector - oveccount number of pairs in ovector - top_bracket number of capturing parentheses in the pattern - frame_size size of each backtracking frame - mb pointer to "static" variables block - -Returns: MATCH_MATCH if matched ) these values are >= 0 - MATCH_NOMATCH if failed to match ) - negative MATCH_xxx value for PRUNE, SKIP, etc - negative PCRE2_ERROR_xxx value if aborted by an error condition - (e.g. stopped by repeated call or depth limit) -*/ - -static int -match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, PCRE2_SIZE *ovector, - uint16_t oveccount, uint16_t top_bracket, PCRE2_SIZE frame_size, - match_block *mb) -{ -/* Frame-handling variables */ - -heapframe *F; /* Current frame pointer */ -heapframe *N = NULL; /* Temporary frame pointers */ -heapframe *P = NULL; -heapframe *assert_accept_frame = NULL; /* For passing back a frame with captures */ -PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */ - -/* Local variables that do not need to be preserved over calls to RRMATCH(). 
*/ - -PCRE2_SPTR bracode; /* Temp pointer to start of group */ -PCRE2_SIZE offset; /* Used for group offsets */ -PCRE2_SIZE length; /* Used for various length calculations */ - -int rrc; /* Return from functions & backtracking "recursions" */ -#ifdef SUPPORT_UNICODE -int proptype; /* Type of character property */ -#endif - -uint32_t i; /* Used for local loops */ -uint32_t fc; /* Character values */ -uint32_t number; /* Used for group and other numbers */ -uint32_t reptype = 0; /* Type of repetition (0 to avoid compiler warning) */ -uint32_t group_frame_type; /* Specifies type for new group frames */ - -BOOL condition; /* Used in conditional groups */ -BOOL cur_is_word; /* Used in "word" tests */ -BOOL prev_is_word; /* Used in "word" tests */ - -/* UTF and UCP flags */ - -#ifdef SUPPORT_UNICODE -BOOL utf = (mb->poptions & PCRE2_UTF) != 0; -BOOL ucp = (mb->poptions & PCRE2_UCP) != 0; -#else -BOOL utf = FALSE; /* Required for convenience even when no Unicode support */ -#endif - -/* This is the length of the last part of a backtracking frame that must be -copied when a new frame is created. */ - -frame_copy_size = frame_size - offsetof(heapframe, eptr); - -/* Set up the first current frame at the start of the vector, and initialize -fields that are not reset for new frames. */ - -F = mb->match_frames; -Frdepth = 0; /* "Recursion" depth */ -Fcapture_last = 0; /* Number of most recent capture */ -Fcurrent_recurse = RECURSE_UNSET; /* Not pattern recursing. */ -Fstart_match = Feptr = start_eptr; /* Current data pointer and start match */ -Fmark = NULL; /* Most recent mark */ -Foffset_top = 0; /* End of captures within the frame */ -Flast_group_offset = PCRE2_UNSET; /* Saved frame of most recent group */ -group_frame_type = 0; /* Not a start of group frame */ -goto NEW_FRAME; /* Start processing with this frame */ - -/* Come back here when we want to create a new frame for remembering a -backtracking point. */ - -MATCH_RECURSE: - -/* Set up a new backtracking frame. 
If the vector is full, get a new one -on the heap, doubling the size, but constrained by the heap limit. */ - -N = (heapframe *)((char *)F + frame_size); -if (N >= mb->match_frames_top) - { - PCRE2_SIZE newsize = mb->frame_vector_size * 2; - heapframe *new; - - if ((newsize / 1024) > mb->heap_limit) - { - PCRE2_SIZE maxsize = ((mb->heap_limit * 1024)/frame_size) * frame_size; - if (mb->frame_vector_size >= maxsize) return PCRE2_ERROR_HEAPLIMIT; - newsize = maxsize; - } - - new = mb->memctl.malloc(newsize, mb->memctl.memory_data); - if (new == NULL) return PCRE2_ERROR_NOMEMORY; - memcpy(new, mb->match_frames, mb->frame_vector_size); - - F = (heapframe *)((char *)new + ((char *)F - (char *)mb->match_frames)); - N = (heapframe *)((char *)F + frame_size); - - if (mb->match_frames != mb->stack_frames) - mb->memctl.free(mb->match_frames, mb->memctl.memory_data); - mb->match_frames = new; - mb->match_frames_top = (heapframe *)((char *)mb->match_frames + newsize); - mb->frame_vector_size = newsize; - } - -#ifdef DEBUG_SHOW_RMATCH -fprintf(stderr, "++ RMATCH %2d frame=%d", Freturn_id, Frdepth + 1); -if (group_frame_type != 0) - { - fprintf(stderr, " type=%x ", group_frame_type); - switch (GF_IDMASK(group_frame_type)) - { - case GF_CAPTURE: - fprintf(stderr, "capture=%d", GF_DATAMASK(group_frame_type)); - break; - - case GF_NOCAPTURE: - fprintf(stderr, "nocapture op=%d", GF_DATAMASK(group_frame_type)); - break; - - case GF_CONDASSERT: - fprintf(stderr, "condassert op=%d", GF_DATAMASK(group_frame_type)); - break; - - case GF_RECURSE: - fprintf(stderr, "recurse=%d", GF_DATAMASK(group_frame_type)); - break; - - default: - fprintf(stderr, "*** unknown ***"); - break; - } - } -fprintf(stderr, "\n"); -#endif - -/* Copy those fields that must be copied into the new frame, increase the -"recursion" depth (i.e. the new frame's index) and then make the new frame -current. 
*/ - -memcpy((char *)N + offsetof(heapframe, eptr), - (char *)F + offsetof(heapframe, eptr), - frame_copy_size); - -N->rdepth = Frdepth + 1; -F = N; - -/* Carry on processing with a new frame. */ - -NEW_FRAME: -Fgroup_frame_type = group_frame_type; -Fecode = start_ecode; /* Starting code pointer */ -Fback_frame = frame_size; /* Default is go back one frame */ - -/* If this is a special type of group frame, remember its offset for quick -access at the end of the group. If this is a recursion, set a new current -recursion value. */ - -if (group_frame_type != 0) - { - Flast_group_offset = (char *)F - (char *)mb->match_frames; - if (GF_IDMASK(group_frame_type) == GF_RECURSE) - Fcurrent_recurse = GF_DATAMASK(group_frame_type); - group_frame_type = 0; - } - - -/* ========================================================================= */ -/* This is the main processing loop. First check that we haven't recorded too -many backtracks (search tree is too large), or that we haven't exceeded the -recursive depth limit (used too many backtracking frames). If not, process the -opcodes. */ - -if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT; -if (Frdepth >= mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT; - -for (;;) - { -#ifdef DEBUG_SHOW_OPS -fprintf(stderr, "++ op=%d\n", *Fecode); -#endif - - Fop = (uint8_t)(*Fecode); /* Cast needed for 16-bit and 32-bit modes */ - switch(Fop) - { - /* ===================================================================== */ - /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, to close - any currently open capturing brackets. Unlike reaching the end of a group, - where we know the starting frame is at the top of the chained frames, in - this case we have to search back for the relevant frame in case other types - of group that use chained frames have intervened. Multiple OP_CLOSEs always - come innermost first, which matches the chain order. 
We can ignore this in - a recursion, because captures are not passed out of recursions. */ - - case OP_CLOSE: - if (Fcurrent_recurse == RECURSE_UNSET) - { - number = GET2(Fecode, 1); - offset = Flast_group_offset; - for(;;) - { - if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL; - N = (heapframe *)((char *)mb->match_frames + offset); - P = (heapframe *)((char *)N - frame_size); - if (N->group_frame_type == (GF_CAPTURE | number)) break; - offset = P->last_group_offset; - } - offset = (number << 1) - 2; - Fcapture_last = number; - Fovector[offset] = P->eptr - mb->start_subject; - Fovector[offset+1] = Feptr - mb->start_subject; - if (offset >= Foffset_top) Foffset_top = offset + 2; - } - Fecode += PRIV(OP_lengths)[*Fecode]; - break; - - - /* ===================================================================== */ - /* Real or forced end of the pattern, assertion, or recursion. In an - assertion ACCEPT, update the last used pointer and remember the current - frame so that the captures and mark can be fished out of it. */ - - case OP_ASSERT_ACCEPT: - if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; - assert_accept_frame = F; - RRETURN(MATCH_ACCEPT); - - /* If recursing, we have to find the most recent recursion. */ - - case OP_ACCEPT: - case OP_END: - - /* Handle end of a recursion. */ - - if (Fcurrent_recurse != RECURSE_UNSET) - { - offset = Flast_group_offset; - for(;;) - { - if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL; - N = (heapframe *)((char *)mb->match_frames + offset); - P = (heapframe *)((char *)N - frame_size); - if (GF_IDMASK(N->group_frame_type) == GF_RECURSE) break; - offset = P->last_group_offset; - } - - /* N is now the frame of the recursion; the previous frame is at the - OP_RECURSE position. Go back there, copying the current subject position - and mark, and move on past the OP_RECURSE. */ - - P->eptr = Feptr; - P->mark = Fmark; - F = P; - Fecode += 1 + LINK_SIZE; - continue; - } - - /* Not a recursion. 
Fail for an empty string match if either PCRE2_NOTEMPTY - is set, or if PCRE2_NOTEMPTY_ATSTART is set and we have matched at the - start of the subject. In both cases, backtracking will then try other - alternatives, if any. */ - - if (Feptr == Fstart_match && - ((mb->moptions & PCRE2_NOTEMPTY) != 0 || - ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) != 0 && - Fstart_match == mb->start_subject + mb->start_offset))) - RRETURN(MATCH_NOMATCH); - - /* Also fail if PCRE2_ENDANCHORED is set and the end of the match is not - the end of the subject. After (*ACCEPT) we fail the entire match (at this - position) but backtrack on reaching the end of the pattern. */ - - if (Feptr < mb->end_subject && - ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0) - { - if (Fop == OP_END) RRETURN(MATCH_NOMATCH); - return MATCH_NOMATCH; - } - - /* We have a successful match of the whole pattern. Record the result and - then do a direct return from the function. If there is space in the offset - vector, set any pairs that follow the highest-numbered captured string but - are less than the number of capturing groups in the pattern to PCRE2_UNSET. - It is documented that this happens. "Gaps" are set to PCRE2_UNSET - dynamically. It is only those at the end that need setting here. */ - - mb->end_match_ptr = Feptr; /* Record where we ended */ - mb->end_offset_top = Foffset_top; /* and how many extracts were taken */ - mb->mark = Fmark; /* and the last success mark */ - if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; - - ovector[0] = Fstart_match - mb->start_subject; - ovector[1] = Feptr - mb->start_subject; - - /* Set i to the smaller of the sizes of the external and frame ovectors. */ - - i = 2 * ((top_bracket + 1 > oveccount)? 
oveccount : top_bracket + 1); - memcpy(ovector + 2, Fovector, (i - 2) * sizeof(PCRE2_SIZE)); - while (--i >= Foffset_top + 2) ovector[i] = PCRE2_UNSET; - return MATCH_MATCH; /* Note: NOT RRETURN */ - - - /*===================================================================== */ - /* Match any single character type except newline; have to take care with - CRLF newlines and partial matching. */ - - case OP_ANY: - if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); - if (mb->partial != 0 && - Feptr == mb->end_subject - 1 && - NLBLOCK->nltype == NLTYPE_FIXED && - NLBLOCK->nllen == 2 && - UCHAR21TEST(Feptr) == NLBLOCK->nl[0]) - { - mb->hitend = TRUE; - if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; - } - /* Fall through */ - - /* Match any single character whatsoever. */ - - case OP_ALLANY: - if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */ - { /* not be updated before SCHECK_PARTIAL. */ - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - Feptr++; -#ifdef SUPPORT_UNICODE - if (utf) ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); -#endif - Fecode++; - break; - - - /* ===================================================================== */ - /* Match a single code unit, even in UTF mode. This opcode really does - match any code unit, even newline. (It really should be called ANYCODEUNIT, - of course - the byte name is from pre-16 bit days.) */ - - case OP_ANYBYTE: - if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */ - { /* not be updated before SCHECK_PARTIAL. 
*/ - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - Feptr++; - Fecode++; - break; - - - /* ===================================================================== */ - /* Match a single character, casefully */ - - case OP_CHAR: -#ifdef SUPPORT_UNICODE - if (utf) - { - Flength = 1; - Fecode++; - GETCHARLEN(fc, Fecode, Flength); - if (Flength > (PCRE2_SIZE)(mb->end_subject - Feptr)) - { - CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ - RRETURN(MATCH_NOMATCH); - } - for (; Flength > 0; Flength--) - { - if (*Fecode++ != UCHAR21INC(Feptr)) RRETURN(MATCH_NOMATCH); - } - } - else -#endif - - /* Not UTF mode */ - { - if (mb->end_subject - Feptr < 1) - { - SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */ - RRETURN(MATCH_NOMATCH); - } - if (Fecode[1] != *Feptr++) RRETURN(MATCH_NOMATCH); - Fecode += 2; - } - break; - - - /* ===================================================================== */ - /* Match a single character, caselessly. If we are at the end of the - subject, give up immediately. We get here only when the pattern character - has at most one other case. Characters with more than two cases are coded - as OP_PROP with the pseudo-property PT_CLIST. */ - - case OP_CHARI: - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - -#ifdef SUPPORT_UNICODE - if (utf) - { - Flength = 1; - Fecode++; - GETCHARLEN(fc, Fecode, Flength); - - /* If the pattern character's value is < 128, we know that its other case - (if any) is also < 128 (and therefore only one code unit long in all - code-unit widths), so we can use the fast lookup table. We checked above - that there is at least one character left in the subject. */ - - if (fc < 128) - { - uint32_t cc = UCHAR21(Feptr); - if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH); - Fecode++; - Feptr++; - } - - /* Otherwise we must pick up the subject character and use Unicode - property support to test its other case. 
Note that we cannot use the - value of "Flength" to check for sufficient bytes left, because the other - case of the character may have more or fewer code units. */ - - else - { - uint32_t dc; - GETCHARINC(dc, Feptr); - Fecode += Flength; - if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH); - } - } - - /* If UCP is set without UTF we must do the same as above, but with one - character per code unit. */ - - else if (ucp) - { - uint32_t cc = UCHAR21(Feptr); - fc = Fecode[1]; - if (fc < 128) - { - if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH); - } - else - { - if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH); - } - Feptr++; - Fecode += 2; - } - - else -#endif /* SUPPORT_UNICODE */ - - /* Not UTF or UCP mode; use the table for characters < 256. */ - { - if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1]) - != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH); - Feptr++; - Fecode += 2; - } - break; - - - /* ===================================================================== */ - /* Match not a single character. */ - - case OP_NOT: - case OP_NOTI: - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - -#ifdef SUPPORT_UNICODE - if (utf) - { - uint32_t ch; - Fecode++; - GETCHARINC(ch, Fecode); - GETCHARINC(fc, Feptr); - if (ch == fc) - { - RRETURN(MATCH_NOMATCH); /* Caseful match */ - } - else if (Fop == OP_NOTI) /* If caseless */ - { - if (ch > 127) - ch = UCD_OTHERCASE(ch); - else - ch = (mb->fcc)[ch]; - if (ch == fc) RRETURN(MATCH_NOMATCH); - } - } - - /* UCP without UTF is as above, but with one character per code unit. 
*/ - - else if (ucp) - { - uint32_t ch; - fc = UCHAR21INC(Feptr); - ch = Fecode[1]; - Fecode += 2; - - if (ch == fc) - { - RRETURN(MATCH_NOMATCH); /* Caseful match */ - } - else if (Fop == OP_NOTI) /* If caseless */ - { - if (ch > 127) - ch = UCD_OTHERCASE(ch); - else - ch = (mb->fcc)[ch]; - if (ch == fc) RRETURN(MATCH_NOMATCH); - } - } - - else -#endif /* SUPPORT_UNICODE */ - - /* Neither UTF nor UCP is set */ - - { - uint32_t ch = Fecode[1]; - fc = UCHAR21INC(Feptr); - if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc)) - RRETURN(MATCH_NOMATCH); - Fecode += 2; - } - break; - - - /* ===================================================================== */ - /* Match a single character repeatedly. */ - -#define Loclength F->temp_size -#define Lstart_eptr F->temp_sptr[0] -#define Lcharptr F->temp_sptr[1] -#define Lmin F->temp_32[0] -#define Lmax F->temp_32[1] -#define Lc F->temp_32[2] -#define Loc F->temp_32[3] - - case OP_EXACT: - case OP_EXACTI: - Lmin = Lmax = GET2(Fecode, 1); - Fecode += 1 + IMM2_SIZE; - goto REPEATCHAR; - - case OP_POSUPTO: - case OP_POSUPTOI: - reptype = REPTYPE_POS; - Lmin = 0; - Lmax = GET2(Fecode, 1); - Fecode += 1 + IMM2_SIZE; - goto REPEATCHAR; - - case OP_UPTO: - case OP_UPTOI: - reptype = REPTYPE_MAX; - Lmin = 0; - Lmax = GET2(Fecode, 1); - Fecode += 1 + IMM2_SIZE; - goto REPEATCHAR; - - case OP_MINUPTO: - case OP_MINUPTOI: - reptype = REPTYPE_MIN; - Lmin = 0; - Lmax = GET2(Fecode, 1); - Fecode += 1 + IMM2_SIZE; - goto REPEATCHAR; - - case OP_POSSTAR: - case OP_POSSTARI: - reptype = REPTYPE_POS; - Lmin = 0; - Lmax = UINT32_MAX; - Fecode++; - goto REPEATCHAR; - - case OP_POSPLUS: - case OP_POSPLUSI: - reptype = REPTYPE_POS; - Lmin = 1; - Lmax = UINT32_MAX; - Fecode++; - goto REPEATCHAR; - - case OP_POSQUERY: - case OP_POSQUERYI: - reptype = REPTYPE_POS; - Lmin = 0; - Lmax = 1; - Fecode++; - goto REPEATCHAR; - - case OP_STAR: - case OP_STARI: - case OP_MINSTAR: - case OP_MINSTARI: - case OP_PLUS: - case OP_PLUSI: - case 
OP_MINPLUS: - case OP_MINPLUSI: - case OP_QUERY: - case OP_QUERYI: - case OP_MINQUERY: - case OP_MINQUERYI: - fc = *Fecode++ - ((Fop < OP_STARI)? OP_STAR : OP_STARI); - Lmin = rep_min[fc]; - Lmax = rep_max[fc]; - reptype = rep_typ[fc]; - - /* Common code for all repeated single-character matches. We first check - for the minimum number of characters. If the minimum equals the maximum, we - are done. Otherwise, if minimizing, check the rest of the pattern for a - match; if there isn't one, advance up to the maximum, one character at a - time. - - If maximizing, advance up to the maximum number of matching characters, - until Feptr is past the end of the maximum run. If possessive, we are - then done (no backing up). Otherwise, match at this position; anything - other than no match is immediately returned. For nomatch, back up one - character, unless we are matching \R and the last thing matched was - \r\n, in which case, back up two code units until we reach the first - optional character position. - - The various UTF/non-UTF and caseful/caseless cases are handled separately, - for speed. */ - - REPEATCHAR: -#ifdef SUPPORT_UNICODE - if (utf) - { - Flength = 1; - Lcharptr = Fecode; - GETCHARLEN(fc, Fecode, Flength); - Fecode += Flength; - - /* Handle multi-code-unit character matching, caseful and caseless. 
*/ - - if (Flength > 1) - { - uint32_t othercase; - - if (Fop >= OP_STARI && /* Caseless */ - (othercase = UCD_OTHERCASE(fc)) != fc) - Loclength = PRIV(ord2utf)(othercase, Foccu); - else Loclength = 0; - - for (i = 1; i <= Lmin; i++) - { - if (Feptr <= mb->end_subject - Flength && - memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength; - else if (Loclength > 0 && - Feptr <= mb->end_subject - Loclength && - memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0) - Feptr += Loclength; - else - { - CHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - } - - if (Lmin == Lmax) continue; - - if (reptype == REPTYPE_MIN) - { - for (;;) - { - RMATCH(Fecode, RM202); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); - if (Feptr <= mb->end_subject - Flength && - memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength; - else if (Loclength > 0 && - Feptr <= mb->end_subject - Loclength && - memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0) - Feptr += Loclength; - else - { - CHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - } - /* Control never gets here */ - } - - else /* Maximize */ - { - Lstart_eptr = Feptr; - for (i = Lmin; i < Lmax; i++) - { - if (Feptr <= mb->end_subject - Flength && - memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) - Feptr += Flength; - else if (Loclength > 0 && - Feptr <= mb->end_subject - Loclength && - memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0) - Feptr += Loclength; - else - { - CHECK_PARTIAL(); - break; - } - } - - /* After \C in UTF mode, Lstart_eptr might be in the middle of a - Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't - go too far. */ - - if (reptype != REPTYPE_POS) for(;;) - { - if (Feptr <= Lstart_eptr) break; - RMATCH(Fecode, RM203); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - Feptr--; - BACKCHAR(Feptr); - } - } - break; /* End of repeated wide character handling */ - } - - /* Length of UTF character is 1. 
Put it into the preserved variable and - fall through to the non-UTF code. */ - - Lc = fc; - } - else -#endif /* SUPPORT_UNICODE */ - - /* When not in UTF mode, load a single-code-unit character. Then proceed as - above, using Unicode casing if either UTF or UCP is set. */ - - Lc = *Fecode++; - - /* Caseless comparison */ - - if (Fop >= OP_STARI) - { -#if PCRE2_CODE_UNIT_WIDTH == 8 -#ifdef SUPPORT_UNICODE - if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc); - else -#endif /* SUPPORT_UNICODE */ - /* Lc will be < 128 in UTF-8 mode. */ - Loc = mb->fcc[Lc]; -#else /* 16-bit & 32-bit */ -#ifdef SUPPORT_UNICODE - if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc); - else -#endif /* SUPPORT_UNICODE */ - Loc = TABLE_GET(Lc, mb->fcc, Lc); -#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ - - for (i = 1; i <= Lmin; i++) - { - uint32_t cc; /* Faster than PCRE2_UCHAR */ - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - cc = UCHAR21TEST(Feptr); - if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH); - Feptr++; - } - if (Lmin == Lmax) continue; - - if (reptype == REPTYPE_MIN) - { - for (;;) - { - uint32_t cc; /* Faster than PCRE2_UCHAR */ - RMATCH(Fecode, RM25); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - cc = UCHAR21TEST(Feptr); - if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH); - Feptr++; - } - /* Control never gets here */ - } - - else /* Maximize */ - { - Lstart_eptr = Feptr; - for (i = Lmin; i < Lmax; i++) - { - uint32_t cc; /* Faster than PCRE2_UCHAR */ - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - cc = UCHAR21TEST(Feptr); - if (Lc != cc && Loc != cc) break; - Feptr++; - } - if (reptype != REPTYPE_POS) for (;;) - { - if (Feptr == Lstart_eptr) break; - RMATCH(Fecode, RM26); - Feptr--; - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - } - } - } - - /* Caseful comparisons (includes all 
multi-byte characters) */ - - else - { - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH); - } - - if (Lmin == Lmax) continue; - - if (reptype == REPTYPE_MIN) - { - for (;;) - { - RMATCH(Fecode, RM27); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH); - } - /* Control never gets here */ - } - else /* Maximize */ - { - Lstart_eptr = Feptr; - for (i = Lmin; i < Lmax; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - - if (Lc != UCHAR21TEST(Feptr)) break; - Feptr++; - } - - if (reptype != REPTYPE_POS) for (;;) - { - if (Feptr <= Lstart_eptr) break; - RMATCH(Fecode, RM28); - Feptr--; - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - } - } - } - break; - -#undef Loclength -#undef Lstart_eptr -#undef Lcharptr -#undef Lmin -#undef Lmax -#undef Lc -#undef Loc - - - /* ===================================================================== */ - /* Match a negated single one-byte character repeatedly. This is almost a - repeat of the code for a repeated single character, but I haven't found a - nice way of commoning these up that doesn't require a test of the - positive/negative option for each character match. Maybe that wouldn't add - very much to the time taken, but character matching *is* what this is all - about... 
*/ - -#define Lstart_eptr F->temp_sptr[0] -#define Lmin F->temp_32[0] -#define Lmax F->temp_32[1] -#define Lc F->temp_32[2] -#define Loc F->temp_32[3] - - case OP_NOTEXACT: - case OP_NOTEXACTI: - Lmin = Lmax = GET2(Fecode, 1); - Fecode += 1 + IMM2_SIZE; - goto REPEATNOTCHAR; - - case OP_NOTUPTO: - case OP_NOTUPTOI: - Lmin = 0; - Lmax = GET2(Fecode, 1); - reptype = REPTYPE_MAX; - Fecode += 1 + IMM2_SIZE; - goto REPEATNOTCHAR; - - case OP_NOTMINUPTO: - case OP_NOTMINUPTOI: - Lmin = 0; - Lmax = GET2(Fecode, 1); - reptype = REPTYPE_MIN; - Fecode += 1 + IMM2_SIZE; - goto REPEATNOTCHAR; - - case OP_NOTPOSSTAR: - case OP_NOTPOSSTARI: - reptype = REPTYPE_POS; - Lmin = 0; - Lmax = UINT32_MAX; - Fecode++; - goto REPEATNOTCHAR; - - case OP_NOTPOSPLUS: - case OP_NOTPOSPLUSI: - reptype = REPTYPE_POS; - Lmin = 1; - Lmax = UINT32_MAX; - Fecode++; - goto REPEATNOTCHAR; - - case OP_NOTPOSQUERY: - case OP_NOTPOSQUERYI: - reptype = REPTYPE_POS; - Lmin = 0; - Lmax = 1; - Fecode++; - goto REPEATNOTCHAR; - - case OP_NOTPOSUPTO: - case OP_NOTPOSUPTOI: - reptype = REPTYPE_POS; - Lmin = 0; - Lmax = GET2(Fecode, 1); - Fecode += 1 + IMM2_SIZE; - goto REPEATNOTCHAR; - - case OP_NOTSTAR: - case OP_NOTSTARI: - case OP_NOTMINSTAR: - case OP_NOTMINSTARI: - case OP_NOTPLUS: - case OP_NOTPLUSI: - case OP_NOTMINPLUS: - case OP_NOTMINPLUSI: - case OP_NOTQUERY: - case OP_NOTQUERYI: - case OP_NOTMINQUERY: - case OP_NOTMINQUERYI: - fc = *Fecode++ - ((Fop >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR); - Lmin = rep_min[fc]; - Lmax = rep_max[fc]; - reptype = rep_typ[fc]; - - /* Common code for all repeated single-character non-matches. */ - - REPEATNOTCHAR: - GETCHARINCTEST(Lc, Fecode); - - /* The code is duplicated for the caseless and caseful cases, for speed, - since matching characters is likely to be quite common. First, ensure the - minimum number of matches are present. If Lmin = Lmax, we are done. 
- Otherwise, if minimizing, keep trying the rest of the expression and - advancing one matching character if failing, up to the maximum. - Alternatively, if maximizing, find the maximum number of characters and - work backwards. */ - - if (Fop >= OP_NOTSTARI) /* Caseless */ - { -#ifdef SUPPORT_UNICODE - if ((utf || ucp) && Lc > 127) - Loc = UCD_OTHERCASE(Lc); - else -#endif /* SUPPORT_UNICODE */ - - Loc = TABLE_GET(Lc, mb->fcc, Lc); /* Other case from table */ - -#ifdef SUPPORT_UNICODE - if (utf) - { - uint32_t d; - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINC(d, Feptr); - if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH); - } - } - else -#endif /* SUPPORT_UNICODE */ - - /* Not UTF mode */ - { - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH); - Feptr++; - } - } - - if (Lmin == Lmax) continue; /* Finished for exact count */ - - if (reptype == REPTYPE_MIN) - { -#ifdef SUPPORT_UNICODE - if (utf) - { - uint32_t d; - for (;;) - { - RMATCH(Fecode, RM204); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINC(d, Feptr); - if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH); - } - } - else -#endif /*SUPPORT_UNICODE */ - - /* Not UTF mode */ - { - for (;;) - { - RMATCH(Fecode, RM29); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH); - Feptr++; - } - } - /* Control never gets here */ - } - - /* Maximize case */ - - else - { - Lstart_eptr = Feptr; - -#ifdef SUPPORT_UNICODE - if (utf) - { - uint32_t d; - for (i = Lmin; i < Lmax; i++) - { - int len = 1; - if 
(Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - GETCHARLEN(d, Feptr, len); - if (Lc == d || Loc == d) break; - Feptr += len; - } - - /* After \C in UTF mode, Lstart_eptr might be in the middle of a - Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't - go too far. */ - - if (reptype != REPTYPE_POS) for(;;) - { - if (Feptr <= Lstart_eptr) break; - RMATCH(Fecode, RM205); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - Feptr--; - BACKCHAR(Feptr); - } - } - else -#endif /* SUPPORT_UNICODE */ - - /* Not UTF mode */ - { - for (i = Lmin; i < Lmax; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - if (Lc == *Feptr || Loc == *Feptr) break; - Feptr++; - } - if (reptype != REPTYPE_POS) for (;;) - { - if (Feptr == Lstart_eptr) break; - RMATCH(Fecode, RM30); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - Feptr--; - } - } - } - } - - /* Caseful comparisons */ - - else - { -#ifdef SUPPORT_UNICODE - if (utf) - { - uint32_t d; - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINC(d, Feptr); - if (Lc == d) RRETURN(MATCH_NOMATCH); - } - } - else -#endif - /* Not UTF mode */ - { - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH); - } - } - - if (Lmin == Lmax) continue; - - if (reptype == REPTYPE_MIN) - { -#ifdef SUPPORT_UNICODE - if (utf) - { - uint32_t d; - for (;;) - { - RMATCH(Fecode, RM206); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINC(d, Feptr); - if (Lc == d) RRETURN(MATCH_NOMATCH); - } - } - else -#endif - /* Not UTF mode */ - { - for (;;) - { - RMATCH(Fecode, RM31); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); - if (Feptr >= mb->end_subject) - { - 
SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH); - } - } - /* Control never gets here */ - } - - /* Maximize case */ - - else - { - Lstart_eptr = Feptr; - -#ifdef SUPPORT_UNICODE - if (utf) - { - uint32_t d; - for (i = Lmin; i < Lmax; i++) - { - int len = 1; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - GETCHARLEN(d, Feptr, len); - if (Lc == d) break; - Feptr += len; - } - - /* After \C in UTF mode, Lstart_eptr might be in the middle of a - Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't - go too far. */ - - if (reptype != REPTYPE_POS) for(;;) - { - if (Feptr <= Lstart_eptr) break; - RMATCH(Fecode, RM207); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - Feptr--; - BACKCHAR(Feptr); - } - } - else -#endif - /* Not UTF mode */ - { - for (i = Lmin; i < Lmax; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - if (Lc == *Feptr) break; - Feptr++; - } - if (reptype != REPTYPE_POS) for (;;) - { - if (Feptr == Lstart_eptr) break; - RMATCH(Fecode, RM32); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - Feptr--; - } - } - } - } - break; - -#undef Lstart_eptr -#undef Lmin -#undef Lmax -#undef Lc -#undef Loc - - - /* ===================================================================== */ - /* Match a bit-mapped character class, possibly repeatedly. These opcodes - are used when all the characters in the class have values in the range - 0-255, and either the matching is caseful, or the characters are in the - range 0-127 when UTF processing is enabled. The only difference between - OP_CLASS and OP_NCLASS occurs when a data character outside the range is - encountered. 
*/ - -#define Lmin F->temp_32[0] -#define Lmax F->temp_32[1] -#define Lstart_eptr F->temp_sptr[0] -#define Lbyte_map_address F->temp_sptr[1] -#define Lbyte_map ((unsigned char *)Lbyte_map_address) - - case OP_NCLASS: - case OP_CLASS: - { - Lbyte_map_address = Fecode + 1; /* Save for matching */ - Fecode += 1 + (32 / sizeof(PCRE2_UCHAR)); /* Advance past the item */ - - /* Look past the end of the item to see if there is repeat information - following. Then obey similar code to character type repeats. */ - - switch (*Fecode) - { - case OP_CRSTAR: - case OP_CRMINSTAR: - case OP_CRPLUS: - case OP_CRMINPLUS: - case OP_CRQUERY: - case OP_CRMINQUERY: - case OP_CRPOSSTAR: - case OP_CRPOSPLUS: - case OP_CRPOSQUERY: - fc = *Fecode++ - OP_CRSTAR; - Lmin = rep_min[fc]; - Lmax = rep_max[fc]; - reptype = rep_typ[fc]; - break; - - case OP_CRRANGE: - case OP_CRMINRANGE: - case OP_CRPOSRANGE: - Lmin = GET2(Fecode, 1); - Lmax = GET2(Fecode, 1 + IMM2_SIZE); - if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */ - reptype = rep_typ[*Fecode - OP_CRSTAR]; - Fecode += 1 + 2 * IMM2_SIZE; - break; - - default: /* No repeat follows */ - Lmin = Lmax = 1; - break; - } - - /* First, ensure the minimum number of matches are present. */ - -#ifdef SUPPORT_UNICODE - if (utf) - { - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINC(fc, Feptr); - if (fc > 255) - { - if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); - } - else - if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); - } - } - else -#endif - /* Not UTF mode */ - { - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - fc = *Feptr++; -#if PCRE2_CODE_UNIT_WIDTH != 8 - if (fc > 255) - { - if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); - } - else -#endif - if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); - } - } - - /* If Lmax == Lmin we are done. 
Continue with main loop. */ - - if (Lmin == Lmax) continue; - - /* If minimizing, keep testing the rest of the expression and advancing - the pointer while it matches the class. */ - - if (reptype == REPTYPE_MIN) - { -#ifdef SUPPORT_UNICODE - if (utf) - { - for (;;) - { - RMATCH(Fecode, RM200); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINC(fc, Feptr); - if (fc > 255) - { - if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); - } - else - if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); - } - } - else -#endif - /* Not UTF mode */ - { - for (;;) - { - RMATCH(Fecode, RM23); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - fc = *Feptr++; -#if PCRE2_CODE_UNIT_WIDTH != 8 - if (fc > 255) - { - if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); - } - else -#endif - if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); - } - } - /* Control never gets here */ - } - - /* If maximizing, find the longest possible run, then work backwards. */ - - else - { - Lstart_eptr = Feptr; - -#ifdef SUPPORT_UNICODE - if (utf) - { - for (i = Lmin; i < Lmax; i++) - { - int len = 1; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - GETCHARLEN(fc, Feptr, len); - if (fc > 255) - { - if (Fop == OP_CLASS) break; - } - else - if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break; - Feptr += len; - } - - if (reptype == REPTYPE_POS) continue; /* No backtracking */ - - /* After \C in UTF mode, Lstart_eptr might be in the middle of a - Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't - go too far. 
*/ - - for (;;) - { - RMATCH(Fecode, RM201); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */ - BACKCHAR(Feptr); - } - } - else -#endif - /* Not UTF mode */ - { - for (i = Lmin; i < Lmax; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - fc = *Feptr; -#if PCRE2_CODE_UNIT_WIDTH != 8 - if (fc > 255) - { - if (Fop == OP_CLASS) break; - } - else -#endif - if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break; - Feptr++; - } - - if (reptype == REPTYPE_POS) continue; /* No backtracking */ - - while (Feptr >= Lstart_eptr) - { - RMATCH(Fecode, RM24); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - Feptr--; - } - } - - RRETURN(MATCH_NOMATCH); - } - } - /* Control never gets here */ - -#undef Lbyte_map_address -#undef Lbyte_map -#undef Lstart_eptr -#undef Lmin -#undef Lmax - - - /* ===================================================================== */ - /* Match an extended character class. In the 8-bit library, this opcode is - encountered only when UTF-8 mode mode is supported. In the 16-bit and - 32-bit libraries, codepoints greater than 255 may be encountered even when - UTF is not supported. 
*/ - -#define Lstart_eptr F->temp_sptr[0] -#define Lxclass_data F->temp_sptr[1] -#define Lmin F->temp_32[0] -#define Lmax F->temp_32[1] - -#ifdef SUPPORT_WIDE_CHARS - case OP_XCLASS: - { - Lxclass_data = Fecode + 1 + LINK_SIZE; /* Save for matching */ - Fecode += GET(Fecode, 1); /* Advance past the item */ - - switch (*Fecode) - { - case OP_CRSTAR: - case OP_CRMINSTAR: - case OP_CRPLUS: - case OP_CRMINPLUS: - case OP_CRQUERY: - case OP_CRMINQUERY: - case OP_CRPOSSTAR: - case OP_CRPOSPLUS: - case OP_CRPOSQUERY: - fc = *Fecode++ - OP_CRSTAR; - Lmin = rep_min[fc]; - Lmax = rep_max[fc]; - reptype = rep_typ[fc]; - break; - - case OP_CRRANGE: - case OP_CRMINRANGE: - case OP_CRPOSRANGE: - Lmin = GET2(Fecode, 1); - Lmax = GET2(Fecode, 1 + IMM2_SIZE); - if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */ - reptype = rep_typ[*Fecode - OP_CRSTAR]; - Fecode += 1 + 2 * IMM2_SIZE; - break; - - default: /* No repeat follows */ - Lmin = Lmax = 1; - break; - } - - /* First, ensure the minimum number of matches are present. */ - - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH); - } - - /* If Lmax == Lmin we can just continue with the main loop. */ - - if (Lmin == Lmax) continue; - - /* If minimizing, keep testing the rest of the expression and advancing - the pointer while it matches the class. */ - - if (reptype == REPTYPE_MIN) - { - for (;;) - { - RMATCH(Fecode, RM100); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH); - } - /* Control never gets here */ - } - - /* If maximizing, find the longest possible run, then work backwards. 
*/ - - else - { - Lstart_eptr = Feptr; - for (i = Lmin; i < Lmax; i++) - { - int len = 1; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } -#ifdef SUPPORT_UNICODE - GETCHARLENTEST(fc, Feptr, len); -#else - fc = *Feptr; -#endif - if (!PRIV(xclass)(fc, Lxclass_data, utf)) break; - Feptr += len; - } - - if (reptype == REPTYPE_POS) continue; /* No backtracking */ - - /* After \C in UTF mode, Lstart_eptr might be in the middle of a - Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't - go too far. */ - - for(;;) - { - RMATCH(Fecode, RM101); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */ -#ifdef SUPPORT_UNICODE - if (utf) BACKCHAR(Feptr); -#endif - } - RRETURN(MATCH_NOMATCH); - } - - /* Control never gets here */ - } -#endif /* SUPPORT_WIDE_CHARS: end of XCLASS */ - -#undef Lstart_eptr -#undef Lxclass_data -#undef Lmin -#undef Lmax - - - /* ===================================================================== */ - /* Match various character types when PCRE2_UCP is not set. These opcodes - are not generated when PCRE2_UCP is set - instead appropriate property - tests are compiled. 
*/ - - case OP_NOT_DIGIT: - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0) - RRETURN(MATCH_NOMATCH); - Fecode++; - break; - - case OP_DIGIT: - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0) - RRETURN(MATCH_NOMATCH); - Fecode++; - break; - - case OP_NOT_WHITESPACE: - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0) - RRETURN(MATCH_NOMATCH); - Fecode++; - break; - - case OP_WHITESPACE: - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0) - RRETURN(MATCH_NOMATCH); - Fecode++; - break; - - case OP_NOT_WORDCHAR: - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0) - RRETURN(MATCH_NOMATCH); - Fecode++; - break; - - case OP_WORDCHAR: - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0) - RRETURN(MATCH_NOMATCH); - Fecode++; - break; - - case OP_ANYNL: - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - switch(fc) - { - default: RRETURN(MATCH_NOMATCH); - - case CHAR_CR: - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - } - else if (UCHAR21TEST(Feptr) == CHAR_LF) Feptr++; - break; - - case CHAR_LF: - break; - - case CHAR_VT: - case CHAR_FF: - case CHAR_NEL: -#ifndef EBCDIC - case 0x2028: - case 0x2029: -#endif /* Not EBCDIC */ - if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) 
RRETURN(MATCH_NOMATCH); - break; - } - Fecode++; - break; - - case OP_NOT_HSPACE: - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - switch(fc) - { - HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */ - default: break; - } - Fecode++; - break; - - case OP_HSPACE: - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - switch(fc) - { - HSPACE_CASES: break; /* Byte and multibyte cases */ - default: RRETURN(MATCH_NOMATCH); - } - Fecode++; - break; - - case OP_NOT_VSPACE: - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - switch(fc) - { - VSPACE_CASES: RRETURN(MATCH_NOMATCH); - default: break; - } - Fecode++; - break; - - case OP_VSPACE: - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - switch(fc) - { - VSPACE_CASES: break; - default: RRETURN(MATCH_NOMATCH); - } - Fecode++; - break; - - -#ifdef SUPPORT_UNICODE - - /* ===================================================================== */ - /* Check the next character by Unicode property. We will get here only - if the support is in the binary; otherwise a compile-time error occurs. 
*/ - - case OP_PROP: - case OP_NOTPROP: - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - { - const uint32_t *cp; - const ucd_record *prop = GET_UCD(fc); - - switch(Fecode[1]) - { - case PT_ANY: - if (Fop == OP_NOTPROP) RRETURN(MATCH_NOMATCH); - break; - - case PT_LAMP: - if ((prop->chartype == ucp_Lu || - prop->chartype == ucp_Ll || - prop->chartype == ucp_Lt) == (Fop == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - break; - - case PT_GC: - if ((Fecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (Fop == OP_PROP)) - RRETURN(MATCH_NOMATCH); - break; - - case PT_PC: - if ((Fecode[2] != prop->chartype) == (Fop == OP_PROP)) - RRETURN(MATCH_NOMATCH); - break; - - case PT_SC: - if ((Fecode[2] != prop->script) == (Fop == OP_PROP)) - RRETURN(MATCH_NOMATCH); - break; - - /* These are specials */ - - case PT_ALNUM: - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (Fop == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - break; - - /* Perl space used to exclude VT, but from Perl 5.18 it is included, - which means that Perl space and POSIX space are now identical. PCRE - was changed at release 8.34. 
*/ - - case PT_SPACE: /* Perl space */ - case PT_PXSPACE: /* POSIX space */ - switch(fc) - { - HSPACE_CASES: - VSPACE_CASES: - if (Fop == OP_NOTPROP) RRETURN(MATCH_NOMATCH); - break; - - default: - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == - (Fop == OP_NOTPROP)) RRETURN(MATCH_NOMATCH); - break; - } - break; - - case PT_WORD: - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N || - fc == CHAR_UNDERSCORE) == (Fop == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - break; - - case PT_CLIST: - cp = PRIV(ucd_caseless_sets) + Fecode[2]; - for (;;) - { - if (fc < *cp) - { if (Fop == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; } - if (fc == *cp++) - { if (Fop == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } } - } - break; - - case PT_UCNC: - if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || - fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || - fc >= 0xe000) == (Fop == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - break; - - /* This should never occur */ - - default: - return PCRE2_ERROR_INTERNAL; - } - - Fecode += 3; - } - break; - - - /* ===================================================================== */ - /* Match an extended Unicode sequence. We will get here only if the support - is in the binary; otherwise a compile-time error occurs. */ - - case OP_EXTUNI: - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - else - { - GETCHARINCTEST(fc, Feptr); - Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, utf, - NULL); - } - CHECK_PARTIAL(); - Fecode++; - break; - -#endif /* SUPPORT_UNICODE */ - - - /* ===================================================================== */ - /* Match a single character type repeatedly. Note that the property type - does not need to be in a stack frame as it is not used within an RMATCH() - loop. 
*/ - -#define Lstart_eptr F->temp_sptr[0] -#define Lmin F->temp_32[0] -#define Lmax F->temp_32[1] -#define Lctype F->temp_32[2] -#define Lpropvalue F->temp_32[3] - - case OP_TYPEEXACT: - Lmin = Lmax = GET2(Fecode, 1); - Fecode += 1 + IMM2_SIZE; - goto REPEATTYPE; - - case OP_TYPEUPTO: - case OP_TYPEMINUPTO: - Lmin = 0; - Lmax = GET2(Fecode, 1); - reptype = (*Fecode == OP_TYPEMINUPTO)? REPTYPE_MIN : REPTYPE_MAX; - Fecode += 1 + IMM2_SIZE; - goto REPEATTYPE; - - case OP_TYPEPOSSTAR: - reptype = REPTYPE_POS; - Lmin = 0; - Lmax = UINT32_MAX; - Fecode++; - goto REPEATTYPE; - - case OP_TYPEPOSPLUS: - reptype = REPTYPE_POS; - Lmin = 1; - Lmax = UINT32_MAX; - Fecode++; - goto REPEATTYPE; - - case OP_TYPEPOSQUERY: - reptype = REPTYPE_POS; - Lmin = 0; - Lmax = 1; - Fecode++; - goto REPEATTYPE; - - case OP_TYPEPOSUPTO: - reptype = REPTYPE_POS; - Lmin = 0; - Lmax = GET2(Fecode, 1); - Fecode += 1 + IMM2_SIZE; - goto REPEATTYPE; - - case OP_TYPESTAR: - case OP_TYPEMINSTAR: - case OP_TYPEPLUS: - case OP_TYPEMINPLUS: - case OP_TYPEQUERY: - case OP_TYPEMINQUERY: - fc = *Fecode++ - OP_TYPESTAR; - Lmin = rep_min[fc]; - Lmax = rep_max[fc]; - reptype = rep_typ[fc]; - - /* Common code for all repeated character type matches. */ - - REPEATTYPE: - Lctype = *Fecode++; /* Code for the character type */ - -#ifdef SUPPORT_UNICODE - if (Lctype == OP_PROP || Lctype == OP_NOTPROP) - { - proptype = *Fecode++; - Lpropvalue = *Fecode++; - } - else proptype = -1; -#endif - - /* First, ensure the minimum number of matches are present. Use inline - code for maximizing the speed, and do the type test once at the start - (i.e. keep it out of the loop). The code for UTF mode is separated out for - tidiness, except for Unicode property tests. 
*/ - - if (Lmin > 0) - { -#ifdef SUPPORT_UNICODE - if (proptype >= 0) /* Property tests in all modes */ - { - switch(proptype) - { - case PT_ANY: - if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - } - break; - - case PT_LAMP: - for (i = 1; i <= Lmin; i++) - { - int chartype; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - chartype = UCD_CHARTYPE(fc); - if ((chartype == ucp_Lu || - chartype == ucp_Ll || - chartype == ucp_Lt) == (Lctype == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - } - break; - - case PT_GC: - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - } - break; - - case PT_PC: - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - } - break; - - case PT_SC: - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - } - break; - - case PT_ALNUM: - for (i = 1; i <= Lmin; i++) - { - int category; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - category = UCD_CATEGORY(fc); - if ((category == ucp_L || category == ucp_N) == (Lctype == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - } - break; - - /* Perl space used to exclude VT, but from Perl 5.18 it is included, - which means that Perl space and POSIX space are now identical. 
PCRE - was changed at release 8.34. */ - - case PT_SPACE: /* Perl space */ - case PT_PXSPACE: /* POSIX space */ - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - switch(fc) - { - HSPACE_CASES: - VSPACE_CASES: - if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); - break; - - default: - if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - break; - } - } - break; - - case PT_WORD: - for (i = 1; i <= Lmin; i++) - { - int category; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - category = UCD_CATEGORY(fc); - if ((category == ucp_L || category == ucp_N || - fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - } - break; - - case PT_CLIST: - for (i = 1; i <= Lmin; i++) - { - const uint32_t *cp; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - cp = PRIV(ucd_caseless_sets) + Lpropvalue; - for (;;) - { - if (fc < *cp) - { - if (Lctype == OP_NOTPROP) break; - RRETURN(MATCH_NOMATCH); - } - if (fc == *cp++) - { - if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); - break; - } - } - } - break; - - case PT_UCNC: - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || - fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || - fc >= 0xe000) == (Lctype == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - } - break; - - /* This should not occur */ - - default: - return PCRE2_ERROR_INTERNAL; - } - } - - /* Match extended Unicode sequences. We will get here only if the - support is in the binary; otherwise a compile-time error occurs. 
*/ - - else if (Lctype == OP_EXTUNI) - { - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - else - { - GETCHARINCTEST(fc, Feptr); - Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, - mb->end_subject, utf, NULL); - } - CHECK_PARTIAL(); - } - } - else -#endif /* SUPPORT_UNICODE */ - -/* Handle all other cases in UTF mode */ - -#ifdef SUPPORT_UNICODE - if (utf) switch(Lctype) - { - case OP_ANY: - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); - if (mb->partial != 0 && - Feptr + 1 >= mb->end_subject && - NLBLOCK->nltype == NLTYPE_FIXED && - NLBLOCK->nllen == 2 && - UCHAR21(Feptr) == NLBLOCK->nl[0]) - { - mb->hitend = TRUE; - if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; - } - Feptr++; - ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); - } - break; - - case OP_ALLANY: - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - Feptr++; - ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); - } - break; - - case OP_ANYBYTE: - if (Feptr > mb->end_subject - Lmin) RRETURN(MATCH_NOMATCH); - Feptr += Lmin; - break; - - case OP_ANYNL: - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINC(fc, Feptr); - switch(fc) - { - default: RRETURN(MATCH_NOMATCH); - - case CHAR_CR: - if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++; - break; - - case CHAR_LF: - break; - - case CHAR_VT: - case CHAR_FF: - case CHAR_NEL: -#ifndef EBCDIC - case 0x2028: - case 0x2029: -#endif /* Not EBCDIC */ - if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH); - break; - } - } - break; - - case OP_NOT_HSPACE: - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - 
GETCHARINC(fc, Feptr); - switch(fc) - { - HSPACE_CASES: RRETURN(MATCH_NOMATCH); - default: break; - } - } - break; - - case OP_HSPACE: - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINC(fc, Feptr); - switch(fc) - { - HSPACE_CASES: break; - default: RRETURN(MATCH_NOMATCH); - } - } - break; - - case OP_NOT_VSPACE: - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINC(fc, Feptr); - switch(fc) - { - VSPACE_CASES: RRETURN(MATCH_NOMATCH); - default: break; - } - } - break; - - case OP_VSPACE: - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINC(fc, Feptr); - switch(fc) - { - VSPACE_CASES: break; - default: RRETURN(MATCH_NOMATCH); - } - } - break; - - case OP_NOT_DIGIT: - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINC(fc, Feptr); - if (fc < 128 && (mb->ctypes[fc] & ctype_digit) != 0) - RRETURN(MATCH_NOMATCH); - } - break; - - case OP_DIGIT: - for (i = 1; i <= Lmin; i++) - { - uint32_t cc; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - cc = UCHAR21(Feptr); - if (cc >= 128 || (mb->ctypes[cc] & ctype_digit) == 0) - RRETURN(MATCH_NOMATCH); - Feptr++; - /* No need to skip more code units - we know it has only one. 
*/ - } - break; - - case OP_NOT_WHITESPACE: - for (i = 1; i <= Lmin; i++) - { - uint32_t cc; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - cc = UCHAR21(Feptr); - if (cc < 128 && (mb->ctypes[cc] & ctype_space) != 0) - RRETURN(MATCH_NOMATCH); - Feptr++; - ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); - } - break; - - case OP_WHITESPACE: - for (i = 1; i <= Lmin; i++) - { - uint32_t cc; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - cc = UCHAR21(Feptr); - if (cc >= 128 || (mb->ctypes[cc] & ctype_space) == 0) - RRETURN(MATCH_NOMATCH); - Feptr++; - /* No need to skip more code units - we know it has only one. */ - } - break; - - case OP_NOT_WORDCHAR: - for (i = 1; i <= Lmin; i++) - { - uint32_t cc; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - cc = UCHAR21(Feptr); - if (cc < 128 && (mb->ctypes[cc] & ctype_word) != 0) - RRETURN(MATCH_NOMATCH); - Feptr++; - ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); - } - break; - - case OP_WORDCHAR: - for (i = 1; i <= Lmin; i++) - { - uint32_t cc; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - cc = UCHAR21(Feptr); - if (cc >= 128 || (mb->ctypes[cc] & ctype_word) == 0) - RRETURN(MATCH_NOMATCH); - Feptr++; - /* No need to skip more code units - we know it has only one. */ - } - break; - - default: - return PCRE2_ERROR_INTERNAL; - } /* End switch(Lctype) */ - - else -#endif /* SUPPORT_UNICODE */ - - /* Code for the non-UTF case for minimum matching of operators other - than OP_PROP and OP_NOTPROP. 
*/ - - switch(Lctype) - { - case OP_ANY: - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); - if (mb->partial != 0 && - Feptr + 1 >= mb->end_subject && - NLBLOCK->nltype == NLTYPE_FIXED && - NLBLOCK->nllen == 2 && - *Feptr == NLBLOCK->nl[0]) - { - mb->hitend = TRUE; - if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; - } - Feptr++; - } - break; - - case OP_ALLANY: - if (Feptr > mb->end_subject - Lmin) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - Feptr += Lmin; - break; - - /* This OP_ANYBYTE case will never be reached because \C gets turned - into OP_ALLANY in non-UTF mode. Cut out the code so that coverage - reports don't complain about it's never being used. */ - -/* case OP_ANYBYTE: -* if (Feptr > mb->end_subject - Lmin) -* { -* SCHECK_PARTIAL(); -* RRETURN(MATCH_NOMATCH); -* } -* Feptr += Lmin; -* break; -*/ - case OP_ANYNL: - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - switch(*Feptr++) - { - default: RRETURN(MATCH_NOMATCH); - - case CHAR_CR: - if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++; - break; - - case CHAR_LF: - break; - - case CHAR_VT: - case CHAR_FF: - case CHAR_NEL: -#if PCRE2_CODE_UNIT_WIDTH != 8 - case 0x2028: - case 0x2029: -#endif - if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH); - break; - } - } - break; - - case OP_NOT_HSPACE: - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - switch(*Feptr++) - { - default: break; - HSPACE_BYTE_CASES: -#if PCRE2_CODE_UNIT_WIDTH != 8 - HSPACE_MULTIBYTE_CASES: -#endif - RRETURN(MATCH_NOMATCH); - } - } - break; - - case OP_HSPACE: - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - switch(*Feptr++) - { - default: RRETURN(MATCH_NOMATCH); - 
HSPACE_BYTE_CASES: -#if PCRE2_CODE_UNIT_WIDTH != 8 - HSPACE_MULTIBYTE_CASES: -#endif - break; - } - } - break; - - case OP_NOT_VSPACE: - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - switch(*Feptr++) - { - VSPACE_BYTE_CASES: -#if PCRE2_CODE_UNIT_WIDTH != 8 - VSPACE_MULTIBYTE_CASES: -#endif - RRETURN(MATCH_NOMATCH); - default: break; - } - } - break; - - case OP_VSPACE: - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - switch(*Feptr++) - { - default: RRETURN(MATCH_NOMATCH); - VSPACE_BYTE_CASES: -#if PCRE2_CODE_UNIT_WIDTH != 8 - VSPACE_MULTIBYTE_CASES: -#endif - break; - } - } - break; - - case OP_NOT_DIGIT: - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0) - RRETURN(MATCH_NOMATCH); - Feptr++; - } - break; - - case OP_DIGIT: - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0) - RRETURN(MATCH_NOMATCH); - Feptr++; - } - break; - - case OP_NOT_WHITESPACE: - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0) - RRETURN(MATCH_NOMATCH); - Feptr++; - } - break; - - case OP_WHITESPACE: - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0) - RRETURN(MATCH_NOMATCH); - Feptr++; - } - break; - - case OP_NOT_WORDCHAR: - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0) - 
RRETURN(MATCH_NOMATCH); - Feptr++; - } - break; - - case OP_WORDCHAR: - for (i = 1; i <= Lmin; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0) - RRETURN(MATCH_NOMATCH); - Feptr++; - } - break; - - default: - return PCRE2_ERROR_INTERNAL; - } - } - - /* If Lmin = Lmax we are done. Continue with the main loop. */ - - if (Lmin == Lmax) continue; - - /* If minimizing, we have to test the rest of the pattern before each - subsequent match. */ - - if (reptype == REPTYPE_MIN) - { -#ifdef SUPPORT_UNICODE - if (proptype >= 0) - { - switch(proptype) - { - case PT_ANY: - for (;;) - { - RMATCH(Fecode, RM208); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); - } - /* Control never gets here */ - - case PT_LAMP: - for (;;) - { - int chartype; - RMATCH(Fecode, RM209); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - chartype = UCD_CHARTYPE(fc); - if ((chartype == ucp_Lu || - chartype == ucp_Ll || - chartype == ucp_Lt) == (Lctype == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - } - /* Control never gets here */ - - case PT_GC: - for (;;) - { - RMATCH(Fecode, RM210); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - } - /* Control never gets here */ - - case PT_PC: - for (;;) - { - RMATCH(Fecode, RM211); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Lmin++ >= Lmax) 
RRETURN(MATCH_NOMATCH); - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - } - /* Control never gets here */ - - case PT_SC: - for (;;) - { - RMATCH(Fecode, RM212); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - } - /* Control never gets here */ - - case PT_ALNUM: - for (;;) - { - int category; - RMATCH(Fecode, RM213); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - category = UCD_CATEGORY(fc); - if ((category == ucp_L || category == ucp_N) == - (Lctype == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - } - /* Control never gets here */ - - /* Perl space used to exclude VT, but from Perl 5.18 it is included, - which means that Perl space and POSIX space are now identical. PCRE - was changed at release 8.34. 
*/ - - case PT_SPACE: /* Perl space */ - case PT_PXSPACE: /* POSIX space */ - for (;;) - { - RMATCH(Fecode, RM214); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - switch(fc) - { - HSPACE_CASES: - VSPACE_CASES: - if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); - break; - - default: - if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - break; - } - } - /* Control never gets here */ - - case PT_WORD: - for (;;) - { - int category; - RMATCH(Fecode, RM215); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - category = UCD_CATEGORY(fc); - if ((category == ucp_L || - category == ucp_N || - fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - } - /* Control never gets here */ - - case PT_CLIST: - for (;;) - { - const uint32_t *cp; - RMATCH(Fecode, RM216); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - cp = PRIV(ucd_caseless_sets) + Lpropvalue; - for (;;) - { - if (fc < *cp) - { - if (Lctype == OP_NOTPROP) break; - RRETURN(MATCH_NOMATCH); - } - if (fc == *cp++) - { - if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); - break; - } - } - } - /* Control never gets here */ - - case PT_UCNC: - for (;;) - { - RMATCH(Fecode, RM217); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(fc, Feptr); - if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || - fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || - fc >= 
0xe000) == (Lctype == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - } - /* Control never gets here */ - - /* This should never occur */ - default: - return PCRE2_ERROR_INTERNAL; - } - } - - /* Match extended Unicode sequences. We will get here only if the - support is in the binary; otherwise a compile-time error occurs. */ - - else if (Lctype == OP_EXTUNI) - { - for (;;) - { - RMATCH(Fecode, RM218); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - else - { - GETCHARINCTEST(fc, Feptr); - Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, - utf, NULL); - } - CHECK_PARTIAL(); - } - } - else -#endif /* SUPPORT_UNICODE */ - - /* UTF mode for non-property testing character types. */ - -#ifdef SUPPORT_UNICODE - if (utf) - { - for (;;) - { - RMATCH(Fecode, RM219); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); - GETCHARINC(fc, Feptr); - switch(Lctype) - { - case OP_ANY: /* This is the non-NL case */ - if (mb->partial != 0 && /* Take care with CRLF partial */ - Feptr >= mb->end_subject && - NLBLOCK->nltype == NLTYPE_FIXED && - NLBLOCK->nllen == 2 && - fc == NLBLOCK->nl[0]) - { - mb->hitend = TRUE; - if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; - } - break; - - case OP_ALLANY: - case OP_ANYBYTE: - break; - - case OP_ANYNL: - switch(fc) - { - default: RRETURN(MATCH_NOMATCH); - - case CHAR_CR: - if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++; - break; - - case CHAR_LF: - break; - - case CHAR_VT: - case CHAR_FF: - case CHAR_NEL: -#ifndef EBCDIC - case 0x2028: - case 0x2029: -#endif /* Not EBCDIC */ - if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) - RRETURN(MATCH_NOMATCH); - break; - } - break; - - case OP_NOT_HSPACE: - 
switch(fc) - { - HSPACE_CASES: RRETURN(MATCH_NOMATCH); - default: break; - } - break; - - case OP_HSPACE: - switch(fc) - { - HSPACE_CASES: break; - default: RRETURN(MATCH_NOMATCH); - } - break; - - case OP_NOT_VSPACE: - switch(fc) - { - VSPACE_CASES: RRETURN(MATCH_NOMATCH); - default: break; - } - break; - - case OP_VSPACE: - switch(fc) - { - VSPACE_CASES: break; - default: RRETURN(MATCH_NOMATCH); - } - break; - - case OP_NOT_DIGIT: - if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) - RRETURN(MATCH_NOMATCH); - break; - - case OP_DIGIT: - if (fc >= 256 || (mb->ctypes[fc] & ctype_digit) == 0) - RRETURN(MATCH_NOMATCH); - break; - - case OP_NOT_WHITESPACE: - if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) - RRETURN(MATCH_NOMATCH); - break; - - case OP_WHITESPACE: - if (fc >= 256 || (mb->ctypes[fc] & ctype_space) == 0) - RRETURN(MATCH_NOMATCH); - break; - - case OP_NOT_WORDCHAR: - if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) - RRETURN(MATCH_NOMATCH); - break; - - case OP_WORDCHAR: - if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) - RRETURN(MATCH_NOMATCH); - break; - - default: - return PCRE2_ERROR_INTERNAL; - } - } - } - else -#endif /* SUPPORT_UNICODE */ - - /* Not UTF mode */ - { - for (;;) - { - RMATCH(Fecode, RM33); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) - RRETURN(MATCH_NOMATCH); - fc = *Feptr++; - switch(Lctype) - { - case OP_ANY: /* This is the non-NL case */ - if (mb->partial != 0 && /* Take care with CRLF partial */ - Feptr >= mb->end_subject && - NLBLOCK->nltype == NLTYPE_FIXED && - NLBLOCK->nllen == 2 && - fc == NLBLOCK->nl[0]) - { - mb->hitend = TRUE; - if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; - } - break; - - case OP_ALLANY: - case OP_ANYBYTE: - break; - - case OP_ANYNL: - switch(fc) - { - default: RRETURN(MATCH_NOMATCH); - - case CHAR_CR: - if (Feptr < 
mb->end_subject && *Feptr == CHAR_LF) Feptr++; - break; - - case CHAR_LF: - break; - - case CHAR_VT: - case CHAR_FF: - case CHAR_NEL: -#if PCRE2_CODE_UNIT_WIDTH != 8 - case 0x2028: - case 0x2029: -#endif - if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) - RRETURN(MATCH_NOMATCH); - break; - } - break; - - case OP_NOT_HSPACE: - switch(fc) - { - default: break; - HSPACE_BYTE_CASES: -#if PCRE2_CODE_UNIT_WIDTH != 8 - HSPACE_MULTIBYTE_CASES: -#endif - RRETURN(MATCH_NOMATCH); - } - break; - - case OP_HSPACE: - switch(fc) - { - default: RRETURN(MATCH_NOMATCH); - HSPACE_BYTE_CASES: -#if PCRE2_CODE_UNIT_WIDTH != 8 - HSPACE_MULTIBYTE_CASES: -#endif - break; - } - break; - - case OP_NOT_VSPACE: - switch(fc) - { - default: break; - VSPACE_BYTE_CASES: -#if PCRE2_CODE_UNIT_WIDTH != 8 - VSPACE_MULTIBYTE_CASES: -#endif - RRETURN(MATCH_NOMATCH); - } - break; - - case OP_VSPACE: - switch(fc) - { - default: RRETURN(MATCH_NOMATCH); - VSPACE_BYTE_CASES: -#if PCRE2_CODE_UNIT_WIDTH != 8 - VSPACE_MULTIBYTE_CASES: -#endif - break; - } - break; - - case OP_NOT_DIGIT: - if (MAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0) - RRETURN(MATCH_NOMATCH); - break; - - case OP_DIGIT: - if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0) - RRETURN(MATCH_NOMATCH); - break; - - case OP_NOT_WHITESPACE: - if (MAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0) - RRETURN(MATCH_NOMATCH); - break; - - case OP_WHITESPACE: - if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0) - RRETURN(MATCH_NOMATCH); - break; - - case OP_NOT_WORDCHAR: - if (MAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0) - RRETURN(MATCH_NOMATCH); - break; - - case OP_WORDCHAR: - if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0) - RRETURN(MATCH_NOMATCH); - break; - - default: - return PCRE2_ERROR_INTERNAL; - } - } - } - /* Control never gets here */ - } - - /* If maximizing, it is worth using inline code for speed, doing the type - test once at the start (i.e. keep it out of the loop). 
*/ - - else - { - Lstart_eptr = Feptr; /* Remember where we started */ - -#ifdef SUPPORT_UNICODE - if (proptype >= 0) - { - switch(proptype) - { - case PT_ANY: - for (i = Lmin; i < Lmax; i++) - { - int len = 1; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - GETCHARLENTEST(fc, Feptr, len); - if (Lctype == OP_NOTPROP) break; - Feptr+= len; - } - break; - - case PT_LAMP: - for (i = Lmin; i < Lmax; i++) - { - int chartype; - int len = 1; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - GETCHARLENTEST(fc, Feptr, len); - chartype = UCD_CHARTYPE(fc); - if ((chartype == ucp_Lu || - chartype == ucp_Ll || - chartype == ucp_Lt) == (Lctype == OP_NOTPROP)) - break; - Feptr+= len; - } - break; - - case PT_GC: - for (i = Lmin; i < Lmax; i++) - { - int len = 1; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - GETCHARLENTEST(fc, Feptr, len); - if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) - break; - Feptr+= len; - } - break; - - case PT_PC: - for (i = Lmin; i < Lmax; i++) - { - int len = 1; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - GETCHARLENTEST(fc, Feptr, len); - if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) - break; - Feptr+= len; - } - break; - - case PT_SC: - for (i = Lmin; i < Lmax; i++) - { - int len = 1; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - GETCHARLENTEST(fc, Feptr, len); - if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) - break; - Feptr+= len; - } - break; - - case PT_ALNUM: - for (i = Lmin; i < Lmax; i++) - { - int category; - int len = 1; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - GETCHARLENTEST(fc, Feptr, len); - category = UCD_CATEGORY(fc); - if ((category == ucp_L || category == ucp_N) == - (Lctype == OP_NOTPROP)) - break; - Feptr+= len; - } - break; - - /* Perl space used to exclude VT, but from Perl 5.18 it is included, - which means that Perl space and POSIX 
space are now identical. PCRE - was changed at release 8.34. */ - - case PT_SPACE: /* Perl space */ - case PT_PXSPACE: /* POSIX space */ - for (i = Lmin; i < Lmax; i++) - { - int len = 1; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - GETCHARLENTEST(fc, Feptr, len); - switch(fc) - { - HSPACE_CASES: - VSPACE_CASES: - if (Lctype == OP_NOTPROP) goto ENDLOOP99; /* Break the loop */ - break; - - default: - if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP)) - goto ENDLOOP99; /* Break the loop */ - break; - } - Feptr+= len; - } - ENDLOOP99: - break; - - case PT_WORD: - for (i = Lmin; i < Lmax; i++) - { - int category; - int len = 1; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - GETCHARLENTEST(fc, Feptr, len); - category = UCD_CATEGORY(fc); - if ((category == ucp_L || category == ucp_N || - fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP)) - break; - Feptr+= len; - } - break; - - case PT_CLIST: - for (i = Lmin; i < Lmax; i++) - { - const uint32_t *cp; - int len = 1; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - GETCHARLENTEST(fc, Feptr, len); - cp = PRIV(ucd_caseless_sets) + Lpropvalue; - for (;;) - { - if (fc < *cp) - { if (Lctype == OP_NOTPROP) break; else goto GOT_MAX; } - if (fc == *cp++) - { if (Lctype == OP_NOTPROP) goto GOT_MAX; else break; } - } - Feptr += len; - } - GOT_MAX: - break; - - case PT_UCNC: - for (i = Lmin; i < Lmax; i++) - { - int len = 1; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - GETCHARLENTEST(fc, Feptr, len); - if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || - fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || - fc >= 0xe000) == (Lctype == OP_NOTPROP)) - break; - Feptr += len; - } - break; - - default: - return PCRE2_ERROR_INTERNAL; - } - - /* Feptr is now past the end of the maximum run */ - - if (reptype == REPTYPE_POS) continue; /* No backtracking */ - - /* After \C in UTF mode, Lstart_eptr might be in the middle of a 
- Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't - go too far. */ - - for(;;) - { - if (Feptr <= Lstart_eptr) break; - RMATCH(Fecode, RM222); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - Feptr--; - if (utf) BACKCHAR(Feptr); - } - } - - /* Match extended Unicode grapheme clusters. We will get here only if the - support is in the binary; otherwise a compile-time error occurs. */ - - else if (Lctype == OP_EXTUNI) - { - for (i = Lmin; i < Lmax; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - else - { - GETCHARINCTEST(fc, Feptr); - Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, - utf, NULL); - } - CHECK_PARTIAL(); - } - - /* Feptr is now past the end of the maximum run */ - - if (reptype == REPTYPE_POS) continue; /* No backtracking */ - - /* We use <= Lstart_eptr rather than == Lstart_eptr to detect the start - of the run while backtracking because the use of \C in UTF mode can - cause BACKCHAR to move back past Lstart_eptr. This is just palliative; - the use of \C in UTF mode is fraught with danger. */ - - for(;;) - { - int lgb, rgb; - PCRE2_SPTR fptr; - - if (Feptr <= Lstart_eptr) break; /* At start of char run */ - RMATCH(Fecode, RM220); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - - /* Backtracking over an extended grapheme cluster involves inspecting - the previous two characters (if present) to see if a break is - permitted between them. 
*/ - - Feptr--; - if (!utf) fc = *Feptr; else - { - BACKCHAR(Feptr); - GETCHAR(fc, Feptr); - } - rgb = UCD_GRAPHBREAK(fc); - - for (;;) - { - if (Feptr <= Lstart_eptr) break; /* At start of char run */ - fptr = Feptr - 1; - if (!utf) fc = *fptr; else - { - BACKCHAR(fptr); - GETCHAR(fc, fptr); - } - lgb = UCD_GRAPHBREAK(fc); - if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; - Feptr = fptr; - rgb = lgb; - } - } - } - - else -#endif /* SUPPORT_UNICODE */ - -#ifdef SUPPORT_UNICODE - if (utf) - { - switch(Lctype) - { - case OP_ANY: - for (i = Lmin; i < Lmax; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - if (IS_NEWLINE(Feptr)) break; - if (mb->partial != 0 && /* Take care with CRLF partial */ - Feptr + 1 >= mb->end_subject && - NLBLOCK->nltype == NLTYPE_FIXED && - NLBLOCK->nllen == 2 && - UCHAR21(Feptr) == NLBLOCK->nl[0]) - { - mb->hitend = TRUE; - if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; - } - Feptr++; - ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); - } - break; - - case OP_ALLANY: - if (Lmax < UINT32_MAX) - { - for (i = Lmin; i < Lmax; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - Feptr++; - ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); - } - } - else - { - Feptr = mb->end_subject; /* Unlimited UTF-8 repeat */ - SCHECK_PARTIAL(); - } - break; - - /* The "byte" (i.e. 
"code unit") case is the same as non-UTF */ - - case OP_ANYBYTE: - fc = Lmax - Lmin; - if (fc > (uint32_t)(mb->end_subject - Feptr)) - { - Feptr = mb->end_subject; - SCHECK_PARTIAL(); - } - else Feptr += fc; - break; - - case OP_ANYNL: - for (i = Lmin; i < Lmax; i++) - { - int len = 1; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - GETCHARLEN(fc, Feptr, len); - if (fc == CHAR_CR) - { - if (++Feptr >= mb->end_subject) break; - if (UCHAR21(Feptr) == CHAR_LF) Feptr++; - } - else - { - if (fc != CHAR_LF && - (mb->bsr_convention == PCRE2_BSR_ANYCRLF || - (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL -#ifndef EBCDIC - && fc != 0x2028 && fc != 0x2029 -#endif /* Not EBCDIC */ - ))) - break; - Feptr += len; - } - } - break; - - case OP_NOT_HSPACE: - case OP_HSPACE: - for (i = Lmin; i < Lmax; i++) - { - BOOL gotspace; - int len = 1; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - GETCHARLEN(fc, Feptr, len); - switch(fc) - { - HSPACE_CASES: gotspace = TRUE; break; - default: gotspace = FALSE; break; - } - if (gotspace == (Lctype == OP_NOT_HSPACE)) break; - Feptr += len; - } - break; - - case OP_NOT_VSPACE: - case OP_VSPACE: - for (i = Lmin; i < Lmax; i++) - { - BOOL gotspace; - int len = 1; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - GETCHARLEN(fc, Feptr, len); - switch(fc) - { - VSPACE_CASES: gotspace = TRUE; break; - default: gotspace = FALSE; break; - } - if (gotspace == (Lctype == OP_NOT_VSPACE)) break; - Feptr += len; - } - break; - - case OP_NOT_DIGIT: - for (i = Lmin; i < Lmax; i++) - { - int len = 1; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - GETCHARLEN(fc, Feptr, len); - if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) break; - Feptr+= len; - } - break; - - case OP_DIGIT: - for (i = Lmin; i < Lmax; i++) - { - int len = 1; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - GETCHARLEN(fc, Feptr, len); - if (fc >= 256 ||(mb->ctypes[fc] & 
ctype_digit) == 0) break; - Feptr+= len; - } - break; - - case OP_NOT_WHITESPACE: - for (i = Lmin; i < Lmax; i++) - { - int len = 1; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - GETCHARLEN(fc, Feptr, len); - if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) break; - Feptr+= len; - } - break; - - case OP_WHITESPACE: - for (i = Lmin; i < Lmax; i++) - { - int len = 1; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - GETCHARLEN(fc, Feptr, len); - if (fc >= 256 ||(mb->ctypes[fc] & ctype_space) == 0) break; - Feptr+= len; - } - break; - - case OP_NOT_WORDCHAR: - for (i = Lmin; i < Lmax; i++) - { - int len = 1; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - GETCHARLEN(fc, Feptr, len); - if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) break; - Feptr+= len; - } - break; - - case OP_WORDCHAR: - for (i = Lmin; i < Lmax; i++) - { - int len = 1; - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - GETCHARLEN(fc, Feptr, len); - if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) break; - Feptr+= len; - } - break; - - default: - return PCRE2_ERROR_INTERNAL; - } - - if (reptype == REPTYPE_POS) continue; /* No backtracking */ - - /* After \C in UTF mode, Lstart_eptr might be in the middle of a - Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't go - too far. 
*/ - - for(;;) - { - if (Feptr <= Lstart_eptr) break; - RMATCH(Fecode, RM221); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - Feptr--; - BACKCHAR(Feptr); - if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && - UCHAR21(Feptr) == CHAR_NL && UCHAR21(Feptr - 1) == CHAR_CR) - Feptr--; - } - } - else -#endif /* SUPPORT_UNICODE */ - - /* Not UTF mode */ - { - switch(Lctype) - { - case OP_ANY: - for (i = Lmin; i < Lmax; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - if (IS_NEWLINE(Feptr)) break; - if (mb->partial != 0 && /* Take care with CRLF partial */ - Feptr + 1 >= mb->end_subject && - NLBLOCK->nltype == NLTYPE_FIXED && - NLBLOCK->nllen == 2 && - *Feptr == NLBLOCK->nl[0]) - { - mb->hitend = TRUE; - if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; - } - Feptr++; - } - break; - - case OP_ALLANY: - case OP_ANYBYTE: - fc = Lmax - Lmin; - if (fc > (uint32_t)(mb->end_subject - Feptr)) - { - Feptr = mb->end_subject; - SCHECK_PARTIAL(); - } - else Feptr += fc; - break; - - case OP_ANYNL: - for (i = Lmin; i < Lmax; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - fc = *Feptr; - if (fc == CHAR_CR) - { - if (++Feptr >= mb->end_subject) break; - if (*Feptr == CHAR_LF) Feptr++; - } - else - { - if (fc != CHAR_LF && (mb->bsr_convention == PCRE2_BSR_ANYCRLF || - (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL -#if PCRE2_CODE_UNIT_WIDTH != 8 - && fc != 0x2028 && fc != 0x2029 -#endif - ))) break; - Feptr++; - } - } - break; - - case OP_NOT_HSPACE: - for (i = Lmin; i < Lmax; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - switch(*Feptr) - { - default: Feptr++; break; - HSPACE_BYTE_CASES: -#if PCRE2_CODE_UNIT_WIDTH != 8 - HSPACE_MULTIBYTE_CASES: -#endif - goto ENDLOOP00; - } - } - ENDLOOP00: - break; - - case OP_HSPACE: - for (i = Lmin; i < Lmax; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - switch(*Feptr) - { - default: goto ENDLOOP01; - HSPACE_BYTE_CASES: -#if 
PCRE2_CODE_UNIT_WIDTH != 8 - HSPACE_MULTIBYTE_CASES: -#endif - Feptr++; break; - } - } - ENDLOOP01: - break; - - case OP_NOT_VSPACE: - for (i = Lmin; i < Lmax; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - switch(*Feptr) - { - default: Feptr++; break; - VSPACE_BYTE_CASES: -#if PCRE2_CODE_UNIT_WIDTH != 8 - VSPACE_MULTIBYTE_CASES: -#endif - goto ENDLOOP02; - } - } - ENDLOOP02: - break; - - case OP_VSPACE: - for (i = Lmin; i < Lmax; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - switch(*Feptr) - { - default: goto ENDLOOP03; - VSPACE_BYTE_CASES: -#if PCRE2_CODE_UNIT_WIDTH != 8 - VSPACE_MULTIBYTE_CASES: -#endif - Feptr++; break; - } - } - ENDLOOP03: - break; - - case OP_NOT_DIGIT: - for (i = Lmin; i < Lmax; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0) - break; - Feptr++; - } - break; - - case OP_DIGIT: - for (i = Lmin; i < Lmax; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0) - break; - Feptr++; - } - break; - - case OP_NOT_WHITESPACE: - for (i = Lmin; i < Lmax; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0) - break; - Feptr++; - } - break; - - case OP_WHITESPACE: - for (i = Lmin; i < Lmax; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0) - break; - Feptr++; - } - break; - - case OP_NOT_WORDCHAR: - for (i = Lmin; i < Lmax; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0) - break; - Feptr++; - } - break; - - case OP_WORDCHAR: - for (i = Lmin; i < Lmax; i++) - { - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - break; - } - 
if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0) - break; - Feptr++; - } - break; - - default: - return PCRE2_ERROR_INTERNAL; - } - - if (reptype == REPTYPE_POS) continue; /* No backtracking */ - - for (;;) - { - if (Feptr == Lstart_eptr) break; - RMATCH(Fecode, RM34); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - Feptr--; - if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && *Feptr == CHAR_LF && - Feptr[-1] == CHAR_CR) Feptr--; - } - } - } - break; /* End of repeat character type processing */ - -#undef Lstart_eptr -#undef Lmin -#undef Lmax -#undef Lctype -#undef Lpropvalue - - - /* ===================================================================== */ - /* Match a back reference, possibly repeatedly. Look past the end of the - item to see if there is repeat information following. The OP_REF and - OP_REFI opcodes are used for a reference to a numbered group or to a - non-duplicated named group. For a duplicated named group, OP_DNREF and - OP_DNREFI are used. In this case we must scan the list of groups to which - the name refers, and use the first one that is set. */ - -#define Lmin F->temp_32[0] -#define Lmax F->temp_32[1] -#define Lcaseless F->temp_32[2] -#define Lstart F->temp_sptr[0] -#define Loffset F->temp_size - - case OP_DNREF: - case OP_DNREFI: - Lcaseless = (Fop == OP_DNREFI); - { - int count = GET2(Fecode, 1+IMM2_SIZE); - PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size; - Fecode += 1 + 2*IMM2_SIZE; - - while (count-- > 0) - { - Loffset = (GET2(slot, 0) << 1) - 2; - if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) break; - slot += mb->name_entry_size; - } - } - goto REF_REPEAT; - - case OP_REF: - case OP_REFI: - Lcaseless = (Fop == OP_REFI); - Loffset = (GET2(Fecode, 1) << 1) - 2; - Fecode += 1 + IMM2_SIZE; - - /* Set up for repetition, or handle the non-repeated case. The maximum and - minimum must be in the heap frame, but as they are short-term values, we - use temporary fields. 
*/ - - REF_REPEAT: - switch (*Fecode) - { - case OP_CRSTAR: - case OP_CRMINSTAR: - case OP_CRPLUS: - case OP_CRMINPLUS: - case OP_CRQUERY: - case OP_CRMINQUERY: - fc = *Fecode++ - OP_CRSTAR; - Lmin = rep_min[fc]; - Lmax = rep_max[fc]; - reptype = rep_typ[fc]; - break; - - case OP_CRRANGE: - case OP_CRMINRANGE: - Lmin = GET2(Fecode, 1); - Lmax = GET2(Fecode, 1 + IMM2_SIZE); - reptype = rep_typ[*Fecode - OP_CRSTAR]; - if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */ - Fecode += 1 + 2 * IMM2_SIZE; - break; - - default: /* No repeat follows */ - { - rrc = match_ref(Loffset, Lcaseless, F, mb, &length); - if (rrc != 0) - { - if (rrc > 0) Feptr = mb->end_subject; /* Partial match */ - CHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - } - Feptr += length; - continue; /* With the main loop */ - } - - /* Handle repeated back references. If a set group has length zero, just - continue with the main loop, because it matches however many times. For an - unset reference, if the minimum is zero, we can also just continue. We can - also continue if PCRE2_MATCH_UNSET_BACKREF is set, because this makes unset - group behave as a zero-length group. For any other unset cases, carrying - on will result in NOMATCH. */ - - if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) - { - if (Fovector[Loffset] == Fovector[Loffset + 1]) continue; - } - else /* Group is not set */ - { - if (Lmin == 0 || (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0) - continue; - } - - /* First, ensure the minimum number of matches are present. */ - - for (i = 1; i <= Lmin; i++) - { - PCRE2_SIZE slength; - rrc = match_ref(Loffset, Lcaseless, F, mb, &slength); - if (rrc != 0) - { - if (rrc > 0) Feptr = mb->end_subject; /* Partial match */ - CHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - Feptr += slength; - } - - /* If min = max, we are done. They are not both allowed to be zero. */ - - if (Lmin == Lmax) continue; - - /* If minimizing, keep trying and advancing the pointer. 
*/ - - if (reptype == REPTYPE_MIN) - { - for (;;) - { - PCRE2_SIZE slength; - RMATCH(Fecode, RM20); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); - rrc = match_ref(Loffset, Lcaseless, F, mb, &slength); - if (rrc != 0) - { - if (rrc > 0) Feptr = mb->end_subject; /* Partial match */ - CHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - Feptr += slength; - } - /* Control never gets here */ - } - - /* If maximizing, find the longest string and work backwards, as long as - the matched lengths for each iteration are the same. */ - - else - { - BOOL samelengths = TRUE; - Lstart = Feptr; /* Starting position */ - Flength = Fovector[Loffset+1] - Fovector[Loffset]; - - for (i = Lmin; i < Lmax; i++) - { - PCRE2_SIZE slength; - rrc = match_ref(Loffset, Lcaseless, F, mb, &slength); - if (rrc != 0) - { - /* Can't use CHECK_PARTIAL because we don't want to update Feptr in - the soft partial matching case. */ - - if (rrc > 0 && mb->partial != 0 && - mb->end_subject > mb->start_used_ptr) - { - mb->hitend = TRUE; - if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; - } - break; - } - - if (slength != Flength) samelengths = FALSE; - Feptr += slength; - } - - /* If the length matched for each repetition is the same as the length of - the captured group, we can easily work backwards. This is the normal - case. However, in caseless UTF-8 mode there are pairs of case-equivalent - characters whose lengths (in terms of code units) differ. However, this - is very rare, so we handle it by re-matching fewer and fewer times. */ - - if (samelengths) - { - while (Feptr >= Lstart) - { - RMATCH(Fecode, RM21); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - Feptr -= Flength; - } - } - - /* The rare case of non-matching lengths. Re-scan the repetition for each - iteration. We know that match_ref() will succeed every time. 
*/ - - else - { - Lmax = i; - for (;;) - { - RMATCH(Fecode, RM22); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Feptr == Lstart) break; /* Failed after minimal repetition */ - Feptr = Lstart; - Lmax--; - for (i = Lmin; i < Lmax; i++) - { - PCRE2_SIZE slength; - (void)match_ref(Loffset, Lcaseless, F, mb, &slength); - Feptr += slength; - } - } - } - - RRETURN(MATCH_NOMATCH); - } - /* Control never gets here */ - -#undef Lcaseless -#undef Lmin -#undef Lmax -#undef Lstart -#undef Loffset - - - -/* ========================================================================= */ -/* Opcodes for the start of various parenthesized items */ -/* ========================================================================= */ - - /* In all cases, if the result of RMATCH() is MATCH_THEN, check whether the - (*THEN) is within the current branch by comparing the address of OP_THEN - that is passed back with the end of the branch. If (*THEN) is within the - current branch, and the branch is one of two or more alternatives (it - either starts or ends with OP_ALT), we have reached the limit of THEN's - action, so convert the return code to NOMATCH, which will cause normal - backtracking to happen from now on. Otherwise, THEN is passed back to an - outer alternative. This implements Perl's treatment of parenthesized - groups, where a group not containing | does not affect the current - alternative, that is, (X) is NOT the same as (X|(*F)). */ - - - /* ===================================================================== */ - /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a non-possessive - bracket group, indicating that it may occur zero times. It may repeat - infinitely, or not at all - i.e. it could be ()* or ()? or even (){0} in - the pattern. Brackets with fixed upper repeat limits are compiled as a - number of copies, with the optional ones preceded by BRAZERO or BRAMINZERO. - Possessive groups with possible zero repeats are preceded by BRAPOSZERO. 
*/ - -#define Lnext_ecode F->temp_sptr[0] - - case OP_BRAZERO: - Lnext_ecode = Fecode + 1; - RMATCH(Lnext_ecode, RM9); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT); - Fecode = Lnext_ecode + 1 + LINK_SIZE; - break; - - case OP_BRAMINZERO: - Lnext_ecode = Fecode + 1; - do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT); - RMATCH(Lnext_ecode + 1 + LINK_SIZE, RM10); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - Fecode++; - break; - -#undef Lnext_ecode - - case OP_SKIPZERO: - Fecode++; - do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT); - Fecode += 1 + LINK_SIZE; - break; - - - /* ===================================================================== */ - /* Handle possessive brackets with an unlimited repeat. The end of these - brackets will always be OP_KETRPOS, which returns MATCH_KETRPOS without - going further in the pattern. */ - -#define Lframe_type F->temp_32[0] -#define Lmatched_once F->temp_32[1] -#define Lzero_allowed F->temp_32[2] -#define Lstart_eptr F->temp_sptr[0] -#define Lstart_group F->temp_sptr[1] - - case OP_BRAPOSZERO: - Lzero_allowed = TRUE; /* Zero repeat is allowed */ - Fecode += 1; - if (*Fecode == OP_CBRAPOS || *Fecode == OP_SCBRAPOS) - goto POSSESSIVE_CAPTURE; - goto POSSESSIVE_NON_CAPTURE; - - case OP_BRAPOS: - case OP_SBRAPOS: - Lzero_allowed = FALSE; /* Zero repeat not allowed */ - - POSSESSIVE_NON_CAPTURE: - Lframe_type = GF_NOCAPTURE; /* Remembered frame type */ - goto POSSESSIVE_GROUP; - - case OP_CBRAPOS: - case OP_SCBRAPOS: - Lzero_allowed = FALSE; /* Zero repeat not allowed */ - - POSSESSIVE_CAPTURE: - number = GET2(Fecode, 1+LINK_SIZE); - Lframe_type = GF_CAPTURE | number; /* Remembered frame type */ - - POSSESSIVE_GROUP: - Lmatched_once = FALSE; /* Never matched */ - Lstart_group = Fecode; /* Start of this group */ - - for (;;) - { - Lstart_eptr = Feptr; /* Position at group start */ - group_frame_type = Lframe_type; - RMATCH(Fecode + 
PRIV(OP_lengths)[*Fecode], RM8); - if (rrc == MATCH_KETRPOS) - { - Lmatched_once = TRUE; /* Matched at least once */ - if (Feptr == Lstart_eptr) /* Empty match; skip to end */ - { - do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); - break; - } - - Fecode = Lstart_group; - continue; - } - - /* See comment above about handling THEN. */ - - if (rrc == MATCH_THEN) - { - PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1); - if (mb->verb_ecode_ptr < next_ecode && - (*Fecode == OP_ALT || *next_ecode == OP_ALT)) - rrc = MATCH_NOMATCH; - } - - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - Fecode += GET(Fecode, 1); - if (*Fecode != OP_ALT) break; - } - - /* Success if matched something or zero repeat allowed */ - - if (Lmatched_once || Lzero_allowed) - { - Fecode += 1 + LINK_SIZE; - break; - } - - RRETURN(MATCH_NOMATCH); - -#undef Lmatched_once -#undef Lzero_allowed -#undef Lframe_type -#undef Lstart_eptr -#undef Lstart_group - - - /* ===================================================================== */ - /* Handle non-capturing brackets that cannot match an empty string. When we - get to the final alternative within the brackets, as long as there are no - THEN's in the pattern, we can optimize by not recording a new backtracking - point. (Ideally we should test for a THEN within this group, but we don't - have that information.) Don't do this if we are at the very top level, - however, because that would make handling assertions and once-only brackets - messier when there is nothing to go back to. */ - -#define Lframe_type F->temp_32[0] /* Set for all that use GROUPLOOP */ -#define Lnext_branch F->temp_sptr[0] /* Used only in OP_BRA handling */ - - case OP_BRA: - if (mb->hasthen || Frdepth == 0) - { - Lframe_type = 0; - goto GROUPLOOP; - } - - for (;;) - { - Lnext_branch = Fecode + GET(Fecode, 1); - if (*Lnext_branch != OP_ALT) break; - - /* This is never the final branch. 
We do not need to test for MATCH_THEN - here because this code is not used when there is a THEN in the pattern. */ - - RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM1); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - Fecode = Lnext_branch; - } - - /* Hit the start of the final branch. Continue at this level. */ - - Fecode += PRIV(OP_lengths)[*Fecode]; - break; - -#undef Lnext_branch - - - /* ===================================================================== */ - /* Handle a capturing bracket, other than those that are possessive with an - unlimited repeat. */ - - case OP_CBRA: - case OP_SCBRA: - Lframe_type = GF_CAPTURE | GET2(Fecode, 1+LINK_SIZE); - goto GROUPLOOP; - - - /* ===================================================================== */ - /* Atomic groups and non-capturing brackets that can match an empty string - must record a backtracking point and also set up a chained frame. */ - - case OP_ONCE: - case OP_SCRIPT_RUN: - case OP_SBRA: - Lframe_type = GF_NOCAPTURE | Fop; - - GROUPLOOP: - for (;;) - { - group_frame_type = Lframe_type; - RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM2); - if (rrc == MATCH_THEN) - { - PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1); - if (mb->verb_ecode_ptr < next_ecode && - (*Fecode == OP_ALT || *next_ecode == OP_ALT)) - rrc = MATCH_NOMATCH; - } - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - Fecode += GET(Fecode, 1); - if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH); - } - /* Control never reaches here. */ - -#undef Lframe_type - - - /* ===================================================================== */ - /* Recursion either matches the current regex, or some subexpression. The - offset data is the offset to the starting bracket from the start of the - whole pattern. (This is so that it works from duplicated subpatterns.) */ - -#define Lframe_type F->temp_32[0] -#define Lstart_branch F->temp_sptr[0] - - case OP_RECURSE: - bracode = mb->start_code + GET(Fecode, 1); - number = (bracode == mb->start_code)? 
0 : GET2(bracode, 1 + LINK_SIZE); - - /* If we are already in a recursion, check for repeating the same one - without advancing the subject pointer. This should catch convoluted mutual - recursions. (Some simple cases are caught at compile time.) */ - - if (Fcurrent_recurse != RECURSE_UNSET) - { - offset = Flast_group_offset; - while (offset != PCRE2_UNSET) - { - N = (heapframe *)((char *)mb->match_frames + offset); - P = (heapframe *)((char *)N - frame_size); - if (N->group_frame_type == (GF_RECURSE | number)) - { - if (Feptr == P->eptr) return PCRE2_ERROR_RECURSELOOP; - break; - } - offset = P->last_group_offset; - } - } - - /* Now run the recursion, branch by branch. */ - - Lstart_branch = bracode; - Lframe_type = GF_RECURSE | number; - - for (;;) - { - PCRE2_SPTR next_ecode; - - group_frame_type = Lframe_type; - RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM11); - next_ecode = Lstart_branch + GET(Lstart_branch,1); - - /* Handle backtracking verbs, which are defined in a range that can - easily be tested for. PCRE does not allow THEN, SKIP, PRUNE or COMMIT to - escape beyond a recursion; they cause a NOMATCH for the entire recursion. - - When one of these verbs triggers, the current recursion group number is - recorded. If it matches the recursion we are processing, the verb - happened within the recursion and we must deal with it. Otherwise it must - have happened after the recursion completed, and so has to be passed - back. See comment above about handling THEN. */ - - if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX && - mb->verb_current_recurse == (Lframe_type ^ GF_RECURSE)) - { - if (rrc == MATCH_THEN && mb->verb_ecode_ptr < next_ecode && - (*Lstart_branch == OP_ALT || *next_ecode == OP_ALT)) - rrc = MATCH_NOMATCH; - else RRETURN(MATCH_NOMATCH); - } - - /* Note that carrying on after (*ACCEPT) in a recursion is handled in the - OP_ACCEPT code. Nothing needs to be done here. 
*/ - - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - Lstart_branch = next_ecode; - if (*Lstart_branch != OP_ALT) RRETURN(MATCH_NOMATCH); - } - /* Control never reaches here. */ - -#undef Lframe_type -#undef Lstart_branch - - - /* ===================================================================== */ - /* Positive assertions are like other groups except that PCRE doesn't allow - the effect of (*THEN) to escape beyond an assertion; it is therefore - treated as NOMATCH. (*ACCEPT) is treated as successful assertion, with its - captures and mark retained. Any other return is an error. */ - -#define Lframe_type F->temp_32[0] - - case OP_ASSERT: - case OP_ASSERTBACK: - case OP_ASSERT_NA: - case OP_ASSERTBACK_NA: - Lframe_type = GF_NOCAPTURE | Fop; - for (;;) - { - group_frame_type = Lframe_type; - RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM3); - if (rrc == MATCH_ACCEPT) - { - memcpy(Fovector, - (char *)assert_accept_frame + offsetof(heapframe, ovector), - assert_accept_frame->offset_top * sizeof(PCRE2_SIZE)); - Foffset_top = assert_accept_frame->offset_top; - Fmark = assert_accept_frame->mark; - break; - } - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); - Fecode += GET(Fecode, 1); - if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH); - } - - do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); - Fecode += 1 + LINK_SIZE; - break; - -#undef Lframe_type - - - /* ===================================================================== */ - /* Handle negative assertions. Loop for each non-matching branch as for - positive assertions. */ - -#define Lframe_type F->temp_32[0] - - case OP_ASSERT_NOT: - case OP_ASSERTBACK_NOT: - Lframe_type = GF_NOCAPTURE | Fop; - - for (;;) - { - group_frame_type = Lframe_type; - RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM4); - switch(rrc) - { - case MATCH_ACCEPT: /* Assertion matched, therefore it fails. */ - case MATCH_MATCH: - RRETURN (MATCH_NOMATCH); - - case MATCH_NOMATCH: /* Branch failed, try next if present. 
*/ - case MATCH_THEN: - Fecode += GET(Fecode, 1); - if (*Fecode != OP_ALT) goto ASSERT_NOT_FAILED; - break; - - case MATCH_COMMIT: /* Assertion forced to fail, therefore continue. */ - case MATCH_SKIP: - case MATCH_PRUNE: - do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); - goto ASSERT_NOT_FAILED; - - default: /* Pass back any other return */ - RRETURN(rrc); - } - } - - /* None of the branches have matched or there was a backtrack to (*COMMIT), - (*SKIP), (*PRUNE), or (*THEN) in the last branch. This is success for a - negative assertion, so carry on. */ - - ASSERT_NOT_FAILED: - Fecode += 1 + LINK_SIZE; - break; - -#undef Lframe_type - - - /* ===================================================================== */ - /* The callout item calls an external function, if one is provided, passing - details of the match so far. This is mainly for debugging, though the - function is able to force a failure. */ - - case OP_CALLOUT: - case OP_CALLOUT_STR: - rrc = do_callout(F, mb, &length); - if (rrc > 0) RRETURN(MATCH_NOMATCH); - if (rrc < 0) RRETURN(rrc); - Fecode += length; - break; - - - /* ===================================================================== */ - /* Conditional group: compilation checked that there are no more than two - branches. If the condition is false, skipping the first branch takes us - past the end of the item if there is only one branch, but that's exactly - what we want. */ - - case OP_COND: - case OP_SCOND: - - /* The variable Flength will be added to Fecode when the condition is - false, to get to the second branch. Setting it to the offset to the ALT or - KET, then incrementing Fecode achieves this effect. However, if the second - branch is non-existent, we must point to the KET so that the end of the - group is correctly processed. We now have Fecode pointing to the condition - or callout. 
*/ - - Flength = GET(Fecode, 1); /* Offset to the second branch */ - if (Fecode[Flength] != OP_ALT) Flength -= 1 + LINK_SIZE; - Fecode += 1 + LINK_SIZE; /* From this opcode */ - - /* Because of the way auto-callout works during compile, a callout item is - inserted between OP_COND and an assertion condition. Such a callout can - also be inserted manually. */ - - if (*Fecode == OP_CALLOUT || *Fecode == OP_CALLOUT_STR) - { - rrc = do_callout(F, mb, &length); - if (rrc > 0) RRETURN(MATCH_NOMATCH); - if (rrc < 0) RRETURN(rrc); - - /* Advance Fecode past the callout, so it now points to the condition. We - must adjust Flength so that the value of Fecode+Flength is unchanged. */ - - Fecode += length; - Flength -= length; - } - - /* Test the various possible conditions */ - - condition = FALSE; - switch(*Fecode) - { - case OP_RREF: /* Group recursion test */ - if (Fcurrent_recurse != RECURSE_UNSET) - { - number = GET2(Fecode, 1); - condition = (number == RREF_ANY || number == Fcurrent_recurse); - } - break; - - case OP_DNRREF: /* Duplicate named group recursion test */ - if (Fcurrent_recurse != RECURSE_UNSET) - { - int count = GET2(Fecode, 1 + IMM2_SIZE); - PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size; - while (count-- > 0) - { - number = GET2(slot, 0); - condition = number == Fcurrent_recurse; - if (condition) break; - slot += mb->name_entry_size; - } - } - break; - - case OP_CREF: /* Numbered group used test */ - offset = (GET2(Fecode, 1) << 1) - 2; /* Doubled ref number */ - condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET; - break; - - case OP_DNCREF: /* Duplicate named group used test */ - { - int count = GET2(Fecode, 1 + IMM2_SIZE); - PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size; - while (count-- > 0) - { - offset = (GET2(slot, 0) << 1) - 2; - condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET; - if (condition) break; - slot += mb->name_entry_size; - } - } - break; - - case 
OP_FALSE: - case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */ - break; - - case OP_TRUE: - condition = TRUE; - break; - - /* The condition is an assertion. Run code similar to the assertion code - above. */ - -#define Lpositive F->temp_32[0] -#define Lstart_branch F->temp_sptr[0] - - default: - Lpositive = (*Fecode == OP_ASSERT || *Fecode == OP_ASSERTBACK); - Lstart_branch = Fecode; - - for (;;) - { - group_frame_type = GF_CONDASSERT | *Fecode; - RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM5); - - switch(rrc) - { - case MATCH_ACCEPT: /* Save captures */ - memcpy(Fovector, - (char *)assert_accept_frame + offsetof(heapframe, ovector), - assert_accept_frame->offset_top * sizeof(PCRE2_SIZE)); - Foffset_top = assert_accept_frame->offset_top; - - /* Fall through */ - /* In the case of a match, the captures have already been put into - the current frame. */ - - case MATCH_MATCH: - condition = Lpositive; /* TRUE for positive assertion */ - break; - - /* PCRE doesn't allow the effect of (*THEN) to escape beyond an - assertion; it is therefore always treated as NOMATCH. */ - - case MATCH_NOMATCH: - case MATCH_THEN: - Lstart_branch += GET(Lstart_branch, 1); - if (*Lstart_branch == OP_ALT) continue; /* Try next branch */ - condition = !Lpositive; /* TRUE for negative assertion */ - break; - - /* These force no match without checking other branches. */ - - case MATCH_COMMIT: - case MATCH_SKIP: - case MATCH_PRUNE: - condition = !Lpositive; - break; - - default: - RRETURN(rrc); - } - break; /* Out of the branch loop */ - } - - /* If the condition is true, find the end of the assertion so that - advancing past it gets us to the start of the first branch. */ - - if (condition) - { - do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); - } - break; /* End of assertion condition */ - } - -#undef Lpositive -#undef Lstart_branch - - /* Choose branch according to the condition. */ - - Fecode += condition? 
PRIV(OP_lengths)[*Fecode] : Flength; - - /* If the opcode is OP_SCOND it means we are at a repeated conditional - group that might match an empty string. We must therefore descend a level - so that the start is remembered for checking. For OP_COND we can just - continue at this level. */ - - if (Fop == OP_SCOND) - { - group_frame_type = GF_NOCAPTURE | Fop; - RMATCH(Fecode, RM35); - RRETURN(rrc); - } - break; - - - -/* ========================================================================= */ -/* End of start of parenthesis opcodes */ -/* ========================================================================= */ - - - /* ===================================================================== */ - /* Move the subject pointer back. This occurs only at the start of each - branch of a lookbehind assertion. If we are too close to the start to move - back, fail. When working with UTF-8 we move back a number of characters, - not bytes. */ - - case OP_REVERSE: - number = GET(Fecode, 1); -#ifdef SUPPORT_UNICODE - if (utf) - { - while (number-- > 0) - { - if (Feptr <= mb->check_subject) RRETURN(MATCH_NOMATCH); - Feptr--; - BACKCHAR(Feptr); - } - } - else -#endif - - /* No UTF-8 support, or not in UTF-8 mode: count is code unit count */ - - { - if ((ptrdiff_t)number > Feptr - mb->start_subject) RRETURN(MATCH_NOMATCH); - Feptr -= number; - } - - /* Save the earliest consulted character, then skip to next opcode */ - - if (Feptr < mb->start_used_ptr) mb->start_used_ptr = Feptr; - Fecode += 1 + LINK_SIZE; - break; - - - /* ===================================================================== */ - /* An alternation is the end of a branch; scan along to find the end of the - bracketed group. */ - - case OP_ALT: - do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT); - break; - - - /* ===================================================================== */ - /* The end of a parenthesized group. 
For all but OP_BRA and OP_COND, the - starting frame was added to the chained frames in order to remember the - starting subject position for the group. */ - - case OP_KET: - case OP_KETRMIN: - case OP_KETRMAX: - case OP_KETRPOS: - - bracode = Fecode - GET(Fecode, 1); - - /* Point N to the frame at the start of the most recent group. - Remember the subject pointer at the start of the group. */ - - if (*bracode != OP_BRA && *bracode != OP_COND) - { - N = (heapframe *)((char *)mb->match_frames + Flast_group_offset); - P = (heapframe *)((char *)N - frame_size); - Flast_group_offset = P->last_group_offset; - -#ifdef DEBUG_SHOW_RMATCH - fprintf(stderr, "++ KET for frame=%d type=%x prev char offset=%lu\n", - N->rdepth, N->group_frame_type, - (char *)P->eptr - (char *)mb->start_subject); -#endif - - /* If we are at the end of an assertion that is a condition, return a - match, discarding any intermediate backtracking points. Copy back the - mark setting and the captures into the frame before N so that they are - set on return. Doing this for all assertions, both positive and negative, - seems to match what Perl does. */ - - if (GF_IDMASK(N->group_frame_type) == GF_CONDASSERT) - { - memcpy((char *)P + offsetof(heapframe, ovector), Fovector, - Foffset_top * sizeof(PCRE2_SIZE)); - P->offset_top = Foffset_top; - P->mark = Fmark; - Fback_frame = (char *)F - (char *)P; - RRETURN(MATCH_MATCH); - } - } - else P = NULL; /* Indicates starting frame not recorded */ - - /* The group was not a conditional assertion. */ - - switch (*bracode) - { - case OP_BRA: /* No need to do anything for these */ - case OP_COND: - case OP_SCOND: - break; - - /* Non-atomic positive assertions are like OP_BRA, except that the - subject pointer must be put back to where it was at the start of the - assertion. 
*/ - - case OP_ASSERT_NA: - case OP_ASSERTBACK_NA: - if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; - Feptr = P->eptr; - break; - - /* Atomic positive assertions are like OP_ONCE, except that in addition - the subject pointer must be put back to where it was at the start of the - assertion. */ - - case OP_ASSERT: - case OP_ASSERTBACK: - if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; - Feptr = P->eptr; - /* Fall through */ - - /* For an atomic group, discard internal backtracking points. We must - also ensure that any remaining branches within the top-level of the group - are not tried. Do this by adjusting the code pointer within the backtrack - frame so that it points to the final branch. */ - - case OP_ONCE: - Fback_frame = ((char *)F - (char *)P); - for (;;) - { - uint32_t y = GET(P->ecode,1); - if ((P->ecode)[y] != OP_ALT) break; - P->ecode += y; - } - break; - - /* A matching negative assertion returns MATCH, which is turned into - NOMATCH at the assertion level. */ - - case OP_ASSERT_NOT: - case OP_ASSERTBACK_NOT: - RRETURN(MATCH_MATCH); - - /* At the end of a script run, apply the script-checking rules. This code - will never by exercised if Unicode support it not compiled, because in - that environment script runs cause an error at compile time. */ - - case OP_SCRIPT_RUN: - if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH); - break; - - /* Whole-pattern recursion is coded as a recurse into group 0, so it - won't be picked up here. Instead, we catch it when the OP_END is reached. - Other recursion is handled here. */ - - case OP_CBRA: - case OP_CBRAPOS: - case OP_SCBRA: - case OP_SCBRAPOS: - number = GET2(bracode, 1+LINK_SIZE); - - /* Handle a recursively called group. We reinstate the previous set of - captures and then carry on after the recursion call. 
*/ - - if (Fcurrent_recurse == number) - { - P = (heapframe *)((char *)N - frame_size); - memcpy((char *)F + offsetof(heapframe, ovector), P->ovector, - P->offset_top * sizeof(PCRE2_SIZE)); - Foffset_top = P->offset_top; - Fcapture_last = P->capture_last; - Fcurrent_recurse = P->current_recurse; - Fecode = P->ecode + 1 + LINK_SIZE; - continue; /* With next opcode */ - } - - /* Deal with actual capturing. */ - - offset = (number << 1) - 2; - Fcapture_last = number; - Fovector[offset] = P->eptr - mb->start_subject; - Fovector[offset+1] = Feptr - mb->start_subject; - if (offset >= Foffset_top) Foffset_top = offset + 2; - break; - } /* End actions relating to the starting opcode */ - - /* OP_KETRPOS is a possessive repeating ket. Remember the current position, - and return the MATCH_KETRPOS. This makes it possible to do the repeats one - at a time from the outer level. This must precede the empty string test - - in this case that test is done at the outer level. */ - - if (*Fecode == OP_KETRPOS) - { - memcpy((char *)P + offsetof(heapframe, eptr), - (char *)F + offsetof(heapframe, eptr), - frame_copy_size); - RRETURN(MATCH_KETRPOS); - } - - /* Handle the different kinds of closing brackets. A non-repeating ket - needs no special action, just continuing at this level. This also happens - for the repeating kets if the group matched no characters, in order to - forcibly break infinite loops. Otherwise, the repeating kets try the rest - of the pattern or restart from the preceding bracket, in the appropriate - order. 
*/ - - if (Fop != OP_KET && (P == NULL || Feptr != P->eptr)) - { - if (Fop == OP_KETRMIN) - { - RMATCH(Fecode + 1 + LINK_SIZE, RM6); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - Fecode -= GET(Fecode, 1); - break; /* End of ket processing */ - } - - /* Repeat the maximum number of times (KETRMAX) */ - - RMATCH(bracode, RM7); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - } - - /* Carry on at this level for a non-repeating ket, or after matching an - empty string, or after repeating for a maximum number of times. */ - - Fecode += 1 + LINK_SIZE; - break; - - - /* ===================================================================== */ - /* Start and end of line assertions, not multiline mode. */ - - case OP_CIRC: /* Start of line, unless PCRE2_NOTBOL is set. */ - if (Feptr != mb->start_subject || (mb->moptions & PCRE2_NOTBOL) != 0) - RRETURN(MATCH_NOMATCH); - Fecode++; - break; - - case OP_SOD: /* Unconditional start of subject */ - if (Feptr != mb->start_subject) RRETURN(MATCH_NOMATCH); - Fecode++; - break; - - /* When PCRE2_NOTEOL is unset, assert before the subject end, or a - terminating newline unless PCRE2_DOLLAR_ENDONLY is set. 
*/ - - case OP_DOLL: - if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH); - if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS; - - /* Fall through */ - /* Unconditional end of subject assertion (\z) */ - - case OP_EOD: - if (Feptr < mb->end_subject) RRETURN(MATCH_NOMATCH); - if (mb->partial != 0) - { - mb->hitend = TRUE; - if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; - } - Fecode++; - break; - - /* End of subject or ending \n assertion (\Z) */ - - case OP_EODN: - ASSERT_NL_OR_EOS: - if (Feptr < mb->end_subject && - (!IS_NEWLINE(Feptr) || Feptr != mb->end_subject - mb->nllen)) - { - if (mb->partial != 0 && - Feptr + 1 >= mb->end_subject && - NLBLOCK->nltype == NLTYPE_FIXED && - NLBLOCK->nllen == 2 && - UCHAR21TEST(Feptr) == NLBLOCK->nl[0]) - { - mb->hitend = TRUE; - if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; - } - RRETURN(MATCH_NOMATCH); - } - - /* Either at end of string or \n before end. */ - - if (mb->partial != 0) - { - mb->hitend = TRUE; - if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; - } - Fecode++; - break; - - - /* ===================================================================== */ - /* Start and end of line assertions, multiline mode. */ - - /* Start of subject unless notbol, or after any newline except for one at - the very end, unless PCRE2_ALT_CIRCUMFLEX is set. */ - - case OP_CIRCM: - if ((mb->moptions & PCRE2_NOTBOL) != 0 && Feptr == mb->start_subject) - RRETURN(MATCH_NOMATCH); - if (Feptr != mb->start_subject && - ((Feptr == mb->end_subject && - (mb->poptions & PCRE2_ALT_CIRCUMFLEX) == 0) || - !WAS_NEWLINE(Feptr))) - RRETURN(MATCH_NOMATCH); - Fecode++; - break; - - /* Assert before any newline, or before end of subject unless noteol is - set. 
*/ - - case OP_DOLLM: - if (Feptr < mb->end_subject) - { - if (!IS_NEWLINE(Feptr)) - { - if (mb->partial != 0 && - Feptr + 1 >= mb->end_subject && - NLBLOCK->nltype == NLTYPE_FIXED && - NLBLOCK->nllen == 2 && - UCHAR21TEST(Feptr) == NLBLOCK->nl[0]) - { - mb->hitend = TRUE; - if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; - } - RRETURN(MATCH_NOMATCH); - } - } - else - { - if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH); - SCHECK_PARTIAL(); - } - Fecode++; - break; - - - /* ===================================================================== */ - /* Start of match assertion */ - - case OP_SOM: - if (Feptr != mb->start_subject + mb->start_offset) RRETURN(MATCH_NOMATCH); - Fecode++; - break; - - - /* ===================================================================== */ - /* Reset the start of match point */ - - case OP_SET_SOM: - Fstart_match = Feptr; - Fecode++; - break; - - - /* ===================================================================== */ - /* Word boundary assertions. Find out if the previous and current - characters are "word" characters. It takes a bit more work in UTF mode. - Characters > 255 are assumed to be "non-word" characters when PCRE2_UCP is - not set. When it is set, use Unicode properties if available, even when not - in UTF mode. Remember the earliest and latest consulted characters. 
*/ - - case OP_NOT_WORD_BOUNDARY: - case OP_WORD_BOUNDARY: - if (Feptr == mb->check_subject) prev_is_word = FALSE; else - { - PCRE2_SPTR lastptr = Feptr - 1; -#ifdef SUPPORT_UNICODE - if (utf) - { - BACKCHAR(lastptr); - GETCHAR(fc, lastptr); - } - else -#endif /* SUPPORT_UNICODE */ - fc = *lastptr; - if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr; -#ifdef SUPPORT_UNICODE - if ((mb->poptions & PCRE2_UCP) != 0) - { - if (fc == '_') prev_is_word = TRUE; else - { - int cat = UCD_CATEGORY(fc); - prev_is_word = (cat == ucp_L || cat == ucp_N); - } - } - else -#endif /* SUPPORT_UNICODE */ - prev_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0; - } - - /* Get status of next character */ - - if (Feptr >= mb->end_subject) - { - SCHECK_PARTIAL(); - cur_is_word = FALSE; - } - else - { - PCRE2_SPTR nextptr = Feptr + 1; -#ifdef SUPPORT_UNICODE - if (utf) - { - FORWARDCHARTEST(nextptr, mb->end_subject); - GETCHAR(fc, Feptr); - } - else -#endif /* SUPPORT_UNICODE */ - fc = *Feptr; - if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr; -#ifdef SUPPORT_UNICODE - if ((mb->poptions & PCRE2_UCP) != 0) - { - if (fc == '_') cur_is_word = TRUE; else - { - int cat = UCD_CATEGORY(fc); - cur_is_word = (cat == ucp_L || cat == ucp_N); - } - } - else -#endif /* SUPPORT_UNICODE */ - cur_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0; - } - - /* Now see if the situation is what we want */ - - if ((*Fecode++ == OP_WORD_BOUNDARY)? - cur_is_word == prev_is_word : cur_is_word != prev_is_word) - RRETURN(MATCH_NOMATCH); - break; - - - /* ===================================================================== */ - /* Backtracking (*VERB)s, with and without arguments. Note that if the - pattern is successfully matched, we do not come back from RMATCH. 
*/ - - case OP_MARK: - Fmark = mb->nomatch_mark = Fecode + 2; - RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM12); - - /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an - argument, and we must check whether that argument matches this MARK's - argument. It is passed back in mb->verb_skip_ptr. If it does match, we - return MATCH_SKIP with mb->verb_skip_ptr now pointing to the subject - position that corresponds to this mark. Otherwise, pass back the return - code unaltered. */ - - if (rrc == MATCH_SKIP_ARG && - PRIV(strcmp)(Fecode + 2, mb->verb_skip_ptr) == 0) - { - mb->verb_skip_ptr = Feptr; /* Pass back current position */ - RRETURN(MATCH_SKIP); - } - RRETURN(rrc); - - case OP_FAIL: - RRETURN(MATCH_NOMATCH); - - /* Record the current recursing group number in mb->verb_current_recurse - when a backtracking return such as MATCH_COMMIT is given. This enables the - recurse processing to catch verbs from within the recursion. */ - - case OP_COMMIT: - RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM13); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - mb->verb_current_recurse = Fcurrent_recurse; - RRETURN(MATCH_COMMIT); - - case OP_COMMIT_ARG: - Fmark = mb->nomatch_mark = Fecode + 2; - RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM36); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - mb->verb_current_recurse = Fcurrent_recurse; - RRETURN(MATCH_COMMIT); - - case OP_PRUNE: - RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM14); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - mb->verb_current_recurse = Fcurrent_recurse; - RRETURN(MATCH_PRUNE); - - case OP_PRUNE_ARG: - Fmark = mb->nomatch_mark = Fecode + 2; - RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM15); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - mb->verb_current_recurse = Fcurrent_recurse; - RRETURN(MATCH_PRUNE); - - case OP_SKIP: - RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM16); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - mb->verb_skip_ptr = Feptr; /* Pass back current position */ 
- mb->verb_current_recurse = Fcurrent_recurse; - RRETURN(MATCH_SKIP); - - /* Note that, for Perl compatibility, SKIP with an argument does NOT set - nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was - not a matching mark, we have to re-run the match, ignoring the SKIP_ARG - that failed and any that precede it (either they also failed, or were not - triggered). To do this, we maintain a count of executed SKIP_ARGs. If a - SKIP_ARG gets to top level, the match is re-run with mb->ignore_skip_arg - set to the count of the one that failed. */ - - case OP_SKIP_ARG: - mb->skip_arg_count++; - if (mb->skip_arg_count <= mb->ignore_skip_arg) - { - Fecode += PRIV(OP_lengths)[*Fecode] + Fecode[1]; - break; - } - RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM17); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - - /* Pass back the current skip name and return the special MATCH_SKIP_ARG - return code. This will either be caught by a matching MARK, or get to the - top, where it causes a rematch with mb->ignore_skip_arg set to the value of - mb->skip_arg_count. */ - - mb->verb_skip_ptr = Fecode + 2; - mb->verb_current_recurse = Fcurrent_recurse; - RRETURN(MATCH_SKIP_ARG); - - /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that - the branch in which it occurs can be determined. */ - - case OP_THEN: - RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM18); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - mb->verb_ecode_ptr = Fecode; - mb->verb_current_recurse = Fcurrent_recurse; - RRETURN(MATCH_THEN); - - case OP_THEN_ARG: - Fmark = mb->nomatch_mark = Fecode + 2; - RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM19); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - mb->verb_ecode_ptr = Fecode; - mb->verb_current_recurse = Fcurrent_recurse; - RRETURN(MATCH_THEN); - - - /* ===================================================================== */ - /* There's been some horrible disaster. 
Arrival here can only mean there is - something seriously wrong in the code above or the OP_xxx definitions. */ - - default: - return PCRE2_ERROR_INTERNAL; - } - - /* Do not insert any code in here without much thought; it is assumed - that "continue" in the code above comes out to here to repeat the main - loop. */ - - } /* End of main loop */ -/* Control never reaches here */ - - -/* ========================================================================= */ -/* The RRETURN() macro jumps here. The number that is saved in Freturn_id -indicates which label we actually want to return to. The value in Frdepth is -the index number of the frame in the vector. The return value has been placed -in rrc. */ - -#define LBL(val) case val: goto L_RM##val; - -RETURN_SWITCH: -if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; -if (Frdepth == 0) return rrc; /* Exit from the top level */ -F = (heapframe *)((char *)F - Fback_frame); /* Backtrack */ -mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */ - -#ifdef DEBUG_SHOW_RMATCH -fprintf(stderr, "++ RETURN %d to %d\n", rrc, Freturn_id); -#endif - -switch (Freturn_id) - { - LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8) - LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16) - LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24) - LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32) - LBL(33) LBL(34) LBL(35) LBL(36) - -#ifdef SUPPORT_WIDE_CHARS - LBL(100) LBL(101) -#endif - -#ifdef SUPPORT_UNICODE - LBL(200) LBL(201) LBL(202) LBL(203) LBL(204) LBL(205) LBL(206) - LBL(207) LBL(208) LBL(209) LBL(210) LBL(211) LBL(212) LBL(213) - LBL(214) LBL(215) LBL(216) LBL(217) LBL(218) LBL(219) LBL(220) - LBL(221) LBL(222) -#endif - - default: - return PCRE2_ERROR_INTERNAL; - } -#undef LBL -} - - -/************************************************* -* Match a Regular Expression * -*************************************************/ - -/* This function applies a compiled 
pattern to a subject string and picks out -portions of the string if it matches. Two elements in the vector are set for -each substring: the offsets to the start and end of the substring. - -Arguments: - code points to the compiled expression - subject points to the subject string - length length of subject string (may contain binary zeros) - start_offset where to start in the subject string - options option bits - match_data points to a match_data block - mcontext points a PCRE2 context - -Returns: > 0 => success; value is the number of ovector pairs filled - = 0 => success, but ovector is not big enough - = -1 => failed to match (PCRE2_ERROR_NOMATCH) - = -2 => partial match (PCRE2_ERROR_PARTIAL) - < -2 => some kind of unexpected problem -*/ - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, - PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, - pcre2_match_context *mcontext) -{ -int rc; -int was_zero_terminated = 0; -const uint8_t *start_bits = NULL; -const pcre2_real_code *re = (const pcre2_real_code *)code; - -BOOL anchored; -BOOL firstline; -BOOL has_first_cu = FALSE; -BOOL has_req_cu = FALSE; -BOOL startline; - -#if PCRE2_CODE_UNIT_WIDTH == 8 -BOOL memchr_not_found_first_cu; -BOOL memchr_not_found_first_cu2; -#endif - -PCRE2_UCHAR first_cu = 0; -PCRE2_UCHAR first_cu2 = 0; -PCRE2_UCHAR req_cu = 0; -PCRE2_UCHAR req_cu2 = 0; - -PCRE2_SPTR bumpalong_limit; -PCRE2_SPTR end_subject; -PCRE2_SPTR true_end_subject; -PCRE2_SPTR start_match = subject + start_offset; -PCRE2_SPTR req_cu_ptr = start_match - 1; -PCRE2_SPTR start_partial; -PCRE2_SPTR match_partial; - -#ifdef SUPPORT_JIT -BOOL use_jit; -#endif - -/* This flag is needed even when Unicode is not supported for convenience -(it is used by the IS_NEWLINE macro). 
*/ - -BOOL utf = FALSE; - -#ifdef SUPPORT_UNICODE -BOOL ucp = FALSE; -BOOL allow_invalid; -uint32_t fragment_options = 0; -#ifdef SUPPORT_JIT -BOOL jit_checked_utf = FALSE; -#endif -#endif /* SUPPORT_UNICODE */ - -PCRE2_SIZE frame_size; - -/* We need to have mb as a pointer to a match block, because the IS_NEWLINE -macro is used below, and it expects NLBLOCK to be defined as a pointer. */ - -pcre2_callout_block cb; -match_block actual_match_block; -match_block *mb = &actual_match_block; - -/* Allocate an initial vector of backtracking frames on the stack. If this -proves to be too small, it is replaced by a larger one on the heap. To get a -vector of the size required that is aligned for pointers, allocate it as a -vector of pointers. */ - -PCRE2_SPTR stack_frames_vector[START_FRAMES_SIZE/sizeof(PCRE2_SPTR)] - PCRE2_KEEP_UNINITIALIZED; -mb->stack_frames = (heapframe *)stack_frames_vector; - -/* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated -subject string. */ - -if (length == PCRE2_ZERO_TERMINATED) - { - length = PRIV(strlen)(subject); - was_zero_terminated = 1; - } -true_end_subject = end_subject = subject + length; - -/* Plausibility checks */ - -if ((options & ~PUBLIC_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION; -if (code == NULL || subject == NULL || match_data == NULL) - return PCRE2_ERROR_NULL; -if (start_offset > length) return PCRE2_ERROR_BADOFFSET; - -/* Check that the first field in the block is the magic number. */ - -if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC; - -/* Check the code unit width. */ - -if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8) - return PCRE2_ERROR_BADMODE; - -/* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the -options variable for this function. 
Users of PCRE2 who are not calling the -function directly would like to have a way of setting these flags, in the same -way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with -constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and -(*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which we now -transfer to the options for this function. The bits are guaranteed to be -adjacent, but do not have the same values. This bit of Boolean trickery assumes -that the match-time bits are not more significant than the flag bits. If by -accident this is not the case, a compile-time division by zero error will -occur. */ - -#define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET) -#define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART) -options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1))); -#undef FF -#undef OO - -/* If the pattern was successfully studied with JIT support, we will run the -JIT executable instead of the rest of this function. Most options must be set -at compile time for the JIT code to be usable. */ - -#ifdef SUPPORT_JIT -use_jit = (re->executable_jit != NULL && - (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0); -#endif - -/* Initialize UTF/UCP parameters. */ - -#ifdef SUPPORT_UNICODE -utf = (re->overall_options & PCRE2_UTF) != 0; -allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0; -ucp = (re->overall_options & PCRE2_UCP) != 0; -#endif /* SUPPORT_UNICODE */ - -/* Convert the partial matching flags into an integer. */ - -mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 : - ((options & PCRE2_PARTIAL_SOFT) != 0)? 1 : 0; - -/* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same -time. */ - -if (mb->partial != 0 && - ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0) - return PCRE2_ERROR_BADOPTION; - -/* It is an error to set an offset limit without setting the flag at compile -time. 
*/ - -if (mcontext != NULL && mcontext->offset_limit != PCRE2_UNSET && - (re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0) - return PCRE2_ERROR_BADOFFSETLIMIT; - -/* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT, -free the memory that was obtained. Set the field to NULL for no match cases. */ - -if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0) - { - match_data->memctl.free((void *)match_data->subject, - match_data->memctl.memory_data); - match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT; - } -match_data->subject = NULL; - -/* Zero the error offset in case the first code unit is invalid UTF. */ - -match_data->startchar = 0; - - -/* ============================= JIT matching ============================== */ - -/* Prepare for JIT matching. Check a UTF string for validity unless no check is -requested or invalid UTF can be handled. We check only the portion of the -subject that might be be inspected during matching - from the offset minus the -maximum lookbehind to the given length. This saves time when a small part of a -large subject is being matched by the use of a starting offset. Note that the -maximum lookbehind is a number of characters, not code units. */ - -#ifdef SUPPORT_JIT -if (use_jit) - { -#ifdef SUPPORT_UNICODE - if (utf && (options & PCRE2_NO_UTF_CHECK) == 0 && !allow_invalid) - { -#if PCRE2_CODE_UNIT_WIDTH != 32 - unsigned int i; -#endif - - /* For 8-bit and 16-bit UTF, check that the first code unit is a valid - character start. */ - -#if PCRE2_CODE_UNIT_WIDTH != 32 - if (start_match < end_subject && NOT_FIRSTCU(*start_match)) - { - if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET; -#if PCRE2_CODE_UNIT_WIDTH == 8 - return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */ -#else - return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */ -#endif - } -#endif /* WIDTH != 32 */ - - /* Move back by the maximum lookbehind, just in case it happens at the very - start of matching. 
*/ - -#if PCRE2_CODE_UNIT_WIDTH != 32 - for (i = re->max_lookbehind; i > 0 && start_match > subject; i--) - { - start_match--; - while (start_match > subject && -#if PCRE2_CODE_UNIT_WIDTH == 8 - (*start_match & 0xc0) == 0x80) -#else /* 16-bit */ - (*start_match & 0xfc00) == 0xdc00) -#endif - start_match--; - } -#else /* PCRE2_CODE_UNIT_WIDTH != 32 */ - - /* In the 32-bit library, one code unit equals one character. However, - we cannot just subtract the lookbehind and then compare pointers, because - a very large lookbehind could create an invalid pointer. */ - - if (start_offset >= re->max_lookbehind) - start_match -= re->max_lookbehind; - else - start_match = subject; -#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ - - /* Validate the relevant portion of the subject. Adjust the offset of an - invalid code point to be an absolute offset in the whole string. */ - - match_data->rc = PRIV(valid_utf)(start_match, - length - (start_match - subject), &(match_data->startchar)); - if (match_data->rc != 0) - { - match_data->startchar += start_match - subject; - return match_data->rc; - } - jit_checked_utf = TRUE; - } -#endif /* SUPPORT_UNICODE */ - - /* If JIT returns BADOPTION, which means that the selected complete or - partial matching mode was not compiled, fall through to the interpreter. */ - - rc = pcre2_jit_match(code, subject, length, start_offset, options, - match_data, mcontext); - if (rc != PCRE2_ERROR_JIT_BADOPTION) - { - if (rc >= 0 && (options & PCRE2_COPY_MATCHED_SUBJECT) != 0) - { - length = CU2BYTES(length + was_zero_terminated); - match_data->subject = match_data->memctl.malloc(length, - match_data->memctl.memory_data); - if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY; - memcpy((void *)match_data->subject, subject, length); - match_data->flags |= PCRE2_MD_COPIED_SUBJECT; - } - return rc; - } - } -#endif /* SUPPORT_JIT */ - -/* ========================= End of JIT matching ========================== */ - - -/* Proceed with non-JIT matching. 
The default is to allow lookbehinds to the -start of the subject. A UTF check when there is a non-zero offset may change -this. */ - -mb->check_subject = subject; - -/* If a UTF subject string was not checked for validity in the JIT code above, -check it here, and handle support for invalid UTF strings. The check above -happens only when invalid UTF is not supported and PCRE2_NO_CHECK_UTF is unset. -If we get here in those circumstances, it means the subject string is valid, -but for some reason JIT matching was not successful. There is no need to check -the subject again. - -We check only the portion of the subject that might be be inspected during -matching - from the offset minus the maximum lookbehind to the given length. -This saves time when a small part of a large subject is being matched by the -use of a starting offset. Note that the maximum lookbehind is a number of -characters, not code units. - -Note also that support for invalid UTF forces a check, overriding the setting -of PCRE2_NO_CHECK_UTF. */ - -#ifdef SUPPORT_UNICODE -if (utf && -#ifdef SUPPORT_JIT - !jit_checked_utf && -#endif - ((options & PCRE2_NO_UTF_CHECK) == 0 || allow_invalid)) - { -#if PCRE2_CODE_UNIT_WIDTH != 32 - BOOL skipped_bad_start = FALSE; -#endif - - /* For 8-bit and 16-bit UTF, check that the first code unit is a valid - character start. If we are handling invalid UTF, just skip over such code - units. Otherwise, give an appropriate error. 
*/ - -#if PCRE2_CODE_UNIT_WIDTH != 32 - if (allow_invalid) - { - while (start_match < end_subject && NOT_FIRSTCU(*start_match)) - { - start_match++; - skipped_bad_start = TRUE; - } - } - else if (start_match < end_subject && NOT_FIRSTCU(*start_match)) - { - if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET; -#if PCRE2_CODE_UNIT_WIDTH == 8 - return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */ -#else - return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */ -#endif - } -#endif /* WIDTH != 32 */ - - /* The mb->check_subject field points to the start of UTF checking; - lookbehinds can go back no further than this. */ - - mb->check_subject = start_match; - - /* Move back by the maximum lookbehind, just in case it happens at the very - start of matching, but don't do this if we skipped bad 8-bit or 16-bit code - units above. */ - -#if PCRE2_CODE_UNIT_WIDTH != 32 - if (!skipped_bad_start) - { - unsigned int i; - for (i = re->max_lookbehind; i > 0 && mb->check_subject > subject; i--) - { - mb->check_subject--; - while (mb->check_subject > subject && -#if PCRE2_CODE_UNIT_WIDTH == 8 - (*mb->check_subject & 0xc0) == 0x80) -#else /* 16-bit */ - (*mb->check_subject & 0xfc00) == 0xdc00) -#endif - mb->check_subject--; - } - } -#else /* PCRE2_CODE_UNIT_WIDTH != 32 */ - - /* In the 32-bit library, one code unit equals one character. However, - we cannot just subtract the lookbehind and then compare pointers, because - a very large lookbehind could create an invalid pointer. */ - - if (start_offset >= re->max_lookbehind) - mb->check_subject -= re->max_lookbehind; - else - mb->check_subject = subject; -#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ - - /* Validate the relevant portion of the subject. There's a loop in case we - encounter bad UTF in the characters preceding start_match which we are - scanning because of a lookbehind. 
*/ - - for (;;) - { - match_data->rc = PRIV(valid_utf)(mb->check_subject, - length - (mb->check_subject - subject), &(match_data->startchar)); - - if (match_data->rc == 0) break; /* Valid UTF string */ - - /* Invalid UTF string. Adjust the offset to be an absolute offset in the - whole string. If we are handling invalid UTF strings, set end_subject to - stop before the bad code unit, and set the options to "not end of line". - Otherwise return the error. */ - - match_data->startchar += mb->check_subject - subject; - if (!allow_invalid || match_data->rc > 0) return match_data->rc; - end_subject = subject + match_data->startchar; - - /* If the end precedes start_match, it means there is invalid UTF in the - extra code units we reversed over because of a lookbehind. Advance past the - first bad code unit, and then skip invalid character starting code units in - 8-bit and 16-bit modes, and try again. */ - - if (end_subject < start_match) - { - mb->check_subject = end_subject + 1; -#if PCRE2_CODE_UNIT_WIDTH != 32 - while (mb->check_subject < start_match && NOT_FIRSTCU(*mb->check_subject)) - mb->check_subject++; -#endif - } - - /* Otherwise, set the not end of line option, and do the match. */ - - else - { - fragment_options = PCRE2_NOTEOL; - break; - } - } - } -#endif /* SUPPORT_UNICODE */ - -/* A NULL match context means "use a default context", but we take the memory -control functions from the pattern. */ - -if (mcontext == NULL) - { - mcontext = (pcre2_match_context *)(&PRIV(default_match_context)); - mb->memctl = re->memctl; - } -else mb->memctl = mcontext->memctl; - -anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0; -firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0; -startline = (re->flags & PCRE2_STARTLINE) != 0; -bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)? - true_end_subject : subject + mcontext->offset_limit; - -/* Initialize and set up the fixed fields in the callout block, with a pointer -in the match block. 
*/ - -mb->cb = &cb; -cb.version = 2; -cb.subject = subject; -cb.subject_length = (PCRE2_SIZE)(end_subject - subject); -cb.callout_flags = 0; - -/* Fill in the remaining fields in the match block, except for moptions, which -gets set later. */ - -mb->callout = mcontext->callout; -mb->callout_data = mcontext->callout_data; - -mb->start_subject = subject; -mb->start_offset = start_offset; -mb->end_subject = end_subject; -mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0; -mb->allowemptypartial = (re->max_lookbehind > 0) || - (re->flags & PCRE2_MATCH_EMPTY) != 0; -mb->poptions = re->overall_options; /* Pattern options */ -mb->ignore_skip_arg = 0; -mb->mark = mb->nomatch_mark = NULL; /* In case never set */ - -/* The name table is needed for finding all the numbers associated with a -given name, for condition testing. The code follows the name table. */ - -mb->name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)); -mb->name_count = re->name_count; -mb->name_entry_size = re->name_entry_size; -mb->start_code = mb->name_table + re->name_count * re->name_entry_size; - -/* Process the \R and newline settings. */ - -mb->bsr_convention = re->bsr_convention; -mb->nltype = NLTYPE_FIXED; -switch(re->newline_convention) - { - case PCRE2_NEWLINE_CR: - mb->nllen = 1; - mb->nl[0] = CHAR_CR; - break; - - case PCRE2_NEWLINE_LF: - mb->nllen = 1; - mb->nl[0] = CHAR_NL; - break; - - case PCRE2_NEWLINE_NUL: - mb->nllen = 1; - mb->nl[0] = CHAR_NUL; - break; - - case PCRE2_NEWLINE_CRLF: - mb->nllen = 2; - mb->nl[0] = CHAR_CR; - mb->nl[1] = CHAR_NL; - break; - - case PCRE2_NEWLINE_ANY: - mb->nltype = NLTYPE_ANY; - break; - - case PCRE2_NEWLINE_ANYCRLF: - mb->nltype = NLTYPE_ANYCRLF; - break; - - default: return PCRE2_ERROR_INTERNAL; - } - -/* The backtracking frames have fixed data at the front, and a PCRE2_SIZE -vector at the end, whose size depends on the number of capturing parentheses in -the pattern. It is not used at all if there are no capturing parentheses. 
- - frame_size is the total size of each frame - mb->frame_vector_size is the total usable size of the vector (rounded down - to a whole number of frames) - -The last of these is changed within the match() function if the frame vector -has to be expanded. We therefore put it into the match block so that it is -correct when calling match() more than once for non-anchored patterns. */ - -frame_size = offsetof(heapframe, ovector) + - re->top_bracket * 2 * sizeof(PCRE2_SIZE); - -/* Limits set in the pattern override the match context only if they are -smaller. */ - -mb->heap_limit = (mcontext->heap_limit < re->limit_heap)? - mcontext->heap_limit : re->limit_heap; - -mb->match_limit = (mcontext->match_limit < re->limit_match)? - mcontext->match_limit : re->limit_match; - -mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)? - mcontext->depth_limit : re->limit_depth; - -/* If a pattern has very many capturing parentheses, the frame size may be very -large. Ensure that there are at least 10 available frames by getting an initial -vector on the heap if necessary, except when the heap limit prevents this. Get -fewer if possible. (The heap limit is in kibibytes.) 
*/ - -if (frame_size <= START_FRAMES_SIZE/10) - { - mb->match_frames = mb->stack_frames; /* Initial frame vector on the stack */ - mb->frame_vector_size = ((START_FRAMES_SIZE/frame_size) * frame_size); - } -else - { - mb->frame_vector_size = frame_size * 10; - if ((mb->frame_vector_size / 1024) > mb->heap_limit) - { - if (frame_size > mb->heap_limit * 1024) return PCRE2_ERROR_HEAPLIMIT; - mb->frame_vector_size = ((mb->heap_limit * 1024)/frame_size) * frame_size; - } - mb->match_frames = mb->memctl.malloc(mb->frame_vector_size, - mb->memctl.memory_data); - if (mb->match_frames == NULL) return PCRE2_ERROR_NOMEMORY; - } - -mb->match_frames_top = - (heapframe *)((char *)mb->match_frames + mb->frame_vector_size); - -/* Write to the ovector within the first frame to mark every capture unset and -to avoid uninitialized memory read errors when it is copied to a new frame. */ - -memset((char *)(mb->match_frames) + offsetof(heapframe, ovector), 0xff, - re->top_bracket * 2 * sizeof(PCRE2_SIZE)); - -/* Pointers to the individual character tables */ - -mb->lcc = re->tables + lcc_offset; -mb->fcc = re->tables + fcc_offset; -mb->ctypes = re->tables + ctypes_offset; - -/* Set up the first code unit to match, if available. If there's no first code -unit there may be a bitmap of possible first characters. */ - -if ((re->flags & PCRE2_FIRSTSET) != 0) - { - has_first_cu = TRUE; - first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit); - if ((re->flags & PCRE2_FIRSTCASELESS) != 0) - { - first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu); -#ifdef SUPPORT_UNICODE -#if PCRE2_CODE_UNIT_WIDTH == 8 - if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu); -#else - if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu); -#endif -#endif /* SUPPORT_UNICODE */ - } - } -else - if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0) - start_bits = re->start_bitmap; - -/* There may also be a "last known required character" set. 
*/ - -if ((re->flags & PCRE2_LASTSET) != 0) - { - has_req_cu = TRUE; - req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit); - if ((re->flags & PCRE2_LASTCASELESS) != 0) - { - req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu); -#ifdef SUPPORT_UNICODE -#if PCRE2_CODE_UNIT_WIDTH == 8 - if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu); -#else - if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu); -#endif -#endif /* SUPPORT_UNICODE */ - } - } - - -/* ==========================================================================*/ - -/* Loop for handling unanchored repeated matching attempts; for anchored regexs -the loop runs just once. */ - -#ifdef SUPPORT_UNICODE -FRAGMENT_RESTART: -#endif - -start_partial = match_partial = NULL; -mb->hitend = FALSE; - -#if PCRE2_CODE_UNIT_WIDTH == 8 -memchr_not_found_first_cu = FALSE; -memchr_not_found_first_cu2 = FALSE; -#endif - -for(;;) - { - PCRE2_SPTR new_start_match; - - /* ----------------- Start of match optimizations ---------------- */ - - /* There are some optimizations that avoid running the match if a known - starting point is not found, or if a known later code unit is not present. - However, there is an option (settable at compile time) that disables these, - for testing and for ensuring that all callouts do actually occur. */ - - if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) - { - /* If firstline is TRUE, the start of the match is constrained to the first - line of a multiline string. That is, the match must be before or at the - first newline following the start of matching. Temporarily adjust - end_subject so that we stop the scans for a first code unit at a newline. - If the match fails at the newline, later code breaks the loop. 
*/ - - if (firstline) - { - PCRE2_SPTR t = start_match; -#ifdef SUPPORT_UNICODE - if (utf) - { - while (t < end_subject && !IS_NEWLINE(t)) - { - t++; - ACROSSCHAR(t < end_subject, t, t++); - } - } - else -#endif - while (t < end_subject && !IS_NEWLINE(t)) t++; - end_subject = t; - } - - /* Anchored: check the first code unit if one is recorded. This may seem - pointless but it can help in detecting a no match case without scanning for - the required code unit. */ - - if (anchored) - { - if (has_first_cu || start_bits != NULL) - { - BOOL ok = start_match < end_subject; - if (ok) - { - PCRE2_UCHAR c = UCHAR21TEST(start_match); - ok = has_first_cu && (c == first_cu || c == first_cu2); - if (!ok && start_bits != NULL) - { -#if PCRE2_CODE_UNIT_WIDTH != 8 - if (c > 255) c = 255; -#endif - ok = (start_bits[c/8] & (1u << (c&7))) != 0; - } - } - if (!ok) - { - rc = MATCH_NOMATCH; - break; - } - } - } - - /* Not anchored. Advance to a unique first code unit if there is one. In - 8-bit mode, the use of memchr() gives a big speed up, even though we have - to call it twice in caseless mode, in order to find the earliest occurrence - of the character in either of its cases. If a call to memchr() that - searches the rest of the subject fails to find one case, remember that in - order not to keep on repeating the search. This can make a huge difference - when the strings are very long and only one case is present. 
*/ - - else - { - if (has_first_cu) - { - if (first_cu != first_cu2) /* Caseless */ - { -#if PCRE2_CODE_UNIT_WIDTH != 8 - PCRE2_UCHAR smc; - while (start_match < end_subject && - (smc = UCHAR21TEST(start_match)) != first_cu && - smc != first_cu2) - start_match++; - -#else /* 8-bit code units */ - PCRE2_SPTR pp1 = NULL; - PCRE2_SPTR pp2 = NULL; - PCRE2_SIZE cu2size = end_subject - start_match; - - if (!memchr_not_found_first_cu) - { - pp1 = memchr(start_match, first_cu, end_subject - start_match); - if (pp1 == NULL) memchr_not_found_first_cu = TRUE; - else cu2size = pp1 - start_match; - } - - /* If pp1 is not NULL, we have arranged to search only as far as pp1, - to see if the other case is earlier, so we can set "not found" only - when both searches have returned NULL. */ - - if (!memchr_not_found_first_cu2) - { - pp2 = memchr(start_match, first_cu2, cu2size); - memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL); - } - - if (pp1 == NULL) - start_match = (pp2 == NULL)? end_subject : pp2; - else - start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2; -#endif - } - - /* The caseful case */ - - else - { -#if PCRE2_CODE_UNIT_WIDTH != 8 - while (start_match < end_subject && UCHAR21TEST(start_match) != - first_cu) - start_match++; -#else - start_match = memchr(start_match, first_cu, end_subject - start_match); - if (start_match == NULL) start_match = end_subject; -#endif - } - - /* If we can't find the required first code unit, having reached the - true end of the subject, break the bumpalong loop, to force a match - failure, except when doing partial matching, when we let the next cycle - run at the end of the subject. To see why, consider the pattern - /(?<=abc)def/, which partially matches "abc", even though the string - does not contain the starting character "d". 
If we have not reached the - true end of the subject (PCRE2_FIRSTLINE caused end_subject to be - temporarily modified) we also let the cycle run, because the matching - string is legitimately allowed to start with the first code unit of a - newline. */ - - if (mb->partial == 0 && start_match >= mb->end_subject) - { - rc = MATCH_NOMATCH; - break; - } - } - - /* If there's no first code unit, advance to just after a linebreak for a - multiline match if required. */ - - else if (startline) - { - if (start_match > mb->start_subject + start_offset) - { -#ifdef SUPPORT_UNICODE - if (utf) - { - while (start_match < end_subject && !WAS_NEWLINE(start_match)) - { - start_match++; - ACROSSCHAR(start_match < end_subject, start_match, start_match++); - } - } - else -#endif - while (start_match < end_subject && !WAS_NEWLINE(start_match)) - start_match++; - - /* If we have just passed a CR and the newline option is ANY or - ANYCRLF, and we are now at a LF, advance the match position by one - more code unit. */ - - if (start_match[-1] == CHAR_CR && - (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) && - start_match < end_subject && - UCHAR21TEST(start_match) == CHAR_NL) - start_match++; - } - } - - /* If there's no first code unit or a requirement for a multiline line - start, advance to a non-unique first code unit if any have been - identified. The bitmap contains only 256 bits. When code units are 16 or - 32 bits wide, all code units greater than 254 set the 255 bit. */ - - else if (start_bits != NULL) - { - while (start_match < end_subject) - { - uint32_t c = UCHAR21TEST(start_match); -#if PCRE2_CODE_UNIT_WIDTH != 8 - if (c > 255) c = 255; -#endif - if ((start_bits[c/8] & (1u << (c&7))) != 0) break; - start_match++; - } - - /* See comment above in first_cu checking about the next few lines. 
*/ - - if (mb->partial == 0 && start_match >= mb->end_subject) - { - rc = MATCH_NOMATCH; - break; - } - } - } /* End first code unit handling */ - - /* Restore fudged end_subject */ - - end_subject = mb->end_subject; - - /* The following two optimizations must be disabled for partial matching. */ - - if (mb->partial == 0) - { - PCRE2_SPTR p; - - /* The minimum matching length is a lower bound; no string of that length - may actually match the pattern. Although the value is, strictly, in - characters, we treat it as code units to avoid spending too much time in - this optimization. */ - - if (end_subject - start_match < re->minlength) - { - rc = MATCH_NOMATCH; - break; - } - - /* If req_cu is set, we know that that code unit must appear in the - subject for the (non-partial) match to succeed. If the first code unit is - set, req_cu must be later in the subject; otherwise the test starts at - the match point. This optimization can save a huge amount of backtracking - in patterns with nested unlimited repeats that aren't going to match. - Writing separate code for caseful/caseless versions makes it go faster, - as does using an autoincrement and backing off on a match. As in the case - of the first code unit, using memchr() in the 8-bit library gives a big - speed up. Unlike the first_cu check above, we do not need to call - memchr() twice in the caseless case because we only need to check for the - presence of the character in either case, not find the first occurrence. - - The search can be skipped if the code unit was found later than the - current starting point in a previous iteration of the bumpalong loop. - - HOWEVER: when the subject string is very, very long, searching to its end - can take a long time, and give bad performance on quite ordinary - anchored patterns. This showed up when somebody was matching something - like /^\d+C/ on a 32-megabyte string... 
so we don't do this when the - string is sufficiently long, but it's worth searching a lot more for - unanchored patterns. */ - - p = start_match + (has_first_cu? 1:0); - if (has_req_cu && p > req_cu_ptr) - { - PCRE2_SIZE check_length = end_subject - start_match; - - if (check_length < REQ_CU_MAX || - (!anchored && check_length < REQ_CU_MAX * 1000)) - { - if (req_cu != req_cu2) /* Caseless */ - { -#if PCRE2_CODE_UNIT_WIDTH != 8 - while (p < end_subject) - { - uint32_t pp = UCHAR21INCTEST(p); - if (pp == req_cu || pp == req_cu2) { p--; break; } - } -#else /* 8-bit code units */ - PCRE2_SPTR pp = p; - p = memchr(pp, req_cu, end_subject - pp); - if (p == NULL) - { - p = memchr(pp, req_cu2, end_subject - pp); - if (p == NULL) p = end_subject; - } -#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */ - } - - /* The caseful case */ - - else - { -#if PCRE2_CODE_UNIT_WIDTH != 8 - while (p < end_subject) - { - if (UCHAR21INCTEST(p) == req_cu) { p--; break; } - } - -#else /* 8-bit code units */ - p = memchr(p, req_cu, end_subject - p); - if (p == NULL) p = end_subject; -#endif - } - - /* If we can't find the required code unit, break the bumpalong loop, - forcing a match failure. */ - - if (p >= end_subject) - { - rc = MATCH_NOMATCH; - break; - } - - /* If we have found the required code unit, save the point where we - found it, so that we don't search again next time round the bumpalong - loop if the start hasn't yet passed this code unit. */ - - req_cu_ptr = p; - } - } - } - } - - /* ------------ End of start of match optimizations ------------ */ - - /* Give no match if we have passed the bumpalong limit. */ - - if (start_match > bumpalong_limit) - { - rc = MATCH_NOMATCH; - break; - } - - /* OK, we can now run the match. If "hitend" is set afterwards, remember the - first starting point for which a partial match was found. 
*/ - - cb.start_match = (PCRE2_SIZE)(start_match - subject); - cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH; - - mb->start_used_ptr = start_match; - mb->last_used_ptr = start_match; -#ifdef SUPPORT_UNICODE - mb->moptions = options | fragment_options; -#else - mb->moptions = options; -#endif - mb->match_call_count = 0; - mb->end_offset_top = 0; - mb->skip_arg_count = 0; - - rc = match(start_match, mb->start_code, match_data->ovector, - match_data->oveccount, re->top_bracket, frame_size, mb); - - if (mb->hitend && start_partial == NULL) - { - start_partial = mb->start_used_ptr; - match_partial = start_match; - } - - switch(rc) - { - /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched - the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP - entirely. The only way we can do that is to re-do the match at the same - point, with a flag to force SKIP with an argument to be ignored. Just - treating this case as NOMATCH does not work because it does not check other - alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */ - - case MATCH_SKIP_ARG: - new_start_match = start_match; - mb->ignore_skip_arg = mb->skip_arg_count; - break; - - /* SKIP passes back the next starting point explicitly, but if it is no - greater than the match we have just done, treat it as NOMATCH. */ - - case MATCH_SKIP: - if (mb->verb_skip_ptr > start_match) - { - new_start_match = mb->verb_skip_ptr; - break; - } - /* Fall through */ - - /* NOMATCH and PRUNE advance by one character. THEN at this level acts - exactly like PRUNE. Unset ignore SKIP-with-argument. */ - - case MATCH_NOMATCH: - case MATCH_PRUNE: - case MATCH_THEN: - mb->ignore_skip_arg = 0; - new_start_match = start_match + 1; -#ifdef SUPPORT_UNICODE - if (utf) - ACROSSCHAR(new_start_match < end_subject, new_start_match, - new_start_match++); -#endif - break; - - /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. 
*/ - - case MATCH_COMMIT: - rc = MATCH_NOMATCH; - goto ENDLOOP; - - /* Any other return is either a match, or some kind of error. */ - - default: - goto ENDLOOP; - } - - /* Control reaches here for the various types of "no match at this point" - result. Reset the code to MATCH_NOMATCH for subsequent checking. */ - - rc = MATCH_NOMATCH; - - /* If PCRE2_FIRSTLINE is set, the match must happen before or at the first - newline in the subject (though it may continue over the newline). Therefore, - if we have just failed to match, starting at a newline, do not continue. */ - - if (firstline && IS_NEWLINE(start_match)) break; - - /* Advance to new matching position */ - - start_match = new_start_match; - - /* Break the loop if the pattern is anchored or if we have passed the end of - the subject. */ - - if (anchored || start_match > end_subject) break; - - /* If we have just passed a CR and we are now at a LF, and the pattern does - not contain any explicit matches for \r or \n, and the newline option is CRLF - or ANY or ANYCRLF, advance the match position by one more code unit. In - normal matching start_match will aways be greater than the first position at - this stage, but a failed *SKIP can cause a return at the same point, which is - why the first test exists. 
*/ - - if (start_match > subject + start_offset && - start_match[-1] == CHAR_CR && - start_match < end_subject && - *start_match == CHAR_NL && - (re->flags & PCRE2_HASCRORLF) == 0 && - (mb->nltype == NLTYPE_ANY || - mb->nltype == NLTYPE_ANYCRLF || - mb->nllen == 2)) - start_match++; - - mb->mark = NULL; /* Reset for start of next match attempt */ - } /* End of for(;;) "bumpalong" loop */ - -/* ==========================================================================*/ - -/* When we reach here, one of the following stopping conditions is true: - -(1) The match succeeded, either completely, or partially; - -(2) The pattern is anchored or the match was failed after (*COMMIT); - -(3) We are past the end of the subject or the bumpalong limit; - -(4) PCRE2_FIRSTLINE is set and we have failed to match at a newline, because - this option requests that a match occur at or before the first newline in - the subject. - -(5) Some kind of error occurred. - -*/ - -ENDLOOP: - -/* If end_subject != true_end_subject, it means we are handling invalid UTF, -and have just processed a non-terminal fragment. If this resulted in no match -or a partial match we must carry on to the next fragment (a partial match is -returned to the caller only at the very end of the subject). A loop is used to -avoid trying to match against empty fragments; if the pattern can match an -empty string it would have done so already. */ - -#ifdef SUPPORT_UNICODE -if (utf && end_subject != true_end_subject && - (rc == MATCH_NOMATCH || rc == PCRE2_ERROR_PARTIAL)) - { - for (;;) - { - /* Advance past the first bad code unit, and then skip invalid character - starting code units in 8-bit and 16-bit modes. */ - - start_match = end_subject + 1; - -#if PCRE2_CODE_UNIT_WIDTH != 32 - while (start_match < true_end_subject && NOT_FIRSTCU(*start_match)) - start_match++; -#endif - - /* If we have hit the end of the subject, there isn't another non-empty - fragment, so give up. 
*/ - - if (start_match >= true_end_subject) - { - rc = MATCH_NOMATCH; /* In case it was partial */ - break; - } - - /* Check the rest of the subject */ - - mb->check_subject = start_match; - rc = PRIV(valid_utf)(start_match, length - (start_match - subject), - &(match_data->startchar)); - - /* The rest of the subject is valid UTF. */ - - if (rc == 0) - { - mb->end_subject = end_subject = true_end_subject; - fragment_options = PCRE2_NOTBOL; - goto FRAGMENT_RESTART; - } - - /* A subsequent UTF error has been found; if the next fragment is - non-empty, set up to process it. Otherwise, let the loop advance. */ - - else if (rc < 0) - { - mb->end_subject = end_subject = start_match + match_data->startchar; - if (end_subject > start_match) - { - fragment_options = PCRE2_NOTBOL|PCRE2_NOTEOL; - goto FRAGMENT_RESTART; - } - } - } - } -#endif /* SUPPORT_UNICODE */ - -/* Release an enlarged frame vector that is on the heap. */ - -if (mb->match_frames != mb->stack_frames) - mb->memctl.free(mb->match_frames, mb->memctl.memory_data); - -/* Fill in fields that are always returned in the match data. */ - -match_data->code = re; -match_data->mark = mb->mark; -match_data->matchedby = PCRE2_MATCHEDBY_INTERPRETER; - -/* Handle a fully successful match. Set the return code to the number of -captured strings, or 0 if there were too many to fit into the ovector, and then -set the remaining returned values before returning. Make a copy of the subject -string if requested. */ - -if (rc == MATCH_MATCH) - { - match_data->rc = ((int)mb->end_offset_top >= 2 * match_data->oveccount)? - 0 : (int)mb->end_offset_top/2 + 1; - match_data->startchar = start_match - subject; - match_data->leftchar = mb->start_used_ptr - subject; - match_data->rightchar = ((mb->last_used_ptr > mb->end_match_ptr)? 
- mb->last_used_ptr : mb->end_match_ptr) - subject; - if ((options & PCRE2_COPY_MATCHED_SUBJECT) != 0) - { - length = CU2BYTES(length + was_zero_terminated); - match_data->subject = match_data->memctl.malloc(length, - match_data->memctl.memory_data); - if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY; - memcpy((void *)match_data->subject, subject, length); - match_data->flags |= PCRE2_MD_COPIED_SUBJECT; - } - else match_data->subject = subject; - return match_data->rc; - } - -/* Control gets here if there has been a partial match, an error, or if the -overall match attempt has failed at all permitted starting positions. Any mark -data is in the nomatch_mark field. */ - -match_data->mark = mb->nomatch_mark; - -/* For anything other than nomatch or partial match, just return the code. */ - -if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL) match_data->rc = rc; - -/* Handle a partial match. If a "soft" partial match was requested, searching -for a complete match will have continued, and the value of rc at this point -will be MATCH_NOMATCH. For a "hard" partial match, it will already be -PCRE2_ERROR_PARTIAL. */ - -else if (match_partial != NULL) - { - match_data->subject = subject; - match_data->ovector[0] = match_partial - subject; - match_data->ovector[1] = end_subject - subject; - match_data->startchar = match_partial - subject; - match_data->leftchar = start_partial - subject; - match_data->rightchar = end_subject - subject; - match_data->rc = PCRE2_ERROR_PARTIAL; - } - -/* Else this is the classic nomatch case. 
*/ - -else match_data->rc = PCRE2_ERROR_NOMATCH; - -return match_data->rc; -} - -/* End of pcre2_match.c */ diff --git a/pcre2/src/pcre2_match_data.c b/pcre2/src/pcre2_match_data.c deleted file mode 100644 index 53e469870..000000000 --- a/pcre2/src/pcre2_match_data.c +++ /dev/null @@ -1,166 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2019 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include "pcre2_internal.h" - - - -/************************************************* -* Create a match data block given ovector size * -*************************************************/ - -/* A minimum of 1 is imposed on the number of ovector pairs. */ - -PCRE2_EXP_DEFN pcre2_match_data * PCRE2_CALL_CONVENTION -pcre2_match_data_create(uint32_t oveccount, pcre2_general_context *gcontext) -{ -pcre2_match_data *yield; -if (oveccount < 1) oveccount = 1; -yield = PRIV(memctl_malloc)( - offsetof(pcre2_match_data, ovector) + 2*oveccount*sizeof(PCRE2_SIZE), - (pcre2_memctl *)gcontext); -if (yield == NULL) return NULL; -yield->oveccount = oveccount; -yield->flags = 0; -return yield; -} - - - -/************************************************* -* Create a match data block using pattern data * -*************************************************/ - -/* If no context is supplied, use the memory allocator from the code. 
*/ - -PCRE2_EXP_DEFN pcre2_match_data * PCRE2_CALL_CONVENTION -pcre2_match_data_create_from_pattern(const pcre2_code *code, - pcre2_general_context *gcontext) -{ -if (gcontext == NULL) gcontext = (pcre2_general_context *)code; -return pcre2_match_data_create(((pcre2_real_code *)code)->top_bracket + 1, - gcontext); -} - - - -/************************************************* -* Free a match data block * -*************************************************/ - -PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION -pcre2_match_data_free(pcre2_match_data *match_data) -{ -if (match_data != NULL) - { - if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0) - match_data->memctl.free((void *)match_data->subject, - match_data->memctl.memory_data); - match_data->memctl.free(match_data, match_data->memctl.memory_data); - } -} - - - -/************************************************* -* Get last mark in match * -*************************************************/ - -PCRE2_EXP_DEFN PCRE2_SPTR PCRE2_CALL_CONVENTION -pcre2_get_mark(pcre2_match_data *match_data) -{ -return match_data->mark; -} - - - -/************************************************* -* Get pointer to ovector * -*************************************************/ - -PCRE2_EXP_DEFN PCRE2_SIZE * PCRE2_CALL_CONVENTION -pcre2_get_ovector_pointer(pcre2_match_data *match_data) -{ -return match_data->ovector; -} - - - -/************************************************* -* Get number of ovector slots * -*************************************************/ - -PCRE2_EXP_DEFN uint32_t PCRE2_CALL_CONVENTION -pcre2_get_ovector_count(pcre2_match_data *match_data) -{ -return match_data->oveccount; -} - - - -/************************************************* -* Get starting code unit in match * -*************************************************/ - -PCRE2_EXP_DEFN PCRE2_SIZE PCRE2_CALL_CONVENTION -pcre2_get_startchar(pcre2_match_data *match_data) -{ -return match_data->startchar; -} - - - -/************************************************* -* Get 
size of match data block * -*************************************************/ - -PCRE2_EXP_DEFN PCRE2_SIZE PCRE2_CALL_CONVENTION -pcre2_get_match_data_size(pcre2_match_data *match_data) -{ -return offsetof(pcre2_match_data, ovector) + - 2 * (match_data->oveccount) * sizeof(PCRE2_SIZE); -} - -/* End of pcre2_match_data.c */ diff --git a/pcre2/src/pcre2_newline.c b/pcre2/src/pcre2_newline.c deleted file mode 100644 index 6e9366db9..000000000 --- a/pcre2/src/pcre2_newline.c +++ /dev/null @@ -1,243 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - - -/* This module contains internal functions for testing newlines when more than -one kind of newline is to be recognized. When a newline is found, its length is -returned. In principle, we could implement several newline "types", each -referring to a different set of newline characters. At present, PCRE2 supports -only NLTYPE_FIXED, which gets handled without these functions, NLTYPE_ANYCRLF, -and NLTYPE_ANY. The full list of Unicode newline characters is taken from -http://unicode.org/unicode/reports/tr18/. */ - - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include "pcre2_internal.h" - - - -/************************************************* -* Check for newline at given position * -*************************************************/ - -/* This function is called only via the IS_NEWLINE macro, which does so only -when the newline type is NLTYPE_ANY or NLTYPE_ANYCRLF. The case of a fixed -newline (NLTYPE_FIXED) is handled inline. It is guaranteed that the code unit -pointed to by ptr is less than the end of the string. 
- -Arguments: - ptr pointer to possible newline - type the newline type - endptr pointer to the end of the string - lenptr where to return the length - utf TRUE if in utf mode - -Returns: TRUE or FALSE -*/ - -BOOL -PRIV(is_newline)(PCRE2_SPTR ptr, uint32_t type, PCRE2_SPTR endptr, - uint32_t *lenptr, BOOL utf) -{ -uint32_t c; - -#ifdef SUPPORT_UNICODE -if (utf) { GETCHAR(c, ptr); } else c = *ptr; -#else -(void)utf; -c = *ptr; -#endif /* SUPPORT_UNICODE */ - -if (type == NLTYPE_ANYCRLF) switch(c) - { - case CHAR_LF: - *lenptr = 1; - return TRUE; - - case CHAR_CR: - *lenptr = (ptr < endptr - 1 && ptr[1] == CHAR_LF)? 2 : 1; - return TRUE; - - default: - return FALSE; - } - -/* NLTYPE_ANY */ - -else switch(c) - { -#ifdef EBCDIC - case CHAR_NEL: -#endif - case CHAR_LF: - case CHAR_VT: - case CHAR_FF: - *lenptr = 1; - return TRUE; - - case CHAR_CR: - *lenptr = (ptr < endptr - 1 && ptr[1] == CHAR_LF)? 2 : 1; - return TRUE; - -#ifndef EBCDIC -#if PCRE2_CODE_UNIT_WIDTH == 8 - case CHAR_NEL: - *lenptr = utf? 2 : 1; - return TRUE; - - case 0x2028: /* LS */ - case 0x2029: /* PS */ - *lenptr = 3; - return TRUE; - -#else /* 16-bit or 32-bit code units */ - case CHAR_NEL: - case 0x2028: /* LS */ - case 0x2029: /* PS */ - *lenptr = 1; - return TRUE; -#endif -#endif /* Not EBCDIC */ - - default: - return FALSE; - } -} - - - -/************************************************* -* Check for newline at previous position * -*************************************************/ - -/* This function is called only via the WAS_NEWLINE macro, which does so only -when the newline type is NLTYPE_ANY or NLTYPE_ANYCRLF. The case of a fixed -newline (NLTYPE_FIXED) is handled inline. It is guaranteed that the initial -value of ptr is greater than the start of the string that is being processed. 
- -Arguments: - ptr pointer to possible newline - type the newline type - startptr pointer to the start of the string - lenptr where to return the length - utf TRUE if in utf mode - -Returns: TRUE or FALSE -*/ - -BOOL -PRIV(was_newline)(PCRE2_SPTR ptr, uint32_t type, PCRE2_SPTR startptr, - uint32_t *lenptr, BOOL utf) -{ -uint32_t c; -ptr--; - -#ifdef SUPPORT_UNICODE -if (utf) - { - BACKCHAR(ptr); - GETCHAR(c, ptr); - } -else c = *ptr; -#else -(void)utf; -c = *ptr; -#endif /* SUPPORT_UNICODE */ - -if (type == NLTYPE_ANYCRLF) switch(c) - { - case CHAR_LF: - *lenptr = (ptr > startptr && ptr[-1] == CHAR_CR)? 2 : 1; - return TRUE; - - case CHAR_CR: - *lenptr = 1; - return TRUE; - - default: - return FALSE; - } - -/* NLTYPE_ANY */ - -else switch(c) - { - case CHAR_LF: - *lenptr = (ptr > startptr && ptr[-1] == CHAR_CR)? 2 : 1; - return TRUE; - -#ifdef EBCDIC - case CHAR_NEL: -#endif - case CHAR_VT: - case CHAR_FF: - case CHAR_CR: - *lenptr = 1; - return TRUE; - -#ifndef EBCDIC -#if PCRE2_CODE_UNIT_WIDTH == 8 - case CHAR_NEL: - *lenptr = utf? 2 : 1; - return TRUE; - - case 0x2028: /* LS */ - case 0x2029: /* PS */ - *lenptr = 3; - return TRUE; - -#else /* 16-bit or 32-bit code units */ - case CHAR_NEL: - case 0x2028: /* LS */ - case 0x2029: /* PS */ - *lenptr = 1; - return TRUE; -#endif -#endif /* Not EBCDIC */ - - default: - return FALSE; - } -} - -/* End of pcre2_newline.c */ diff --git a/pcre2/src/pcre2_ord2utf.c b/pcre2/src/pcre2_ord2utf.c deleted file mode 100644 index 140373099..000000000 --- a/pcre2/src/pcre2_ord2utf.c +++ /dev/null @@ -1,120 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. 
- - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - - -/* This file contains a function that converts a Unicode character code point -into a UTF string. The behaviour is different for each code unit width. 
*/ - - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include "pcre2_internal.h" - - -/* If SUPPORT_UNICODE is not defined, this function will never be called. -Supply a dummy function because some compilers do not like empty source -modules. */ - -#ifndef SUPPORT_UNICODE -unsigned int -PRIV(ord2utf)(uint32_t cvalue, PCRE2_UCHAR *buffer) -{ -(void)(cvalue); -(void)(buffer); -return 0; -} -#else /* SUPPORT_UNICODE */ - - -/************************************************* -* Convert code point to UTF * -*************************************************/ - -/* -Arguments: - cvalue the character value - buffer pointer to buffer for result - -Returns: number of code units placed in the buffer -*/ - -unsigned int -PRIV(ord2utf)(uint32_t cvalue, PCRE2_UCHAR *buffer) -{ -/* Convert to UTF-8 */ - -#if PCRE2_CODE_UNIT_WIDTH == 8 -int i, j; -for (i = 0; i < PRIV(utf8_table1_size); i++) - if ((int)cvalue <= PRIV(utf8_table1)[i]) break; -buffer += i; -for (j = i; j > 0; j--) - { - *buffer-- = 0x80 | (cvalue & 0x3f); - cvalue >>= 6; - } -*buffer = PRIV(utf8_table2)[i] | cvalue; -return i + 1; - -/* Convert to UTF-16 */ - -#elif PCRE2_CODE_UNIT_WIDTH == 16 -if (cvalue <= 0xffff) - { - *buffer = (PCRE2_UCHAR)cvalue; - return 1; - } -cvalue -= 0x10000; -*buffer++ = 0xd800 | (cvalue >> 10); -*buffer = 0xdc00 | (cvalue & 0x3ff); -return 2; - -/* Convert to UTF-32 */ - -#else -*buffer = (PCRE2_UCHAR)cvalue; -return 1; -#endif -} -#endif /* SUPPORT_UNICODE */ - -/* End of pcre_ord2utf.c */ diff --git a/pcre2/src/pcre2_pattern_info.c b/pcre2/src/pcre2_pattern_info.c deleted file mode 100644 index a29f5eff6..000000000 --- a/pcre2/src/pcre2_pattern_info.c +++ /dev/null @@ -1,432 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. 
- - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2018 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
------------------------------------------------------------------------------ -*/ - - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include "pcre2_internal.h" - - -/************************************************* -* Return info about compiled pattern * -*************************************************/ - -/* -Arguments: - code points to compiled code - what what information is required - where where to put the information; if NULL, return length - -Returns: 0 when data returned - > 0 when length requested - < 0 on error or unset value -*/ - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_pattern_info(const pcre2_code *code, uint32_t what, void *where) -{ -const pcre2_real_code *re = (pcre2_real_code *)code; - -if (where == NULL) /* Requests field length */ - { - switch(what) - { - case PCRE2_INFO_ALLOPTIONS: - case PCRE2_INFO_ARGOPTIONS: - case PCRE2_INFO_BACKREFMAX: - case PCRE2_INFO_BSR: - case PCRE2_INFO_CAPTURECOUNT: - case PCRE2_INFO_DEPTHLIMIT: - case PCRE2_INFO_EXTRAOPTIONS: - case PCRE2_INFO_FIRSTCODETYPE: - case PCRE2_INFO_FIRSTCODEUNIT: - case PCRE2_INFO_HASBACKSLASHC: - case PCRE2_INFO_HASCRORLF: - case PCRE2_INFO_HEAPLIMIT: - case PCRE2_INFO_JCHANGED: - case PCRE2_INFO_LASTCODETYPE: - case PCRE2_INFO_LASTCODEUNIT: - case PCRE2_INFO_MATCHEMPTY: - case PCRE2_INFO_MATCHLIMIT: - case PCRE2_INFO_MAXLOOKBEHIND: - case PCRE2_INFO_MINLENGTH: - case PCRE2_INFO_NAMEENTRYSIZE: - case PCRE2_INFO_NAMECOUNT: - case PCRE2_INFO_NEWLINE: - return sizeof(uint32_t); - - case PCRE2_INFO_FIRSTBITMAP: - return sizeof(const uint8_t *); - - case PCRE2_INFO_JITSIZE: - case PCRE2_INFO_SIZE: - case PCRE2_INFO_FRAMESIZE: - return sizeof(size_t); - - case PCRE2_INFO_NAMETABLE: - return sizeof(PCRE2_SPTR); - } - } - -if (re == NULL) return PCRE2_ERROR_NULL; - -/* Check that the first field in the block is the magic number. If it is not, -return with PCRE2_ERROR_BADMAGIC. 
*/ - -if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC; - -/* Check that this pattern was compiled in the correct bit mode */ - -if ((re->flags & (PCRE2_CODE_UNIT_WIDTH/8)) == 0) return PCRE2_ERROR_BADMODE; - -switch(what) - { - case PCRE2_INFO_ALLOPTIONS: - *((uint32_t *)where) = re->overall_options; - break; - - case PCRE2_INFO_ARGOPTIONS: - *((uint32_t *)where) = re->compile_options; - break; - - case PCRE2_INFO_BACKREFMAX: - *((uint32_t *)where) = re->top_backref; - break; - - case PCRE2_INFO_BSR: - *((uint32_t *)where) = re->bsr_convention; - break; - - case PCRE2_INFO_CAPTURECOUNT: - *((uint32_t *)where) = re->top_bracket; - break; - - case PCRE2_INFO_DEPTHLIMIT: - *((uint32_t *)where) = re->limit_depth; - if (re->limit_depth == UINT32_MAX) return PCRE2_ERROR_UNSET; - break; - - case PCRE2_INFO_EXTRAOPTIONS: - *((uint32_t *)where) = re->extra_options; - break; - - case PCRE2_INFO_FIRSTCODETYPE: - *((uint32_t *)where) = ((re->flags & PCRE2_FIRSTSET) != 0)? 1 : - ((re->flags & PCRE2_STARTLINE) != 0)? 2 : 0; - break; - - case PCRE2_INFO_FIRSTCODEUNIT: - *((uint32_t *)where) = ((re->flags & PCRE2_FIRSTSET) != 0)? - re->first_codeunit : 0; - break; - - case PCRE2_INFO_FIRSTBITMAP: - *((const uint8_t **)where) = ((re->flags & PCRE2_FIRSTMAPSET) != 0)? 
- &(re->start_bitmap[0]) : NULL; - break; - - case PCRE2_INFO_FRAMESIZE: - *((size_t *)where) = offsetof(heapframe, ovector) + - re->top_bracket * 2 * sizeof(PCRE2_SIZE); - break; - - case PCRE2_INFO_HASBACKSLASHC: - *((uint32_t *)where) = (re->flags & PCRE2_HASBKC) != 0; - break; - - case PCRE2_INFO_HASCRORLF: - *((uint32_t *)where) = (re->flags & PCRE2_HASCRORLF) != 0; - break; - - case PCRE2_INFO_HEAPLIMIT: - *((uint32_t *)where) = re->limit_heap; - if (re->limit_heap == UINT32_MAX) return PCRE2_ERROR_UNSET; - break; - - case PCRE2_INFO_JCHANGED: - *((uint32_t *)where) = (re->flags & PCRE2_JCHANGED) != 0; - break; - - case PCRE2_INFO_JITSIZE: -#ifdef SUPPORT_JIT - *((size_t *)where) = (re->executable_jit != NULL)? - PRIV(jit_get_size)(re->executable_jit) : 0; -#else - *((size_t *)where) = 0; -#endif - break; - - case PCRE2_INFO_LASTCODETYPE: - *((uint32_t *)where) = ((re->flags & PCRE2_LASTSET) != 0)? 1 : 0; - break; - - case PCRE2_INFO_LASTCODEUNIT: - *((uint32_t *)where) = ((re->flags & PCRE2_LASTSET) != 0)? 
- re->last_codeunit : 0; - break; - - case PCRE2_INFO_MATCHEMPTY: - *((uint32_t *)where) = (re->flags & PCRE2_MATCH_EMPTY) != 0; - break; - - case PCRE2_INFO_MATCHLIMIT: - *((uint32_t *)where) = re->limit_match; - if (re->limit_match == UINT32_MAX) return PCRE2_ERROR_UNSET; - break; - - case PCRE2_INFO_MAXLOOKBEHIND: - *((uint32_t *)where) = re->max_lookbehind; - break; - - case PCRE2_INFO_MINLENGTH: - *((uint32_t *)where) = re->minlength; - break; - - case PCRE2_INFO_NAMEENTRYSIZE: - *((uint32_t *)where) = re->name_entry_size; - break; - - case PCRE2_INFO_NAMECOUNT: - *((uint32_t *)where) = re->name_count; - break; - - case PCRE2_INFO_NAMETABLE: - *((PCRE2_SPTR *)where) = (PCRE2_SPTR)((char *)re + sizeof(pcre2_real_code)); - break; - - case PCRE2_INFO_NEWLINE: - *((uint32_t *)where) = re->newline_convention; - break; - - case PCRE2_INFO_SIZE: - *((size_t *)where) = re->blocksize; - break; - - default: return PCRE2_ERROR_BADOPTION; - } - -return 0; -} - - - -/************************************************* -* Callout enumerator * -*************************************************/ - -/* -Arguments: - code points to compiled code - callback function called for each callout block - callout_data user data passed to the callback - -Returns: 0 when successfully completed - < 0 on local error - != 0 for callback error -*/ - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_callout_enumerate(const pcre2_code *code, - int (*callback)(pcre2_callout_enumerate_block *, void *), void *callout_data) -{ -pcre2_real_code *re = (pcre2_real_code *)code; -pcre2_callout_enumerate_block cb; -PCRE2_SPTR cc; -#ifdef SUPPORT_UNICODE -BOOL utf; -#endif - -if (re == NULL) return PCRE2_ERROR_NULL; - -#ifdef SUPPORT_UNICODE -utf = (re->overall_options & PCRE2_UTF) != 0; -#endif - -/* Check that the first field in the block is the magic number. If it is not, -return with PCRE2_ERROR_BADMAGIC. 
*/ - -if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC; - -/* Check that this pattern was compiled in the correct bit mode */ - -if ((re->flags & (PCRE2_CODE_UNIT_WIDTH/8)) == 0) return PCRE2_ERROR_BADMODE; - -cb.version = 0; -cc = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) - + re->name_count * re->name_entry_size; - -while (TRUE) - { - int rc; - switch (*cc) - { - case OP_END: - return 0; - - case OP_CHAR: - case OP_CHARI: - case OP_NOT: - case OP_NOTI: - case OP_STAR: - case OP_MINSTAR: - case OP_PLUS: - case OP_MINPLUS: - case OP_QUERY: - case OP_MINQUERY: - case OP_UPTO: - case OP_MINUPTO: - case OP_EXACT: - case OP_POSSTAR: - case OP_POSPLUS: - case OP_POSQUERY: - case OP_POSUPTO: - case OP_STARI: - case OP_MINSTARI: - case OP_PLUSI: - case OP_MINPLUSI: - case OP_QUERYI: - case OP_MINQUERYI: - case OP_UPTOI: - case OP_MINUPTOI: - case OP_EXACTI: - case OP_POSSTARI: - case OP_POSPLUSI: - case OP_POSQUERYI: - case OP_POSUPTOI: - case OP_NOTSTAR: - case OP_NOTMINSTAR: - case OP_NOTPLUS: - case OP_NOTMINPLUS: - case OP_NOTQUERY: - case OP_NOTMINQUERY: - case OP_NOTUPTO: - case OP_NOTMINUPTO: - case OP_NOTEXACT: - case OP_NOTPOSSTAR: - case OP_NOTPOSPLUS: - case OP_NOTPOSQUERY: - case OP_NOTPOSUPTO: - case OP_NOTSTARI: - case OP_NOTMINSTARI: - case OP_NOTPLUSI: - case OP_NOTMINPLUSI: - case OP_NOTQUERYI: - case OP_NOTMINQUERYI: - case OP_NOTUPTOI: - case OP_NOTMINUPTOI: - case OP_NOTEXACTI: - case OP_NOTPOSSTARI: - case OP_NOTPOSPLUSI: - case OP_NOTPOSQUERYI: - case OP_NOTPOSUPTOI: - cc += PRIV(OP_lengths)[*cc]; -#ifdef SUPPORT_UNICODE - if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); -#endif - break; - - case OP_TYPESTAR: - case OP_TYPEMINSTAR: - case OP_TYPEPLUS: - case OP_TYPEMINPLUS: - case OP_TYPEQUERY: - case OP_TYPEMINQUERY: - case OP_TYPEUPTO: - case OP_TYPEMINUPTO: - case OP_TYPEEXACT: - case OP_TYPEPOSSTAR: - case OP_TYPEPOSPLUS: - case OP_TYPEPOSQUERY: - case OP_TYPEPOSUPTO: - cc += PRIV(OP_lengths)[*cc]; -#ifdef 
SUPPORT_UNICODE - if (cc[-1] == OP_PROP || cc[-1] == OP_NOTPROP) cc += 2; -#endif - break; - -#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8 - case OP_XCLASS: - cc += GET(cc, 1); - break; -#endif - - case OP_MARK: - case OP_COMMIT_ARG: - case OP_PRUNE_ARG: - case OP_SKIP_ARG: - case OP_THEN_ARG: - cc += PRIV(OP_lengths)[*cc] + cc[1]; - break; - - case OP_CALLOUT: - cb.pattern_position = GET(cc, 1); - cb.next_item_length = GET(cc, 1 + LINK_SIZE); - cb.callout_number = cc[1 + 2*LINK_SIZE]; - cb.callout_string_offset = 0; - cb.callout_string_length = 0; - cb.callout_string = NULL; - rc = callback(&cb, callout_data); - if (rc != 0) return rc; - cc += PRIV(OP_lengths)[*cc]; - break; - - case OP_CALLOUT_STR: - cb.pattern_position = GET(cc, 1); - cb.next_item_length = GET(cc, 1 + LINK_SIZE); - cb.callout_number = 0; - cb.callout_string_offset = GET(cc, 1 + 3*LINK_SIZE); - cb.callout_string_length = - GET(cc, 1 + 2*LINK_SIZE) - (1 + 4*LINK_SIZE) - 2; - cb.callout_string = cc + (1 + 4*LINK_SIZE) + 1; - rc = callback(&cb, callout_data); - if (rc != 0) return rc; - cc += GET(cc, 1 + 2*LINK_SIZE); - break; - - default: - cc += PRIV(OP_lengths)[*cc]; - break; - } - } -} - -/* End of pcre2_pattern_info.c */ diff --git a/pcre2/src/pcre2_printint.c b/pcre2/src/pcre2_printint.c deleted file mode 100644 index b9bab025a..000000000 --- a/pcre2/src/pcre2_printint.c +++ /dev/null @@ -1,836 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. 
- - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2019 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - - -/* This module contains a PCRE private debugging function for printing out the -internal form of a compiled regular expression, along with some supporting -local functions. 
This source file is #included in pcre2test.c at each supported -code unit width, with PCRE2_SUFFIX set appropriately, just like the functions -that comprise the library. It can also optionally be included in -pcre2_compile.c for detailed debugging in error situations. */ - - -/* Tables of operator names. The same 8-bit table is used for all code unit -widths, so it must be defined only once. The list itself is defined in -pcre2_internal.h, which is #included by pcre2test before this file. */ - -#ifndef OP_LISTS_DEFINED -static const char *OP_names[] = { OP_NAME_LIST }; -#define OP_LISTS_DEFINED -#endif - -/* The functions and tables herein must all have mode-dependent names. */ - -#define OP_lengths PCRE2_SUFFIX(OP_lengths_) -#define get_ucpname PCRE2_SUFFIX(get_ucpname_) -#define pcre2_printint PCRE2_SUFFIX(pcre2_printint_) -#define print_char PCRE2_SUFFIX(print_char_) -#define print_custring PCRE2_SUFFIX(print_custring_) -#define print_custring_bylen PCRE2_SUFFIX(print_custring_bylen_) -#define print_prop PCRE2_SUFFIX(print_prop_) - -/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that -the definition is next to the definition of the opcodes in pcre2_internal.h. -The contents of the table are, however, mode-dependent. */ - -static const uint8_t OP_lengths[] = { OP_LENGTHS }; - - - -/************************************************* -* Print one character from a string * -*************************************************/ - -/* In UTF mode the character may occupy more than one code unit. - -Arguments: - f file to write to - ptr pointer to first code unit of the character - utf TRUE if string is UTF (will be FALSE if UTF is not supported) - -Returns: number of additional code units used -*/ - -static unsigned int -print_char(FILE *f, PCRE2_SPTR ptr, BOOL utf) -{ -uint32_t c = *ptr; -BOOL one_code_unit = !utf; - -/* If UTF is supported and requested, check for a valid single code unit. 
*/ - -#ifdef SUPPORT_UNICODE -if (utf) - { -#if PCRE2_CODE_UNIT_WIDTH == 8 - one_code_unit = c < 0x80; -#elif PCRE2_CODE_UNIT_WIDTH == 16 - one_code_unit = (c & 0xfc00) != 0xd800; -#else - one_code_unit = (c & 0xfffff800u) != 0xd800u; -#endif /* CODE_UNIT_WIDTH */ - } -#endif /* SUPPORT_UNICODE */ - -/* Handle a valid one-code-unit character at any width. */ - -if (one_code_unit) - { - if (PRINTABLE(c)) fprintf(f, "%c", (char)c); - else if (c < 0x80) fprintf(f, "\\x%02x", c); - else fprintf(f, "\\x{%02x}", c); - return 0; - } - -/* Code for invalid UTF code units and multi-unit UTF characters is different -for each width. If UTF is not supported, control should never get here, but we -need a return statement to keep the compiler happy. */ - -#ifndef SUPPORT_UNICODE -return 0; -#else - -/* Malformed UTF-8 should occur only if the sanity check has been turned off. -Rather than swallow random bytes, just stop if we hit a bad one. Print it with -\X instead of \x as an indication. */ - -#if PCRE2_CODE_UNIT_WIDTH == 8 -if ((c & 0xc0) != 0xc0) - { - fprintf(f, "\\X{%x}", c); /* Invalid starting byte */ - return 0; - } -else - { - int i; - int a = PRIV(utf8_table4)[c & 0x3f]; /* Number of additional bytes */ - int s = 6*a; - c = (c & PRIV(utf8_table3)[a]) << s; - for (i = 1; i <= a; i++) - { - if ((ptr[i] & 0xc0) != 0x80) - { - fprintf(f, "\\X{%x}", c); /* Invalid secondary byte */ - return i - 1; - } - s -= 6; - c |= (ptr[i] & 0x3f) << s; - } - fprintf(f, "\\x{%x}", c); - return a; -} -#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ - -/* UTF-16: rather than swallow a low surrogate, just stop if we hit a bad one. -Print it with \X instead of \x as an indication. 
*/ - -#if PCRE2_CODE_UNIT_WIDTH == 16 -if ((ptr[1] & 0xfc00) != 0xdc00) - { - fprintf(f, "\\X{%x}", c); - return 0; - } -c = (((c & 0x3ff) << 10) | (ptr[1] & 0x3ff)) + 0x10000; -fprintf(f, "\\x{%x}", c); -return 1; -#endif /* PCRE2_CODE_UNIT_WIDTH == 16 */ - -/* For UTF-32 we get here only for a malformed code unit, which should only -occur if the sanity check has been turned off. Print it with \X instead of \x -as an indication. */ - -#if PCRE2_CODE_UNIT_WIDTH == 32 -fprintf(f, "\\X{%x}", c); -return 0; -#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */ -#endif /* SUPPORT_UNICODE */ -} - - - -/************************************************* -* Print string as a list of code units * -*************************************************/ - -/* These take no account of UTF as they always print each individual code unit. -The string is zero-terminated for print_custring(); the length is given for -print_custring_bylen(). - -Arguments: - f file to write to - ptr point to the string - len length for print_custring_bylen() - -Returns: nothing -*/ - -static void -print_custring(FILE *f, PCRE2_SPTR ptr) -{ -while (*ptr != '\0') - { - uint32_t c = *ptr++; - if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x{%x}", c); - } -} - -static void -print_custring_bylen(FILE *f, PCRE2_SPTR ptr, PCRE2_UCHAR len) -{ -for (; len > 0; len--) - { - uint32_t c = *ptr++; - if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x{%x}", c); - } -} - - - -/************************************************* -* Find Unicode property name * -*************************************************/ - -/* When there is no UTF/UCP support, the table of names does not exist. This -function should not be called in such configurations, because a pattern that -tries to use Unicode properties won't compile. Rather than put lots of #ifdefs -into the main code, however, we just put one into this function. 
*/ - -static const char * -get_ucpname(unsigned int ptype, unsigned int pvalue) -{ -#ifdef SUPPORT_UNICODE -int i; -for (i = PRIV(utt_size) - 1; i >= 0; i--) - { - if (ptype == PRIV(utt)[i].type && pvalue == PRIV(utt)[i].value) break; - } -return (i >= 0)? PRIV(utt_names) + PRIV(utt)[i].name_offset : "??"; -#else /* No UTF support */ -(void)ptype; -(void)pvalue; -return "??"; -#endif /* SUPPORT_UNICODE */ -} - - - -/************************************************* -* Print Unicode property value * -*************************************************/ - -/* "Normal" properties can be printed from tables. The PT_CLIST property is a -pseudo-property that contains a pointer to a list of case-equivalent -characters. - -Arguments: - f file to write to - code pointer in the compiled code - before text to print before - after text to print after - -Returns: nothing -*/ - -static void -print_prop(FILE *f, PCRE2_SPTR code, const char *before, const char *after) -{ -if (code[1] != PT_CLIST) - { - fprintf(f, "%s%s %s%s", before, OP_names[*code], get_ucpname(code[1], - code[2]), after); - } -else - { - const char *not = (*code == OP_PROP)? "" : "not "; - const uint32_t *p = PRIV(ucd_caseless_sets) + code[2]; - fprintf (f, "%s%sclist", before, not); - while (*p < NOTACHAR) fprintf(f, " %04x", *p++); - fprintf(f, "%s", after); - } -} - - - -/************************************************* -* Print compiled pattern * -*************************************************/ - -/* The print_lengths flag controls whether offsets and lengths of items are -printed. Lenths can be turned off from pcre2test so that automatic tests on -bytecode can be written that do not depend on the value of LINK_SIZE. 
- -Arguments: - re a compiled pattern - f the file to write to - print_lengths show various lengths - -Returns: nothing -*/ - -static void -pcre2_printint(pcre2_code *re, FILE *f, BOOL print_lengths) -{ -PCRE2_SPTR codestart, nametable, code; -uint32_t nesize = re->name_entry_size; -BOOL utf = (re->overall_options & PCRE2_UTF) != 0; - -nametable = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)); -code = codestart = nametable + re->name_count * re->name_entry_size; - -for(;;) - { - PCRE2_SPTR ccode; - uint32_t c; - int i; - const char *flag = " "; - unsigned int extra = 0; - - if (print_lengths) - fprintf(f, "%3d ", (int)(code - codestart)); - else - fprintf(f, " "); - - switch(*code) - { -/* ========================================================================== */ - /* These cases are never obeyed. This is a fudge that causes a compile- - time error if the vectors OP_names or OP_lengths, which are indexed - by opcode, are not the correct length. It seems to be the only way to do - such a check at compile time, as the sizeof() operator does not work in - the C preprocessor. 
*/ - - case OP_TABLE_LENGTH: - case OP_TABLE_LENGTH + - ((sizeof(OP_names)/sizeof(const char *) == OP_TABLE_LENGTH) && - (sizeof(OP_lengths) == OP_TABLE_LENGTH)): - return; -/* ========================================================================== */ - - case OP_END: - fprintf(f, " %s\n", OP_names[*code]); - fprintf(f, "------------------------------------------------------------------\n"); - return; - - case OP_CHAR: - fprintf(f, " "); - do - { - code++; - code += 1 + print_char(f, code, utf); - } - while (*code == OP_CHAR); - fprintf(f, "\n"); - continue; - - case OP_CHARI: - fprintf(f, " /i "); - do - { - code++; - code += 1 + print_char(f, code, utf); - } - while (*code == OP_CHARI); - fprintf(f, "\n"); - continue; - - case OP_CBRA: - case OP_CBRAPOS: - case OP_SCBRA: - case OP_SCBRAPOS: - if (print_lengths) fprintf(f, "%3d ", GET(code, 1)); - else fprintf(f, " "); - fprintf(f, "%s %d", OP_names[*code], GET2(code, 1+LINK_SIZE)); - break; - - case OP_BRA: - case OP_BRAPOS: - case OP_SBRA: - case OP_SBRAPOS: - case OP_KETRMAX: - case OP_KETRMIN: - case OP_KETRPOS: - case OP_ALT: - case OP_KET: - case OP_ASSERT: - case OP_ASSERT_NOT: - case OP_ASSERTBACK: - case OP_ASSERTBACK_NOT: - case OP_ASSERT_NA: - case OP_ASSERTBACK_NA: - case OP_ONCE: - case OP_SCRIPT_RUN: - case OP_COND: - case OP_SCOND: - case OP_REVERSE: - if (print_lengths) fprintf(f, "%3d ", GET(code, 1)); - else fprintf(f, " "); - fprintf(f, "%s", OP_names[*code]); - break; - - case OP_CLOSE: - fprintf(f, " %s %d", OP_names[*code], GET2(code, 1)); - break; - - case OP_CREF: - fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]); - break; - - case OP_DNCREF: - { - PCRE2_SPTR entry = nametable + (GET2(code, 1) * nesize) + IMM2_SIZE; - fprintf(f, " %s Cond ref <", flag); - print_custring(f, entry); - fprintf(f, ">%d", GET2(code, 1 + IMM2_SIZE)); - } - break; - - case OP_RREF: - c = GET2(code, 1); - if (c == RREF_ANY) - fprintf(f, " Cond recurse any"); - else - fprintf(f, " Cond recurse %d", c); - 
break; - - case OP_DNRREF: - { - PCRE2_SPTR entry = nametable + (GET2(code, 1) * nesize) + IMM2_SIZE; - fprintf(f, " %s Cond recurse <", flag); - print_custring(f, entry); - fprintf(f, ">%d", GET2(code, 1 + IMM2_SIZE)); - } - break; - - case OP_FALSE: - fprintf(f, " Cond false"); - break; - - case OP_TRUE: - fprintf(f, " Cond true"); - break; - - case OP_STARI: - case OP_MINSTARI: - case OP_POSSTARI: - case OP_PLUSI: - case OP_MINPLUSI: - case OP_POSPLUSI: - case OP_QUERYI: - case OP_MINQUERYI: - case OP_POSQUERYI: - flag = "/i"; - /* Fall through */ - case OP_STAR: - case OP_MINSTAR: - case OP_POSSTAR: - case OP_PLUS: - case OP_MINPLUS: - case OP_POSPLUS: - case OP_QUERY: - case OP_MINQUERY: - case OP_POSQUERY: - case OP_TYPESTAR: - case OP_TYPEMINSTAR: - case OP_TYPEPOSSTAR: - case OP_TYPEPLUS: - case OP_TYPEMINPLUS: - case OP_TYPEPOSPLUS: - case OP_TYPEQUERY: - case OP_TYPEMINQUERY: - case OP_TYPEPOSQUERY: - fprintf(f, " %s ", flag); - - if (*code >= OP_TYPESTAR) - { - if (code[1] == OP_PROP || code[1] == OP_NOTPROP) - { - print_prop(f, code + 1, "", " "); - extra = 2; - } - else fprintf(f, "%s", OP_names[code[1]]); - } - else extra = print_char(f, code+1, utf); - fprintf(f, "%s", OP_names[*code]); - break; - - case OP_EXACTI: - case OP_UPTOI: - case OP_MINUPTOI: - case OP_POSUPTOI: - flag = "/i"; - /* Fall through */ - case OP_EXACT: - case OP_UPTO: - case OP_MINUPTO: - case OP_POSUPTO: - fprintf(f, " %s ", flag); - extra = print_char(f, code + 1 + IMM2_SIZE, utf); - fprintf(f, "{"); - if (*code != OP_EXACT && *code != OP_EXACTI) fprintf(f, "0,"); - fprintf(f, "%d}", GET2(code,1)); - if (*code == OP_MINUPTO || *code == OP_MINUPTOI) fprintf(f, "?"); - else if (*code == OP_POSUPTO || *code == OP_POSUPTOI) fprintf(f, "+"); - break; - - case OP_TYPEEXACT: - case OP_TYPEUPTO: - case OP_TYPEMINUPTO: - case OP_TYPEPOSUPTO: - if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) - { - print_prop(f, code + IMM2_SIZE + 1, " ", " "); - extra = 2; - } - 
else fprintf(f, " %s", OP_names[code[1 + IMM2_SIZE]]); - fprintf(f, "{"); - if (*code != OP_TYPEEXACT) fprintf(f, "0,"); - fprintf(f, "%d}", GET2(code,1)); - if (*code == OP_TYPEMINUPTO) fprintf(f, "?"); - else if (*code == OP_TYPEPOSUPTO) fprintf(f, "+"); - break; - - case OP_NOTI: - flag = "/i"; - /* Fall through */ - case OP_NOT: - fprintf(f, " %s [^", flag); - extra = print_char(f, code + 1, utf); - fprintf(f, "]"); - break; - - case OP_NOTSTARI: - case OP_NOTMINSTARI: - case OP_NOTPOSSTARI: - case OP_NOTPLUSI: - case OP_NOTMINPLUSI: - case OP_NOTPOSPLUSI: - case OP_NOTQUERYI: - case OP_NOTMINQUERYI: - case OP_NOTPOSQUERYI: - flag = "/i"; - /* Fall through */ - - case OP_NOTSTAR: - case OP_NOTMINSTAR: - case OP_NOTPOSSTAR: - case OP_NOTPLUS: - case OP_NOTMINPLUS: - case OP_NOTPOSPLUS: - case OP_NOTQUERY: - case OP_NOTMINQUERY: - case OP_NOTPOSQUERY: - fprintf(f, " %s [^", flag); - extra = print_char(f, code + 1, utf); - fprintf(f, "]%s", OP_names[*code]); - break; - - case OP_NOTEXACTI: - case OP_NOTUPTOI: - case OP_NOTMINUPTOI: - case OP_NOTPOSUPTOI: - flag = "/i"; - /* Fall through */ - - case OP_NOTEXACT: - case OP_NOTUPTO: - case OP_NOTMINUPTO: - case OP_NOTPOSUPTO: - fprintf(f, " %s [^", flag); - extra = print_char(f, code + 1 + IMM2_SIZE, utf); - fprintf(f, "]{"); - if (*code != OP_NOTEXACT && *code != OP_NOTEXACTI) fprintf(f, "0,"); - fprintf(f, "%d}", GET2(code,1)); - if (*code == OP_NOTMINUPTO || *code == OP_NOTMINUPTOI) fprintf(f, "?"); - else - if (*code == OP_NOTPOSUPTO || *code == OP_NOTPOSUPTOI) fprintf(f, "+"); - break; - - case OP_RECURSE: - if (print_lengths) fprintf(f, "%3d ", GET(code, 1)); - else fprintf(f, " "); - fprintf(f, "%s", OP_names[*code]); - break; - - case OP_REFI: - flag = "/i"; - /* Fall through */ - case OP_REF: - fprintf(f, " %s \\%d", flag, GET2(code,1)); - ccode = code + OP_lengths[*code]; - goto CLASS_REF_REPEAT; - - case OP_DNREFI: - flag = "/i"; - /* Fall through */ - case OP_DNREF: - { - PCRE2_SPTR entry = nametable + 
(GET2(code, 1) * nesize) + IMM2_SIZE; - fprintf(f, " %s \\k<", flag); - print_custring(f, entry); - fprintf(f, ">%d", GET2(code, 1 + IMM2_SIZE)); - } - ccode = code + OP_lengths[*code]; - goto CLASS_REF_REPEAT; - - case OP_CALLOUT: - fprintf(f, " %s %d %d %d", OP_names[*code], code[1 + 2*LINK_SIZE], - GET(code, 1), GET(code, 1 + LINK_SIZE)); - break; - - case OP_CALLOUT_STR: - c = code[1 + 4*LINK_SIZE]; - fprintf(f, " %s %c", OP_names[*code], c); - extra = GET(code, 1 + 2*LINK_SIZE); - print_custring_bylen(f, code + 2 + 4*LINK_SIZE, extra - 3 - 4*LINK_SIZE); - for (i = 0; PRIV(callout_start_delims)[i] != 0; i++) - if (c == PRIV(callout_start_delims)[i]) - { - c = PRIV(callout_end_delims)[i]; - break; - } - fprintf(f, "%c %d %d %d", c, GET(code, 1 + 3*LINK_SIZE), GET(code, 1), - GET(code, 1 + LINK_SIZE)); - break; - - case OP_PROP: - case OP_NOTPROP: - print_prop(f, code, " ", ""); - break; - - /* OP_XCLASS cannot occur in 8-bit, non-UTF mode. However, there's no harm - in having this code always here, and it makes it less messy without all - those #ifdefs. */ - - case OP_CLASS: - case OP_NCLASS: - case OP_XCLASS: - { - unsigned int min, max; - BOOL printmap; - BOOL invertmap = FALSE; - uint8_t *map; - uint8_t inverted_map[32]; - - fprintf(f, " ["); - - if (*code == OP_XCLASS) - { - extra = GET(code, 1); - ccode = code + LINK_SIZE + 1; - printmap = (*ccode & XCL_MAP) != 0; - if ((*ccode & XCL_NOT) != 0) - { - invertmap = (*ccode & XCL_HASPROP) == 0; - fprintf(f, "^"); - } - ccode++; - } - else - { - printmap = TRUE; - ccode = code + 1; - } - - /* Print a bit map */ - - if (printmap) - { - map = (uint8_t *)ccode; - if (invertmap) - { - /* Using 255 ^ instead of ~ avoids clang sanitize warning. 
*/ - for (i = 0; i < 32; i++) inverted_map[i] = 255 ^ map[i]; - map = inverted_map; - } - - for (i = 0; i < 256; i++) - { - if ((map[i/8] & (1u << (i&7))) != 0) - { - int j; - for (j = i+1; j < 256; j++) - if ((map[j/8] & (1u << (j&7))) == 0) break; - if (i == '-' || i == ']') fprintf(f, "\\"); - if (PRINTABLE(i)) fprintf(f, "%c", i); - else fprintf(f, "\\x%02x", i); - if (--j > i) - { - if (j != i + 1) fprintf(f, "-"); - if (j == '-' || j == ']') fprintf(f, "\\"); - if (PRINTABLE(j)) fprintf(f, "%c", j); - else fprintf(f, "\\x%02x", j); - } - i = j; - } - } - ccode += 32 / sizeof(PCRE2_UCHAR); - } - - /* For an XCLASS there is always some additional data */ - - if (*code == OP_XCLASS) - { - PCRE2_UCHAR ch; - while ((ch = *ccode++) != XCL_END) - { - BOOL not = FALSE; - const char *notch = ""; - - switch(ch) - { - case XCL_NOTPROP: - not = TRUE; - notch = "^"; - /* Fall through */ - - case XCL_PROP: - { - unsigned int ptype = *ccode++; - unsigned int pvalue = *ccode++; - - switch(ptype) - { - case PT_PXGRAPH: - fprintf(f, "[:%sgraph:]", notch); - break; - - case PT_PXPRINT: - fprintf(f, "[:%sprint:]", notch); - break; - - case PT_PXPUNCT: - fprintf(f, "[:%spunct:]", notch); - break; - - default: - fprintf(f, "\\%c{%s}", (not? 'P':'p'), - get_ucpname(ptype, pvalue)); - break; - } - } - break; - - default: - ccode += 1 + print_char(f, ccode, utf); - if (ch == XCL_RANGE) - { - fprintf(f, "-"); - ccode += 1 + print_char(f, ccode, utf); - } - break; - } - } - } - - /* Indicate a non-UTF class which was created by negation */ - - fprintf(f, "]%s", (*code == OP_NCLASS)? 
" (neg)" : ""); - - /* Handle repeats after a class or a back reference */ - - CLASS_REF_REPEAT: - switch(*ccode) - { - case OP_CRSTAR: - case OP_CRMINSTAR: - case OP_CRPLUS: - case OP_CRMINPLUS: - case OP_CRQUERY: - case OP_CRMINQUERY: - case OP_CRPOSSTAR: - case OP_CRPOSPLUS: - case OP_CRPOSQUERY: - fprintf(f, "%s", OP_names[*ccode]); - extra += OP_lengths[*ccode]; - break; - - case OP_CRRANGE: - case OP_CRMINRANGE: - case OP_CRPOSRANGE: - min = GET2(ccode,1); - max = GET2(ccode,1 + IMM2_SIZE); - if (max == 0) fprintf(f, "{%u,}", min); - else fprintf(f, "{%u,%u}", min, max); - if (*ccode == OP_CRMINRANGE) fprintf(f, "?"); - else if (*ccode == OP_CRPOSRANGE) fprintf(f, "+"); - extra += OP_lengths[*ccode]; - break; - - /* Do nothing if it's not a repeat; this code stops picky compilers - warning about the lack of a default code path. */ - - default: - break; - } - } - break; - - case OP_MARK: - case OP_COMMIT_ARG: - case OP_PRUNE_ARG: - case OP_SKIP_ARG: - case OP_THEN_ARG: - fprintf(f, " %s ", OP_names[*code]); - print_custring_bylen(f, code + 2, code[1]); - extra += code[1]; - break; - - case OP_THEN: - fprintf(f, " %s", OP_names[*code]); - break; - - case OP_CIRCM: - case OP_DOLLM: - flag = "/m"; - /* Fall through */ - - /* Anything else is just an item with no data, but possibly a flag. */ - - default: - fprintf(f, " %s %s", flag, OP_names[*code]); - break; - } - - code += OP_lengths[*code] + extra; - fprintf(f, "\n"); - } -} - -/* End of pcre2_printint.c */ diff --git a/pcre2/src/pcre2_script_run.c b/pcre2/src/pcre2_script_run.c deleted file mode 100644 index 91a483302..000000000 --- a/pcre2/src/pcre2_script_run.c +++ /dev/null @@ -1,441 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. 
- - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2018 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - -/* This module contains the function for checking a script run. 
*/ - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include "pcre2_internal.h" - - -/************************************************* -* Check script run * -*************************************************/ - -/* A script run is conceptually a sequence of characters all in the same -Unicode script. However, it isn't quite that simple. There are special rules -for scripts that are commonly used together, and also special rules for digits. -This function implements the appropriate checks, which is possible only when -PCRE2 is compiled with Unicode support. The function returns TRUE if there is -no Unicode support; however, it should never be called in that circumstance -because an error is given by pcre2_compile() if a script run is called for in a -version of PCRE2 compiled without Unicode support. - -Arguments: - pgr point to the first character - endptr point after the last character - utf TRUE if in UTF mode - -Returns: TRUE if this is a valid script run -*/ - -/* These dummy values must be less than the negation of the largest offset in -the PRIV(ucd_script_sets) vector, which is held in a 16-bit field in UCD -records (and is only likely to be a few hundred). */ - -#define SCRIPT_UNSET (-99999) -#define SCRIPT_HANPENDING (-99998) -#define SCRIPT_HANHIRAKATA (-99997) -#define SCRIPT_HANBOPOMOFO (-99996) -#define SCRIPT_HANHANGUL (-99995) -#define SCRIPT_LIST (-99994) - -#define INTERSECTION_LIST_SIZE 50 - -BOOL -PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf) -{ -#ifdef SUPPORT_UNICODE -int require_script = SCRIPT_UNSET; -uint8_t intersection_list[INTERSECTION_LIST_SIZE]; -const uint8_t *require_list = NULL; -uint32_t require_digitset = 0; -uint32_t c; - -#if PCRE2_CODE_UNIT_WIDTH == 32 -(void)utf; /* Avoid compiler warning */ -#endif - -/* Any string containing fewer than 2 characters is a valid script run. 
*/ - -if (ptr >= endptr) return TRUE; -GETCHARINCTEST(c, ptr); -if (ptr >= endptr) return TRUE; - -/* Scan strings of two or more characters, checking the Unicode characteristics -of each code point. We make use of the Script Extensions property. There is -special code for scripts that can be combined with characters from the Han -Chinese script. This may be used in conjunction with four other scripts in -these combinations: - -. Han with Hiragana and Katakana is allowed (for Japanese). -. Han with Bopomofo is allowed (for Taiwanese Mandarin). -. Han with Hangul is allowed (for Korean). - -If the first significant character's script is one of the four, the required -script type is immediately known. However, if the first significant -character's script is Han, we have to keep checking for a non-Han character. -Hence the SCRIPT_HANPENDING state. */ - -for (;;) - { - const ucd_record *ucd = GET_UCD(c); - int32_t scriptx = ucd->scriptx; - - /* If the script extension is Unknown, the string is not a valid script run. - Such characters can only form script runs of length one. */ - - if (scriptx == ucp_Unknown) return FALSE; - - /* A character whose script extension is Inherited is always accepted with - any script, and plays no further part in this testing. A character whose - script is Common is always accepted, but must still be tested for a digit - below. The scriptx value at this point is non-zero, because zero is - ucp_Unknown, tested for above. */ - - if (scriptx != ucp_Inherited) - { - if (scriptx != ucp_Common) - { - /* If the script extension value is positive, the character is not a mark - that can be used with many scripts. In the simple case we either set or - compare with the required script. However, handling the scripts that can - combine with Han are more complicated, as is the case when the previous - characters have been man-script marks. 
*/ - - if (scriptx > 0) - { - switch(require_script) - { - /* Either the first significant character (require_script unset) or - after only Han characters. */ - - case SCRIPT_UNSET: - case SCRIPT_HANPENDING: - switch(scriptx) - { - case ucp_Han: - require_script = SCRIPT_HANPENDING; - break; - - case ucp_Hiragana: - case ucp_Katakana: - require_script = SCRIPT_HANHIRAKATA; - break; - - case ucp_Bopomofo: - require_script = SCRIPT_HANBOPOMOFO; - break; - - case ucp_Hangul: - require_script = SCRIPT_HANHANGUL; - break; - - /* Not a Han-related script. If expecting one, fail. Otherise set - the requirement to this script. */ - - default: - if (require_script == SCRIPT_HANPENDING) return FALSE; - require_script = scriptx; - break; - } - break; - - /* Previously encountered one of the "with Han" scripts. Check that - this character is appropriate. */ - - case SCRIPT_HANHIRAKATA: - if (scriptx != ucp_Han && scriptx != ucp_Hiragana && - scriptx != ucp_Katakana) - return FALSE; - break; - - case SCRIPT_HANBOPOMOFO: - if (scriptx != ucp_Han && scriptx != ucp_Bopomofo) return FALSE; - break; - - case SCRIPT_HANHANGUL: - if (scriptx != ucp_Han && scriptx != ucp_Hangul) return FALSE; - break; - - /* We have a list of scripts to check that is derived from one or - more previous characters. This is either one of the lists in - ucd_script_sets[] (for one previous character) or the intersection of - several lists for multiple characters. */ - - case SCRIPT_LIST: - { - const uint8_t *list; - for (list = require_list; *list != 0; list++) - { - if (*list == scriptx) break; - } - if (*list == 0) return FALSE; - } - - /* The rest of the string must be in this script, but we have to - allow for the Han complications. 
*/ - - switch(scriptx) - { - case ucp_Han: - require_script = SCRIPT_HANPENDING; - break; - - case ucp_Hiragana: - case ucp_Katakana: - require_script = SCRIPT_HANHIRAKATA; - break; - - case ucp_Bopomofo: - require_script = SCRIPT_HANBOPOMOFO; - break; - - case ucp_Hangul: - require_script = SCRIPT_HANHANGUL; - break; - - default: - require_script = scriptx; - break; - } - break; - - /* This is the easy case when a single script is required. */ - - default: - if (scriptx != require_script) return FALSE; - break; - } - } /* End of handing positive scriptx */ - - /* If scriptx is negative, this character is a mark-type character that - has a list of permitted scripts. */ - - else - { - uint32_t chspecial; - const uint8_t *clist, *rlist; - const uint8_t *list = PRIV(ucd_script_sets) - scriptx; - - switch(require_script) - { - case SCRIPT_UNSET: - require_list = PRIV(ucd_script_sets) - scriptx; - require_script = SCRIPT_LIST; - break; - - /* An inspection of the Unicode 11.0.0 files shows that there are the - following types of Script Extension list that involve the Han, - Bopomofo, Hiragana, Katakana, and Hangul scripts: - - . Bopomofo + Han - . Han + Hiragana + Katakana - . Hiragana + Katakana - . Bopopmofo + Hangul + Han + Hiragana + Katakana - - The following code tries to make sense of this. 
*/ - -#define FOUND_BOPOMOFO 1 -#define FOUND_HIRAGANA 2 -#define FOUND_KATAKANA 4 -#define FOUND_HANGUL 8 - - case SCRIPT_HANPENDING: - chspecial = 0; - for (; *list != 0; list++) - { - switch (*list) - { - case ucp_Bopomofo: chspecial |= FOUND_BOPOMOFO; break; - case ucp_Hiragana: chspecial |= FOUND_HIRAGANA; break; - case ucp_Katakana: chspecial |= FOUND_KATAKANA; break; - case ucp_Hangul: chspecial |= FOUND_HANGUL; break; - default: break; - } - } - - if (chspecial == 0) return FALSE; - - if (chspecial == FOUND_BOPOMOFO) - { - require_script = SCRIPT_HANBOPOMOFO; - } - else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA)) - { - require_script = SCRIPT_HANHIRAKATA; - } - - /* Otherwise it must be allowed with all of them, so remain in - the pending state. */ - - break; - - case SCRIPT_HANHIRAKATA: - for (; *list != 0; list++) - { - if (*list == ucp_Hiragana || *list == ucp_Katakana) break; - } - if (*list == 0) return FALSE; - break; - - case SCRIPT_HANBOPOMOFO: - for (; *list != 0; list++) - { - if (*list == ucp_Bopomofo) break; - } - if (*list == 0) return FALSE; - break; - - case SCRIPT_HANHANGUL: - for (; *list != 0; list++) - { - if (*list == ucp_Hangul) break; - } - if (*list == 0) return FALSE; - break; - - /* Previously encountered one or more characters that are allowed - with a list of scripts. Build the intersection of the required list - with this character's list in intersection_list[]. This code is - written so that it still works OK if the required list is already in - that vector. */ - - case SCRIPT_LIST: - { - int i = 0; - for (rlist = require_list; *rlist != 0; rlist++) - { - for (clist = list; *clist != 0; clist++) - { - if (*rlist == *clist) - { - intersection_list[i++] = *rlist; - break; - } - } - } - if (i == 0) return FALSE; /* No scripts in common */ - - /* If there's just one script in common, we can set it as the - unique required script. Otherwise, terminate the intersection list - and make it the required list. 
*/ - - if (i == 1) - { - require_script = intersection_list[0]; - } - else - { - intersection_list[i] = 0; - require_list = intersection_list; - } - } - break; - - /* The previously set required script is a single script, not - Han-related. Check that it is in this character's list. */ - - default: - for (; *list != 0; list++) - { - if (*list == require_script) break; - } - if (*list == 0) return FALSE; - break; - } - } /* End of handling negative scriptx */ - } /* End of checking non-Common character */ - - /* The character is in an acceptable script. We must now ensure that all - decimal digits in the string come from the same set. Some scripts (e.g. - Common, Arabic) have more than one set of decimal digits. This code does - not allow mixing sets, even within the same script. The vector called - PRIV(ucd_digit_sets)[] contains, in its first element, the number of - following elements, and then, in ascending order, the code points of the - '9' characters in every set of 10 digits. Each set is identified by the - offset in the vector of its '9' character. An initial check of the first - value picks up ASCII digits quickly. Otherwise, a binary chop is used. */ - - if (ucd->chartype == ucp_Nd) - { - uint32_t digitset; - - if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else - { - int mid; - int bot = 1; - int top = PRIV(ucd_digit_sets)[0]; - for (;;) - { - if (top <= bot + 1) /* <= rather than == is paranoia */ - { - digitset = top; - break; - } - mid = (top + bot) / 2; - if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid; - } - } - - /* A required value of 0 means "unset". */ - - if (require_digitset == 0) require_digitset = digitset; - else if (digitset != require_digitset) return FALSE; - } /* End digit handling */ - } /* End checking non-Inherited character */ - - /* If we haven't yet got to the end, pick up the next character. 
*/ - - if (ptr >= endptr) return TRUE; - GETCHARINCTEST(c, ptr); - } /* End checking loop */ - -#else /* NOT SUPPORT_UNICODE */ -(void)ptr; -(void)endptr; -(void)utf; -return TRUE; -#endif /* SUPPORT_UNICODE */ -} - -/* End of pcre2_script_run.c */ diff --git a/pcre2/src/pcre2_serialize.c b/pcre2/src/pcre2_serialize.c deleted file mode 100644 index ba17a26d2..000000000 --- a/pcre2/src/pcre2_serialize.c +++ /dev/null @@ -1,286 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2020 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - -/* This module contains functions for serializing and deserializing -a sequence of compiled codes. */ - - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - - -#include "pcre2_internal.h" - -/* Magic number to provide a small check against being handed junk. */ - -#define SERIALIZED_DATA_MAGIC 0x50523253u - -/* Deserialization is limited to the current PCRE version and -character width. */ - -#define SERIALIZED_DATA_VERSION \ - ((PCRE2_MAJOR) | ((PCRE2_MINOR) << 16)) - -#define SERIALIZED_DATA_CONFIG \ - (sizeof(PCRE2_UCHAR) | ((sizeof(void*)) << 8) | ((sizeof(PCRE2_SIZE)) << 16)) - - - -/************************************************* -* Serialize compiled patterns * -*************************************************/ - -PCRE2_EXP_DEFN int32_t PCRE2_CALL_CONVENTION -pcre2_serialize_encode(const pcre2_code **codes, int32_t number_of_codes, - uint8_t **serialized_bytes, PCRE2_SIZE *serialized_size, - pcre2_general_context *gcontext) -{ -uint8_t *bytes; -uint8_t *dst_bytes; -int32_t i; -PCRE2_SIZE total_size; -const pcre2_real_code *re; -const uint8_t *tables; -pcre2_serialized_data *data; - -const pcre2_memctl *memctl = (gcontext != NULL) ? 
- &gcontext->memctl : &PRIV(default_compile_context).memctl; - -if (codes == NULL || serialized_bytes == NULL || serialized_size == NULL) - return PCRE2_ERROR_NULL; - -if (number_of_codes <= 0) return PCRE2_ERROR_BADDATA; - -/* Compute total size. */ -total_size = sizeof(pcre2_serialized_data) + TABLES_LENGTH; -tables = NULL; - -for (i = 0; i < number_of_codes; i++) - { - if (codes[i] == NULL) return PCRE2_ERROR_NULL; - re = (const pcre2_real_code *)(codes[i]); - if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC; - if (tables == NULL) - tables = re->tables; - else if (tables != re->tables) - return PCRE2_ERROR_MIXEDTABLES; - total_size += re->blocksize; - } - -/* Initialize the byte stream. */ -bytes = memctl->malloc(total_size + sizeof(pcre2_memctl), memctl->memory_data); -if (bytes == NULL) return PCRE2_ERROR_NOMEMORY; - -/* The controller is stored as a hidden parameter. */ -memcpy(bytes, memctl, sizeof(pcre2_memctl)); -bytes += sizeof(pcre2_memctl); - -data = (pcre2_serialized_data *)bytes; -data->magic = SERIALIZED_DATA_MAGIC; -data->version = SERIALIZED_DATA_VERSION; -data->config = SERIALIZED_DATA_CONFIG; -data->number_of_codes = number_of_codes; - -/* Copy all compiled code data. */ -dst_bytes = bytes + sizeof(pcre2_serialized_data); -memcpy(dst_bytes, tables, TABLES_LENGTH); -dst_bytes += TABLES_LENGTH; - -for (i = 0; i < number_of_codes; i++) - { - re = (const pcre2_real_code *)(codes[i]); - (void)memcpy(dst_bytes, (char *)re, re->blocksize); - - /* Certain fields in the compiled code block are re-set during - deserialization. In order to ensure that the serialized data stream is always - the same for the same pattern, set them to zero here. We can't assume the - copy of the pattern is correctly aligned for accessing the fields as part of - a structure. Note the use of sizeof(void *) in the second of these, to - specify the size of a pointer. 
If sizeof(uint8_t *) is used (tables is a - pointer to uint8_t), gcc gives a warning because the first argument is also a - pointer to uint8_t. Casting the first argument to (void *) can stop this, but - it didn't stop Coverity giving the same complaint. */ - - (void)memset(dst_bytes + offsetof(pcre2_real_code, memctl), 0, - sizeof(pcre2_memctl)); - (void)memset(dst_bytes + offsetof(pcre2_real_code, tables), 0, - sizeof(void *)); - (void)memset(dst_bytes + offsetof(pcre2_real_code, executable_jit), 0, - sizeof(void *)); - - dst_bytes += re->blocksize; - } - -*serialized_bytes = bytes; -*serialized_size = total_size; -return number_of_codes; -} - - -/************************************************* -* Deserialize compiled patterns * -*************************************************/ - -PCRE2_EXP_DEFN int32_t PCRE2_CALL_CONVENTION -pcre2_serialize_decode(pcre2_code **codes, int32_t number_of_codes, - const uint8_t *bytes, pcre2_general_context *gcontext) -{ -const pcre2_serialized_data *data = (const pcre2_serialized_data *)bytes; -const pcre2_memctl *memctl = (gcontext != NULL) ? - &gcontext->memctl : &PRIV(default_compile_context).memctl; - -const uint8_t *src_bytes; -pcre2_real_code *dst_re; -uint8_t *tables; -int32_t i, j; - -/* Sanity checks. */ - -if (data == NULL || codes == NULL) return PCRE2_ERROR_NULL; -if (number_of_codes <= 0) return PCRE2_ERROR_BADDATA; -if (data->number_of_codes <= 0) return PCRE2_ERROR_BADSERIALIZEDDATA; -if (data->magic != SERIALIZED_DATA_MAGIC) return PCRE2_ERROR_BADMAGIC; -if (data->version != SERIALIZED_DATA_VERSION) return PCRE2_ERROR_BADMODE; -if (data->config != SERIALIZED_DATA_CONFIG) return PCRE2_ERROR_BADMODE; - -if (number_of_codes > data->number_of_codes) - number_of_codes = data->number_of_codes; - -src_bytes = bytes + sizeof(pcre2_serialized_data); - -/* Decode tables. The reference count for the tables is stored immediately -following them. 
*/ - -tables = memctl->malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE), memctl->memory_data); -if (tables == NULL) return PCRE2_ERROR_NOMEMORY; - -memcpy(tables, src_bytes, TABLES_LENGTH); -*(PCRE2_SIZE *)(tables + TABLES_LENGTH) = number_of_codes; -src_bytes += TABLES_LENGTH; - -/* Decode the byte stream. We must not try to read the size from the compiled -code block in the stream, because it might be unaligned, which causes errors on -hardware such as Sparc-64 that doesn't like unaligned memory accesses. The type -of the blocksize field is given its own name to ensure that it is the same here -as in the block. */ - -for (i = 0; i < number_of_codes; i++) - { - CODE_BLOCKSIZE_TYPE blocksize; - memcpy(&blocksize, src_bytes + offsetof(pcre2_real_code, blocksize), - sizeof(CODE_BLOCKSIZE_TYPE)); - if (blocksize <= sizeof(pcre2_real_code)) - return PCRE2_ERROR_BADSERIALIZEDDATA; - - /* The allocator provided by gcontext replaces the original one. */ - - dst_re = (pcre2_real_code *)PRIV(memctl_malloc)(blocksize, - (pcre2_memctl *)gcontext); - if (dst_re == NULL) - { - memctl->free(tables, memctl->memory_data); - for (j = 0; j < i; j++) - { - memctl->free(codes[j], memctl->memory_data); - codes[j] = NULL; - } - return PCRE2_ERROR_NOMEMORY; - } - - /* The new allocator must be preserved. */ - - memcpy(((uint8_t *)dst_re) + sizeof(pcre2_memctl), - src_bytes + sizeof(pcre2_memctl), blocksize - sizeof(pcre2_memctl)); - if (dst_re->magic_number != MAGIC_NUMBER || - dst_re->name_entry_size > MAX_NAME_SIZE + IMM2_SIZE + 1 || - dst_re->name_count > MAX_NAME_COUNT) - { - memctl->free(dst_re, memctl->memory_data); - return PCRE2_ERROR_BADSERIALIZEDDATA; - } - - /* At the moment only one table is supported. 
*/ - - dst_re->tables = tables; - dst_re->executable_jit = NULL; - dst_re->flags |= PCRE2_DEREF_TABLES; - - codes[i] = dst_re; - src_bytes += blocksize; - } - -return number_of_codes; -} - - -/************************************************* -* Get the number of serialized patterns * -*************************************************/ - -PCRE2_EXP_DEFN int32_t PCRE2_CALL_CONVENTION -pcre2_serialize_get_number_of_codes(const uint8_t *bytes) -{ -const pcre2_serialized_data *data = (const pcre2_serialized_data *)bytes; - -if (data == NULL) return PCRE2_ERROR_NULL; -if (data->magic != SERIALIZED_DATA_MAGIC) return PCRE2_ERROR_BADMAGIC; -if (data->version != SERIALIZED_DATA_VERSION) return PCRE2_ERROR_BADMODE; -if (data->config != SERIALIZED_DATA_CONFIG) return PCRE2_ERROR_BADMODE; - -return data->number_of_codes; -} - - -/************************************************* -* Free the allocated stream * -*************************************************/ - -PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION -pcre2_serialize_free(uint8_t *bytes) -{ -if (bytes != NULL) - { - pcre2_memctl *memctl = (pcre2_memctl *)(bytes - sizeof(pcre2_memctl)); - memctl->free(memctl, memctl->memory_data); - } -} - -/* End of pcre2_serialize.c */ diff --git a/pcre2/src/pcre2_string_utils.c b/pcre2/src/pcre2_string_utils.c deleted file mode 100644 index d6be01acf..000000000 --- a/pcre2/src/pcre2_string_utils.c +++ /dev/null @@ -1,237 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. 
- - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2018 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - -/* This module contains internal functions for comparing and finding the length -of strings. These are used instead of strcmp() etc because the standard -functions work only on 8-bit data. 
*/ - - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include "pcre2_internal.h" - - -/************************************************* -* Emulated memmove() for systems without it * -*************************************************/ - -/* This function can make use of bcopy() if it is available. Otherwise do it by -steam, as there some non-Unix environments that lack both memmove() and -bcopy(). */ - -#if !defined(VPCOMPAT) && !defined(HAVE_MEMMOVE) -void * -PRIV(memmove)(void *d, const void *s, size_t n) -{ -#ifdef HAVE_BCOPY -bcopy(s, d, n); -return d; -#else -size_t i; -unsigned char *dest = (unsigned char *)d; -const unsigned char *src = (const unsigned char *)s; -if (dest > src) - { - dest += n; - src += n; - for (i = 0; i < n; ++i) *(--dest) = *(--src); - return (void *)dest; - } -else - { - for (i = 0; i < n; ++i) *dest++ = *src++; - return (void *)(dest - n); - } -#endif /* not HAVE_BCOPY */ -} -#endif /* not VPCOMPAT && not HAVE_MEMMOVE */ - - -/************************************************* -* Compare two zero-terminated PCRE2 strings * -*************************************************/ - -/* -Arguments: - str1 first string - str2 second string - -Returns: 0, 1, or -1 -*/ - -int -PRIV(strcmp)(PCRE2_SPTR str1, PCRE2_SPTR str2) -{ -PCRE2_UCHAR c1, c2; -while (*str1 != '\0' || *str2 != '\0') - { - c1 = *str1++; - c2 = *str2++; - if (c1 != c2) return ((c1 > c2) << 1) - 1; - } -return 0; -} - - -/************************************************* -* Compare zero-terminated PCRE2 & 8-bit strings * -*************************************************/ - -/* As the 8-bit string is almost always a literal, its type is specified as -const char *. 
- -Arguments: - str1 first string - str2 second string - -Returns: 0, 1, or -1 -*/ - -int -PRIV(strcmp_c8)(PCRE2_SPTR str1, const char *str2) -{ -PCRE2_UCHAR c1, c2; -while (*str1 != '\0' || *str2 != '\0') - { - c1 = *str1++; - c2 = *str2++; - if (c1 != c2) return ((c1 > c2) << 1) - 1; - } -return 0; -} - - -/************************************************* -* Compare two PCRE2 strings, given a length * -*************************************************/ - -/* -Arguments: - str1 first string - str2 second string - len the length - -Returns: 0, 1, or -1 -*/ - -int -PRIV(strncmp)(PCRE2_SPTR str1, PCRE2_SPTR str2, size_t len) -{ -PCRE2_UCHAR c1, c2; -for (; len > 0; len--) - { - c1 = *str1++; - c2 = *str2++; - if (c1 != c2) return ((c1 > c2) << 1) - 1; - } -return 0; -} - - -/************************************************* -* Compare PCRE2 string to 8-bit string by length * -*************************************************/ - -/* As the 8-bit string is almost always a literal, its type is specified as -const char *. 
- -Arguments: - str1 first string - str2 second string - len the length - -Returns: 0, 1, or -1 -*/ - -int -PRIV(strncmp_c8)(PCRE2_SPTR str1, const char *str2, size_t len) -{ -PCRE2_UCHAR c1, c2; -for (; len > 0; len--) - { - c1 = *str1++; - c2 = *str2++; - if (c1 != c2) return ((c1 > c2) << 1) - 1; - } -return 0; -} - - -/************************************************* -* Find the length of a PCRE2 string * -*************************************************/ - -/* -Argument: the string -Returns: the length -*/ - -PCRE2_SIZE -PRIV(strlen)(PCRE2_SPTR str) -{ -PCRE2_SIZE c = 0; -while (*str++ != 0) c++; -return c; -} - - -/************************************************* -* Copy 8-bit 0-terminated string to PCRE2 string * -*************************************************/ - -/* Arguments: - str1 buffer to receive the string - str2 8-bit string to be copied - -Returns: the number of code units used (excluding trailing zero) -*/ - -PCRE2_SIZE -PRIV(strcpy_c8)(PCRE2_UCHAR *str1, const char *str2) -{ -PCRE2_UCHAR *t = str1; -while (*str2 != 0) *t++ = *str2++; -*t = 0; -return t - str1; -} - -/* End of pcre2_string_utils.c */ diff --git a/pcre2/src/pcre2_study.c b/pcre2/src/pcre2_study.c deleted file mode 100644 index 9bbb37570..000000000 --- a/pcre2/src/pcre2_study.c +++ /dev/null @@ -1,1825 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. 
- - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2020 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - -/* This module contains functions for scanning a compiled pattern and -collecting data (e.g. minimum matching length). 
*/ - - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include "pcre2_internal.h" - -/* The maximum remembered capturing brackets minimum. */ - -#define MAX_CACHE_BACKREF 128 - -/* Set a bit in the starting code unit bit map. */ - -#define SET_BIT(c) re->start_bitmap[(c)/8] |= (1u << ((c)&7)) - -/* Returns from set_start_bits() */ - -enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN, SSB_TOODEEP }; - - -/************************************************* -* Find the minimum subject length for a group * -*************************************************/ - -/* Scan a parenthesized group and compute the minimum length of subject that -is needed to match it. This is a lower bound; it does not mean there is a -string of that length that matches. In UTF mode, the result is in characters -rather than code units. The field in a compiled pattern for storing the minimum -length is 16-bits long (on the grounds that anything longer than that is -pathological), so we give up when we reach that amount. This also means that -integer overflow for really crazy patterns cannot happen. - -Backreference minimum lengths are cached to speed up multiple references. This -function is called only when the highest back reference in the pattern is less -than or equal to MAX_CACHE_BACKREF, which is one less than the size of the -caching vector. The zeroth element contains the number of the highest set -value. - -Arguments: - re compiled pattern block - code pointer to start of group (the bracket) - startcode pointer to start of the whole pattern's code - utf UTF flag - recurses chain of recurse_check to catch mutual recursion - countptr pointer to call count (to catch over complexity) - backref_cache vector for caching back references. - -This function is no longer called when the pattern contains (*ACCEPT); however, -the old code for returning -1 is retained, just in case. 
- -Returns: the minimum length - -1 \C in UTF-8 mode - or (*ACCEPT) - or pattern too complicated - -2 internal error (missing capturing bracket) - -3 internal error (opcode not listed) -*/ - -static int -find_minlength(const pcre2_real_code *re, PCRE2_SPTR code, - PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr, - int *backref_cache) -{ -int length = -1; -int branchlength = 0; -int prev_cap_recno = -1; -int prev_cap_d = 0; -int prev_recurse_recno = -1; -int prev_recurse_d = 0; -uint32_t once_fudge = 0; -BOOL had_recurse = FALSE; -BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0; -PCRE2_SPTR nextbranch = code + GET(code, 1); -PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE; -recurse_check this_recurse; - -/* If this is a "could be empty" group, its minimum length is 0. */ - -if (*code >= OP_SBRA && *code <= OP_SCOND) return 0; - -/* Skip over capturing bracket number */ - -if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE; - -/* A large and/or complex regex can take too long to process. */ - -if ((*countptr)++ > 1000) return -1; - -/* Scan along the opcodes for this branch. If we get to the end of the branch, -check the length against that of the other branches. If the accumulated length -passes 16-bits, reset to that value and skip the rest of the branch. */ - -for (;;) - { - int d, min, recno; - PCRE2_UCHAR op, *cs, *ce; - - if (branchlength >= UINT16_MAX) - { - branchlength = UINT16_MAX; - cc = (PCRE2_UCHAR *)nextbranch; - } - - op = *cc; - switch (op) - { - case OP_COND: - case OP_SCOND: - - /* If there is only one branch in a condition, the implied branch has zero - length, so we don't add anything. This covers the DEFINE "condition" - automatically. If there are two branches we can treat it the same as any - other non-capturing subpattern. 
*/ - - cs = cc + GET(cc, 1); - if (*cs != OP_ALT) - { - cc = cs + 1 + LINK_SIZE; - break; - } - goto PROCESS_NON_CAPTURE; - - case OP_BRA: - /* There's a special case of OP_BRA, when it is wrapped round a repeated - OP_RECURSE. We'd like to process the latter at this level so that - remembering the value works for repeated cases. So we do nothing, but - set a fudge value to skip over the OP_KET after the recurse. */ - - if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET) - { - once_fudge = 1 + LINK_SIZE; - cc += 1 + LINK_SIZE; - break; - } - /* Fall through */ - - case OP_ONCE: - case OP_SCRIPT_RUN: - case OP_SBRA: - case OP_BRAPOS: - case OP_SBRAPOS: - PROCESS_NON_CAPTURE: - d = find_minlength(re, cc, startcode, utf, recurses, countptr, - backref_cache); - if (d < 0) return d; - branchlength += d; - do cc += GET(cc, 1); while (*cc == OP_ALT); - cc += 1 + LINK_SIZE; - break; - - /* To save time for repeated capturing subpatterns, we remember the - length of the previous one. Unfortunately we can't do the same for - the unnumbered ones above. Nor can we do this if (?| is present in the - pattern because captures with the same number are not then identical. */ - - case OP_CBRA: - case OP_SCBRA: - case OP_CBRAPOS: - case OP_SCBRAPOS: - recno = (int)GET2(cc, 1+LINK_SIZE); - if (dupcapused || recno != prev_cap_recno) - { - prev_cap_recno = recno; - prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr, - backref_cache); - if (prev_cap_d < 0) return prev_cap_d; - } - branchlength += prev_cap_d; - do cc += GET(cc, 1); while (*cc == OP_ALT); - cc += 1 + LINK_SIZE; - break; - - /* ACCEPT makes things far too complicated; we have to give up. In fact, - from 10.34 onwards, if a pattern contains (*ACCEPT), this function is not - used. However, leave the code in place, just in case. */ - - case OP_ACCEPT: - case OP_ASSERT_ACCEPT: - return -1; - - /* Reached end of a branch; if it's a ket it is the end of a nested - call. 
If it's ALT it is an alternation in a nested call. If it is END it's - the end of the outer call. All can be handled by the same code. If the - length of any branch is zero, there is no need to scan any subsequent - branches. */ - - case OP_ALT: - case OP_KET: - case OP_KETRMAX: - case OP_KETRMIN: - case OP_KETRPOS: - case OP_END: - if (length < 0 || (!had_recurse && branchlength < length)) - length = branchlength; - if (op != OP_ALT || length == 0) return length; - nextbranch = cc + GET(cc, 1); - cc += 1 + LINK_SIZE; - branchlength = 0; - had_recurse = FALSE; - break; - - /* Skip over assertive subpatterns */ - - case OP_ASSERT: - case OP_ASSERT_NOT: - case OP_ASSERTBACK: - case OP_ASSERTBACK_NOT: - case OP_ASSERT_NA: - case OP_ASSERTBACK_NA: - do cc += GET(cc, 1); while (*cc == OP_ALT); - /* Fall through */ - - /* Skip over things that don't match chars */ - - case OP_REVERSE: - case OP_CREF: - case OP_DNCREF: - case OP_RREF: - case OP_DNRREF: - case OP_FALSE: - case OP_TRUE: - case OP_CALLOUT: - case OP_SOD: - case OP_SOM: - case OP_EOD: - case OP_EODN: - case OP_CIRC: - case OP_CIRCM: - case OP_DOLL: - case OP_DOLLM: - case OP_NOT_WORD_BOUNDARY: - case OP_WORD_BOUNDARY: - cc += PRIV(OP_lengths)[*cc]; - break; - - case OP_CALLOUT_STR: - cc += GET(cc, 1 + 2*LINK_SIZE); - break; - - /* Skip over a subpattern that has a {0} or {0,x} quantifier */ - - case OP_BRAZERO: - case OP_BRAMINZERO: - case OP_BRAPOSZERO: - case OP_SKIPZERO: - cc += PRIV(OP_lengths)[*cc]; - do cc += GET(cc, 1); while (*cc == OP_ALT); - cc += 1 + LINK_SIZE; - break; - - /* Handle literal characters and + repetitions */ - - case OP_CHAR: - case OP_CHARI: - case OP_NOT: - case OP_NOTI: - case OP_PLUS: - case OP_PLUSI: - case OP_MINPLUS: - case OP_MINPLUSI: - case OP_POSPLUS: - case OP_POSPLUSI: - case OP_NOTPLUS: - case OP_NOTPLUSI: - case OP_NOTMINPLUS: - case OP_NOTMINPLUSI: - case OP_NOTPOSPLUS: - case OP_NOTPOSPLUSI: - branchlength++; - cc += 2; -#ifdef SUPPORT_UNICODE - if (utf && 
HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); -#endif - break; - - case OP_TYPEPLUS: - case OP_TYPEMINPLUS: - case OP_TYPEPOSPLUS: - branchlength++; - cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2; - break; - - /* Handle exact repetitions. The count is already in characters, but we - may need to skip over a multibyte character in UTF mode. */ - - case OP_EXACT: - case OP_EXACTI: - case OP_NOTEXACT: - case OP_NOTEXACTI: - branchlength += GET2(cc,1); - cc += 2 + IMM2_SIZE; -#ifdef SUPPORT_UNICODE - if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); -#endif - break; - - case OP_TYPEEXACT: - branchlength += GET2(cc,1); - cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP - || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0); - break; - - /* Handle single-char non-literal matchers */ - - case OP_PROP: - case OP_NOTPROP: - cc += 2; - /* Fall through */ - - case OP_NOT_DIGIT: - case OP_DIGIT: - case OP_NOT_WHITESPACE: - case OP_WHITESPACE: - case OP_NOT_WORDCHAR: - case OP_WORDCHAR: - case OP_ANY: - case OP_ALLANY: - case OP_EXTUNI: - case OP_HSPACE: - case OP_NOT_HSPACE: - case OP_VSPACE: - case OP_NOT_VSPACE: - branchlength++; - cc++; - break; - - /* "Any newline" might match two characters, but it also might match just - one. */ - - case OP_ANYNL: - branchlength += 1; - cc++; - break; - - /* The single-byte matcher means we can't proceed in UTF mode. (In - non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever - appear, but leave the code, just in case.) */ - - case OP_ANYBYTE: -#ifdef SUPPORT_UNICODE - if (utf) return -1; -#endif - branchlength++; - cc++; - break; - - /* For repeated character types, we have to test for \p and \P, which have - an extra two bytes of parameters. 
*/ - - case OP_TYPESTAR: - case OP_TYPEMINSTAR: - case OP_TYPEQUERY: - case OP_TYPEMINQUERY: - case OP_TYPEPOSSTAR: - case OP_TYPEPOSQUERY: - if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2; - cc += PRIV(OP_lengths)[op]; - break; - - case OP_TYPEUPTO: - case OP_TYPEMINUPTO: - case OP_TYPEPOSUPTO: - if (cc[1 + IMM2_SIZE] == OP_PROP - || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2; - cc += PRIV(OP_lengths)[op]; - break; - - /* Check a class for variable quantification */ - - case OP_CLASS: - case OP_NCLASS: -#ifdef SUPPORT_WIDE_CHARS - case OP_XCLASS: - /* The original code caused an unsigned overflow in 64 bit systems, - so now we use a conditional statement. */ - if (op == OP_XCLASS) - cc += GET(cc, 1); - else - cc += PRIV(OP_lengths)[OP_CLASS]; -#else - cc += PRIV(OP_lengths)[OP_CLASS]; -#endif - - switch (*cc) - { - case OP_CRPLUS: - case OP_CRMINPLUS: - case OP_CRPOSPLUS: - branchlength++; - /* Fall through */ - - case OP_CRSTAR: - case OP_CRMINSTAR: - case OP_CRQUERY: - case OP_CRMINQUERY: - case OP_CRPOSSTAR: - case OP_CRPOSQUERY: - cc++; - break; - - case OP_CRRANGE: - case OP_CRMINRANGE: - case OP_CRPOSRANGE: - branchlength += GET2(cc,1); - cc += 1 + 2 * IMM2_SIZE; - break; - - default: - branchlength++; - break; - } - break; - - /* Backreferences and subroutine calls (OP_RECURSE) are treated in the same - way: we find the minimum length for the subpattern. A recursion - (backreference or subroutine) causes an a flag to be set that causes the - length of this branch to be ignored. The logic is that a recursion can only - make sense if there is another alternative that stops the recursing. That - will provide the minimum length (when no recursion happens). - - If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket - matches an empty string (by default it causes a matching failure), so in - that case we must set the minimum length to zero. 
- - For backreferenes, if duplicate numbers are present in the pattern we check - for a reference to a duplicate. If it is, we don't know which version will - be referenced, so we have to set the minimum length to zero. */ - - /* Duplicate named pattern back reference. */ - - case OP_DNREF: - case OP_DNREFI: - if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0) - { - int count = GET2(cc, 1+IMM2_SIZE); - PCRE2_UCHAR *slot = - (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) + - GET2(cc, 1) * re->name_entry_size; - - d = INT_MAX; - - /* Scan all groups with the same name; find the shortest. */ - - while (count-- > 0) - { - int dd, i; - recno = GET2(slot, 0); - - if (recno <= backref_cache[0] && backref_cache[recno] >= 0) - dd = backref_cache[recno]; - else - { - ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno); - if (cs == NULL) return -2; - do ce += GET(ce, 1); while (*ce == OP_ALT); - - dd = 0; - if (!dupcapused || - (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL) - { - if (cc > cs && cc < ce) /* Simple recursion */ - { - had_recurse = TRUE; - } - else - { - recurse_check *r = recurses; - for (r = recurses; r != NULL; r = r->prev) - if (r->group == cs) break; - if (r != NULL) /* Mutual recursion */ - { - had_recurse = TRUE; - } - else - { - this_recurse.prev = recurses; /* No recursion */ - this_recurse.group = cs; - dd = find_minlength(re, cs, startcode, utf, &this_recurse, - countptr, backref_cache); - if (dd < 0) return dd; - } - } - } - - backref_cache[recno] = dd; - for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1; - backref_cache[0] = recno; - } - - if (dd < d) d = dd; - if (d <= 0) break; /* No point looking at any more */ - slot += re->name_entry_size; - } - } - else d = 0; - cc += 1 + 2*IMM2_SIZE; - goto REPEAT_BACK_REFERENCE; - - /* Single back reference by number. References by name are converted to by - number when there is no duplication. 
*/ - - case OP_REF: - case OP_REFI: - recno = GET2(cc, 1); - if (recno <= backref_cache[0] && backref_cache[recno] >= 0) - d = backref_cache[recno]; - else - { - int i; - d = 0; - - if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0) - { - ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno); - if (cs == NULL) return -2; - do ce += GET(ce, 1); while (*ce == OP_ALT); - - if (!dupcapused || - (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL) - { - if (cc > cs && cc < ce) /* Simple recursion */ - { - had_recurse = TRUE; - } - else - { - recurse_check *r = recurses; - for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; - if (r != NULL) /* Mutual recursion */ - { - had_recurse = TRUE; - } - else /* No recursion */ - { - this_recurse.prev = recurses; - this_recurse.group = cs; - d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr, - backref_cache); - if (d < 0) return d; - } - } - } - } - - backref_cache[recno] = d; - for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1; - backref_cache[0] = recno; - } - - cc += 1 + IMM2_SIZE; - - /* Handle repeated back references */ - - REPEAT_BACK_REFERENCE: - switch (*cc) - { - case OP_CRSTAR: - case OP_CRMINSTAR: - case OP_CRQUERY: - case OP_CRMINQUERY: - case OP_CRPOSSTAR: - case OP_CRPOSQUERY: - min = 0; - cc++; - break; - - case OP_CRPLUS: - case OP_CRMINPLUS: - case OP_CRPOSPLUS: - min = 1; - cc++; - break; - - case OP_CRRANGE: - case OP_CRMINRANGE: - case OP_CRPOSRANGE: - min = GET2(cc, 1); - cc += 1 + 2 * IMM2_SIZE; - break; - - default: - min = 1; - break; - } - - /* Take care not to overflow: (1) min and d are ints, so check that their - product is not greater than INT_MAX. (2) branchlength is limited to - UINT16_MAX (checked at the top of the loop). 
*/ - - if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d) - branchlength = UINT16_MAX; - else branchlength += min * d; - break; - - /* Recursion always refers to the first occurrence of a subpattern with a - given number. Therefore, we can always make use of caching, even when the - pattern contains multiple subpatterns with the same number. */ - - case OP_RECURSE: - cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1); - recno = GET2(cs, 1+LINK_SIZE); - if (recno == prev_recurse_recno) - { - branchlength += prev_recurse_d; - } - else - { - do ce += GET(ce, 1); while (*ce == OP_ALT); - if (cc > cs && cc < ce) /* Simple recursion */ - had_recurse = TRUE; - else - { - recurse_check *r = recurses; - for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; - if (r != NULL) /* Mutual recursion */ - had_recurse = TRUE; - else - { - this_recurse.prev = recurses; - this_recurse.group = cs; - prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse, - countptr, backref_cache); - if (prev_recurse_d < 0) return prev_recurse_d; - prev_recurse_recno = recno; - branchlength += prev_recurse_d; - } - } - } - cc += 1 + LINK_SIZE + once_fudge; - once_fudge = 0; - break; - - /* Anything else does not or need not match a character. We can get the - item's length from the table, but for those that can match zero occurrences - of a character, we must take special action for UTF-8 characters. As it - happens, the "NOT" versions of these opcodes are used at present only for - ASCII characters, so they could be omitted from this list. However, in - future that may change, so we include them here so as not to leave a - gotcha for a future maintainer. 
*/ - - case OP_UPTO: - case OP_UPTOI: - case OP_NOTUPTO: - case OP_NOTUPTOI: - case OP_MINUPTO: - case OP_MINUPTOI: - case OP_NOTMINUPTO: - case OP_NOTMINUPTOI: - case OP_POSUPTO: - case OP_POSUPTOI: - case OP_NOTPOSUPTO: - case OP_NOTPOSUPTOI: - - case OP_STAR: - case OP_STARI: - case OP_NOTSTAR: - case OP_NOTSTARI: - case OP_MINSTAR: - case OP_MINSTARI: - case OP_NOTMINSTAR: - case OP_NOTMINSTARI: - case OP_POSSTAR: - case OP_POSSTARI: - case OP_NOTPOSSTAR: - case OP_NOTPOSSTARI: - - case OP_QUERY: - case OP_QUERYI: - case OP_NOTQUERY: - case OP_NOTQUERYI: - case OP_MINQUERY: - case OP_MINQUERYI: - case OP_NOTMINQUERY: - case OP_NOTMINQUERYI: - case OP_POSQUERY: - case OP_POSQUERYI: - case OP_NOTPOSQUERY: - case OP_NOTPOSQUERYI: - - cc += PRIV(OP_lengths)[op]; -#ifdef SUPPORT_UNICODE - if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); -#endif - break; - - /* Skip these, but we need to add in the name length. */ - - case OP_MARK: - case OP_COMMIT_ARG: - case OP_PRUNE_ARG: - case OP_SKIP_ARG: - case OP_THEN_ARG: - cc += PRIV(OP_lengths)[op] + cc[1]; - break; - - /* The remaining opcodes are just skipped over. */ - - case OP_CLOSE: - case OP_COMMIT: - case OP_FAIL: - case OP_PRUNE: - case OP_SET_SOM: - case OP_SKIP: - case OP_THEN: - cc += PRIV(OP_lengths)[op]; - break; - - /* This should not occur: we list all opcodes explicitly so that when - new ones get added they are properly considered. */ - - default: - return -3; - } - } -/* Control never gets here */ -} - - - -/************************************************* -* Set a bit and maybe its alternate case * -*************************************************/ - -/* Given a character, set its first code unit's bit in the table, and also the -corresponding bit for the other version of a letter if we are caseless. 
- -Arguments: - re points to the regex block - p points to the first code unit of the character - caseless TRUE if caseless - utf TRUE for UTF mode - ucp TRUE for UCP mode - -Returns: pointer after the character -*/ - -static PCRE2_SPTR -set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf, - BOOL ucp) -{ -uint32_t c = *p++; /* First code unit */ - -(void)utf; /* Stop compiler warnings when UTF not supported */ -(void)ucp; - -/* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for -0xff. */ - -#if PCRE2_CODE_UNIT_WIDTH != 8 -if (c > 0xff) SET_BIT(0xff); else -#endif - -SET_BIT(c); - -/* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find -the end of the character, even when caseless. */ - -#ifdef SUPPORT_UNICODE -if (utf) - { -#if PCRE2_CODE_UNIT_WIDTH == 8 - if (c >= 0xc0) GETUTF8INC(c, p); -#elif PCRE2_CODE_UNIT_WIDTH == 16 - if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p); -#endif - } -#endif /* SUPPORT_UNICODE */ - -/* If caseless, handle the other case of the character. */ - -if (caseless) - { -#ifdef SUPPORT_UNICODE - if (utf || ucp) - { - c = UCD_OTHERCASE(c); -#if PCRE2_CODE_UNIT_WIDTH == 8 - if (utf) - { - PCRE2_UCHAR buff[6]; - (void)PRIV(ord2utf)(c, buff); - SET_BIT(buff[0]); - } - else if (c < 256) SET_BIT(c); -#else /* 16-bit or 32-bit mode */ - if (c > 0xff) SET_BIT(0xff); else SET_BIT(c); -#endif - } - - else -#endif /* SUPPORT_UNICODE */ - - /* Not UTF or UCP */ - - if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]); - } - -return p; -} - - - -/************************************************* -* Set bits for a positive character type * -*************************************************/ - -/* This function sets starting bits for a character type. In UTF-8 mode, we can -only do a direct setting for bytes less than 128, as otherwise there can be -confusion with bytes in the middle of UTF-8 characters. 
In a "traditional" -environment, the tables will only recognize ASCII characters anyway, but in at -least one Windows environment, some higher bytes bits were set in the tables. -So we deal with that case by considering the UTF-8 encoding. - -Arguments: - re the regex block - cbit type the type of character wanted - table_limit 32 for non-UTF-8; 16 for UTF-8 - -Returns: nothing -*/ - -static void -set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit) -{ -uint32_t c; -for (c = 0; c < table_limit; c++) - re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type]; -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 -if (table_limit == 32) return; -for (c = 128; c < 256; c++) - { - if ((re->tables[cbits_offset + c/8] & (1u << (c&7))) != 0) - { - PCRE2_UCHAR buff[6]; - (void)PRIV(ord2utf)(c, buff); - SET_BIT(buff[0]); - } - } -#endif /* UTF-8 */ -} - - -/************************************************* -* Set bits for a negative character type * -*************************************************/ - -/* This function sets starting bits for a negative character type such as \D. -In UTF-8 mode, we can only do a direct setting for bytes less than 128, as -otherwise there can be confusion with bytes in the middle of UTF-8 characters. -Unlike in the positive case, where we can set appropriate starting bits for -specific high-valued UTF-8 characters, in this case we have to set the bits for -all high-valued characters. The lowest is 0xc2, but we overkill by starting at -0xc0 (192) for simplicity. 
- -Arguments: - re the regex block - cbit type the type of character wanted - table_limit 32 for non-UTF-8; 16 for UTF-8 - -Returns: nothing -*/ - -static void -set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit) -{ -uint32_t c; -for (c = 0; c < table_limit; c++) - re->start_bitmap[c] |= ~(re->tables[c+cbits_offset+cbit_type]); -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 -if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff; -#endif -} - - - -/************************************************* -* Create bitmap of starting code units * -*************************************************/ - -/* This function scans a compiled unanchored expression recursively and -attempts to build a bitmap of the set of possible starting code units whose -values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause -the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode -we pass a value of 16 rather than 32 as the final argument. (See comments in -those functions for the reason.) - -The SSB_CONTINUE return is useful for parenthesized groups in patterns such as -(a*)b where the group provides some optional starting code units but scanning -must continue at the outer level to find at least one mandatory code unit. At -the outermost level, this function fails unless the result is SSB_DONE. - -We restrict recursion (for nested groups) to 1000 to avoid stack overflow -issues. 
- -Arguments: - re points to the compiled regex block - code points to an expression - utf TRUE if in UTF mode - ucp TRUE if in UCP mode - depthptr pointer to recurse depth - -Returns: SSB_FAIL => Failed to find any starting code units - SSB_DONE => Found mandatory starting code units - SSB_CONTINUE => Found optional starting code units - SSB_UNKNOWN => Hit an unrecognized opcode - SSB_TOODEEP => Recursion is too deep -*/ - -static int -set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp, - int *depthptr) -{ -uint32_t c; -int yield = SSB_DONE; - -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 -int table_limit = utf? 16:32; -#else -int table_limit = 32; -#endif - -*depthptr += 1; -if (*depthptr > 1000) return SSB_TOODEEP; - -do - { - BOOL try_next = TRUE; - PCRE2_SPTR tcode = code + 1 + LINK_SIZE; - - if (*code == OP_CBRA || *code == OP_SCBRA || - *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE; - - while (try_next) /* Loop for items in this branch */ - { - int rc; - uint8_t *classmap = NULL; -#ifdef SUPPORT_WIDE_CHARS - PCRE2_UCHAR xclassflags; -#endif - - switch(*tcode) - { - /* If we reach something we don't understand, it means a new opcode has - been created that hasn't been added to this function. Hopefully this - problem will be discovered during testing. */ - - default: - return SSB_UNKNOWN; - - /* Fail for a valid opcode that implies no starting bits. 
*/ - - case OP_ACCEPT: - case OP_ASSERT_ACCEPT: - case OP_ALLANY: - case OP_ANY: - case OP_ANYBYTE: - case OP_CIRCM: - case OP_CLOSE: - case OP_COMMIT: - case OP_COMMIT_ARG: - case OP_COND: - case OP_CREF: - case OP_FALSE: - case OP_TRUE: - case OP_DNCREF: - case OP_DNREF: - case OP_DNREFI: - case OP_DNRREF: - case OP_DOLL: - case OP_DOLLM: - case OP_END: - case OP_EOD: - case OP_EODN: - case OP_EXTUNI: - case OP_FAIL: - case OP_MARK: - case OP_NOT: - case OP_NOTEXACT: - case OP_NOTEXACTI: - case OP_NOTI: - case OP_NOTMINPLUS: - case OP_NOTMINPLUSI: - case OP_NOTMINQUERY: - case OP_NOTMINQUERYI: - case OP_NOTMINSTAR: - case OP_NOTMINSTARI: - case OP_NOTMINUPTO: - case OP_NOTMINUPTOI: - case OP_NOTPLUS: - case OP_NOTPLUSI: - case OP_NOTPOSPLUS: - case OP_NOTPOSPLUSI: - case OP_NOTPOSQUERY: - case OP_NOTPOSQUERYI: - case OP_NOTPOSSTAR: - case OP_NOTPOSSTARI: - case OP_NOTPOSUPTO: - case OP_NOTPOSUPTOI: - case OP_NOTPROP: - case OP_NOTQUERY: - case OP_NOTQUERYI: - case OP_NOTSTAR: - case OP_NOTSTARI: - case OP_NOTUPTO: - case OP_NOTUPTOI: - case OP_NOT_HSPACE: - case OP_NOT_VSPACE: - case OP_PRUNE: - case OP_PRUNE_ARG: - case OP_RECURSE: - case OP_REF: - case OP_REFI: - case OP_REVERSE: - case OP_RREF: - case OP_SCOND: - case OP_SET_SOM: - case OP_SKIP: - case OP_SKIP_ARG: - case OP_SOD: - case OP_SOM: - case OP_THEN: - case OP_THEN_ARG: - return SSB_FAIL; - - /* OP_CIRC happens only at the start of an anchored branch (multiline ^ - uses OP_CIRCM). Skip over it. */ - - case OP_CIRC: - tcode += PRIV(OP_lengths)[OP_CIRC]; - break; - - /* A "real" property test implies no starting bits, but the fake property - PT_CLIST identifies a list of characters. These lists are short, as they - are used for characters with more than one "other case", so there is no - point in recognizing them for OP_NOTPROP. 
*/ - - case OP_PROP: - if (tcode[1] != PT_CLIST) return SSB_FAIL; - { - const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2]; - while ((c = *p++) < NOTACHAR) - { -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 - if (utf) - { - PCRE2_UCHAR buff[6]; - (void)PRIV(ord2utf)(c, buff); - c = buff[0]; - } -#endif - if (c > 0xff) SET_BIT(0xff); else SET_BIT(c); - } - } - try_next = FALSE; - break; - - /* We can ignore word boundary tests. */ - - case OP_WORD_BOUNDARY: - case OP_NOT_WORD_BOUNDARY: - tcode++; - break; - - /* If we hit a bracket or a positive lookahead assertion, recurse to set - bits from within the subpattern. If it can't find anything, we have to - give up. If it finds some mandatory character(s), we are done for this - branch. Otherwise, carry on scanning after the subpattern. */ - - case OP_BRA: - case OP_SBRA: - case OP_CBRA: - case OP_SCBRA: - case OP_BRAPOS: - case OP_SBRAPOS: - case OP_CBRAPOS: - case OP_SCBRAPOS: - case OP_ONCE: - case OP_SCRIPT_RUN: - case OP_ASSERT: - case OP_ASSERT_NA: - rc = set_start_bits(re, tcode, utf, ucp, depthptr); - if (rc == SSB_DONE) - { - try_next = FALSE; - } - else if (rc == SSB_CONTINUE) - { - do tcode += GET(tcode, 1); while (*tcode == OP_ALT); - tcode += 1 + LINK_SIZE; - } - else return rc; /* FAIL, UNKNOWN, or TOODEEP */ - break; - - /* If we hit ALT or KET, it means we haven't found anything mandatory in - this branch, though we might have found something optional. For ALT, we - continue with the next alternative, but we have to arrange that the final - result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET, - return SSB_CONTINUE: if this is the top level, that indicates failure, - but after a nested subpattern, it causes scanning to continue. 
*/ - - case OP_ALT: - yield = SSB_CONTINUE; - try_next = FALSE; - break; - - case OP_KET: - case OP_KETRMAX: - case OP_KETRMIN: - case OP_KETRPOS: - return SSB_CONTINUE; - - /* Skip over callout */ - - case OP_CALLOUT: - tcode += PRIV(OP_lengths)[OP_CALLOUT]; - break; - - case OP_CALLOUT_STR: - tcode += GET(tcode, 1 + 2*LINK_SIZE); - break; - - /* Skip over lookbehind and negative lookahead assertions */ - - case OP_ASSERT_NOT: - case OP_ASSERTBACK: - case OP_ASSERTBACK_NOT: - case OP_ASSERTBACK_NA: - do tcode += GET(tcode, 1); while (*tcode == OP_ALT); - tcode += 1 + LINK_SIZE; - break; - - /* BRAZERO does the bracket, but carries on. */ - - case OP_BRAZERO: - case OP_BRAMINZERO: - case OP_BRAPOSZERO: - rc = set_start_bits(re, ++tcode, utf, ucp, depthptr); - if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc; - do tcode += GET(tcode,1); while (*tcode == OP_ALT); - tcode += 1 + LINK_SIZE; - break; - - /* SKIPZERO skips the bracket. */ - - case OP_SKIPZERO: - tcode++; - do tcode += GET(tcode,1); while (*tcode == OP_ALT); - tcode += 1 + LINK_SIZE; - break; - - /* Single-char * or ? 
sets the bit and tries the next item */ - - case OP_STAR: - case OP_MINSTAR: - case OP_POSSTAR: - case OP_QUERY: - case OP_MINQUERY: - case OP_POSQUERY: - tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp); - break; - - case OP_STARI: - case OP_MINSTARI: - case OP_POSSTARI: - case OP_QUERYI: - case OP_MINQUERYI: - case OP_POSQUERYI: - tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp); - break; - - /* Single-char upto sets the bit and tries the next */ - - case OP_UPTO: - case OP_MINUPTO: - case OP_POSUPTO: - tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp); - break; - - case OP_UPTOI: - case OP_MINUPTOI: - case OP_POSUPTOI: - tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp); - break; - - /* At least one single char sets the bit and stops */ - - case OP_EXACT: - tcode += IMM2_SIZE; - /* Fall through */ - case OP_CHAR: - case OP_PLUS: - case OP_MINPLUS: - case OP_POSPLUS: - (void)set_table_bit(re, tcode + 1, FALSE, utf, ucp); - try_next = FALSE; - break; - - case OP_EXACTI: - tcode += IMM2_SIZE; - /* Fall through */ - case OP_CHARI: - case OP_PLUSI: - case OP_MINPLUSI: - case OP_POSPLUSI: - (void)set_table_bit(re, tcode + 1, TRUE, utf, ucp); - try_next = FALSE; - break; - - /* Special spacing and line-terminating items. These recognize specific - lists of characters. The difference between VSPACE and ANYNL is that the - latter can match the two-character CRLF sequence, but that is not - relevant for finding the first character, so their code here is - identical. */ - - case OP_HSPACE: - SET_BIT(CHAR_HT); - SET_BIT(CHAR_SPACE); - - /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set - the bits for 0xA0 and for code units >= 255, independently of UTF. */ - -#if PCRE2_CODE_UNIT_WIDTH != 8 - SET_BIT(0xA0); - SET_BIT(0xFF); -#else - /* For the 8-bit library in UTF-8 mode, set the bits for the first code - units of horizontal space characters. 
*/ - -#ifdef SUPPORT_UNICODE - if (utf) - { - SET_BIT(0xC2); /* For U+00A0 */ - SET_BIT(0xE1); /* For U+1680, U+180E */ - SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ - SET_BIT(0xE3); /* For U+3000 */ - } - else -#endif - /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless - the code is EBCDIC. */ - { -#ifndef EBCDIC - SET_BIT(0xA0); -#endif /* Not EBCDIC */ - } -#endif /* 8-bit support */ - - try_next = FALSE; - break; - - case OP_ANYNL: - case OP_VSPACE: - SET_BIT(CHAR_LF); - SET_BIT(CHAR_VT); - SET_BIT(CHAR_FF); - SET_BIT(CHAR_CR); - - /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set - the bits for NEL and for code units >= 255, independently of UTF. */ - -#if PCRE2_CODE_UNIT_WIDTH != 8 - SET_BIT(CHAR_NEL); - SET_BIT(0xFF); -#else - /* For the 8-bit library in UTF-8 mode, set the bits for the first code - units of vertical space characters. */ - -#ifdef SUPPORT_UNICODE - if (utf) - { - SET_BIT(0xC2); /* For U+0085 (NEL) */ - SET_BIT(0xE2); /* For U+2028, U+2029 */ - } - else -#endif - /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */ - { - SET_BIT(CHAR_NEL); - } -#endif /* 8-bit support */ - - try_next = FALSE; - break; - - /* Single character types set the bits and stop. Note that if PCRE2_UCP - is set, we do not see these opcodes because \d etc are converted to - properties. Therefore, these apply in the case when only characters less - than 256 are recognized to match the types. 
*/ - - case OP_NOT_DIGIT: - set_nottype_bits(re, cbit_digit, table_limit); - try_next = FALSE; - break; - - case OP_DIGIT: - set_type_bits(re, cbit_digit, table_limit); - try_next = FALSE; - break; - - case OP_NOT_WHITESPACE: - set_nottype_bits(re, cbit_space, table_limit); - try_next = FALSE; - break; - - case OP_WHITESPACE: - set_type_bits(re, cbit_space, table_limit); - try_next = FALSE; - break; - - case OP_NOT_WORDCHAR: - set_nottype_bits(re, cbit_word, table_limit); - try_next = FALSE; - break; - - case OP_WORDCHAR: - set_type_bits(re, cbit_word, table_limit); - try_next = FALSE; - break; - - /* One or more character type fudges the pointer and restarts, knowing - it will hit a single character type and stop there. */ - - case OP_TYPEPLUS: - case OP_TYPEMINPLUS: - case OP_TYPEPOSPLUS: - tcode++; - break; - - case OP_TYPEEXACT: - tcode += 1 + IMM2_SIZE; - break; - - /* Zero or more repeats of character types set the bits and then - try again. */ - - case OP_TYPEUPTO: - case OP_TYPEMINUPTO: - case OP_TYPEPOSUPTO: - tcode += IMM2_SIZE; /* Fall through */ - - case OP_TYPESTAR: - case OP_TYPEMINSTAR: - case OP_TYPEPOSSTAR: - case OP_TYPEQUERY: - case OP_TYPEMINQUERY: - case OP_TYPEPOSQUERY: - switch(tcode[1]) - { - default: - case OP_ANY: - case OP_ALLANY: - return SSB_FAIL; - - case OP_HSPACE: - SET_BIT(CHAR_HT); - SET_BIT(CHAR_SPACE); - - /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set - the bits for 0xA0 and for code units >= 255, independently of UTF. */ - -#if PCRE2_CODE_UNIT_WIDTH != 8 - SET_BIT(0xA0); - SET_BIT(0xFF); -#else - /* For the 8-bit library in UTF-8 mode, set the bits for the first code - units of horizontal space characters. 
*/ - -#ifdef SUPPORT_UNICODE - if (utf) - { - SET_BIT(0xC2); /* For U+00A0 */ - SET_BIT(0xE1); /* For U+1680, U+180E */ - SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ - SET_BIT(0xE3); /* For U+3000 */ - } - else -#endif - /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless - the code is EBCDIC. */ - { -#ifndef EBCDIC - SET_BIT(0xA0); -#endif /* Not EBCDIC */ - } -#endif /* 8-bit support */ - break; - - case OP_ANYNL: - case OP_VSPACE: - SET_BIT(CHAR_LF); - SET_BIT(CHAR_VT); - SET_BIT(CHAR_FF); - SET_BIT(CHAR_CR); - - /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set - the bits for NEL and for code units >= 255, independently of UTF. */ - -#if PCRE2_CODE_UNIT_WIDTH != 8 - SET_BIT(CHAR_NEL); - SET_BIT(0xFF); -#else - /* For the 8-bit library in UTF-8 mode, set the bits for the first code - units of vertical space characters. */ - -#ifdef SUPPORT_UNICODE - if (utf) - { - SET_BIT(0xC2); /* For U+0085 (NEL) */ - SET_BIT(0xE2); /* For U+2028, U+2029 */ - } - else -#endif - /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */ - { - SET_BIT(CHAR_NEL); - } -#endif /* 8-bit support */ - break; - - case OP_NOT_DIGIT: - set_nottype_bits(re, cbit_digit, table_limit); - break; - - case OP_DIGIT: - set_type_bits(re, cbit_digit, table_limit); - break; - - case OP_NOT_WHITESPACE: - set_nottype_bits(re, cbit_space, table_limit); - break; - - case OP_WHITESPACE: - set_type_bits(re, cbit_space, table_limit); - break; - - case OP_NOT_WORDCHAR: - set_nottype_bits(re, cbit_word, table_limit); - break; - - case OP_WORDCHAR: - set_type_bits(re, cbit_word, table_limit); - break; - } - - tcode += 2; - break; - - /* Extended class: if there are any property checks, or if this is a - negative XCLASS without a map, give up. If there are no property checks, - there must be wide characters on the XCLASS list, because otherwise an - XCLASS would not have been created. This means that code points >= 255 - are potential starters. 
In the UTF-8 case we can scan them and set bits - for the relevant leading bytes. */ - -#ifdef SUPPORT_WIDE_CHARS - case OP_XCLASS: - xclassflags = tcode[1 + LINK_SIZE]; - if ((xclassflags & XCL_HASPROP) != 0 || - (xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT) - return SSB_FAIL; - - /* We have a positive XCLASS or a negative one without a map. Set up the - map pointer if there is one, and fall through. */ - - classmap = ((xclassflags & XCL_MAP) == 0)? NULL : - (uint8_t *)(tcode + 1 + LINK_SIZE + 1); - - /* In UTF-8 mode, scan the character list and set bits for leading bytes, - then jump to handle the map. */ - -#if PCRE2_CODE_UNIT_WIDTH == 8 - if (utf && (xclassflags & XCL_NOT) == 0) - { - PCRE2_UCHAR b, e; - PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32); - tcode += GET(tcode, 1); - - for (;;) switch (*p++) - { - case XCL_SINGLE: - b = *p++; - while ((*p & 0xc0) == 0x80) p++; - re->start_bitmap[b/8] |= (1u << (b&7)); - break; - - case XCL_RANGE: - b = *p++; - while ((*p & 0xc0) == 0x80) p++; - e = *p++; - while ((*p & 0xc0) == 0x80) p++; - for (; b <= e; b++) - re->start_bitmap[b/8] |= (1u << (b&7)); - break; - - case XCL_END: - goto HANDLE_CLASSMAP; - - default: - return SSB_UNKNOWN; /* Internal error, should not occur */ - } - } -#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */ -#endif /* SUPPORT_WIDE_CHARS */ - - /* It seems that the fall through comment must be outside the #ifdef if - it is to avoid the gcc compiler warning. */ - - /* Fall through */ - - /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are - in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter - because it starts a character with a value > 255. In 8-bit non-UTF mode, - there is no difference between CLASS and NCLASS. In all other wide - character modes, set the 0xFF bit to indicate code units >= 255. 
*/ - - case OP_NCLASS: -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 - if (utf) - { - re->start_bitmap[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */ - memset(re->start_bitmap+25, 0xff, 7); /* Bits for 0xc9 - 0xff */ - } -#elif PCRE2_CODE_UNIT_WIDTH != 8 - SET_BIT(0xFF); /* For characters >= 255 */ -#endif - /* Fall through */ - - /* Enter here for a positive non-XCLASS. If we have fallen through from - an XCLASS, classmap will already be set; just advance the code pointer. - Otherwise, set up classmap for a a non-XCLASS and advance past it. */ - - case OP_CLASS: - if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else - { - classmap = (uint8_t *)(++tcode); - tcode += 32 / sizeof(PCRE2_UCHAR); - } - - /* When wide characters are supported, classmap may be NULL. In UTF-8 - (sic) mode, the bits in a class bit map correspond to character values, - not to byte values. However, the bit map we are constructing is for byte - values. So we have to do a conversion for characters whose code point is - greater than 127. In fact, there are only two possible starting bytes for - characters in the range 128 - 255. */ - -#if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 8 - HANDLE_CLASSMAP: -#endif - if (classmap != NULL) - { -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 - if (utf) - { - for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c]; - for (c = 128; c < 256; c++) - { - if ((classmap[c/8] & (1u << (c&7))) != 0) - { - int d = (c >> 6) | 0xc0; /* Set bit for this starter */ - re->start_bitmap[d/8] |= (1u << (d&7)); /* and then skip on to the */ - c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */ - } - } - } - else -#endif - /* In all modes except UTF-8, the two bit maps are compatible. */ - - { - for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c]; - } - } - - /* Act on what follows the class. For a zero minimum repeat, continue; - otherwise stop processing. 
*/ - - switch (*tcode) - { - case OP_CRSTAR: - case OP_CRMINSTAR: - case OP_CRQUERY: - case OP_CRMINQUERY: - case OP_CRPOSSTAR: - case OP_CRPOSQUERY: - tcode++; - break; - - case OP_CRRANGE: - case OP_CRMINRANGE: - case OP_CRPOSRANGE: - if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE; - else try_next = FALSE; - break; - - default: - try_next = FALSE; - break; - } - break; /* End of class handling case */ - } /* End of switch for opcodes */ - } /* End of try_next loop */ - - code += GET(code, 1); /* Advance to next branch */ - } -while (*code == OP_ALT); - -return yield; -} - - - -/************************************************* -* Study a compiled expression * -*************************************************/ - -/* This function is handed a compiled expression that it must study to produce -information that will speed up the matching. - -Argument: - re points to the compiled expression - -Returns: 0 normally; non-zero should never normally occur - 1 unknown opcode in set_start_bits - 2 missing capturing bracket - 3 unknown opcode in find_minlength -*/ - -int -PRIV(study)(pcre2_real_code *re) -{ -int count = 0; -PCRE2_UCHAR *code; -BOOL utf = (re->overall_options & PCRE2_UTF) != 0; -BOOL ucp = (re->overall_options & PCRE2_UCP) != 0; - -/* Find start of compiled code */ - -code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) + - re->name_entry_size * re->name_count; - -/* For a pattern that has a first code unit, or a multiline pattern that -matches only at "line start", there is no point in seeking a list of starting -code units. */ - -if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0) - { - int depth = 0; - int rc = set_start_bits(re, code, utf, ucp, &depth); - if (rc == SSB_UNKNOWN) return 1; - - /* If a list of starting code units was set up, scan the list to see if only - one or two were listed. Having only one listed is rare because usually a - single starting code unit will have been recognized and PCRE2_FIRSTSET set. 
- If two are listed, see if they are caseless versions of the same character; - if so we can replace the list with a caseless first code unit. This gives - better performance and is plausibly worth doing for patterns such as [Ww]ord - or (word|WORD). */ - - if (rc == SSB_DONE) - { - int i; - int a = -1; - int b = -1; - uint8_t *p = re->start_bitmap; - uint32_t flags = PCRE2_FIRSTMAPSET; - - for (i = 0; i < 256; p++, i += 8) - { - uint8_t x = *p; - if (x != 0) - { - int c; - uint8_t y = x & (~x + 1); /* Least significant bit */ - if (y != x) goto DONE; /* More than one bit set */ - - /* In the 16-bit and 32-bit libraries, the bit for 0xff means "0xff and - all wide characters", so we cannot use it here. */ - -#if PCRE2_CODE_UNIT_WIDTH != 8 - if (i == 248 && x == 0x80) goto DONE; -#endif - - /* Compute the character value */ - - c = i; - switch (x) - { - case 1: break; - case 2: c += 1; break; case 4: c += 2; break; - case 8: c += 3; break; case 16: c += 4; break; - case 32: c += 5; break; case 64: c += 6; break; - case 128: c += 7; break; - } - - /* c contains the code unit value, in the range 0-255. In 8-bit UTF - mode, only values < 128 can be used. In all the other cases, c is a - character value. */ - -#if PCRE2_CODE_UNIT_WIDTH == 8 - if (utf && c > 127) goto DONE; -#endif - if (a < 0) a = c; /* First one found, save in a */ - else if (b < 0) /* Second one found */ - { - int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c); - -#ifdef SUPPORT_UNICODE - if (utf || ucp) - { - if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */ - if (c > 127) d = UCD_OTHERCASE(c); - } -#endif /* SUPPORT_UNICODE */ - - if (d != a) goto DONE; /* Not the other case of a */ - b = c; /* Save second in b */ - } - else goto DONE; /* More than two characters found */ - } - } - - /* Replace the start code unit bits with a first code unit, but only if it - is not the same as a required later code unit. 
This is because a search for - a required code unit starts after an explicit first code unit, but at a - code unit found from the bitmap. Patterns such as /a*a/ don't work - if both the start unit and required unit are the same. */ - - if (a >= 0 && - ( - (re->flags & PCRE2_LASTSET) == 0 || - ( - re->last_codeunit != (uint32_t)a && - (b < 0 || re->last_codeunit != (uint32_t)b) - ) - )) - { - re->first_codeunit = a; - flags = PCRE2_FIRSTSET; - if (b >= 0) flags |= PCRE2_FIRSTCASELESS; - } - - DONE: - re->flags |= flags; - } - } - -/* Find the minimum length of subject string. If the pattern can match an empty -string, the minimum length is already known. If the pattern contains (*ACCEPT) -all bets are off, and we don't even try to find a minimum length. If there are -more back references than the size of the vector we are going to cache them in, -do nothing. A pattern that complicated will probably take a long time to -analyze and may in any case turn out to be too complicated. Note that back -reference minima are held as 16-bit numbers. */ - -if ((re->flags & (PCRE2_MATCH_EMPTY|PCRE2_HASACCEPT)) == 0 && - re->top_backref <= MAX_CACHE_BACKREF) - { - int min; - int backref_cache[MAX_CACHE_BACKREF+1]; - backref_cache[0] = 0; /* Highest one that is set */ - min = find_minlength(re, code, code, utf, NULL, &count, backref_cache); - switch(min) - { - case -1: /* \C in UTF mode or over-complex regex */ - break; /* Leave minlength unchanged (will be zero) */ - - case -2: - return 2; /* missing capturing bracket */ - - case -3: - return 3; /* unrecognized opcode */ - - default: - re->minlength = (min > UINT16_MAX)? 
UINT16_MAX : min; - break; - } - } - -return 0; -} - -/* End of pcre2_study.c */ diff --git a/pcre2/src/pcre2_substitute.c b/pcre2/src/pcre2_substitute.c deleted file mode 100644 index 981a106a9..000000000 --- a/pcre2/src/pcre2_substitute.c +++ /dev/null @@ -1,987 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2020 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include "pcre2_internal.h" - -#define PTR_STACK_SIZE 20 - -#define SUBSTITUTE_OPTIONS \ - (PCRE2_SUBSTITUTE_EXTENDED|PCRE2_SUBSTITUTE_GLOBAL| \ - PCRE2_SUBSTITUTE_LITERAL|PCRE2_SUBSTITUTE_MATCHED| \ - PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_REPLACEMENT_ONLY| \ - PCRE2_SUBSTITUTE_UNKNOWN_UNSET|PCRE2_SUBSTITUTE_UNSET_EMPTY) - - - -/************************************************* -* Find end of substitute text * -*************************************************/ - -/* In extended mode, we recognize ${name:+set text:unset text} and similar -constructions. This requires the identification of unescaped : and } -characters. This function scans for such. It must deal with nested ${ -constructions. The pointer to the text is updated, either to the required end -character, or to where an error was detected. 
- -Arguments: - code points to the compiled expression (for options) - ptrptr points to the pointer to the start of the text (updated) - ptrend end of the whole string - last TRUE if the last expected string (only } recognized) - -Returns: 0 on success - negative error code on failure -*/ - -static int -find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, - BOOL last) -{ -int rc = 0; -uint32_t nestlevel = 0; -BOOL literal = FALSE; -PCRE2_SPTR ptr = *ptrptr; - -for (; ptr < ptrend; ptr++) - { - if (literal) - { - if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E) - { - literal = FALSE; - ptr += 1; - } - } - - else if (*ptr == CHAR_RIGHT_CURLY_BRACKET) - { - if (nestlevel == 0) goto EXIT; - nestlevel--; - } - - else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT; - - else if (*ptr == CHAR_DOLLAR_SIGN) - { - if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET) - { - nestlevel++; - ptr += 1; - } - } - - else if (*ptr == CHAR_BACKSLASH) - { - int erc; - int errorcode; - uint32_t ch; - - if (ptr < ptrend - 1) switch (ptr[1]) - { - case CHAR_L: - case CHAR_l: - case CHAR_U: - case CHAR_u: - ptr += 1; - continue; - } - - ptr += 1; /* Must point after \ */ - erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode, - code->overall_options, code->extra_options, FALSE, NULL); - ptr -= 1; /* Back to last code unit of escape */ - if (errorcode != 0) - { - rc = errorcode; - goto EXIT; - } - - switch(erc) - { - case 0: /* Data character */ - case ESC_E: /* Isolated \E is ignored */ - break; - - case ESC_Q: - literal = TRUE; - break; - - default: - rc = PCRE2_ERROR_BADREPESCAPE; - goto EXIT; - } - } - } - -rc = PCRE2_ERROR_REPMISSINGBRACE; /* Terminator not found */ - -EXIT: -*ptrptr = ptr; -return rc; -} - - - -/************************************************* -* Match and substitute * -*************************************************/ - -/* This function applies a compiled re to a subject string and creates a new 
-string with substitutions. The first 7 arguments are the same as for -pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED. - -Arguments: - code points to the compiled expression - subject points to the subject string - length length of subject string (may contain binary zeros) - start_offset where to start in the subject string - options option bits - match_data points to a match_data block, or is NULL - context points a PCRE2 context - replacement points to the replacement string - rlength length of replacement string - buffer where to put the substituted string - blength points to length of buffer; updated to length of string - -Returns: >= 0 number of substitutions made - < 0 an error code - PCRE2_ERROR_BADREPLACEMENT means invalid use of $ -*/ - -/* This macro checks for space in the buffer before copying into it. On -overflow, either give an error immediately, or keep on, accumulating the -length. */ - -#define CHECKMEMCPY(from,length) \ - { \ - if (!overflowed && lengthleft < length) \ - { \ - if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \ - overflowed = TRUE; \ - extra_needed = length - lengthleft; \ - } \ - else if (overflowed) \ - { \ - extra_needed += length; \ - } \ - else \ - { \ - memcpy(buffer + buff_offset, from, CU2BYTES(length)); \ - buff_offset += length; \ - lengthleft -= length; \ - } \ - } - -/* Here's the function */ - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, - PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, - pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength, - PCRE2_UCHAR *buffer, PCRE2_SIZE *blength) -{ -int rc; -int subs; -int forcecase = 0; -int forcecasereset = 0; -uint32_t ovector_count; -uint32_t goptions = 0; -uint32_t suboptions; -pcre2_match_data *internal_match_data = NULL; -BOOL escaped_literal = FALSE; -BOOL overflowed = FALSE; -BOOL use_existing_match; -BOOL 
replacement_only; -#ifdef SUPPORT_UNICODE -BOOL utf = (code->overall_options & PCRE2_UTF) != 0; -BOOL ucp = (code->overall_options & PCRE2_UCP) != 0; -#endif -PCRE2_UCHAR temp[6]; -PCRE2_SPTR ptr; -PCRE2_SPTR repend; -PCRE2_SIZE extra_needed = 0; -PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength; -PCRE2_SIZE *ovector; -PCRE2_SIZE ovecsave[3]; -pcre2_substitute_callout_block scb; - -/* General initialization */ - -buff_offset = 0; -lengthleft = buff_length = *blength; -*blength = PCRE2_UNSET; -ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET; - -/* Partial matching is not valid. This must come after setting *blength to -PCRE2_UNSET, so as not to imply an offset in the replacement. */ - -if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0) - return PCRE2_ERROR_BADOPTION; - -/* Check for using a match that has already happened. Note that the subject -pointer in the match data may be NULL after a no-match. */ - -use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0); -replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0); - -/* If starting from an existing match, there must be an externally provided -match data block. We create an internal match_data block in two cases: (a) an -external one is not supplied (and we are not starting from an existing match); -(b) an existing match is to be used for the first substitution. In the latter -case, we copy the existing match into the internal block. This ensures that no -changes are made to the existing match data block. */ - -if (match_data == NULL) - { - pcre2_general_context *gcontext; - if (use_existing_match) return PCRE2_ERROR_NULL; - gcontext = (mcontext == NULL)? 
- (pcre2_general_context *)code : - (pcre2_general_context *)mcontext; - match_data = internal_match_data = - pcre2_match_data_create_from_pattern(code, gcontext); - if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY; - } - -else if (use_existing_match) - { - pcre2_general_context *gcontext = (mcontext == NULL)? - (pcre2_general_context *)code : - (pcre2_general_context *)mcontext; - int pairs = (code->top_bracket + 1 < match_data->oveccount)? - code->top_bracket + 1 : match_data->oveccount; - internal_match_data = pcre2_match_data_create(match_data->oveccount, - gcontext); - if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY; - memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector) - + 2*pairs*sizeof(PCRE2_SIZE)); - match_data = internal_match_data; - } - -/* Remember ovector details */ - -ovector = pcre2_get_ovector_pointer(match_data); -ovector_count = pcre2_get_ovector_count(match_data); - -/* Fixed things in the callout block */ - -scb.version = 0; -scb.input = subject; -scb.output = (PCRE2_SPTR)buffer; -scb.ovector = ovector; - -/* Find lengths of zero-terminated strings and the end of the replacement. */ - -if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject); -if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement); -repend = replacement + rlength; - -/* Check UTF replacement string if necessary. */ - -#ifdef SUPPORT_UNICODE -if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) - { - rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar)); - if (rc != 0) - { - match_data->leftchar = 0; - goto EXIT; - } - } -#endif /* SUPPORT_UNICODE */ - -/* Save the substitute options and remove them from the match options. */ - -suboptions = options & SUBSTITUTE_OPTIONS; -options &= ~SUBSTITUTE_OPTIONS; - -/* Error if the start match offset is greater than the length of the subject. 
*/ - -if (start_offset > length) - { - match_data->leftchar = 0; - rc = PCRE2_ERROR_BADOFFSET; - goto EXIT; - } - -/* Copy up to the start offset, unless only the replacement is required. */ - -if (!replacement_only) CHECKMEMCPY(subject, start_offset); - -/* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first -match is taken from the match_data that was passed in. */ - -subs = 0; -do - { - PCRE2_SPTR ptrstack[PTR_STACK_SIZE]; - uint32_t ptrstackptr = 0; - - if (use_existing_match) - { - rc = match_data->rc; - use_existing_match = FALSE; - } - else rc = pcre2_match(code, subject, length, start_offset, options|goptions, - match_data, mcontext); - -#ifdef SUPPORT_UNICODE - if (utf) options |= PCRE2_NO_UTF_CHECK; /* Only need to check once */ -#endif - - /* Any error other than no match returns the error code. No match when not - doing the special after-empty-match global rematch, or when at the end of the - subject, breaks the global loop. Otherwise, advance the starting point by one - character, copying it to the output, and try again. */ - - if (rc < 0) - { - PCRE2_SIZE save_start; - - if (rc != PCRE2_ERROR_NOMATCH) goto EXIT; - if (goptions == 0 || start_offset >= length) break; - - /* Advance by one code point. Then, if CRLF is a valid newline sequence and - we have advanced into the middle of it, advance one more code point. In - other words, do not start in the middle of CRLF, even if CR and LF on their - own are valid newlines. */ - - save_start = start_offset++; - if (subject[start_offset-1] == CHAR_CR && - code->newline_convention != PCRE2_NEWLINE_CR && - code->newline_convention != PCRE2_NEWLINE_LF && - start_offset < length && - subject[start_offset] == CHAR_LF) - start_offset++; - - /* Otherwise, in UTF mode, advance past any secondary code points. 
*/ - - else if ((code->overall_options & PCRE2_UTF) != 0) - { -#if PCRE2_CODE_UNIT_WIDTH == 8 - while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80) - start_offset++; -#elif PCRE2_CODE_UNIT_WIDTH == 16 - while (start_offset < length && - (subject[start_offset] & 0xfc00) == 0xdc00) - start_offset++; -#endif - } - - /* Copy what we have advanced past (unless not required), reset the special - global options, and continue to the next match. */ - - fraglength = start_offset - save_start; - if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength); - goptions = 0; - continue; - } - - /* Handle a successful match. Matches that use \K to end before they start - or start before the current point in the subject are not supported. */ - - if (ovector[1] < ovector[0] || ovector[0] < start_offset) - { - rc = PCRE2_ERROR_BADSUBSPATTERN; - goto EXIT; - } - - /* Check for the same match as previous. This is legitimate after matching an - empty string that starts after the initial match offset. We have tried again - at the match point in case the pattern is one like /(?<=\G.)/ which can never - match at its starting point, so running the match achieves the bumpalong. If - we do get the same (null) match at the original match point, it isn't such a - pattern, so we now do the empty string magic. In all other cases, a repeat - match should never occur. */ - - if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1]) - { - if (ovector[0] == ovector[1] && ovecsave[2] != start_offset) - { - goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; - ovecsave[2] = start_offset; - continue; /* Back to the top of the loop */ - } - rc = PCRE2_ERROR_INTERNAL_DUPMATCH; - goto EXIT; - } - - /* Count substitutions with a paranoid check for integer overflow; surely no - real call to this function would ever hit this! 
*/ - - if (subs == INT_MAX) - { - rc = PCRE2_ERROR_TOOMANYREPLACE; - goto EXIT; - } - subs++; - - /* Copy the text leading up to the match (unless not required), and remember - where the insert begins and how many ovector pairs are set. */ - - if (rc == 0) rc = ovector_count; - fraglength = ovector[0] - start_offset; - if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength); - scb.output_offsets[0] = buff_offset; - scb.oveccount = rc; - - /* Process the replacement string. If the entire replacement is literal, just - copy it with length check. */ - - ptr = replacement; - if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0) - { - CHECKMEMCPY(ptr, rlength); - } - - /* Within a non-literal replacement, which must be scanned character by - character, local literal mode can be set by \Q, but only in extended mode - when backslashes are being interpreted. In extended mode we must handle - nested substrings that are to be reprocessed. */ - - else for (;;) - { - uint32_t ch; - unsigned int chlen; - - /* If at the end of a nested substring, pop the stack. */ - - if (ptr >= repend) - { - if (ptrstackptr == 0) break; /* End of replacement string */ - repend = ptrstack[--ptrstackptr]; - ptr = ptrstack[--ptrstackptr]; - continue; - } - - /* Handle the next character */ - - if (escaped_literal) - { - if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E) - { - escaped_literal = FALSE; - ptr += 2; - continue; - } - goto LOADLITERAL; - } - - /* Not in literal mode. 
*/ - - if (*ptr == CHAR_DOLLAR_SIGN) - { - int group, n; - uint32_t special = 0; - BOOL inparens; - BOOL star; - PCRE2_SIZE sublength; - PCRE2_SPTR text1_start = NULL; - PCRE2_SPTR text1_end = NULL; - PCRE2_SPTR text2_start = NULL; - PCRE2_SPTR text2_end = NULL; - PCRE2_UCHAR next; - PCRE2_UCHAR name[33]; - - if (++ptr >= repend) goto BAD; - if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL; - - group = -1; - n = 0; - inparens = FALSE; - star = FALSE; - - if (next == CHAR_LEFT_CURLY_BRACKET) - { - if (++ptr >= repend) goto BAD; - next = *ptr; - inparens = TRUE; - } - - if (next == CHAR_ASTERISK) - { - if (++ptr >= repend) goto BAD; - next = *ptr; - star = TRUE; - } - - if (!star && next >= CHAR_0 && next <= CHAR_9) - { - group = next - CHAR_0; - while (++ptr < repend) - { - next = *ptr; - if (next < CHAR_0 || next > CHAR_9) break; - group = group * 10 + next - CHAR_0; - - /* A check for a number greater than the hightest captured group - is sufficient here; no need for a separate overflow check. If unknown - groups are to be treated as unset, just skip over any remaining - digits and carry on. */ - - if (group > code->top_bracket) - { - if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) - { - while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9); - break; - } - else - { - rc = PCRE2_ERROR_NOSUBSTRING; - goto PTREXIT; - } - } - } - } - else - { - const uint8_t *ctypes = code->tables + ctypes_offset; - while (MAX_255(next) && (ctypes[next] & ctype_word) != 0) - { - name[n++] = next; - if (n > 32) goto BAD; - if (++ptr >= repend) break; - next = *ptr; - } - if (n == 0) goto BAD; - name[n] = 0; - } - - /* In extended mode we recognize ${name:+set text:unset text} and - ${name:-default text}. 
*/ - - if (inparens) - { - if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 && - !star && ptr < repend - 2 && next == CHAR_COLON) - { - special = *(++ptr); - if (special != CHAR_PLUS && special != CHAR_MINUS) - { - rc = PCRE2_ERROR_BADSUBSTITUTION; - goto PTREXIT; - } - - text1_start = ++ptr; - rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS); - if (rc != 0) goto PTREXIT; - text1_end = ptr; - - if (special == CHAR_PLUS && *ptr == CHAR_COLON) - { - text2_start = ++ptr; - rc = find_text_end(code, &ptr, repend, TRUE); - if (rc != 0) goto PTREXIT; - text2_end = ptr; - } - } - - else - { - if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET) - { - rc = PCRE2_ERROR_REPMISSINGBRACE; - goto PTREXIT; - } - } - - ptr++; - } - - /* Have found a syntactically correct group number or name, or *name. - Only *MARK is currently recognized. */ - - if (star) - { - if (PRIV(strcmp_c8)(name, STRING_MARK) == 0) - { - PCRE2_SPTR mark = pcre2_get_mark(match_data); - if (mark != NULL) - { - PCRE2_SPTR mark_start = mark; - while (*mark != 0) mark++; - fraglength = mark - mark_start; - CHECKMEMCPY(mark_start, fraglength); - } - } - else goto BAD; - } - - /* Substitute the contents of a group. We don't use substring_copy - functions any more, in order to support case forcing. */ - - else - { - PCRE2_SPTR subptr, subptrend; - - /* Find a number for a named group. In case there are duplicate names, - search for the first one that is set. If the name is not found when - PCRE2_SUBSTITUTE_UNKNOWN_EMPTY is set, set the group number to a - non-existent group. 
*/ - - if (group < 0) - { - PCRE2_SPTR first, last, entry; - rc = pcre2_substring_nametable_scan(code, name, &first, &last); - if (rc == PCRE2_ERROR_NOSUBSTRING && - (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) - { - group = code->top_bracket + 1; - } - else - { - if (rc < 0) goto PTREXIT; - for (entry = first; entry <= last; entry += rc) - { - uint32_t ng = GET2(entry, 0); - if (ng < ovector_count) - { - if (group < 0) group = ng; /* First in ovector */ - if (ovector[ng*2] != PCRE2_UNSET) - { - group = ng; /* First that is set */ - break; - } - } - } - - /* If group is still negative, it means we did not find a group - that is in the ovector. Just set the first group. */ - - if (group < 0) group = GET2(first, 0); - } - } - - /* We now have a group that is identified by number. Find the length of - the captured string. If a group in a non-special substitution is unset - when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */ - - rc = pcre2_substring_length_bynumber(match_data, group, &sublength); - if (rc < 0) - { - if (rc == PCRE2_ERROR_NOSUBSTRING && - (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) - { - rc = PCRE2_ERROR_UNSET; - } - if (rc != PCRE2_ERROR_UNSET) goto PTREXIT; /* Non-unset errors */ - if (special == 0) /* Plain substitution */ - { - if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue; - goto PTREXIT; /* Else error */ - } - } - - /* If special is '+' we have a 'set' and possibly an 'unset' text, - both of which are reprocessed when used. If special is '-' we have a - default text for when the group is unset; it must be reprocessed. 
*/ - - if (special != 0) - { - if (special == CHAR_MINUS) - { - if (rc == 0) goto LITERAL_SUBSTITUTE; - text2_start = text1_start; - text2_end = text1_end; - } - - if (ptrstackptr >= PTR_STACK_SIZE) goto BAD; - ptrstack[ptrstackptr++] = ptr; - ptrstack[ptrstackptr++] = repend; - - if (rc == 0) - { - ptr = text1_start; - repend = text1_end; - } - else - { - ptr = text2_start; - repend = text2_end; - } - continue; - } - - /* Otherwise we have a literal substitution of a group's contents. */ - - LITERAL_SUBSTITUTE: - subptr = subject + ovector[group*2]; - subptrend = subject + ovector[group*2 + 1]; - - /* Substitute a literal string, possibly forcing alphabetic case. */ - - while (subptr < subptrend) - { - GETCHARINCTEST(ch, subptr); - if (forcecase != 0) - { -#ifdef SUPPORT_UNICODE - if (utf || ucp) - { - uint32_t type = UCD_CHARTYPE(ch); - if (PRIV(ucp_gentype)[type] == ucp_L && - type != ((forcecase > 0)? ucp_Lu : ucp_Ll)) - ch = UCD_OTHERCASE(ch); - } - else -#endif - { - if (((code->tables + cbits_offset + - ((forcecase > 0)? cbit_upper:cbit_lower) - )[ch/8] & (1u << (ch%8))) == 0) - ch = (code->tables + fcc_offset)[ch]; - } - forcecase = forcecasereset; - } - -#ifdef SUPPORT_UNICODE - if (utf) chlen = PRIV(ord2utf)(ch, temp); else -#endif - { - temp[0] = ch; - chlen = 1; - } - CHECKMEMCPY(temp, chlen); - } - } - } - - /* Handle an escape sequence in extended mode. We can use check_escape() - to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but - the case-forcing escapes are not supported in pcre2_compile() so must be - recognized here. 
*/ - - else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 && - *ptr == CHAR_BACKSLASH) - { - int errorcode; - - if (ptr < repend - 1) switch (ptr[1]) - { - case CHAR_L: - forcecase = forcecasereset = -1; - ptr += 2; - continue; - - case CHAR_l: - forcecase = -1; - forcecasereset = 0; - ptr += 2; - continue; - - case CHAR_U: - forcecase = forcecasereset = 1; - ptr += 2; - continue; - - case CHAR_u: - forcecase = 1; - forcecasereset = 0; - ptr += 2; - continue; - - default: - break; - } - - ptr++; /* Point after \ */ - rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode, - code->overall_options, code->extra_options, FALSE, NULL); - if (errorcode != 0) goto BADESCAPE; - - switch(rc) - { - case ESC_E: - forcecase = forcecasereset = 0; - continue; - - case ESC_Q: - escaped_literal = TRUE; - continue; - - case 0: /* Data character */ - goto LITERAL; - - default: - goto BADESCAPE; - } - } - - /* Handle a literal code unit */ - - else - { - LOADLITERAL: - GETCHARINCTEST(ch, ptr); /* Get character value, increment pointer */ - - LITERAL: - if (forcecase != 0) - { -#ifdef SUPPORT_UNICODE - if (utf || ucp) - { - uint32_t type = UCD_CHARTYPE(ch); - if (PRIV(ucp_gentype)[type] == ucp_L && - type != ((forcecase > 0)? ucp_Lu : ucp_Ll)) - ch = UCD_OTHERCASE(ch); - } - else -#endif - { - if (((code->tables + cbits_offset + - ((forcecase > 0)? cbit_upper:cbit_lower) - )[ch/8] & (1u << (ch%8))) == 0) - ch = (code->tables + fcc_offset)[ch]; - } - forcecase = forcecasereset; - } - -#ifdef SUPPORT_UNICODE - if (utf) chlen = PRIV(ord2utf)(ch, temp); else -#endif - { - temp[0] = ch; - chlen = 1; - } - CHECKMEMCPY(temp, chlen); - } /* End handling a literal code unit */ - } /* End of loop for scanning the replacement. */ - - /* The replacement has been copied to the output, or its size has been - remembered. Do the callout if there is one and we have done an actual - replacement. 
*/ - - if (!overflowed && mcontext != NULL && mcontext->substitute_callout != NULL) - { - scb.subscount = subs; - scb.output_offsets[1] = buff_offset; - rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data); - - /* A non-zero return means cancel this substitution. Instead, copy the - matched string fragment. */ - - if (rc != 0) - { - PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0]; - PCRE2_SIZE oldlength = ovector[1] - ovector[0]; - - buff_offset -= newlength; - lengthleft += newlength; - if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength); - - /* A negative return means do not do any more. */ - - if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL); - } - } - - /* Save the details of this match. See above for how this data is used. If we - matched an empty string, do the magic for global matches. Update the start - offset to point to the rest of the subject string. If we re-used an existing - match for the first match, switch to the internal match data block. */ - - ovecsave[0] = ovector[0]; - ovecsave[1] = ovector[1]; - ovecsave[2] = start_offset; - - goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 : - PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART; - start_offset = ovector[1]; - } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */ - -/* Copy the rest of the subject unless not required, and terminate the output -with a binary zero. */ - -if (!replacement_only) - { - fraglength = length - start_offset; - CHECKMEMCPY(subject + start_offset, fraglength); - } - -temp[0] = 0; -CHECKMEMCPY(temp, 1); - -/* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set, -and matching has carried on after a full buffer, in order to compute the length -needed. Otherwise, an overflow generates an immediate error return. 
*/ - -if (overflowed) - { - rc = PCRE2_ERROR_NOMEMORY; - *blength = buff_length + extra_needed; - } - -/* After a successful execution, return the number of substitutions and set the -length of buffer used, excluding the trailing zero. */ - -else - { - rc = subs; - *blength = buff_offset - 1; - } - -EXIT: -if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data); - else match_data->rc = rc; -return rc; - -NOROOM: -rc = PCRE2_ERROR_NOMEMORY; -goto EXIT; - -BAD: -rc = PCRE2_ERROR_BADREPLACEMENT; -goto PTREXIT; - -BADESCAPE: -rc = PCRE2_ERROR_BADREPESCAPE; - -PTREXIT: -*blength = (PCRE2_SIZE)(ptr - replacement); -goto EXIT; -} - -/* End of pcre2_substitute.c */ diff --git a/pcre2/src/pcre2_substring.c b/pcre2/src/pcre2_substring.c deleted file mode 100644 index ddf5774e1..000000000 --- a/pcre2/src/pcre2_substring.c +++ /dev/null @@ -1,547 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2018 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include "pcre2_internal.h" - - - -/************************************************* -* Copy named captured string to given buffer * -*************************************************/ - -/* This function copies a single captured substring into a given buffer, -identifying it by name. If the regex permits duplicate names, the first -substring that is set is chosen. 
- -Arguments: - match_data points to the match data - stringname the name of the required substring - buffer where to put the substring - sizeptr the size of the buffer, updated to the size of the substring - -Returns: if successful: zero - if not successful, a negative error code: - (1) an error from nametable_scan() - (2) an error from copy_bynumber() - (3) PCRE2_ERROR_UNAVAILABLE: no group is in ovector - (4) PCRE2_ERROR_UNSET: all named groups in ovector are unset -*/ - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_substring_copy_byname(pcre2_match_data *match_data, PCRE2_SPTR stringname, - PCRE2_UCHAR *buffer, PCRE2_SIZE *sizeptr) -{ -PCRE2_SPTR first, last, entry; -int failrc, entrysize; -if (match_data->matchedby == PCRE2_MATCHEDBY_DFA_INTERPRETER) - return PCRE2_ERROR_DFA_UFUNC; -entrysize = pcre2_substring_nametable_scan(match_data->code, stringname, - &first, &last); -if (entrysize < 0) return entrysize; -failrc = PCRE2_ERROR_UNAVAILABLE; -for (entry = first; entry <= last; entry += entrysize) - { - uint32_t n = GET2(entry, 0); - if (n < match_data->oveccount) - { - if (match_data->ovector[n*2] != PCRE2_UNSET) - return pcre2_substring_copy_bynumber(match_data, n, buffer, sizeptr); - failrc = PCRE2_ERROR_UNSET; - } - } -return failrc; -} - - - -/************************************************* -* Copy numbered captured string to given buffer * -*************************************************/ - -/* This function copies a single captured substring into a given buffer, -identifying it by number. 
- -Arguments: - match_data points to the match data - stringnumber the number of the required substring - buffer where to put the substring - sizeptr the size of the buffer, updated to the size of the substring - -Returns: if successful: 0 - if not successful, a negative error code: - PCRE2_ERROR_NOMEMORY: buffer too small - PCRE2_ERROR_NOSUBSTRING: no such substring - PCRE2_ERROR_UNAVAILABLE: ovector too small - PCRE2_ERROR_UNSET: substring is not set -*/ - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_substring_copy_bynumber(pcre2_match_data *match_data, - uint32_t stringnumber, PCRE2_UCHAR *buffer, PCRE2_SIZE *sizeptr) -{ -int rc; -PCRE2_SIZE size; -rc = pcre2_substring_length_bynumber(match_data, stringnumber, &size); -if (rc < 0) return rc; -if (size + 1 > *sizeptr) return PCRE2_ERROR_NOMEMORY; -memcpy(buffer, match_data->subject + match_data->ovector[stringnumber*2], - CU2BYTES(size)); -buffer[size] = 0; -*sizeptr = size; -return 0; -} - - - -/************************************************* -* Extract named captured string * -*************************************************/ - -/* This function copies a single captured substring, identified by name, into -new memory. If the regex permits duplicate names, the first substring that is -set is chosen. 
- -Arguments: - match_data pointer to match_data - stringname the name of the required substring - stringptr where to put the pointer to the new memory - sizeptr where to put the length of the substring - -Returns: if successful: zero - if not successful, a negative value: - (1) an error from nametable_scan() - (2) an error from get_bynumber() - (3) PCRE2_ERROR_UNAVAILABLE: no group is in ovector - (4) PCRE2_ERROR_UNSET: all named groups in ovector are unset -*/ - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_substring_get_byname(pcre2_match_data *match_data, - PCRE2_SPTR stringname, PCRE2_UCHAR **stringptr, PCRE2_SIZE *sizeptr) -{ -PCRE2_SPTR first, last, entry; -int failrc, entrysize; -if (match_data->matchedby == PCRE2_MATCHEDBY_DFA_INTERPRETER) - return PCRE2_ERROR_DFA_UFUNC; -entrysize = pcre2_substring_nametable_scan(match_data->code, stringname, - &first, &last); -if (entrysize < 0) return entrysize; -failrc = PCRE2_ERROR_UNAVAILABLE; -for (entry = first; entry <= last; entry += entrysize) - { - uint32_t n = GET2(entry, 0); - if (n < match_data->oveccount) - { - if (match_data->ovector[n*2] != PCRE2_UNSET) - return pcre2_substring_get_bynumber(match_data, n, stringptr, sizeptr); - failrc = PCRE2_ERROR_UNSET; - } - } -return failrc; -} - - - -/************************************************* -* Extract captured string to new memory * -*************************************************/ - -/* This function copies a single captured substring into a piece of new -memory. 
- -Arguments: - match_data points to match data - stringnumber the number of the required substring - stringptr where to put a pointer to the new memory - sizeptr where to put the size of the substring - -Returns: if successful: 0 - if not successful, a negative error code: - PCRE2_ERROR_NOMEMORY: failed to get memory - PCRE2_ERROR_NOSUBSTRING: no such substring - PCRE2_ERROR_UNAVAILABLE: ovector too small - PCRE2_ERROR_UNSET: substring is not set -*/ - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_substring_get_bynumber(pcre2_match_data *match_data, - uint32_t stringnumber, PCRE2_UCHAR **stringptr, PCRE2_SIZE *sizeptr) -{ -int rc; -PCRE2_SIZE size; -PCRE2_UCHAR *yield; -rc = pcre2_substring_length_bynumber(match_data, stringnumber, &size); -if (rc < 0) return rc; -yield = PRIV(memctl_malloc)(sizeof(pcre2_memctl) + - (size + 1)*PCRE2_CODE_UNIT_WIDTH, (pcre2_memctl *)match_data); -if (yield == NULL) return PCRE2_ERROR_NOMEMORY; -yield = (PCRE2_UCHAR *)(((char *)yield) + sizeof(pcre2_memctl)); -memcpy(yield, match_data->subject + match_data->ovector[stringnumber*2], - CU2BYTES(size)); -yield[size] = 0; -*stringptr = yield; -*sizeptr = size; -return 0; -} - - - -/************************************************* -* Free memory obtained by get_substring * -*************************************************/ - -/* -Argument: the result of a previous pcre2_substring_get_byxxx() -Returns: nothing -*/ - -PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION -pcre2_substring_free(PCRE2_UCHAR *string) -{ -if (string != NULL) - { - pcre2_memctl *memctl = (pcre2_memctl *)((char *)string - sizeof(pcre2_memctl)); - memctl->free(memctl, memctl->memory_data); - } -} - - - -/************************************************* -* Get length of a named substring * -*************************************************/ - -/* This function returns the length of a named captured substring. If the regex -permits duplicate names, the first substring that is set is chosen. 
- -Arguments: - match_data pointer to match data - stringname the name of the required substring - sizeptr where to put the length - -Returns: 0 if successful, else a negative error number -*/ - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_substring_length_byname(pcre2_match_data *match_data, - PCRE2_SPTR stringname, PCRE2_SIZE *sizeptr) -{ -PCRE2_SPTR first, last, entry; -int failrc, entrysize; -if (match_data->matchedby == PCRE2_MATCHEDBY_DFA_INTERPRETER) - return PCRE2_ERROR_DFA_UFUNC; -entrysize = pcre2_substring_nametable_scan(match_data->code, stringname, - &first, &last); -if (entrysize < 0) return entrysize; -failrc = PCRE2_ERROR_UNAVAILABLE; -for (entry = first; entry <= last; entry += entrysize) - { - uint32_t n = GET2(entry, 0); - if (n < match_data->oveccount) - { - if (match_data->ovector[n*2] != PCRE2_UNSET) - return pcre2_substring_length_bynumber(match_data, n, sizeptr); - failrc = PCRE2_ERROR_UNSET; - } - } -return failrc; -} - - - -/************************************************* -* Get length of a numbered substring * -*************************************************/ - -/* This function returns the length of a captured substring. If the start is -beyond the end (which can happen when \K is used in an assertion), it sets the -length to zero. 
- -Arguments: - match_data pointer to match data - stringnumber the number of the required substring - sizeptr where to put the length, if not NULL - -Returns: if successful: 0 - if not successful, a negative error code: - PCRE2_ERROR_NOSUBSTRING: no such substring - PCRE2_ERROR_UNAVAILABLE: ovector is too small - PCRE2_ERROR_UNSET: substring is not set -*/ - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_substring_length_bynumber(pcre2_match_data *match_data, - uint32_t stringnumber, PCRE2_SIZE *sizeptr) -{ -PCRE2_SIZE left, right; -int count = match_data->rc; -if (count == PCRE2_ERROR_PARTIAL) - { - if (stringnumber > 0) return PCRE2_ERROR_PARTIAL; - count = 0; - } -else if (count < 0) return count; /* Match failed */ - -if (match_data->matchedby != PCRE2_MATCHEDBY_DFA_INTERPRETER) - { - if (stringnumber > match_data->code->top_bracket) - return PCRE2_ERROR_NOSUBSTRING; - if (stringnumber >= match_data->oveccount) - return PCRE2_ERROR_UNAVAILABLE; - if (match_data->ovector[stringnumber*2] == PCRE2_UNSET) - return PCRE2_ERROR_UNSET; - } -else /* Matched using pcre2_dfa_match() */ - { - if (stringnumber >= match_data->oveccount) return PCRE2_ERROR_UNAVAILABLE; - if (count != 0 && stringnumber >= (uint32_t)count) return PCRE2_ERROR_UNSET; - } - -left = match_data->ovector[stringnumber*2]; -right = match_data->ovector[stringnumber*2+1]; -if (sizeptr != NULL) *sizeptr = (left > right)? 0 : right - left; -return 0; -} - - - -/************************************************* -* Extract all captured strings to new memory * -*************************************************/ - -/* This function gets one chunk of memory and builds a list of pointers and all -the captured substrings in it. A NULL pointer is put on the end of the list. -The substrings are zero-terminated, but also, if the final argument is -non-NULL, a list of lengths is also returned. This allows binary data to be -handled. 
- -Arguments: - match_data points to the match data - listptr set to point to the list of pointers - lengthsptr set to point to the list of lengths (may be NULL) - -Returns: if successful: 0 - if not successful, a negative error code: - PCRE2_ERROR_NOMEMORY: failed to get memory, - or a match failure code -*/ - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_substring_list_get(pcre2_match_data *match_data, PCRE2_UCHAR ***listptr, - PCRE2_SIZE **lengthsptr) -{ -int i, count, count2; -PCRE2_SIZE size; -PCRE2_SIZE *lensp; -pcre2_memctl *memp; -PCRE2_UCHAR **listp; -PCRE2_UCHAR *sp; -PCRE2_SIZE *ovector; - -if ((count = match_data->rc) < 0) return count; /* Match failed */ -if (count == 0) count = match_data->oveccount; /* Ovector too small */ - -count2 = 2*count; -ovector = match_data->ovector; -size = sizeof(pcre2_memctl) + sizeof(PCRE2_UCHAR *); /* For final NULL */ -if (lengthsptr != NULL) size += sizeof(PCRE2_SIZE) * count; /* For lengths */ - -for (i = 0; i < count2; i += 2) - { - size += sizeof(PCRE2_UCHAR *) + CU2BYTES(1); - if (ovector[i+1] > ovector[i]) size += CU2BYTES(ovector[i+1] - ovector[i]); - } - -memp = PRIV(memctl_malloc)(size, (pcre2_memctl *)match_data); -if (memp == NULL) return PCRE2_ERROR_NOMEMORY; - -*listptr = listp = (PCRE2_UCHAR **)((char *)memp + sizeof(pcre2_memctl)); -lensp = (PCRE2_SIZE *)((char *)listp + sizeof(PCRE2_UCHAR *) * (count + 1)); - -if (lengthsptr == NULL) - { - sp = (PCRE2_UCHAR *)lensp; - lensp = NULL; - } -else - { - *lengthsptr = lensp; - sp = (PCRE2_UCHAR *)((char *)lensp + sizeof(PCRE2_SIZE) * count); - } - -for (i = 0; i < count2; i += 2) - { - size = (ovector[i+1] > ovector[i])? (ovector[i+1] - ovector[i]) : 0; - - /* Size == 0 includes the case when the capture is unset. Avoid adding - PCRE2_UNSET to match_data->subject because it overflows, even though with - zero size calling memcpy() is harmless. 
*/ - - if (size != 0) memcpy(sp, match_data->subject + ovector[i], CU2BYTES(size)); - *listp++ = sp; - if (lensp != NULL) *lensp++ = size; - sp += size; - *sp++ = 0; - } - -*listp = NULL; -return 0; -} - - - -/************************************************* -* Free memory obtained by substring_list_get * -*************************************************/ - -/* -Argument: the result of a previous pcre2_substring_list_get() -Returns: nothing -*/ - -PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION -pcre2_substring_list_free(PCRE2_SPTR *list) -{ -if (list != NULL) - { - pcre2_memctl *memctl = (pcre2_memctl *)((char *)list - sizeof(pcre2_memctl)); - memctl->free(memctl, memctl->memory_data); - } -} - - - -/************************************************* -* Find (multiple) entries for named string * -*************************************************/ - -/* This function scans the nametable for a given name, using binary chop. It -returns either two pointers to the entries in the table, or, if no pointers are -given, the number of a unique group with the given name. If duplicate names are -permitted, and the name is not unique, an error is generated. 
- -Arguments: - code the compiled regex - stringname the name whose entries required - firstptr where to put the pointer to the first entry - lastptr where to put the pointer to the last entry - -Returns: PCRE2_ERROR_NOSUBSTRING if the name is not found - otherwise, if firstptr and lastptr are NULL: - a group number for a unique substring - else PCRE2_ERROR_NOUNIQUESUBSTRING - otherwise: - the length of each entry, having set firstptr and lastptr -*/ - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_substring_nametable_scan(const pcre2_code *code, PCRE2_SPTR stringname, - PCRE2_SPTR *firstptr, PCRE2_SPTR *lastptr) -{ -uint16_t bot = 0; -uint16_t top = code->name_count; -uint16_t entrysize = code->name_entry_size; -PCRE2_SPTR nametable = (PCRE2_SPTR)((char *)code + sizeof(pcre2_real_code)); - -while (top > bot) - { - uint16_t mid = (top + bot) / 2; - PCRE2_SPTR entry = nametable + entrysize*mid; - int c = PRIV(strcmp)(stringname, entry + IMM2_SIZE); - if (c == 0) - { - PCRE2_SPTR first; - PCRE2_SPTR last; - PCRE2_SPTR lastentry; - lastentry = nametable + entrysize * (code->name_count - 1); - first = last = entry; - while (first > nametable) - { - if (PRIV(strcmp)(stringname, (first - entrysize + IMM2_SIZE)) != 0) break; - first -= entrysize; - } - while (last < lastentry) - { - if (PRIV(strcmp)(stringname, (last + entrysize + IMM2_SIZE)) != 0) break; - last += entrysize; - } - if (firstptr == NULL) return (first == last)? - (int)GET2(entry, 0) : PCRE2_ERROR_NOUNIQUESUBSTRING; - *firstptr = first; - *lastptr = last; - return entrysize; - } - if (c > 0) bot = mid + 1; else top = mid; - } - -return PCRE2_ERROR_NOSUBSTRING; -} - - -/************************************************* -* Find number for named string * -*************************************************/ - -/* This function is a convenience wrapper for pcre2_substring_nametable_scan() -when it is known that names are unique. If there are duplicate names, it is not -defined which number is returned. 
- -Arguments: - code the compiled regex - stringname the name whose number is required - -Returns: the number of the named parenthesis, or a negative number - PCRE2_ERROR_NOSUBSTRING if not found - PCRE2_ERROR_NOUNIQUESUBSTRING if not unique -*/ - -PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_substring_number_from_name(const pcre2_code *code, - PCRE2_SPTR stringname) -{ -return pcre2_substring_nametable_scan(code, stringname, NULL, NULL); -} - -/* End of pcre2_substring.c */ diff --git a/pcre2/src/pcre2_tables.c b/pcre2/src/pcre2_tables.c deleted file mode 100644 index b10de45ef..000000000 --- a/pcre2/src/pcre2_tables.c +++ /dev/null @@ -1,854 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2019 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - -/* This module contains some fixed tables that are used by more than one of the -PCRE2 code modules. The tables are also #included by the pcre2test program, -which uses macros to change their names from _pcre2_xxx to xxxx, thereby -avoiding name clashes with the library. In this case, PCRE2_PCRE2TEST is -defined. */ - -#ifndef PCRE2_PCRE2TEST /* We're compiling the library */ -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif -#include "pcre2_internal.h" -#endif /* PCRE2_PCRE2TEST */ - - -/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that -the definition is next to the definition of the opcodes in pcre2_internal.h. -This is mode-dependent, so is skipped when this file is included by pcre2test. */ - -#ifndef PCRE2_PCRE2TEST -const uint8_t PRIV(OP_lengths)[] = { OP_LENGTHS }; -#endif - -/* Tables of horizontal and vertical whitespace characters, suitable for -adding to classes. */ - -const uint32_t PRIV(hspace_list)[] = { HSPACE_LIST }; -const uint32_t PRIV(vspace_list)[] = { VSPACE_LIST }; - -/* These tables are the pairs of delimiters that are valid for callout string -arguments. 
For each starting delimiter there must be a matching ending -delimiter, which in fact is different only for bracket-like delimiters. */ - -const uint32_t PRIV(callout_start_delims)[] = { - CHAR_GRAVE_ACCENT, CHAR_APOSTROPHE, CHAR_QUOTATION_MARK, - CHAR_CIRCUMFLEX_ACCENT, CHAR_PERCENT_SIGN, CHAR_NUMBER_SIGN, - CHAR_DOLLAR_SIGN, CHAR_LEFT_CURLY_BRACKET, 0 }; - -const uint32_t PRIV(callout_end_delims[]) = { - CHAR_GRAVE_ACCENT, CHAR_APOSTROPHE, CHAR_QUOTATION_MARK, - CHAR_CIRCUMFLEX_ACCENT, CHAR_PERCENT_SIGN, CHAR_NUMBER_SIGN, - CHAR_DOLLAR_SIGN, CHAR_RIGHT_CURLY_BRACKET, 0 }; - - -/************************************************* -* Tables for UTF-8 support * -*************************************************/ - -/* These tables are required by pcre2test in 16- or 32-bit mode, as well -as for the library in 8-bit mode, because pcre2test uses UTF-8 internally for -handling wide characters. */ - -#if defined PCRE2_PCRE2TEST || \ - (defined SUPPORT_UNICODE && \ - defined PCRE2_CODE_UNIT_WIDTH && \ - PCRE2_CODE_UNIT_WIDTH == 8) - -/* These are the breakpoints for different numbers of bytes in a UTF-8 -character. */ - -const int PRIV(utf8_table1)[] = - { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff}; - -const int PRIV(utf8_table1_size) = sizeof(PRIV(utf8_table1)) / sizeof(int); - -/* These are the indicator bits and the mask for the data bits to set in the -first byte of a character, indexed by the number of additional bytes. */ - -const int PRIV(utf8_table2)[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; -const int PRIV(utf8_table3)[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01}; - -/* Table of the number of extra bytes, indexed by the first byte masked with -0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. 
*/ - -const uint8_t PRIV(utf8_table4)[] = { - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; - -#endif /* UTF-8 support needed */ - - -#ifdef SUPPORT_UNICODE - -/* Table to translate from particular type value to the general value. */ - -const uint32_t PRIV(ucp_gentype)[] = { - ucp_C, ucp_C, ucp_C, ucp_C, ucp_C, /* Cc, Cf, Cn, Co, Cs */ - ucp_L, ucp_L, ucp_L, ucp_L, ucp_L, /* Ll, Lu, Lm, Lo, Lt */ - ucp_M, ucp_M, ucp_M, /* Mc, Me, Mn */ - ucp_N, ucp_N, ucp_N, /* Nd, Nl, No */ - ucp_P, ucp_P, ucp_P, ucp_P, ucp_P, /* Pc, Pd, Pe, Pf, Pi */ - ucp_P, ucp_P, /* Ps, Po */ - ucp_S, ucp_S, ucp_S, ucp_S, /* Sc, Sk, Sm, So */ - ucp_Z, ucp_Z, ucp_Z /* Zl, Zp, Zs */ -}; - -/* This table encodes the rules for finding the end of an extended grapheme -cluster. Every code point has a grapheme break property which is one of the -ucp_gbXX values defined in pcre2_ucp.h. These changed between Unicode versions -10 and 11. The 2-dimensional table is indexed by the properties of two adjacent -code points. The left property selects a word from the table, and the right -property selects a bit from that word like this: - - PRIV(ucp_gbtable)[left-property] & (1u << right-property) - -The value is non-zero if a grapheme break is NOT permitted between the relevant -two code points. The breaking rules are as follows: - -1. Break at the start and end of text (pretty obviously). - -2. Do not break between a CR and LF; otherwise, break before and after - controls. - -3. Do not break Hangul syllable sequences, the rules for which are: - - L may be followed by L, V, LV or LVT - LV or V may be followed by V or T - LVT or T may be followed by T - -4. Do not break before extending characters or zero-width-joiner (ZWJ). - -The following rules are only for extended grapheme clusters (but that's what we -are implementing). - -5. Do not break before SpacingMarks. - -6. Do not break after Prepend characters. - -7. 
Do not break within emoji modifier sequences or emoji zwj sequences. That - is, do not break between characters with the Extended_Pictographic property. - Extend and ZWJ characters are allowed between the characters; this cannot be - represented in this table, the code has to deal with it. - -8. Do not break within emoji flag sequences. That is, do not break between - regional indicator (RI) symbols if there are an odd number of RI characters - before the break point. This table encodes "join RI characters"; the code - has to deal with checking for previous adjoining RIs. - -9. Otherwise, break everywhere. -*/ - -#define ESZ (1< 0x10ffff is not permitted -PCRE2_ERROR_UTF8_ERR14 3-byte character with value 0xd800-0xdfff is not permitted -PCRE2_ERROR_UTF8_ERR15 Overlong 2-byte sequence -PCRE2_ERROR_UTF8_ERR16 Overlong 3-byte sequence -PCRE2_ERROR_UTF8_ERR17 Overlong 4-byte sequence -PCRE2_ERROR_UTF8_ERR18 Overlong 5-byte sequence (won't ever occur) -PCRE2_ERROR_UTF8_ERR19 Overlong 6-byte sequence (won't ever occur) -PCRE2_ERROR_UTF8_ERR20 Isolated 0x80 byte (not within UTF-8 character) -PCRE2_ERROR_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff -*/ - -for (p = string; length > 0; p++) - { - uint32_t ab, d; - - c = *p; - length--; - - if (c < 128) continue; /* ASCII character */ - - if (c < 0xc0) /* Isolated 10xx xxxx byte */ - { - *erroroffset = (PCRE2_SIZE)(p - string); - return PCRE2_ERROR_UTF8_ERR20; - } - - if (c >= 0xfe) /* Invalid 0xfe or 0xff bytes */ - { - *erroroffset = (PCRE2_SIZE)(p - string); - return PCRE2_ERROR_UTF8_ERR21; - } - - ab = PRIV(utf8_table4)[c & 0x3f]; /* Number of additional bytes (1-5) */ - if (length < ab) /* Missing bytes */ - { - *erroroffset = (PCRE2_SIZE)(p - string); - switch(ab - length) - { - case 1: return PCRE2_ERROR_UTF8_ERR1; - case 2: return PCRE2_ERROR_UTF8_ERR2; - case 3: return PCRE2_ERROR_UTF8_ERR3; - case 4: return PCRE2_ERROR_UTF8_ERR4; - case 5: return PCRE2_ERROR_UTF8_ERR5; - } - } - length -= ab; /* Length 
remaining */ - - /* Check top bits in the second byte */ - - if (((d = *(++p)) & 0xc0) != 0x80) - { - *erroroffset = (int)(p - string) - 1; - return PCRE2_ERROR_UTF8_ERR6; - } - - /* For each length, check that the remaining bytes start with the 0x80 bit - set and not the 0x40 bit. Then check for an overlong sequence, and for the - excluded range 0xd800 to 0xdfff. */ - - switch (ab) - { - /* 2-byte character. No further bytes to check for 0x80. Check first byte - for for xx00 000x (overlong sequence). */ - - case 1: if ((c & 0x3e) == 0) - { - *erroroffset = (int)(p - string) - 1; - return PCRE2_ERROR_UTF8_ERR15; - } - break; - - /* 3-byte character. Check third byte for 0x80. Then check first 2 bytes - for 1110 0000, xx0x xxxx (overlong sequence) or - 1110 1101, 1010 xxxx (0xd800 - 0xdfff) */ - - case 2: - if ((*(++p) & 0xc0) != 0x80) /* Third byte */ - { - *erroroffset = (int)(p - string) - 2; - return PCRE2_ERROR_UTF8_ERR7; - } - if (c == 0xe0 && (d & 0x20) == 0) - { - *erroroffset = (int)(p - string) - 2; - return PCRE2_ERROR_UTF8_ERR16; - } - if (c == 0xed && d >= 0xa0) - { - *erroroffset = (int)(p - string) - 2; - return PCRE2_ERROR_UTF8_ERR14; - } - break; - - /* 4-byte character. Check 3rd and 4th bytes for 0x80. 
Then check first 2 - bytes for for 1111 0000, xx00 xxxx (overlong sequence), then check for a - character greater than 0x0010ffff (f4 8f bf bf) */ - - case 3: - if ((*(++p) & 0xc0) != 0x80) /* Third byte */ - { - *erroroffset = (int)(p - string) - 2; - return PCRE2_ERROR_UTF8_ERR7; - } - if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ - { - *erroroffset = (int)(p - string) - 3; - return PCRE2_ERROR_UTF8_ERR8; - } - if (c == 0xf0 && (d & 0x30) == 0) - { - *erroroffset = (int)(p - string) - 3; - return PCRE2_ERROR_UTF8_ERR17; - } - if (c > 0xf4 || (c == 0xf4 && d > 0x8f)) - { - *erroroffset = (int)(p - string) - 3; - return PCRE2_ERROR_UTF8_ERR13; - } - break; - - /* 5-byte and 6-byte characters are not allowed by RFC 3629, and will be - rejected by the length test below. However, we do the appropriate tests - here so that overlong sequences get diagnosed, and also in case there is - ever an option for handling these larger code points. */ - - /* 5-byte character. Check 3rd, 4th, and 5th bytes for 0x80. Then check for - 1111 1000, xx00 0xxx */ - - case 4: - if ((*(++p) & 0xc0) != 0x80) /* Third byte */ - { - *erroroffset = (int)(p - string) - 2; - return PCRE2_ERROR_UTF8_ERR7; - } - if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ - { - *erroroffset = (int)(p - string) - 3; - return PCRE2_ERROR_UTF8_ERR8; - } - if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */ - { - *erroroffset = (int)(p - string) - 4; - return PCRE2_ERROR_UTF8_ERR9; - } - if (c == 0xf8 && (d & 0x38) == 0) - { - *erroroffset = (int)(p - string) - 4; - return PCRE2_ERROR_UTF8_ERR18; - } - break; - - /* 6-byte character. Check 3rd-6th bytes for 0x80. Then check for - 1111 1100, xx00 00xx. 
*/ - - case 5: - if ((*(++p) & 0xc0) != 0x80) /* Third byte */ - { - *erroroffset = (int)(p - string) - 2; - return PCRE2_ERROR_UTF8_ERR7; - } - if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ - { - *erroroffset = (int)(p - string) - 3; - return PCRE2_ERROR_UTF8_ERR8; - } - if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */ - { - *erroroffset = (int)(p - string) - 4; - return PCRE2_ERROR_UTF8_ERR9; - } - if ((*(++p) & 0xc0) != 0x80) /* Sixth byte */ - { - *erroroffset = (int)(p - string) - 5; - return PCRE2_ERROR_UTF8_ERR10; - } - if (c == 0xfc && (d & 0x3c) == 0) - { - *erroroffset = (int)(p - string) - 5; - return PCRE2_ERROR_UTF8_ERR19; - } - break; - } - - /* Character is valid under RFC 2279, but 4-byte and 5-byte characters are - excluded by RFC 3629. The pointer p is currently at the last byte of the - character. */ - - if (ab > 3) - { - *erroroffset = (int)(p - string) - ab; - return (ab == 4)? PCRE2_ERROR_UTF8_ERR11 : PCRE2_ERROR_UTF8_ERR12; - } - } -return 0; - - -/* ----------------- Check a UTF-16 string ----------------- */ - -#elif PCRE2_CODE_UNIT_WIDTH == 16 - -/* There's not so much work, nor so many errors, for UTF-16. -PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at the end of the string -PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate -PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate -*/ - -for (p = string; length > 0; p++) - { - c = *p; - length--; - - if ((c & 0xf800) != 0xd800) - { - /* Normal UTF-16 code point. Neither high nor low surrogate. */ - } - else if ((c & 0x0400) == 0) - { - /* High surrogate. Must be a followed by a low surrogate. */ - if (length == 0) - { - *erroroffset = p - string; - return PCRE2_ERROR_UTF16_ERR1; - } - p++; - length--; - if ((*p & 0xfc00) != 0xdc00) - { - *erroroffset = p - string - 1; - return PCRE2_ERROR_UTF16_ERR2; - } - } - else - { - /* Isolated low surrogate. Always an error. 
*/ - *erroroffset = p - string; - return PCRE2_ERROR_UTF16_ERR3; - } - } -return 0; - - - -/* ----------------- Check a UTF-32 string ----------------- */ - -#else - -/* There is very little to do for a UTF-32 string. -PCRE2_ERROR_UTF32_ERR1 Surrogate character -PCRE2_ERROR_UTF32_ERR2 Character > 0x10ffff -*/ - -for (p = string; length > 0; length--, p++) - { - c = *p; - if ((c & 0xfffff800u) != 0xd800u) - { - /* Normal UTF-32 code point. Neither high nor low surrogate. */ - if (c > 0x10ffffu) - { - *erroroffset = p - string; - return PCRE2_ERROR_UTF32_ERR2; - } - } - else - { - /* A surrogate */ - *erroroffset = p - string; - return PCRE2_ERROR_UTF32_ERR1; - } - } -return 0; -#endif /* CODE_UNIT_WIDTH */ -} -#endif /* SUPPORT_UNICODE */ - -/* End of pcre2_valid_utf.c */ diff --git a/pcre2/src/pcre2_xclass.c b/pcre2/src/pcre2_xclass.c deleted file mode 100644 index 8b052be66..000000000 --- a/pcre2/src/pcre2_xclass.c +++ /dev/null @@ -1,271 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2019 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - -/* This module contains an internal function that is used to match an extended -class. It is used by pcre2_auto_possessify() and by both pcre2_match() and -pcre2_def_match(). */ - - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - - -#include "pcre2_internal.h" - -/************************************************* -* Match character against an XCLASS * -*************************************************/ - -/* This function is called to match a character against an extended class that -might contain codepoints above 255 and/or Unicode properties. - -Arguments: - c the character - data points to the flag code unit of the XCLASS data - utf TRUE if in UTF mode - -Returns: TRUE if character matches, else FALSE -*/ - -BOOL -PRIV(xclass)(uint32_t c, PCRE2_SPTR data, BOOL utf) -{ -PCRE2_UCHAR t; -BOOL negated = (*data & XCL_NOT) != 0; - -#if PCRE2_CODE_UNIT_WIDTH == 8 -/* In 8 bit mode, this must always be TRUE. 
Help the compiler to know that. */ -utf = TRUE; -#endif - -/* Code points < 256 are matched against a bitmap, if one is present. If not, -we still carry on, because there may be ranges that start below 256 in the -additional data. */ - -if (c < 256) - { - if ((*data & XCL_HASPROP) == 0) - { - if ((*data & XCL_MAP) == 0) return negated; - return (((uint8_t *)(data + 1))[c/8] & (1u << (c&7))) != 0; - } - if ((*data & XCL_MAP) != 0 && - (((uint8_t *)(data + 1))[c/8] & (1u << (c&7))) != 0) - return !negated; /* char found */ - } - -/* First skip the bit map if present. Then match against the list of Unicode -properties or large chars or ranges that end with a large char. We won't ever -encounter XCL_PROP or XCL_NOTPROP when UTF support is not compiled. */ - -if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(PCRE2_UCHAR); - -while ((t = *data++) != XCL_END) - { - uint32_t x, y; - if (t == XCL_SINGLE) - { -#ifdef SUPPORT_UNICODE - if (utf) - { - GETCHARINC(x, data); /* macro generates multiple statements */ - } - else -#endif - x = *data++; - if (c == x) return !negated; - } - else if (t == XCL_RANGE) - { -#ifdef SUPPORT_UNICODE - if (utf) - { - GETCHARINC(x, data); /* macro generates multiple statements */ - GETCHARINC(y, data); /* macro generates multiple statements */ - } - else -#endif - { - x = *data++; - y = *data++; - } - if (c >= x && c <= y) return !negated; - } - -#ifdef SUPPORT_UNICODE - else /* XCL_PROP & XCL_NOTPROP */ - { - const ucd_record *prop = GET_UCD(c); - BOOL isprop = t == XCL_PROP; - - switch(*data) - { - case PT_ANY: - if (isprop) return !negated; - break; - - case PT_LAMP: - if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || - prop->chartype == ucp_Lt) == isprop) return !negated; - break; - - case PT_GC: - if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == isprop) - return !negated; - break; - - case PT_PC: - if ((data[1] == prop->chartype) == isprop) return !negated; - break; - - case PT_SC: - if ((data[1] == prop->script) == 
isprop) return !negated; - break; - - case PT_ALNUM: - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N) == isprop) - return !negated; - break; - - /* Perl space used to exclude VT, but from Perl 5.18 it is included, - which means that Perl space and POSIX space are now identical. PCRE - was changed at release 8.34. */ - - case PT_SPACE: /* Perl space */ - case PT_PXSPACE: /* POSIX space */ - switch(c) - { - HSPACE_CASES: - VSPACE_CASES: - if (isprop) return !negated; - break; - - default: - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == isprop) - return !negated; - break; - } - break; - - case PT_WORD: - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE) - == isprop) - return !negated; - break; - - case PT_UCNC: - if (c < 0xa0) - { - if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || - c == CHAR_GRAVE_ACCENT) == isprop) - return !negated; - } - else - { - if ((c < 0xd800 || c > 0xdfff) == isprop) - return !negated; - } - break; - - /* The following three properties can occur only in an XCLASS, as there - is no \p or \P coding for them. */ - - /* Graphic character. Implement this as not Z (space or separator) and - not C (other), except for Cf (format) with a few exceptions. This seems - to be what Perl does. The exceptional characters are: - - U+061C Arabic Letter Mark - U+180E Mongolian Vowel Separator - U+2066 - U+2069 Various "isolate"s - */ - - case PT_PXGRAPH: - if ((PRIV(ucp_gentype)[prop->chartype] != ucp_Z && - (PRIV(ucp_gentype)[prop->chartype] != ucp_C || - (prop->chartype == ucp_Cf && - c != 0x061c && c != 0x180e && (c < 0x2066 || c > 0x2069)) - )) == isprop) - return !negated; - break; - - /* Printable character: same as graphic, with the addition of Zs, i.e. - not Zl and not Zp, and U+180E. 
*/ - - case PT_PXPRINT: - if ((prop->chartype != ucp_Zl && - prop->chartype != ucp_Zp && - (PRIV(ucp_gentype)[prop->chartype] != ucp_C || - (prop->chartype == ucp_Cf && - c != 0x061c && (c < 0x2066 || c > 0x2069)) - )) == isprop) - return !negated; - break; - - /* Punctuation: all Unicode punctuation, plus ASCII characters that - Unicode treats as symbols rather than punctuation, for Perl - compatibility (these are $+<=>^`|~). */ - - case PT_PXPUNCT: - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_P || - (c < 128 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop) - return !negated; - break; - - /* This should never occur, but compilers may mutter if there is no - default. */ - - default: - return FALSE; - } - - data += 2; - } -#else - (void)utf; /* Avoid compiler warning */ -#endif /* SUPPORT_UNICODE */ - } - -return negated; /* char did not match */ -} - -/* End of pcre2_xclass.c */ diff --git a/pcre2/src/pcre2demo.c b/pcre2/src/pcre2demo.c deleted file mode 100644 index a49f1f8e5..000000000 --- a/pcre2/src/pcre2demo.c +++ /dev/null @@ -1,494 +0,0 @@ -/************************************************* -* PCRE2 DEMONSTRATION PROGRAM * -*************************************************/ - -/* This is a demonstration program to illustrate a straightforward way of -using the PCRE2 regular expression library from a C program. See the -pcre2sample documentation for a short discussion ("man pcre2sample" if you have -the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is -incompatible with the original PCRE API. - -There are actually three libraries, each supporting a different code unit -width. This demonstration program uses the 8-bit library. The default is to -process each code unit as a separate character, but if the pattern begins with -"(*UTF)", both it and the subject are treated as UTF-8 strings, where -characters may occupy multiple code units. 
- -In Unix-like environments, if PCRE2 is installed in your standard system -libraries, you should be able to compile this program using this command: - -cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo - -If PCRE2 is not installed in a standard place, it is likely to be installed -with support for the pkg-config mechanism. If you have pkg-config, you can -compile this program using this command: - -cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo - -If you do not have pkg-config, you may have to use something like this: - -cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \ - -R/usr/local/lib -lpcre2-8 -o pcre2demo - -Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and -library files for PCRE2 are installed on your system. Only some operating -systems (Solaris is one) use the -R option. - -Building under Windows: - -If you want to statically link this program against a non-dll .a file, you must -define PCRE2_STATIC before including pcre2.h, so in this environment, uncomment -the following line. */ - -/* #define PCRE2_STATIC */ - -/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h. -For a program that uses only one code unit width, setting it to 8, 16, or 32 -makes it possible to use generic function names such as pcre2_compile(). Note -that just changing 8 to 16 (for example) is not sufficient to convert this -program to process 16-bit characters. Even in a fully 16-bit environment, where -string-handling functions such as strcmp() and printf() work with 16-bit -characters, the code for handling the table of named substrings will still need -to be modified. */ - -#define PCRE2_CODE_UNIT_WIDTH 8 - -#include -#include -#include - - -/************************************************************************** -* Here is the program. 
The API includes the concept of "contexts" for * -* setting up unusual interface requirements for compiling and matching, * -* such as custom memory managers and non-standard newline definitions. * -* This program does not do any of this, so it makes no use of contexts, * -* always passing NULL where a context could be given. * -**************************************************************************/ - -int main(int argc, char **argv) -{ -pcre2_code *re; -PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */ -PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */ -PCRE2_SPTR name_table; - -int crlf_is_newline; -int errornumber; -int find_all; -int i; -int rc; -int utf8; - -uint32_t option_bits; -uint32_t namecount; -uint32_t name_entry_size; -uint32_t newline; - -PCRE2_SIZE erroroffset; -PCRE2_SIZE *ovector; -PCRE2_SIZE subject_length; - -pcre2_match_data *match_data; - - -/************************************************************************** -* First, sort out the command line. There is only one possible option at * -* the moment, "-g" to request repeated matching to find all occurrences, * -* like Perl's /g option. We set the variable find_all to a non-zero value * -* if the -g option is present. * -**************************************************************************/ - -find_all = 0; -for (i = 1; i < argc; i++) - { - if (strcmp(argv[i], "-g") == 0) find_all = 1; - else if (argv[i][0] == '-') - { - printf("Unrecognised option %s\n", argv[i]); - return 1; - } - else break; - } - -/* After the options, we require exactly two arguments, which are the pattern, -and the subject string. */ - -if (argc - i != 2) - { - printf("Exactly two arguments required: a regex and a subject string\n"); - return 1; - } - -/* Pattern and subject are char arguments, so they can be straightforwardly -cast to PCRE2_SPTR because we are working in 8-bit code units. 
The subject -length is cast to PCRE2_SIZE for completeness, though PCRE2_SIZE is in fact -defined to be size_t. */ - -pattern = (PCRE2_SPTR)argv[i]; -subject = (PCRE2_SPTR)argv[i+1]; -subject_length = (PCRE2_SIZE)strlen((char *)subject); - - -/************************************************************************* -* Now we are going to compile the regular expression pattern, and handle * -* any errors that are detected. * -*************************************************************************/ - -re = pcre2_compile( - pattern, /* the pattern */ - PCRE2_ZERO_TERMINATED, /* indicates pattern is zero-terminated */ - 0, /* default options */ - &errornumber, /* for error number */ - &erroroffset, /* for error offset */ - NULL); /* use default compile context */ - -/* Compilation failed: print the error message and exit. */ - -if (re == NULL) - { - PCRE2_UCHAR buffer[256]; - pcre2_get_error_message(errornumber, buffer, sizeof(buffer)); - printf("PCRE2 compilation failed at offset %d: %s\n", (int)erroroffset, - buffer); - return 1; - } - - -/************************************************************************* -* If the compilation succeeded, we call PCRE2 again, in order to do a * -* pattern match against the subject string. This does just ONE match. If * -* further matching is needed, it will be done below. Before running the * -* match we must set up a match_data block for holding the result. Using * -* pcre2_match_data_create_from_pattern() ensures that the block is * -* exactly the right size for the number of capturing parentheses in the * -* pattern. If you need to know the actual size of a match_data block as * -* a number of bytes, you can find it like this: * -* * -* PCRE2_SIZE match_data_size = pcre2_get_match_data_size(match_data); * -*************************************************************************/ - -match_data = pcre2_match_data_create_from_pattern(re, NULL); - -/* Now run the match. 
*/ - -rc = pcre2_match( - re, /* the compiled pattern */ - subject, /* the subject string */ - subject_length, /* the length of the subject */ - 0, /* start at offset 0 in the subject */ - 0, /* default options */ - match_data, /* block for storing the result */ - NULL); /* use default match context */ - -/* Matching failed: handle error cases */ - -if (rc < 0) - { - switch(rc) - { - case PCRE2_ERROR_NOMATCH: printf("No match\n"); break; - /* - Handle other special cases if you like - */ - default: printf("Matching error %d\n", rc); break; - } - pcre2_match_data_free(match_data); /* Release memory used for the match */ - pcre2_code_free(re); /* data and the compiled pattern. */ - return 1; - } - -/* Match succeded. Get a pointer to the output vector, where string offsets are -stored. */ - -ovector = pcre2_get_ovector_pointer(match_data); -printf("Match succeeded at offset %d\n", (int)ovector[0]); - - -/************************************************************************* -* We have found the first match within the subject string. If the output * -* vector wasn't big enough, say so. Then output any substrings that were * -* captured. * -*************************************************************************/ - -/* The output vector wasn't big enough. This should not happen, because we used -pcre2_match_data_create_from_pattern() above. */ - -if (rc == 0) - printf("ovector was not big enough for all the captured substrings\n"); - -/* We must guard against patterns such as /(?=.\K)/ that use \K in an assertion -to set the start of a match later than its end. In this demonstration program, -we just detect this case and give up. 
*/ - -if (ovector[0] > ovector[1]) - { - printf("\\K was used in an assertion to set the match start after its end.\n" - "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]), - (char *)(subject + ovector[1])); - printf("Run abandoned\n"); - pcre2_match_data_free(match_data); - pcre2_code_free(re); - return 1; - } - -/* Show substrings stored in the output vector by number. Obviously, in a real -application you might want to do things other than print them. */ - -for (i = 0; i < rc; i++) - { - PCRE2_SPTR substring_start = subject + ovector[2*i]; - PCRE2_SIZE substring_length = ovector[2*i+1] - ovector[2*i]; - printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start); - } - - -/************************************************************************** -* That concludes the basic part of this demonstration program. We have * -* compiled a pattern, and performed a single match. The code that follows * -* shows first how to access named substrings, and then how to code for * -* repeated matches on the same subject. * -**************************************************************************/ - -/* See if there are any named substrings, and if so, show them by name. First -we have to extract the count of named parentheses from the pattern. */ - -(void)pcre2_pattern_info( - re, /* the compiled pattern */ - PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */ - &namecount); /* where to put the answer */ - -if (namecount == 0) printf("No named substrings\n"); else - { - PCRE2_SPTR tabptr; - printf("Named substrings\n"); - - /* Before we can access the substrings, we must extract the table for - translating names to numbers, and the size of each entry in the table. 
*/ - - (void)pcre2_pattern_info( - re, /* the compiled pattern */ - PCRE2_INFO_NAMETABLE, /* address of the table */ - &name_table); /* where to put the answer */ - - (void)pcre2_pattern_info( - re, /* the compiled pattern */ - PCRE2_INFO_NAMEENTRYSIZE, /* size of each entry in the table */ - &name_entry_size); /* where to put the answer */ - - /* Now we can scan the table and, for each entry, print the number, the name, - and the substring itself. In the 8-bit library the number is held in two - bytes, most significant first. */ - - tabptr = name_table; - for (i = 0; i < namecount; i++) - { - int n = (tabptr[0] << 8) | tabptr[1]; - printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2, - (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]); - tabptr += name_entry_size; - } - } - - -/************************************************************************* -* If the "-g" option was given on the command line, we want to continue * -* to search for additional matches in the subject string, in a similar * -* way to the /g option in Perl. This turns out to be trickier than you * -* might think because of the possibility of matching an empty string. * -* What happens is as follows: * -* * -* If the previous match was NOT for an empty string, we can just start * -* the next match at the end of the previous one. * -* * -* If the previous match WAS for an empty string, we can't do that, as it * -* would lead to an infinite loop. Instead, a call of pcre2_match() is * -* made with the PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set. The * -* first of these tells PCRE2 that an empty string at the start of the * -* subject is not a valid match; other possibilities must be tried. The * -* second flag restricts PCRE2 to one match attempt at the initial string * -* position. If this match succeeds, an alternative to the empty string * -* match has been found, and we can print it and proceed round the loop, * -* advancing by the length of whatever was found. 
If this match does not * -* succeed, we still stay in the loop, advancing by just one character. * -* In UTF-8 mode, which can be set by (*UTF) in the pattern, this may be * -* more than one byte. * -* * -* However, there is a complication concerned with newlines. When the * -* newline convention is such that CRLF is a valid newline, we must * -* advance by two characters rather than one. The newline convention can * -* be set in the regex by (*CR), etc.; if not, we must find the default. * -*************************************************************************/ - -if (!find_all) /* Check for -g */ - { - pcre2_match_data_free(match_data); /* Release the memory that was used */ - pcre2_code_free(re); /* for the match data and the pattern. */ - return 0; /* Exit the program. */ - } - -/* Before running the loop, check for UTF-8 and whether CRLF is a valid newline -sequence. First, find the options with which the regex was compiled and extract -the UTF state. */ - -(void)pcre2_pattern_info(re, PCRE2_INFO_ALLOPTIONS, &option_bits); -utf8 = (option_bits & PCRE2_UTF) != 0; - -/* Now find the newline convention and see whether CRLF is a valid newline -sequence. */ - -(void)pcre2_pattern_info(re, PCRE2_INFO_NEWLINE, &newline); -crlf_is_newline = newline == PCRE2_NEWLINE_ANY || - newline == PCRE2_NEWLINE_CRLF || - newline == PCRE2_NEWLINE_ANYCRLF; - -/* Loop for second and subsequent matches */ - -for (;;) - { - uint32_t options = 0; /* Normally no options */ - PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */ - - /* If the previous match was for an empty string, we are finished if we are - at the end of the subject. Otherwise, arrange to run another match at the - same point to see if a non-empty match can be found. 
*/ - - if (ovector[0] == ovector[1]) - { - if (ovector[0] == subject_length) break; - options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; - } - - /* If the previous match was not an empty string, there is one tricky case to - consider. If a pattern contains \K within a lookbehind assertion at the - start, the end of the matched string can be at the offset where the match - started. Without special action, this leads to a loop that keeps on matching - the same substring. We must detect this case and arrange to move the start on - by one character. The pcre2_get_startchar() function returns the starting - offset that was passed to pcre2_match(). */ - - else - { - PCRE2_SIZE startchar = pcre2_get_startchar(match_data); - if (start_offset <= startchar) - { - if (startchar >= subject_length) break; /* Reached end of subject. */ - start_offset = startchar + 1; /* Advance by one character. */ - if (utf8) /* If UTF-8, it may be more */ - { /* than one code unit. */ - for (; start_offset < subject_length; start_offset++) - if ((subject[start_offset] & 0xc0) != 0x80) break; - } - } - } - - /* Run the next matching operation */ - - rc = pcre2_match( - re, /* the compiled pattern */ - subject, /* the subject string */ - subject_length, /* the length of the subject */ - start_offset, /* starting offset in the subject */ - options, /* options */ - match_data, /* block for storing the result */ - NULL); /* use default match context */ - - /* This time, a result of NOMATCH isn't an error. If the value in "options" - is zero, it just means we have found all possible matches, so the loop ends. - Otherwise, it means we have failed to find a non-empty-string match at a - point where there was a previous empty-string match. In this case, we do what - Perl does: advance the matching position by one character, and continue. We - do this by setting the "end of previous match" offset, because that is picked - up at the top of the loop as the point at which to start again. 
- - There are two complications: (a) When CRLF is a valid newline sequence, and - the current position is just before it, advance by an extra byte. (b) - Otherwise we must ensure that we skip an entire UTF character if we are in - UTF mode. */ - - if (rc == PCRE2_ERROR_NOMATCH) - { - if (options == 0) break; /* All matches found */ - ovector[1] = start_offset + 1; /* Advance one code unit */ - if (crlf_is_newline && /* If CRLF is a newline & */ - start_offset < subject_length - 1 && /* we are at CRLF, */ - subject[start_offset] == '\r' && - subject[start_offset + 1] == '\n') - ovector[1] += 1; /* Advance by one more. */ - else if (utf8) /* Otherwise, ensure we */ - { /* advance a whole UTF-8 */ - while (ovector[1] < subject_length) /* character. */ - { - if ((subject[ovector[1]] & 0xc0) != 0x80) break; - ovector[1] += 1; - } - } - continue; /* Go round the loop again */ - } - - /* Other matching errors are not recoverable. */ - - if (rc < 0) - { - printf("Matching error %d\n", rc); - pcre2_match_data_free(match_data); - pcre2_code_free(re); - return 1; - } - - /* Match succeded */ - - printf("\nMatch succeeded again at offset %d\n", (int)ovector[0]); - - /* The match succeeded, but the output vector wasn't big enough. This - should not happen. */ - - if (rc == 0) - printf("ovector was not big enough for all the captured substrings\n"); - - /* We must guard against patterns such as /(?=.\K)/ that use \K in an - assertion to set the start of a match later than its end. In this - demonstration program, we just detect this case and give up. 
*/ - - if (ovector[0] > ovector[1]) - { - printf("\\K was used in an assertion to set the match start after its end.\n" - "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]), - (char *)(subject + ovector[1])); - printf("Run abandoned\n"); - pcre2_match_data_free(match_data); - pcre2_code_free(re); - return 1; - } - - /* As before, show substrings stored in the output vector by number, and then - also any named substrings. */ - - for (i = 0; i < rc; i++) - { - PCRE2_SPTR substring_start = subject + ovector[2*i]; - size_t substring_length = ovector[2*i+1] - ovector[2*i]; - printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start); - } - - if (namecount == 0) printf("No named substrings\n"); else - { - PCRE2_SPTR tabptr = name_table; - printf("Named substrings\n"); - for (i = 0; i < namecount; i++) - { - int n = (tabptr[0] << 8) | tabptr[1]; - printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2, - (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]); - tabptr += name_entry_size; - } - } - } /* End of loop to find second and subsequent matches */ - -printf("\n"); -pcre2_match_data_free(match_data); -pcre2_code_free(re); -return 0; -} - -/* End of pcre2demo.c */ diff --git a/pcre2/src/pcre2posix.c b/pcre2/src/pcre2posix.c deleted file mode 100644 index b24620a45..000000000 --- a/pcre2/src/pcre2posix.c +++ /dev/null @@ -1,423 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. 
- - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2019 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - - -/* This module is a wrapper that provides a POSIX API to the underlying PCRE2 -functions. The operative functions are called pcre2_regcomp(), etc., with -wrappers that use the plain POSIX names. 
In addition, pcre2posix.h defines the -POSIX names as macros for the pcre2_xxx functions, so any program that includes -it and uses the POSIX names will call the base functions directly. This makes -it easier for an application to be sure it gets the PCRE2 versions in the -presence of other POSIX regex libraries. */ - - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - - -/* Ensure that the PCRE2POSIX_EXP_xxx macros are set appropriately for -compiling these functions. This must come before including pcre2posix.h, where -they are set for an application (using these functions) if they have not -previously been set. */ - -#if defined(_WIN32) && !defined(PCRE2_STATIC) -# define PCRE2POSIX_EXP_DECL extern __declspec(dllexport) -# define PCRE2POSIX_EXP_DEFN __declspec(dllexport) -#endif - -/* Older versions of MSVC lack snprintf(). This define allows for -warning/error-free compilation and testing with MSVC compilers back to at least -MSVC 10/2010. Except for VC6 (which is missing some fundamentals and fails). */ - -#if defined(_MSC_VER) && (_MSC_VER < 1900) -#define snprintf _snprintf -#endif - - -/* Compile-time error numbers start at this value. It should probably never be -changed. This #define is a copy of the one in pcre2_internal.h. */ - -#define COMPILE_ERROR_BASE 100 - - -/* Standard C headers */ - -#include -#include -#include -#include -#include -#include - -/* PCRE2 headers */ - -#include "pcre2.h" -#include "pcre2posix.h" - -/* When compiling with the MSVC compiler, it is sometimes necessary to include -a "calling convention" before exported function names. (This is secondhand -information; I know nothing about MSVC myself). For example, something like - - void __cdecl function(....) - -might be needed. In order to make this easy, all the exported functions have -PCRE2_CALL_CONVENTION just before their names. It is rarely needed; if not -set, we ensure here that it has no effect. 
*/ - -#ifndef PCRE2_CALL_CONVENTION -#define PCRE2_CALL_CONVENTION -#endif - -/* Table to translate PCRE2 compile time error codes into POSIX error codes. -Only a few PCRE2 errors with a value greater than 23 turn into special POSIX -codes: most go to REG_BADPAT. The second table lists, in pairs, those that -don't. */ - -static const int eint1[] = { - 0, /* No error */ - REG_EESCAPE, /* \ at end of pattern */ - REG_EESCAPE, /* \c at end of pattern */ - REG_EESCAPE, /* unrecognized character follows \ */ - REG_BADBR, /* numbers out of order in {} quantifier */ - /* 5 */ - REG_BADBR, /* number too big in {} quantifier */ - REG_EBRACK, /* missing terminating ] for character class */ - REG_ECTYPE, /* invalid escape sequence in character class */ - REG_ERANGE, /* range out of order in character class */ - REG_BADRPT, /* nothing to repeat */ - /* 10 */ - REG_ASSERT, /* internal error: unexpected repeat */ - REG_BADPAT, /* unrecognized character after (? or (?- */ - REG_BADPAT, /* POSIX named classes are supported only within a class */ - REG_BADPAT, /* POSIX collating elements are not supported */ - REG_EPAREN, /* missing ) */ - /* 15 */ - REG_ESUBREG, /* reference to non-existent subpattern */ - REG_INVARG, /* pattern passed as NULL */ - REG_INVARG, /* unknown compile-time option bit(s) */ - REG_EPAREN, /* missing ) after (?# comment */ - REG_ESIZE, /* parentheses nested too deeply */ - /* 20 */ - REG_ESIZE, /* regular expression too large */ - REG_ESPACE, /* failed to get memory */ - REG_EPAREN, /* unmatched closing parenthesis */ - REG_ASSERT /* internal error: code overflow */ - }; - -static const int eint2[] = { - 30, REG_ECTYPE, /* unknown POSIX class name */ - 32, REG_INVARG, /* this version of PCRE2 does not have Unicode support */ - 37, REG_EESCAPE, /* PCRE2 does not support \L, \l, \N{name}, \U, or \u */ - 56, REG_INVARG, /* internal error: unknown newline setting */ - 92, REG_INVARG, /* invalid option bits with PCRE2_LITERAL */ -}; - -/* Table of texts 
corresponding to POSIX error codes */ - -static const char *const pstring[] = { - "", /* Dummy for value 0 */ - "internal error", /* REG_ASSERT */ - "invalid repeat counts in {}", /* BADBR */ - "pattern error", /* BADPAT */ - "? * + invalid", /* BADRPT */ - "unbalanced {}", /* EBRACE */ - "unbalanced []", /* EBRACK */ - "collation error - not relevant", /* ECOLLATE */ - "bad class", /* ECTYPE */ - "bad escape sequence", /* EESCAPE */ - "empty expression", /* EMPTY */ - "unbalanced ()", /* EPAREN */ - "bad range inside []", /* ERANGE */ - "expression too big", /* ESIZE */ - "failed to get memory", /* ESPACE */ - "bad back reference", /* ESUBREG */ - "bad argument", /* INVARG */ - "match failed" /* NOMATCH */ -}; - - - -/************************************************* -* Wrappers with traditional POSIX names * -*************************************************/ - -/* Keep defining them to preseve the ABI for applications linked to the pcre2 -POSIX library before these names were changed into macros in pcre2posix.h. -This also ensures that the POSIX names are callable from languages that do not -include pcre2posix.h. It is vital to #undef the macro definitions from -pcre2posix.h! 
*/ - -#undef regerror -PCRE2POSIX_EXP_DECL size_t regerror(int, const regex_t *, char *, size_t); -PCRE2POSIX_EXP_DEFN size_t PCRE2_CALL_CONVENTION -regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size) -{ -return pcre2_regerror(errcode, preg, errbuf, errbuf_size); -} - -#undef regfree -PCRE2POSIX_EXP_DECL void regfree(regex_t *); -PCRE2POSIX_EXP_DEFN void PCRE2_CALL_CONVENTION -regfree(regex_t *preg) -{ -pcre2_regfree(preg); -} - -#undef regcomp -PCRE2POSIX_EXP_DECL int regcomp(regex_t *, const char *, int); -PCRE2POSIX_EXP_DEFN int PCRE2_CALL_CONVENTION -regcomp(regex_t *preg, const char *pattern, int cflags) -{ -return pcre2_regcomp(preg, pattern, cflags); -} - -#undef regexec -PCRE2POSIX_EXP_DECL int regexec(const regex_t *, const char *, size_t, - regmatch_t *, int); -PCRE2POSIX_EXP_DEFN int PCRE2_CALL_CONVENTION -regexec(const regex_t *preg, const char *string, size_t nmatch, - regmatch_t pmatch[], int eflags) -{ -return pcre2_regexec(preg, string, nmatch, pmatch, eflags); -} - - - -/************************************************* -* Translate error code to string * -*************************************************/ - -PCRE2POSIX_EXP_DEFN size_t PCRE2_CALL_CONVENTION -pcre2_regerror(int errcode, const regex_t *preg, char *errbuf, - size_t errbuf_size) -{ -int used; -const char *message; - -message = (errcode <= 0 || errcode >= (int)(sizeof(pstring)/sizeof(char *)))? 
- "unknown error code" : pstring[errcode]; - -if (preg != NULL && (int)preg->re_erroffset != -1) - { - used = snprintf(errbuf, errbuf_size, "%s at offset %-6d", message, - (int)preg->re_erroffset); - } -else - { - used = snprintf(errbuf, errbuf_size, "%s", message); - } - -return used + 1; -} - - - -/************************************************* -* Free store held by a regex * -*************************************************/ - -PCRE2POSIX_EXP_DEFN void PCRE2_CALL_CONVENTION -pcre2_regfree(regex_t *preg) -{ -pcre2_match_data_free(preg->re_match_data); -pcre2_code_free(preg->re_pcre2_code); -} - - - -/************************************************* -* Compile a regular expression * -*************************************************/ - -/* -Arguments: - preg points to a structure for recording the compiled expression - pattern the pattern to compile - cflags compilation flags - -Returns: 0 on success - various non-zero codes on failure -*/ - -PCRE2POSIX_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_regcomp(regex_t *preg, const char *pattern, int cflags) -{ -PCRE2_SIZE erroffset; -PCRE2_SIZE patlen; -int errorcode; -int options = 0; -int re_nsub = 0; - -patlen = ((cflags & REG_PEND) != 0)? 
(PCRE2_SIZE)(preg->re_endp - pattern) : - PCRE2_ZERO_TERMINATED; - -if ((cflags & REG_ICASE) != 0) options |= PCRE2_CASELESS; -if ((cflags & REG_NEWLINE) != 0) options |= PCRE2_MULTILINE; -if ((cflags & REG_DOTALL) != 0) options |= PCRE2_DOTALL; -if ((cflags & REG_NOSPEC) != 0) options |= PCRE2_LITERAL; -if ((cflags & REG_UTF) != 0) options |= PCRE2_UTF; -if ((cflags & REG_UCP) != 0) options |= PCRE2_UCP; -if ((cflags & REG_UNGREEDY) != 0) options |= PCRE2_UNGREEDY; - -preg->re_cflags = cflags; -preg->re_pcre2_code = pcre2_compile((PCRE2_SPTR)pattern, patlen, options, - &errorcode, &erroffset, NULL); -preg->re_erroffset = erroffset; - -if (preg->re_pcre2_code == NULL) - { - unsigned int i; - - /* A negative value is a UTF error; otherwise all error codes are greater - than COMPILE_ERROR_BASE, but check, just in case. */ - - if (errorcode < COMPILE_ERROR_BASE) return REG_BADPAT; - errorcode -= COMPILE_ERROR_BASE; - - if (errorcode < (int)(sizeof(eint1)/sizeof(const int))) - return eint1[errorcode]; - for (i = 0; i < sizeof(eint2)/sizeof(const int); i += 2) - if (errorcode == eint2[i]) return eint2[i+1]; - return REG_BADPAT; - } - -(void)pcre2_pattern_info((const pcre2_code *)preg->re_pcre2_code, - PCRE2_INFO_CAPTURECOUNT, &re_nsub); -preg->re_nsub = (size_t)re_nsub; -preg->re_match_data = pcre2_match_data_create(re_nsub + 1, NULL); -preg->re_erroffset = (size_t)(-1); /* No meaning after successful compile */ - -if (preg->re_match_data == NULL) - { - pcre2_code_free(preg->re_pcre2_code); - return REG_ESPACE; - } - -return 0; -} - - - -/************************************************* -* Match a regular expression * -*************************************************/ - -/* A suitable match_data block, large enough to hold all possible captures, was -obtained when the pattern was compiled, to save having to allocate and free it -for each match. 
If REG_NOSUB was specified at compile time, the nmatch and -pmatch arguments are ignored, and the only result is yes/no/error. */ - -PCRE2POSIX_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_regexec(const regex_t *preg, const char *string, size_t nmatch, - regmatch_t pmatch[], int eflags) -{ -int rc, so, eo; -int options = 0; -pcre2_match_data *md = (pcre2_match_data *)preg->re_match_data; - -if ((eflags & REG_NOTBOL) != 0) options |= PCRE2_NOTBOL; -if ((eflags & REG_NOTEOL) != 0) options |= PCRE2_NOTEOL; -if ((eflags & REG_NOTEMPTY) != 0) options |= PCRE2_NOTEMPTY; - -/* When REG_NOSUB was specified, or if no vector has been passed in which to -put captured strings, ensure that nmatch is zero. This will stop any attempt to -write to pmatch. */ - -if ((preg->re_cflags & REG_NOSUB) != 0 || pmatch == NULL) nmatch = 0; - -/* REG_STARTEND is a BSD extension, to allow for non-NUL-terminated strings. -The man page from OS X says "REG_STARTEND affects only the location of the -string, not how it is matched". That is why the "so" value is used to bump the -start location rather than being passed as a PCRE2 "starting offset". */ - -if ((eflags & REG_STARTEND) != 0) - { - if (pmatch == NULL) return REG_INVARG; - so = pmatch[0].rm_so; - eo = pmatch[0].rm_eo; - } -else - { - so = 0; - eo = (int)strlen(string); - } - -rc = pcre2_match((const pcre2_code *)preg->re_pcre2_code, - (PCRE2_SPTR)string + so, (eo - so), 0, options, md, NULL); - -/* Successful match */ - -if (rc >= 0) - { - size_t i; - PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(md); - if ((size_t)rc > nmatch) rc = (int)nmatch; - for (i = 0; i < (size_t)rc; i++) - { - pmatch[i].rm_so = (ovector[i*2] == PCRE2_UNSET)? -1 : - (int)(ovector[i*2] + so); - pmatch[i].rm_eo = (ovector[i*2+1] == PCRE2_UNSET)? 
-1 : - (int)(ovector[i*2+1] + so); - } - for (; i < nmatch; i++) pmatch[i].rm_so = pmatch[i].rm_eo = -1; - return 0; - } - -/* Unsuccessful match */ - -if (rc <= PCRE2_ERROR_UTF8_ERR1 && rc >= PCRE2_ERROR_UTF8_ERR21) - return REG_INVARG; - -switch(rc) - { - default: return REG_ASSERT; - case PCRE2_ERROR_BADMODE: return REG_INVARG; - case PCRE2_ERROR_BADMAGIC: return REG_INVARG; - case PCRE2_ERROR_BADOPTION: return REG_INVARG; - case PCRE2_ERROR_BADUTFOFFSET: return REG_INVARG; - case PCRE2_ERROR_MATCHLIMIT: return REG_ESPACE; - case PCRE2_ERROR_NOMATCH: return REG_NOMATCH; - case PCRE2_ERROR_NOMEMORY: return REG_ESPACE; - case PCRE2_ERROR_NULL: return REG_INVARG; - } -} - -/* End of pcre2posix.c */ diff --git a/pcre2/src/pcre2posix.h b/pcre2/src/pcre2posix.h deleted file mode 100644 index 3a663b9ff..000000000 --- a/pcre2/src/pcre2posix.h +++ /dev/null @@ -1,170 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE2 is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. This is -the public header file to be #included by applications that call PCRE2 via the -POSIX wrapper interface. - - Written by Philip Hazel - Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2019 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. 
- - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - - -/* Have to include stdlib.h in order to ensure that size_t is defined. */ - -#include - -/* Allow for C++ users */ - -#ifdef __cplusplus -extern "C" { -#endif - -/* Options, mostly defined by POSIX, but with some extras. 
*/ - -#define REG_ICASE 0x0001 /* Maps to PCRE2_CASELESS */ -#define REG_NEWLINE 0x0002 /* Maps to PCRE2_MULTILINE */ -#define REG_NOTBOL 0x0004 /* Maps to PCRE2_NOTBOL */ -#define REG_NOTEOL 0x0008 /* Maps to PCRE2_NOTEOL */ -#define REG_DOTALL 0x0010 /* NOT defined by POSIX; maps to PCRE2_DOTALL */ -#define REG_NOSUB 0x0020 /* Do not report what was matched */ -#define REG_UTF 0x0040 /* NOT defined by POSIX; maps to PCRE2_UTF */ -#define REG_STARTEND 0x0080 /* BSD feature: pass subject string by so,eo */ -#define REG_NOTEMPTY 0x0100 /* NOT defined by POSIX; maps to PCRE2_NOTEMPTY */ -#define REG_UNGREEDY 0x0200 /* NOT defined by POSIX; maps to PCRE2_UNGREEDY */ -#define REG_UCP 0x0400 /* NOT defined by POSIX; maps to PCRE2_UCP */ -#define REG_PEND 0x0800 /* GNU feature: pass end pattern by re_endp */ -#define REG_NOSPEC 0x1000 /* Maps to PCRE2_LITERAL */ - -/* This is not used by PCRE2, but by defining it we make it easier -to slot PCRE2 into existing programs that make POSIX calls. */ - -#define REG_EXTENDED 0 - -/* Error values. Not all these are relevant or used by the wrapper. */ - -enum { - REG_ASSERT = 1, /* internal error ? */ - REG_BADBR, /* invalid repeat counts in {} */ - REG_BADPAT, /* pattern error */ - REG_BADRPT, /* ? * + invalid */ - REG_EBRACE, /* unbalanced {} */ - REG_EBRACK, /* unbalanced [] */ - REG_ECOLLATE, /* collation error - not relevant */ - REG_ECTYPE, /* bad class */ - REG_EESCAPE, /* bad escape sequence */ - REG_EMPTY, /* empty expression */ - REG_EPAREN, /* unbalanced () */ - REG_ERANGE, /* bad range inside [] */ - REG_ESIZE, /* expression too big */ - REG_ESPACE, /* failed to get memory */ - REG_ESUBREG, /* bad back reference */ - REG_INVARG, /* bad argument */ - REG_NOMATCH /* match failed */ -}; - - -/* The structure representing a compiled regular expression. It is also used -for passing the pattern end pointer when REG_PEND is set. 
*/ - -typedef struct { - void *re_pcre2_code; - void *re_match_data; - const char *re_endp; - size_t re_nsub; - size_t re_erroffset; - int re_cflags; -} regex_t; - -/* The structure in which a captured offset is returned. */ - -typedef int regoff_t; - -typedef struct { - regoff_t rm_so; - regoff_t rm_eo; -} regmatch_t; - -/* When an application links to a PCRE2 DLL in Windows, the symbols that are -imported have to be identified as such. When building PCRE2, the appropriate -export settings are needed, and are set in pcre2posix.c before including this -file. */ - -#if defined(_WIN32) && !defined(PCRE2_STATIC) && !defined(PCRE2POSIX_EXP_DECL) -# define PCRE2POSIX_EXP_DECL extern __declspec(dllimport) -# define PCRE2POSIX_EXP_DEFN __declspec(dllimport) -#endif - -/* By default, we use the standard "extern" declarations. */ - -#ifndef PCRE2POSIX_EXP_DECL -# ifdef __cplusplus -# define PCRE2POSIX_EXP_DECL extern "C" -# define PCRE2POSIX_EXP_DEFN extern "C" -# else -# define PCRE2POSIX_EXP_DECL extern -# define PCRE2POSIX_EXP_DEFN extern -# endif -#endif - -/* The functions. The actual code is in functions with pcre2_xxx names for -uniqueness. POSIX names are provided as macros for API compatibility with POSIX -regex functions. It's done this way to ensure to they are always linked from -the PCRE2 library and not by accident from elsewhere (regex_t differs in size -elsewhere). */ - -PCRE2POSIX_EXP_DECL int pcre2_regcomp(regex_t *, const char *, int); -PCRE2POSIX_EXP_DECL int pcre2_regexec(const regex_t *, const char *, size_t, - regmatch_t *, int); -PCRE2POSIX_EXP_DECL size_t pcre2_regerror(int, const regex_t *, char *, size_t); -PCRE2POSIX_EXP_DECL void pcre2_regfree(regex_t *); - -#define regcomp pcre2_regcomp -#define regexec pcre2_regexec -#define regerror pcre2_regerror -#define regfree pcre2_regfree - -/* Debian had a patch that used different names. These are now here to save -them having to maintain their own patch, but are not documented by PCRE2. 
*/ - -#define PCRE2regcomp pcre2_regcomp -#define PCRE2regexec pcre2_regexec -#define PCRE2regerror pcre2_regerror -#define PCRE2regfree pcre2_regfree - -#ifdef __cplusplus -} /* extern "C" */ -#endif - -/* End of pcre2posix.h */ diff --git a/pcre2/test-driver b/pcre2/test-driver deleted file mode 100755 index 89dba1e07..000000000 --- a/pcre2/test-driver +++ /dev/null @@ -1,148 +0,0 @@ -#! /bin/sh -# test-driver - basic testsuite driver script. - -scriptversion=2018-03-07.03; # UTC - -# Copyright (C) 2011-2020 Free Software Foundation, Inc. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2, or (at your option) -# any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . - -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - -# This file is maintained in Automake, please report -# bugs to or send patches to -# . - -# Make unconditional expansion of undefined variables an error. This -# helps a lot in preventing typo-related bugs. -set -u - -usage_error () -{ - echo "$0: $*" >&2 - print_usage >&2 - exit 2 -} - -print_usage () -{ - cat <$log_file 2>&1 -estatus=$? 
- -if test $enable_hard_errors = no && test $estatus -eq 99; then - tweaked_estatus=1 -else - tweaked_estatus=$estatus -fi - -case $tweaked_estatus:$expect_failure in - 0:yes) col=$red res=XPASS recheck=yes gcopy=yes;; - 0:*) col=$grn res=PASS recheck=no gcopy=no;; - 77:*) col=$blu res=SKIP recheck=no gcopy=yes;; - 99:*) col=$mgn res=ERROR recheck=yes gcopy=yes;; - *:yes) col=$lgn res=XFAIL recheck=no gcopy=yes;; - *:*) col=$red res=FAIL recheck=yes gcopy=yes;; -esac - -# Report the test outcome and exit status in the logs, so that one can -# know whether the test passed or failed simply by looking at the '.log' -# file, without the need of also peaking into the corresponding '.trs' -# file (automake bug#11814). -echo "$res $test_name (exit status: $estatus)" >>$log_file - -# Report outcome to console. -echo "${col}${res}${std}: $test_name" - -# Register the test result, and other relevant metadata. -echo ":test-result: $res" > $trs_file -echo ":global-test-result: $res" >> $trs_file -echo ":recheck: $recheck" >> $trs_file -echo ":copy-in-global-log: $gcopy" >> $trs_file - -# Local Variables: -# mode: shell-script -# sh-indentation: 2 -# eval: (add-hook 'before-save-hook 'time-stamp) -# time-stamp-start: "scriptversion=" -# time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-time-zone: "UTC0" -# time-stamp-end: "; # UTC" -# End: