Basic "script run" implementation. Not yet complete, and not yet documented.
This commit is contained in:
parent
f26b0b0bae
commit
866750fd53
|
@ -30,6 +30,10 @@ new "is lower case letter" bit. At the same time, the now unused "is
|
|||
hexadecimal digit" bit was removed. The default tables in
|
||||
src/pcre2_chartables.c.dist are updated.
|
||||
|
||||
8. Implement the new Perl "script run" features (*script_run:...) and
|
||||
(*atomic_script_run:...) aka (*sr:...) and (*asr:...). At present, this is
|
||||
incomplete and not yet documented.
|
||||
|
||||
|
||||
Version 10.32 10-September-2018
|
||||
-------------------------------
|
||||
|
|
|
@ -364,6 +364,7 @@ COMMON_SOURCES = \
|
|||
src/pcre2_newline.c \
|
||||
src/pcre2_ord2utf.c \
|
||||
src/pcre2_pattern_info.c \
|
||||
src/pcre2_script_run.c \
|
||||
src/pcre2_serialize.c \
|
||||
src/pcre2_string_utils.c \
|
||||
src/pcre2_study.c \
|
||||
|
|
|
@ -104,6 +104,7 @@ can skip ahead to the CMake section.
|
|||
pcre2_newline.c
|
||||
pcre2_ord2utf.c
|
||||
pcre2_pattern_info.c
|
||||
pcre2_script_run.c
|
||||
pcre2_serialize.c
|
||||
pcre2_string_utils.c
|
||||
pcre2_study.c
|
||||
|
|
1
README
1
README
|
@ -788,6 +788,7 @@ The distribution should contain the files listed below.
|
|||
src/pcre2_newline.c )
|
||||
src/pcre2_ord2utf.c )
|
||||
src/pcre2_pattern_info.c )
|
||||
src/pcre2_script_run.c )
|
||||
src/pcre2_serialize.c )
|
||||
src/pcre2_string_utils.c )
|
||||
src/pcre2_study.c )
|
||||
|
|
|
@ -104,6 +104,7 @@ can skip ahead to the CMake section.
|
|||
pcre2_newline.c
|
||||
pcre2_ord2utf.c
|
||||
pcre2_pattern_info.c
|
||||
pcre2_script_run.c
|
||||
pcre2_serialize.c
|
||||
pcre2_string_utils.c
|
||||
pcre2_study.c
|
||||
|
|
|
@ -788,6 +788,7 @@ The distribution should contain the files listed below.
|
|||
src/pcre2_newline.c )
|
||||
src/pcre2_ord2utf.c )
|
||||
src/pcre2_pattern_info.c )
|
||||
src/pcre2_script_run.c )
|
||||
src/pcre2_serialize.c )
|
||||
src/pcre2_string_utils.c )
|
||||
src/pcre2_study.c )
|
||||
|
|
|
@ -25,8 +25,9 @@
|
|||
# Added script names for Unicode 8.0.0, 19-June-2015.
|
||||
# Added script names for Unicode 10.0.0, 02-July-2017.
|
||||
# Added script names for Unicode 11.0.0, 03-July-2018.
|
||||
# Added 'Unknown' script, 01-October-2018.
|
||||
|
||||
script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
|
||||
script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
|
||||
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
|
||||
'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
|
||||
'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
|
||||
|
|
|
@ -143,6 +143,7 @@
|
|||
# 03-July-2018: Updated for Unicode 11.0.0
|
||||
# 07-July-2018: Added code to scan emoji-data.txt for the Extended
|
||||
# Pictographic property.
|
||||
# 01-October-2018: Added the 'Unknown' script name
|
||||
##############################################################################
|
||||
|
||||
|
||||
|
@ -300,7 +301,7 @@ def get_record_size_struct(records):
|
|||
slice_type, slice_size = get_type_size(record_slice)
|
||||
size = (size + slice_size - 1) & -slice_size
|
||||
|
||||
structure += '} ucd_record;\n*/\n\n'
|
||||
structure += '} ucd_record;\n*/\n'
|
||||
return size, structure
|
||||
|
||||
def test_record_size():
|
||||
|
@ -329,7 +330,7 @@ def print_records(records, record_size):
|
|||
print((' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,)))
|
||||
print('};\n')
|
||||
|
||||
script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
|
||||
script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
|
||||
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
|
||||
'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
|
||||
'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
|
||||
|
@ -380,7 +381,7 @@ break_property_names = ['CR', 'LF', 'Control', 'Extend', 'Prepend',
|
|||
test_record_size()
|
||||
unicode_version = ""
|
||||
|
||||
script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Common'))
|
||||
script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Unknown'))
|
||||
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
|
||||
break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_property_names), break_property_names.index('Other'))
|
||||
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
|
||||
|
@ -553,11 +554,11 @@ print("special record. */")
|
|||
print()
|
||||
print("#if PCRE2_CODE_UNIT_WIDTH == 32")
|
||||
print("const ucd_record PRIV(dummy_ucd_record)[] = {{")
|
||||
print(" ucp_Common, /* script */")
|
||||
print(" ucp_Cn, /* type unassigned */")
|
||||
print(" ucp_gbOther, /* grapheme break property */")
|
||||
print(" 0, /* case set */")
|
||||
print(" 0, /* other case */")
|
||||
print(" ucp_Unknown, /* script */")
|
||||
print(" ucp_Cn, /* type unassigned */")
|
||||
print(" ucp_gbOther, /* grapheme break property */")
|
||||
print(" 0, /* case set */")
|
||||
print(" 0, /* other case */")
|
||||
print(" }};")
|
||||
print("#endif")
|
||||
print()
|
||||
|
@ -565,6 +566,9 @@ print(record_struct)
|
|||
|
||||
# --- Added by PH: output the table of caseless character sets ---
|
||||
|
||||
print("/* This table contains lists of characters that are caseless sets of")
|
||||
print("more than one character. Each list is terminated by NOTACHAR. */\n")
|
||||
|
||||
print("const uint32_t PRIV(ucd_caseless_sets)[] = {")
|
||||
print(" NOTACHAR,")
|
||||
for s in sets:
|
||||
|
@ -577,10 +581,53 @@ print()
|
|||
|
||||
# ------
|
||||
|
||||
print("/* When #included in pcre2test, we don't need this large table. */")
|
||||
print("/* When #included in pcre2test, we don't need the table of digit")
|
||||
print("sets, nor the the large main UCD tables. */")
|
||||
print()
|
||||
print("#ifndef PCRE2_PCRE2TEST")
|
||||
print()
|
||||
|
||||
# --- Added by PH: read Scripts.txt again for the sets of 10 digits. ---
|
||||
|
||||
digitsets = []
|
||||
file = open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8')
|
||||
|
||||
for line in file:
|
||||
m = re.match(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+', line)
|
||||
if m is None:
|
||||
continue
|
||||
first = int(m.group(1),16)
|
||||
last = int(m.group(2),16)
|
||||
if ((last - first + 1) % 10) != 0:
|
||||
print("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last),
|
||||
file=sys.stderr)
|
||||
while first < last:
|
||||
digitsets.append(first + 9)
|
||||
first += 10
|
||||
file.close()
|
||||
digitsets.sort()
|
||||
|
||||
print("/* This table lists the code points for the '9' characters in each")
|
||||
print("set of decimal digits. It is used to ensure that all the digits in")
|
||||
print("a script run come from the same set. */")
|
||||
print()
|
||||
print("const uint32_t PRIV(ucd_digit_sets)[] = {")
|
||||
|
||||
print(" %d, /* Number of subsequent values */" % len(digitsets), end='')
|
||||
count = 8
|
||||
for d in digitsets:
|
||||
if count == 8:
|
||||
print("\n ", end='')
|
||||
count = 0
|
||||
print(" 0x%05x," % d, end='')
|
||||
count += 1
|
||||
print("\n};")
|
||||
print()
|
||||
|
||||
# Output the main UCD tables.
|
||||
|
||||
print("/* These are the main two-stage UCD tables. */\n")
|
||||
|
||||
print_records(records, record_size)
|
||||
print_table(min_stage1, 'PRIV(ucd_stage1)')
|
||||
print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
|
||||
|
@ -591,6 +638,10 @@ print("#endif /* SUPPORT_UNICODE */")
|
|||
print()
|
||||
print("#endif /* PCRE2_PCRE2TEST */")
|
||||
|
||||
|
||||
# This code was part of the original contribution, but is commented out as it
|
||||
# was never used. A two-stage table has sufficed.
|
||||
|
||||
"""
|
||||
|
||||
# Three-stage tables:
|
||||
|
|
|
@ -134,6 +134,7 @@ switch(gbprop)
|
|||
|
||||
switch(script)
|
||||
{
|
||||
case ucp_Unknown: scriptname = US"Unknown"; break;
|
||||
case ucp_Arabic: scriptname = US"Arabic"; break;
|
||||
case ucp_Armenian: scriptname = US"Armenian"; break;
|
||||
case ucp_Balinese: scriptname = US"Balinese"; break;
|
||||
|
|
17
perltest.sh
17
perltest.sh
|
@ -1,8 +1,10 @@
|
|||
#! /bin/sh
|
||||
|
||||
# Script for testing regular expressions with perl to check that PCRE2 handles
|
||||
# them the same. If the first argument to this script is "-w", Perl is also
|
||||
# called with "-w", which turns on its warning mode.
|
||||
# them the same. For testing with different versions of Perl, if the first
|
||||
# argument is -perl then the second is taken as the Perl command to use, and
|
||||
# both are then removed. If the next argument is "-w", Perl is called with
|
||||
# "-w", which turns on its warning mode.
|
||||
#
|
||||
# The Perl code has to have "use utf8" and "require Encode" at the start when
|
||||
# running UTF-8 tests, but *not* for non-utf8 tests. (The "require" would
|
||||
|
@ -10,8 +12,8 @@
|
|||
# the script will always run for these tests.)
|
||||
#
|
||||
# The desired effect is achieved by making this a shell script that passes the
|
||||
# Perl script to Perl through a pipe. If the first argument (possibly after
|
||||
# removing "-w") is "-utf8", a suitable prefix is set up.
|
||||
# Perl script to Perl through a pipe. If the next argument is "-utf8", a
|
||||
# suitable prefix is set up.
|
||||
#
|
||||
# The remaining arguments, if any, are passed to Perl. They are an input file
|
||||
# and an output file. If there is one argument, the output is written to
|
||||
|
@ -23,6 +25,12 @@ perl=perl
|
|||
perlarg=''
|
||||
prefix=''
|
||||
|
||||
if [ $# -gt 1 -a "$1" = "-perl" ] ; then
|
||||
shift
|
||||
perl=$1
|
||||
shift
|
||||
fi
|
||||
|
||||
if [ $# -gt 0 -a "$1" = "-w" ] ; then
|
||||
perlarg="-w"
|
||||
shift
|
||||
|
@ -78,6 +86,7 @@ fi
|
|||
# The alpha assertions currently give warnings even when -w is not specified.
|
||||
|
||||
no warnings "experimental::alpha_assertions";
|
||||
no warnings "experimental::script_run";
|
||||
|
||||
# Function for turning a string into a string of printing chars.
|
||||
|
||||
|
|
|
@ -321,6 +321,7 @@ pcre2_pattern_convert(). */
|
|||
#define PCRE2_ERROR_SUPPORTED_ONLY_IN_UNICODE 193
|
||||
#define PCRE2_ERROR_INVALID_HYPHEN_IN_OPTIONS 194
|
||||
#define PCRE2_ERROR_ALPHA_ASSERTION_UNKNOWN 195
|
||||
#define PCRE2_ERROR_SCRIPT_RUN_NOT_AVAILABLE 196
|
||||
|
||||
|
||||
/* "Expected" matching error codes: no match and partial match. */
|
||||
|
|
|
@ -604,6 +604,15 @@ for(;;)
|
|||
case OP_SCBRAPOS:
|
||||
if (cb->had_recurse) return FALSE;
|
||||
break;
|
||||
|
||||
/* A script run might have to backtrack if the iterated item can match
|
||||
characters from more than one script. So give up unless repeating an
|
||||
explicit character. */
|
||||
|
||||
case OP_SCRIPT_RUN:
|
||||
if (base_list[0] != OP_CHAR && base_list[0] != OP_CHARI)
|
||||
return FALSE;
|
||||
break;
|
||||
|
||||
/* Atomic sub-patterns and assertions can always auto-possessify their
|
||||
last iterator. However, if the group was entered as a result of checking
|
||||
|
@ -614,7 +623,6 @@ for(;;)
|
|||
case OP_ASSERTBACK:
|
||||
case OP_ASSERTBACK_NOT:
|
||||
case OP_ONCE:
|
||||
|
||||
return !entered_a_group;
|
||||
}
|
||||
|
||||
|
|
|
@ -240,49 +240,57 @@ code (meta_extra_lengths, just below) must be updated to remain in step. */
|
|||
#define META_RANGE_LITERAL 0x801f0000u /* range defined literally */
|
||||
#define META_RECURSE 0x80200000u /* Recursion */
|
||||
#define META_RECURSE_BYNAME 0x80210000u /* (?&name) */
|
||||
#define META_SCRIPT_RUN 0x80220000u /* (*script_run:...) */
|
||||
|
||||
/* These must be kept together to make it easy to check that an assertion
|
||||
is present where expected in a conditional group. */
|
||||
|
||||
#define META_LOOKAHEAD 0x80220000u /* (?= */
|
||||
#define META_LOOKAHEADNOT 0x80230000u /* (?! */
|
||||
#define META_LOOKBEHIND 0x80240000u /* (?<= */
|
||||
#define META_LOOKBEHINDNOT 0x80250000u /* (?<! */
|
||||
#define META_LOOKAHEAD 0x80230000u /* (?= */
|
||||
#define META_LOOKAHEADNOT 0x80240000u /* (?! */
|
||||
#define META_LOOKBEHIND 0x80250000u /* (?<= */
|
||||
#define META_LOOKBEHINDNOT 0x80260000u /* (?<! */
|
||||
|
||||
/* These must be kept in this order, with consecutive values, and the _ARG
|
||||
versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
|
||||
versions. */
|
||||
|
||||
#define META_MARK 0x80260000u /* (*MARK) */
|
||||
#define META_ACCEPT 0x80270000u /* (*ACCEPT) */
|
||||
#define META_FAIL 0x80280000u /* (*FAIL) */
|
||||
#define META_COMMIT 0x80290000u /* These */
|
||||
#define META_COMMIT_ARG 0x802a0000u /* pairs */
|
||||
#define META_PRUNE 0x802b0000u /* must */
|
||||
#define META_PRUNE_ARG 0x802c0000u /* be */
|
||||
#define META_SKIP 0x802d0000u /* kept */
|
||||
#define META_SKIP_ARG 0x802e0000u /* in */
|
||||
#define META_THEN 0x802f0000u /* this */
|
||||
#define META_THEN_ARG 0x80300000u /* order */
|
||||
#define META_MARK 0x80270000u /* (*MARK) */
|
||||
#define META_ACCEPT 0x80280000u /* (*ACCEPT) */
|
||||
#define META_FAIL 0x80290000u /* (*FAIL) */
|
||||
#define META_COMMIT 0x802a0000u /* These */
|
||||
#define META_COMMIT_ARG 0x802b0000u /* pairs */
|
||||
#define META_PRUNE 0x802c0000u /* must */
|
||||
#define META_PRUNE_ARG 0x802d0000u /* be */
|
||||
#define META_SKIP 0x802e0000u /* kept */
|
||||
#define META_SKIP_ARG 0x802f0000u /* in */
|
||||
#define META_THEN 0x80300000u /* this */
|
||||
#define META_THEN_ARG 0x80310000u /* order */
|
||||
|
||||
/* These must be kept in groups of adjacent 3 values, and all together. */
|
||||
|
||||
#define META_ASTERISK 0x80310000u /* * */
|
||||
#define META_ASTERISK_PLUS 0x80320000u /* *+ */
|
||||
#define META_ASTERISK_QUERY 0x80330000u /* *? */
|
||||
#define META_PLUS 0x80340000u /* + */
|
||||
#define META_PLUS_PLUS 0x80350000u /* ++ */
|
||||
#define META_PLUS_QUERY 0x80360000u /* +? */
|
||||
#define META_QUERY 0x80370000u /* ? */
|
||||
#define META_QUERY_PLUS 0x80380000u /* ?+ */
|
||||
#define META_QUERY_QUERY 0x80390000u /* ?? */
|
||||
#define META_MINMAX 0x803a0000u /* {n,m} repeat */
|
||||
#define META_MINMAX_PLUS 0x803b0000u /* {n,m}+ repeat */
|
||||
#define META_MINMAX_QUERY 0x803c0000u /* {n,m}? repeat */
|
||||
#define META_ASTERISK 0x80320000u /* * */
|
||||
#define META_ASTERISK_PLUS 0x80330000u /* *+ */
|
||||
#define META_ASTERISK_QUERY 0x80340000u /* *? */
|
||||
#define META_PLUS 0x80350000u /* + */
|
||||
#define META_PLUS_PLUS 0x80360000u /* ++ */
|
||||
#define META_PLUS_QUERY 0x80370000u /* +? */
|
||||
#define META_QUERY 0x80380000u /* ? */
|
||||
#define META_QUERY_PLUS 0x80390000u /* ?+ */
|
||||
#define META_QUERY_QUERY 0x803a0000u /* ?? */
|
||||
#define META_MINMAX 0x803b0000u /* {n,m} repeat */
|
||||
#define META_MINMAX_PLUS 0x803c0000u /* {n,m}+ repeat */
|
||||
#define META_MINMAX_QUERY 0x803d0000u /* {n,m}? repeat */
|
||||
|
||||
#define META_FIRST_QUANTIFIER META_ASTERISK
|
||||
#define META_LAST_QUANTIFIER META_MINMAX_QUERY
|
||||
|
||||
/* This is a special "meta code" that is used only to distinguish (*asr: from
|
||||
(*sr: in the table of alphabetic assertions. It is never stored in the parsed
|
||||
pattern because (*asr: is turned into (*sr:(*atomic: at that stage. There is
|
||||
therefore no need for it to have a length entry, so use a high value. */
|
||||
|
||||
#define META_ATOMIC_SCRIPT_RUN 0x8fff0000u
|
||||
|
||||
/* Table of extra lengths for each of the meta codes. Must be kept in step with
|
||||
the definitions above. For some items these values are a basic length to which
|
||||
a variable amount has to be added. */
|
||||
|
@ -322,6 +330,7 @@ static unsigned char meta_extra_lengths[] = {
|
|||
0, /* META_RANGE_LITERAL */
|
||||
SIZEOFFSET, /* META_RECURSE */
|
||||
1+SIZEOFFSET, /* META_RECURSE_BYNAME */
|
||||
0, /* META_SCRIPT_RUN */
|
||||
0, /* META_LOOKAHEAD */
|
||||
0, /* META_LOOKAHEADNOT */
|
||||
SIZEOFFSET, /* META_LOOKBEHIND */
|
||||
|
@ -638,19 +647,19 @@ static const char alasnames[] =
|
|||
STRING_atomic_script_run;
|
||||
|
||||
static const alasitem alasmeta[] = {
|
||||
{ 3, META_LOOKAHEAD },
|
||||
{ 3, META_LOOKBEHIND },
|
||||
{ 3, META_LOOKAHEADNOT },
|
||||
{ 3, META_LOOKBEHINDNOT },
|
||||
{ 18, META_LOOKAHEAD },
|
||||
{ 19, META_LOOKBEHIND },
|
||||
{ 18, META_LOOKAHEADNOT },
|
||||
{ 19, META_LOOKBEHINDNOT },
|
||||
{ 6, META_ATOMIC },
|
||||
{ 2, 0 }, /* sr = script run */
|
||||
{ 3, 0 }, /* asr = atomic script run */
|
||||
{ 10, 0 }, /* script run */
|
||||
{ 17, 0 } /* atomic script run */
|
||||
{ 3, META_LOOKAHEAD },
|
||||
{ 3, META_LOOKBEHIND },
|
||||
{ 3, META_LOOKAHEADNOT },
|
||||
{ 3, META_LOOKBEHINDNOT },
|
||||
{ 18, META_LOOKAHEAD },
|
||||
{ 19, META_LOOKBEHIND },
|
||||
{ 18, META_LOOKAHEADNOT },
|
||||
{ 19, META_LOOKBEHINDNOT },
|
||||
{ 6, META_ATOMIC },
|
||||
{ 2, META_SCRIPT_RUN }, /* sr = script run */
|
||||
{ 3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
|
||||
{ 10, META_SCRIPT_RUN }, /* script run */
|
||||
{ 17, META_ATOMIC_SCRIPT_RUN } /* atomic script run */
|
||||
};
|
||||
|
||||
static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
|
||||
|
@ -772,7 +781,7 @@ enum { ERR0 = COMPILE_ERROR_BASE,
|
|||
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
|
||||
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
|
||||
ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
|
||||
ERR91, ERR92, ERR93, ERR94, ERR95 };
|
||||
ERR91, ERR92, ERR93, ERR94, ERR95, ERR96 };
|
||||
|
||||
/* This is a table of start-of-pattern options such as (*UTF) and settings such
|
||||
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
|
||||
|
@ -1003,6 +1012,7 @@ for (;;)
|
|||
case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
|
||||
case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
|
||||
case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
|
||||
case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
|
||||
case META_KET: fprintf(stderr, "META )"); break;
|
||||
case META_ALT: fprintf(stderr, "META | %d", meta_arg); break;
|
||||
|
||||
|
@ -2210,15 +2220,15 @@ if (++ptr >= ptrend) /* No characters in name */
|
|||
ERR60; /* Verb not recognized or malformed */
|
||||
goto FAILED;
|
||||
}
|
||||
|
||||
/* A group name must not start with a digit. If either of the others start with
|
||||
a digit it just won't be recognized. */
|
||||
|
||||
|
||||
/* A group name must not start with a digit. If either of the others start with
|
||||
a digit it just won't be recognized. */
|
||||
|
||||
if (is_group && IS_DIGIT(*ptr))
|
||||
{
|
||||
*errorcodeptr = ERR44;
|
||||
goto FAILED;
|
||||
}
|
||||
}
|
||||
|
||||
*nameptr = ptr;
|
||||
*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
|
||||
|
@ -2345,6 +2355,7 @@ typedef struct nest_save {
|
|||
|
||||
#define NSF_RESET 0x0001u
|
||||
#define NSF_CONDASSERT 0x0002u
|
||||
#define NSF_ATOMICSR 0x0004u
|
||||
|
||||
/* Options that are changeable within the pattern must be tracked during
|
||||
parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
|
||||
|
@ -2707,19 +2718,19 @@ while (ptr < ptrend)
|
|||
case CHAR_C:
|
||||
ok = expect_cond_assert == 2;
|
||||
break;
|
||||
|
||||
|
||||
case CHAR_EQUALS_SIGN:
|
||||
case CHAR_EXCLAMATION_MARK:
|
||||
break;
|
||||
|
||||
|
||||
case CHAR_LESS_THAN_SIGN:
|
||||
ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
|
||||
break;
|
||||
|
||||
|
||||
default:
|
||||
ok = FALSE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!ok)
|
||||
{
|
||||
|
@ -3533,13 +3544,13 @@ while (ptr < ptrend)
|
|||
/* Handle "alpha assertions" such as (*pla:...). Most of these are
|
||||
synonyms for the historical symbolic assertions, but the script run ones
|
||||
are new. They are distinguished by starting with a lower case letter.
|
||||
Checking both ends of the alphabet makes this work in all character
|
||||
Checking both ends of the alphabet makes this work in all character
|
||||
codes. */
|
||||
|
||||
else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
|
||||
{
|
||||
uint32_t meta;
|
||||
|
||||
|
||||
vn = alasnames;
|
||||
if (!read_name(&ptr, ptrend, 0, &offset, &name, &namelen, &errorcode,
|
||||
cb)) goto FAILED;
|
||||
|
@ -3550,7 +3561,7 @@ while (ptr < ptrend)
|
|||
}
|
||||
|
||||
/* Scan the table of alpha assertion names */
|
||||
|
||||
|
||||
for (i = 0; i < alascount; i++)
|
||||
{
|
||||
if (namelen == alasmeta[i].len &&
|
||||
|
@ -3564,42 +3575,72 @@ while (ptr < ptrend)
|
|||
errorcode = ERR95; /* Alpha assertion not recognized */
|
||||
goto FAILED;
|
||||
}
|
||||
|
||||
/* Check for expecting an assertion condition. If so, only lookaround
|
||||
|
||||
/* Check for expecting an assertion condition. If so, only lookaround
|
||||
assertions are valid. */
|
||||
|
||||
|
||||
meta = alasmeta[i].meta;
|
||||
if (prev_expect_cond_assert > 0 &&
|
||||
if (prev_expect_cond_assert > 0 &&
|
||||
(meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
|
||||
{
|
||||
errorcode = ERR28; /* Assertion expected */
|
||||
goto FAILED;
|
||||
}
|
||||
goto FAILED;
|
||||
}
|
||||
|
||||
/* The lookaround alphabetic synonyms can be almost entirely handled by
|
||||
jumping to the code that handles the traditional symbolic forms. */
|
||||
|
||||
switch(meta)
|
||||
{
|
||||
default:
|
||||
errorcode = ERR89; /* Unknown code; should never occur because */
|
||||
goto FAILED; /* the meta values come from a table above. */
|
||||
|
||||
case META_ATOMIC:
|
||||
goto ATOMIC_GROUP;
|
||||
goto ATOMIC_GROUP;
|
||||
|
||||
case META_LOOKAHEAD:
|
||||
goto POSITIVE_LOOK_AHEAD;
|
||||
|
||||
|
||||
case META_LOOKAHEADNOT:
|
||||
goto NEGATIVE_LOOK_AHEAD;
|
||||
|
||||
case META_LOOKBEHIND:
|
||||
case META_LOOKBEHINDNOT:
|
||||
*parsed_pattern++ = meta;
|
||||
ptr--;
|
||||
goto LOOKBEHIND;
|
||||
|
||||
/* FIXME: Script Run stuff ... */
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
case META_LOOKBEHIND:
|
||||
case META_LOOKBEHINDNOT:
|
||||
*parsed_pattern++ = meta;
|
||||
ptr--;
|
||||
goto POST_LOOKBEHIND;
|
||||
|
||||
/* The script run facilities are handled here. Unicode support is
|
||||
required (give an error if not, as this is a security issue). Always
|
||||
record a META_SCRIPT_RUN item. Then, for the atomic version, insert
|
||||
META_ATOMIC and remember that we need two META_KETs at the end. */
|
||||
|
||||
case META_SCRIPT_RUN:
|
||||
case META_ATOMIC_SCRIPT_RUN:
|
||||
#ifdef SUPPORT_UNICODE
|
||||
*parsed_pattern++ = META_SCRIPT_RUN;
|
||||
nest_depth++;
|
||||
ptr++;
|
||||
if (meta == META_ATOMIC_SCRIPT_RUN)
|
||||
{
|
||||
*parsed_pattern++ = META_ATOMIC;
|
||||
if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
|
||||
else if (++top_nest >= end_nests)
|
||||
{
|
||||
errorcode = ERR84;
|
||||
goto FAILED;
|
||||
}
|
||||
top_nest->nest_depth = nest_depth;
|
||||
top_nest->flags = NSF_ATOMICSR;
|
||||
top_nest->options = options & PARSE_TRACKED_OPTIONS;
|
||||
}
|
||||
break;
|
||||
#else /* SUPPORT_UNICODE */
|
||||
errorcode = ERR96;
|
||||
goto FAILED;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -4262,8 +4303,8 @@ while (ptr < ptrend)
|
|||
}
|
||||
*parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
|
||||
META_LOOKBEHIND : META_LOOKBEHINDNOT;
|
||||
|
||||
LOOKBEHIND: /* Come from (*plb: and (*nlb: */
|
||||
|
||||
POST_LOOKBEHIND: /* Come from (*plb: and (*nlb: */
|
||||
*has_lookbehind = TRUE;
|
||||
offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
|
||||
PUTOFFSET(offset, parsed_pattern);
|
||||
|
@ -4425,6 +4466,14 @@ while (ptr < ptrend)
|
|||
cb->bracount = top_nest->max_group;
|
||||
if ((top_nest->flags & NSF_CONDASSERT) != 0)
|
||||
okquantifier = FALSE;
|
||||
|
||||
if ((top_nest->flags & NSF_ATOMICSR) != 0)
|
||||
{
|
||||
*parsed_pattern++ = META_KET;
|
||||
}
|
||||
|
||||
|
||||
|
||||
if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
|
||||
else top_nest--;
|
||||
}
|
||||
|
@ -6142,6 +6191,10 @@ for (;; pptr++)
|
|||
bravalue = OP_ONCE;
|
||||
goto GROUP_PROCESS_NOTE_EMPTY;
|
||||
|
||||
case META_SCRIPT_RUN:
|
||||
bravalue = OP_SCRIPT_RUN;
|
||||
goto GROUP_PROCESS_NOTE_EMPTY;
|
||||
|
||||
case META_NOCAPTURE:
|
||||
bravalue = OP_BRA;
|
||||
/* Fall through */
|
||||
|
@ -6777,6 +6830,7 @@ for (;; pptr++)
|
|||
case OP_ASSERTBACK:
|
||||
case OP_ASSERTBACK_NOT:
|
||||
case OP_ONCE:
|
||||
case OP_SCRIPT_RUN:
|
||||
case OP_BRA:
|
||||
case OP_CBRA:
|
||||
case OP_COND:
|
||||
|
@ -6989,16 +7043,16 @@ for (;; pptr++)
|
|||
}
|
||||
|
||||
/* If the maximum is unlimited, set a repeater in the final copy. For
|
||||
ONCE brackets, that's all we need to do. However, possessively repeated
|
||||
ONCE brackets can be converted into non-capturing brackets, as the
|
||||
behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
|
||||
deal with possessive ONCEs specially.
|
||||
SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
|
||||
possessively repeated ONCE brackets can be converted into non-capturing
|
||||
brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
|
||||
saves having to deal with possessive ONCEs specially.
|
||||
|
||||
Otherwise, when we are doing the actual compile phase, check to see
|
||||
whether this group is one that could match an empty string. If so,
|
||||
convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
|
||||
that runtime checking can be done. [This check is also applied to ONCE
|
||||
groups at runtime, but in a different way.]
|
||||
and SCRIPT_RUN groups at runtime, but in a different way.]
|
||||
|
||||
Then, if the quantifier was possessive and the bracket is not a
|
||||
conditional, we convert the BRA code to the POS form, and the KET code to
|
||||
|
@ -7022,13 +7076,14 @@ for (;; pptr++)
|
|||
|
||||
if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
|
||||
|
||||
/* For non-possessive ONCE brackets, all we need to do is to
|
||||
set the KET. */
|
||||
/* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
|
||||
to do is to set the KET. */
|
||||
|
||||
if (*bracode == OP_ONCE) *ketcode = OP_KETRMAX + repeat_type;
|
||||
if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
|
||||
*ketcode = OP_KETRMAX + repeat_type;
|
||||
|
||||
/* Handle non-ONCE brackets and possessive ONCEs (which have been
|
||||
converted to non-capturing above). */
|
||||
/* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
|
||||
(which have been converted to non-capturing above). */
|
||||
|
||||
else
|
||||
{
|
||||
|
@ -8385,6 +8440,7 @@ do {
|
|||
case OP_SCBRAPOS:
|
||||
case OP_ASSERT:
|
||||
case OP_ONCE:
|
||||
case OP_SCRIPT_RUN:
|
||||
d = find_firstassertedcu(scode, &dflags, inassert + ((op==OP_ASSERT)?1:0));
|
||||
if (dflags < 0)
|
||||
return 0;
|
||||
|
|
|
@ -173,6 +173,7 @@ static const uint8_t coptable[] = {
|
|||
0, /* Assert behind */
|
||||
0, /* Assert behind not */
|
||||
0, /* ONCE */
|
||||
0, /* SCRIPT_RUN */
|
||||
0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
|
||||
0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
|
||||
0, 0, /* CREF, DNCREF */
|
||||
|
@ -247,6 +248,7 @@ static const uint8_t poptable[] = {
|
|||
0, /* Assert behind */
|
||||
0, /* Assert behind not */
|
||||
0, /* ONCE */
|
||||
0, /* SCRIPT_RUN */
|
||||
0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
|
||||
0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
|
||||
0, 0, /* CREF, DNCREF */
|
||||
|
|
|
@ -183,6 +183,7 @@ static const unsigned char compile_error_texts[] =
|
|||
"invalid hyphen in option setting\0"
|
||||
/* 95 */
|
||||
"(*alpha_assertion) not recognized\0"
|
||||
"script runs require Unicode support, which this version of PCRE2 does not have\0"
|
||||
;
|
||||
|
||||
/* Match-time and UTF error texts are in the same format. */
|
||||
|
|
|
@ -1513,70 +1513,71 @@ enum {
|
|||
OP_ASSERTBACK, /* 128 Positive lookbehind */
|
||||
OP_ASSERTBACK_NOT, /* 129 Negative lookbehind */
|
||||
|
||||
/* ONCE, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come immediately after the
|
||||
assertions, with ONCE first, as there's a test for >= ONCE for a subpattern
|
||||
that isn't an assertion. The POS versions must immediately follow the non-POS
|
||||
versions in each case. */
|
||||
/* ONCE, SCRIPT_RUN, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come
|
||||
immediately after the assertions, with ONCE first, as there's a test for >=
|
||||
ONCE for a subpattern that isn't an assertion. The POS versions must
|
||||
immediately follow the non-POS versions in each case. */
|
||||
|
||||
OP_ONCE, /* 130 Atomic group, contains captures */
|
||||
OP_BRA, /* 131 Start of non-capturing bracket */
|
||||
OP_BRAPOS, /* 132 Ditto, with unlimited, possessive repeat */
|
||||
OP_CBRA, /* 133 Start of capturing bracket */
|
||||
OP_CBRAPOS, /* 134 Ditto, with unlimited, possessive repeat */
|
||||
OP_COND, /* 135 Conditional group */
|
||||
OP_SCRIPT_RUN, /* 131 Non-capture, but check characters' scripts */
|
||||
OP_BRA, /* 132 Start of non-capturing bracket */
|
||||
OP_BRAPOS, /* 133 Ditto, with unlimited, possessive repeat */
|
||||
OP_CBRA, /* 134 Start of capturing bracket */
|
||||
OP_CBRAPOS, /* 135 Ditto, with unlimited, possessive repeat */
|
||||
OP_COND, /* 136 Conditional group */
|
||||
|
||||
/* These five must follow the previous five, in the same order. There's a
|
||||
check for >= SBRA to distinguish the two sets. */
|
||||
|
||||
OP_SBRA, /* 136 Start of non-capturing bracket, check empty */
|
||||
OP_SBRAPOS, /* 137 Ditto, with unlimited, possessive repeat */
|
||||
OP_SCBRA, /* 138 Start of capturing bracket, check empty */
|
||||
OP_SCBRAPOS, /* 139 Ditto, with unlimited, possessive repeat */
|
||||
OP_SCOND, /* 140 Conditional group, check empty */
|
||||
OP_SBRA, /* 137 Start of non-capturing bracket, check empty */
|
||||
OP_SBRAPOS, /* 138 Ditto, with unlimited, possessive repeat */
|
||||
OP_SCBRA, /* 139 Start of capturing bracket, check empty */
|
||||
OP_SCBRAPOS, /* 140 Ditto, with unlimited, possessive repeat */
|
||||
OP_SCOND, /* 141 Conditional group, check empty */
|
||||
|
||||
/* The next two pairs must (respectively) be kept together. */
|
||||
|
||||
OP_CREF, /* 141 Used to hold a capture number as condition */
|
||||
OP_DNCREF, /* 142 Used to point to duplicate names as a condition */
|
||||
OP_RREF, /* 143 Used to hold a recursion number as condition */
|
||||
OP_DNRREF, /* 144 Used to point to duplicate names as a condition */
|
||||
OP_FALSE, /* 145 Always false (used by DEFINE and VERSION) */
|
||||
OP_TRUE, /* 146 Always true (used by VERSION) */
|
||||
OP_CREF, /* 142 Used to hold a capture number as condition */
|
||||
OP_DNCREF, /* 143 Used to point to duplicate names as a condition */
|
||||
OP_RREF, /* 144 Used to hold a recursion number as condition */
|
||||
OP_DNRREF, /* 145 Used to point to duplicate names as a condition */
|
||||
OP_FALSE, /* 146 Always false (used by DEFINE and VERSION) */
|
||||
OP_TRUE, /* 147 Always true (used by VERSION) */
|
||||
|
||||
OP_BRAZERO, /* 147 These two must remain together and in this */
|
||||
OP_BRAMINZERO, /* 148 order. */
|
||||
OP_BRAPOSZERO, /* 149 */
|
||||
OP_BRAZERO, /* 148 These two must remain together and in this */
|
||||
OP_BRAMINZERO, /* 149 order. */
|
||||
OP_BRAPOSZERO, /* 150 */
|
||||
|
||||
/* These are backtracking control verbs */
|
||||
|
||||
OP_MARK, /* 150 always has an argument */
|
||||
OP_PRUNE, /* 151 */
|
||||
OP_PRUNE_ARG, /* 152 same, but with argument */
|
||||
OP_SKIP, /* 153 */
|
||||
OP_SKIP_ARG, /* 154 same, but with argument */
|
||||
OP_THEN, /* 155 */
|
||||
OP_THEN_ARG, /* 156 same, but with argument */
|
||||
OP_COMMIT, /* 157 */
|
||||
OP_COMMIT_ARG, /* 158 same, but with argument */
|
||||
OP_MARK, /* 151 always has an argument */
|
||||
OP_PRUNE, /* 152 */
|
||||
OP_PRUNE_ARG, /* 153 same, but with argument */
|
||||
OP_SKIP, /* 154 */
|
||||
OP_SKIP_ARG, /* 155 same, but with argument */
|
||||
OP_THEN, /* 156 */
|
||||
OP_THEN_ARG, /* 157 same, but with argument */
|
||||
OP_COMMIT, /* 158 */
|
||||
OP_COMMIT_ARG, /* 159 same, but with argument */
|
||||
|
||||
/* These are forced failure and success verbs. FAIL and ACCEPT do accept an
|
||||
argument, but these cases can be compiled as, for example, (*MARK:X)(*FAIL)
|
||||
without the need for a special opcode. */
|
||||
|
||||
OP_FAIL, /* 159 */
|
||||
OP_ACCEPT, /* 160 */
|
||||
OP_ASSERT_ACCEPT, /* 161 Used inside assertions */
|
||||
OP_CLOSE, /* 162 Used before OP_ACCEPT to close open captures */
|
||||
OP_FAIL, /* 160 */
|
||||
OP_ACCEPT, /* 161 */
|
||||
OP_ASSERT_ACCEPT, /* 162 Used inside assertions */
|
||||
OP_CLOSE, /* 163 Used before OP_ACCEPT to close open captures */
|
||||
|
||||
/* This is used to skip a subpattern with a {0} quantifier */
|
||||
|
||||
OP_SKIPZERO, /* 163 */
|
||||
OP_SKIPZERO, /* 164 */
|
||||
|
||||
/* This is used to identify a DEFINE group during compilation so that it can
|
||||
be checked for having only one branch. It is changed to OP_FALSE before
|
||||
compilation finishes. */
|
||||
|
||||
OP_DEFINE, /* 164 */
|
||||
OP_DEFINE, /* 165 */
|
||||
|
||||
/* This is not an opcode, but is used to check that tables indexed by opcode
|
||||
are the correct length, in order to catch updating errors - there have been
|
||||
|
@ -1624,6 +1625,7 @@ some cases doesn't actually use these names at all). */
|
|||
"Alt", "Ket", "KetRmax", "KetRmin", "KetRpos", \
|
||||
"Reverse", "Assert", "Assert not", "AssertB", "AssertB not", \
|
||||
"Once", \
|
||||
"Script run", \
|
||||
"Bra", "BraPos", "CBra", "CBraPos", \
|
||||
"Cond", \
|
||||
"SBra", "SBraPos", "SCBra", "SCBraPos", \
|
||||
|
@ -1707,6 +1709,7 @@ in UTF-8 mode. The code that uses this table must know about such things. */
|
|||
1+LINK_SIZE, /* Assert behind */ \
|
||||
1+LINK_SIZE, /* Assert behind not */ \
|
||||
1+LINK_SIZE, /* ONCE */ \
|
||||
1+LINK_SIZE, /* SCRIPT_RUN */ \
|
||||
1+LINK_SIZE, /* BRA */ \
|
||||
1+LINK_SIZE, /* BRAPOS */ \
|
||||
1+LINK_SIZE+IMM2_SIZE, /* CBRA */ \
|
||||
|
@ -1854,6 +1857,7 @@ extern const uint8_t PRIV(utf8_table4)[];
|
|||
#define _pcre2_hspace_list PCRE2_SUFFIX(_pcre2_hspace_list_)
|
||||
#define _pcre2_vspace_list PCRE2_SUFFIX(_pcre2_vspace_list_)
|
||||
#define _pcre2_ucd_caseless_sets PCRE2_SUFFIX(_pcre2_ucd_caseless_sets_)
|
||||
#define _pcre2_ucd_digit_sets PCRE2_SUFFIX(_pcre2_ucd_digit_sets_)
|
||||
#define _pcre2_ucd_records PCRE2_SUFFIX(_pcre2_ucd_records_)
|
||||
#define _pcre2_ucd_stage1 PCRE2_SUFFIX(_pcre2_ucd_stage1_)
|
||||
#define _pcre2_ucd_stage2 PCRE2_SUFFIX(_pcre2_ucd_stage2_)
|
||||
|
@ -1875,6 +1879,7 @@ extern const uint8_t PRIV(default_tables)[];
|
|||
extern const uint32_t PRIV(hspace_list)[];
|
||||
extern const uint32_t PRIV(vspace_list)[];
|
||||
extern const uint32_t PRIV(ucd_caseless_sets)[];
|
||||
extern const uint32_t PRIV(ucd_digit_sets)[];
|
||||
extern const ucd_record PRIV(ucd_records)[];
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 32
|
||||
extern const ucd_record PRIV(dummy_ucd_record)[];
|
||||
|
@ -1922,6 +1927,7 @@ is available. */
|
|||
#define _pcre2_jit_get_target PCRE2_SUFFIX(_pcre2_jit_get_target_)
|
||||
#define _pcre2_memctl_malloc PCRE2_SUFFIX(_pcre2_memctl_malloc_)
|
||||
#define _pcre2_ord2utf PCRE2_SUFFIX(_pcre2_ord2utf_)
|
||||
#define _pcre2_script_run PCRE2_SUFFIX(_pcre2_script_run_)
|
||||
#define _pcre2_strcmp PCRE2_SUFFIX(_pcre2_strcmp_)
|
||||
#define _pcre2_strcmp_c8 PCRE2_SUFFIX(_pcre2_strcmp_c8_)
|
||||
#define _pcre2_strcpy_c8 PCRE2_SUFFIX(_pcre2_strcpy_c8_)
|
||||
|
@ -1948,6 +1954,7 @@ extern size_t _pcre2_jit_get_size(void *);
|
|||
const char * _pcre2_jit_get_target(void);
|
||||
extern void * _pcre2_memctl_malloc(size_t, pcre2_memctl *);
|
||||
extern unsigned int _pcre2_ord2utf(uint32_t, PCRE2_UCHAR *);
|
||||
extern BOOL _pcre2_script_run(PCRE2_SPTR, PCRE2_SPTR, BOOL);
|
||||
extern int _pcre2_strcmp(PCRE2_SPTR, PCRE2_SPTR);
|
||||
extern int _pcre2_strcmp_c8(PCRE2_SPTR, const char *);
|
||||
extern PCRE2_SIZE _pcre2_strcpy_c8(PCRE2_UCHAR *, const char *);
|
||||
|
|
|
@ -5014,6 +5014,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
|||
must record a backtracking point and also set up a chained frame. */
|
||||
|
||||
case OP_ONCE:
|
||||
case OP_SCRIPT_RUN:
|
||||
case OP_SBRA:
|
||||
Lframe_type = GF_NOCAPTURE | Fop;
|
||||
|
||||
|
@ -5525,6 +5526,14 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
|||
case OP_ASSERT_NOT:
|
||||
case OP_ASSERTBACK_NOT:
|
||||
RRETURN(MATCH_MATCH);
|
||||
|
||||
/* At the end of a script run, apply the script-checking rules. This code
|
||||
will never be exercised if Unicode support is not compiled, because in
|
||||
that environment script runs cause an error at compile time. */
|
||||
|
||||
case OP_SCRIPT_RUN:
|
||||
if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH);
|
||||
break;
|
||||
|
||||
/* Whole-pattern recursion is coded as a recurse into group 0, so it
|
||||
won't be picked up here. Instead, we catch it when the OP_END is reached.
|
||||
|
|
|
@ -393,6 +393,7 @@ for(;;)
|
|||
case OP_ASSERTBACK:
|
||||
case OP_ASSERTBACK_NOT:
|
||||
case OP_ONCE:
|
||||
case OP_SCRIPT_RUN:
|
||||
case OP_COND:
|
||||
case OP_SCOND:
|
||||
case OP_REVERSE:
|
||||
|
|
|
@ -0,0 +1,228 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2018 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/* This module contains the function for checking a script run. */
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#include "pcre2_internal.h"
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Check script run *
|
||||
*************************************************/
|
||||
|
||||
/* A script run is conceptually a sequence of characters all in the same
|
||||
Unicode script. However, it isn't quite that simple. There are special rules
|
||||
for scripts that are commonly used together, and also special rules for digits.
|
||||
This function implements the appropriate checks, which is possible only when
|
||||
PCRE2 is compiled with Unicode support. The function returns TRUE if there is
|
||||
no Unicode support; however, it should never be called in that circumstance
|
||||
because an error is given by pcre2_compile() if a script run is called for in a
|
||||
version of PCRE2 compiled without Unicode support.
|
||||
|
||||
Arguments:
|
||||
ptr points to the first character
|
||||
endptr points after the last character
|
||||
utf TRUE if in UTF mode
|
||||
|
||||
Returns: TRUE if this is a valid script run
|
||||
*/
|
||||
|
||||
#define SCRIPT_UNSET (-1)
|
||||
#define SCRIPT_HANPENDING (-2)
|
||||
#define SCRIPT_HANHIRAKATA (-3)
|
||||
#define SCRIPT_HANBOPOMOFO (-4)
|
||||
#define SCRIPT_HANHANGUL (-5)
|
||||
|
||||
BOOL
|
||||
PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
|
||||
{
|
||||
#ifdef SUPPORT_UNICODE
|
||||
int require_script = SCRIPT_UNSET;
|
||||
uint32_t require_digitset = 0;
|
||||
uint32_t c;
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 32
|
||||
(void)utf; /* Avoid compiler warning */
|
||||
#endif
|
||||
|
||||
/* Any string containing fewer than 2 characters is a valid script run. */
|
||||
|
||||
if (ptr >= endptr) return TRUE;
|
||||
GETCHARINCTEST(c, ptr);
|
||||
if (ptr >= endptr) return TRUE;
|
||||
|
||||
/* Scan strings of two or more characters, checking the Unicode characteristics
|
||||
of each code point. */
|
||||
|
||||
for (;;)
|
||||
{
|
||||
const ucd_record *ucd = GET_UCD(c);
|
||||
uint32_t script = ucd->script;
|
||||
|
||||
/* If the script is Unknown, the string is not a valid script run. Such
|
||||
characters can only form script runs of length one. */
|
||||
|
||||
if (script == ucp_Unknown) return FALSE;
|
||||
|
||||
/* A character whose script is Inherited is always accepted, and plays no
|
||||
further part. A character whose script is Common is always accepted, but must
|
||||
still be tested for a digit below. Otherwise, the character must match the
|
||||
script of the first non-Inherited, non-Common character encountered. For most
|
||||
scripts, the test is for the same script. However, the Han Chinese script may
|
||||
be used in conjunction with four other scripts in these combinations:
|
||||
|
||||
. Han with Hiragana and Katakana is allowed (for Japanese).
|
||||
|
||||
. Han with Bopomofo is allowed (for Taiwanese Mandarin).
|
||||
|
||||
. Han with Hangul is allowed (for Korean).
|
||||
|
||||
If the first significant character's script is one of the four, the required
|
||||
script type is immediately known. However, if the first significant
|
||||
character's script is Han, we have to keep checking for a non-Han character.
|
||||
Hence the SCRIPT_HANPENDING state. */
|
||||
|
||||
if (script != ucp_Inherited)
|
||||
{
|
||||
if (script != ucp_Common) switch(require_script)
|
||||
{
|
||||
default:
|
||||
if (script != (unsigned int)require_script) return FALSE;
|
||||
break;
|
||||
|
||||
case SCRIPT_UNSET:
|
||||
case SCRIPT_HANPENDING:
|
||||
switch(script)
|
||||
{
|
||||
case ucp_Han:
|
||||
require_script = SCRIPT_HANPENDING;
|
||||
break;
|
||||
|
||||
case ucp_Hiragana:
|
||||
case ucp_Katakana:
|
||||
require_script = SCRIPT_HANHIRAKATA;
|
||||
break;
|
||||
|
||||
case ucp_Bopomofo:
|
||||
require_script = SCRIPT_HANBOPOMOFO;
|
||||
break;
|
||||
|
||||
case ucp_Hangul:
|
||||
require_script = SCRIPT_HANHANGUL;
|
||||
break;
|
||||
|
||||
default:
|
||||
if (require_script == SCRIPT_HANPENDING) return FALSE;
|
||||
require_script = script;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
case SCRIPT_HANHIRAKATA:
|
||||
if (script != ucp_Han && script != ucp_Hiragana && script != ucp_Katakana)
|
||||
return FALSE;
|
||||
break;
|
||||
|
||||
case SCRIPT_HANBOPOMOFO:
|
||||
if (script != ucp_Han && script != ucp_Bopomofo) return FALSE;
|
||||
break;
|
||||
|
||||
case SCRIPT_HANHANGUL:
|
||||
if (script != ucp_Han && script != ucp_Hangul) return FALSE;
|
||||
break;
|
||||
}
|
||||
|
||||
/* The character is in an acceptable script. We must now ensure that all
|
||||
decimal digits in the string come from the same set. Some scripts (e.g.
|
||||
Common, Arabic) have more than one set of decimal digits. This code does
|
||||
not allow mixing sets, even within the same script. The vector called
|
||||
PRIV(ucd_digit_sets)[] contains, in its first element, the number of
|
||||
following elements, and then, in ascending order, the code points of the
|
||||
'9' characters in every set of 10 digits. Each set is identified by the
|
||||
offset in the vector of its '9' character. An initial check of the first
|
||||
value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
|
||||
|
||||
if (ucd->chartype == ucp_Nd)
|
||||
{
|
||||
uint32_t digitset;
|
||||
|
||||
if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
|
||||
{
|
||||
int mid;
|
||||
int bot = 1;
|
||||
int top = PRIV(ucd_digit_sets)[0];
|
||||
for (;;)
|
||||
{
|
||||
if (top <= bot + 1) /* <= rather than == is paranoia */
|
||||
{
|
||||
digitset = top;
|
||||
break;
|
||||
}
|
||||
mid = (top + bot) / 2;
|
||||
if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
|
||||
}
|
||||
}
|
||||
|
||||
/* A required value of 0 means "unset". */
|
||||
|
||||
if (require_digitset == 0) require_digitset = digitset;
|
||||
else if (digitset != require_digitset) return FALSE;
|
||||
} /* End digit handling */
|
||||
} /* End checking non-Inherited character */
|
||||
|
||||
/* If we haven't yet got to the end, pick up the next character. */
|
||||
|
||||
if (ptr >= endptr) return TRUE;
|
||||
GETCHARINCTEST(c, ptr);
|
||||
} /* End checking loop */
|
||||
|
||||
#else /* NOT SUPPORT_UNICODE */
|
||||
(void)ptr;
|
||||
(void)endptr;
|
||||
(void)utf;
|
||||
return TRUE;
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
}
|
||||
|
||||
/* End of pcre2_script_run.c */
|
|
@ -171,6 +171,7 @@ for (;;)
|
|||
/* Fall through */
|
||||
|
||||
case OP_ONCE:
|
||||
case OP_SCRIPT_RUN:
|
||||
case OP_SBRA:
|
||||
case OP_BRAPOS:
|
||||
case OP_SBRAPOS:
|
||||
|
@ -1075,6 +1076,7 @@ do
|
|||
case OP_CBRAPOS:
|
||||
case OP_SCBRAPOS:
|
||||
case OP_ONCE:
|
||||
case OP_SCRIPT_RUN:
|
||||
case OP_ASSERT:
|
||||
rc = set_start_bits(re, tcode, utf);
|
||||
if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
|
||||
|
|
|
@ -417,6 +417,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
|
|||
#define STRING_Tifinagh0 STR_T STR_i STR_f STR_i STR_n STR_a STR_g STR_h "\0"
|
||||
#define STRING_Tirhuta0 STR_T STR_i STR_r STR_h STR_u STR_t STR_a "\0"
|
||||
#define STRING_Ugaritic0 STR_U STR_g STR_a STR_r STR_i STR_t STR_i STR_c "\0"
|
||||
#define STRING_Unknown0 STR_U STR_n STR_k STR_n STR_o STR_w STR_n "\0"
|
||||
#define STRING_Vai0 STR_V STR_a STR_i "\0"
|
||||
#define STRING_Warang_Citi0 STR_W STR_a STR_r STR_a STR_n STR_g STR_UNDERSCORE STR_C STR_i STR_t STR_i "\0"
|
||||
#define STRING_Xan0 STR_X STR_a STR_n "\0"
|
||||
|
@ -611,6 +612,7 @@ const char PRIV(utt_names)[] =
|
|||
STRING_Tifinagh0
|
||||
STRING_Tirhuta0
|
||||
STRING_Ugaritic0
|
||||
STRING_Unknown0
|
||||
STRING_Vai0
|
||||
STRING_Warang_Citi0
|
||||
STRING_Xan0
|
||||
|
@ -805,19 +807,20 @@ const ucp_type_table PRIV(utt)[] = {
|
|||
{ 1424, PT_SC, ucp_Tifinagh },
|
||||
{ 1433, PT_SC, ucp_Tirhuta },
|
||||
{ 1441, PT_SC, ucp_Ugaritic },
|
||||
{ 1450, PT_SC, ucp_Vai },
|
||||
{ 1454, PT_SC, ucp_Warang_Citi },
|
||||
{ 1466, PT_ALNUM, 0 },
|
||||
{ 1470, PT_PXSPACE, 0 },
|
||||
{ 1474, PT_SPACE, 0 },
|
||||
{ 1478, PT_UCNC, 0 },
|
||||
{ 1482, PT_WORD, 0 },
|
||||
{ 1486, PT_SC, ucp_Yi },
|
||||
{ 1489, PT_GC, ucp_Z },
|
||||
{ 1491, PT_SC, ucp_Zanabazar_Square },
|
||||
{ 1508, PT_PC, ucp_Zl },
|
||||
{ 1511, PT_PC, ucp_Zp },
|
||||
{ 1514, PT_PC, ucp_Zs }
|
||||
{ 1450, PT_SC, ucp_Unknown },
|
||||
{ 1458, PT_SC, ucp_Vai },
|
||||
{ 1462, PT_SC, ucp_Warang_Citi },
|
||||
{ 1474, PT_ALNUM, 0 },
|
||||
{ 1478, PT_PXSPACE, 0 },
|
||||
{ 1482, PT_SPACE, 0 },
|
||||
{ 1486, PT_UCNC, 0 },
|
||||
{ 1490, PT_WORD, 0 },
|
||||
{ 1494, PT_SC, ucp_Yi },
|
||||
{ 1497, PT_GC, ucp_Z },
|
||||
{ 1499, PT_SC, ucp_Zanabazar_Square },
|
||||
{ 1516, PT_PC, ucp_Zl },
|
||||
{ 1519, PT_PC, ucp_Zp },
|
||||
{ 1522, PT_PC, ucp_Zs }
|
||||
};
|
||||
|
||||
const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);
|
||||
|
|
1741
src/pcre2_ucd.c
1741
src/pcre2_ucd.c
File diff suppressed because it is too large
Load Diff
|
@ -124,6 +124,7 @@ enum {
|
|||
/* These are the script identifications. */
|
||||
|
||||
enum {
|
||||
ucp_Unknown,
|
||||
ucp_Arabic,
|
||||
ucp_Armenian,
|
||||
ucp_Bengali,
|
||||
|
|
|
@ -386,5 +386,16 @@
|
|||
|
||||
/(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout
|
||||
123abcáyzabcdef789abcሴqr
|
||||
|
||||
# A few script run tests in non-UTF mode (but they need Unicode support)
|
||||
|
||||
/^(*script_run:.{4})/
|
||||
\x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han
|
||||
\x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han
|
||||
\x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul
|
||||
|
||||
/^(*sr:.*)/utf,allow_surrogate_escapes
|
||||
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
|
||||
\x{d800}\x{dfff} Surrogates (Unknown) \=no_utf_check
|
||||
|
||||
# End of testinput12
|
||||
|
|
|
@ -2317,5 +2317,96 @@
|
|||
|
||||
/[^\x{100}-\x{ffff}]*[\x80-\xff]/i,utf
|
||||
\x{99}\x{99}\x{99}
|
||||
|
||||
# Script run tests
|
||||
|
||||
/^(*script_run:.{4})/utf
|
||||
abcd Latin x4
|
||||
\x{2e80}\x{2fa1d}\x{3041}\x{30a1} Han Han Hiragana Katakana
|
||||
\x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han
|
||||
\x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han
|
||||
\x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul
|
||||
\x{2e80}\x{3105}\x{2e80}\x{3105} Han Bopomofo Han Bopomofo
|
||||
\x{02ea}\x{2e80}\x{2e80}\x{3105} Bopomofo-Sk Han Han Bopomofo
|
||||
\x{3105}\x{2e80}\x{2e80}\x{3105} Bopomofo Han Han Bopomofo
|
||||
\x{0300}cd! Inherited Latin Latin Common
|
||||
\x{0391}12\x{03a9} Greek Common-digits Greek
|
||||
\x{0400}12\x{fe2f} Cyrillic Common-digits Cyrillic
|
||||
\x{0531}12\x{fb17} Armenian Common-digits Armenian
|
||||
\x{0591}12\x{fb4f} Hebrew Common-digits Hebrew
|
||||
\x{0600}12\x{1eef1} Arabic Common-digits Arabic
|
||||
\x{0600}\x{0660}\x{0669}\x{1eef1} Arabic Arabic-digits Arabic
|
||||
\x{0700}12\x{086a} Syriac Common-digits Syriac
|
||||
\x{1200}12\x{ab2e} Ethiopic Common-digits Ethiopic
|
||||
\x{1680}12\x{169c} Ogham Common-digits Ogham
|
||||
\x{3041}12\x{3041} Hiragana Common-digits Hiragana
|
||||
\x{0980}\x{09e6}\x{09e7}\x{0993} Bengali Bengali-digits Bengali
|
||||
!cde Common Latin Latin Latin
|
||||
A..B Latin Common Common Latin
|
||||
0abc Ascii-digit Latin Latin Latin
|
||||
1\x{0700}\x{0700}\x{0700} Ascii-digit Syriac x 3
|
||||
\x{1A80}\x{1A80}\x{1a40}\x{1a41} Tai Tham Hora digits, letters
|
||||
\= Expect no match
|
||||
a\x{370}bcd Latin Greek Latin Latin
|
||||
\x{1100}\x{02ea}\x{02ea}\x{02ea} Hangul Bopomofo x3
|
||||
\x{02ea}\x{02ea}\x{02ea}\x{1100} Bopomofo x3 Hangul
|
||||
\x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
|
||||
\x{0391}\x{09e6}\x{09e7}\x{03a9} Greek Bengali digits Greek
|
||||
\x{0600}7\x{0669}\x{1eef1} Arabic ascii-digit Arabic-digit Arabic
|
||||
\x{0600}\x{0669}7\x{1eef1} Arabic Arabic-digit ascii-digit Arabic
|
||||
A5\x{ff19}B Latin Common-ascii/notascii-digits Latin
|
||||
\x{0300}cd\x{0391} Inherited Latin Latin Greek
|
||||
!cd\x{0391} Common Latin Latin Greek
|
||||
\x{1A80}\x{1A90}\x{1a40}\x{1a41} Tai Tham Hora digit, Tham digit, letters
|
||||
A\x{1d7ce}\x{1d7ff}B Common fancy-common-2-sets-digits Common
|
||||
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
|
||||
|
||||
/^(*sr:.{4}|..)/utf
|
||||
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
|
||||
|
||||
/^(*atomic_script_run:.{4}|..)/utf
|
||||
\= Expect no match
|
||||
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
|
||||
|
||||
/^(*asr:.*)/utf
|
||||
\= Expect no match
|
||||
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
|
||||
|
||||
/^(?>(*sr:.*))/utf
|
||||
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
|
||||
|
||||
/^(*sr:.*)/utf
|
||||
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
|
||||
\x{10fffd}\x{10fffd}\x{10fffd} Private use (Unknown)
|
||||
|
||||
/^(*sr:\x{2e80}*)/utf
|
||||
\x{2e80}\x{2e80}\x{3105} Han Han Bopomofo
|
||||
|
||||
/^(*sr:\x{2e80}*)\x{2e80}/utf
|
||||
\x{2e80}\x{2e80}\x{3105} Han Han Bopomofo
|
||||
|
||||
/^(*sr:.*)Test/utf
|
||||
Test script run on an empty string
|
||||
|
||||
/^(*sr:(.{2})){2}/utf
|
||||
\x{0600}7\x{0669}\x{1eef1} Arabic ascii-digit Arabic-digit Arabic
|
||||
\x{1A80}\x{1A80}\x{1a40}\x{1a41} Tai Tham Hora digits, letters
|
||||
\x{1A80}\x{1a40}\x{1A90}\x{1a41} Tai Tham Hora digit, letter, Tham digit, letter
|
||||
\= Expect no match
|
||||
\x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
|
||||
|
||||
# Test loop breaking for empty string match
|
||||
|
||||
/^(*sr:A|)*BCD/utf
|
||||
AABCD
|
||||
ABCD
|
||||
BCD
|
||||
|
||||
# The use of (*ACCEPT) breaks script run checking
|
||||
|
||||
/^(*sr:.*(*ACCEPT)ZZ)/utf
|
||||
\x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
|
||||
|
||||
# -------
|
||||
|
||||
# End of testinput4
|
||||
|
|
|
@ -2106,5 +2106,28 @@
|
|||
|
||||
/(*: AB
C)abc/x,utf,mark,alt_verbnames
|
||||
abc
|
||||
|
||||
# Script run tests: auto-possessification
|
||||
|
||||
/^(*sr:.*)/B,utf
|
||||
paypаl.com A classic example of why script run checks are a good thing
|
||||
|
||||
/^(*sr:\x{2e80}*)/B,utf
|
||||
|
||||
/^(*sr:\x{2e80}*)\x{2e80}/B,utf
|
||||
|
||||
# Some script run patterns are broken in Perl 5.28.0. These can be moved into
|
||||
# test 4 when a mended version of Perl is released.
|
||||
|
||||
/^(*sr:.{4})/utf
|
||||
\x{0980}12\x{0993} Bengali Common-digits Bengali
|
||||
\x{0780}12\x{07b1} Thaana Common-digits Thaana
|
||||
\x{0e01}12\x{0e5b} Thai Common-digits Thai
|
||||
\x{1780}12\x{19ff} Khmer Common-digits Khmer
|
||||
\x{0904}12\x{0939} Devanagari Common-digits Devanagari
|
||||
A\x{ff10}\x{ff19}B Latin Common-notascii-digits Latin
|
||||
A\x{1d7ce}\x{1d7cf}B Latin fancy-common-digits Latin
|
||||
|
||||
# -------
|
||||
|
||||
# End of testinput5
|
||||
|
|
|
@ -1480,5 +1480,20 @@ Old 12 12 New 14 16
|
|||
Old 12 15 New 16 21
|
||||
Old 21 21 New 27 29
|
||||
4: 123abc<>\x{e1}yzabc<><def>789abc<>\x{1234}qr
|
||||
|
||||
# A few script run tests in non-UTF mode (but they need Unicode support)
|
||||
|
||||
/^(*script_run:.{4})/
|
||||
\x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han
|
||||
0: \x{3041}\x{30a1}\x{3007}\x{3007}
|
||||
\x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han
|
||||
0: \x{30a1}\x{3041}\x{3007}\x{3007}
|
||||
\x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul
|
||||
0: \x{1100}\x{2e80}\x{2e80}\x{1101}
|
||||
|
||||
/^(*sr:.*)/utf,allow_surrogate_escapes
|
||||
Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode
|
||||
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
|
||||
\x{d800}\x{dfff} Surrogates (Unknown) \=no_utf_check
|
||||
|
||||
# End of testinput12
|
||||
|
|
|
@ -1477,5 +1477,21 @@ Old 12 12 New 14 16
|
|||
Old 12 15 New 16 21
|
||||
Old 21 21 New 27 29
|
||||
4: 123abc<>\x{e1}yzabc<><def>789abc<>\x{1234}qr
|
||||
|
||||
# A few script run tests in non-UTF mode (but they need Unicode support)
|
||||
|
||||
/^(*script_run:.{4})/
|
||||
\x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han
|
||||
0: \x{3041}\x{30a1}\x{3007}\x{3007}
|
||||
\x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han
|
||||
0: \x{30a1}\x{3041}\x{3007}\x{3007}
|
||||
\x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul
|
||||
0: \x{1100}\x{2e80}\x{2e80}\x{1101}
|
||||
|
||||
/^(*sr:.*)/utf,allow_surrogate_escapes
|
||||
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
|
||||
0: \x{2e80}\x{3105}\x{2e80}
|
||||
\x{d800}\x{dfff} Surrogates (Unknown) \=no_utf_check
|
||||
0: \x{d800}
|
||||
|
||||
# End of testinput12
|
||||
|
|
|
@ -3741,5 +3741,154 @@ No match
|
|||
/[^\x{100}-\x{ffff}]*[\x80-\xff]/i,utf
|
||||
\x{99}\x{99}\x{99}
|
||||
0: \x{99}\x{99}\x{99}
|
||||
|
||||
# Script run tests
|
||||
|
||||
/^(*script_run:.{4})/utf
|
||||
abcd Latin x4
|
||||
0: abcd
|
||||
\x{2e80}\x{2fa1d}\x{3041}\x{30a1} Han Han Hiragana Katakana
|
||||
0: \x{2e80}\x{2fa1d}\x{3041}\x{30a1}
|
||||
\x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han
|
||||
0: \x{3041}\x{30a1}\x{3007}\x{3007}
|
||||
\x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han
|
||||
0: \x{30a1}\x{3041}\x{3007}\x{3007}
|
||||
\x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul
|
||||
0: \x{1100}\x{2e80}\x{2e80}\x{1101}
|
||||
\x{2e80}\x{3105}\x{2e80}\x{3105} Han Bopomofo Han Bopomofo
|
||||
0: \x{2e80}\x{3105}\x{2e80}\x{3105}
|
||||
\x{02ea}\x{2e80}\x{2e80}\x{3105} Bopomofo-Sk Han Han Bopomofo
|
||||
0: \x{2ea}\x{2e80}\x{2e80}\x{3105}
|
||||
\x{3105}\x{2e80}\x{2e80}\x{3105} Bopomofo Han Han Bopomofo
|
||||
0: \x{3105}\x{2e80}\x{2e80}\x{3105}
|
||||
\x{0300}cd! Inherited Latin Latin Common
|
||||
0: \x{300}cd!
|
||||
\x{0391}12\x{03a9} Greek Common-digits Greek
|
||||
0: \x{391}12\x{3a9}
|
||||
\x{0400}12\x{fe2f} Cyrillic Common-digits Cyrillic
|
||||
0: \x{400}12\x{fe2f}
|
||||
\x{0531}12\x{fb17} Armenian Common-digits Armenian
|
||||
0: \x{531}12\x{fb17}
|
||||
\x{0591}12\x{fb4f} Hebrew Common-digits Hebrew
|
||||
0: \x{591}12\x{fb4f}
|
||||
\x{0600}12\x{1eef1} Arabic Common-digits Arabic
|
||||
0: \x{600}12\x{1eef1}
|
||||
\x{0600}\x{0660}\x{0669}\x{1eef1} Arabic Arabic-digits Arabic
|
||||
0: \x{600}\x{660}\x{669}\x{1eef1}
|
||||
\x{0700}12\x{086a} Syriac Common-digits Syriac
|
||||
0: \x{700}12\x{86a}
|
||||
\x{1200}12\x{ab2e} Ethiopic Common-digits Ethiopic
|
||||
0: \x{1200}12\x{ab2e}
|
||||
\x{1680}12\x{169c} Ogham Common-digits Ogham
|
||||
0: \x{1680}12\x{169c}
|
||||
\x{3041}12\x{3041} Hiragana Common-digits Hiragana
|
||||
0: \x{3041}12\x{3041}
|
||||
\x{0980}\x{09e6}\x{09e7}\x{0993} Bengali Bengali-digits Bengali
|
||||
0: \x{980}\x{9e6}\x{9e7}\x{993}
|
||||
!cde Common Latin Latin Latin
|
||||
0: !cde
|
||||
A..B Latin Common Common Latin
|
||||
0: A..B
|
||||
0abc Ascii-digit Latin Latin Latin
|
||||
0: 0abc
|
||||
1\x{0700}\x{0700}\x{0700} Ascii-digit Syriac x 3
|
||||
0: 1\x{700}\x{700}\x{700}
|
||||
\x{1A80}\x{1A80}\x{1a40}\x{1a41} Tai Tham Hora digits, letters
|
||||
0: \x{1a80}\x{1a80}\x{1a40}\x{1a41}
|
||||
\= Expect no match
|
||||
a\x{370}bcd Latin Greek Latin Latin
|
||||
No match
|
||||
\x{1100}\x{02ea}\x{02ea}\x{02ea} Hangul Bopomofo x3
|
||||
No match
|
||||
\x{02ea}\x{02ea}\x{02ea}\x{1100} Bopomofo x3 Hangul
|
||||
No match
|
||||
\x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
|
||||
No match
|
||||
\x{0391}\x{09e6}\x{09e7}\x{03a9} Greek Bengali digits Greek
|
||||
No match
|
||||
\x{0600}7\x{0669}\x{1eef1} Arabic ascii-digit Arabic-digit Arabic
|
||||
No match
|
||||
\x{0600}\x{0669}7\x{1eef1} Arabic Arabic-digit ascii-digit Arabic
|
||||
No match
|
||||
A5\x{ff19}B Latin Common-ascii/notascii-digits Latin
|
||||
No match
|
||||
\x{0300}cd\x{0391} Inherited Latin Latin Greek
|
||||
No match
|
||||
!cd\x{0391} Common Latin Latin Greek
|
||||
No match
|
||||
\x{1A80}\x{1A90}\x{1a40}\x{1a41} Tai Tham Hora digit, Tham digit, letters
|
||||
No match
|
||||
A\x{1d7ce}\x{1d7ff}B Common fancy-common-2-sets-digits Common
|
||||
No match
|
||||
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
|
||||
No match
|
||||
|
||||
/^(*sr:.{4}|..)/utf
|
||||
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
|
||||
0: \x{2e80}\x{3105}
|
||||
|
||||
/^(*atomic_script_run:.{4}|..)/utf
|
||||
\= Expect no match
|
||||
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
|
||||
No match
|
||||
|
||||
/^(*asr:.*)/utf
|
||||
\= Expect no match
|
||||
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
|
||||
No match
|
||||
|
||||
/^(?>(*sr:.*))/utf
|
||||
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
|
||||
0: \x{2e80}\x{3105}\x{2e80}
|
||||
|
||||
/^(*sr:.*)/utf
|
||||
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
|
||||
0: \x{2e80}\x{3105}\x{2e80}
|
||||
\x{10fffd}\x{10fffd}\x{10fffd} Private use (Unknown)
|
||||
0: \x{10fffd}
|
||||
|
||||
/^(*sr:\x{2e80}*)/utf
|
||||
\x{2e80}\x{2e80}\x{3105} Han Han Bopomofo
|
||||
0: \x{2e80}\x{2e80}
|
||||
|
||||
/^(*sr:\x{2e80}*)\x{2e80}/utf
|
||||
\x{2e80}\x{2e80}\x{3105} Han Han Bopomofo
|
||||
0: \x{2e80}\x{2e80}
|
||||
|
||||
/^(*sr:.*)Test/utf
|
||||
Test script run on an empty string
|
||||
0: Test
|
||||
|
||||
/^(*sr:(.{2})){2}/utf
|
||||
\x{0600}7\x{0669}\x{1eef1} Arabic ascii-digit Arabic-digit Arabic
|
||||
0: \x{600}7\x{669}\x{1eef1}
|
||||
1: \x{669}\x{1eef1}
|
||||
\x{1A80}\x{1A80}\x{1a40}\x{1a41} Tai Tham Hora digits, letters
|
||||
0: \x{1a80}\x{1a80}\x{1a40}\x{1a41}
|
||||
1: \x{1a40}\x{1a41}
|
||||
\x{1A80}\x{1a40}\x{1A90}\x{1a41} Tai Tham Hora digit, letter, Tham digit, letter
|
||||
0: \x{1a80}\x{1a40}\x{1a90}\x{1a41}
|
||||
1: \x{1a90}\x{1a41}
|
||||
\= Expect no match
|
||||
\x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
|
||||
No match
|
||||
|
||||
# Test loop breaking for empty string match
|
||||
|
||||
/^(*sr:A|)*BCD/utf
|
||||
AABCD
|
||||
0: AABCD
|
||||
ABCD
|
||||
0: ABCD
|
||||
BCD
|
||||
0: BCD
|
||||
|
||||
# The use of (*ACCEPT) breaks script run checking
|
||||
|
||||
/^(*sr:.*(*ACCEPT)ZZ)/utf
|
||||
\x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
|
||||
0: \x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
|
||||
|
||||
# -------
|
||||
|
||||
# End of testinput4
|
||||
|
|
|
@ -4775,5 +4775,64 @@ Failed: error 137 at offset 2: PCRE2 does not support \F, \L, \l, \N{name}, \U,
|
|||
abc
|
||||
0: abc
|
||||
MK: ABC
|
||||
|
||||
# Script run tests: auto-possessification
|
||||
|
||||
/^(*sr:.*)/B,utf
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
^
|
||||
Script run
|
||||
Any*
|
||||
Ket
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
paypаl.com A classic example of why script run checks are a good thing
|
||||
0: payp
|
||||
|
||||
/^(*sr:\x{2e80}*)/B,utf
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
^
|
||||
Script run
|
||||
\x{2e80}*+
|
||||
Ket
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/^(*sr:\x{2e80}*)\x{2e80}/B,utf
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
^
|
||||
Script run
|
||||
\x{2e80}*
|
||||
Ket
|
||||
\x{2e80}
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
# Some script run patterns are broken in Perl 5.28.0. These can be moved into
|
||||
# test 4 when a mended version of Perl is released.
|
||||
|
||||
/^(*sr:.{4})/utf
|
||||
\x{0980}12\x{0993} Bengali Common-digits Bengali
|
||||
0: \x{980}12\x{993}
|
||||
\x{0780}12\x{07b1} Thaana Common-digits Thaana
|
||||
0: \x{780}12\x{7b1}
|
||||
\x{0e01}12\x{0e5b} Thai Common-digits Thai
|
||||
0: \x{e01}12\x{e5b}
|
||||
\x{1780}12\x{19ff} Khmer Common-digits Khmer
|
||||
0: \x{1780}12\x{19ff}
|
||||
\x{0904}12\x{0939} Devanagari Common-digits Devanagari
|
||||
0: \x{904}12\x{939}
|
||||
A\x{ff10}\x{ff19}B Latin Common-notascii-digits Latin
|
||||
0: A\x{ff10}\x{ff19}B
|
||||
A\x{1d7ce}\x{1d7cf}B Latin fancy-common-digits Latin
|
||||
0: A\x{1d7ce}\x{1d7cf}B
|
||||
|
||||
# -------
|
||||
|
||||
# End of testinput5
|
||||
|
|
Loading…
Reference in New Issue