Basic "script run" implementation. Not yet complete, and not yet documented.

This commit is contained in:
Philip.Hazel 2018-10-02 15:25:58 +00:00
parent f26b0b0bae
commit 866750fd53
30 changed files with 1787 additions and 1012 deletions

View File

@ -30,6 +30,10 @@ new "is lower case letter" bit. At the same time, the now unused "is
hexadecimal digit" bit was removed. The default tables in
src/pcre2_chartables.c.dist are updated.
8. Implement the new Perl "script run" features (*script_run:...) and
(*atomic_script_run:...) aka (*sr:...) and (*asr:...). At present, this is
incomplete and not yet documented.
Version 10.32 10-September-2018
-------------------------------

View File

@ -364,6 +364,7 @@ COMMON_SOURCES = \
src/pcre2_newline.c \
src/pcre2_ord2utf.c \
src/pcre2_pattern_info.c \
src/pcre2_script_run.c \
src/pcre2_serialize.c \
src/pcre2_string_utils.c \
src/pcre2_study.c \

View File

@ -104,6 +104,7 @@ can skip ahead to the CMake section.
pcre2_newline.c
pcre2_ord2utf.c
pcre2_pattern_info.c
pcre2_script_run.c
pcre2_serialize.c
pcre2_string_utils.c
pcre2_study.c

1
README
View File

@ -788,6 +788,7 @@ The distribution should contain the files listed below.
src/pcre2_newline.c )
src/pcre2_ord2utf.c )
src/pcre2_pattern_info.c )
src/pcre2_script_run.c )
src/pcre2_serialize.c )
src/pcre2_string_utils.c )
src/pcre2_study.c )

View File

@ -104,6 +104,7 @@ can skip ahead to the CMake section.
pcre2_newline.c
pcre2_ord2utf.c
pcre2_pattern_info.c
pcre2_script_run.c
pcre2_serialize.c
pcre2_string_utils.c
pcre2_study.c

View File

@ -788,6 +788,7 @@ The distribution should contain the files listed below.
src/pcre2_newline.c )
src/pcre2_ord2utf.c )
src/pcre2_pattern_info.c )
src/pcre2_script_run.c )
src/pcre2_serialize.c )
src/pcre2_string_utils.c )
src/pcre2_study.c )

View File

@ -25,8 +25,9 @@
# Added script names for Unicode 8.0.0, 19-June-2015.
# Added script names for Unicode 10.0.0, 02-July-2017.
# Added script names for Unicode 11.0.0, 03-July-2018.
# Added 'Unknown' script, 01-October-2018.
script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \

View File

@ -143,6 +143,7 @@
# 03-July-2018: Updated for Unicode 11.0.0
# 07-July-2018: Added code to scan emoji-data.txt for the Extended
# Pictographic property.
# 01-October-2018: Added the 'Unknown' script name
##############################################################################
@ -300,7 +301,7 @@ def get_record_size_struct(records):
slice_type, slice_size = get_type_size(record_slice)
size = (size + slice_size - 1) & -slice_size
structure += '} ucd_record;\n*/\n\n'
structure += '} ucd_record;\n*/\n'
return size, structure
def test_record_size():
@ -329,7 +330,7 @@ def print_records(records, record_size):
print((' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,)))
print('};\n')
script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
@ -380,7 +381,7 @@ break_property_names = ['CR', 'LF', 'Control', 'Extend', 'Prepend',
test_record_size()
unicode_version = ""
script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Common'))
script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Unknown'))
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_property_names), break_property_names.index('Other'))
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
@ -553,11 +554,11 @@ print("special record. */")
print()
print("#if PCRE2_CODE_UNIT_WIDTH == 32")
print("const ucd_record PRIV(dummy_ucd_record)[] = {{")
print(" ucp_Common, /* script */")
print(" ucp_Cn, /* type unassigned */")
print(" ucp_gbOther, /* grapheme break property */")
print(" 0, /* case set */")
print(" 0, /* other case */")
print(" ucp_Unknown, /* script */")
print(" ucp_Cn, /* type unassigned */")
print(" ucp_gbOther, /* grapheme break property */")
print(" 0, /* case set */")
print(" 0, /* other case */")
print(" }};")
print("#endif")
print()
@ -565,6 +566,9 @@ print(record_struct)
# --- Added by PH: output the table of caseless character sets ---
print("/* This table contains lists of characters that are caseless sets of")
print("more than one character. Each list is terminated by NOTACHAR. */\n")
print("const uint32_t PRIV(ucd_caseless_sets)[] = {")
print(" NOTACHAR,")
for s in sets:
@ -577,10 +581,53 @@ print()
# ------
print("/* When #included in pcre2test, we don't need this large table. */")
# Emit the explanatory C comment and open the PCRE2_PCRE2TEST guard around
# the tables that pcre2test does not need. The matching "#endif" is printed
# after the main UCD tables have been output.
print("/* When #included in pcre2test, we don't need the table of digit")
print("sets, nor the large main UCD tables. */")
print()
print("#ifndef PCRE2_PCRE2TEST")
print()
# --- Added by PH: read Scripts.txt again for the sets of 10 digits. ---

# Matches a "first..last" code point range whose trailing comment gives the
# general category Nd (decimal digit); the two groups capture the range ends
# as hexadecimal strings. Compiled once rather than for every input line.
digit_range_re = re.compile(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+')

digitsets = []

# Using "with" ensures that the file is closed even if reading or parsing a
# line raises an exception.
with open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8') as file:
    for line in file:
        m = digit_range_re.match(line)
        if m is None:
            continue
        first = int(m.group(1), 16)
        last = int(m.group(2), 16)

        # Each set of decimal digits has 10 code points, so a range should
        # hold a whole number of sets. A bad range is reported on stderr but
        # is still processed below.
        if ((last - first + 1) % 10) != 0:
            print("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last),
                  file=sys.stderr)

        # Record the last code point (the '9' character) of each set of 10.
        while first < last:
            digitsets.append(first + 9)
            first += 10

digitsets.sort()

print("/* This table lists the code points for the '9' characters in each")
print("set of decimal digits. It is used to ensure that all the digits in")
print("a script run come from the same set. */")
print()
print("const uint32_t PRIV(ucd_digit_sets)[] = {")

# The first table entry is the count of the values that follow it.
print(" %d, /* Number of subsequent values */" % len(digitsets), end='')

# Output the values eight to a line. Starting the counter at 8 forces a line
# break before the very first value.
count = 8
for d in digitsets:
    if count == 8:
        print("\n ", end='')
        count = 0
    print(" 0x%05x," % d, end='')
    count += 1
print("\n};")
print()
# Output the main UCD tables.
print("/* These are the main two-stage UCD tables. */\n")
print_records(records, record_size)
print_table(min_stage1, 'PRIV(ucd_stage1)')
print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
@ -591,6 +638,10 @@ print("#endif /* SUPPORT_UNICODE */")
print()
print("#endif /* PCRE2_PCRE2TEST */")
# This code was part of the original contribution, but is commented out as it
# was never used. A two-stage table has sufficed.
"""
# Three-stage tables:

View File

@ -134,6 +134,7 @@ switch(gbprop)
switch(script)
{
case ucp_Unknown: scriptname = US"Unknown"; break;
case ucp_Arabic: scriptname = US"Arabic"; break;
case ucp_Armenian: scriptname = US"Armenian"; break;
case ucp_Balinese: scriptname = US"Balinese"; break;

View File

@ -1,8 +1,10 @@
#! /bin/sh
# Script for testing regular expressions with perl to check that PCRE2 handles
# them the same. If the first argument to this script is "-w", Perl is also
# called with "-w", which turns on its warning mode.
# them the same. For testing with different versions of Perl, if the first
# argument is -perl then the second is taken as the Perl command to use, and
# both are then removed. If the next argument is "-w", Perl is called with
# "-w", which turns on its warning mode.
#
# The Perl code has to have "use utf8" and "require Encode" at the start when
# running UTF-8 tests, but *not* for non-utf8 tests. (The "require" would
@ -10,8 +12,8 @@
# the script will always run for these tests.)
#
# The desired effect is achieved by making this a shell script that passes the
# Perl script to Perl through a pipe. If the first argument (possibly after
# removing "-w") is "-utf8", a suitable prefix is set up.
# Perl script to Perl through a pipe. If the next argument is "-utf8", a
# suitable prefix is set up.
#
# The remaining arguments, if any, are passed to Perl. They are an input file
# and an output file. If there is one argument, the output is written to
@ -23,6 +25,12 @@ perl=perl
perlarg=''
prefix=''
if [ $# -gt 1 -a "$1" = "-perl" ] ; then
shift
perl=$1
shift
fi
if [ $# -gt 0 -a "$1" = "-w" ] ; then
perlarg="-w"
shift
@ -78,6 +86,7 @@ fi
# The alpha assertions currently give warnings even when -w is not specified.
no warnings "experimental::alpha_assertions";
no warnings "experimental::script_run";
# Function for turning a string into a string of printing chars.

View File

@ -321,6 +321,7 @@ pcre2_pattern_convert(). */
#define PCRE2_ERROR_SUPPORTED_ONLY_IN_UNICODE 193
#define PCRE2_ERROR_INVALID_HYPHEN_IN_OPTIONS 194
#define PCRE2_ERROR_ALPHA_ASSERTION_UNKNOWN 195
#define PCRE2_ERROR_SCRIPT_RUN_NOT_AVAILABLE 196
/* "Expected" matching error codes: no match and partial match. */

View File

@ -604,6 +604,15 @@ for(;;)
case OP_SCBRAPOS:
if (cb->had_recurse) return FALSE;
break;
/* A script run might have to backtrack if the iterated item can match
characters from more than one script. So give up unless repeating an
explicit character. */
case OP_SCRIPT_RUN:
if (base_list[0] != OP_CHAR && base_list[0] != OP_CHARI)
return FALSE;
break;
/* Atomic sub-patterns and assertions can always auto-possessify their
last iterator. However, if the group was entered as a result of checking
@ -614,7 +623,6 @@ for(;;)
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
case OP_ONCE:
return !entered_a_group;
}

View File

@ -240,49 +240,57 @@ code (meta_extra_lengths, just below) must be updated to remain in step. */
#define META_RANGE_LITERAL 0x801f0000u /* range defined literally */
#define META_RECURSE 0x80200000u /* Recursion */
#define META_RECURSE_BYNAME 0x80210000u /* (?&name) */
#define META_SCRIPT_RUN 0x80220000u /* (*script_run:...) */
/* These must be kept together to make it easy to check that an assertion
is present where expected in a conditional group. */
#define META_LOOKAHEAD 0x80220000u /* (?= */
#define META_LOOKAHEADNOT 0x80230000u /* (?! */
#define META_LOOKBEHIND 0x80240000u /* (?<= */
#define META_LOOKBEHINDNOT 0x80250000u /* (?<! */
#define META_LOOKAHEAD 0x80230000u /* (?= */
#define META_LOOKAHEADNOT 0x80240000u /* (?! */
#define META_LOOKBEHIND 0x80250000u /* (?<= */
#define META_LOOKBEHINDNOT 0x80260000u /* (?<! */
/* These must be kept in this order, with consecutive values, and the _ARG
versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
versions. */
#define META_MARK 0x80260000u /* (*MARK) */
#define META_ACCEPT 0x80270000u /* (*ACCEPT) */
#define META_FAIL 0x80280000u /* (*FAIL) */
#define META_COMMIT 0x80290000u /* These */
#define META_COMMIT_ARG 0x802a0000u /* pairs */
#define META_PRUNE 0x802b0000u /* must */
#define META_PRUNE_ARG 0x802c0000u /* be */
#define META_SKIP 0x802d0000u /* kept */
#define META_SKIP_ARG 0x802e0000u /* in */
#define META_THEN 0x802f0000u /* this */
#define META_THEN_ARG 0x80300000u /* order */
#define META_MARK 0x80270000u /* (*MARK) */
#define META_ACCEPT 0x80280000u /* (*ACCEPT) */
#define META_FAIL 0x80290000u /* (*FAIL) */
#define META_COMMIT 0x802a0000u /* These */
#define META_COMMIT_ARG 0x802b0000u /* pairs */
#define META_PRUNE 0x802c0000u /* must */
#define META_PRUNE_ARG 0x802d0000u /* be */
#define META_SKIP 0x802e0000u /* kept */
#define META_SKIP_ARG 0x802f0000u /* in */
#define META_THEN 0x80300000u /* this */
#define META_THEN_ARG 0x80310000u /* order */
/* These must be kept in groups of adjacent 3 values, and all together. */
#define META_ASTERISK 0x80310000u /* * */
#define META_ASTERISK_PLUS 0x80320000u /* *+ */
#define META_ASTERISK_QUERY 0x80330000u /* *? */
#define META_PLUS 0x80340000u /* + */
#define META_PLUS_PLUS 0x80350000u /* ++ */
#define META_PLUS_QUERY 0x80360000u /* +? */
#define META_QUERY 0x80370000u /* ? */
#define META_QUERY_PLUS 0x80380000u /* ?+ */
#define META_QUERY_QUERY 0x80390000u /* ?? */
#define META_MINMAX 0x803a0000u /* {n,m} repeat */
#define META_MINMAX_PLUS 0x803b0000u /* {n,m}+ repeat */
#define META_MINMAX_QUERY 0x803c0000u /* {n,m}? repeat */
#define META_ASTERISK 0x80320000u /* * */
#define META_ASTERISK_PLUS 0x80330000u /* *+ */
#define META_ASTERISK_QUERY 0x80340000u /* *? */
#define META_PLUS 0x80350000u /* + */
#define META_PLUS_PLUS 0x80360000u /* ++ */
#define META_PLUS_QUERY 0x80370000u /* +? */
#define META_QUERY 0x80380000u /* ? */
#define META_QUERY_PLUS 0x80390000u /* ?+ */
#define META_QUERY_QUERY 0x803a0000u /* ?? */
#define META_MINMAX 0x803b0000u /* {n,m} repeat */
#define META_MINMAX_PLUS 0x803c0000u /* {n,m}+ repeat */
#define META_MINMAX_QUERY 0x803d0000u /* {n,m}? repeat */
#define META_FIRST_QUANTIFIER META_ASTERISK
#define META_LAST_QUANTIFIER META_MINMAX_QUERY
/* This is a special "meta code" that is used only to distinguish (*asr: from
(*sr: in the table of alphabetic assertions. It is never stored in the parsed
pattern because (*asr: is turned into (*sr:(*atomic: at that stage. There is
therefore no need for it to have a length entry, so use a high value. */
#define META_ATOMIC_SCRIPT_RUN 0x8fff0000u
/* Table of extra lengths for each of the meta codes. Must be kept in step with
the definitions above. For some items these values are a basic length to which
a variable amount has to be added. */
@ -322,6 +330,7 @@ static unsigned char meta_extra_lengths[] = {
0, /* META_RANGE_LITERAL */
SIZEOFFSET, /* META_RECURSE */
1+SIZEOFFSET, /* META_RECURSE_BYNAME */
0, /* META_SCRIPT_RUN */
0, /* META_LOOKAHEAD */
0, /* META_LOOKAHEADNOT */
SIZEOFFSET, /* META_LOOKBEHIND */
@ -638,19 +647,19 @@ static const char alasnames[] =
STRING_atomic_script_run;
static const alasitem alasmeta[] = {
{ 3, META_LOOKAHEAD },
{ 3, META_LOOKBEHIND },
{ 3, META_LOOKAHEADNOT },
{ 3, META_LOOKBEHINDNOT },
{ 18, META_LOOKAHEAD },
{ 19, META_LOOKBEHIND },
{ 18, META_LOOKAHEADNOT },
{ 19, META_LOOKBEHINDNOT },
{ 6, META_ATOMIC },
{ 2, 0 }, /* sr = script run */
{ 3, 0 }, /* asr = atomic script run */
{ 10, 0 }, /* script run */
{ 17, 0 } /* atomic script run */
{ 3, META_LOOKAHEAD },
{ 3, META_LOOKBEHIND },
{ 3, META_LOOKAHEADNOT },
{ 3, META_LOOKBEHINDNOT },
{ 18, META_LOOKAHEAD },
{ 19, META_LOOKBEHIND },
{ 18, META_LOOKAHEADNOT },
{ 19, META_LOOKBEHINDNOT },
{ 6, META_ATOMIC },
{ 2, META_SCRIPT_RUN }, /* sr = script run */
{ 3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
{ 10, META_SCRIPT_RUN }, /* script run */
{ 17, META_ATOMIC_SCRIPT_RUN } /* atomic script run */
};
static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
@ -772,7 +781,7 @@ enum { ERR0 = COMPILE_ERROR_BASE,
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
ERR91, ERR92, ERR93, ERR94, ERR95 };
ERR91, ERR92, ERR93, ERR94, ERR95, ERR96 };
/* This is a table of start-of-pattern options such as (*UTF) and settings such
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
@ -1003,6 +1012,7 @@ for (;;)
case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
case META_KET: fprintf(stderr, "META )"); break;
case META_ALT: fprintf(stderr, "META | %d", meta_arg); break;
@ -2210,15 +2220,15 @@ if (++ptr >= ptrend) /* No characters in name */
ERR60; /* Verb not recognized or malformed */
goto FAILED;
}
/* A group name must not start with a digit. If either of the others start with
a digit it just won't be recognized. */
/* A group name must not start with a digit. If either of the others start with
a digit it just won't be recognized. */
if (is_group && IS_DIGIT(*ptr))
{
*errorcodeptr = ERR44;
goto FAILED;
}
}
*nameptr = ptr;
*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
@ -2345,6 +2355,7 @@ typedef struct nest_save {
#define NSF_RESET 0x0001u
#define NSF_CONDASSERT 0x0002u
#define NSF_ATOMICSR 0x0004u
/* Options that are changeable within the pattern must be tracked during
parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
@ -2707,19 +2718,19 @@ while (ptr < ptrend)
case CHAR_C:
ok = expect_cond_assert == 2;
break;
case CHAR_EQUALS_SIGN:
case CHAR_EXCLAMATION_MARK:
break;
case CHAR_LESS_THAN_SIGN:
ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
break;
default:
ok = FALSE;
}
}
}
if (!ok)
{
@ -3533,13 +3544,13 @@ while (ptr < ptrend)
/* Handle "alpha assertions" such as (*pla:...). Most of these are
synonyms for the historical symbolic assertions, but the script run ones
are new. They are distinguished by starting with a lower case letter.
Checking both ends of the alphabet makes this work in all character
Checking both ends of the alphabet makes this work in all character
codes. */
else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
{
uint32_t meta;
vn = alasnames;
if (!read_name(&ptr, ptrend, 0, &offset, &name, &namelen, &errorcode,
cb)) goto FAILED;
@ -3550,7 +3561,7 @@ while (ptr < ptrend)
}
/* Scan the table of alpha assertion names */
for (i = 0; i < alascount; i++)
{
if (namelen == alasmeta[i].len &&
@ -3564,42 +3575,72 @@ while (ptr < ptrend)
errorcode = ERR95; /* Alpha assertion not recognized */
goto FAILED;
}
/* Check for expecting an assertion condition. If so, only lookaround
/* Check for expecting an assertion condition. If so, only lookaround
assertions are valid. */
meta = alasmeta[i].meta;
if (prev_expect_cond_assert > 0 &&
if (prev_expect_cond_assert > 0 &&
(meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
{
errorcode = ERR28; /* Assertion expected */
goto FAILED;
}
goto FAILED;
}
/* The lookaround alphabetic synonyms can be almost entirely handled by
jumping to the code that handles the traditional symbolic forms. */
switch(meta)
{
default:
errorcode = ERR89; /* Unknown code; should never occur because */
goto FAILED; /* the meta values come from a table above. */
case META_ATOMIC:
goto ATOMIC_GROUP;
goto ATOMIC_GROUP;
case META_LOOKAHEAD:
goto POSITIVE_LOOK_AHEAD;
case META_LOOKAHEADNOT:
goto NEGATIVE_LOOK_AHEAD;
case META_LOOKBEHIND:
case META_LOOKBEHINDNOT:
*parsed_pattern++ = meta;
ptr--;
goto LOOKBEHIND;
/* FIXME: Script Run stuff ... */
}
case META_LOOKBEHIND:
case META_LOOKBEHINDNOT:
*parsed_pattern++ = meta;
ptr--;
goto POST_LOOKBEHIND;
/* The script run facilities are handled here. Unicode support is
required (give an error if not, as this is a security issue). Always
record a META_SCRIPT_RUN item. Then, for the atomic version, insert
META_ATOMIC and remember that we need two META_KETs at the end. */
case META_SCRIPT_RUN:
case META_ATOMIC_SCRIPT_RUN:
#ifdef SUPPORT_UNICODE
*parsed_pattern++ = META_SCRIPT_RUN;
nest_depth++;
ptr++;
if (meta == META_ATOMIC_SCRIPT_RUN)
{
*parsed_pattern++ = META_ATOMIC;
if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
else if (++top_nest >= end_nests)
{
errorcode = ERR84;
goto FAILED;
}
top_nest->nest_depth = nest_depth;
top_nest->flags = NSF_ATOMICSR;
top_nest->options = options & PARSE_TRACKED_OPTIONS;
}
break;
#else /* SUPPORT_UNICODE */
errorcode = ERR96;
goto FAILED;
#endif
}
}
@ -4262,8 +4303,8 @@ while (ptr < ptrend)
}
*parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
META_LOOKBEHIND : META_LOOKBEHINDNOT;
LOOKBEHIND: /* Come from (*plb: and (*nlb: */
POST_LOOKBEHIND: /* Come from (*plb: and (*nlb: */
*has_lookbehind = TRUE;
offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
PUTOFFSET(offset, parsed_pattern);
@ -4425,6 +4466,14 @@ while (ptr < ptrend)
cb->bracount = top_nest->max_group;
if ((top_nest->flags & NSF_CONDASSERT) != 0)
okquantifier = FALSE;
if ((top_nest->flags & NSF_ATOMICSR) != 0)
{
*parsed_pattern++ = META_KET;
}
if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
else top_nest--;
}
@ -6142,6 +6191,10 @@ for (;; pptr++)
bravalue = OP_ONCE;
goto GROUP_PROCESS_NOTE_EMPTY;
case META_SCRIPT_RUN:
bravalue = OP_SCRIPT_RUN;
goto GROUP_PROCESS_NOTE_EMPTY;
case META_NOCAPTURE:
bravalue = OP_BRA;
/* Fall through */
@ -6777,6 +6830,7 @@ for (;; pptr++)
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
case OP_ONCE:
case OP_SCRIPT_RUN:
case OP_BRA:
case OP_CBRA:
case OP_COND:
@ -6989,16 +7043,16 @@ for (;; pptr++)
}
/* If the maximum is unlimited, set a repeater in the final copy. For
ONCE brackets, that's all we need to do. However, possessively repeated
ONCE brackets can be converted into non-capturing brackets, as the
behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
deal with possessive ONCEs specially.
SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
possessively repeated ONCE brackets can be converted into non-capturing
brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
saves having to deal with possessive ONCEs specially.
Otherwise, when we are doing the actual compile phase, check to see
whether this group is one that could match an empty string. If so,
convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
that runtime checking can be done. [This check is also applied to ONCE
groups at runtime, but in a different way.]
and SCRIPT_RUN groups at runtime, but in a different way.]
Then, if the quantifier was possessive and the bracket is not a
conditional, we convert the BRA code to the POS form, and the KET code to
@ -7022,13 +7076,14 @@ for (;; pptr++)
if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
/* For non-possessive ONCE brackets, all we need to do is to
set the KET. */
/* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
to do is to set the KET. */
if (*bracode == OP_ONCE) *ketcode = OP_KETRMAX + repeat_type;
if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
*ketcode = OP_KETRMAX + repeat_type;
/* Handle non-ONCE brackets and possessive ONCEs (which have been
converted to non-capturing above). */
/* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
(which have been converted to non-capturing above). */
else
{
@ -8385,6 +8440,7 @@ do {
case OP_SCBRAPOS:
case OP_ASSERT:
case OP_ONCE:
case OP_SCRIPT_RUN:
d = find_firstassertedcu(scode, &dflags, inassert + ((op==OP_ASSERT)?1:0));
if (dflags < 0)
return 0;

View File

@ -173,6 +173,7 @@ static const uint8_t coptable[] = {
0, /* Assert behind */
0, /* Assert behind not */
0, /* ONCE */
0, /* SCRIPT_RUN */
0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
0, 0, /* CREF, DNCREF */
@ -247,6 +248,7 @@ static const uint8_t poptable[] = {
0, /* Assert behind */
0, /* Assert behind not */
0, /* ONCE */
0, /* SCRIPT_RUN */
0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
0, 0, /* CREF, DNCREF */

View File

@ -183,6 +183,7 @@ static const unsigned char compile_error_texts[] =
"invalid hyphen in option setting\0"
/* 95 */
"(*alpha_assertion) not recognized\0"
"script runs require Unicode support, which this version of PCRE2 does not have\0"
;
/* Match-time and UTF error texts are in the same format. */

View File

@ -1513,70 +1513,71 @@ enum {
OP_ASSERTBACK, /* 128 Positive lookbehind */
OP_ASSERTBACK_NOT, /* 129 Negative lookbehind */
/* ONCE, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come immediately after the
assertions, with ONCE first, as there's a test for >= ONCE for a subpattern
that isn't an assertion. The POS versions must immediately follow the non-POS
versions in each case. */
/* ONCE, SCRIPT_RUN, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come
immediately after the assertions, with ONCE first, as there's a test for >=
ONCE for a subpattern that isn't an assertion. The POS versions must
immediately follow the non-POS versions in each case. */
OP_ONCE, /* 130 Atomic group, contains captures */
OP_BRA, /* 131 Start of non-capturing bracket */
OP_BRAPOS, /* 132 Ditto, with unlimited, possessive repeat */
OP_CBRA, /* 133 Start of capturing bracket */
OP_CBRAPOS, /* 134 Ditto, with unlimited, possessive repeat */
OP_COND, /* 135 Conditional group */
OP_SCRIPT_RUN, /* 131 Non-capture, but check characters' scripts */
OP_BRA, /* 132 Start of non-capturing bracket */
OP_BRAPOS, /* 133 Ditto, with unlimited, possessive repeat */
OP_CBRA, /* 134 Start of capturing bracket */
OP_CBRAPOS, /* 135 Ditto, with unlimited, possessive repeat */
OP_COND, /* 136 Conditional group */
/* These five must follow the previous five, in the same order. There's a
check for >= SBRA to distinguish the two sets. */
OP_SBRA, /* 136 Start of non-capturing bracket, check empty */
OP_SBRAPOS, /* 137 Ditto, with unlimited, possessive repeat */
OP_SCBRA, /* 138 Start of capturing bracket, check empty */
OP_SCBRAPOS, /* 139 Ditto, with unlimited, possessive repeat */
OP_SCOND, /* 140 Conditional group, check empty */
OP_SBRA, /* 137 Start of non-capturing bracket, check empty */
OP_SBRAPOS, /* 138 Ditto, with unlimited, possessive repeat */
OP_SCBRA, /* 139 Start of capturing bracket, check empty */
OP_SCBRAPOS, /* 140 Ditto, with unlimited, possessive repeat */
OP_SCOND, /* 141 Conditional group, check empty */
/* The next two pairs must (respectively) be kept together. */
OP_CREF, /* 141 Used to hold a capture number as condition */
OP_DNCREF, /* 142 Used to point to duplicate names as a condition */
OP_RREF, /* 143 Used to hold a recursion number as condition */
OP_DNRREF, /* 144 Used to point to duplicate names as a condition */
OP_FALSE, /* 145 Always false (used by DEFINE and VERSION) */
OP_TRUE, /* 146 Always true (used by VERSION) */
OP_CREF, /* 142 Used to hold a capture number as condition */
OP_DNCREF, /* 143 Used to point to duplicate names as a condition */
OP_RREF, /* 144 Used to hold a recursion number as condition */
OP_DNRREF, /* 145 Used to point to duplicate names as a condition */
OP_FALSE, /* 146 Always false (used by DEFINE and VERSION) */
OP_TRUE, /* 147 Always true (used by VERSION) */
OP_BRAZERO, /* 147 These two must remain together and in this */
OP_BRAMINZERO, /* 148 order. */
OP_BRAPOSZERO, /* 149 */
OP_BRAZERO, /* 148 These two must remain together and in this */
OP_BRAMINZERO, /* 149 order. */
OP_BRAPOSZERO, /* 150 */
/* These are backtracking control verbs */
OP_MARK, /* 150 always has an argument */
OP_PRUNE, /* 151 */
OP_PRUNE_ARG, /* 152 same, but with argument */
OP_SKIP, /* 153 */
OP_SKIP_ARG, /* 154 same, but with argument */
OP_THEN, /* 155 */
OP_THEN_ARG, /* 156 same, but with argument */
OP_COMMIT, /* 157 */
OP_COMMIT_ARG, /* 158 same, but with argument */
OP_MARK, /* 151 always has an argument */
OP_PRUNE, /* 152 */
OP_PRUNE_ARG, /* 153 same, but with argument */
OP_SKIP, /* 154 */
OP_SKIP_ARG, /* 155 same, but with argument */
OP_THEN, /* 156 */
OP_THEN_ARG, /* 157 same, but with argument */
OP_COMMIT, /* 158 */
OP_COMMIT_ARG, /* 159 same, but with argument */
/* These are forced failure and success verbs. FAIL and ACCEPT do accept an
argument, but these cases can be compiled as, for example, (*MARK:X)(*FAIL)
without the need for a special opcode. */
OP_FAIL, /* 159 */
OP_ACCEPT, /* 160 */
OP_ASSERT_ACCEPT, /* 161 Used inside assertions */
OP_CLOSE, /* 162 Used before OP_ACCEPT to close open captures */
OP_FAIL, /* 160 */
OP_ACCEPT, /* 161 */
OP_ASSERT_ACCEPT, /* 162 Used inside assertions */
OP_CLOSE, /* 163 Used before OP_ACCEPT to close open captures */
/* This is used to skip a subpattern with a {0} quantifier */
OP_SKIPZERO, /* 163 */
OP_SKIPZERO, /* 164 */
/* This is used to identify a DEFINE group during compilation so that it can
be checked for having only one branch. It is changed to OP_FALSE before
compilation finishes. */
OP_DEFINE, /* 164 */
OP_DEFINE, /* 165 */
/* This is not an opcode, but is used to check that tables indexed by opcode
are the correct length, in order to catch updating errors - there have been
@ -1624,6 +1625,7 @@ some cases doesn't actually use these names at all). */
"Alt", "Ket", "KetRmax", "KetRmin", "KetRpos", \
"Reverse", "Assert", "Assert not", "AssertB", "AssertB not", \
"Once", \
"Script run", \
"Bra", "BraPos", "CBra", "CBraPos", \
"Cond", \
"SBra", "SBraPos", "SCBra", "SCBraPos", \
@ -1707,6 +1709,7 @@ in UTF-8 mode. The code that uses this table must know about such things. */
1+LINK_SIZE, /* Assert behind */ \
1+LINK_SIZE, /* Assert behind not */ \
1+LINK_SIZE, /* ONCE */ \
1+LINK_SIZE, /* SCRIPT_RUN */ \
1+LINK_SIZE, /* BRA */ \
1+LINK_SIZE, /* BRAPOS */ \
1+LINK_SIZE+IMM2_SIZE, /* CBRA */ \
@ -1854,6 +1857,7 @@ extern const uint8_t PRIV(utf8_table4)[];
#define _pcre2_hspace_list PCRE2_SUFFIX(_pcre2_hspace_list_)
#define _pcre2_vspace_list PCRE2_SUFFIX(_pcre2_vspace_list_)
#define _pcre2_ucd_caseless_sets PCRE2_SUFFIX(_pcre2_ucd_caseless_sets_)
#define _pcre2_ucd_digit_sets PCRE2_SUFFIX(_pcre2_ucd_digit_sets_)
#define _pcre2_ucd_records PCRE2_SUFFIX(_pcre2_ucd_records_)
#define _pcre2_ucd_stage1 PCRE2_SUFFIX(_pcre2_ucd_stage1_)
#define _pcre2_ucd_stage2 PCRE2_SUFFIX(_pcre2_ucd_stage2_)
@ -1875,6 +1879,7 @@ extern const uint8_t PRIV(default_tables)[];
extern const uint32_t PRIV(hspace_list)[];
extern const uint32_t PRIV(vspace_list)[];
extern const uint32_t PRIV(ucd_caseless_sets)[];
extern const uint32_t PRIV(ucd_digit_sets)[];
extern const ucd_record PRIV(ucd_records)[];
#if PCRE2_CODE_UNIT_WIDTH == 32
extern const ucd_record PRIV(dummy_ucd_record)[];
@ -1922,6 +1927,7 @@ is available. */
#define _pcre2_jit_get_target PCRE2_SUFFIX(_pcre2_jit_get_target_)
#define _pcre2_memctl_malloc PCRE2_SUFFIX(_pcre2_memctl_malloc_)
#define _pcre2_ord2utf PCRE2_SUFFIX(_pcre2_ord2utf_)
#define _pcre2_script_run PCRE2_SUFFIX(_pcre2_script_run_)
#define _pcre2_strcmp PCRE2_SUFFIX(_pcre2_strcmp_)
#define _pcre2_strcmp_c8 PCRE2_SUFFIX(_pcre2_strcmp_c8_)
#define _pcre2_strcpy_c8 PCRE2_SUFFIX(_pcre2_strcpy_c8_)
@ -1948,6 +1954,7 @@ extern size_t _pcre2_jit_get_size(void *);
const char * _pcre2_jit_get_target(void);
extern void * _pcre2_memctl_malloc(size_t, pcre2_memctl *);
extern unsigned int _pcre2_ord2utf(uint32_t, PCRE2_UCHAR *);
extern BOOL _pcre2_script_run(PCRE2_SPTR, PCRE2_SPTR, BOOL);
extern int _pcre2_strcmp(PCRE2_SPTR, PCRE2_SPTR);
extern int _pcre2_strcmp_c8(PCRE2_SPTR, const char *);
extern PCRE2_SIZE _pcre2_strcpy_c8(PCRE2_UCHAR *, const char *);

View File

@ -5014,6 +5014,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
must record a backtracking point and also set up a chained frame. */
case OP_ONCE:
case OP_SCRIPT_RUN:
case OP_SBRA:
Lframe_type = GF_NOCAPTURE | Fop;
@ -5525,6 +5526,14 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
case OP_ASSERT_NOT:
case OP_ASSERTBACK_NOT:
RRETURN(MATCH_MATCH);
/* At the end of a script run, apply the script-checking rules. This code
will never be exercised if Unicode support is not compiled, because in
that environment script runs cause an error at compile time. */
case OP_SCRIPT_RUN:
if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH);
break;
/* Whole-pattern recursion is coded as a recurse into group 0, so it
won't be picked up here. Instead, we catch it when the OP_END is reached.

View File

@ -393,6 +393,7 @@ for(;;)
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
case OP_ONCE:
case OP_SCRIPT_RUN:
case OP_COND:
case OP_SCOND:
case OP_REVERSE:

228
src/pcre2_script_run.c Normal file
View File

@ -0,0 +1,228 @@
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2018 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* This module contains the function for checking a script run. */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "pcre2_internal.h"
/*************************************************
* Check script run *
*************************************************/
/* A script run is conceptually a sequence of characters all in the same
Unicode script. However, it isn't quite that simple. There are special rules
for scripts that are commonly used together, and also special rules for digits.
This function implements the appropriate checks, which is possible only when
PCRE2 is compiled with Unicode support. The function returns TRUE if there is
no Unicode support; however, it should never be called in that circumstance
because an error is given by pcre2_compile() if a script run is called for in a
version of PCRE2 compiled without Unicode support.
Arguments:
ptr points to the first character
endptr points after the last character
utf TRUE if in UTF mode
Returns: TRUE if this is a valid script run
*/
#define SCRIPT_UNSET (-1)
#define SCRIPT_HANPENDING (-2)
#define SCRIPT_HANHIRAKATA (-3)
#define SCRIPT_HANBOPOMOFO (-4)
#define SCRIPT_HANHANGUL (-5)
BOOL
PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
{
#ifdef SUPPORT_UNICODE
uint32_t c;
uint32_t wanted_digitset = 0;      /* 0 means no decimal digit seen so far */
int wanted_script = SCRIPT_UNSET;  /* Negative values are the special states */

#if PCRE2_CODE_UNIT_WIDTH == 32
(void)utf;    /* Avoid compiler warning */
#endif

/* Strings of zero or one characters are always valid script runs, so return
at once unless there are at least two characters to examine. */

if (ptr >= endptr) return TRUE;
GETCHARINCTEST(c, ptr);
if (ptr >= endptr) return TRUE;

/* Examine each character in turn. On entry to each iteration, c holds the
character to be checked and ptr points past it. */

for (;;)
  {
  const ucd_record *rec = GET_UCD(c);
  uint32_t this_script = rec->script;

  /* A character whose script is Unknown can never be part of a script run
  that is longer than one character. */

  if (this_script == ucp_Unknown) return FALSE;

  /* Inherited characters are accepted unconditionally and take no part in
  either the script check or the digit check. */

  if (this_script != ucp_Inherited)
    {
    /* Common characters are accepted by the script check, but they still have
    to go through the digit check below. Any other script must be compatible
    with what has been seen so far. Han may be combined with Hiragana and
    Katakana, or with Bopomofo, or with Hangul; each combination has its own
    state. While only Han has been seen, the state is SCRIPT_HANPENDING,
    because which combination applies is not yet known. */

    if (this_script != ucp_Common)
      {
      if (wanted_script == SCRIPT_UNSET || wanted_script == SCRIPT_HANPENDING)
        {
        if (this_script == ucp_Han)
          wanted_script = SCRIPT_HANPENDING;
        else if (this_script == ucp_Hiragana || this_script == ucp_Katakana)
          wanted_script = SCRIPT_HANHIRAKATA;
        else if (this_script == ucp_Bopomofo)
          wanted_script = SCRIPT_HANBOPOMOFO;
        else if (this_script == ucp_Hangul)
          wanted_script = SCRIPT_HANHANGUL;

        /* Some other script: not allowed after Han, otherwise it becomes the
        one required script for the rest of the string. */

        else if (wanted_script == SCRIPT_HANPENDING)
          return FALSE;
        else
          wanted_script = (int)this_script;
        }

      else if (wanted_script == SCRIPT_HANHIRAKATA)
        {
        if (!(this_script == ucp_Han || this_script == ucp_Hiragana ||
              this_script == ucp_Katakana))
          return FALSE;
        }

      else if (wanted_script == SCRIPT_HANBOPOMOFO)
        {
        if (!(this_script == ucp_Han || this_script == ucp_Bopomofo))
          return FALSE;
        }

      else if (wanted_script == SCRIPT_HANHANGUL)
        {
        if (!(this_script == ucp_Han || this_script == ucp_Hangul))
          return FALSE;
        }

      /* An ordinary script has already been fixed; it must be repeated. */

      else if (this_script != (unsigned int)wanted_script)
        return FALSE;
      }

    /* The script is acceptable. All decimal digits in the string must come
    from one set of ten, even when a script has several sets. The first element
    of PRIV(ucd_digit_sets)[] is the number of elements that follow; those are
    the code points of the '9' of each digit set, in ascending order. A set is
    identified by the offset of its '9' in the vector. The first set is tested
    directly, which deals with ASCII digits quickly; the others are found by
    binary chop. */

    if (rec->chartype == ucp_Nd)
      {
      uint32_t this_digitset;

      if (c <= PRIV(ucd_digit_sets)[1])
        this_digitset = 1;
      else
        {
        int lo = 1;
        int hi = PRIV(ucd_digit_sets)[0];

        /* Narrow the range until hi is next to lo (or, out of paranoia, has
        passed it); hi then identifies the set. */

        while (hi > lo + 1)
          {
          int mid = (hi + lo) / 2;
          if (c <= PRIV(ucd_digit_sets)[mid]) hi = mid; else lo = mid;
          }

        this_digitset = hi;
        }

      /* The first digit fixes the set; later digits must agree with it. */

      if (wanted_digitset == 0) wanted_digitset = this_digitset;
      else if (this_digitset != wanted_digitset) return FALSE;
      }   /* End digit handling */
    }     /* End checking non-Inherited character */

  /* Stop when the end of the string is reached; otherwise pick up the next
  character and go round again. */

  if (ptr >= endptr) break;
  GETCHARINCTEST(c, ptr);
  }       /* End checking loop */

#else   /* NOT SUPPORT_UNICODE */
(void)ptr;
(void)endptr;
(void)utf;
#endif  /* SUPPORT_UNICODE */

/* Either every character passed the checks, or there is no Unicode support. */

return TRUE;
}
/* End of pcre2_script_run.c */

View File

@ -171,6 +171,7 @@ for (;;)
/* Fall through */
case OP_ONCE:
case OP_SCRIPT_RUN:
case OP_SBRA:
case OP_BRAPOS:
case OP_SBRAPOS:
@ -1075,6 +1076,7 @@ do
case OP_CBRAPOS:
case OP_SCBRAPOS:
case OP_ONCE:
case OP_SCRIPT_RUN:
case OP_ASSERT:
rc = set_start_bits(re, tcode, utf);
if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;

View File

@ -417,6 +417,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Tifinagh0 STR_T STR_i STR_f STR_i STR_n STR_a STR_g STR_h "\0"
#define STRING_Tirhuta0 STR_T STR_i STR_r STR_h STR_u STR_t STR_a "\0"
#define STRING_Ugaritic0 STR_U STR_g STR_a STR_r STR_i STR_t STR_i STR_c "\0"
#define STRING_Unknown0 STR_U STR_n STR_k STR_n STR_o STR_w STR_n "\0"
#define STRING_Vai0 STR_V STR_a STR_i "\0"
#define STRING_Warang_Citi0 STR_W STR_a STR_r STR_a STR_n STR_g STR_UNDERSCORE STR_C STR_i STR_t STR_i "\0"
#define STRING_Xan0 STR_X STR_a STR_n "\0"
@ -611,6 +612,7 @@ const char PRIV(utt_names)[] =
STRING_Tifinagh0
STRING_Tirhuta0
STRING_Ugaritic0
STRING_Unknown0
STRING_Vai0
STRING_Warang_Citi0
STRING_Xan0
@ -805,19 +807,20 @@ const ucp_type_table PRIV(utt)[] = {
{ 1424, PT_SC, ucp_Tifinagh },
{ 1433, PT_SC, ucp_Tirhuta },
{ 1441, PT_SC, ucp_Ugaritic },
{ 1450, PT_SC, ucp_Vai },
{ 1454, PT_SC, ucp_Warang_Citi },
{ 1466, PT_ALNUM, 0 },
{ 1470, PT_PXSPACE, 0 },
{ 1474, PT_SPACE, 0 },
{ 1478, PT_UCNC, 0 },
{ 1482, PT_WORD, 0 },
{ 1486, PT_SC, ucp_Yi },
{ 1489, PT_GC, ucp_Z },
{ 1491, PT_SC, ucp_Zanabazar_Square },
{ 1508, PT_PC, ucp_Zl },
{ 1511, PT_PC, ucp_Zp },
{ 1514, PT_PC, ucp_Zs }
{ 1450, PT_SC, ucp_Unknown },
{ 1458, PT_SC, ucp_Vai },
{ 1462, PT_SC, ucp_Warang_Citi },
{ 1474, PT_ALNUM, 0 },
{ 1478, PT_PXSPACE, 0 },
{ 1482, PT_SPACE, 0 },
{ 1486, PT_UCNC, 0 },
{ 1490, PT_WORD, 0 },
{ 1494, PT_SC, ucp_Yi },
{ 1497, PT_GC, ucp_Z },
{ 1499, PT_SC, ucp_Zanabazar_Square },
{ 1516, PT_PC, ucp_Zl },
{ 1519, PT_PC, ucp_Zp },
{ 1522, PT_PC, ucp_Zs }
};
const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);

File diff suppressed because it is too large Load Diff

View File

@ -124,6 +124,7 @@ enum {
/* These are the script identifications. */
enum {
ucp_Unknown,
ucp_Arabic,
ucp_Armenian,
ucp_Bengali,

11
testdata/testinput12 vendored
View File

@ -386,5 +386,16 @@
/(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout
123abcáyzabcdef789abcሴqr
# A few script run tests in non-UTF mode (but they need Unicode support)
/^(*script_run:.{4})/
\x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han
\x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han
\x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul
/^(*sr:.*)/utf,allow_surrogate_escapes
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
\x{d800}\x{dfff} Surrogates (Unknown) \=no_utf_check
# End of testinput12

91
testdata/testinput4 vendored
View File

@ -2317,5 +2317,96 @@
/[^\x{100}-\x{ffff}]*[\x80-\xff]/i,utf
\x{99}\x{99}\x{99}
# Script run tests
/^(*script_run:.{4})/utf
abcd Latin x4
\x{2e80}\x{2fa1d}\x{3041}\x{30a1} Han Han Hiragana Katakana
\x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han
\x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han
\x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul
\x{2e80}\x{3105}\x{2e80}\x{3105} Han Bopomofo Han Bopomofo
\x{02ea}\x{2e80}\x{2e80}\x{3105} Bopomofo-Sk Han Han Bopomofo
\x{3105}\x{2e80}\x{2e80}\x{3105} Bopomofo Han Han Bopomofo
\x{0300}cd! Inherited Latin Latin Common
\x{0391}12\x{03a9} Greek Common-digits Greek
\x{0400}12\x{fe2f} Cyrillic Common-digits Cyrillic
\x{0531}12\x{fb17} Armenian Common-digits Armenian
\x{0591}12\x{fb4f} Hebrew Common-digits Hebrew
\x{0600}12\x{1eef1} Arabic Common-digits Arabic
\x{0600}\x{0660}\x{0669}\x{1eef1} Arabic Arabic-digits Arabic
\x{0700}12\x{086a} Syriac Common-digits Syriac
\x{1200}12\x{ab2e} Ethiopic Common-digits Ethiopic
\x{1680}12\x{169c} Ogham Common-digits Ogham
\x{3041}12\x{3041} Hiragana Common-digits Hiragana
\x{0980}\x{09e6}\x{09e7}\x{0993} Bengali Bengali-digits Bengali
!cde Common Latin Latin Latin
A..B Latin Common Common Latin
0abc Ascii-digit Latin Latin Latin
1\x{0700}\x{0700}\x{0700} Ascii-digit Syriac x 3
\x{1A80}\x{1A80}\x{1a40}\x{1a41} Tai Tham Hora digits, letters
\= Expect no match
a\x{370}bcd Latin Greek Latin Latin
\x{1100}\x{02ea}\x{02ea}\x{02ea} Hangul Bopomofo x3
\x{02ea}\x{02ea}\x{02ea}\x{1100} Bopomofo x3 Hangul
\x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
\x{0391}\x{09e6}\x{09e7}\x{03a9} Greek Bengali digits Greek
\x{0600}7\x{0669}\x{1eef1} Arabic ascii-digit Arabic-digit Arabic
\x{0600}\x{0669}7\x{1eef1} Arabic Arabic-digit ascii-digit Arabic
A5\x{ff19}B Latin Common-ascii/notascii-digits Latin
\x{0300}cd\x{0391} Inherited Latin Latin Greek
!cd\x{0391} Common Latin Latin Greek
\x{1A80}\x{1A90}\x{1a40}\x{1a41} Tai Tham Hora digit, Tham digit, letters
A\x{1d7ce}\x{1d7ff}B Common fancy-common-2-sets-digits Common
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
/^(*sr:.{4}|..)/utf
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
/^(*atomic_script_run:.{4}|..)/utf
\= Expect no match
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
/^(*asr:.*)/utf
\= Expect no match
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
/^(?>(*sr:.*))/utf
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
/^(*sr:.*)/utf
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
\x{10fffd}\x{10fffd}\x{10fffd} Private use (Unknown)
/^(*sr:\x{2e80}*)/utf
\x{2e80}\x{2e80}\x{3105} Han Han Bopomofo
/^(*sr:\x{2e80}*)\x{2e80}/utf
\x{2e80}\x{2e80}\x{3105} Han Han Bopomofo
/^(*sr:.*)Test/utf
Test script run on an empty string
/^(*sr:(.{2})){2}/utf
\x{0600}7\x{0669}\x{1eef1} Arabic ascii-digit Arabic-digit Arabic
\x{1A80}\x{1A80}\x{1a40}\x{1a41} Tai Tham Hora digits, letters
\x{1A80}\x{1a40}\x{1A90}\x{1a41} Tai Tham Hora digit, letter, Tham digit, letter
\= Expect no match
\x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
# Test loop breaking for empty string match
/^(*sr:A|)*BCD/utf
AABCD
ABCD
BCD
# The use of (*ACCEPT) breaks script run checking
/^(*sr:.*(*ACCEPT)ZZ)/utf
\x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
# -------
# End of testinput4

23
testdata/testinput5 vendored
View File

@ -2106,5 +2106,28 @@
/(*: ABC)abc/x,utf,mark,alt_verbnames
abc
# Script run tests: auto-possessification
/^(*sr:.*)/B,utf
paypаl.com A classic example of why script run checks are a good thing
/^(*sr:\x{2e80}*)/B,utf
/^(*sr:\x{2e80}*)\x{2e80}/B,utf
# Some script run patterns are broken in Perl 5.28.0. These can be moved into
# test 4 when a mended version of Perl is released.
/^(*sr:.{4})/utf
\x{0980}12\x{0993} Bengali Common-digits Bengali
\x{0780}12\x{07b1} Thaana Common-digits Thaana
\x{0e01}12\x{0e5b} Thai Common-digits Thai
\x{1780}12\x{19ff} Khmer Common-digits Khmer
\x{0904}12\x{0939} Devanagari Common-digits Devanagari
A\x{ff10}\x{ff19}B Latin Common-notascii-digits Latin
A\x{1d7ce}\x{1d7cf}B Latin fancy-common-digits Latin
# -------
# End of testinput5

View File

@ -1480,5 +1480,20 @@ Old 12 12 New 14 16
Old 12 15 New 16 21
Old 21 21 New 27 29
4: 123abc<>\x{e1}yzabc<><def>789abc<>\x{1234}qr
# A few script run tests in non-UTF mode (but they need Unicode support)
/^(*script_run:.{4})/
\x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han
0: \x{3041}\x{30a1}\x{3007}\x{3007}
\x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han
0: \x{30a1}\x{3041}\x{3007}\x{3007}
\x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul
0: \x{1100}\x{2e80}\x{2e80}\x{1101}
/^(*sr:.*)/utf,allow_surrogate_escapes
Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
\x{d800}\x{dfff} Surrogates (Unknown) \=no_utf_check
# End of testinput12

View File

@ -1477,5 +1477,21 @@ Old 12 12 New 14 16
Old 12 15 New 16 21
Old 21 21 New 27 29
4: 123abc<>\x{e1}yzabc<><def>789abc<>\x{1234}qr
# A few script run tests in non-UTF mode (but they need Unicode support)
/^(*script_run:.{4})/
\x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han
0: \x{3041}\x{30a1}\x{3007}\x{3007}
\x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han
0: \x{30a1}\x{3041}\x{3007}\x{3007}
\x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul
0: \x{1100}\x{2e80}\x{2e80}\x{1101}
/^(*sr:.*)/utf,allow_surrogate_escapes
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
0: \x{2e80}\x{3105}\x{2e80}
\x{d800}\x{dfff} Surrogates (Unknown) \=no_utf_check
0: \x{d800}
# End of testinput12

149
testdata/testoutput4 vendored
View File

@ -3741,5 +3741,154 @@ No match
/[^\x{100}-\x{ffff}]*[\x80-\xff]/i,utf
\x{99}\x{99}\x{99}
0: \x{99}\x{99}\x{99}
# Script run tests
/^(*script_run:.{4})/utf
abcd Latin x4
0: abcd
\x{2e80}\x{2fa1d}\x{3041}\x{30a1} Han Han Hiragana Katakana
0: \x{2e80}\x{2fa1d}\x{3041}\x{30a1}
\x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han
0: \x{3041}\x{30a1}\x{3007}\x{3007}
\x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han
0: \x{30a1}\x{3041}\x{3007}\x{3007}
\x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul
0: \x{1100}\x{2e80}\x{2e80}\x{1101}
\x{2e80}\x{3105}\x{2e80}\x{3105} Han Bopomofo Han Bopomofo
0: \x{2e80}\x{3105}\x{2e80}\x{3105}
\x{02ea}\x{2e80}\x{2e80}\x{3105} Bopomofo-Sk Han Han Bopomofo
0: \x{2ea}\x{2e80}\x{2e80}\x{3105}
\x{3105}\x{2e80}\x{2e80}\x{3105} Bopomofo Han Han Bopomofo
0: \x{3105}\x{2e80}\x{2e80}\x{3105}
\x{0300}cd! Inherited Latin Latin Common
0: \x{300}cd!
\x{0391}12\x{03a9} Greek Common-digits Greek
0: \x{391}12\x{3a9}
\x{0400}12\x{fe2f} Cyrillic Common-digits Cyrillic
0: \x{400}12\x{fe2f}
\x{0531}12\x{fb17} Armenian Common-digits Armenian
0: \x{531}12\x{fb17}
\x{0591}12\x{fb4f} Hebrew Common-digits Hebrew
0: \x{591}12\x{fb4f}
\x{0600}12\x{1eef1} Arabic Common-digits Arabic
0: \x{600}12\x{1eef1}
\x{0600}\x{0660}\x{0669}\x{1eef1} Arabic Arabic-digits Arabic
0: \x{600}\x{660}\x{669}\x{1eef1}
\x{0700}12\x{086a} Syriac Common-digits Syriac
0: \x{700}12\x{86a}
\x{1200}12\x{ab2e} Ethiopic Common-digits Ethiopic
0: \x{1200}12\x{ab2e}
\x{1680}12\x{169c} Ogham Common-digits Ogham
0: \x{1680}12\x{169c}
\x{3041}12\x{3041} Hiragana Common-digits Hiragana
0: \x{3041}12\x{3041}
\x{0980}\x{09e6}\x{09e7}\x{0993} Bengali Bengali-digits Bengali
0: \x{980}\x{9e6}\x{9e7}\x{993}
!cde Common Latin Latin Latin
0: !cde
A..B Latin Common Common Latin
0: A..B
0abc Ascii-digit Latin Latin Latin
0: 0abc
1\x{0700}\x{0700}\x{0700} Ascii-digit Syriac x 3
0: 1\x{700}\x{700}\x{700}
\x{1A80}\x{1A80}\x{1a40}\x{1a41} Tai Tham Hora digits, letters
0: \x{1a80}\x{1a80}\x{1a40}\x{1a41}
\= Expect no match
a\x{370}bcd Latin Greek Latin Latin
No match
\x{1100}\x{02ea}\x{02ea}\x{02ea} Hangul Bopomofo x3
No match
\x{02ea}\x{02ea}\x{02ea}\x{1100} Bopomofo x3 Hangul
No match
\x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
No match
\x{0391}\x{09e6}\x{09e7}\x{03a9} Greek Bengali digits Greek
No match
\x{0600}7\x{0669}\x{1eef1} Arabic ascii-digit Arabic-digit Arabic
No match
\x{0600}\x{0669}7\x{1eef1} Arabic Arabic-digit ascii-digit Arabic
No match
A5\x{ff19}B Latin Common-ascii/notascii-digits Latin
No match
\x{0300}cd\x{0391} Inherited Latin Latin Greek
No match
!cd\x{0391} Common Latin Latin Greek
No match
\x{1A80}\x{1A90}\x{1a40}\x{1a41} Tai Tham Hora digit, Tham digit, letters
No match
A\x{1d7ce}\x{1d7ff}B Common fancy-common-2-sets-digits Common
No match
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
No match
/^(*sr:.{4}|..)/utf
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
0: \x{2e80}\x{3105}
/^(*atomic_script_run:.{4}|..)/utf
\= Expect no match
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
No match
/^(*asr:.*)/utf
\= Expect no match
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
No match
/^(?>(*sr:.*))/utf
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
0: \x{2e80}\x{3105}\x{2e80}
/^(*sr:.*)/utf
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
0: \x{2e80}\x{3105}\x{2e80}
\x{10fffd}\x{10fffd}\x{10fffd} Private use (Unknown)
0: \x{10fffd}
/^(*sr:\x{2e80}*)/utf
\x{2e80}\x{2e80}\x{3105} Han Han Bopomofo
0: \x{2e80}\x{2e80}
/^(*sr:\x{2e80}*)\x{2e80}/utf
\x{2e80}\x{2e80}\x{3105} Han Han Bopomofo
0: \x{2e80}\x{2e80}
/^(*sr:.*)Test/utf
Test script run on an empty string
0: Test
/^(*sr:(.{2})){2}/utf
\x{0600}7\x{0669}\x{1eef1} Arabic ascii-digit Arabic-digit Arabic
0: \x{600}7\x{669}\x{1eef1}
1: \x{669}\x{1eef1}
\x{1A80}\x{1A80}\x{1a40}\x{1a41} Tai Tham Hora digits, letters
0: \x{1a80}\x{1a80}\x{1a40}\x{1a41}
1: \x{1a40}\x{1a41}
\x{1A80}\x{1a40}\x{1A90}\x{1a41} Tai Tham Hora digit, letter, Tham digit, letter
0: \x{1a80}\x{1a40}\x{1a90}\x{1a41}
1: \x{1a90}\x{1a41}
\= Expect no match
\x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
No match
# Test loop breaking for empty string match
/^(*sr:A|)*BCD/utf
AABCD
0: AABCD
ABCD
0: ABCD
BCD
0: BCD
# The use of (*ACCEPT) breaks script run checking
/^(*sr:.*(*ACCEPT)ZZ)/utf
\x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
0: \x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
# -------
# End of testinput4

59
testdata/testoutput5 vendored
View File

@ -4775,5 +4775,64 @@ Failed: error 137 at offset 2: PCRE2 does not support \F, \L, \l, \N{name}, \U,
abc
0: abc
MK: ABC
# Script run tests: auto-possessification
/^(*sr:.*)/B,utf
------------------------------------------------------------------
Bra
^
Script run
Any*
Ket
Ket
End
------------------------------------------------------------------
paypаl.com A classic example of why script run checks are a good thing
0: payp
/^(*sr:\x{2e80}*)/B,utf
------------------------------------------------------------------
Bra
^
Script run
\x{2e80}*+
Ket
Ket
End
------------------------------------------------------------------
/^(*sr:\x{2e80}*)\x{2e80}/B,utf
------------------------------------------------------------------
Bra
^
Script run
\x{2e80}*
Ket
\x{2e80}
Ket
End
------------------------------------------------------------------
# Some script run patterns are broken in Perl 5.28.0. These can be moved into
# test 4 when a mended version of Perl is released.
/^(*sr:.{4})/utf
\x{0980}12\x{0993} Bengali Common-digits Bengali
0: \x{980}12\x{993}
\x{0780}12\x{07b1} Thaana Common-digits Thaana
0: \x{780}12\x{7b1}
\x{0e01}12\x{0e5b} Thai Common-digits Thai
0: \x{e01}12\x{e5b}
\x{1780}12\x{19ff} Khmer Common-digits Khmer
0: \x{1780}12\x{19ff}
\x{0904}12\x{0939} Devanagari Common-digits Devanagari
0: \x{904}12\x{939}
A\x{ff10}\x{ff19}B Latin Common-notascii-digits Latin
0: A\x{ff10}\x{ff19}B
A\x{1d7ce}\x{1d7cf}B Latin fancy-common-digits Latin
0: A\x{1d7ce}\x{1d7cf}B
# -------
# End of testinput5