Unicode upper/lower casing is now used when UCP is set, even if UTF is not set.

This is not yet documented, and it is not yet implemented in JIT.
This commit is contained in:
Philip.Hazel 2020-02-23 16:40:05 +00:00
parent d0666136c9
commit 4a7dfab0ec
18 changed files with 893 additions and 125 deletions

View File

@ -66,6 +66,11 @@ recurse function in JIT.
17. Fix a crash which occurs when the character type of an invalid UTF 17. Fix a crash which occurs when the character type of an invalid UTF
character is decoded in JIT. character is decoded in JIT.
18. Changes in many areas of the code so that when Unicode is supported and
PCRE2_UCP is set without PCRE2_UTF, Unicode character properties are used for
upper/lower case computations on characters whose code points are greater than
127. Documentation is not yet updated. JIT is not yet updated.
Version 10.34 21-November-2019 Version 10.34 21-November-2019
------------------------------ ------------------------------

View File

@ -28,8 +28,6 @@
# The -v option causes a call to 'pcre2test -C' to happen for each # The -v option causes a call to 'pcre2test -C' to happen for each
# configuration. # configuration.
# Currently -fsanitize=undefined is not working (locks machine).
useasan=1 useasan=1
useusan=1 useusan=1
usedebug=1 usedebug=1

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2019 University of Cambridge New API code Copyright (c) 2016-2020 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -292,6 +292,7 @@ possessification, and if so, fills a list with its properties.
Arguments: Arguments:
code points to start of expression code points to start of expression
utf TRUE if in UTF mode utf TRUE if in UTF mode
ucp TRUE if in UCP mode
fcc points to the case-flipping table fcc points to the case-flipping table
list points to output list list points to output list
list[0] will be filled with the opcode list[0] will be filled with the opcode
@ -304,7 +305,7 @@ Returns: points to the start of the next opcode if *code is accepted
*/ */
static PCRE2_SPTR static PCRE2_SPTR
get_chr_property_list(PCRE2_SPTR code, BOOL utf, const uint8_t *fcc, get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc,
uint32_t *list) uint32_t *list)
{ {
PCRE2_UCHAR c = *code; PCRE2_UCHAR c = *code;
@ -316,7 +317,8 @@ uint32_t chr;
uint32_t *clist_dest; uint32_t *clist_dest;
const uint32_t *clist_src; const uint32_t *clist_src;
#else #else
(void)utf; /* Suppress "unused parameter" compiler warning */ (void)utf; /* Suppress "unused parameter" compiler warnings */
(void)ucp;
#endif #endif
list[0] = c; list[0] = c;
@ -396,7 +398,7 @@ switch(c)
list[2] = chr; list[2] = chr;
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (chr < 128 || (chr < 256 && !utf)) if (chr < 128 || (chr < 256 && !utf && !ucp))
list[3] = fcc[chr]; list[3] = fcc[chr];
else else
list[3] = UCD_OTHERCASE(chr); list[3] = UCD_OTHERCASE(chr);
@ -503,6 +505,7 @@ which case the base cannot be possessified.
Arguments: Arguments:
code points to the byte code code points to the byte code
utf TRUE in UTF mode utf TRUE in UTF mode
ucp TRUE in UCP mode
cb compile data block cb compile data block
base_list the data list of the base opcode base_list the data list of the base opcode
base_end the end of the base opcode base_end the end of the base opcode
@ -512,7 +515,7 @@ Returns: TRUE if the auto-possessification is possible
*/ */
static BOOL static BOOL
compare_opcodes(PCRE2_SPTR code, BOOL utf, const compile_block *cb, compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb,
const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit) const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)
{ {
PCRE2_UCHAR c; PCRE2_UCHAR c;
@ -651,7 +654,7 @@ for(;;)
while (*next_code == OP_ALT) while (*next_code == OP_ALT)
{ {
if (!compare_opcodes(code, utf, cb, base_list, base_end, rec_limit)) if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit))
return FALSE; return FALSE;
code = next_code + 1 + LINK_SIZE; code = next_code + 1 + LINK_SIZE;
next_code += GET(next_code, 1); next_code += GET(next_code, 1);
@ -672,7 +675,8 @@ for(;;)
/* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */ /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */
next_code += 1 + LINK_SIZE; next_code += 1 + LINK_SIZE;
if (!compare_opcodes(next_code, utf, cb, base_list, base_end, rec_limit)) if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end,
rec_limit))
return FALSE; return FALSE;
code += PRIV(OP_lengths)[c]; code += PRIV(OP_lengths)[c];
@ -688,7 +692,7 @@ for(;;)
/* We now have the next appropriate opcode to compare with the base. Check /* We now have the next appropriate opcode to compare with the base. Check
for a supported opcode, and load its properties. */ for a supported opcode, and load its properties. */
code = get_chr_property_list(code, utf, cb->fcc, list); code = get_chr_property_list(code, utf, ucp, cb->fcc, list);
if (code == NULL) return FALSE; /* Unsupported */ if (code == NULL) return FALSE; /* Unsupported */
/* If either opcode is a small character list, set pointers for comparing /* If either opcode is a small character list, set pointers for comparing
@ -1100,7 +1104,6 @@ leaving the remainder of the pattern unpossessified.
Arguments: Arguments:
code points to start of the byte code code points to start of the byte code
utf TRUE in UTF mode
cb compile data block cb compile data block
Returns: 0 for success Returns: 0 for success
@ -1108,13 +1111,15 @@ Returns: 0 for success
*/ */
int int
PRIV(auto_possessify)(PCRE2_UCHAR *code, BOOL utf, const compile_block *cb) PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb)
{ {
PCRE2_UCHAR c; PCRE2_UCHAR c;
PCRE2_SPTR end; PCRE2_SPTR end;
PCRE2_UCHAR *repeat_opcode; PCRE2_UCHAR *repeat_opcode;
uint32_t list[8]; uint32_t list[8];
int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */ int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */
BOOL utf = (cb->external_options & PCRE2_UTF) != 0;
BOOL ucp = (cb->external_options & PCRE2_UCP) != 0;
for (;;) for (;;)
{ {
@ -1126,10 +1131,11 @@ for (;;)
{ {
c -= get_repeat_base(c) - OP_STAR; c -= get_repeat_base(c) - OP_STAR;
end = (c <= OP_MINUPTO) ? end = (c <= OP_MINUPTO) ?
get_chr_property_list(code, utf, cb->fcc, list) : NULL; get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL;
list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO; list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
if (end != NULL && compare_opcodes(end, utf, cb, list, end, &rec_limit)) if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end,
&rec_limit))
{ {
switch(c) switch(c)
{ {
@ -1181,11 +1187,11 @@ for (;;)
if (c >= OP_CRSTAR && c <= OP_CRMINRANGE) if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
{ {
/* end must not be NULL. */ /* end must not be NULL. */
end = get_chr_property_list(code, utf, cb->fcc, list); end = get_chr_property_list(code, utf, ucp, cb->fcc, list);
list[1] = (c & 1) == 0; list[1] = (c & 1) == 0;
if (compare_opcodes(end, utf, cb, list, end, &rec_limit)) if (compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit))
{ {
switch (c) switch (c)
{ {

View File

@ -4904,7 +4904,7 @@ range. */
if ((options & PCRE2_CASELESS) != 0) if ((options & PCRE2_CASELESS) != 0)
{ {
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if ((options & PCRE2_UTF) != 0) if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
{ {
int rc; int rc;
uint32_t oc, od; uint32_t oc, od;
@ -5319,7 +5319,8 @@ dynamically as we process the pattern. */
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
BOOL utf = (options & PCRE2_UTF) != 0; BOOL utf = (options & PCRE2_UTF) != 0;
#else /* No UTF support */ BOOL ucp = (options & PCRE2_UCP) != 0;
#else /* No Unicode support */
BOOL utf = FALSE; BOOL utf = FALSE;
#endif #endif
@ -5602,7 +5603,7 @@ for (;; pptr++)
uint32_t d; uint32_t d;
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf && c > 127) d = UCD_OTHERCASE(c); else if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
#endif #endif
{ {
#if PCRE2_CODE_UNIT_WIDTH != 8 #if PCRE2_CODE_UNIT_WIDTH != 8
@ -9632,6 +9633,7 @@ pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext) int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
{ {
BOOL utf; /* Set TRUE for UTF mode */ BOOL utf; /* Set TRUE for UTF mode */
BOOL ucp; /* Set TRUE for UCP mode */
BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */ BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */
BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */ BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */
pcre2_real_code *re = NULL; /* What we will return */ pcre2_real_code *re = NULL; /* What we will return */
@ -9919,8 +9921,8 @@ if (utf)
/* Check UCP lockout. */ /* Check UCP lockout. */
if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) == ucp = (cb.external_options & PCRE2_UCP) != 0;
(PCRE2_UCP|PCRE2_NEVER_UCP)) if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
{ {
errorcode = ERR75; errorcode = ERR75;
goto HAD_EARLY_ERROR; goto HAD_EARLY_ERROR;
@ -10296,7 +10298,7 @@ function call. */
if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0) if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
{ {
PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart; PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80; if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
} }
/* Failed to compile, or error while post-processing. */ /* Failed to compile, or error while post-processing. */
@ -10344,21 +10346,25 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
if ((firstcuflags & REQ_CASELESS) != 0) if ((firstcuflags & REQ_CASELESS) != 0)
{ {
if (firstcu < 128 || (!utf && firstcu < 255)) if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
{ {
if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS; if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
} }
/* The first code unit is > 128 in UTF mode, or > 255 otherwise. In /* The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise.
8-bit UTF mode, codepoints in the range 128-255 are introductory code In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
points and cannot have another case. In 16-bit and 32-bit modes, we can points and cannot have another case, but if UCP is set they may do. */
check wide characters when UTF (and therefore UCP) is supported. */
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 #ifdef SUPPORT_UNICODE
else if (firstcu <= MAX_UTF_CODE_POINT && #if PCRE2_CODE_UNIT_WIDTH == 8
else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
re->flags |= PCRE2_FIRSTCASELESS;
#else
else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
UCD_OTHERCASE(firstcu) != firstcu) UCD_OTHERCASE(firstcu) != firstcu)
re->flags |= PCRE2_FIRSTCASELESS; re->flags |= PCRE2_FIRSTCASELESS;
#endif #endif
#endif /* SUPPORT_UNICODE */
} }
} }
@ -10407,14 +10413,20 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
if ((reqcuflags & REQ_CASELESS) != 0) if ((reqcuflags & REQ_CASELESS) != 0)
{ {
if (reqcu < 128 || (!utf && reqcu < 255)) if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
{ {
if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS; if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
} }
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 #ifdef SUPPORT_UNICODE
else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu) #if PCRE2_CODE_UNIT_WIDTH == 8
re->flags |= PCRE2_LASTCASELESS; else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
re->flags |= PCRE2_LASTCASELESS;
#else
else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
UCD_OTHERCASE(reqcu) != reqcu)
re->flags |= PCRE2_LASTCASELESS;
#endif #endif
#endif /* SUPPORT_UNICODE */
} }
} }
} }

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2019 University of Cambridge New API code Copyright (c) 2016-2020 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -548,6 +548,7 @@ PCRE2_SPTR start_code = mb->start_code;
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
BOOL utf = (mb->poptions & PCRE2_UTF) != 0; BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;
#else #else
BOOL utf = FALSE; BOOL utf = FALSE;
#endif #endif
@ -2190,7 +2191,7 @@ for (;;)
if (clen == 0) break; if (clen == 0) break;
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf) if (utf_or_ucp)
{ {
if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
{ {
@ -2204,7 +2205,7 @@ for (;;)
} }
else else
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
/* Not UTF mode */ /* Not UTF or UCP mode */
{ {
if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d)) if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
{ ADD_NEW(state_offset + 2, 0); } { ADD_NEW(state_offset + 2, 0); }
@ -2339,7 +2340,7 @@ for (;;)
{ {
uint32_t otherd; uint32_t otherd;
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf && d >= 128) if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d); otherd = UCD_OTHERCASE(d);
else else
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
@ -2374,7 +2375,7 @@ for (;;)
if (caseless) if (caseless)
{ {
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf && d >= 128) if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d); otherd = UCD_OTHERCASE(d);
else else
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
@ -2417,7 +2418,7 @@ for (;;)
if (caseless) if (caseless)
{ {
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf && d >= 128) if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d); otherd = UCD_OTHERCASE(d);
else else
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
@ -2458,7 +2459,7 @@ for (;;)
if (caseless) if (caseless)
{ {
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf && d >= 128) if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d); otherd = UCD_OTHERCASE(d);
else else
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
@ -2491,7 +2492,7 @@ for (;;)
if (caseless) if (caseless)
{ {
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf && d >= 128) if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d); otherd = UCD_OTHERCASE(d);
else else
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
@ -2531,7 +2532,7 @@ for (;;)
if (caseless) if (caseless)
{ {
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf && d >= 128) if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d); otherd = UCD_OTHERCASE(d);
else else
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
@ -3526,10 +3527,15 @@ if ((re->flags & PCRE2_FIRSTSET) != 0)
if ((re->flags & PCRE2_FIRSTCASELESS) != 0) if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
{ {
first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu); first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 #ifdef SUPPORT_UNICODE
if (utf && first_cu > 127) #if PCRE2_CODE_UNIT_WIDTH == 8
if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
#else
if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu); first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
#endif #endif
#endif /* SUPPORT_UNICODE */
} }
} }
else else
@ -3545,9 +3551,15 @@ if ((re->flags & PCRE2_LASTSET) != 0)
if ((re->flags & PCRE2_LASTCASELESS) != 0) if ((re->flags & PCRE2_LASTCASELESS) != 0)
{ {
req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu); req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 #ifdef SUPPORT_UNICODE
if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu); #if PCRE2_CODE_UNIT_WIDTH == 8
if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
#else
if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
#endif #endif
#endif /* SUPPORT_UNICODE */
} }
} }

View File

@ -1952,7 +1952,7 @@ is available. */
#define _pcre2_was_newline PCRE2_SUFFIX(_pcre2_was_newline_) #define _pcre2_was_newline PCRE2_SUFFIX(_pcre2_was_newline_)
#define _pcre2_xclass PCRE2_SUFFIX(_pcre2_xclass_) #define _pcre2_xclass PCRE2_SUFFIX(_pcre2_xclass_)
extern int _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL, extern int _pcre2_auto_possessify(PCRE2_UCHAR *,
const compile_block *); const compile_block *);
extern int _pcre2_check_escape(PCRE2_SPTR *, PCRE2_SPTR, uint32_t *, extern int _pcre2_check_escape(PCRE2_SPTR *, PCRE2_SPTR, uint32_t *,
int *, uint32_t, uint32_t, BOOL, compile_block *); int *, uint32_t, uint32_t, BOOL, compile_block *);

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2015-2019 University of Cambridge New API code Copyright (c) 2015-2020 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -598,12 +598,13 @@ BOOL condition; /* Used in conditional groups */
BOOL cur_is_word; /* Used in "word" tests */ BOOL cur_is_word; /* Used in "word" tests */
BOOL prev_is_word; /* Used in "word" tests */ BOOL prev_is_word; /* Used in "word" tests */
/* UTF flag */ /* UTF and UCP flags */
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
BOOL utf = (mb->poptions & PCRE2_UTF) != 0; BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
BOOL ucp = (mb->poptions & PCRE2_UCP) != 0;
#else #else
BOOL utf = FALSE; BOOL utf = FALSE; /* Required for convenience even when no Unicode support */
#endif #endif
/* This is the length of the last part of a backtracking frame that must be /* This is the length of the last part of a backtracking frame that must be
@ -928,6 +929,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
} }
else else
#endif #endif
/* Not UTF mode */ /* Not UTF mode */
{ {
if (mb->end_subject - Feptr < 1) if (mb->end_subject - Feptr < 1)
@ -987,10 +989,30 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH); if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
} }
} }
/* If UCP is set without UTF we must do the same as above, but with one
character per code unit. */
else if (ucp)
{
uint32_t cc = UCHAR21(Feptr);
fc = Fecode[1];
if (fc < 128)
{
if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
}
else
{
if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
}
Feptr++;
Fecode += 2;
}
else else
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
/* Not UTF mode; use the table for characters < 256. */ /* Not UTF or UCP mode; use the table for characters < 256. */
{ {
if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1]) if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1])
!= TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH); != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH);
@ -1010,6 +1032,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
SCHECK_PARTIAL(); SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH); RRETURN(MATCH_NOMATCH);
} }
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf) if (utf)
{ {
@ -1026,15 +1049,42 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (ch > 127) if (ch > 127)
ch = UCD_OTHERCASE(ch); ch = UCD_OTHERCASE(ch);
else else
ch = TABLE_GET(ch, mb->fcc, ch); ch = (mb->fcc)[ch];
if (ch == fc) RRETURN(MATCH_NOMATCH); if (ch == fc) RRETURN(MATCH_NOMATCH);
} }
} }
/* UCP without UTF is as above, but with one character per code unit. */
else if (ucp)
{
uint32_t ch;
fc = UCHAR21INC(Feptr);
ch = Fecode[1];
Fecode += 2;
if (ch == fc)
{
RRETURN(MATCH_NOMATCH); /* Caseful match */
}
else if (Fop == OP_NOTI) /* If caseless */
{
if (ch > 127)
ch = UCD_OTHERCASE(ch);
else
ch = (mb->fcc)[ch];
if (ch == fc) RRETURN(MATCH_NOMATCH);
}
}
else else
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
/* Neither UTF nor UCP is set */
{ {
uint32_t ch = Fecode[1]; uint32_t ch = Fecode[1];
fc = *Feptr++; fc = UCHAR21INC(Feptr);
if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc)) if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc))
RRETURN(MATCH_NOMATCH); RRETURN(MATCH_NOMATCH);
Fecode += 2; Fecode += 2;
@ -1244,7 +1294,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
/* When not in UTF mode, load a single-code-unit character. Then proceed as /* When not in UTF mode, load a single-code-unit character. Then proceed as
above. */ above, using Unicode casing if either UTF or UCP is set. */
Lc = *Fecode++; Lc = *Fecode++;
@ -1253,11 +1303,15 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (Fop >= OP_STARI) if (Fop >= OP_STARI)
{ {
#if PCRE2_CODE_UNIT_WIDTH == 8 #if PCRE2_CODE_UNIT_WIDTH == 8
/* Lc must be < 128 in UTF-8 mode. */ #ifdef SUPPORT_UNICODE
if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
else
#endif /* SUPPORT_UNICODE */
/* Lc will be < 128 in UTF-8 mode. */
Loc = mb->fcc[Lc]; Loc = mb->fcc[Lc];
#else /* 16-bit & 32-bit */ #else /* 16-bit & 32-bit */
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf && Lc > 127) Loc = UCD_OTHERCASE(Lc); if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc);
else else
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
Loc = TABLE_GET(Lc, mb->fcc, Lc); Loc = TABLE_GET(Lc, mb->fcc, Lc);
@ -1490,7 +1544,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (Fop >= OP_NOTSTARI) /* Caseless */ if (Fop >= OP_NOTSTARI) /* Caseless */
{ {
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf && Lc > 127) if ((utf || ucp) && Lc > 127)
Loc = UCD_OTHERCASE(Lc); Loc = UCD_OTHERCASE(Lc);
else else
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
@ -6045,7 +6099,6 @@ BOOL firstline;
BOOL has_first_cu = FALSE; BOOL has_first_cu = FALSE;
BOOL has_req_cu = FALSE; BOOL has_req_cu = FALSE;
BOOL startline; BOOL startline;
BOOL utf;
#if PCRE2_CODE_UNIT_WIDTH == 8 #if PCRE2_CODE_UNIT_WIDTH == 8
BOOL memchr_not_found_first_cu = FALSE; BOOL memchr_not_found_first_cu = FALSE;
@ -6069,13 +6122,19 @@ PCRE2_SPTR match_partial;
BOOL use_jit; BOOL use_jit;
#endif #endif
/* This flag is needed even when Unicode is not supported for convenience
(it is used by the IS_NEWLINE macro). */
BOOL utf = FALSE;
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
BOOL ucp = FALSE;
BOOL allow_invalid; BOOL allow_invalid;
uint32_t fragment_options = 0; uint32_t fragment_options = 0;
#ifdef SUPPORT_JIT #ifdef SUPPORT_JIT
BOOL jit_checked_utf = FALSE; BOOL jit_checked_utf = FALSE;
#endif #endif
#endif #endif /* SUPPORT_UNICODE */
PCRE2_SIZE frame_size; PCRE2_SIZE frame_size;
@ -6147,12 +6206,13 @@ use_jit = (re->executable_jit != NULL &&
(options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0); (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0);
#endif #endif
/* Initialize UTF parameters. */ /* Initialize UTF/UCP parameters. */
utf = (re->overall_options & PCRE2_UTF) != 0;
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
utf = (re->overall_options & PCRE2_UTF) != 0;
allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0; allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0;
#endif ucp = (re->overall_options & PCRE2_UCP) != 0;
#endif /* SUPPORT_UNICODE */
/* Convert the partial matching flags into an integer. */ /* Convert the partial matching flags into an integer. */
@ -6589,9 +6649,13 @@ if ((re->flags & PCRE2_FIRSTSET) != 0)
if ((re->flags & PCRE2_FIRSTCASELESS) != 0) if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
{ {
first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu); first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 #ifdef SUPPORT_UNICODE
if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu); #if PCRE2_CODE_UNIT_WIDTH == 8
if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu);
#else
if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu);
#endif #endif
#endif /* SUPPORT_UNICODE */
} }
} }
else else
@ -6607,9 +6671,13 @@ if ((re->flags & PCRE2_LASTSET) != 0)
if ((re->flags & PCRE2_LASTCASELESS) != 0) if ((re->flags & PCRE2_LASTCASELESS) != 0)
{ {
req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu); req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 #ifdef SUPPORT_UNICODE
if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu); #if PCRE2_CODE_UNIT_WIDTH == 8
if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu);
#else
if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu);
#endif #endif
#endif /* SUPPORT_UNICODE */
} }
} }
@ -6756,15 +6824,16 @@ for(;;)
#endif #endif
} }
/* If we can't find the required code unit, having reached the true end /* If we can't find the required first code unit, having reached the
of the subject, break the bumpalong loop, to force a match failure, true end of the subject, break the bumpalong loop, to force a match
except when doing partial matching, when we let the next cycle run at failure, except when doing partial matching, when we let the next cycle
the end of the subject. To see why, consider the pattern /(?<=abc)def/, run at the end of the subject. To see why, consider the pattern
which partially matches "abc", even though the string does not contain /(?<=abc)def/, which partially matches "abc", even though the string
the starting character "d". If we have not reached the true end of the does not contain the starting character "d". If we have not reached the
subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified) true end of the subject (PCRE2_FIRSTLINE caused end_subject to be
we also let the cycle run, because the matching string is legitimately temporarily modified) we also let the cycle run, because the matching
allowed to start with the first code unit of a newline. */ string is legitimately allowed to start with the first code unit of a
newline. */
if (mb->partial == 0 && start_match >= mb->end_subject) if (mb->partial == 0 && start_match >= mb->end_subject)
{ {

View File

@ -772,15 +772,19 @@ Arguments:
p points to the first code unit of the character p points to the first code unit of the character
caseless TRUE if caseless caseless TRUE if caseless
utf TRUE for UTF mode utf TRUE for UTF mode
ucp TRUE for UCP mode
Returns: pointer after the character Returns: pointer after the character
*/ */
static PCRE2_SPTR static PCRE2_SPTR
set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf) set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf,
BOOL ucp)
{ {
uint32_t c = *p++; /* First code unit */ uint32_t c = *p++; /* First code unit */
(void)utf; /* Stop compiler warning when UTF not supported */
(void)utf; /* Stop compiler warnings when UTF not supported */
(void)ucp;
/* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for /* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
0xff. */ 0xff. */
@ -810,22 +814,26 @@ if (utf)
if (caseless) if (caseless)
{ {
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf) if (utf || ucp)
{ {
c = UCD_OTHERCASE(c);
#if PCRE2_CODE_UNIT_WIDTH == 8 #if PCRE2_CODE_UNIT_WIDTH == 8
PCRE2_UCHAR buff[6]; if (utf)
c = UCD_OTHERCASE(c); {
(void)PRIV(ord2utf)(c, buff); PCRE2_UCHAR buff[6];
SET_BIT(buff[0]); (void)PRIV(ord2utf)(c, buff);
SET_BIT(buff[0]);
}
else SET_BIT(c);
#else /* 16-bit or 32-bit mode */ #else /* 16-bit or 32-bit mode */
c = UCD_OTHERCASE(c);
if (c > 0xff) SET_BIT(0xff); else SET_BIT(c); if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
#endif #endif
} }
else else
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
/* Not UTF */ /* Not UTF or UCP */
if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]); if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
} }
@ -931,6 +939,7 @@ Arguments:
re points to the compiled regex block re points to the compiled regex block
code points to an expression code points to an expression
utf TRUE if in UTF mode utf TRUE if in UTF mode
ucp TRUE if in UCP mode
depthptr pointer to recurse depth depthptr pointer to recurse depth
Returns: SSB_FAIL => Failed to find any starting code units Returns: SSB_FAIL => Failed to find any starting code units
@ -941,7 +950,8 @@ Returns: SSB_FAIL => Failed to find any starting code units
*/ */
static int static int
set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, int *depthptr) set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
int *depthptr)
{ {
uint32_t c; uint32_t c;
int yield = SSB_DONE; int yield = SSB_DONE;
@ -1111,7 +1121,7 @@ do
case OP_SCRIPT_RUN: case OP_SCRIPT_RUN:
case OP_ASSERT: case OP_ASSERT:
case OP_ASSERT_NA: case OP_ASSERT_NA:
rc = set_start_bits(re, tcode, utf, depthptr); rc = set_start_bits(re, tcode, utf, ucp, depthptr);
if (rc == SSB_DONE) if (rc == SSB_DONE)
{ {
try_next = FALSE; try_next = FALSE;
@ -1167,7 +1177,7 @@ do
case OP_BRAZERO: case OP_BRAZERO:
case OP_BRAMINZERO: case OP_BRAMINZERO:
case OP_BRAPOSZERO: case OP_BRAPOSZERO:
rc = set_start_bits(re, ++tcode, utf, depthptr); rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc; if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
do tcode += GET(tcode,1); while (*tcode == OP_ALT); do tcode += GET(tcode,1); while (*tcode == OP_ALT);
tcode += 1 + LINK_SIZE; tcode += 1 + LINK_SIZE;
@ -1189,7 +1199,7 @@ do
case OP_QUERY: case OP_QUERY:
case OP_MINQUERY: case OP_MINQUERY:
case OP_POSQUERY: case OP_POSQUERY:
tcode = set_table_bit(re, tcode + 1, FALSE, utf); tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
break; break;
case OP_STARI: case OP_STARI:
@ -1198,7 +1208,7 @@ do
case OP_QUERYI: case OP_QUERYI:
case OP_MINQUERYI: case OP_MINQUERYI:
case OP_POSQUERYI: case OP_POSQUERYI:
tcode = set_table_bit(re, tcode + 1, TRUE, utf); tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
break; break;
/* Single-char upto sets the bit and tries the next */ /* Single-char upto sets the bit and tries the next */
@ -1206,13 +1216,13 @@ do
case OP_UPTO: case OP_UPTO:
case OP_MINUPTO: case OP_MINUPTO:
case OP_POSUPTO: case OP_POSUPTO:
tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf); tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
break; break;
case OP_UPTOI: case OP_UPTOI:
case OP_MINUPTOI: case OP_MINUPTOI:
case OP_POSUPTOI: case OP_POSUPTOI:
tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf); tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
break; break;
/* At least one single char sets the bit and stops */ /* At least one single char sets the bit and stops */
@ -1224,7 +1234,7 @@ do
case OP_PLUS: case OP_PLUS:
case OP_MINPLUS: case OP_MINPLUS:
case OP_POSPLUS: case OP_POSPLUS:
(void)set_table_bit(re, tcode + 1, FALSE, utf); (void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
try_next = FALSE; try_next = FALSE;
break; break;
@ -1235,7 +1245,7 @@ do
case OP_PLUSI: case OP_PLUSI:
case OP_MINPLUSI: case OP_MINPLUSI:
case OP_POSPLUSI: case OP_POSPLUSI:
(void)set_table_bit(re, tcode + 1, TRUE, utf); (void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
try_next = FALSE; try_next = FALSE;
break; break;
@ -1664,6 +1674,7 @@ PRIV(study)(pcre2_real_code *re)
int count = 0; int count = 0;
PCRE2_UCHAR *code; PCRE2_UCHAR *code;
BOOL utf = (re->overall_options & PCRE2_UTF) != 0; BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;
/* Find start of compiled code */ /* Find start of compiled code */
@ -1677,7 +1688,7 @@ code units. */
if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0) if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
{ {
int depth = 0; int depth = 0;
int rc = set_start_bits(re, code, utf, &depth); int rc = set_start_bits(re, code, utf, ucp, &depth);
if (rc == SSB_UNKNOWN) return 1; if (rc == SSB_UNKNOWN) return 1;
/* If a list of starting code units was set up, scan the list to see if only /* If a list of starting code units was set up, scan the list to see if only
@ -1725,27 +1736,27 @@ if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
} }
/* c contains the code unit value, in the range 0-255. In 8-bit UTF /* c contains the code unit value, in the range 0-255. In 8-bit UTF
mode, only values < 128 can be used. */ mode, only values < 128 can be used. In all the other cases, c is a
character value. */
#if PCRE2_CODE_UNIT_WIDTH == 8 #if PCRE2_CODE_UNIT_WIDTH == 8
if (c > 127) goto DONE; if (utf && c > 127) goto DONE;
#endif #endif
if (a < 0) a = c; /* First one found */ if (a < 0) a = c; /* First one found, save in a */
else if (b < 0) /* Second one found */ else if (b < 0) /* Second one found */
{ {
int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c); int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
#if PCRE2_CODE_UNIT_WIDTH == 8 if (utf || ucp)
if (utf && UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */ {
#else /* 16-bit or 32-bit */ if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */ if (c > 127) d = UCD_OTHERCASE(c);
if (utf && c > 127) d = UCD_OTHERCASE(c); }
#endif /* Code width */
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
if (d != a) goto DONE; /* Not other case of a */ if (d != a) goto DONE; /* Not the other case of a */
b = c; b = c; /* Save second in b */
} }
else goto DONE; /* More than two characters found */ else goto DONE; /* More than two characters found */
} }

View File

@ -236,6 +236,7 @@ BOOL use_existing_match;
BOOL replacement_only; BOOL replacement_only;
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
BOOL utf = (code->overall_options & PCRE2_UTF) != 0; BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
BOOL ucp = (code->overall_options & PCRE2_UCP) != 0;
#endif #endif
PCRE2_UCHAR temp[6]; PCRE2_UCHAR temp[6];
PCRE2_SPTR ptr; PCRE2_SPTR ptr;
@ -758,7 +759,7 @@ do
if (forcecase != 0) if (forcecase != 0)
{ {
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf) if (utf || ucp)
{ {
uint32_t type = UCD_CHARTYPE(ch); uint32_t type = UCD_CHARTYPE(ch);
if (PRIV(ucp_gentype)[type] == ucp_L && if (PRIV(ucp_gentype)[type] == ucp_L &&
@ -860,7 +861,7 @@ do
if (forcecase != 0) if (forcecase != 0)
{ {
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf) if (utf || ucp)
{ {
uint32_t type = UCD_CHARTYPE(ch); uint32_t type = UCD_CHARTYPE(ch);
if (PRIV(ucp_gentype)[type] == ucp_L && if (PRIV(ucp_gentype)[type] == ucp_L &&

29
testdata/testinput10 vendored
View File

@ -570,8 +570,10 @@
/[\xff\x{ffff}]/I,utf /[\xff\x{ffff}]/I,utf
/[\xff\x{ff}]/I,utf /[\xff\x{ff}]/I,utf
abc\x{ff}def
/[\xff\x{ff}]/I /[\xff\x{ff}]/I
abc\x{ff}def
/[Ss]/I /[Ss]/I
@ -585,4 +587,31 @@
abc\x80\=startchar abc\x80\=startchar
abc\x80\=startchar,offset=3 abc\x80\=startchar,offset=3
#subject no_jit
/\x{c1}+\x{e1}/iIB,ucp
\x{c1}\x{c1}\x{c1}
\x{e1}\x{e1}\x{e1}
/a|\x{c1}/iI,ucp
\x{e1}xxx
/a|\x{c1}/iI,utf
\x{e1}xxx
/\x{c1}|\x{e1}/iI,ucp
/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
X\x{e1}Y
/X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended
X\x{c1}Y
# Without UTF or UCP characters > 127 have only one case in the default locale.
/X(\x{e1})Y/replace=>\U$1<,substitute_extended
X\x{e1}Y
#subject
# End of testinput10 # End of testinput10

67
testdata/testinput12 vendored
View File

@ -463,4 +463,71 @@
/(?:\x{ff}|\x{3000})/I,utf /(?:\x{ff}|\x{3000})/I,utf
# ----------------------------------------------------
# UCP and casing tests
/\x{120}/i,I
/\x{c1}/i,I,ucp
/[\x{120}\x{121}]/iB,ucp
/[ab\x{120}]+/iB,ucp
aABb\x{121}\x{120}
#subject no_jit
/\x{c1}/i,no_start_optimize
\= Expect no match
\x{e1}
/\x{120}\x{c1}/i,ucp,no_start_optimize
\x{121}\x{e1}
/\x{120}\x{c1}/i,ucp
\x{121}\x{e1}
/[^\x{120}]/i,no_start_optimize
\x{121}
/[^\x{120}]/i,ucp,no_start_optimize
\= Expect no match
\x{121}
/[^\x{120}]/i
\x{121}
/[^\x{120}]/i,ucp
\= Expect no match
\x{121}
/\x{120}{2}/i,ucp
\x{121}\x{121}
/[^\x{120}]{2}/i,ucp
\= Expect no match
\x{121}\x{121}
/\x{c1}+\x{e1}/iB,ucp
\x{c1}\x{c1}\x{c1}
/\x{c1}+\x{e1}/iIB,ucp
\x{c1}\x{c1}\x{c1}
\x{e1}\x{e1}\x{e1}
/a|\x{c1}/iI,ucp
\x{e1}xxx
/\x{c1}|\x{e1}/iI,ucp
/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
X\x{e1}Y
/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
X\x{121}Y
#subject
# ----------------------------------------------------
# End of testinput12 # End of testinput12

50
testdata/testinput14 vendored
View File

@ -1,9 +1,12 @@
# These test special (mostly error) UTF features of DFA matching. They are a # These test special UTF and UCP features of DFA matching. The output is
# selection of the more comprehensive tests that are run for non-DFA matching. # different for the different widths.
# The output is different for the different widths.
#subject dfa #subject dfa
# ----------------------------------------------------
# These are a selection of the more comprehensive tests that are run for
# non-DFA matching.
/X/utf /X/utf
XX\x{d800} XX\x{d800}
XX\x{d800}\=offset=3 XX\x{d800}\=offset=3
@ -34,4 +37,45 @@
\xf7\=ph \xf7\=ph
\xf7\x80\=ph \xf7\x80\=ph
# ----------------------------------------------------
# UCP and casing tests - except for the first two, these will all fail in 8-bit
# mode because they are testing UCP without UTF and use characters > 255.
/\x{c1}/i,no_start_optimize
\= Expect no match
\x{e1}
/\x{c1}+\x{e1}/iB,ucp
\x{c1}\x{c1}\x{c1}
\x{e1}\x{e1}\x{e1}
/\x{120}\x{c1}/i,ucp,no_start_optimize
\x{121}\x{e1}
/\x{120}\x{c1}/i,ucp
\x{121}\x{e1}
/[^\x{120}]/i,no_start_optimize
\x{121}
/[^\x{120}]/i,ucp,no_start_optimize
\= Expect no match
\x{121}
/[^\x{120}]/i
\x{121}
/[^\x{120}]/i,ucp
\= Expect no match
\x{121}
/\x{120}{2}/i,ucp
\x{121}\x{121}
/[^\x{120}]{2}/i,ucp
\= Expect no match
\x{121}\x{121}
# ----------------------------------------------------
# End of testinput14 # End of testinput14

64
testdata/testoutput10 vendored
View File

@ -1780,11 +1780,15 @@ Capture group count = 0
Options: utf Options: utf
Starting code units: \xc3 Starting code units: \xc3
Subject length lower bound = 1 Subject length lower bound = 1
abc\x{ff}def
0: \x{ff}
/[\xff\x{ff}]/I /[\xff\x{ff}]/I
Capture group count = 0 Capture group count = 0
Starting code units: \xff First code unit = \xff
Subject length lower bound = 1 Subject length lower bound = 1
abc\x{ff}def
0: \xff
/[Ss]/I /[Ss]/I
Capture group count = 0 Capture group count = 0
@ -1813,4 +1817,62 @@ Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 3
abc\x80\=startchar,offset=3 abc\x80\=startchar,offset=3
Error -36 (bad UTF-8 offset) Error -36 (bad UTF-8 offset)
#subject no_jit
/\x{c1}+\x{e1}/iIB,ucp
------------------------------------------------------------------
Bra
/i \x{c1}+
/i \x{e1}
Ket
End
------------------------------------------------------------------
Capture group count = 0
Options: caseless ucp
First code unit = \xc1 (caseless)
Last code unit = \xe1 (caseless)
Subject length lower bound = 2
\x{c1}\x{c1}\x{c1}
0: \xc1\xc1\xc1
\x{e1}\x{e1}\x{e1}
0: \xe1\xe1\xe1
/a|\x{c1}/iI,ucp
Capture group count = 0
Options: caseless ucp
Starting code units: A a \xc1 \xe1
Subject length lower bound = 1
\x{e1}xxx
0: \xe1
/a|\x{c1}/iI,utf
Capture group count = 0
Options: caseless utf
Starting code units: A a \xc3
Subject length lower bound = 1
\x{e1}xxx
0: \x{e1}
/\x{c1}|\x{e1}/iI,ucp
Capture group count = 0
Options: caseless ucp
First code unit = \xc1 (caseless)
Subject length lower bound = 1
/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
X\x{e1}Y
1: >\xc1<
/X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended
X\x{c1}Y
1: >\xe1<
# Without UTF or UCP characters > 127 have only one case in the default locale.
/X(\x{e1})Y/replace=>\U$1<,substitute_extended
X\x{e1}Y
1: >\xe1<
#subject
# End of testinput10 # End of testinput10

View File

@ -1613,7 +1613,7 @@ Subject length lower bound = 1
/[Ss]/I /[Ss]/I
Capture group count = 0 Capture group count = 0
Starting code units: S s First code unit = 'S' (caseless)
Subject length lower bound = 1 Subject length lower bound = 1
/[Ss]/I,utf /[Ss]/I,utf
@ -1628,4 +1628,134 @@ Options: utf
Starting code units: \xff Starting code units: \xff
Subject length lower bound = 1 Subject length lower bound = 1
# ----------------------------------------------------
# UCP and casing tests
/\x{120}/i,I
Capture group count = 0
Options: caseless
First code unit = \x{120}
Subject length lower bound = 1
/\x{c1}/i,I,ucp
Capture group count = 0
Options: caseless ucp
First code unit = \xc1 (caseless)
Subject length lower bound = 1
/[\x{120}\x{121}]/iB,ucp
------------------------------------------------------------------
Bra
/i \x{120}
Ket
End
------------------------------------------------------------------
/[ab\x{120}]+/iB,ucp
------------------------------------------------------------------
Bra
[ABab\x{120}-\x{121}]++
Ket
End
------------------------------------------------------------------
aABb\x{121}\x{120}
0: aABb\x{121}\x{120}
#subject no_jit
/\x{c1}/i,no_start_optimize
\= Expect no match
\x{e1}
No match
/\x{120}\x{c1}/i,ucp,no_start_optimize
\x{121}\x{e1}
0: \x{121}\xe1
/\x{120}\x{c1}/i,ucp
\x{121}\x{e1}
0: \x{121}\xe1
/[^\x{120}]/i,no_start_optimize
\x{121}
0: \x{121}
/[^\x{120}]/i,ucp,no_start_optimize
\= Expect no match
\x{121}
No match
/[^\x{120}]/i
\x{121}
0: \x{121}
/[^\x{120}]/i,ucp
\= Expect no match
\x{121}
No match
/\x{120}{2}/i,ucp
\x{121}\x{121}
0: \x{121}\x{121}
/[^\x{120}]{2}/i,ucp
\= Expect no match
\x{121}\x{121}
No match
/\x{c1}+\x{e1}/iB,ucp
------------------------------------------------------------------
Bra
/i \x{c1}+
/i \x{e1}
Ket
End
------------------------------------------------------------------
\x{c1}\x{c1}\x{c1}
0: \xc1\xc1\xc1
/\x{c1}+\x{e1}/iIB,ucp
------------------------------------------------------------------
Bra
/i \x{c1}+
/i \x{e1}
Ket
End
------------------------------------------------------------------
Capture group count = 0
Options: caseless ucp
First code unit = \xc1 (caseless)
Last code unit = \xe1 (caseless)
Subject length lower bound = 2
\x{c1}\x{c1}\x{c1}
0: \xc1\xc1\xc1
\x{e1}\x{e1}\x{e1}
0: \xe1\xe1\xe1
/a|\x{c1}/iI,ucp
Capture group count = 0
Options: caseless ucp
Starting code units: A a \xc1 \xe1
Subject length lower bound = 1
\x{e1}xxx
0: \xe1
/\x{c1}|\x{e1}/iI,ucp
Capture group count = 0
Options: caseless ucp
First code unit = \xc1 (caseless)
Subject length lower bound = 1
/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
X\x{e1}Y
1: >\xc1<
/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
X\x{121}Y
1: >\x{120}<
#subject
# ----------------------------------------------------
# End of testinput12 # End of testinput12

View File

@ -1611,7 +1611,7 @@ Subject length lower bound = 1
/[Ss]/I /[Ss]/I
Capture group count = 0 Capture group count = 0
Starting code units: S s First code unit = 'S' (caseless)
Subject length lower bound = 1 Subject length lower bound = 1
/[Ss]/I,utf /[Ss]/I,utf
@ -1626,4 +1626,134 @@ Options: utf
Starting code units: \xff Starting code units: \xff
Subject length lower bound = 1 Subject length lower bound = 1
# ----------------------------------------------------
# UCP and casing tests
/\x{120}/i,I
Capture group count = 0
Options: caseless
First code unit = \x{120}
Subject length lower bound = 1
/\x{c1}/i,I,ucp
Capture group count = 0
Options: caseless ucp
First code unit = \xc1 (caseless)
Subject length lower bound = 1
/[\x{120}\x{121}]/iB,ucp
------------------------------------------------------------------
Bra
/i \x{120}
Ket
End
------------------------------------------------------------------
/[ab\x{120}]+/iB,ucp
------------------------------------------------------------------
Bra
[ABab\x{120}-\x{121}]++
Ket
End
------------------------------------------------------------------
aABb\x{121}\x{120}
0: aABb\x{121}\x{120}
#subject no_jit
/\x{c1}/i,no_start_optimize
\= Expect no match
\x{e1}
No match
/\x{120}\x{c1}/i,ucp,no_start_optimize
\x{121}\x{e1}
0: \x{121}\xe1
/\x{120}\x{c1}/i,ucp
\x{121}\x{e1}
0: \x{121}\xe1
/[^\x{120}]/i,no_start_optimize
\x{121}
0: \x{121}
/[^\x{120}]/i,ucp,no_start_optimize
\= Expect no match
\x{121}
No match
/[^\x{120}]/i
\x{121}
0: \x{121}
/[^\x{120}]/i,ucp
\= Expect no match
\x{121}
No match
/\x{120}{2}/i,ucp
\x{121}\x{121}
0: \x{121}\x{121}
/[^\x{120}]{2}/i,ucp
\= Expect no match
\x{121}\x{121}
No match
/\x{c1}+\x{e1}/iB,ucp
------------------------------------------------------------------
Bra
/i \x{c1}+
/i \x{e1}
Ket
End
------------------------------------------------------------------
\x{c1}\x{c1}\x{c1}
0: \xc1\xc1\xc1
/\x{c1}+\x{e1}/iIB,ucp
------------------------------------------------------------------
Bra
/i \x{c1}+
/i \x{e1}
Ket
End
------------------------------------------------------------------
Capture group count = 0
Options: caseless ucp
First code unit = \xc1 (caseless)
Last code unit = \xe1 (caseless)
Subject length lower bound = 2
\x{c1}\x{c1}\x{c1}
0: \xc1\xc1\xc1
\x{e1}\x{e1}\x{e1}
0: \xe1\xe1\xe1
/a|\x{c1}/iI,ucp
Capture group count = 0
Options: caseless ucp
Starting code units: A a \xc1 \xe1
Subject length lower bound = 1
\x{e1}xxx
0: \xe1
/\x{c1}|\x{e1}/iI,ucp
Capture group count = 0
Options: caseless ucp
First code unit = \xc1 (caseless)
Subject length lower bound = 1
/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
X\x{e1}Y
1: >\xc1<
/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
X\x{121}Y
1: >\x{120}<
#subject
# ----------------------------------------------------
# End of testinput12 # End of testinput12

View File

@ -1,9 +1,12 @@
# These test special (mostly error) UTF features of DFA matching. They are a # These test special UTF and UCP features of DFA matching. The output is
# selection of the more comprehensive tests that are run for non-DFA matching. # different for the different widths.
# The output is different for the different widths.
#subject dfa #subject dfa
# ----------------------------------------------------
# These are a selection of the more comprehensive tests that are run for
# non-DFA matching.
/X/utf /X/utf
XX\x{d800} XX\x{d800}
Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2 Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
@ -58,4 +61,65 @@ No match
\xf7\x80\=ph \xf7\x80\=ph
No match No match
# ----------------------------------------------------
# UCP and casing tests - except for the first two, these will all fail in 8-bit
# mode because they are testing UCP without UTF and use characters > 255.
/\x{c1}/i,no_start_optimize
\= Expect no match
\x{e1}
No match
/\x{c1}+\x{e1}/iB,ucp
------------------------------------------------------------------
Bra
/i \x{c1}+
/i \x{e1}
Ket
End
------------------------------------------------------------------
\x{c1}\x{c1}\x{c1}
0: \xc1\xc1\xc1
1: \xc1\xc1
\x{e1}\x{e1}\x{e1}
0: \xe1\xe1\xe1
1: \xe1\xe1
/\x{120}\x{c1}/i,ucp,no_start_optimize
\x{121}\x{e1}
0: \x{121}\xe1
/\x{120}\x{c1}/i,ucp
\x{121}\x{e1}
0: \x{121}\xe1
/[^\x{120}]/i,no_start_optimize
\x{121}
0: \x{121}
/[^\x{120}]/i,ucp,no_start_optimize
\= Expect no match
\x{121}
No match
/[^\x{120}]/i
\x{121}
0: \x{121}
/[^\x{120}]/i,ucp
\= Expect no match
\x{121}
No match
/\x{120}{2}/i,ucp
\x{121}\x{121}
0: \x{121}\x{121}
/[^\x{120}]{2}/i,ucp
\= Expect no match
\x{121}\x{121}
No match
# ----------------------------------------------------
# End of testinput14 # End of testinput14

View File

@ -1,9 +1,12 @@
# These test special (mostly error) UTF features of DFA matching. They are a # These test special UTF and UCP features of DFA matching. The output is
# selection of the more comprehensive tests that are run for non-DFA matching. # different for the different widths.
# The output is different for the different widths.
#subject dfa #subject dfa
# ----------------------------------------------------
# These are a selection of the more comprehensive tests that are run for
# non-DFA matching.
/X/utf /X/utf
XX\x{d800} XX\x{d800}
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2 Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
@ -58,4 +61,65 @@ No match
\xf7\x80\=ph \xf7\x80\=ph
No match No match
# ----------------------------------------------------
# UCP and casing tests - except for the first two, these will all fail in 8-bit
# mode because they are testing UCP without UTF and use characters > 255.
/\x{c1}/i,no_start_optimize
\= Expect no match
\x{e1}
No match
/\x{c1}+\x{e1}/iB,ucp
------------------------------------------------------------------
Bra
/i \x{c1}+
/i \x{e1}
Ket
End
------------------------------------------------------------------
\x{c1}\x{c1}\x{c1}
0: \xc1\xc1\xc1
1: \xc1\xc1
\x{e1}\x{e1}\x{e1}
0: \xe1\xe1\xe1
1: \xe1\xe1
/\x{120}\x{c1}/i,ucp,no_start_optimize
\x{121}\x{e1}
0: \x{121}\xe1
/\x{120}\x{c1}/i,ucp
\x{121}\x{e1}
0: \x{121}\xe1
/[^\x{120}]/i,no_start_optimize
\x{121}
0: \x{121}
/[^\x{120}]/i,ucp,no_start_optimize
\= Expect no match
\x{121}
No match
/[^\x{120}]/i
\x{121}
0: \x{121}
/[^\x{120}]/i,ucp
\= Expect no match
\x{121}
No match
/\x{120}{2}/i,ucp
\x{121}\x{121}
0: \x{121}\x{121}
/[^\x{120}]{2}/i,ucp
\= Expect no match
\x{121}\x{121}
No match
# ----------------------------------------------------
# End of testinput14 # End of testinput14

View File

@ -1,9 +1,12 @@
# These test special (mostly error) UTF features of DFA matching. They are a # These test special UTF and UCP features of DFA matching. The output is
# selection of the more comprehensive tests that are run for non-DFA matching. # different for the different widths.
# The output is different for the different widths.
#subject dfa #subject dfa
# ----------------------------------------------------
# These are a selection of the more comprehensive tests that are run for
# non-DFA matching.
/X/utf /X/utf
XX\x{d800} XX\x{d800}
Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2 Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
@ -58,4 +61,65 @@ Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 0
\xf7\x80\=ph \xf7\x80\=ph
Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 0 Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 0
# ----------------------------------------------------
# UCP and casing tests - except for the first two, these will all fail in 8-bit
# mode because they are testing UCP without UTF and use characters > 255.
/\x{c1}/i,no_start_optimize
\= Expect no match
\x{e1}
No match
/\x{c1}+\x{e1}/iB,ucp
------------------------------------------------------------------
Bra
/i \x{c1}+
/i \x{e1}
Ket
End
------------------------------------------------------------------
\x{c1}\x{c1}\x{c1}
0: \xc1\xc1\xc1
1: \xc1\xc1
\x{e1}\x{e1}\x{e1}
0: \xe1\xe1\xe1
1: \xe1\xe1
/\x{120}\x{c1}/i,ucp,no_start_optimize
Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large
\x{121}\x{e1}
/\x{120}\x{c1}/i,ucp
Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large
\x{121}\x{e1}
/[^\x{120}]/i,no_start_optimize
Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
\x{121}
/[^\x{120}]/i,ucp,no_start_optimize
Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
\= Expect no match
\x{121}
/[^\x{120}]/i
Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
\x{121}
/[^\x{120}]/i,ucp
Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
\= Expect no match
\x{121}
/\x{120}{2}/i,ucp
Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large
\x{121}\x{121}
/[^\x{120}]{2}/i,ucp
Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
\= Expect no match
\x{121}\x{121}
# ----------------------------------------------------
# End of testinput14 # End of testinput14