Unicode upper/lower casing is now used when UCP is set, even if UTF is not set.
This is not yet documented, and is not yet implemented in JIT.
This commit is contained in:
parent
d0666136c9
commit
4a7dfab0ec
|
@ -66,6 +66,11 @@ recurse function in JIT.
|
||||||
17. Fix a crash which occurs when the character type of an invalid UTF
|
17. Fix a crash which occurs when the character type of an invalid UTF
|
||||||
character is decoded in JIT.
|
character is decoded in JIT.
|
||||||
|
|
||||||
|
18. Changes in many areas of the code so that when Unicode is supported and
|
||||||
|
PCRE2_UCP is set without PCRE2_UTF, Unicode character properties are used for
|
||||||
|
upper/lower case computations on characters whose code points are greater than
|
||||||
|
127. Documentation is not yet updated. JIT is not yet updated.
|
||||||
|
|
||||||
|
|
||||||
Version 10.34 21-November-2019
|
Version 10.34 21-November-2019
|
||||||
------------------------------
|
------------------------------
|
||||||
|
|
|
@ -28,8 +28,6 @@
|
||||||
# The -v option causes a call to 'pcre2test -C' to happen for each
|
# The -v option causes a call to 'pcre2test -C' to happen for each
|
||||||
# configuration.
|
# configuration.
|
||||||
|
|
||||||
# Currently -fsanitize=undefined is not working (locks machine).
|
|
||||||
|
|
||||||
useasan=1
|
useasan=1
|
||||||
useusan=1
|
useusan=1
|
||||||
usedebug=1
|
usedebug=1
|
||||||
|
|
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||||||
|
|
||||||
Written by Philip Hazel
|
Written by Philip Hazel
|
||||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||||
New API code Copyright (c) 2016-2019 University of Cambridge
|
New API code Copyright (c) 2016-2020 University of Cambridge
|
||||||
|
|
||||||
-----------------------------------------------------------------------------
|
-----------------------------------------------------------------------------
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
@ -292,6 +292,7 @@ possessification, and if so, fills a list with its properties.
|
||||||
Arguments:
|
Arguments:
|
||||||
code points to start of expression
|
code points to start of expression
|
||||||
utf TRUE if in UTF mode
|
utf TRUE if in UTF mode
|
||||||
|
ucp TRUE if in UCP mode
|
||||||
fcc points to the case-flipping table
|
fcc points to the case-flipping table
|
||||||
list points to output list
|
list points to output list
|
||||||
list[0] will be filled with the opcode
|
list[0] will be filled with the opcode
|
||||||
|
@ -304,7 +305,7 @@ Returns: points to the start of the next opcode if *code is accepted
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static PCRE2_SPTR
|
static PCRE2_SPTR
|
||||||
get_chr_property_list(PCRE2_SPTR code, BOOL utf, const uint8_t *fcc,
|
get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc,
|
||||||
uint32_t *list)
|
uint32_t *list)
|
||||||
{
|
{
|
||||||
PCRE2_UCHAR c = *code;
|
PCRE2_UCHAR c = *code;
|
||||||
|
@ -316,7 +317,8 @@ uint32_t chr;
|
||||||
uint32_t *clist_dest;
|
uint32_t *clist_dest;
|
||||||
const uint32_t *clist_src;
|
const uint32_t *clist_src;
|
||||||
#else
|
#else
|
||||||
(void)utf; /* Suppress "unused parameter" compiler warning */
|
(void)utf; /* Suppress "unused parameter" compiler warnings */
|
||||||
|
(void)ucp;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
list[0] = c;
|
list[0] = c;
|
||||||
|
@ -396,7 +398,7 @@ switch(c)
|
||||||
list[2] = chr;
|
list[2] = chr;
|
||||||
|
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
if (chr < 128 || (chr < 256 && !utf))
|
if (chr < 128 || (chr < 256 && !utf && !ucp))
|
||||||
list[3] = fcc[chr];
|
list[3] = fcc[chr];
|
||||||
else
|
else
|
||||||
list[3] = UCD_OTHERCASE(chr);
|
list[3] = UCD_OTHERCASE(chr);
|
||||||
|
@ -503,6 +505,7 @@ which case the base cannot be possessified.
|
||||||
Arguments:
|
Arguments:
|
||||||
code points to the byte code
|
code points to the byte code
|
||||||
utf TRUE in UTF mode
|
utf TRUE in UTF mode
|
||||||
|
ucp TRUE in UCP mode
|
||||||
cb compile data block
|
cb compile data block
|
||||||
base_list the data list of the base opcode
|
base_list the data list of the base opcode
|
||||||
base_end the end of the base opcode
|
base_end the end of the base opcode
|
||||||
|
@ -512,7 +515,7 @@ Returns: TRUE if the auto-possessification is possible
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static BOOL
|
static BOOL
|
||||||
compare_opcodes(PCRE2_SPTR code, BOOL utf, const compile_block *cb,
|
compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb,
|
||||||
const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)
|
const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)
|
||||||
{
|
{
|
||||||
PCRE2_UCHAR c;
|
PCRE2_UCHAR c;
|
||||||
|
@ -651,7 +654,7 @@ for(;;)
|
||||||
|
|
||||||
while (*next_code == OP_ALT)
|
while (*next_code == OP_ALT)
|
||||||
{
|
{
|
||||||
if (!compare_opcodes(code, utf, cb, base_list, base_end, rec_limit))
|
if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit))
|
||||||
return FALSE;
|
return FALSE;
|
||||||
code = next_code + 1 + LINK_SIZE;
|
code = next_code + 1 + LINK_SIZE;
|
||||||
next_code += GET(next_code, 1);
|
next_code += GET(next_code, 1);
|
||||||
|
@ -672,7 +675,8 @@ for(;;)
|
||||||
/* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */
|
/* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */
|
||||||
|
|
||||||
next_code += 1 + LINK_SIZE;
|
next_code += 1 + LINK_SIZE;
|
||||||
if (!compare_opcodes(next_code, utf, cb, base_list, base_end, rec_limit))
|
if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end,
|
||||||
|
rec_limit))
|
||||||
return FALSE;
|
return FALSE;
|
||||||
|
|
||||||
code += PRIV(OP_lengths)[c];
|
code += PRIV(OP_lengths)[c];
|
||||||
|
@ -688,7 +692,7 @@ for(;;)
|
||||||
/* We now have the next appropriate opcode to compare with the base. Check
|
/* We now have the next appropriate opcode to compare with the base. Check
|
||||||
for a supported opcode, and load its properties. */
|
for a supported opcode, and load its properties. */
|
||||||
|
|
||||||
code = get_chr_property_list(code, utf, cb->fcc, list);
|
code = get_chr_property_list(code, utf, ucp, cb->fcc, list);
|
||||||
if (code == NULL) return FALSE; /* Unsupported */
|
if (code == NULL) return FALSE; /* Unsupported */
|
||||||
|
|
||||||
/* If either opcode is a small character list, set pointers for comparing
|
/* If either opcode is a small character list, set pointers for comparing
|
||||||
|
@ -1100,7 +1104,6 @@ leaving the remainder of the pattern unpossessified.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
code points to start of the byte code
|
code points to start of the byte code
|
||||||
utf TRUE in UTF mode
|
|
||||||
cb compile data block
|
cb compile data block
|
||||||
|
|
||||||
Returns: 0 for success
|
Returns: 0 for success
|
||||||
|
@ -1108,13 +1111,15 @@ Returns: 0 for success
|
||||||
*/
|
*/
|
||||||
|
|
||||||
int
|
int
|
||||||
PRIV(auto_possessify)(PCRE2_UCHAR *code, BOOL utf, const compile_block *cb)
|
PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb)
|
||||||
{
|
{
|
||||||
PCRE2_UCHAR c;
|
PCRE2_UCHAR c;
|
||||||
PCRE2_SPTR end;
|
PCRE2_SPTR end;
|
||||||
PCRE2_UCHAR *repeat_opcode;
|
PCRE2_UCHAR *repeat_opcode;
|
||||||
uint32_t list[8];
|
uint32_t list[8];
|
||||||
int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */
|
int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */
|
||||||
|
BOOL utf = (cb->external_options & PCRE2_UTF) != 0;
|
||||||
|
BOOL ucp = (cb->external_options & PCRE2_UCP) != 0;
|
||||||
|
|
||||||
for (;;)
|
for (;;)
|
||||||
{
|
{
|
||||||
|
@ -1126,10 +1131,11 @@ for (;;)
|
||||||
{
|
{
|
||||||
c -= get_repeat_base(c) - OP_STAR;
|
c -= get_repeat_base(c) - OP_STAR;
|
||||||
end = (c <= OP_MINUPTO) ?
|
end = (c <= OP_MINUPTO) ?
|
||||||
get_chr_property_list(code, utf, cb->fcc, list) : NULL;
|
get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL;
|
||||||
list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
|
list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
|
||||||
|
|
||||||
if (end != NULL && compare_opcodes(end, utf, cb, list, end, &rec_limit))
|
if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end,
|
||||||
|
&rec_limit))
|
||||||
{
|
{
|
||||||
switch(c)
|
switch(c)
|
||||||
{
|
{
|
||||||
|
@ -1181,11 +1187,11 @@ for (;;)
|
||||||
if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
|
if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
|
||||||
{
|
{
|
||||||
/* end must not be NULL. */
|
/* end must not be NULL. */
|
||||||
end = get_chr_property_list(code, utf, cb->fcc, list);
|
end = get_chr_property_list(code, utf, ucp, cb->fcc, list);
|
||||||
|
|
||||||
list[1] = (c & 1) == 0;
|
list[1] = (c & 1) == 0;
|
||||||
|
|
||||||
if (compare_opcodes(end, utf, cb, list, end, &rec_limit))
|
if (compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit))
|
||||||
{
|
{
|
||||||
switch (c)
|
switch (c)
|
||||||
{
|
{
|
||||||
|
|
|
@ -4904,7 +4904,7 @@ range. */
|
||||||
if ((options & PCRE2_CASELESS) != 0)
|
if ((options & PCRE2_CASELESS) != 0)
|
||||||
{
|
{
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
if ((options & PCRE2_UTF) != 0)
|
if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
|
||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
uint32_t oc, od;
|
uint32_t oc, od;
|
||||||
|
@ -5319,7 +5319,8 @@ dynamically as we process the pattern. */
|
||||||
|
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
BOOL utf = (options & PCRE2_UTF) != 0;
|
BOOL utf = (options & PCRE2_UTF) != 0;
|
||||||
#else /* No UTF support */
|
BOOL ucp = (options & PCRE2_UCP) != 0;
|
||||||
|
#else /* No Unicode support */
|
||||||
BOOL utf = FALSE;
|
BOOL utf = FALSE;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -5602,7 +5603,7 @@ for (;; pptr++)
|
||||||
uint32_t d;
|
uint32_t d;
|
||||||
|
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
if (utf && c > 127) d = UCD_OTHERCASE(c); else
|
if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
#if PCRE2_CODE_UNIT_WIDTH != 8
|
#if PCRE2_CODE_UNIT_WIDTH != 8
|
||||||
|
@ -9632,6 +9633,7 @@ pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
|
||||||
int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
|
int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
|
||||||
{
|
{
|
||||||
BOOL utf; /* Set TRUE for UTF mode */
|
BOOL utf; /* Set TRUE for UTF mode */
|
||||||
|
BOOL ucp; /* Set TRUE for UCP mode */
|
||||||
BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */
|
BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */
|
||||||
BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */
|
BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */
|
||||||
pcre2_real_code *re = NULL; /* What we will return */
|
pcre2_real_code *re = NULL; /* What we will return */
|
||||||
|
@ -9919,8 +9921,8 @@ if (utf)
|
||||||
|
|
||||||
/* Check UCP lockout. */
|
/* Check UCP lockout. */
|
||||||
|
|
||||||
if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) ==
|
ucp = (cb.external_options & PCRE2_UCP) != 0;
|
||||||
(PCRE2_UCP|PCRE2_NEVER_UCP))
|
if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
|
||||||
{
|
{
|
||||||
errorcode = ERR75;
|
errorcode = ERR75;
|
||||||
goto HAD_EARLY_ERROR;
|
goto HAD_EARLY_ERROR;
|
||||||
|
@ -10296,7 +10298,7 @@ function call. */
|
||||||
if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
|
if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
|
||||||
{
|
{
|
||||||
PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
|
PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
|
||||||
if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80;
|
if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Failed to compile, or error while post-processing. */
|
/* Failed to compile, or error while post-processing. */
|
||||||
|
@ -10344,21 +10346,25 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
|
||||||
|
|
||||||
if ((firstcuflags & REQ_CASELESS) != 0)
|
if ((firstcuflags & REQ_CASELESS) != 0)
|
||||||
{
|
{
|
||||||
if (firstcu < 128 || (!utf && firstcu < 255))
|
if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
|
||||||
{
|
{
|
||||||
if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
|
if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* The first code unit is > 128 in UTF mode, or > 255 otherwise. In
|
/* The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise.
|
||||||
8-bit UTF mode, codepoints in the range 128-255 are introductory code
|
In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
|
||||||
points and cannot have another case. In 16-bit and 32-bit modes, we can
|
points and cannot have another case, but if UCP is set they may do. */
|
||||||
check wide characters when UTF (and therefore UCP) is supported. */
|
|
||||||
|
|
||||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
|
#ifdef SUPPORT_UNICODE
|
||||||
else if (firstcu <= MAX_UTF_CODE_POINT &&
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
|
else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
|
||||||
|
re->flags |= PCRE2_FIRSTCASELESS;
|
||||||
|
#else
|
||||||
|
else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
|
||||||
UCD_OTHERCASE(firstcu) != firstcu)
|
UCD_OTHERCASE(firstcu) != firstcu)
|
||||||
re->flags |= PCRE2_FIRSTCASELESS;
|
re->flags |= PCRE2_FIRSTCASELESS;
|
||||||
#endif
|
#endif
|
||||||
|
#endif /* SUPPORT_UNICODE */
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -10407,14 +10413,20 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
|
||||||
|
|
||||||
if ((reqcuflags & REQ_CASELESS) != 0)
|
if ((reqcuflags & REQ_CASELESS) != 0)
|
||||||
{
|
{
|
||||||
if (reqcu < 128 || (!utf && reqcu < 255))
|
if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
|
||||||
{
|
{
|
||||||
if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
|
if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
|
||||||
}
|
}
|
||||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
|
#ifdef SUPPORT_UNICODE
|
||||||
else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
re->flags |= PCRE2_LASTCASELESS;
|
else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
|
||||||
|
re->flags |= PCRE2_LASTCASELESS;
|
||||||
|
#else
|
||||||
|
else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
|
||||||
|
UCD_OTHERCASE(reqcu) != reqcu)
|
||||||
|
re->flags |= PCRE2_LASTCASELESS;
|
||||||
#endif
|
#endif
|
||||||
|
#endif /* SUPPORT_UNICODE */
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||||||
|
|
||||||
Written by Philip Hazel
|
Written by Philip Hazel
|
||||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||||
New API code Copyright (c) 2016-2019 University of Cambridge
|
New API code Copyright (c) 2016-2020 University of Cambridge
|
||||||
|
|
||||||
-----------------------------------------------------------------------------
|
-----------------------------------------------------------------------------
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
@ -548,6 +548,7 @@ PCRE2_SPTR start_code = mb->start_code;
|
||||||
|
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
|
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
|
||||||
|
BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;
|
||||||
#else
|
#else
|
||||||
BOOL utf = FALSE;
|
BOOL utf = FALSE;
|
||||||
#endif
|
#endif
|
||||||
|
@ -2190,7 +2191,7 @@ for (;;)
|
||||||
if (clen == 0) break;
|
if (clen == 0) break;
|
||||||
|
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
if (utf)
|
if (utf_or_ucp)
|
||||||
{
|
{
|
||||||
if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
|
if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
|
||||||
{
|
{
|
||||||
|
@ -2204,7 +2205,7 @@ for (;;)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
#endif /* SUPPORT_UNICODE */
|
#endif /* SUPPORT_UNICODE */
|
||||||
/* Not UTF mode */
|
/* Not UTF or UCP mode */
|
||||||
{
|
{
|
||||||
if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
|
if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
|
||||||
{ ADD_NEW(state_offset + 2, 0); }
|
{ ADD_NEW(state_offset + 2, 0); }
|
||||||
|
@ -2339,7 +2340,7 @@ for (;;)
|
||||||
{
|
{
|
||||||
uint32_t otherd;
|
uint32_t otherd;
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
if (utf && d >= 128)
|
if (utf_or_ucp && d >= 128)
|
||||||
otherd = UCD_OTHERCASE(d);
|
otherd = UCD_OTHERCASE(d);
|
||||||
else
|
else
|
||||||
#endif /* SUPPORT_UNICODE */
|
#endif /* SUPPORT_UNICODE */
|
||||||
|
@ -2374,7 +2375,7 @@ for (;;)
|
||||||
if (caseless)
|
if (caseless)
|
||||||
{
|
{
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
if (utf && d >= 128)
|
if (utf_or_ucp && d >= 128)
|
||||||
otherd = UCD_OTHERCASE(d);
|
otherd = UCD_OTHERCASE(d);
|
||||||
else
|
else
|
||||||
#endif /* SUPPORT_UNICODE */
|
#endif /* SUPPORT_UNICODE */
|
||||||
|
@ -2417,7 +2418,7 @@ for (;;)
|
||||||
if (caseless)
|
if (caseless)
|
||||||
{
|
{
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
if (utf && d >= 128)
|
if (utf_or_ucp && d >= 128)
|
||||||
otherd = UCD_OTHERCASE(d);
|
otherd = UCD_OTHERCASE(d);
|
||||||
else
|
else
|
||||||
#endif /* SUPPORT_UNICODE */
|
#endif /* SUPPORT_UNICODE */
|
||||||
|
@ -2458,7 +2459,7 @@ for (;;)
|
||||||
if (caseless)
|
if (caseless)
|
||||||
{
|
{
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
if (utf && d >= 128)
|
if (utf_or_ucp && d >= 128)
|
||||||
otherd = UCD_OTHERCASE(d);
|
otherd = UCD_OTHERCASE(d);
|
||||||
else
|
else
|
||||||
#endif /* SUPPORT_UNICODE */
|
#endif /* SUPPORT_UNICODE */
|
||||||
|
@ -2491,7 +2492,7 @@ for (;;)
|
||||||
if (caseless)
|
if (caseless)
|
||||||
{
|
{
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
if (utf && d >= 128)
|
if (utf_or_ucp && d >= 128)
|
||||||
otherd = UCD_OTHERCASE(d);
|
otherd = UCD_OTHERCASE(d);
|
||||||
else
|
else
|
||||||
#endif /* SUPPORT_UNICODE */
|
#endif /* SUPPORT_UNICODE */
|
||||||
|
@ -2531,7 +2532,7 @@ for (;;)
|
||||||
if (caseless)
|
if (caseless)
|
||||||
{
|
{
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
if (utf && d >= 128)
|
if (utf_or_ucp && d >= 128)
|
||||||
otherd = UCD_OTHERCASE(d);
|
otherd = UCD_OTHERCASE(d);
|
||||||
else
|
else
|
||||||
#endif /* SUPPORT_UNICODE */
|
#endif /* SUPPORT_UNICODE */
|
||||||
|
@ -3526,10 +3527,15 @@ if ((re->flags & PCRE2_FIRSTSET) != 0)
|
||||||
if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
|
if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
|
||||||
{
|
{
|
||||||
first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
|
first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
|
||||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
|
#ifdef SUPPORT_UNICODE
|
||||||
if (utf && first_cu > 127)
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
|
if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
|
||||||
first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
|
first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
|
||||||
#endif
|
#else
|
||||||
|
if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
|
||||||
|
first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
|
||||||
|
#endif
|
||||||
|
#endif /* SUPPORT_UNICODE */
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -3545,9 +3551,15 @@ if ((re->flags & PCRE2_LASTSET) != 0)
|
||||||
if ((re->flags & PCRE2_LASTCASELESS) != 0)
|
if ((re->flags & PCRE2_LASTCASELESS) != 0)
|
||||||
{
|
{
|
||||||
req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
|
req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
|
||||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
|
#ifdef SUPPORT_UNICODE
|
||||||
if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
|
if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
|
||||||
|
req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
|
||||||
|
#else
|
||||||
|
if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
|
||||||
|
req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
|
||||||
#endif
|
#endif
|
||||||
|
#endif /* SUPPORT_UNICODE */
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1952,7 +1952,7 @@ is available. */
|
||||||
#define _pcre2_was_newline PCRE2_SUFFIX(_pcre2_was_newline_)
|
#define _pcre2_was_newline PCRE2_SUFFIX(_pcre2_was_newline_)
|
||||||
#define _pcre2_xclass PCRE2_SUFFIX(_pcre2_xclass_)
|
#define _pcre2_xclass PCRE2_SUFFIX(_pcre2_xclass_)
|
||||||
|
|
||||||
extern int _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL,
|
extern int _pcre2_auto_possessify(PCRE2_UCHAR *,
|
||||||
const compile_block *);
|
const compile_block *);
|
||||||
extern int _pcre2_check_escape(PCRE2_SPTR *, PCRE2_SPTR, uint32_t *,
|
extern int _pcre2_check_escape(PCRE2_SPTR *, PCRE2_SPTR, uint32_t *,
|
||||||
int *, uint32_t, uint32_t, BOOL, compile_block *);
|
int *, uint32_t, uint32_t, BOOL, compile_block *);
|
||||||
|
|
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||||||
|
|
||||||
Written by Philip Hazel
|
Written by Philip Hazel
|
||||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||||
New API code Copyright (c) 2015-2019 University of Cambridge
|
New API code Copyright (c) 2015-2020 University of Cambridge
|
||||||
|
|
||||||
-----------------------------------------------------------------------------
|
-----------------------------------------------------------------------------
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
@ -598,12 +598,13 @@ BOOL condition; /* Used in conditional groups */
|
||||||
BOOL cur_is_word; /* Used in "word" tests */
|
BOOL cur_is_word; /* Used in "word" tests */
|
||||||
BOOL prev_is_word; /* Used in "word" tests */
|
BOOL prev_is_word; /* Used in "word" tests */
|
||||||
|
|
||||||
/* UTF flag */
|
/* UTF and UCP flags */
|
||||||
|
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
|
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
|
||||||
|
BOOL ucp = (mb->poptions & PCRE2_UCP) != 0;
|
||||||
#else
|
#else
|
||||||
BOOL utf = FALSE;
|
BOOL utf = FALSE; /* Required for convenience even when no Unicode support */
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* This is the length of the last part of a backtracking frame that must be
|
/* This is the length of the last part of a backtracking frame that must be
|
||||||
|
@ -928,6 +929,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Not UTF mode */
|
/* Not UTF mode */
|
||||||
{
|
{
|
||||||
if (mb->end_subject - Feptr < 1)
|
if (mb->end_subject - Feptr < 1)
|
||||||
|
@ -987,10 +989,30 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||||
if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
|
if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* If UCP is set without UTF we must do the same as above, but with one
|
||||||
|
character per code unit. */
|
||||||
|
|
||||||
|
else if (ucp)
|
||||||
|
{
|
||||||
|
uint32_t cc = UCHAR21(Feptr);
|
||||||
|
fc = Fecode[1];
|
||||||
|
if (fc < 128)
|
||||||
|
{
|
||||||
|
if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
|
||||||
|
}
|
||||||
|
Feptr++;
|
||||||
|
Fecode += 2;
|
||||||
|
}
|
||||||
|
|
||||||
else
|
else
|
||||||
#endif /* SUPPORT_UNICODE */
|
#endif /* SUPPORT_UNICODE */
|
||||||
|
|
||||||
/* Not UTF mode; use the table for characters < 256. */
|
/* Not UTF or UCP mode; use the table for characters < 256. */
|
||||||
{
|
{
|
||||||
if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1])
|
if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1])
|
||||||
!= TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH);
|
!= TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH);
|
||||||
|
@ -1010,6 +1032,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||||
SCHECK_PARTIAL();
|
SCHECK_PARTIAL();
|
||||||
RRETURN(MATCH_NOMATCH);
|
RRETURN(MATCH_NOMATCH);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
if (utf)
|
if (utf)
|
||||||
{
|
{
|
||||||
|
@ -1026,15 +1049,42 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||||
if (ch > 127)
|
if (ch > 127)
|
||||||
ch = UCD_OTHERCASE(ch);
|
ch = UCD_OTHERCASE(ch);
|
||||||
else
|
else
|
||||||
ch = TABLE_GET(ch, mb->fcc, ch);
|
ch = (mb->fcc)[ch];
|
||||||
if (ch == fc) RRETURN(MATCH_NOMATCH);
|
if (ch == fc) RRETURN(MATCH_NOMATCH);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* UCP without UTF is as above, but with one character per code unit. */
|
||||||
|
|
||||||
|
else if (ucp)
|
||||||
|
{
|
||||||
|
uint32_t ch;
|
||||||
|
fc = UCHAR21INC(Feptr);
|
||||||
|
ch = Fecode[1];
|
||||||
|
Fecode += 2;
|
||||||
|
|
||||||
|
if (ch == fc)
|
||||||
|
{
|
||||||
|
RRETURN(MATCH_NOMATCH); /* Caseful match */
|
||||||
|
}
|
||||||
|
else if (Fop == OP_NOTI) /* If caseless */
|
||||||
|
{
|
||||||
|
if (ch > 127)
|
||||||
|
ch = UCD_OTHERCASE(ch);
|
||||||
|
else
|
||||||
|
ch = (mb->fcc)[ch];
|
||||||
|
if (ch == fc) RRETURN(MATCH_NOMATCH);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
else
|
else
|
||||||
#endif /* SUPPORT_UNICODE */
|
#endif /* SUPPORT_UNICODE */
|
||||||
|
|
||||||
|
/* Neither UTF nor UCP is set */
|
||||||
|
|
||||||
{
|
{
|
||||||
uint32_t ch = Fecode[1];
|
uint32_t ch = Fecode[1];
|
||||||
fc = *Feptr++;
|
fc = UCHAR21INC(Feptr);
|
||||||
if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc))
|
if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc))
|
||||||
RRETURN(MATCH_NOMATCH);
|
RRETURN(MATCH_NOMATCH);
|
||||||
Fecode += 2;
|
Fecode += 2;
|
||||||
|
@ -1244,7 +1294,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||||
#endif /* SUPPORT_UNICODE */
|
#endif /* SUPPORT_UNICODE */
|
||||||
|
|
||||||
/* When not in UTF mode, load a single-code-unit character. Then proceed as
|
/* When not in UTF mode, load a single-code-unit character. Then proceed as
|
||||||
above. */
|
above, using Unicode casing if either UTF or UCP is set. */
|
||||||
|
|
||||||
Lc = *Fecode++;
|
Lc = *Fecode++;
|
||||||
|
|
||||||
|
@ -1253,11 +1303,15 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||||
if (Fop >= OP_STARI)
|
if (Fop >= OP_STARI)
|
||||||
{
|
{
|
||||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
/* Lc must be < 128 in UTF-8 mode. */
|
#ifdef SUPPORT_UNICODE
|
||||||
|
if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
|
||||||
|
else
|
||||||
|
#endif /* SUPPORT_UNICODE */
|
||||||
|
/* Lc will be < 128 in UTF-8 mode. */
|
||||||
Loc = mb->fcc[Lc];
|
Loc = mb->fcc[Lc];
|
||||||
#else /* 16-bit & 32-bit */
|
#else /* 16-bit & 32-bit */
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
if (utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
|
if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc);
|
||||||
else
|
else
|
||||||
#endif /* SUPPORT_UNICODE */
|
#endif /* SUPPORT_UNICODE */
|
||||||
Loc = TABLE_GET(Lc, mb->fcc, Lc);
|
Loc = TABLE_GET(Lc, mb->fcc, Lc);
|
||||||
|
@ -1490,7 +1544,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||||
if (Fop >= OP_NOTSTARI) /* Caseless */
|
if (Fop >= OP_NOTSTARI) /* Caseless */
|
||||||
{
|
{
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
if (utf && Lc > 127)
|
if ((utf || ucp) && Lc > 127)
|
||||||
Loc = UCD_OTHERCASE(Lc);
|
Loc = UCD_OTHERCASE(Lc);
|
||||||
else
|
else
|
||||||
#endif /* SUPPORT_UNICODE */
|
#endif /* SUPPORT_UNICODE */
|
||||||
|
@ -6045,7 +6099,6 @@ BOOL firstline;
|
||||||
BOOL has_first_cu = FALSE;
|
BOOL has_first_cu = FALSE;
|
||||||
BOOL has_req_cu = FALSE;
|
BOOL has_req_cu = FALSE;
|
||||||
BOOL startline;
|
BOOL startline;
|
||||||
BOOL utf;
|
|
||||||
|
|
||||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
BOOL memchr_not_found_first_cu = FALSE;
|
BOOL memchr_not_found_first_cu = FALSE;
|
||||||
|
@ -6069,13 +6122,19 @@ PCRE2_SPTR match_partial;
|
||||||
BOOL use_jit;
|
BOOL use_jit;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* This flag is needed even when Unicode is not supported for convenience
|
||||||
|
(it is used by the IS_NEWLINE macro). */
|
||||||
|
|
||||||
|
BOOL utf = FALSE;
|
||||||
|
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
|
BOOL ucp = FALSE;
|
||||||
BOOL allow_invalid;
|
BOOL allow_invalid;
|
||||||
uint32_t fragment_options = 0;
|
uint32_t fragment_options = 0;
|
||||||
#ifdef SUPPORT_JIT
|
#ifdef SUPPORT_JIT
|
||||||
BOOL jit_checked_utf = FALSE;
|
BOOL jit_checked_utf = FALSE;
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif /* SUPPORT_UNICODE */
|
||||||
|
|
||||||
PCRE2_SIZE frame_size;
|
PCRE2_SIZE frame_size;
|
||||||
|
|
||||||
|
@ -6147,12 +6206,13 @@ use_jit = (re->executable_jit != NULL &&
|
||||||
(options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0);
|
(options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Initialize UTF parameters. */
|
/* Initialize UTF/UCP parameters. */
|
||||||
|
|
||||||
utf = (re->overall_options & PCRE2_UTF) != 0;
|
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
|
utf = (re->overall_options & PCRE2_UTF) != 0;
|
||||||
allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0;
|
allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0;
|
||||||
#endif
|
ucp = (re->overall_options & PCRE2_UCP) != 0;
|
||||||
|
#endif /* SUPPORT_UNICODE */
|
||||||
|
|
||||||
/* Convert the partial matching flags into an integer. */
|
/* Convert the partial matching flags into an integer. */
|
||||||
|
|
||||||
|
@ -6589,9 +6649,13 @@ if ((re->flags & PCRE2_FIRSTSET) != 0)
|
||||||
if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
|
if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
|
||||||
{
|
{
|
||||||
first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
|
first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
|
||||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
|
#ifdef SUPPORT_UNICODE
|
||||||
if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
|
if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu);
|
||||||
|
#else
|
||||||
|
if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu);
|
||||||
#endif
|
#endif
|
||||||
|
#endif /* SUPPORT_UNICODE */
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -6607,9 +6671,13 @@ if ((re->flags & PCRE2_LASTSET) != 0)
|
||||||
if ((re->flags & PCRE2_LASTCASELESS) != 0)
|
if ((re->flags & PCRE2_LASTCASELESS) != 0)
|
||||||
{
|
{
|
||||||
req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);
|
req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);
|
||||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
|
#ifdef SUPPORT_UNICODE
|
||||||
if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu);
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
|
if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu);
|
||||||
|
#else
|
||||||
|
if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu);
|
||||||
#endif
|
#endif
|
||||||
|
#endif /* SUPPORT_UNICODE */
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6756,15 +6824,16 @@ for(;;)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If we can't find the required code unit, having reached the true end
|
/* If we can't find the required first code unit, having reached the
|
||||||
of the subject, break the bumpalong loop, to force a match failure,
|
true end of the subject, break the bumpalong loop, to force a match
|
||||||
except when doing partial matching, when we let the next cycle run at
|
failure, except when doing partial matching, when we let the next cycle
|
||||||
the end of the subject. To see why, consider the pattern /(?<=abc)def/,
|
run at the end of the subject. To see why, consider the pattern
|
||||||
which partially matches "abc", even though the string does not contain
|
/(?<=abc)def/, which partially matches "abc", even though the string
|
||||||
the starting character "d". If we have not reached the true end of the
|
does not contain the starting character "d". If we have not reached the
|
||||||
subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
|
true end of the subject (PCRE2_FIRSTLINE caused end_subject to be
|
||||||
we also let the cycle run, because the matching string is legitimately
|
temporarily modified) we also let the cycle run, because the matching
|
||||||
allowed to start with the first code unit of a newline. */
|
string is legitimately allowed to start with the first code unit of a
|
||||||
|
newline. */
|
||||||
|
|
||||||
if (mb->partial == 0 && start_match >= mb->end_subject)
|
if (mb->partial == 0 && start_match >= mb->end_subject)
|
||||||
{
|
{
|
||||||
|
|
|
@ -772,15 +772,19 @@ Arguments:
|
||||||
p points to the first code unit of the character
|
p points to the first code unit of the character
|
||||||
caseless TRUE if caseless
|
caseless TRUE if caseless
|
||||||
utf TRUE for UTF mode
|
utf TRUE for UTF mode
|
||||||
|
ucp TRUE for UCP mode
|
||||||
|
|
||||||
Returns: pointer after the character
|
Returns: pointer after the character
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static PCRE2_SPTR
|
static PCRE2_SPTR
|
||||||
set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf)
|
set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf,
|
||||||
|
BOOL ucp)
|
||||||
{
|
{
|
||||||
uint32_t c = *p++; /* First code unit */
|
uint32_t c = *p++; /* First code unit */
|
||||||
(void)utf; /* Stop compiler warning when UTF not supported */
|
|
||||||
|
(void)utf; /* Stop compiler warnings when UTF not supported */
|
||||||
|
(void)ucp;
|
||||||
|
|
||||||
/* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
|
/* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
|
||||||
0xff. */
|
0xff. */
|
||||||
|
@ -810,22 +814,26 @@ if (utf)
|
||||||
if (caseless)
|
if (caseless)
|
||||||
{
|
{
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
if (utf)
|
if (utf || ucp)
|
||||||
{
|
{
|
||||||
|
c = UCD_OTHERCASE(c);
|
||||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
PCRE2_UCHAR buff[6];
|
if (utf)
|
||||||
c = UCD_OTHERCASE(c);
|
{
|
||||||
(void)PRIV(ord2utf)(c, buff);
|
PCRE2_UCHAR buff[6];
|
||||||
SET_BIT(buff[0]);
|
(void)PRIV(ord2utf)(c, buff);
|
||||||
|
SET_BIT(buff[0]);
|
||||||
|
}
|
||||||
|
else SET_BIT(c);
|
||||||
#else /* 16-bit or 32-bit mode */
|
#else /* 16-bit or 32-bit mode */
|
||||||
c = UCD_OTHERCASE(c);
|
|
||||||
if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
|
if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
else
|
else
|
||||||
#endif /* SUPPORT_UNICODE */
|
#endif /* SUPPORT_UNICODE */
|
||||||
|
|
||||||
/* Not UTF */
|
/* Not UTF or UCP */
|
||||||
|
|
||||||
if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
|
if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
|
||||||
}
|
}
|
||||||
|
@ -931,6 +939,7 @@ Arguments:
|
||||||
re points to the compiled regex block
|
re points to the compiled regex block
|
||||||
code points to an expression
|
code points to an expression
|
||||||
utf TRUE if in UTF mode
|
utf TRUE if in UTF mode
|
||||||
|
ucp TRUE if in UCP mode
|
||||||
depthptr pointer to recurse depth
|
depthptr pointer to recurse depth
|
||||||
|
|
||||||
Returns: SSB_FAIL => Failed to find any starting code units
|
Returns: SSB_FAIL => Failed to find any starting code units
|
||||||
|
@ -941,7 +950,8 @@ Returns: SSB_FAIL => Failed to find any starting code units
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static int
|
static int
|
||||||
set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, int *depthptr)
|
set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
|
||||||
|
int *depthptr)
|
||||||
{
|
{
|
||||||
uint32_t c;
|
uint32_t c;
|
||||||
int yield = SSB_DONE;
|
int yield = SSB_DONE;
|
||||||
|
@ -1111,7 +1121,7 @@ do
|
||||||
case OP_SCRIPT_RUN:
|
case OP_SCRIPT_RUN:
|
||||||
case OP_ASSERT:
|
case OP_ASSERT:
|
||||||
case OP_ASSERT_NA:
|
case OP_ASSERT_NA:
|
||||||
rc = set_start_bits(re, tcode, utf, depthptr);
|
rc = set_start_bits(re, tcode, utf, ucp, depthptr);
|
||||||
if (rc == SSB_DONE)
|
if (rc == SSB_DONE)
|
||||||
{
|
{
|
||||||
try_next = FALSE;
|
try_next = FALSE;
|
||||||
|
@ -1167,7 +1177,7 @@ do
|
||||||
case OP_BRAZERO:
|
case OP_BRAZERO:
|
||||||
case OP_BRAMINZERO:
|
case OP_BRAMINZERO:
|
||||||
case OP_BRAPOSZERO:
|
case OP_BRAPOSZERO:
|
||||||
rc = set_start_bits(re, ++tcode, utf, depthptr);
|
rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
|
||||||
if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
|
if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
|
||||||
do tcode += GET(tcode,1); while (*tcode == OP_ALT);
|
do tcode += GET(tcode,1); while (*tcode == OP_ALT);
|
||||||
tcode += 1 + LINK_SIZE;
|
tcode += 1 + LINK_SIZE;
|
||||||
|
@ -1189,7 +1199,7 @@ do
|
||||||
case OP_QUERY:
|
case OP_QUERY:
|
||||||
case OP_MINQUERY:
|
case OP_MINQUERY:
|
||||||
case OP_POSQUERY:
|
case OP_POSQUERY:
|
||||||
tcode = set_table_bit(re, tcode + 1, FALSE, utf);
|
tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case OP_STARI:
|
case OP_STARI:
|
||||||
|
@ -1198,7 +1208,7 @@ do
|
||||||
case OP_QUERYI:
|
case OP_QUERYI:
|
||||||
case OP_MINQUERYI:
|
case OP_MINQUERYI:
|
||||||
case OP_POSQUERYI:
|
case OP_POSQUERYI:
|
||||||
tcode = set_table_bit(re, tcode + 1, TRUE, utf);
|
tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
/* Single-char upto sets the bit and tries the next */
|
/* Single-char upto sets the bit and tries the next */
|
||||||
|
@ -1206,13 +1216,13 @@ do
|
||||||
case OP_UPTO:
|
case OP_UPTO:
|
||||||
case OP_MINUPTO:
|
case OP_MINUPTO:
|
||||||
case OP_POSUPTO:
|
case OP_POSUPTO:
|
||||||
tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf);
|
tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case OP_UPTOI:
|
case OP_UPTOI:
|
||||||
case OP_MINUPTOI:
|
case OP_MINUPTOI:
|
||||||
case OP_POSUPTOI:
|
case OP_POSUPTOI:
|
||||||
tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf);
|
tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
/* At least one single char sets the bit and stops */
|
/* At least one single char sets the bit and stops */
|
||||||
|
@ -1224,7 +1234,7 @@ do
|
||||||
case OP_PLUS:
|
case OP_PLUS:
|
||||||
case OP_MINPLUS:
|
case OP_MINPLUS:
|
||||||
case OP_POSPLUS:
|
case OP_POSPLUS:
|
||||||
(void)set_table_bit(re, tcode + 1, FALSE, utf);
|
(void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
|
||||||
try_next = FALSE;
|
try_next = FALSE;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -1235,7 +1245,7 @@ do
|
||||||
case OP_PLUSI:
|
case OP_PLUSI:
|
||||||
case OP_MINPLUSI:
|
case OP_MINPLUSI:
|
||||||
case OP_POSPLUSI:
|
case OP_POSPLUSI:
|
||||||
(void)set_table_bit(re, tcode + 1, TRUE, utf);
|
(void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
|
||||||
try_next = FALSE;
|
try_next = FALSE;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -1664,6 +1674,7 @@ PRIV(study)(pcre2_real_code *re)
|
||||||
int count = 0;
|
int count = 0;
|
||||||
PCRE2_UCHAR *code;
|
PCRE2_UCHAR *code;
|
||||||
BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
|
BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
|
||||||
|
BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;
|
||||||
|
|
||||||
/* Find start of compiled code */
|
/* Find start of compiled code */
|
||||||
|
|
||||||
|
@ -1677,7 +1688,7 @@ code units. */
|
||||||
if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
|
if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
|
||||||
{
|
{
|
||||||
int depth = 0;
|
int depth = 0;
|
||||||
int rc = set_start_bits(re, code, utf, &depth);
|
int rc = set_start_bits(re, code, utf, ucp, &depth);
|
||||||
if (rc == SSB_UNKNOWN) return 1;
|
if (rc == SSB_UNKNOWN) return 1;
|
||||||
|
|
||||||
/* If a list of starting code units was set up, scan the list to see if only
|
/* If a list of starting code units was set up, scan the list to see if only
|
||||||
|
@ -1695,7 +1706,7 @@ if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
|
||||||
int b = -1;
|
int b = -1;
|
||||||
uint8_t *p = re->start_bitmap;
|
uint8_t *p = re->start_bitmap;
|
||||||
uint32_t flags = PCRE2_FIRSTMAPSET;
|
uint32_t flags = PCRE2_FIRSTMAPSET;
|
||||||
|
|
||||||
for (i = 0; i < 256; p++, i += 8)
|
for (i = 0; i < 256; p++, i += 8)
|
||||||
{
|
{
|
||||||
uint8_t x = *p;
|
uint8_t x = *p;
|
||||||
|
@ -1725,27 +1736,27 @@ if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
|
||||||
}
|
}
|
||||||
|
|
||||||
/* c contains the code unit value, in the range 0-255. In 8-bit UTF
|
/* c contains the code unit value, in the range 0-255. In 8-bit UTF
|
||||||
mode, only values < 128 can be used. */
|
mode, only values < 128 can be used. In all the other cases, c is a
|
||||||
|
character value. */
|
||||||
|
|
||||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
if (c > 127) goto DONE;
|
if (utf && c > 127) goto DONE;
|
||||||
#endif
|
#endif
|
||||||
if (a < 0) a = c; /* First one found */
|
if (a < 0) a = c; /* First one found, save in a */
|
||||||
else if (b < 0) /* Second one found */
|
else if (b < 0) /* Second one found */
|
||||||
{
|
{
|
||||||
int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
|
int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
|
||||||
|
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
if (utf || ucp)
|
||||||
if (utf && UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
|
{
|
||||||
#else /* 16-bit or 32-bit */
|
if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
|
||||||
if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
|
if (c > 127) d = UCD_OTHERCASE(c);
|
||||||
if (utf && c > 127) d = UCD_OTHERCASE(c);
|
}
|
||||||
#endif /* Code width */
|
|
||||||
#endif /* SUPPORT_UNICODE */
|
#endif /* SUPPORT_UNICODE */
|
||||||
|
|
||||||
if (d != a) goto DONE; /* Not other case of a */
|
if (d != a) goto DONE; /* Not the other case of a */
|
||||||
b = c;
|
b = c; /* Save second in b */
|
||||||
}
|
}
|
||||||
else goto DONE; /* More than two characters found */
|
else goto DONE; /* More than two characters found */
|
||||||
}
|
}
|
||||||
|
|
|
@ -236,6 +236,7 @@ BOOL use_existing_match;
|
||||||
BOOL replacement_only;
|
BOOL replacement_only;
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
|
BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
|
||||||
|
BOOL ucp = (code->overall_options & PCRE2_UCP) != 0;
|
||||||
#endif
|
#endif
|
||||||
PCRE2_UCHAR temp[6];
|
PCRE2_UCHAR temp[6];
|
||||||
PCRE2_SPTR ptr;
|
PCRE2_SPTR ptr;
|
||||||
|
@ -758,7 +759,7 @@ do
|
||||||
if (forcecase != 0)
|
if (forcecase != 0)
|
||||||
{
|
{
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
if (utf)
|
if (utf || ucp)
|
||||||
{
|
{
|
||||||
uint32_t type = UCD_CHARTYPE(ch);
|
uint32_t type = UCD_CHARTYPE(ch);
|
||||||
if (PRIV(ucp_gentype)[type] == ucp_L &&
|
if (PRIV(ucp_gentype)[type] == ucp_L &&
|
||||||
|
@ -860,7 +861,7 @@ do
|
||||||
if (forcecase != 0)
|
if (forcecase != 0)
|
||||||
{
|
{
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
if (utf)
|
if (utf || ucp)
|
||||||
{
|
{
|
||||||
uint32_t type = UCD_CHARTYPE(ch);
|
uint32_t type = UCD_CHARTYPE(ch);
|
||||||
if (PRIV(ucp_gentype)[type] == ucp_L &&
|
if (PRIV(ucp_gentype)[type] == ucp_L &&
|
||||||
|
|
|
@ -570,8 +570,10 @@
|
||||||
/[\xff\x{ffff}]/I,utf
|
/[\xff\x{ffff}]/I,utf
|
||||||
|
|
||||||
/[\xff\x{ff}]/I,utf
|
/[\xff\x{ff}]/I,utf
|
||||||
|
abc\x{ff}def
|
||||||
|
|
||||||
/[\xff\x{ff}]/I
|
/[\xff\x{ff}]/I
|
||||||
|
abc\x{ff}def
|
||||||
|
|
||||||
/[Ss]/I
|
/[Ss]/I
|
||||||
|
|
||||||
|
@ -585,4 +587,31 @@
|
||||||
abc\x80\=startchar
|
abc\x80\=startchar
|
||||||
abc\x80\=startchar,offset=3
|
abc\x80\=startchar,offset=3
|
||||||
|
|
||||||
|
#subject no_jit
|
||||||
|
|
||||||
|
/\x{c1}+\x{e1}/iIB,ucp
|
||||||
|
\x{c1}\x{c1}\x{c1}
|
||||||
|
\x{e1}\x{e1}\x{e1}
|
||||||
|
|
||||||
|
/a|\x{c1}/iI,ucp
|
||||||
|
\x{e1}xxx
|
||||||
|
|
||||||
|
/a|\x{c1}/iI,utf
|
||||||
|
\x{e1}xxx
|
||||||
|
|
||||||
|
/\x{c1}|\x{e1}/iI,ucp
|
||||||
|
|
||||||
|
/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
|
||||||
|
X\x{e1}Y
|
||||||
|
|
||||||
|
/X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended
|
||||||
|
X\x{c1}Y
|
||||||
|
|
||||||
|
# Without UTF or UCP characters > 127 have only one case in the default locale.
|
||||||
|
|
||||||
|
/X(\x{e1})Y/replace=>\U$1<,substitute_extended
|
||||||
|
X\x{e1}Y
|
||||||
|
|
||||||
|
#subject
|
||||||
|
|
||||||
# End of testinput10
|
# End of testinput10
|
||||||
|
|
|
@ -463,4 +463,71 @@
|
||||||
|
|
||||||
/(?:\x{ff}|\x{3000})/I,utf
|
/(?:\x{ff}|\x{3000})/I,utf
|
||||||
|
|
||||||
|
# ----------------------------------------------------
|
||||||
|
# UCP and casing tests
|
||||||
|
|
||||||
|
/\x{120}/i,I
|
||||||
|
|
||||||
|
/\x{c1}/i,I,ucp
|
||||||
|
|
||||||
|
/[\x{120}\x{121}]/iB,ucp
|
||||||
|
|
||||||
|
/[ab\x{120}]+/iB,ucp
|
||||||
|
aABb\x{121}\x{120}
|
||||||
|
|
||||||
|
#subject no_jit
|
||||||
|
|
||||||
|
/\x{c1}/i,no_start_optimize
|
||||||
|
\= Expect no match
|
||||||
|
\x{e1}
|
||||||
|
|
||||||
|
/\x{120}\x{c1}/i,ucp,no_start_optimize
|
||||||
|
\x{121}\x{e1}
|
||||||
|
|
||||||
|
/\x{120}\x{c1}/i,ucp
|
||||||
|
\x{121}\x{e1}
|
||||||
|
|
||||||
|
/[^\x{120}]/i,no_start_optimize
|
||||||
|
\x{121}
|
||||||
|
|
||||||
|
/[^\x{120}]/i,ucp,no_start_optimize
|
||||||
|
\= Expect no match
|
||||||
|
\x{121}
|
||||||
|
|
||||||
|
/[^\x{120}]/i
|
||||||
|
\x{121}
|
||||||
|
|
||||||
|
/[^\x{120}]/i,ucp
|
||||||
|
\= Expect no match
|
||||||
|
\x{121}
|
||||||
|
|
||||||
|
/\x{120}{2}/i,ucp
|
||||||
|
\x{121}\x{121}
|
||||||
|
|
||||||
|
/[^\x{120}]{2}/i,ucp
|
||||||
|
\= Expect no match
|
||||||
|
\x{121}\x{121}
|
||||||
|
|
||||||
|
/\x{c1}+\x{e1}/iB,ucp
|
||||||
|
\x{c1}\x{c1}\x{c1}
|
||||||
|
|
||||||
|
/\x{c1}+\x{e1}/iIB,ucp
|
||||||
|
\x{c1}\x{c1}\x{c1}
|
||||||
|
\x{e1}\x{e1}\x{e1}
|
||||||
|
|
||||||
|
/a|\x{c1}/iI,ucp
|
||||||
|
\x{e1}xxx
|
||||||
|
|
||||||
|
/\x{c1}|\x{e1}/iI,ucp
|
||||||
|
|
||||||
|
/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
|
||||||
|
X\x{e1}Y
|
||||||
|
|
||||||
|
/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
|
||||||
|
X\x{121}Y
|
||||||
|
|
||||||
|
#subject
|
||||||
|
|
||||||
|
# ----------------------------------------------------
|
||||||
|
|
||||||
# End of testinput12
|
# End of testinput12
|
||||||
|
|
|
@ -1,9 +1,12 @@
|
||||||
# These test special (mostly error) UTF features of DFA matching. They are a
|
# These test special UTF and UCP features of DFA matching. The output is
|
||||||
# selection of the more comprehensive tests that are run for non-DFA matching.
|
# different for the different widths.
|
||||||
# The output is different for the different widths.
|
|
||||||
|
|
||||||
#subject dfa
|
#subject dfa
|
||||||
|
|
||||||
|
# ----------------------------------------------------
|
||||||
|
# These are a selection of the more comprehensive tests that are run for
|
||||||
|
# non-DFA matching.
|
||||||
|
|
||||||
/X/utf
|
/X/utf
|
||||||
XX\x{d800}
|
XX\x{d800}
|
||||||
XX\x{d800}\=offset=3
|
XX\x{d800}\=offset=3
|
||||||
|
@ -33,5 +36,46 @@
|
||||||
XX\xef\x80\=ph
|
XX\xef\x80\=ph
|
||||||
\xf7\=ph
|
\xf7\=ph
|
||||||
\xf7\x80\=ph
|
\xf7\x80\=ph
|
||||||
|
|
||||||
|
# ----------------------------------------------------
|
||||||
|
# UCP and casing tests - except for the first two, these will all fail in 8-bit
|
||||||
|
# mode because they are testing UCP without UTF and use characters > 255.
|
||||||
|
|
||||||
|
/\x{c1}/i,no_start_optimize
|
||||||
|
\= Expect no match
|
||||||
|
\x{e1}
|
||||||
|
|
||||||
|
/\x{c1}+\x{e1}/iB,ucp
|
||||||
|
\x{c1}\x{c1}\x{c1}
|
||||||
|
\x{e1}\x{e1}\x{e1}
|
||||||
|
|
||||||
|
/\x{120}\x{c1}/i,ucp,no_start_optimize
|
||||||
|
\x{121}\x{e1}
|
||||||
|
|
||||||
|
/\x{120}\x{c1}/i,ucp
|
||||||
|
\x{121}\x{e1}
|
||||||
|
|
||||||
|
/[^\x{120}]/i,no_start_optimize
|
||||||
|
\x{121}
|
||||||
|
|
||||||
|
/[^\x{120}]/i,ucp,no_start_optimize
|
||||||
|
\= Expect no match
|
||||||
|
\x{121}
|
||||||
|
|
||||||
|
/[^\x{120}]/i
|
||||||
|
\x{121}
|
||||||
|
|
||||||
|
/[^\x{120}]/i,ucp
|
||||||
|
\= Expect no match
|
||||||
|
\x{121}
|
||||||
|
|
||||||
|
/\x{120}{2}/i,ucp
|
||||||
|
\x{121}\x{121}
|
||||||
|
|
||||||
|
/[^\x{120}]{2}/i,ucp
|
||||||
|
\= Expect no match
|
||||||
|
\x{121}\x{121}
|
||||||
|
|
||||||
|
# ----------------------------------------------------
|
||||||
|
|
||||||
# End of testinput14
|
# End of testinput14
|
||||||
|
|
|
@ -1780,11 +1780,15 @@ Capture group count = 0
|
||||||
Options: utf
|
Options: utf
|
||||||
Starting code units: \xc3
|
Starting code units: \xc3
|
||||||
Subject length lower bound = 1
|
Subject length lower bound = 1
|
||||||
|
abc\x{ff}def
|
||||||
|
0: \x{ff}
|
||||||
|
|
||||||
/[\xff\x{ff}]/I
|
/[\xff\x{ff}]/I
|
||||||
Capture group count = 0
|
Capture group count = 0
|
||||||
Starting code units: \xff
|
First code unit = \xff
|
||||||
Subject length lower bound = 1
|
Subject length lower bound = 1
|
||||||
|
abc\x{ff}def
|
||||||
|
0: \xff
|
||||||
|
|
||||||
/[Ss]/I
|
/[Ss]/I
|
||||||
Capture group count = 0
|
Capture group count = 0
|
||||||
|
@ -1813,4 +1817,62 @@ Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 3
|
||||||
abc\x80\=startchar,offset=3
|
abc\x80\=startchar,offset=3
|
||||||
Error -36 (bad UTF-8 offset)
|
Error -36 (bad UTF-8 offset)
|
||||||
|
|
||||||
|
#subject no_jit
|
||||||
|
|
||||||
|
/\x{c1}+\x{e1}/iIB,ucp
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
/i \x{c1}+
|
||||||
|
/i \x{e1}
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Capture group count = 0
|
||||||
|
Options: caseless ucp
|
||||||
|
First code unit = \xc1 (caseless)
|
||||||
|
Last code unit = \xe1 (caseless)
|
||||||
|
Subject length lower bound = 2
|
||||||
|
\x{c1}\x{c1}\x{c1}
|
||||||
|
0: \xc1\xc1\xc1
|
||||||
|
\x{e1}\x{e1}\x{e1}
|
||||||
|
0: \xe1\xe1\xe1
|
||||||
|
|
||||||
|
/a|\x{c1}/iI,ucp
|
||||||
|
Capture group count = 0
|
||||||
|
Options: caseless ucp
|
||||||
|
Starting code units: A a \xc1 \xe1
|
||||||
|
Subject length lower bound = 1
|
||||||
|
\x{e1}xxx
|
||||||
|
0: \xe1
|
||||||
|
|
||||||
|
/a|\x{c1}/iI,utf
|
||||||
|
Capture group count = 0
|
||||||
|
Options: caseless utf
|
||||||
|
Starting code units: A a \xc3
|
||||||
|
Subject length lower bound = 1
|
||||||
|
\x{e1}xxx
|
||||||
|
0: \x{e1}
|
||||||
|
|
||||||
|
/\x{c1}|\x{e1}/iI,ucp
|
||||||
|
Capture group count = 0
|
||||||
|
Options: caseless ucp
|
||||||
|
First code unit = \xc1 (caseless)
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
|
||||||
|
X\x{e1}Y
|
||||||
|
1: >\xc1<
|
||||||
|
|
||||||
|
/X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended
|
||||||
|
X\x{c1}Y
|
||||||
|
1: >\xe1<
|
||||||
|
|
||||||
|
# Without UTF or UCP characters > 127 have only one case in the default locale.
|
||||||
|
|
||||||
|
/X(\x{e1})Y/replace=>\U$1<,substitute_extended
|
||||||
|
X\x{e1}Y
|
||||||
|
1: >\xe1<
|
||||||
|
|
||||||
|
#subject
|
||||||
|
|
||||||
# End of testinput10
|
# End of testinput10
|
||||||
|
|
|
@ -1613,7 +1613,7 @@ Subject length lower bound = 1
|
||||||
|
|
||||||
/[Ss]/I
|
/[Ss]/I
|
||||||
Capture group count = 0
|
Capture group count = 0
|
||||||
Starting code units: S s
|
First code unit = 'S' (caseless)
|
||||||
Subject length lower bound = 1
|
Subject length lower bound = 1
|
||||||
|
|
||||||
/[Ss]/I,utf
|
/[Ss]/I,utf
|
||||||
|
@ -1628,4 +1628,134 @@ Options: utf
|
||||||
Starting code units: \xff
|
Starting code units: \xff
|
||||||
Subject length lower bound = 1
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
# ----------------------------------------------------
|
||||||
|
# UCP and casing tests
|
||||||
|
|
||||||
|
/\x{120}/i,I
|
||||||
|
Capture group count = 0
|
||||||
|
Options: caseless
|
||||||
|
First code unit = \x{120}
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/\x{c1}/i,I,ucp
|
||||||
|
Capture group count = 0
|
||||||
|
Options: caseless ucp
|
||||||
|
First code unit = \xc1 (caseless)
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/[\x{120}\x{121}]/iB,ucp
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
/i \x{120}
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/[ab\x{120}]+/iB,ucp
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
[ABab\x{120}-\x{121}]++
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
aABb\x{121}\x{120}
|
||||||
|
0: aABb\x{121}\x{120}
|
||||||
|
|
||||||
|
#subject no_jit
|
||||||
|
|
||||||
|
/\x{c1}/i,no_start_optimize
|
||||||
|
\= Expect no match
|
||||||
|
\x{e1}
|
||||||
|
No match
|
||||||
|
|
||||||
|
/\x{120}\x{c1}/i,ucp,no_start_optimize
|
||||||
|
\x{121}\x{e1}
|
||||||
|
0: \x{121}\xe1
|
||||||
|
|
||||||
|
/\x{120}\x{c1}/i,ucp
|
||||||
|
\x{121}\x{e1}
|
||||||
|
0: \x{121}\xe1
|
||||||
|
|
||||||
|
/[^\x{120}]/i,no_start_optimize
|
||||||
|
\x{121}
|
||||||
|
0: \x{121}
|
||||||
|
|
||||||
|
/[^\x{120}]/i,ucp,no_start_optimize
|
||||||
|
\= Expect no match
|
||||||
|
\x{121}
|
||||||
|
No match
|
||||||
|
|
||||||
|
/[^\x{120}]/i
|
||||||
|
\x{121}
|
||||||
|
0: \x{121}
|
||||||
|
|
||||||
|
/[^\x{120}]/i,ucp
|
||||||
|
\= Expect no match
|
||||||
|
\x{121}
|
||||||
|
No match
|
||||||
|
|
||||||
|
/\x{120}{2}/i,ucp
|
||||||
|
\x{121}\x{121}
|
||||||
|
0: \x{121}\x{121}
|
||||||
|
|
||||||
|
/[^\x{120}]{2}/i,ucp
|
||||||
|
\= Expect no match
|
||||||
|
\x{121}\x{121}
|
||||||
|
No match
|
||||||
|
|
||||||
|
/\x{c1}+\x{e1}/iB,ucp
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
/i \x{c1}+
|
||||||
|
/i \x{e1}
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
\x{c1}\x{c1}\x{c1}
|
||||||
|
0: \xc1\xc1\xc1
|
||||||
|
|
||||||
|
/\x{c1}+\x{e1}/iIB,ucp
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
/i \x{c1}+
|
||||||
|
/i \x{e1}
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Capture group count = 0
|
||||||
|
Options: caseless ucp
|
||||||
|
First code unit = \xc1 (caseless)
|
||||||
|
Last code unit = \xe1 (caseless)
|
||||||
|
Subject length lower bound = 2
|
||||||
|
\x{c1}\x{c1}\x{c1}
|
||||||
|
0: \xc1\xc1\xc1
|
||||||
|
\x{e1}\x{e1}\x{e1}
|
||||||
|
0: \xe1\xe1\xe1
|
||||||
|
|
||||||
|
/a|\x{c1}/iI,ucp
|
||||||
|
Capture group count = 0
|
||||||
|
Options: caseless ucp
|
||||||
|
Starting code units: A a \xc1 \xe1
|
||||||
|
Subject length lower bound = 1
|
||||||
|
\x{e1}xxx
|
||||||
|
0: \xe1
|
||||||
|
|
||||||
|
/\x{c1}|\x{e1}/iI,ucp
|
||||||
|
Capture group count = 0
|
||||||
|
Options: caseless ucp
|
||||||
|
First code unit = \xc1 (caseless)
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
|
||||||
|
X\x{e1}Y
|
||||||
|
1: >\xc1<
|
||||||
|
|
||||||
|
/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
|
||||||
|
X\x{121}Y
|
||||||
|
1: >\x{120}<
|
||||||
|
|
||||||
|
#subject
|
||||||
|
|
||||||
|
# ----------------------------------------------------
|
||||||
|
|
||||||
# End of testinput12
|
# End of testinput12
|
||||||
|
|
|
@ -1611,7 +1611,7 @@ Subject length lower bound = 1
|
||||||
|
|
||||||
/[Ss]/I
|
/[Ss]/I
|
||||||
Capture group count = 0
|
Capture group count = 0
|
||||||
Starting code units: S s
|
First code unit = 'S' (caseless)
|
||||||
Subject length lower bound = 1
|
Subject length lower bound = 1
|
||||||
|
|
||||||
/[Ss]/I,utf
|
/[Ss]/I,utf
|
||||||
|
@ -1626,4 +1626,134 @@ Options: utf
|
||||||
Starting code units: \xff
|
Starting code units: \xff
|
||||||
Subject length lower bound = 1
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
# ----------------------------------------------------
|
||||||
|
# UCP and casing tests
|
||||||
|
|
||||||
|
/\x{120}/i,I
|
||||||
|
Capture group count = 0
|
||||||
|
Options: caseless
|
||||||
|
First code unit = \x{120}
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/\x{c1}/i,I,ucp
|
||||||
|
Capture group count = 0
|
||||||
|
Options: caseless ucp
|
||||||
|
First code unit = \xc1 (caseless)
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/[\x{120}\x{121}]/iB,ucp
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
/i \x{120}
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/[ab\x{120}]+/iB,ucp
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
[ABab\x{120}-\x{121}]++
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
aABb\x{121}\x{120}
|
||||||
|
0: aABb\x{121}\x{120}
|
||||||
|
|
||||||
|
#subject no_jit
|
||||||
|
|
||||||
|
/\x{c1}/i,no_start_optimize
|
||||||
|
\= Expect no match
|
||||||
|
\x{e1}
|
||||||
|
No match
|
||||||
|
|
||||||
|
/\x{120}\x{c1}/i,ucp,no_start_optimize
|
||||||
|
\x{121}\x{e1}
|
||||||
|
0: \x{121}\xe1
|
||||||
|
|
||||||
|
/\x{120}\x{c1}/i,ucp
|
||||||
|
\x{121}\x{e1}
|
||||||
|
0: \x{121}\xe1
|
||||||
|
|
||||||
|
/[^\x{120}]/i,no_start_optimize
|
||||||
|
\x{121}
|
||||||
|
0: \x{121}
|
||||||
|
|
||||||
|
/[^\x{120}]/i,ucp,no_start_optimize
|
||||||
|
\= Expect no match
|
||||||
|
\x{121}
|
||||||
|
No match
|
||||||
|
|
||||||
|
/[^\x{120}]/i
|
||||||
|
\x{121}
|
||||||
|
0: \x{121}
|
||||||
|
|
||||||
|
/[^\x{120}]/i,ucp
|
||||||
|
\= Expect no match
|
||||||
|
\x{121}
|
||||||
|
No match
|
||||||
|
|
||||||
|
/\x{120}{2}/i,ucp
|
||||||
|
\x{121}\x{121}
|
||||||
|
0: \x{121}\x{121}
|
||||||
|
|
||||||
|
/[^\x{120}]{2}/i,ucp
|
||||||
|
\= Expect no match
|
||||||
|
\x{121}\x{121}
|
||||||
|
No match
|
||||||
|
|
||||||
|
/\x{c1}+\x{e1}/iB,ucp
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
/i \x{c1}+
|
||||||
|
/i \x{e1}
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
\x{c1}\x{c1}\x{c1}
|
||||||
|
0: \xc1\xc1\xc1
|
||||||
|
|
||||||
|
/\x{c1}+\x{e1}/iIB,ucp
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
/i \x{c1}+
|
||||||
|
/i \x{e1}
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Capture group count = 0
|
||||||
|
Options: caseless ucp
|
||||||
|
First code unit = \xc1 (caseless)
|
||||||
|
Last code unit = \xe1 (caseless)
|
||||||
|
Subject length lower bound = 2
|
||||||
|
\x{c1}\x{c1}\x{c1}
|
||||||
|
0: \xc1\xc1\xc1
|
||||||
|
\x{e1}\x{e1}\x{e1}
|
||||||
|
0: \xe1\xe1\xe1
|
||||||
|
|
||||||
|
/a|\x{c1}/iI,ucp
|
||||||
|
Capture group count = 0
|
||||||
|
Options: caseless ucp
|
||||||
|
Starting code units: A a \xc1 \xe1
|
||||||
|
Subject length lower bound = 1
|
||||||
|
\x{e1}xxx
|
||||||
|
0: \xe1
|
||||||
|
|
||||||
|
/\x{c1}|\x{e1}/iI,ucp
|
||||||
|
Capture group count = 0
|
||||||
|
Options: caseless ucp
|
||||||
|
First code unit = \xc1 (caseless)
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
|
||||||
|
X\x{e1}Y
|
||||||
|
1: >\xc1<
|
||||||
|
|
||||||
|
/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
|
||||||
|
X\x{121}Y
|
||||||
|
1: >\x{120}<
|
||||||
|
|
||||||
|
#subject
|
||||||
|
|
||||||
|
# ----------------------------------------------------
|
||||||
|
|
||||||
# End of testinput12
|
# End of testinput12
|
||||||
|
|
|
@ -1,9 +1,12 @@
|
||||||
# These test special (mostly error) UTF features of DFA matching. They are a
|
# These test special UTF and UCP features of DFA matching. The output is
|
||||||
# selection of the more comprehensive tests that are run for non-DFA matching.
|
# different for the different widths.
|
||||||
# The output is different for the different widths.
|
|
||||||
|
|
||||||
#subject dfa
|
#subject dfa
|
||||||
|
|
||||||
|
# ----------------------------------------------------
|
||||||
|
# These are a selection of the more comprehensive tests that are run for
|
||||||
|
# non-DFA matching.
|
||||||
|
|
||||||
/X/utf
|
/X/utf
|
||||||
XX\x{d800}
|
XX\x{d800}
|
||||||
Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
|
Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
|
||||||
|
@ -57,5 +60,66 @@ No match
|
||||||
No match
|
No match
|
||||||
\xf7\x80\=ph
|
\xf7\x80\=ph
|
||||||
No match
|
No match
|
||||||
|
|
||||||
|
# ----------------------------------------------------
|
||||||
|
# UCP and casing tests - except for the first two, these will all fail in 8-bit
|
||||||
|
# mode because they are testing UCP without UTF and use characters > 255.
|
||||||
|
|
||||||
|
/\x{c1}/i,no_start_optimize
|
||||||
|
\= Expect no match
|
||||||
|
\x{e1}
|
||||||
|
No match
|
||||||
|
|
||||||
|
/\x{c1}+\x{e1}/iB,ucp
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
/i \x{c1}+
|
||||||
|
/i \x{e1}
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
\x{c1}\x{c1}\x{c1}
|
||||||
|
0: \xc1\xc1\xc1
|
||||||
|
1: \xc1\xc1
|
||||||
|
\x{e1}\x{e1}\x{e1}
|
||||||
|
0: \xe1\xe1\xe1
|
||||||
|
1: \xe1\xe1
|
||||||
|
|
||||||
|
/\x{120}\x{c1}/i,ucp,no_start_optimize
|
||||||
|
\x{121}\x{e1}
|
||||||
|
0: \x{121}\xe1
|
||||||
|
|
||||||
|
/\x{120}\x{c1}/i,ucp
|
||||||
|
\x{121}\x{e1}
|
||||||
|
0: \x{121}\xe1
|
||||||
|
|
||||||
|
/[^\x{120}]/i,no_start_optimize
|
||||||
|
\x{121}
|
||||||
|
0: \x{121}
|
||||||
|
|
||||||
|
/[^\x{120}]/i,ucp,no_start_optimize
|
||||||
|
\= Expect no match
|
||||||
|
\x{121}
|
||||||
|
No match
|
||||||
|
|
||||||
|
/[^\x{120}]/i
|
||||||
|
\x{121}
|
||||||
|
0: \x{121}
|
||||||
|
|
||||||
|
/[^\x{120}]/i,ucp
|
||||||
|
\= Expect no match
|
||||||
|
\x{121}
|
||||||
|
No match
|
||||||
|
|
||||||
|
/\x{120}{2}/i,ucp
|
||||||
|
\x{121}\x{121}
|
||||||
|
0: \x{121}\x{121}
|
||||||
|
|
||||||
|
/[^\x{120}]{2}/i,ucp
|
||||||
|
\= Expect no match
|
||||||
|
\x{121}\x{121}
|
||||||
|
No match
|
||||||
|
|
||||||
|
# ----------------------------------------------------
|
||||||
|
|
||||||
# End of testinput14
|
# End of testinput14
|
||||||
|
|
|
@ -1,9 +1,12 @@
|
||||||
# These test special (mostly error) UTF features of DFA matching. They are a
|
# These test special UTF and UCP features of DFA matching. The output is
|
||||||
# selection of the more comprehensive tests that are run for non-DFA matching.
|
# different for the different widths.
|
||||||
# The output is different for the different widths.
|
|
||||||
|
|
||||||
#subject dfa
|
#subject dfa
|
||||||
|
|
||||||
|
# ----------------------------------------------------
|
||||||
|
# These are a selection of the more comprehensive tests that are run for
|
||||||
|
# non-DFA matching.
|
||||||
|
|
||||||
/X/utf
|
/X/utf
|
||||||
XX\x{d800}
|
XX\x{d800}
|
||||||
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
|
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
|
||||||
|
@ -57,5 +60,66 @@ No match
|
||||||
No match
|
No match
|
||||||
\xf7\x80\=ph
|
\xf7\x80\=ph
|
||||||
No match
|
No match
|
||||||
|
|
||||||
|
# ----------------------------------------------------
|
||||||
|
# UCP and casing tests - except for the first two, these will all fail in 8-bit
|
||||||
|
# mode because they are testing UCP without UTF and use characters > 255.
|
||||||
|
|
||||||
|
/\x{c1}/i,no_start_optimize
|
||||||
|
\= Expect no match
|
||||||
|
\x{e1}
|
||||||
|
No match
|
||||||
|
|
||||||
|
/\x{c1}+\x{e1}/iB,ucp
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
/i \x{c1}+
|
||||||
|
/i \x{e1}
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
\x{c1}\x{c1}\x{c1}
|
||||||
|
0: \xc1\xc1\xc1
|
||||||
|
1: \xc1\xc1
|
||||||
|
\x{e1}\x{e1}\x{e1}
|
||||||
|
0: \xe1\xe1\xe1
|
||||||
|
1: \xe1\xe1
|
||||||
|
|
||||||
|
/\x{120}\x{c1}/i,ucp,no_start_optimize
|
||||||
|
\x{121}\x{e1}
|
||||||
|
0: \x{121}\xe1
|
||||||
|
|
||||||
|
/\x{120}\x{c1}/i,ucp
|
||||||
|
\x{121}\x{e1}
|
||||||
|
0: \x{121}\xe1
|
||||||
|
|
||||||
|
/[^\x{120}]/i,no_start_optimize
|
||||||
|
\x{121}
|
||||||
|
0: \x{121}
|
||||||
|
|
||||||
|
/[^\x{120}]/i,ucp,no_start_optimize
|
||||||
|
\= Expect no match
|
||||||
|
\x{121}
|
||||||
|
No match
|
||||||
|
|
||||||
|
/[^\x{120}]/i
|
||||||
|
\x{121}
|
||||||
|
0: \x{121}
|
||||||
|
|
||||||
|
/[^\x{120}]/i,ucp
|
||||||
|
\= Expect no match
|
||||||
|
\x{121}
|
||||||
|
No match
|
||||||
|
|
||||||
|
/\x{120}{2}/i,ucp
|
||||||
|
\x{121}\x{121}
|
||||||
|
0: \x{121}\x{121}
|
||||||
|
|
||||||
|
/[^\x{120}]{2}/i,ucp
|
||||||
|
\= Expect no match
|
||||||
|
\x{121}\x{121}
|
||||||
|
No match
|
||||||
|
|
||||||
|
# ----------------------------------------------------
|
||||||
|
|
||||||
# End of testinput14
|
# End of testinput14
|
||||||
|
|
|
@ -1,9 +1,12 @@
|
||||||
# These test special (mostly error) UTF features of DFA matching. They are a
|
# These test special UTF and UCP features of DFA matching. The output is
|
||||||
# selection of the more comprehensive tests that are run for non-DFA matching.
|
# different for the different widths.
|
||||||
# The output is different for the different widths.
|
|
||||||
|
|
||||||
#subject dfa
|
#subject dfa
|
||||||
|
|
||||||
|
# ----------------------------------------------------
|
||||||
|
# These are a selection of the more comprehensive tests that are run for
|
||||||
|
# non-DFA matching.
|
||||||
|
|
||||||
/X/utf
|
/X/utf
|
||||||
XX\x{d800}
|
XX\x{d800}
|
||||||
Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
|
Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
|
||||||
|
@ -57,5 +60,66 @@ Failed: error -3: UTF-8 error: 1 byte missing at end at offset 2
|
||||||
Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 0
|
Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 0
|
||||||
\xf7\x80\=ph
|
\xf7\x80\=ph
|
||||||
Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 0
|
Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 0
|
||||||
|
|
||||||
|
# ----------------------------------------------------
|
||||||
|
# UCP and casing tests - except for the first two, these will all fail in 8-bit
|
||||||
|
# mode because they are testing UCP without UTF and use characters > 255.
|
||||||
|
|
||||||
|
/\x{c1}/i,no_start_optimize
|
||||||
|
\= Expect no match
|
||||||
|
\x{e1}
|
||||||
|
No match
|
||||||
|
|
||||||
|
/\x{c1}+\x{e1}/iB,ucp
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
/i \x{c1}+
|
||||||
|
/i \x{e1}
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
\x{c1}\x{c1}\x{c1}
|
||||||
|
0: \xc1\xc1\xc1
|
||||||
|
1: \xc1\xc1
|
||||||
|
\x{e1}\x{e1}\x{e1}
|
||||||
|
0: \xe1\xe1\xe1
|
||||||
|
1: \xe1\xe1
|
||||||
|
|
||||||
|
/\x{120}\x{c1}/i,ucp,no_start_optimize
|
||||||
|
Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large
|
||||||
|
\x{121}\x{e1}
|
||||||
|
|
||||||
|
/\x{120}\x{c1}/i,ucp
|
||||||
|
Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large
|
||||||
|
\x{121}\x{e1}
|
||||||
|
|
||||||
|
/[^\x{120}]/i,no_start_optimize
|
||||||
|
Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
|
||||||
|
\x{121}
|
||||||
|
|
||||||
|
/[^\x{120}]/i,ucp,no_start_optimize
|
||||||
|
Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
|
||||||
|
\= Expect no match
|
||||||
|
\x{121}
|
||||||
|
|
||||||
|
/[^\x{120}]/i
|
||||||
|
Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
|
||||||
|
\x{121}
|
||||||
|
|
||||||
|
/[^\x{120}]/i,ucp
|
||||||
|
Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
|
||||||
|
\= Expect no match
|
||||||
|
\x{121}
|
||||||
|
|
||||||
|
/\x{120}{2}/i,ucp
|
||||||
|
Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large
|
||||||
|
\x{121}\x{121}
|
||||||
|
|
||||||
|
/[^\x{120}]{2}/i,ucp
|
||||||
|
Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
|
||||||
|
\= Expect no match
|
||||||
|
\x{121}\x{121}
|
||||||
|
|
||||||
|
# ----------------------------------------------------
|
||||||
|
|
||||||
# End of testinput14
|
# End of testinput14
|
||||||
|
|
Loading…
Reference in New Issue