Unicode upper/lower casing is now used when UCP is set, even if UTF is not set.

This is not yet documented, and it not yet implemented in JIT.
This commit is contained in:
Philip.Hazel 2020-02-23 16:40:05 +00:00
parent d0666136c9
commit 4a7dfab0ec
18 changed files with 893 additions and 125 deletions

View File

@ -66,6 +66,11 @@ recurse function in JIT.
17. Fix a crash which occurs when the character type of an invalid UTF
character is decoded in JIT.
18. Changes in many areas of the code so that when Unicode is supported and
PCRE2_UCP is set without PCRE2_UTF, Unicode character properties are used for
upper/lower case computations on characters whose code points are greater than
127. Documentation is not yet updated. JIT is not yet updated.
Version 10.34 21-November-2019
------------------------------

View File

@ -28,8 +28,6 @@
# The -v option causes a call to 'pcre2test -C' to happen for each
# configuration.
# Currently -fsanitize=undefined is not working (locks machine).
useasan=1
useusan=1
usedebug=1

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2019 University of Cambridge
New API code Copyright (c) 2016-2020 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -292,6 +292,7 @@ possessification, and if so, fills a list with its properties.
Arguments:
code points to start of expression
utf TRUE if in UTF mode
ucp TRUE if in UCP mode
fcc points to the case-flipping table
list points to output list
list[0] will be filled with the opcode
@ -304,7 +305,7 @@ Returns: points to the start of the next opcode if *code is accepted
*/
static PCRE2_SPTR
get_chr_property_list(PCRE2_SPTR code, BOOL utf, const uint8_t *fcc,
get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc,
uint32_t *list)
{
PCRE2_UCHAR c = *code;
@ -316,7 +317,8 @@ uint32_t chr;
uint32_t *clist_dest;
const uint32_t *clist_src;
#else
(void)utf; /* Suppress "unused parameter" compiler warning */
(void)utf; /* Suppress "unused parameter" compiler warnings */
(void)ucp;
#endif
list[0] = c;
@ -396,7 +398,7 @@ switch(c)
list[2] = chr;
#ifdef SUPPORT_UNICODE
if (chr < 128 || (chr < 256 && !utf))
if (chr < 128 || (chr < 256 && !utf && !ucp))
list[3] = fcc[chr];
else
list[3] = UCD_OTHERCASE(chr);
@ -503,6 +505,7 @@ which case the base cannot be possessified.
Arguments:
code points to the byte code
utf TRUE in UTF mode
ucp TRUE in UCP mode
cb compile data block
base_list the data list of the base opcode
base_end the end of the base opcode
@ -512,7 +515,7 @@ Returns: TRUE if the auto-possessification is possible
*/
static BOOL
compare_opcodes(PCRE2_SPTR code, BOOL utf, const compile_block *cb,
compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb,
const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)
{
PCRE2_UCHAR c;
@ -651,7 +654,7 @@ for(;;)
while (*next_code == OP_ALT)
{
if (!compare_opcodes(code, utf, cb, base_list, base_end, rec_limit))
if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit))
return FALSE;
code = next_code + 1 + LINK_SIZE;
next_code += GET(next_code, 1);
@ -672,7 +675,8 @@ for(;;)
/* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */
next_code += 1 + LINK_SIZE;
if (!compare_opcodes(next_code, utf, cb, base_list, base_end, rec_limit))
if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end,
rec_limit))
return FALSE;
code += PRIV(OP_lengths)[c];
@ -688,7 +692,7 @@ for(;;)
/* We now have the next appropriate opcode to compare with the base. Check
for a supported opcode, and load its properties. */
code = get_chr_property_list(code, utf, cb->fcc, list);
code = get_chr_property_list(code, utf, ucp, cb->fcc, list);
if (code == NULL) return FALSE; /* Unsupported */
/* If either opcode is a small character list, set pointers for comparing
@ -1100,7 +1104,6 @@ leaving the remainder of the pattern unpossessified.
Arguments:
code points to start of the byte code
utf TRUE in UTF mode
cb compile data block
Returns: 0 for success
@ -1108,13 +1111,15 @@ Returns: 0 for success
*/
int
PRIV(auto_possessify)(PCRE2_UCHAR *code, BOOL utf, const compile_block *cb)
PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb)
{
PCRE2_UCHAR c;
PCRE2_SPTR end;
PCRE2_UCHAR *repeat_opcode;
uint32_t list[8];
int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */
BOOL utf = (cb->external_options & PCRE2_UTF) != 0;
BOOL ucp = (cb->external_options & PCRE2_UCP) != 0;
for (;;)
{
@ -1126,10 +1131,11 @@ for (;;)
{
c -= get_repeat_base(c) - OP_STAR;
end = (c <= OP_MINUPTO) ?
get_chr_property_list(code, utf, cb->fcc, list) : NULL;
get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL;
list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
if (end != NULL && compare_opcodes(end, utf, cb, list, end, &rec_limit))
if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end,
&rec_limit))
{
switch(c)
{
@ -1181,11 +1187,11 @@ for (;;)
if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
{
/* end must not be NULL. */
end = get_chr_property_list(code, utf, cb->fcc, list);
end = get_chr_property_list(code, utf, ucp, cb->fcc, list);
list[1] = (c & 1) == 0;
if (compare_opcodes(end, utf, cb, list, end, &rec_limit))
if (compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit))
{
switch (c)
{

View File

@ -4904,7 +4904,7 @@ range. */
if ((options & PCRE2_CASELESS) != 0)
{
#ifdef SUPPORT_UNICODE
if ((options & PCRE2_UTF) != 0)
if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
{
int rc;
uint32_t oc, od;
@ -5319,7 +5319,8 @@ dynamically as we process the pattern. */
#ifdef SUPPORT_UNICODE
BOOL utf = (options & PCRE2_UTF) != 0;
#else /* No UTF support */
BOOL ucp = (options & PCRE2_UCP) != 0;
#else /* No Unicode support */
BOOL utf = FALSE;
#endif
@ -5602,7 +5603,7 @@ for (;; pptr++)
uint32_t d;
#ifdef SUPPORT_UNICODE
if (utf && c > 127) d = UCD_OTHERCASE(c); else
if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
#endif
{
#if PCRE2_CODE_UNIT_WIDTH != 8
@ -9632,6 +9633,7 @@ pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
{
BOOL utf; /* Set TRUE for UTF mode */
BOOL ucp; /* Set TRUE for UCP mode */
BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */
BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */
pcre2_real_code *re = NULL; /* What we will return */
@ -9919,8 +9921,8 @@ if (utf)
/* Check UCP lockout. */
if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) ==
(PCRE2_UCP|PCRE2_NEVER_UCP))
ucp = (cb.external_options & PCRE2_UCP) != 0;
if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
{
errorcode = ERR75;
goto HAD_EARLY_ERROR;
@ -10296,7 +10298,7 @@ function call. */
if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
{
PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80;
if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
}
/* Failed to compile, or error while post-processing. */
@ -10344,21 +10346,25 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
if ((firstcuflags & REQ_CASELESS) != 0)
{
if (firstcu < 128 || (!utf && firstcu < 255))
if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
{
if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
}
/* The first code unit is > 128 in UTF mode, or > 255 otherwise. In
8-bit UTF mode, codepoints in the range 128-255 are introductory code
points and cannot have another case. In 16-bit and 32-bit modes, we can
check wide characters when UTF (and therefore UCP) is supported. */
/* The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise.
In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
points and cannot have another case, but if UCP is set they may do. */
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
else if (firstcu <= MAX_UTF_CODE_POINT &&
#ifdef SUPPORT_UNICODE
#if PCRE2_CODE_UNIT_WIDTH == 8
else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
re->flags |= PCRE2_FIRSTCASELESS;
#else
else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
UCD_OTHERCASE(firstcu) != firstcu)
re->flags |= PCRE2_FIRSTCASELESS;
#endif
#endif /* SUPPORT_UNICODE */
}
}
@ -10407,14 +10413,20 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
if ((reqcuflags & REQ_CASELESS) != 0)
{
if (reqcu < 128 || (!utf && reqcu < 255))
if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
{
if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
}
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
re->flags |= PCRE2_LASTCASELESS;
#ifdef SUPPORT_UNICODE
#if PCRE2_CODE_UNIT_WIDTH == 8
else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
re->flags |= PCRE2_LASTCASELESS;
#else
else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
UCD_OTHERCASE(reqcu) != reqcu)
re->flags |= PCRE2_LASTCASELESS;
#endif
#endif /* SUPPORT_UNICODE */
}
}
}

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2019 University of Cambridge
New API code Copyright (c) 2016-2020 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -548,6 +548,7 @@ PCRE2_SPTR start_code = mb->start_code;
#ifdef SUPPORT_UNICODE
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;
#else
BOOL utf = FALSE;
#endif
@ -2190,7 +2191,7 @@ for (;;)
if (clen == 0) break;
#ifdef SUPPORT_UNICODE
if (utf)
if (utf_or_ucp)
{
if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
{
@ -2204,7 +2205,7 @@ for (;;)
}
else
#endif /* SUPPORT_UNICODE */
/* Not UTF mode */
/* Not UTF or UCP mode */
{
if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
{ ADD_NEW(state_offset + 2, 0); }
@ -2339,7 +2340,7 @@ for (;;)
{
uint32_t otherd;
#ifdef SUPPORT_UNICODE
if (utf && d >= 128)
if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@ -2374,7 +2375,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UNICODE
if (utf && d >= 128)
if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@ -2417,7 +2418,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UNICODE
if (utf && d >= 128)
if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@ -2458,7 +2459,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UNICODE
if (utf && d >= 128)
if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@ -2491,7 +2492,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UNICODE
if (utf && d >= 128)
if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@ -2531,7 +2532,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UNICODE
if (utf && d >= 128)
if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@ -3526,10 +3527,15 @@ if ((re->flags & PCRE2_FIRSTSET) != 0)
if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
{
first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
if (utf && first_cu > 127)
#ifdef SUPPORT_UNICODE
#if PCRE2_CODE_UNIT_WIDTH == 8
if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
#endif
#else
if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
#endif
#endif /* SUPPORT_UNICODE */
}
}
else
@ -3545,9 +3551,15 @@ if ((re->flags & PCRE2_LASTSET) != 0)
if ((re->flags & PCRE2_LASTCASELESS) != 0)
{
req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
#ifdef SUPPORT_UNICODE
#if PCRE2_CODE_UNIT_WIDTH == 8
if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
#else
if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
#endif
#endif /* SUPPORT_UNICODE */
}
}

View File

@ -1952,7 +1952,7 @@ is available. */
#define _pcre2_was_newline PCRE2_SUFFIX(_pcre2_was_newline_)
#define _pcre2_xclass PCRE2_SUFFIX(_pcre2_xclass_)
extern int _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL,
extern int _pcre2_auto_possessify(PCRE2_UCHAR *,
const compile_block *);
extern int _pcre2_check_escape(PCRE2_SPTR *, PCRE2_SPTR, uint32_t *,
int *, uint32_t, uint32_t, BOOL, compile_block *);

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2015-2019 University of Cambridge
New API code Copyright (c) 2015-2020 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -598,12 +598,13 @@ BOOL condition; /* Used in conditional groups */
BOOL cur_is_word; /* Used in "word" tests */
BOOL prev_is_word; /* Used in "word" tests */
/* UTF flag */
/* UTF and UCP flags */
#ifdef SUPPORT_UNICODE
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
BOOL ucp = (mb->poptions & PCRE2_UCP) != 0;
#else
BOOL utf = FALSE;
BOOL utf = FALSE; /* Required for convenience even when no Unicode support */
#endif
/* This is the length of the last part of a backtracking frame that must be
@ -928,6 +929,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
}
else
#endif
/* Not UTF mode */
{
if (mb->end_subject - Feptr < 1)
@ -987,10 +989,30 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
}
}
/* If UCP is set without UTF we must do the same as above, but with one
character per code unit. */
else if (ucp)
{
uint32_t cc = UCHAR21(Feptr);
fc = Fecode[1];
if (fc < 128)
{
if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
}
else
{
if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
}
Feptr++;
Fecode += 2;
}
else
#endif /* SUPPORT_UNICODE */
/* Not UTF mode; use the table for characters < 256. */
/* Not UTF or UCP mode; use the table for characters < 256. */
{
if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1])
!= TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH);
@ -1010,6 +1032,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
#ifdef SUPPORT_UNICODE
if (utf)
{
@ -1026,15 +1049,42 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (ch > 127)
ch = UCD_OTHERCASE(ch);
else
ch = TABLE_GET(ch, mb->fcc, ch);
ch = (mb->fcc)[ch];
if (ch == fc) RRETURN(MATCH_NOMATCH);
}
}
/* UCP without UTF is as above, but with one character per code unit. */
else if (ucp)
{
uint32_t ch;
fc = UCHAR21INC(Feptr);
ch = Fecode[1];
Fecode += 2;
if (ch == fc)
{
RRETURN(MATCH_NOMATCH); /* Caseful match */
}
else if (Fop == OP_NOTI) /* If caseless */
{
if (ch > 127)
ch = UCD_OTHERCASE(ch);
else
ch = (mb->fcc)[ch];
if (ch == fc) RRETURN(MATCH_NOMATCH);
}
}
else
#endif /* SUPPORT_UNICODE */
/* Neither UTF nor UCP is set */
{
uint32_t ch = Fecode[1];
fc = *Feptr++;
fc = UCHAR21INC(Feptr);
if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc))
RRETURN(MATCH_NOMATCH);
Fecode += 2;
@ -1244,7 +1294,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
#endif /* SUPPORT_UNICODE */
/* When not in UTF mode, load a single-code-unit character. Then proceed as
above. */
above, using Unicode casing if either UTF or UCP is set. */
Lc = *Fecode++;
@ -1253,11 +1303,15 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (Fop >= OP_STARI)
{
#if PCRE2_CODE_UNIT_WIDTH == 8
/* Lc must be < 128 in UTF-8 mode. */
#ifdef SUPPORT_UNICODE
if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
else
#endif /* SUPPORT_UNICODE */
/* Lc will be < 128 in UTF-8 mode. */
Loc = mb->fcc[Lc];
#else /* 16-bit & 32-bit */
#ifdef SUPPORT_UNICODE
if (utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc);
else
#endif /* SUPPORT_UNICODE */
Loc = TABLE_GET(Lc, mb->fcc, Lc);
@ -1490,7 +1544,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (Fop >= OP_NOTSTARI) /* Caseless */
{
#ifdef SUPPORT_UNICODE
if (utf && Lc > 127)
if ((utf || ucp) && Lc > 127)
Loc = UCD_OTHERCASE(Lc);
else
#endif /* SUPPORT_UNICODE */
@ -6045,7 +6099,6 @@ BOOL firstline;
BOOL has_first_cu = FALSE;
BOOL has_req_cu = FALSE;
BOOL startline;
BOOL utf;
#if PCRE2_CODE_UNIT_WIDTH == 8
BOOL memchr_not_found_first_cu = FALSE;
@ -6069,13 +6122,19 @@ PCRE2_SPTR match_partial;
BOOL use_jit;
#endif
/* This flag is needed even when Unicode is not supported for convenience
(it is used by the IS_NEWLINE macro). */
BOOL utf = FALSE;
#ifdef SUPPORT_UNICODE
BOOL ucp = FALSE;
BOOL allow_invalid;
uint32_t fragment_options = 0;
#ifdef SUPPORT_JIT
BOOL jit_checked_utf = FALSE;
#endif
#endif
#endif /* SUPPORT_UNICODE */
PCRE2_SIZE frame_size;
@ -6147,12 +6206,13 @@ use_jit = (re->executable_jit != NULL &&
(options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0);
#endif
/* Initialize UTF parameters. */
/* Initialize UTF/UCP parameters. */
utf = (re->overall_options & PCRE2_UTF) != 0;
#ifdef SUPPORT_UNICODE
utf = (re->overall_options & PCRE2_UTF) != 0;
allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0;
#endif
ucp = (re->overall_options & PCRE2_UCP) != 0;
#endif /* SUPPORT_UNICODE */
/* Convert the partial matching flags into an integer. */
@ -6589,9 +6649,13 @@ if ((re->flags & PCRE2_FIRSTSET) != 0)
if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
{
first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
#ifdef SUPPORT_UNICODE
#if PCRE2_CODE_UNIT_WIDTH == 8
if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu);
#else
if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu);
#endif
#endif /* SUPPORT_UNICODE */
}
}
else
@ -6607,9 +6671,13 @@ if ((re->flags & PCRE2_LASTSET) != 0)
if ((re->flags & PCRE2_LASTCASELESS) != 0)
{
req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu);
#ifdef SUPPORT_UNICODE
#if PCRE2_CODE_UNIT_WIDTH == 8
if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu);
#else
if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu);
#endif
#endif /* SUPPORT_UNICODE */
}
}
@ -6756,15 +6824,16 @@ for(;;)
#endif
}
/* If we can't find the required code unit, having reached the true end
of the subject, break the bumpalong loop, to force a match failure,
except when doing partial matching, when we let the next cycle run at
the end of the subject. To see why, consider the pattern /(?<=abc)def/,
which partially matches "abc", even though the string does not contain
the starting character "d". If we have not reached the true end of the
subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
we also let the cycle run, because the matching string is legitimately
allowed to start with the first code unit of a newline. */
/* If we can't find the required first code unit, having reached the
true end of the subject, break the bumpalong loop, to force a match
failure, except when doing partial matching, when we let the next cycle
run at the end of the subject. To see why, consider the pattern
/(?<=abc)def/, which partially matches "abc", even though the string
does not contain the starting character "d". If we have not reached the
true end of the subject (PCRE2_FIRSTLINE caused end_subject to be
temporarily modified) we also let the cycle run, because the matching
string is legitimately allowed to start with the first code unit of a
newline. */
if (mb->partial == 0 && start_match >= mb->end_subject)
{

View File

@ -772,15 +772,19 @@ Arguments:
p points to the first code unit of the character
caseless TRUE if caseless
utf TRUE for UTF mode
ucp TRUE for UCP mode
Returns: pointer after the character
*/
static PCRE2_SPTR
set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf)
set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf,
BOOL ucp)
{
uint32_t c = *p++; /* First code unit */
(void)utf; /* Stop compiler warning when UTF not supported */
(void)utf; /* Stop compiler warnings when UTF not supported */
(void)ucp;
/* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
0xff. */
@ -810,22 +814,26 @@ if (utf)
if (caseless)
{
#ifdef SUPPORT_UNICODE
if (utf)
if (utf || ucp)
{
c = UCD_OTHERCASE(c);
#if PCRE2_CODE_UNIT_WIDTH == 8
PCRE2_UCHAR buff[6];
c = UCD_OTHERCASE(c);
(void)PRIV(ord2utf)(c, buff);
SET_BIT(buff[0]);
if (utf)
{
PCRE2_UCHAR buff[6];
(void)PRIV(ord2utf)(c, buff);
SET_BIT(buff[0]);
}
else SET_BIT(c);
#else /* 16-bit or 32-bit mode */
c = UCD_OTHERCASE(c);
if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
#endif
}
else
#endif /* SUPPORT_UNICODE */
/* Not UTF */
/* Not UTF or UCP */
if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
}
@ -931,6 +939,7 @@ Arguments:
re points to the compiled regex block
code points to an expression
utf TRUE if in UTF mode
ucp TRUE if in UCP mode
depthptr pointer to recurse depth
Returns: SSB_FAIL => Failed to find any starting code units
@ -941,7 +950,8 @@ Returns: SSB_FAIL => Failed to find any starting code units
*/
static int
set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, int *depthptr)
set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
int *depthptr)
{
uint32_t c;
int yield = SSB_DONE;
@ -1111,7 +1121,7 @@ do
case OP_SCRIPT_RUN:
case OP_ASSERT:
case OP_ASSERT_NA:
rc = set_start_bits(re, tcode, utf, depthptr);
rc = set_start_bits(re, tcode, utf, ucp, depthptr);
if (rc == SSB_DONE)
{
try_next = FALSE;
@ -1167,7 +1177,7 @@ do
case OP_BRAZERO:
case OP_BRAMINZERO:
case OP_BRAPOSZERO:
rc = set_start_bits(re, ++tcode, utf, depthptr);
rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
do tcode += GET(tcode,1); while (*tcode == OP_ALT);
tcode += 1 + LINK_SIZE;
@ -1189,7 +1199,7 @@ do
case OP_QUERY:
case OP_MINQUERY:
case OP_POSQUERY:
tcode = set_table_bit(re, tcode + 1, FALSE, utf);
tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
break;
case OP_STARI:
@ -1198,7 +1208,7 @@ do
case OP_QUERYI:
case OP_MINQUERYI:
case OP_POSQUERYI:
tcode = set_table_bit(re, tcode + 1, TRUE, utf);
tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
break;
/* Single-char upto sets the bit and tries the next */
@ -1206,13 +1216,13 @@ do
case OP_UPTO:
case OP_MINUPTO:
case OP_POSUPTO:
tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf);
tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
break;
case OP_UPTOI:
case OP_MINUPTOI:
case OP_POSUPTOI:
tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf);
tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
break;
/* At least one single char sets the bit and stops */
@ -1224,7 +1234,7 @@ do
case OP_PLUS:
case OP_MINPLUS:
case OP_POSPLUS:
(void)set_table_bit(re, tcode + 1, FALSE, utf);
(void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
try_next = FALSE;
break;
@ -1235,7 +1245,7 @@ do
case OP_PLUSI:
case OP_MINPLUSI:
case OP_POSPLUSI:
(void)set_table_bit(re, tcode + 1, TRUE, utf);
(void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
try_next = FALSE;
break;
@ -1664,6 +1674,7 @@ PRIV(study)(pcre2_real_code *re)
int count = 0;
PCRE2_UCHAR *code;
BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;
/* Find start of compiled code */
@ -1677,7 +1688,7 @@ code units. */
if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
{
int depth = 0;
int rc = set_start_bits(re, code, utf, &depth);
int rc = set_start_bits(re, code, utf, ucp, &depth);
if (rc == SSB_UNKNOWN) return 1;
/* If a list of starting code units was set up, scan the list to see if only
@ -1695,7 +1706,7 @@ if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
int b = -1;
uint8_t *p = re->start_bitmap;
uint32_t flags = PCRE2_FIRSTMAPSET;
for (i = 0; i < 256; p++, i += 8)
{
uint8_t x = *p;
@ -1725,27 +1736,27 @@ if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
}
/* c contains the code unit value, in the range 0-255. In 8-bit UTF
mode, only values < 128 can be used. */
mode, only values < 128 can be used. In all the other cases, c is a
character value. */
#if PCRE2_CODE_UNIT_WIDTH == 8
if (c > 127) goto DONE;
if (utf && c > 127) goto DONE;
#endif
if (a < 0) a = c; /* First one found */
if (a < 0) a = c; /* First one found, save in a */
else if (b < 0) /* Second one found */
{
int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
#ifdef SUPPORT_UNICODE
#if PCRE2_CODE_UNIT_WIDTH == 8
if (utf && UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
#else /* 16-bit or 32-bit */
if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
if (utf && c > 127) d = UCD_OTHERCASE(c);
#endif /* Code width */
if (utf || ucp)
{
if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
if (c > 127) d = UCD_OTHERCASE(c);
}
#endif /* SUPPORT_UNICODE */
if (d != a) goto DONE; /* Not other case of a */
b = c;
if (d != a) goto DONE; /* Not the other case of a */
b = c; /* Save second in b */
}
else goto DONE; /* More than two characters found */
}

View File

@ -236,6 +236,7 @@ BOOL use_existing_match;
BOOL replacement_only;
#ifdef SUPPORT_UNICODE
BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
BOOL ucp = (code->overall_options & PCRE2_UCP) != 0;
#endif
PCRE2_UCHAR temp[6];
PCRE2_SPTR ptr;
@ -758,7 +759,7 @@ do
if (forcecase != 0)
{
#ifdef SUPPORT_UNICODE
if (utf)
if (utf || ucp)
{
uint32_t type = UCD_CHARTYPE(ch);
if (PRIV(ucp_gentype)[type] == ucp_L &&
@ -860,7 +861,7 @@ do
if (forcecase != 0)
{
#ifdef SUPPORT_UNICODE
if (utf)
if (utf || ucp)
{
uint32_t type = UCD_CHARTYPE(ch);
if (PRIV(ucp_gentype)[type] == ucp_L &&

29
testdata/testinput10 vendored
View File

@ -570,8 +570,10 @@
/[\xff\x{ffff}]/I,utf
/[\xff\x{ff}]/I,utf
abc\x{ff}def
/[\xff\x{ff}]/I
abc\x{ff}def
/[Ss]/I
@ -585,4 +587,31 @@
abc\x80\=startchar
abc\x80\=startchar,offset=3
#subject no_jit
/\x{c1}+\x{e1}/iIB,ucp
\x{c1}\x{c1}\x{c1}
\x{e1}\x{e1}\x{e1}
/a|\x{c1}/iI,ucp
\x{e1}xxx
/a|\x{c1}/iI,utf
\x{e1}xxx
/\x{c1}|\x{e1}/iI,ucp
/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
X\x{e1}Y
/X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended
X\x{c1}Y
# Without UTF or UCP characters > 127 have only one case in the default locale.
/X(\x{e1})Y/replace=>\U$1<,substitute_extended
X\x{e1}Y
#subject
# End of testinput10

67
testdata/testinput12 vendored
View File

@ -463,4 +463,71 @@
/(?:\x{ff}|\x{3000})/I,utf
# ----------------------------------------------------
# UCP and casing tests
/\x{120}/i,I
/\x{c1}/i,I,ucp
/[\x{120}\x{121}]/iB,ucp
/[ab\x{120}]+/iB,ucp
aABb\x{121}\x{120}
#subject no_jit
/\x{c1}/i,no_start_optimize
\= Expect no match
\x{e1}
/\x{120}\x{c1}/i,ucp,no_start_optimize
\x{121}\x{e1}
/\x{120}\x{c1}/i,ucp
\x{121}\x{e1}
/[^\x{120}]/i,no_start_optimize
\x{121}
/[^\x{120}]/i,ucp,no_start_optimize
\= Expect no match
\x{121}
/[^\x{120}]/i
\x{121}
/[^\x{120}]/i,ucp
\= Expect no match
\x{121}
/\x{120}{2}/i,ucp
\x{121}\x{121}
/[^\x{120}]{2}/i,ucp
\= Expect no match
\x{121}\x{121}
/\x{c1}+\x{e1}/iB,ucp
\x{c1}\x{c1}\x{c1}
/\x{c1}+\x{e1}/iIB,ucp
\x{c1}\x{c1}\x{c1}
\x{e1}\x{e1}\x{e1}
/a|\x{c1}/iI,ucp
\x{e1}xxx
/\x{c1}|\x{e1}/iI,ucp
/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
X\x{e1}Y
/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
X\x{121}Y
#subject
# ----------------------------------------------------
# End of testinput12

50
testdata/testinput14 vendored
View File

@ -1,9 +1,12 @@
# These test special (mostly error) UTF features of DFA matching. They are a
# selection of the more comprehensive tests that are run for non-DFA matching.
# The output is different for the different widths.
# These test special UTF and UCP features of DFA matching. The output is
# different for the different widths.
#subject dfa
# ----------------------------------------------------
# These are a selection of the more comprehensive tests that are run for
# non-DFA matching.
/X/utf
XX\x{d800}
XX\x{d800}\=offset=3
@ -33,5 +36,46 @@
XX\xef\x80\=ph
\xf7\=ph
\xf7\x80\=ph
# ----------------------------------------------------
# UCP and casing tests - except for the first two, these will all fail in 8-bit
# mode because they are testing UCP without UTF and use characters > 255.
/\x{c1}/i,no_start_optimize
\= Expect no match
\x{e1}
/\x{c1}+\x{e1}/iB,ucp
\x{c1}\x{c1}\x{c1}
\x{e1}\x{e1}\x{e1}
/\x{120}\x{c1}/i,ucp,no_start_optimize
\x{121}\x{e1}
/\x{120}\x{c1}/i,ucp
\x{121}\x{e1}
/[^\x{120}]/i,no_start_optimize
\x{121}
/[^\x{120}]/i,ucp,no_start_optimize
\= Expect no match
\x{121}
/[^\x{120}]/i
\x{121}
/[^\x{120}]/i,ucp
\= Expect no match
\x{121}
/\x{120}{2}/i,ucp
\x{121}\x{121}
/[^\x{120}]{2}/i,ucp
\= Expect no match
\x{121}\x{121}
# ----------------------------------------------------
# End of testinput14

64
testdata/testoutput10 vendored
View File

@ -1780,11 +1780,15 @@ Capture group count = 0
Options: utf
Starting code units: \xc3
Subject length lower bound = 1
abc\x{ff}def
0: \x{ff}
/[\xff\x{ff}]/I
Capture group count = 0
Starting code units: \xff
First code unit = \xff
Subject length lower bound = 1
abc\x{ff}def
0: \xff
/[Ss]/I
Capture group count = 0
@ -1813,4 +1817,62 @@ Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 3
abc\x80\=startchar,offset=3
Error -36 (bad UTF-8 offset)
#subject no_jit
/\x{c1}+\x{e1}/iIB,ucp
------------------------------------------------------------------
Bra
/i \x{c1}+
/i \x{e1}
Ket
End
------------------------------------------------------------------
Capture group count = 0
Options: caseless ucp
First code unit = \xc1 (caseless)
Last code unit = \xe1 (caseless)
Subject length lower bound = 2
\x{c1}\x{c1}\x{c1}
0: \xc1\xc1\xc1
\x{e1}\x{e1}\x{e1}
0: \xe1\xe1\xe1
/a|\x{c1}/iI,ucp
Capture group count = 0
Options: caseless ucp
Starting code units: A a \xc1 \xe1
Subject length lower bound = 1
\x{e1}xxx
0: \xe1
/a|\x{c1}/iI,utf
Capture group count = 0
Options: caseless utf
Starting code units: A a \xc3
Subject length lower bound = 1
\x{e1}xxx
0: \x{e1}
/\x{c1}|\x{e1}/iI,ucp
Capture group count = 0
Options: caseless ucp
First code unit = \xc1 (caseless)
Subject length lower bound = 1
/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
X\x{e1}Y
1: >\xc1<
/X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended
X\x{c1}Y
1: >\xe1<
# Without UTF or UCP characters > 127 have only one case in the default locale.
/X(\x{e1})Y/replace=>\U$1<,substitute_extended
X\x{e1}Y
1: >\xe1<
#subject
# End of testinput10

View File

@ -1613,7 +1613,7 @@ Subject length lower bound = 1
/[Ss]/I
Capture group count = 0
Starting code units: S s
First code unit = 'S' (caseless)
Subject length lower bound = 1
/[Ss]/I,utf
@ -1628,4 +1628,134 @@ Options: utf
Starting code units: \xff
Subject length lower bound = 1
# ----------------------------------------------------
# UCP and casing tests
/\x{120}/i,I
Capture group count = 0
Options: caseless
First code unit = \x{120}
Subject length lower bound = 1
/\x{c1}/i,I,ucp
Capture group count = 0
Options: caseless ucp
First code unit = \xc1 (caseless)
Subject length lower bound = 1
/[\x{120}\x{121}]/iB,ucp
------------------------------------------------------------------
Bra
/i \x{120}
Ket
End
------------------------------------------------------------------
/[ab\x{120}]+/iB,ucp
------------------------------------------------------------------
Bra
[ABab\x{120}-\x{121}]++
Ket
End
------------------------------------------------------------------
aABb\x{121}\x{120}
0: aABb\x{121}\x{120}
#subject no_jit
/\x{c1}/i,no_start_optimize
\= Expect no match
\x{e1}
No match
/\x{120}\x{c1}/i,ucp,no_start_optimize
\x{121}\x{e1}
0: \x{121}\xe1
/\x{120}\x{c1}/i,ucp
\x{121}\x{e1}
0: \x{121}\xe1
/[^\x{120}]/i,no_start_optimize
\x{121}
0: \x{121}
/[^\x{120}]/i,ucp,no_start_optimize
\= Expect no match
\x{121}
No match
/[^\x{120}]/i
\x{121}
0: \x{121}
/[^\x{120}]/i,ucp
\= Expect no match
\x{121}
No match
/\x{120}{2}/i,ucp
\x{121}\x{121}
0: \x{121}\x{121}
/[^\x{120}]{2}/i,ucp
\= Expect no match
\x{121}\x{121}
No match
/\x{c1}+\x{e1}/iB,ucp
------------------------------------------------------------------
Bra
/i \x{c1}+
/i \x{e1}
Ket
End
------------------------------------------------------------------
\x{c1}\x{c1}\x{c1}
0: \xc1\xc1\xc1
/\x{c1}+\x{e1}/iIB,ucp
------------------------------------------------------------------
Bra
/i \x{c1}+
/i \x{e1}
Ket
End
------------------------------------------------------------------
Capture group count = 0
Options: caseless ucp
First code unit = \xc1 (caseless)
Last code unit = \xe1 (caseless)
Subject length lower bound = 2
\x{c1}\x{c1}\x{c1}
0: \xc1\xc1\xc1
\x{e1}\x{e1}\x{e1}
0: \xe1\xe1\xe1
/a|\x{c1}/iI,ucp
Capture group count = 0
Options: caseless ucp
Starting code units: A a \xc1 \xe1
Subject length lower bound = 1
\x{e1}xxx
0: \xe1
/\x{c1}|\x{e1}/iI,ucp
Capture group count = 0
Options: caseless ucp
First code unit = \xc1 (caseless)
Subject length lower bound = 1
/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
X\x{e1}Y
1: >\xc1<
/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
X\x{121}Y
1: >\x{120}<
#subject
# ----------------------------------------------------
# End of testinput12

View File

@ -1611,7 +1611,7 @@ Subject length lower bound = 1
/[Ss]/I
Capture group count = 0
Starting code units: S s
First code unit = 'S' (caseless)
Subject length lower bound = 1
/[Ss]/I,utf
@ -1626,4 +1626,134 @@ Options: utf
Starting code units: \xff
Subject length lower bound = 1
# ----------------------------------------------------
# UCP and casing tests
/\x{120}/i,I
Capture group count = 0
Options: caseless
First code unit = \x{120}
Subject length lower bound = 1
/\x{c1}/i,I,ucp
Capture group count = 0
Options: caseless ucp
First code unit = \xc1 (caseless)
Subject length lower bound = 1
/[\x{120}\x{121}]/iB,ucp
------------------------------------------------------------------
Bra
/i \x{120}
Ket
End
------------------------------------------------------------------
/[ab\x{120}]+/iB,ucp
------------------------------------------------------------------
Bra
[ABab\x{120}-\x{121}]++
Ket
End
------------------------------------------------------------------
aABb\x{121}\x{120}
0: aABb\x{121}\x{120}
#subject no_jit
/\x{c1}/i,no_start_optimize
\= Expect no match
\x{e1}
No match
/\x{120}\x{c1}/i,ucp,no_start_optimize
\x{121}\x{e1}
0: \x{121}\xe1
/\x{120}\x{c1}/i,ucp
\x{121}\x{e1}
0: \x{121}\xe1
/[^\x{120}]/i,no_start_optimize
\x{121}
0: \x{121}
/[^\x{120}]/i,ucp,no_start_optimize
\= Expect no match
\x{121}
No match
/[^\x{120}]/i
\x{121}
0: \x{121}
/[^\x{120}]/i,ucp
\= Expect no match
\x{121}
No match
/\x{120}{2}/i,ucp
\x{121}\x{121}
0: \x{121}\x{121}
/[^\x{120}]{2}/i,ucp
\= Expect no match
\x{121}\x{121}
No match
/\x{c1}+\x{e1}/iB,ucp
------------------------------------------------------------------
Bra
/i \x{c1}+
/i \x{e1}
Ket
End
------------------------------------------------------------------
\x{c1}\x{c1}\x{c1}
0: \xc1\xc1\xc1
/\x{c1}+\x{e1}/iIB,ucp
------------------------------------------------------------------
Bra
/i \x{c1}+
/i \x{e1}
Ket
End
------------------------------------------------------------------
Capture group count = 0
Options: caseless ucp
First code unit = \xc1 (caseless)
Last code unit = \xe1 (caseless)
Subject length lower bound = 2
\x{c1}\x{c1}\x{c1}
0: \xc1\xc1\xc1
\x{e1}\x{e1}\x{e1}
0: \xe1\xe1\xe1
/a|\x{c1}/iI,ucp
Capture group count = 0
Options: caseless ucp
Starting code units: A a \xc1 \xe1
Subject length lower bound = 1
\x{e1}xxx
0: \xe1
/\x{c1}|\x{e1}/iI,ucp
Capture group count = 0
Options: caseless ucp
First code unit = \xc1 (caseless)
Subject length lower bound = 1
/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
X\x{e1}Y
1: >\xc1<
/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
X\x{121}Y
1: >\x{120}<
#subject
# ----------------------------------------------------
# End of testinput12

View File

@ -1,9 +1,12 @@
# These test special (mostly error) UTF features of DFA matching. They are a
# selection of the more comprehensive tests that are run for non-DFA matching.
# The output is different for the different widths.
# These test special UTF and UCP features of DFA matching. The output is
# different for the different widths.
#subject dfa
# ----------------------------------------------------
# These are a selection of the more comprehensive tests that are run for
# non-DFA matching.
/X/utf
XX\x{d800}
Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
@ -57,5 +60,66 @@ No match
No match
\xf7\x80\=ph
No match
# ----------------------------------------------------
# UCP and casing tests - except for the first two, these will all fail in 8-bit
# mode because they are testing UCP without UTF and use characters > 255.
/\x{c1}/i,no_start_optimize
\= Expect no match
\x{e1}
No match
/\x{c1}+\x{e1}/iB,ucp
------------------------------------------------------------------
Bra
/i \x{c1}+
/i \x{e1}
Ket
End
------------------------------------------------------------------
\x{c1}\x{c1}\x{c1}
0: \xc1\xc1\xc1
1: \xc1\xc1
\x{e1}\x{e1}\x{e1}
0: \xe1\xe1\xe1
1: \xe1\xe1
/\x{120}\x{c1}/i,ucp,no_start_optimize
\x{121}\x{e1}
0: \x{121}\xe1
/\x{120}\x{c1}/i,ucp
\x{121}\x{e1}
0: \x{121}\xe1
/[^\x{120}]/i,no_start_optimize
\x{121}
0: \x{121}
/[^\x{120}]/i,ucp,no_start_optimize
\= Expect no match
\x{121}
No match
/[^\x{120}]/i
\x{121}
0: \x{121}
/[^\x{120}]/i,ucp
\= Expect no match
\x{121}
No match
/\x{120}{2}/i,ucp
\x{121}\x{121}
0: \x{121}\x{121}
/[^\x{120}]{2}/i,ucp
\= Expect no match
\x{121}\x{121}
No match
# ----------------------------------------------------
# End of testinput14

View File

@ -1,9 +1,12 @@
# These test special (mostly error) UTF features of DFA matching. They are a
# selection of the more comprehensive tests that are run for non-DFA matching.
# The output is different for the different widths.
# These test special UTF and UCP features of DFA matching. The output is
# different for the different widths.
#subject dfa
# ----------------------------------------------------
# These are a selection of the more comprehensive tests that are run for
# non-DFA matching.
/X/utf
XX\x{d800}
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
@ -57,5 +60,66 @@ No match
No match
\xf7\x80\=ph
No match
# ----------------------------------------------------
# UCP and casing tests - except for the first two, these will all fail in 8-bit
# mode because they are testing UCP without UTF and use characters > 255.
/\x{c1}/i,no_start_optimize
\= Expect no match
\x{e1}
No match
/\x{c1}+\x{e1}/iB,ucp
------------------------------------------------------------------
Bra
/i \x{c1}+
/i \x{e1}
Ket
End
------------------------------------------------------------------
\x{c1}\x{c1}\x{c1}
0: \xc1\xc1\xc1
1: \xc1\xc1
\x{e1}\x{e1}\x{e1}
0: \xe1\xe1\xe1
1: \xe1\xe1
/\x{120}\x{c1}/i,ucp,no_start_optimize
\x{121}\x{e1}
0: \x{121}\xe1
/\x{120}\x{c1}/i,ucp
\x{121}\x{e1}
0: \x{121}\xe1
/[^\x{120}]/i,no_start_optimize
\x{121}
0: \x{121}
/[^\x{120}]/i,ucp,no_start_optimize
\= Expect no match
\x{121}
No match
/[^\x{120}]/i
\x{121}
0: \x{121}
/[^\x{120}]/i,ucp
\= Expect no match
\x{121}
No match
/\x{120}{2}/i,ucp
\x{121}\x{121}
0: \x{121}\x{121}
/[^\x{120}]{2}/i,ucp
\= Expect no match
\x{121}\x{121}
No match
# ----------------------------------------------------
# End of testinput14

View File

@ -1,9 +1,12 @@
# These test special (mostly error) UTF features of DFA matching. They are a
# selection of the more comprehensive tests that are run for non-DFA matching.
# The output is different for the different widths.
# These test special UTF and UCP features of DFA matching. The output is
# different for the different widths.
#subject dfa
# ----------------------------------------------------
# These are a selection of the more comprehensive tests that are run for
# non-DFA matching.
/X/utf
XX\x{d800}
Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
@ -57,5 +60,66 @@ Failed: error -3: UTF-8 error: 1 byte missing at end at offset 2
Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 0
\xf7\x80\=ph
Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 0
# ----------------------------------------------------
# UCP and casing tests - except for the first two, these will all fail in 8-bit
# mode because they are testing UCP without UTF and use characters > 255.
/\x{c1}/i,no_start_optimize
\= Expect no match
\x{e1}
No match
/\x{c1}+\x{e1}/iB,ucp
------------------------------------------------------------------
Bra
/i \x{c1}+
/i \x{e1}
Ket
End
------------------------------------------------------------------
\x{c1}\x{c1}\x{c1}
0: \xc1\xc1\xc1
1: \xc1\xc1
\x{e1}\x{e1}\x{e1}
0: \xe1\xe1\xe1
1: \xe1\xe1
/\x{120}\x{c1}/i,ucp,no_start_optimize
Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large
\x{121}\x{e1}
/\x{120}\x{c1}/i,ucp
Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large
\x{121}\x{e1}
/[^\x{120}]/i,no_start_optimize
Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
\x{121}
/[^\x{120}]/i,ucp,no_start_optimize
Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
\= Expect no match
\x{121}
/[^\x{120}]/i
Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
\x{121}
/[^\x{120}]/i,ucp
Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
\= Expect no match
\x{121}
/\x{120}{2}/i,ucp
Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large
\x{121}\x{121}
/[^\x{120}]{2}/i,ucp
Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
\= Expect no match
\x{121}\x{121}
# ----------------------------------------------------
# End of testinput14