diff --git a/ChangeLog b/ChangeLog index 6e75456..ab4a5e6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -66,6 +66,11 @@ recurse function in JIT. 17. Fix a crash which occurs when the character type of an invalid UTF character is decoded in JIT. +18. Changes in many areas of the code so that when Unicode is supported and +PCRE2_UCP is set without PCRE2_UTF, Unicode character properties are used for +upper/lower case computations on characters whose code points are greater than +127. Documentation is not yet updated. JIT is not yet updated. + Version 10.34 21-November-2019 ------------------------------ diff --git a/maint/ManyConfigTests b/maint/ManyConfigTests index 2d92dc5..d82613f 100755 --- a/maint/ManyConfigTests +++ b/maint/ManyConfigTests @@ -28,8 +28,6 @@ # The -v option causes a call to 'pcre2test -C' to happen for each # configuration. -# Currently -fsanitize=undefined is not working (locks machine). - useasan=1 useusan=1 usedebug=1 diff --git a/src/pcre2_auto_possess.c b/src/pcre2_auto_possess.c index 5b95b9b..6c4925f 100644 --- a/src/pcre2_auto_possess.c +++ b/src/pcre2_auto_possess.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2019 University of Cambridge + New API code Copyright (c) 2016-2020 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -292,6 +292,7 @@ possessification, and if so, fills a list with its properties. 
Arguments: code points to start of expression utf TRUE if in UTF mode + ucp TRUE if in UCP mode fcc points to the case-flipping table list points to output list list[0] will be filled with the opcode @@ -304,7 +305,7 @@ Returns: points to the start of the next opcode if *code is accepted */ static PCRE2_SPTR -get_chr_property_list(PCRE2_SPTR code, BOOL utf, const uint8_t *fcc, +get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc, uint32_t *list) { PCRE2_UCHAR c = *code; @@ -316,7 +317,8 @@ uint32_t chr; uint32_t *clist_dest; const uint32_t *clist_src; #else -(void)utf; /* Suppress "unused parameter" compiler warning */ +(void)utf; /* Suppress "unused parameter" compiler warnings */ +(void)ucp; #endif list[0] = c; @@ -396,7 +398,7 @@ switch(c) list[2] = chr; #ifdef SUPPORT_UNICODE - if (chr < 128 || (chr < 256 && !utf)) + if (chr < 128 || (chr < 256 && !utf && !ucp)) list[3] = fcc[chr]; else list[3] = UCD_OTHERCASE(chr); @@ -503,6 +505,7 @@ which case the base cannot be possessified. Arguments: code points to the byte code utf TRUE in UTF mode + ucp TRUE in UCP mode cb compile data block base_list the data list of the base opcode base_end the end of the base opcode @@ -512,7 +515,7 @@ Returns: TRUE if the auto-possessification is possible */ static BOOL -compare_opcodes(PCRE2_SPTR code, BOOL utf, const compile_block *cb, +compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb, const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit) { PCRE2_UCHAR c; @@ -651,7 +654,7 @@ for(;;) while (*next_code == OP_ALT) { - if (!compare_opcodes(code, utf, cb, base_list, base_end, rec_limit)) + if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit)) return FALSE; code = next_code + 1 + LINK_SIZE; next_code += GET(next_code, 1); @@ -672,7 +675,8 @@ for(;;) /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. 
*/ next_code += 1 + LINK_SIZE; - if (!compare_opcodes(next_code, utf, cb, base_list, base_end, rec_limit)) + if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end, + rec_limit)) return FALSE; code += PRIV(OP_lengths)[c]; @@ -688,7 +692,7 @@ for(;;) /* We now have the next appropriate opcode to compare with the base. Check for a supported opcode, and load its properties. */ - code = get_chr_property_list(code, utf, cb->fcc, list); + code = get_chr_property_list(code, utf, ucp, cb->fcc, list); if (code == NULL) return FALSE; /* Unsupported */ /* If either opcode is a small character list, set pointers for comparing @@ -1100,7 +1104,6 @@ leaving the remainder of the pattern unpossessified. Arguments: code points to start of the byte code - utf TRUE in UTF mode cb compile data block Returns: 0 for success @@ -1108,13 +1111,15 @@ Returns: 0 for success */ int -PRIV(auto_possessify)(PCRE2_UCHAR *code, BOOL utf, const compile_block *cb) +PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb) { PCRE2_UCHAR c; PCRE2_SPTR end; PCRE2_UCHAR *repeat_opcode; uint32_t list[8]; int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */ +BOOL utf = (cb->external_options & PCRE2_UTF) != 0; +BOOL ucp = (cb->external_options & PCRE2_UCP) != 0; for (;;) { @@ -1126,10 +1131,11 @@ for (;;) { c -= get_repeat_base(c) - OP_STAR; end = (c <= OP_MINUPTO) ? - get_chr_property_list(code, utf, cb->fcc, list) : NULL; + get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL; list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO; - if (end != NULL && compare_opcodes(end, utf, cb, list, end, &rec_limit)) + if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end, + &rec_limit)) { switch(c) { @@ -1181,11 +1187,11 @@ for (;;) if (c >= OP_CRSTAR && c <= OP_CRMINRANGE) { /* end must not be NULL. 
*/ - end = get_chr_property_list(code, utf, cb->fcc, list); + end = get_chr_property_list(code, utf, ucp, cb->fcc, list); list[1] = (c & 1) == 0; - if (compare_opcodes(end, utf, cb, list, end, &rec_limit)) + if (compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit)) { switch (c) { diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index aa4869f..515f2aa 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -4904,7 +4904,7 @@ range. */ if ((options & PCRE2_CASELESS) != 0) { #ifdef SUPPORT_UNICODE - if ((options & PCRE2_UTF) != 0) + if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0) { int rc; uint32_t oc, od; @@ -5319,7 +5319,8 @@ dynamically as we process the pattern. */ #ifdef SUPPORT_UNICODE BOOL utf = (options & PCRE2_UTF) != 0; -#else /* No UTF support */ +BOOL ucp = (options & PCRE2_UCP) != 0; +#else /* No Unicode support */ BOOL utf = FALSE; #endif @@ -5602,7 +5603,7 @@ for (;; pptr++) uint32_t d; #ifdef SUPPORT_UNICODE - if (utf && c > 127) d = UCD_OTHERCASE(c); else + if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else #endif { #if PCRE2_CODE_UNIT_WIDTH != 8 @@ -9632,6 +9633,7 @@ pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options, int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext) { BOOL utf; /* Set TRUE for UTF mode */ +BOOL ucp; /* Set TRUE for UCP mode */ BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */ BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */ pcre2_real_code *re = NULL; /* What we will return */ @@ -9919,8 +9921,8 @@ if (utf) /* Check UCP lockout. */ -if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) == - (PCRE2_UCP|PCRE2_NEVER_UCP)) +ucp = (cb.external_options & PCRE2_UCP) != 0; +if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0) { errorcode = ERR75; goto HAD_EARLY_ERROR; @@ -10296,7 +10298,7 @@ function call. 
*/ if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0) { PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart; - if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80; + if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80; } /* Failed to compile, or error while post-processing. */ @@ -10344,21 +10346,25 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) if ((firstcuflags & REQ_CASELESS) != 0) { - if (firstcu < 128 || (!utf && firstcu < 255)) + if (firstcu < 128 || (!utf && !ucp && firstcu < 255)) { if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS; } - /* The first code unit is > 128 in UTF mode, or > 255 otherwise. In - 8-bit UTF mode, codepoints in the range 128-255 are introductory code - points and cannot have another case. In 16-bit and 32-bit modes, we can - check wide characters when UTF (and therefore UCP) is supported. */ + /* The first code unit is >= 128 in UTF or UCP mode, or >= 255 otherwise. + In 8-bit UTF mode, codepoints in the range 128-255 are introductory code + points and cannot have another case, but if UCP is set without UTF they may do. 
*/ -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 - else if (firstcu <= MAX_UTF_CODE_POINT && +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 + else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu) + re->flags |= PCRE2_FIRSTCASELESS; +#else + else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(firstcu) != firstcu) re->flags |= PCRE2_FIRSTCASELESS; #endif +#endif /* SUPPORT_UNICODE */ } } @@ -10407,14 +10413,20 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) if ((reqcuflags & REQ_CASELESS) != 0) { - if (reqcu < 128 || (!utf && reqcu < 255)) + if (reqcu < 128 || (!utf && !ucp && reqcu < 255)) { if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS; } -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 - else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu) - re->flags |= PCRE2_LASTCASELESS; +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 + else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu) + re->flags |= PCRE2_LASTCASELESS; +#else + else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT && + UCD_OTHERCASE(reqcu) != reqcu) + re->flags |= PCRE2_LASTCASELESS; #endif +#endif /* SUPPORT_UNICODE */ } } } diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c index 7d8ffe8..b8bdd02 100644 --- a/src/pcre2_dfa_match.c +++ b/src/pcre2_dfa_match.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2019 University of Cambridge + New API code Copyright (c) 2016-2020 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -548,6 +548,7 @@ PCRE2_SPTR start_code = mb->start_code; #ifdef SUPPORT_UNICODE BOOL utf = (mb->poptions & PCRE2_UTF) != 0; +BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0; #else BOOL utf = FALSE; #endif @@ -2190,7 +2191,7 @@ for (;;) if (clen == 0) break; #ifdef SUPPORT_UNICODE - if (utf) + if (utf_or_ucp) { if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else { @@ -2204,7 +2205,7 @@ for (;;) } else #endif /* SUPPORT_UNICODE */ - /* Not UTF mode */ + /* Not UTF or UCP mode */ { if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d)) { ADD_NEW(state_offset + 2, 0); } @@ -2339,7 +2340,7 @@ for (;;) { uint32_t otherd; #ifdef SUPPORT_UNICODE - if (utf && d >= 128) + if (utf_or_ucp && d >= 128) otherd = UCD_OTHERCASE(d); else #endif /* SUPPORT_UNICODE */ @@ -2374,7 +2375,7 @@ for (;;) if (caseless) { #ifdef SUPPORT_UNICODE - if (utf && d >= 128) + if (utf_or_ucp && d >= 128) otherd = UCD_OTHERCASE(d); else #endif /* SUPPORT_UNICODE */ @@ -2417,7 +2418,7 @@ for (;;) if (caseless) { #ifdef SUPPORT_UNICODE - if (utf && d >= 128) + if (utf_or_ucp && d >= 128) otherd = UCD_OTHERCASE(d); else #endif /* SUPPORT_UNICODE */ @@ -2458,7 +2459,7 @@ for (;;) if (caseless) { #ifdef SUPPORT_UNICODE - if (utf && d >= 128) + if (utf_or_ucp && d >= 128) otherd = UCD_OTHERCASE(d); else #endif /* SUPPORT_UNICODE */ @@ -2491,7 +2492,7 @@ for (;;) if (caseless) { #ifdef SUPPORT_UNICODE - if (utf && d >= 128) + if (utf_or_ucp && d >= 128) otherd = UCD_OTHERCASE(d); else #endif /* SUPPORT_UNICODE */ @@ -2531,7 +2532,7 @@ for (;;) if (caseless) { #ifdef SUPPORT_UNICODE - if (utf && d >= 128) + if (utf_or_ucp && d >= 128) otherd 
= UCD_OTHERCASE(d); else #endif /* SUPPORT_UNICODE */ @@ -3526,10 +3527,15 @@ if ((re->flags & PCRE2_FIRSTSET) != 0) if ((re->flags & PCRE2_FIRSTCASELESS) != 0) { first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu); -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 - if (utf && first_cu > 127) +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0) first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu); -#endif +#else + if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0)) + first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu); +#endif +#endif /* SUPPORT_UNICODE */ } } else @@ -3545,9 +3551,15 @@ if ((re->flags & PCRE2_LASTSET) != 0) if ((re->flags & PCRE2_LASTCASELESS) != 0) { req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu); -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 - if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu); +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0) + req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu); +#else + if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0)) + req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu); #endif +#endif /* SUPPORT_UNICODE */ } } diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index ac96d2d..9963d6f 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -1952,7 +1952,7 @@ is available. 
*/ #define _pcre2_was_newline PCRE2_SUFFIX(_pcre2_was_newline_) #define _pcre2_xclass PCRE2_SUFFIX(_pcre2_xclass_) -extern int _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL, +extern int _pcre2_auto_possessify(PCRE2_UCHAR *, const compile_block *); extern int _pcre2_check_escape(PCRE2_SPTR *, PCRE2_SPTR, uint32_t *, int *, uint32_t, uint32_t, BOOL, compile_block *); diff --git a/src/pcre2_match.c b/src/pcre2_match.c index a3fccc1..77c98f5 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2015-2019 University of Cambridge + New API code Copyright (c) 2015-2020 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -598,12 +598,13 @@ BOOL condition; /* Used in conditional groups */ BOOL cur_is_word; /* Used in "word" tests */ BOOL prev_is_word; /* Used in "word" tests */ -/* UTF flag */ +/* UTF and UCP flags */ #ifdef SUPPORT_UNICODE BOOL utf = (mb->poptions & PCRE2_UTF) != 0; +BOOL ucp = (mb->poptions & PCRE2_UCP) != 0; #else -BOOL utf = FALSE; +BOOL utf = FALSE; /* Required for convenience even when no Unicode support */ #endif /* This is the length of the last part of a backtracking frame that must be @@ -928,6 +929,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); } else #endif + /* Not UTF mode */ { if (mb->end_subject - Feptr < 1) @@ -987,10 +989,30 @@ fprintf(stderr, "++ op=%d\n", *Fecode); if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH); } } + + /* If UCP is set without UTF we must do the same as above, but with one + character per code unit. 
*/ + + else if (ucp) + { + uint32_t cc = UCHAR21(Feptr); + fc = Fecode[1]; + if (fc < 128) + { + if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH); + } + else + { + if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH); + } + Feptr++; + Fecode += 2; + } + else #endif /* SUPPORT_UNICODE */ - /* Not UTF mode; use the table for characters < 256. */ + /* Not UTF or UCP mode; use the table for characters < 256. */ { if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1]) != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH); @@ -1010,6 +1032,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } + #ifdef SUPPORT_UNICODE if (utf) { @@ -1026,15 +1049,42 @@ fprintf(stderr, "++ op=%d\n", *Fecode); if (ch > 127) ch = UCD_OTHERCASE(ch); else - ch = TABLE_GET(ch, mb->fcc, ch); + ch = (mb->fcc)[ch]; if (ch == fc) RRETURN(MATCH_NOMATCH); } } + + /* UCP without UTF is as above, but with one character per code unit. */ + + else if (ucp) + { + uint32_t ch; + fc = UCHAR21INC(Feptr); + ch = Fecode[1]; + Fecode += 2; + + if (ch == fc) + { + RRETURN(MATCH_NOMATCH); /* Caseful match */ + } + else if (Fop == OP_NOTI) /* If caseless */ + { + if (ch > 127) + ch = UCD_OTHERCASE(ch); + else + ch = (mb->fcc)[ch]; + if (ch == fc) RRETURN(MATCH_NOMATCH); + } + } + else #endif /* SUPPORT_UNICODE */ + + /* Neither UTF nor UCP is set */ + { uint32_t ch = Fecode[1]; - fc = *Feptr++; + fc = UCHAR21INC(Feptr); if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc)) RRETURN(MATCH_NOMATCH); Fecode += 2; @@ -1244,7 +1294,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); #endif /* SUPPORT_UNICODE */ /* When not in UTF mode, load a single-code-unit character. Then proceed as - above. */ + above, using Unicode casing if either UTF or UCP is set. */ Lc = *Fecode++; @@ -1253,11 +1303,15 @@ fprintf(stderr, "++ op=%d\n", *Fecode); if (Fop >= OP_STARI) { #if PCRE2_CODE_UNIT_WIDTH == 8 - /* Lc must be < 128 in UTF-8 mode. 
*/ +#ifdef SUPPORT_UNICODE + if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc); + else +#endif /* SUPPORT_UNICODE */ + /* Lc will be < 128 in UTF-8 mode. */ Loc = mb->fcc[Lc]; #else /* 16-bit & 32-bit */ #ifdef SUPPORT_UNICODE - if (utf && Lc > 127) Loc = UCD_OTHERCASE(Lc); + if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc); else #endif /* SUPPORT_UNICODE */ Loc = TABLE_GET(Lc, mb->fcc, Lc); @@ -1490,7 +1544,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); if (Fop >= OP_NOTSTARI) /* Caseless */ { #ifdef SUPPORT_UNICODE - if (utf && Lc > 127) + if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc); else #endif /* SUPPORT_UNICODE */ @@ -6045,7 +6099,6 @@ BOOL firstline; BOOL has_first_cu = FALSE; BOOL has_req_cu = FALSE; BOOL startline; -BOOL utf; #if PCRE2_CODE_UNIT_WIDTH == 8 BOOL memchr_not_found_first_cu = FALSE; @@ -6069,13 +6122,19 @@ PCRE2_SPTR match_partial; BOOL use_jit; #endif +/* This flag is needed even when Unicode is not supported for convenience +(it is used by the IS_NEWLINE macro). */ + +BOOL utf = FALSE; + #ifdef SUPPORT_UNICODE +BOOL ucp = FALSE; BOOL allow_invalid; uint32_t fragment_options = 0; #ifdef SUPPORT_JIT BOOL jit_checked_utf = FALSE; #endif -#endif +#endif /* SUPPORT_UNICODE */ PCRE2_SIZE frame_size; @@ -6147,12 +6206,13 @@ use_jit = (re->executable_jit != NULL && (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0); #endif -/* Initialize UTF parameters. */ +/* Initialize UTF/UCP parameters. */ -utf = (re->overall_options & PCRE2_UTF) != 0; #ifdef SUPPORT_UNICODE +utf = (re->overall_options & PCRE2_UTF) != 0; allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0; -#endif +ucp = (re->overall_options & PCRE2_UCP) != 0; +#endif /* SUPPORT_UNICODE */ /* Convert the partial matching flags into an integer. 
*/ @@ -6589,9 +6649,13 @@ if ((re->flags & PCRE2_FIRSTSET) != 0) if ((re->flags & PCRE2_FIRSTCASELESS) != 0) { first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu); -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 - if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu); +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu); +#else + if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu); #endif +#endif /* SUPPORT_UNICODE */ } } else @@ -6607,9 +6671,13 @@ if ((re->flags & PCRE2_LASTSET) != 0) if ((re->flags & PCRE2_LASTCASELESS) != 0) { req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu); -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 - if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu); +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu); +#else + if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu); #endif +#endif /* SUPPORT_UNICODE */ } } @@ -6756,15 +6824,16 @@ for(;;) #endif } - /* If we can't find the required code unit, having reached the true end - of the subject, break the bumpalong loop, to force a match failure, - except when doing partial matching, when we let the next cycle run at - the end of the subject. To see why, consider the pattern /(?<=abc)def/, - which partially matches "abc", even though the string does not contain - the starting character "d". If we have not reached the true end of the - subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified) - we also let the cycle run, because the matching string is legitimately - allowed to start with the first code unit of a newline. */ + /* If we can't find the required first code unit, having reached the + true end of the subject, break the bumpalong loop, to force a match + failure, except when doing partial matching, when we let the next cycle + run at the end of the subject. 
To see why, consider the pattern + /(?<=abc)def/, which partially matches "abc", even though the string + does not contain the starting character "d". If we have not reached the + true end of the subject (PCRE2_FIRSTLINE caused end_subject to be + temporarily modified) we also let the cycle run, because the matching + string is legitimately allowed to start with the first code unit of a + newline. */ if (mb->partial == 0 && start_match >= mb->end_subject) { diff --git a/src/pcre2_study.c b/src/pcre2_study.c index 5af01b5..02d1c08 100644 --- a/src/pcre2_study.c +++ b/src/pcre2_study.c @@ -772,15 +772,19 @@ Arguments: p points to the first code unit of the character caseless TRUE if caseless utf TRUE for UTF mode + ucp TRUE for UCP mode Returns: pointer after the character */ static PCRE2_SPTR -set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf) +set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf, + BOOL ucp) { uint32_t c = *p++; /* First code unit */ -(void)utf; /* Stop compiler warning when UTF not supported */ + +(void)utf; /* Stop compiler warnings when UTF not supported */ +(void)ucp; /* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for 0xff. 
*/ @@ -810,22 +814,26 @@ if (utf) if (caseless) { #ifdef SUPPORT_UNICODE - if (utf) + if (utf || ucp) { + c = UCD_OTHERCASE(c); #if PCRE2_CODE_UNIT_WIDTH == 8 - PCRE2_UCHAR buff[6]; - c = UCD_OTHERCASE(c); - (void)PRIV(ord2utf)(c, buff); - SET_BIT(buff[0]); + if (utf) + { + PCRE2_UCHAR buff[6]; + (void)PRIV(ord2utf)(c, buff); + SET_BIT(buff[0]); + } + else if (c < 256) SET_BIT(c); /* UCP other case can exceed 255; keep within the 256-bit map */ #else /* 16-bit or 32-bit mode */ - c = UCD_OTHERCASE(c); if (c > 0xff) SET_BIT(0xff); else SET_BIT(c); #endif } + else #endif /* SUPPORT_UNICODE */ - /* Not UTF */ + /* Not UTF or UCP */ if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]); } @@ -931,6 +939,7 @@ Arguments: re points to the compiled regex block code points to an expression utf TRUE if in UTF mode + ucp TRUE if in UCP mode depthptr pointer to recurse depth Returns: SSB_FAIL => Failed to find any starting code units @@ -941,7 +950,8 @@ Returns: SSB_FAIL => Failed to find any starting code units */ static int -set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, int *depthptr) +set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp, + int *depthptr) { uint32_t c; int yield = SSB_DONE; @@ -1111,7 +1121,7 @@ do case OP_SCRIPT_RUN: case OP_ASSERT: case OP_ASSERT_NA: - rc = set_start_bits(re, tcode, utf, depthptr); + rc = set_start_bits(re, tcode, utf, ucp, depthptr); if (rc == SSB_DONE) { try_next = FALSE; @@ -1167,7 +1177,7 @@ do case OP_BRAZERO: case OP_BRAMINZERO: case OP_BRAPOSZERO: - rc = set_start_bits(re, ++tcode, utf, depthptr); + rc = set_start_bits(re, ++tcode, utf, ucp, depthptr); if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc; do tcode += GET(tcode,1); while (*tcode == OP_ALT); tcode += 1 + LINK_SIZE; @@ -1189,7 +1199,7 @@ do case OP_QUERY: case OP_MINQUERY: case OP_POSQUERY: - tcode = set_table_bit(re, tcode + 1, FALSE, utf); + tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp); break; case OP_STARI: @@ -1198,7 +1208,7 @@ do case OP_QUERYI: case OP_MINQUERYI: case 
OP_POSQUERYI: - tcode = set_table_bit(re, tcode + 1, TRUE, utf); + tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp); break; /* Single-char upto sets the bit and tries the next */ @@ -1206,13 +1216,13 @@ do case OP_UPTO: case OP_MINUPTO: case OP_POSUPTO: - tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf); + tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp); break; case OP_UPTOI: case OP_MINUPTOI: case OP_POSUPTOI: - tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf); + tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp); break; /* At least one single char sets the bit and stops */ @@ -1224,7 +1234,7 @@ do case OP_PLUS: case OP_MINPLUS: case OP_POSPLUS: - (void)set_table_bit(re, tcode + 1, FALSE, utf); + (void)set_table_bit(re, tcode + 1, FALSE, utf, ucp); try_next = FALSE; break; @@ -1235,7 +1245,7 @@ do case OP_PLUSI: case OP_MINPLUSI: case OP_POSPLUSI: - (void)set_table_bit(re, tcode + 1, TRUE, utf); + (void)set_table_bit(re, tcode + 1, TRUE, utf, ucp); try_next = FALSE; break; @@ -1664,6 +1674,7 @@ PRIV(study)(pcre2_real_code *re) int count = 0; PCRE2_UCHAR *code; BOOL utf = (re->overall_options & PCRE2_UTF) != 0; +BOOL ucp = (re->overall_options & PCRE2_UCP) != 0; /* Find start of compiled code */ @@ -1677,7 +1688,7 @@ code units. */ if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0) { int depth = 0; - int rc = set_start_bits(re, code, utf, &depth); + int rc = set_start_bits(re, code, utf, ucp, &depth); if (rc == SSB_UNKNOWN) return 1; /* If a list of starting code units was set up, scan the list to see if only @@ -1695,7 +1706,7 @@ if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0) int b = -1; uint8_t *p = re->start_bitmap; uint32_t flags = PCRE2_FIRSTMAPSET; - + for (i = 0; i < 256; p++, i += 8) { uint8_t x = *p; @@ -1725,27 +1736,27 @@ if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0) } /* c contains the code unit value, in the range 0-255. 
In 8-bit UTF - mode, only values < 128 can be used. */ + mode, only values < 128 can be used. In all the other cases, c is a + character value. */ #if PCRE2_CODE_UNIT_WIDTH == 8 - if (c > 127) goto DONE; + if (utf && c > 127) goto DONE; #endif - if (a < 0) a = c; /* First one found */ + if (a < 0) a = c; /* First one found, save in a */ else if (b < 0) /* Second one found */ { int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c); - + #ifdef SUPPORT_UNICODE -#if PCRE2_CODE_UNIT_WIDTH == 8 - if (utf && UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */ -#else /* 16-bit or 32-bit */ - if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */ - if (utf && c > 127) d = UCD_OTHERCASE(c); -#endif /* Code width */ + if (utf || ucp) + { + if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */ + if (c > 127) d = UCD_OTHERCASE(c); + } #endif /* SUPPORT_UNICODE */ - if (d != a) goto DONE; /* Not other case of a */ - b = c; + if (d != a) goto DONE; /* Not the other case of a */ + b = c; /* Save second in b */ } else goto DONE; /* More than two characters found */ } diff --git a/src/pcre2_substitute.c b/src/pcre2_substitute.c index 222cb32..981a106 100644 --- a/src/pcre2_substitute.c +++ b/src/pcre2_substitute.c @@ -236,6 +236,7 @@ BOOL use_existing_match; BOOL replacement_only; #ifdef SUPPORT_UNICODE BOOL utf = (code->overall_options & PCRE2_UTF) != 0; +BOOL ucp = (code->overall_options & PCRE2_UCP) != 0; #endif PCRE2_UCHAR temp[6]; PCRE2_SPTR ptr; @@ -758,7 +759,7 @@ do if (forcecase != 0) { #ifdef SUPPORT_UNICODE - if (utf) + if (utf || ucp) { uint32_t type = UCD_CHARTYPE(ch); if (PRIV(ucp_gentype)[type] == ucp_L && @@ -860,7 +861,7 @@ do if (forcecase != 0) { #ifdef SUPPORT_UNICODE - if (utf) + if (utf || ucp) { uint32_t type = UCD_CHARTYPE(ch); if (PRIV(ucp_gentype)[type] == ucp_L && diff --git a/testdata/testinput10 b/testdata/testinput10 index 3813709..be6d426 100644 --- a/testdata/testinput10 +++ b/testdata/testinput10 @@ -570,8 +570,10 @@ 
/[\xff\x{ffff}]/I,utf /[\xff\x{ff}]/I,utf + abc\x{ff}def /[\xff\x{ff}]/I + abc\x{ff}def /[Ss]/I @@ -585,4 +587,31 @@ abc\x80\=startchar abc\x80\=startchar,offset=3 +#subject no_jit + +/\x{c1}+\x{e1}/iIB,ucp + \x{c1}\x{c1}\x{c1} + \x{e1}\x{e1}\x{e1} + +/a|\x{c1}/iI,ucp + \x{e1}xxx + +/a|\x{c1}/iI,utf + \x{e1}xxx + +/\x{c1}|\x{e1}/iI,ucp + +/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended + X\x{e1}Y + +/X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended + X\x{c1}Y + +# Without UTF or UCP characters > 127 have only one case in the default locale. + +/X(\x{e1})Y/replace=>\U$1<,substitute_extended + X\x{e1}Y + +#subject + # End of testinput10 diff --git a/testdata/testinput12 b/testdata/testinput12 index bed00a5..32e97b5 100644 --- a/testdata/testinput12 +++ b/testdata/testinput12 @@ -463,4 +463,71 @@ /(?:\x{ff}|\x{3000})/I,utf +# ---------------------------------------------------- +# UCP and casing tests + +/\x{120}/i,I + +/\x{c1}/i,I,ucp + +/[\x{120}\x{121}]/iB,ucp + +/[ab\x{120}]+/iB,ucp + aABb\x{121}\x{120} + +#subject no_jit + +/\x{c1}/i,no_start_optimize +\= Expect no match + \x{e1} + +/\x{120}\x{c1}/i,ucp,no_start_optimize + \x{121}\x{e1} + +/\x{120}\x{c1}/i,ucp + \x{121}\x{e1} + +/[^\x{120}]/i,no_start_optimize + \x{121} + +/[^\x{120}]/i,ucp,no_start_optimize +\= Expect no match + \x{121} + +/[^\x{120}]/i + \x{121} + +/[^\x{120}]/i,ucp +\= Expect no match + \x{121} + +/\x{120}{2}/i,ucp + \x{121}\x{121} + +/[^\x{120}]{2}/i,ucp +\= Expect no match + \x{121}\x{121} + +/\x{c1}+\x{e1}/iB,ucp + \x{c1}\x{c1}\x{c1} + +/\x{c1}+\x{e1}/iIB,ucp + \x{c1}\x{c1}\x{c1} + \x{e1}\x{e1}\x{e1} + +/a|\x{c1}/iI,ucp + \x{e1}xxx + +/\x{c1}|\x{e1}/iI,ucp + +/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended + X\x{e1}Y + +/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended + X\x{121}Y + +#subject + +# ---------------------------------------------------- + # End of testinput12 diff --git a/testdata/testinput14 b/testdata/testinput14 index f97f3ec..8a17ae7 100644 --- 
a/testdata/testinput14 +++ b/testdata/testinput14 @@ -1,9 +1,12 @@ -# These test special (mostly error) UTF features of DFA matching. They are a -# selection of the more comprehensive tests that are run for non-DFA matching. -# The output is different for the different widths. +# These test special UTF and UCP features of DFA matching. The output is +# different for the different widths. #subject dfa +# ---------------------------------------------------- +# These are a selection of the more comprehensive tests that are run for +# non-DFA matching. + /X/utf XX\x{d800} XX\x{d800}\=offset=3 @@ -33,5 +36,46 @@ XX\xef\x80\=ph \xf7\=ph \xf7\x80\=ph + +# ---------------------------------------------------- +# UCP and casing tests - except for the first two, these will all fail in 8-bit +# mode because they are testing UCP without UTF and use characters > 255. + +/\x{c1}/i,no_start_optimize +\= Expect no match + \x{e1} + +/\x{c1}+\x{e1}/iB,ucp + \x{c1}\x{c1}\x{c1} + \x{e1}\x{e1}\x{e1} + +/\x{120}\x{c1}/i,ucp,no_start_optimize + \x{121}\x{e1} + +/\x{120}\x{c1}/i,ucp + \x{121}\x{e1} + +/[^\x{120}]/i,no_start_optimize + \x{121} + +/[^\x{120}]/i,ucp,no_start_optimize +\= Expect no match + \x{121} + +/[^\x{120}]/i + \x{121} + +/[^\x{120}]/i,ucp +\= Expect no match + \x{121} + +/\x{120}{2}/i,ucp + \x{121}\x{121} + +/[^\x{120}]{2}/i,ucp +\= Expect no match + \x{121}\x{121} + +# ---------------------------------------------------- # End of testinput14 diff --git a/testdata/testoutput10 b/testdata/testoutput10 index 775c2ab..9fe5ef6 100644 --- a/testdata/testoutput10 +++ b/testdata/testoutput10 @@ -1780,11 +1780,15 @@ Capture group count = 0 Options: utf Starting code units: \xc3 Subject length lower bound = 1 + abc\x{ff}def + 0: \x{ff} /[\xff\x{ff}]/I Capture group count = 0 -Starting code units: \xff +First code unit = \xff Subject length lower bound = 1 + abc\x{ff}def + 0: \xff /[Ss]/I Capture group count = 0 @@ -1813,4 +1817,62 @@ Failed: error -22: UTF-8 error: isolated byte 
with 0x80 bit set at offset 3 abc\x80\=startchar,offset=3 Error -36 (bad UTF-8 offset) +#subject no_jit + +/\x{c1}+\x{e1}/iIB,ucp +------------------------------------------------------------------ + Bra + /i \x{c1}+ + /i \x{e1} + Ket + End +------------------------------------------------------------------ +Capture group count = 0 +Options: caseless ucp +First code unit = \xc1 (caseless) +Last code unit = \xe1 (caseless) +Subject length lower bound = 2 + \x{c1}\x{c1}\x{c1} + 0: \xc1\xc1\xc1 + \x{e1}\x{e1}\x{e1} + 0: \xe1\xe1\xe1 + +/a|\x{c1}/iI,ucp +Capture group count = 0 +Options: caseless ucp +Starting code units: A a \xc1 \xe1 +Subject length lower bound = 1 + \x{e1}xxx + 0: \xe1 + +/a|\x{c1}/iI,utf +Capture group count = 0 +Options: caseless utf +Starting code units: A a \xc3 +Subject length lower bound = 1 + \x{e1}xxx + 0: \x{e1} + +/\x{c1}|\x{e1}/iI,ucp +Capture group count = 0 +Options: caseless ucp +First code unit = \xc1 (caseless) +Subject length lower bound = 1 + +/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended + X\x{e1}Y + 1: >\xc1< + +/X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended + X\x{c1}Y + 1: >\xe1< + +# Without UTF or UCP characters > 127 have only one case in the default locale. 
+ +/X(\x{e1})Y/replace=>\U$1<,substitute_extended + X\x{e1}Y + 1: >\xe1< + +#subject + # End of testinput10 diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16 index 3006bc1..b944311 100644 --- a/testdata/testoutput12-16 +++ b/testdata/testoutput12-16 @@ -1613,7 +1613,7 @@ Subject length lower bound = 1 /[Ss]/I Capture group count = 0 -Starting code units: S s +First code unit = 'S' (caseless) Subject length lower bound = 1 /[Ss]/I,utf @@ -1628,4 +1628,134 @@ Options: utf Starting code units: \xff Subject length lower bound = 1 +# ---------------------------------------------------- +# UCP and casing tests + +/\x{120}/i,I +Capture group count = 0 +Options: caseless +First code unit = \x{120} +Subject length lower bound = 1 + +/\x{c1}/i,I,ucp +Capture group count = 0 +Options: caseless ucp +First code unit = \xc1 (caseless) +Subject length lower bound = 1 + +/[\x{120}\x{121}]/iB,ucp +------------------------------------------------------------------ + Bra + /i \x{120} + Ket + End +------------------------------------------------------------------ + +/[ab\x{120}]+/iB,ucp +------------------------------------------------------------------ + Bra + [ABab\x{120}-\x{121}]++ + Ket + End +------------------------------------------------------------------ + aABb\x{121}\x{120} + 0: aABb\x{121}\x{120} + +#subject no_jit + +/\x{c1}/i,no_start_optimize +\= Expect no match + \x{e1} +No match + +/\x{120}\x{c1}/i,ucp,no_start_optimize + \x{121}\x{e1} + 0: \x{121}\xe1 + +/\x{120}\x{c1}/i,ucp + \x{121}\x{e1} + 0: \x{121}\xe1 + +/[^\x{120}]/i,no_start_optimize + \x{121} + 0: \x{121} + +/[^\x{120}]/i,ucp,no_start_optimize +\= Expect no match + \x{121} +No match + +/[^\x{120}]/i + \x{121} + 0: \x{121} + +/[^\x{120}]/i,ucp +\= Expect no match + \x{121} +No match + +/\x{120}{2}/i,ucp + \x{121}\x{121} + 0: \x{121}\x{121} + +/[^\x{120}]{2}/i,ucp +\= Expect no match + \x{121}\x{121} +No match + +/\x{c1}+\x{e1}/iB,ucp 
+------------------------------------------------------------------ + Bra + /i \x{c1}+ + /i \x{e1} + Ket + End +------------------------------------------------------------------ + \x{c1}\x{c1}\x{c1} + 0: \xc1\xc1\xc1 + +/\x{c1}+\x{e1}/iIB,ucp +------------------------------------------------------------------ + Bra + /i \x{c1}+ + /i \x{e1} + Ket + End +------------------------------------------------------------------ +Capture group count = 0 +Options: caseless ucp +First code unit = \xc1 (caseless) +Last code unit = \xe1 (caseless) +Subject length lower bound = 2 + \x{c1}\x{c1}\x{c1} + 0: \xc1\xc1\xc1 + \x{e1}\x{e1}\x{e1} + 0: \xe1\xe1\xe1 + +/a|\x{c1}/iI,ucp +Capture group count = 0 +Options: caseless ucp +Starting code units: A a \xc1 \xe1 +Subject length lower bound = 1 + \x{e1}xxx + 0: \xe1 + +/\x{c1}|\x{e1}/iI,ucp +Capture group count = 0 +Options: caseless ucp +First code unit = \xc1 (caseless) +Subject length lower bound = 1 + +/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended + X\x{e1}Y + 1: >\xc1< + +/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended + X\x{121}Y + 1: >\x{120}< + +#subject + +# ---------------------------------------------------- + # End of testinput12 diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32 index ad240e2..74ccac8 100644 --- a/testdata/testoutput12-32 +++ b/testdata/testoutput12-32 @@ -1611,7 +1611,7 @@ Subject length lower bound = 1 /[Ss]/I Capture group count = 0 -Starting code units: S s +First code unit = 'S' (caseless) Subject length lower bound = 1 /[Ss]/I,utf @@ -1626,4 +1626,134 @@ Options: utf Starting code units: \xff Subject length lower bound = 1 +# ---------------------------------------------------- +# UCP and casing tests + +/\x{120}/i,I +Capture group count = 0 +Options: caseless +First code unit = \x{120} +Subject length lower bound = 1 + +/\x{c1}/i,I,ucp +Capture group count = 0 +Options: caseless ucp +First code unit = \xc1 (caseless) +Subject length lower bound = 1 + +/[\x{120}\x{121}]/iB,ucp 
+------------------------------------------------------------------ + Bra + /i \x{120} + Ket + End +------------------------------------------------------------------ + +/[ab\x{120}]+/iB,ucp +------------------------------------------------------------------ + Bra + [ABab\x{120}-\x{121}]++ + Ket + End +------------------------------------------------------------------ + aABb\x{121}\x{120} + 0: aABb\x{121}\x{120} + +#subject no_jit + +/\x{c1}/i,no_start_optimize +\= Expect no match + \x{e1} +No match + +/\x{120}\x{c1}/i,ucp,no_start_optimize + \x{121}\x{e1} + 0: \x{121}\xe1 + +/\x{120}\x{c1}/i,ucp + \x{121}\x{e1} + 0: \x{121}\xe1 + +/[^\x{120}]/i,no_start_optimize + \x{121} + 0: \x{121} + +/[^\x{120}]/i,ucp,no_start_optimize +\= Expect no match + \x{121} +No match + +/[^\x{120}]/i + \x{121} + 0: \x{121} + +/[^\x{120}]/i,ucp +\= Expect no match + \x{121} +No match + +/\x{120}{2}/i,ucp + \x{121}\x{121} + 0: \x{121}\x{121} + +/[^\x{120}]{2}/i,ucp +\= Expect no match + \x{121}\x{121} +No match + +/\x{c1}+\x{e1}/iB,ucp +------------------------------------------------------------------ + Bra + /i \x{c1}+ + /i \x{e1} + Ket + End +------------------------------------------------------------------ + \x{c1}\x{c1}\x{c1} + 0: \xc1\xc1\xc1 + +/\x{c1}+\x{e1}/iIB,ucp +------------------------------------------------------------------ + Bra + /i \x{c1}+ + /i \x{e1} + Ket + End +------------------------------------------------------------------ +Capture group count = 0 +Options: caseless ucp +First code unit = \xc1 (caseless) +Last code unit = \xe1 (caseless) +Subject length lower bound = 2 + \x{c1}\x{c1}\x{c1} + 0: \xc1\xc1\xc1 + \x{e1}\x{e1}\x{e1} + 0: \xe1\xe1\xe1 + +/a|\x{c1}/iI,ucp +Capture group count = 0 +Options: caseless ucp +Starting code units: A a \xc1 \xe1 +Subject length lower bound = 1 + \x{e1}xxx + 0: \xe1 + +/\x{c1}|\x{e1}/iI,ucp +Capture group count = 0 +Options: caseless ucp +First code unit = \xc1 (caseless) +Subject length lower bound = 1 + 
+/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended + X\x{e1}Y + 1: >\xc1< + +/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended + X\x{121}Y + 1: >\x{120}< + +#subject + +# ---------------------------------------------------- + # End of testinput12 diff --git a/testdata/testoutput14-16 b/testdata/testoutput14-16 index 05b7d48..2d58f1c 100644 --- a/testdata/testoutput14-16 +++ b/testdata/testoutput14-16 @@ -1,9 +1,12 @@ -# These test special (mostly error) UTF features of DFA matching. They are a -# selection of the more comprehensive tests that are run for non-DFA matching. -# The output is different for the different widths. +# These test special UTF and UCP features of DFA matching. The output is +# different for the different widths. #subject dfa +# ---------------------------------------------------- +# These are a selection of the more comprehensive tests that are run for +# non-DFA matching. + /X/utf XX\x{d800} Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2 @@ -57,5 +60,66 @@ No match No match \xf7\x80\=ph No match + +# ---------------------------------------------------- +# UCP and casing tests - except for the first two, these will all fail in 8-bit +# mode because they are testing UCP without UTF and use characters > 255. 
+ +/\x{c1}/i,no_start_optimize +\= Expect no match + \x{e1} +No match + +/\x{c1}+\x{e1}/iB,ucp +------------------------------------------------------------------ + Bra + /i \x{c1}+ + /i \x{e1} + Ket + End +------------------------------------------------------------------ + \x{c1}\x{c1}\x{c1} + 0: \xc1\xc1\xc1 + 1: \xc1\xc1 + \x{e1}\x{e1}\x{e1} + 0: \xe1\xe1\xe1 + 1: \xe1\xe1 + +/\x{120}\x{c1}/i,ucp,no_start_optimize + \x{121}\x{e1} + 0: \x{121}\xe1 + +/\x{120}\x{c1}/i,ucp + \x{121}\x{e1} + 0: \x{121}\xe1 + +/[^\x{120}]/i,no_start_optimize + \x{121} + 0: \x{121} + +/[^\x{120}]/i,ucp,no_start_optimize +\= Expect no match + \x{121} +No match + +/[^\x{120}]/i + \x{121} + 0: \x{121} + +/[^\x{120}]/i,ucp +\= Expect no match + \x{121} +No match + +/\x{120}{2}/i,ucp + \x{121}\x{121} + 0: \x{121}\x{121} + +/[^\x{120}]{2}/i,ucp +\= Expect no match + \x{121}\x{121} +No match + +# ---------------------------------------------------- # End of testinput14 diff --git a/testdata/testoutput14-32 b/testdata/testoutput14-32 index 30d7fa6..f1f65b7 100644 --- a/testdata/testoutput14-32 +++ b/testdata/testoutput14-32 @@ -1,9 +1,12 @@ -# These test special (mostly error) UTF features of DFA matching. They are a -# selection of the more comprehensive tests that are run for non-DFA matching. -# The output is different for the different widths. +# These test special UTF and UCP features of DFA matching. The output is +# different for the different widths. #subject dfa +# ---------------------------------------------------- +# These are a selection of the more comprehensive tests that are run for +# non-DFA matching. + /X/utf XX\x{d800} Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2 @@ -57,5 +60,66 @@ No match No match \xf7\x80\=ph No match + +# ---------------------------------------------------- +# UCP and casing tests - except for the first two, these will all fail in 8-bit +# mode because they are testing UCP without UTF and use characters > 255. 
+ +/\x{c1}/i,no_start_optimize +\= Expect no match + \x{e1} +No match + +/\x{c1}+\x{e1}/iB,ucp +------------------------------------------------------------------ + Bra + /i \x{c1}+ + /i \x{e1} + Ket + End +------------------------------------------------------------------ + \x{c1}\x{c1}\x{c1} + 0: \xc1\xc1\xc1 + 1: \xc1\xc1 + \x{e1}\x{e1}\x{e1} + 0: \xe1\xe1\xe1 + 1: \xe1\xe1 + +/\x{120}\x{c1}/i,ucp,no_start_optimize + \x{121}\x{e1} + 0: \x{121}\xe1 + +/\x{120}\x{c1}/i,ucp + \x{121}\x{e1} + 0: \x{121}\xe1 + +/[^\x{120}]/i,no_start_optimize + \x{121} + 0: \x{121} + +/[^\x{120}]/i,ucp,no_start_optimize +\= Expect no match + \x{121} +No match + +/[^\x{120}]/i + \x{121} + 0: \x{121} + +/[^\x{120}]/i,ucp +\= Expect no match + \x{121} +No match + +/\x{120}{2}/i,ucp + \x{121}\x{121} + 0: \x{121}\x{121} + +/[^\x{120}]{2}/i,ucp +\= Expect no match + \x{121}\x{121} +No match + +# ---------------------------------------------------- # End of testinput14 diff --git a/testdata/testoutput14-8 b/testdata/testoutput14-8 index 1fb0dc1..aa62414 100644 --- a/testdata/testoutput14-8 +++ b/testdata/testoutput14-8 @@ -1,9 +1,12 @@ -# These test special (mostly error) UTF features of DFA matching. They are a -# selection of the more comprehensive tests that are run for non-DFA matching. -# The output is different for the different widths. +# These test special UTF and UCP features of DFA matching. The output is +# different for the different widths. #subject dfa +# ---------------------------------------------------- +# These are a selection of the more comprehensive tests that are run for +# non-DFA matching. 
+ /X/utf XX\x{d800} Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2 @@ -57,5 +60,66 @@ Failed: error -3: UTF-8 error: 1 byte missing at end at offset 2 Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 0 \xf7\x80\=ph Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 0 + +# ---------------------------------------------------- +# UCP and casing tests - except for the first two, these will all fail in 8-bit +# mode because they are testing UCP without UTF and use characters > 255. + +/\x{c1}/i,no_start_optimize +\= Expect no match + \x{e1} +No match + +/\x{c1}+\x{e1}/iB,ucp +------------------------------------------------------------------ + Bra + /i \x{c1}+ + /i \x{e1} + Ket + End +------------------------------------------------------------------ + \x{c1}\x{c1}\x{c1} + 0: \xc1\xc1\xc1 + 1: \xc1\xc1 + \x{e1}\x{e1}\x{e1} + 0: \xe1\xe1\xe1 + 1: \xe1\xe1 + +/\x{120}\x{c1}/i,ucp,no_start_optimize +Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large + \x{121}\x{e1} + +/\x{120}\x{c1}/i,ucp +Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large + \x{121}\x{e1} + +/[^\x{120}]/i,no_start_optimize +Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large + \x{121} + +/[^\x{120}]/i,ucp,no_start_optimize +Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large +\= Expect no match + \x{121} + +/[^\x{120}]/i +Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large + \x{121} + +/[^\x{120}]/i,ucp +Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large +\= Expect no match + \x{121} + +/\x{120}{2}/i,ucp +Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large + \x{121}\x{121} + +/[^\x{120}]{2}/i,ucp +Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large +\= Expect no 
match + \x{121}\x{121} + +# ---------------------------------------------------- # End of testinput14