From 42f547bf4d25f755b9076405a363145f6a1b2659 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Tue, 12 Sep 2017 16:28:42 +0000 Subject: [PATCH] Replace multiple copies of extended grapheme sequence code with a single subroutine. --- CMakeLists.txt | 1 + ChangeLog | 6 +- Makefile.am | 1 + NON-AUTOTOOLS-BUILD | 4 +- PrepareRelease | 2 + README | 3 +- src/pcre2_dfa_match.c | 213 ++---------------------------------------- src/pcre2_extuni.c | 129 +++++++++++++++++++++++++ src/pcre2_internal.h | 3 + src/pcre2_match.c | 204 ++-------------------------------------- 10 files changed, 163 insertions(+), 403 deletions(-) create mode 100644 src/pcre2_extuni.c diff --git a/CMakeLists.txt b/CMakeLists.txt index fbc37fe..7303dcc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -432,6 +432,7 @@ SET(PCRE2_SOURCES src/pcre2_convert.c src/pcre2_dfa_match.c src/pcre2_error.c + src/pcre2_extuni.c src/pcre2_find_bracket.c src/pcre2_jit_compile.c src/pcre2_maketables.c diff --git a/ChangeLog b/ChangeLog index 2a49eef..1384320 100644 --- a/ChangeLog +++ b/ChangeLog @@ -5,7 +5,11 @@ Change Log for PCRE2 Version 10.31 xx-xxx-201x ------------------------- -1. Fix typo (missing ]) in VMS code in pcre2test.c. +1. Fix typo (missing ]) in VMS code in pcre2test.c. + +2. Replace the replicated code for matching extended Unicode grapheme sequences +(which got a lot more complicated by change 10.30/49) by a single subroutine +that is called by both pcre2_match() and pcre2_dfa_match(). 
Version 10.30 14-August-2017 diff --git a/Makefile.am b/Makefile.am index 7fa98c5..7dbe569 100644 --- a/Makefile.am +++ b/Makefile.am @@ -351,6 +351,7 @@ COMMON_SOURCES = \ src/pcre2_convert.c \ src/pcre2_dfa_match.c \ src/pcre2_error.c \ + src/pcre2_extuni.c \ src/pcre2_find_bracket.c \ src/pcre2_internal.h \ src/pcre2_intmodedep.h \ diff --git a/NON-AUTOTOOLS-BUILD b/NON-AUTOTOOLS-BUILD index f8c6359..2898948 100644 --- a/NON-AUTOTOOLS-BUILD +++ b/NON-AUTOTOOLS-BUILD @@ -91,8 +91,10 @@ can skip ahead to the CMake section. pcre2_compile.c pcre2_config.c pcre2_context.c + pcre2_convert.c pcre2_dfa_match.c pcre2_error.c + pcre2_extuni.c pcre2_find_bracket.c pcre2_jit_compile.c pcre2_maketables.c @@ -377,4 +379,4 @@ and executable, is in EBCDIC and native z/OS file formats and this is the recommended download site. ============================= -Last Updated: 17 March 2017 +Last Updated: 12 September 2017 diff --git a/PrepareRelease b/PrepareRelease index 0cd4c96..9aa6b7d 100755 --- a/PrepareRelease +++ b/PrepareRelease @@ -196,8 +196,10 @@ files="\ src/pcre2_compile.c \ src/pcre2_config.c \ src/pcre2_context.c \ + src/pcre2_convert.c \ src/pcre2_dfa_match.c \ src/pcre2_error.c \ + src/pcre2_extuni.c \ src/pcre2_find_bracket.c \ src/pcre2_internal.h \ src/pcre2_intmodedep.h \ diff --git a/README b/README index bed0513..2e376b0 100644 --- a/README +++ b/README @@ -773,6 +773,7 @@ The distribution should contain the files listed below. src/pcre2_convert.c ) src/pcre2_dfa_match.c ) src/pcre2_error.c ) + src/pcre2_extuni.c ) src/pcre2_find_bracket.c ) src/pcre2_jit_compile.c ) src/pcre2_jit_match.c ) sources for the functions in the library, @@ -882,4 +883,4 @@ The distribution should contain the files listed below. 
Philip Hazel Email local part: ph10 Email domain: cam.ac.uk -Last updated: 18 July 2017 +Last updated: 12 September 2017 diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c index 5ae1394..b78ad07 100644 --- a/src/pcre2_dfa_match.c +++ b/src/pcre2_dfa_match.c @@ -1364,63 +1364,14 @@ for (;;) if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } if (clen > 0) { - uint32_t lgb, rgb; - PCRE2_SPTR nptr = ptr + clen; int ncount = 0; if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS) { active_count--; /* Remove non-match possibility */ next_active_state--; } - lgb = UCD_GRAPHBREAK(c); - while (nptr < end_subject) - { - dlen = 1; - if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } - rgb = UCD_GRAPHBREAK(d); - if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; - - /* Not breaking between Regional Indicators is allowed only if - there are an even number of preceding RIs. */ - - if (lgb == ucp_gbRegionalIndicator && - rgb == ucp_gbRegionalIndicator) - { - int ricount = 0; - PCRE2_SPTR bptr = nptr - 1; -#ifdef SUPPORT_UNICODE - if (utf) BACKCHAR(bptr); -#endif - /* bptr is pointing to the left-hand character */ - - while (bptr > mb->start_subject) - { - bptr--; -#ifdef SUPPORT_UNICODE - if (utf) - { - BACKCHAR(bptr); - GETCHAR(d, bptr); - } - else -#endif - d = *bptr; - if (UCD_GRAPHBREAK(d) != ucp_gbRegionalIndicator) break; - ricount++; - } - if ((ricount & 1) != 0) break; /* Grapheme break required */ - } - - /* If Extend follows E_Base[_GAZ] do not update lgb; this allows - any number of Extend before a following E_Modifier. 
*/ - - if (rgb != ucp_gbExtend || - (lgb != ucp_gbE_Base && lgb != ucp_gbE_Base_GAZ)) - lgb = rgb; - - ncount++; - nptr += dlen; - } + (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf, + &ncount); count++; ADD_NEW_DATA(-state_offset, count, ncount); } @@ -1663,8 +1614,6 @@ for (;;) ADD_ACTIVE(state_offset + 2, 0); if (clen > 0) { - uint32_t lgb, rgb; - PCRE2_SPTR nptr = ptr + clen; int ncount = 0; if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR || codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY) @@ -1672,55 +1621,8 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - lgb = UCD_GRAPHBREAK(c); - while (nptr < end_subject) - { - dlen = 1; - if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } - rgb = UCD_GRAPHBREAK(d); - if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; - - /* Not breaking between Regional Indicators is allowed only if - there are an even number of preceding RIs. */ - - if (lgb == ucp_gbRegionalIndicator && - rgb == ucp_gbRegionalIndicator) - { - int ricount = 0; - PCRE2_SPTR bptr = nptr - 1; -#ifdef SUPPORT_UNICODE - if (utf) BACKCHAR(bptr); -#endif - /* bptr is pointing to the left-hand character */ - - while (bptr > mb->start_subject) - { - bptr--; -#ifdef SUPPORT_UNICODE - if (utf) - { - BACKCHAR(bptr); - GETCHAR(d, bptr); - } - else -#endif - d = *bptr; - if (UCD_GRAPHBREAK(d) != ucp_gbRegionalIndicator) break; - ricount++; - } - if ((ricount & 1) != 0) break; /* Grapheme break required */ - } - - /* If Extend follows E_Base[_GAZ] do not update lgb; this allows - any number of Extend before a following E_Modifier. 
*/ - - if (rgb != ucp_gbExtend || - (lgb != ucp_gbE_Base && lgb != ucp_gbE_Base_GAZ)) - lgb = rgb; - - ncount++; - nptr += dlen; - } + (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf, + &ncount); ADD_NEW_DATA(-(state_offset + count), 0, ncount); } break; @@ -1973,63 +1875,15 @@ for (;;) count = current_state->count; /* Number already matched */ if (clen > 0) { - uint32_t lgb, rgb; - PCRE2_SPTR nptr = ptr + clen; + PCRE2_SPTR nptr; int ncount = 0; if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO) { active_count--; /* Remove non-match possibility */ next_active_state--; } - lgb = UCD_GRAPHBREAK(c); - while (nptr < end_subject) - { - dlen = 1; - if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } - rgb = UCD_GRAPHBREAK(d); - if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; - - /* Not breaking between Regional Indicators is allowed only if - there are an even number of preceding RIs. */ - - if (lgb == ucp_gbRegionalIndicator && - rgb == ucp_gbRegionalIndicator) - { - int ricount = 0; - PCRE2_SPTR bptr = nptr - 1; -#ifdef SUPPORT_UNICODE - if (utf) BACKCHAR(bptr); -#endif - /* bptr is pointing to the left-hand character */ - - while (bptr > mb->start_subject) - { - bptr--; -#ifdef SUPPORT_UNICODE - if (utf) - { - BACKCHAR(bptr); - GETCHAR(d, bptr); - } - else -#endif - d = *bptr; - if (UCD_GRAPHBREAK(d) != ucp_gbRegionalIndicator) break; - ricount++; - } - if ((ricount & 1) != 0) break; /* Grapheme break required */ - } - - /* If Extend follows E_Base[_GAZ] do not update lgb; this allows - any number of Extend before a following E_Modifier. 
*/ - - if (rgb != ucp_gbExtend || - (lgb != ucp_gbE_Base && lgb != ucp_gbE_Base_GAZ)) - lgb = rgb; - - ncount++; - nptr += dlen; - } + nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf, + &ncount); if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0) reset_could_continue = TRUE; if (++count >= (int)GET2(code, 1)) @@ -2206,58 +2060,9 @@ for (;;) case OP_EXTUNI: if (clen > 0) { - uint32_t lgb, rgb; - PCRE2_SPTR nptr = ptr + clen; int ncount = 0; - lgb = UCD_GRAPHBREAK(c); - while (nptr < end_subject) - { - dlen = 1; - if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } - rgb = UCD_GRAPHBREAK(d); - if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; - - /* Not breaking between Regional Indicators is allowed only if - there are an even number of preceding RIs. */ - - if (lgb == ucp_gbRegionalIndicator && - rgb == ucp_gbRegionalIndicator) - { - int ricount = 0; - PCRE2_SPTR bptr = nptr - 1; -#ifdef SUPPORT_UNICODE - if (utf) BACKCHAR(bptr); -#endif - /* bptr is pointing to the left-hand character */ - - while (bptr > mb->start_subject) - { - bptr--; -#ifdef SUPPORT_UNICODE - if (utf) - { - BACKCHAR(bptr); - GETCHAR(d, bptr); - } - else -#endif - d = *bptr; - if (UCD_GRAPHBREAK(d) != ucp_gbRegionalIndicator) break; - ricount++; - } - if ((ricount & 1) != 0) break; /* Grapheme break required */ - } - - /* If Extend follows E_Base[_GAZ] do not update lgb; this allows - any number of Extend before a following E_Modifier. 
*/ - - if (rgb != ucp_gbExtend || - (lgb != ucp_gbE_Base && lgb != ucp_gbE_Base_GAZ)) - lgb = rgb; - - ncount++; - nptr += dlen; - } + PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, + end_subject, utf, &ncount); if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0) reset_could_continue = TRUE; ADD_NEW_DATA(-(state_offset + 1), 0, ncount); diff --git a/src/pcre2_extuni.c b/src/pcre2_extuni.c new file mode 100644 index 0000000..ed56812 --- /dev/null +++ b/src/pcre2_extuni.c @@ -0,0 +1,129 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2017 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +/* This module contains an internal function that is used to match a Unicode +extended grapheme sequence. It is used by both pcre2_match() and +pcre2_dfa_match(). 
*/ + + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + + +#include "pcre2_internal.h" + +/************************************************* +* Match an extended grapheme sequence * +*************************************************/ + +/* +Arguments: + c the first character + eptr pointer to next character + start_subject pointer to start of subject + end_subject pointer to end of subject + utf TRUE if in UTF mode + xcount pointer to count of additional characters (incremented, + not reset, so the caller initializes it), or NULL if count not needed + +Returns: pointer after the end of the sequence +*/ + +PCRE2_SPTR +PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject, + PCRE2_SPTR end_subject, BOOL utf, int *xcount) +{ +int lgb = UCD_GRAPHBREAK(c); + +while (eptr < end_subject) + { + int rgb; + int len = 1; + if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } + rgb = UCD_GRAPHBREAK(c); + if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; + + /* Not breaking between Regional Indicators is allowed only if there + are an even number of preceding RIs. */ + + if (lgb == ucp_gbRegionalIndicator && rgb == ucp_gbRegionalIndicator) + { + int ricount = 0; + PCRE2_SPTR bptr = eptr - 1; +#ifdef SUPPORT_UNICODE + if (utf) BACKCHAR(bptr); +#endif + + /* bptr is pointing to the left-hand character */ + + while (bptr > start_subject) + { + bptr--; +#ifdef SUPPORT_UNICODE + if (utf) + { + BACKCHAR(bptr); + GETCHAR(c, bptr); + } + else +#endif + c = *bptr; + if (UCD_GRAPHBREAK(c) != ucp_gbRegionalIndicator) break; + ricount++; + } + if ((ricount & 1) != 0) break; /* Grapheme break required */ + } + + /* If Extend follows E_Base[_GAZ] do not update lgb; this allows + any number of Extend before a following E_Modifier. 
*/ + + if (rgb != ucp_gbExtend || + (lgb != ucp_gbE_Base && lgb != ucp_gbE_Base_GAZ)) + lgb = rgb; + + eptr += len; + if (xcount != NULL) *xcount += 1; + } + +return eptr; +} + +/* End of pcre2_extuni.c */ diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index 9ccce25..4886bf1 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -1926,6 +1926,7 @@ is available. */ #define _pcre2_auto_possessify PCRE2_SUFFIX(_pcre2_auto_possessify_) #define _pcre2_check_escape PCRE2_SUFFIX(_pcre2_check_escape_) +#define _pcre2_extuni PCRE2_SUFFIX(_pcre2_extuni_) #define _pcre2_find_bracket PCRE2_SUFFIX(_pcre2_find_bracket_) #define _pcre2_is_newline PCRE2_SUFFIX(_pcre2_is_newline_) #define _pcre2_jit_free_rodata PCRE2_SUFFIX(_pcre2_jit_free_rodata_) @@ -1949,6 +1950,8 @@ extern int _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL, const compile_block *); extern int _pcre2_check_escape(PCRE2_SPTR *, PCRE2_SPTR, uint32_t *, int *, uint32_t, BOOL, compile_block *); +extern PCRE2_SPTR _pcre2_extuni(uint32_t, PCRE2_SPTR, PCRE2_SPTR, PCRE2_SPTR, + BOOL, int *); extern PCRE2_SPTR _pcre2_find_bracket(PCRE2_SPTR, BOOL, int); extern BOOL _pcre2_is_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR, uint32_t *, BOOL); diff --git a/src/pcre2_match.c b/src/pcre2_match.c index 050b7e9..70bf936 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -2440,55 +2440,9 @@ fprintf(stderr, "++ op=%d\n", *Fecode); } else { - int lgb, rgb; GETCHARINCTEST(fc, Feptr); - lgb = UCD_GRAPHBREAK(fc); - while (Feptr < mb->end_subject) - { - int len = 1; - if (!utf) fc = *Feptr; else { GETCHARLEN(fc, Feptr, len); } - rgb = UCD_GRAPHBREAK(fc); - if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; - - /* Not breaking between Regional Indicators is allowed only if there - are an even number of preceding RIs. 
*/ - - if (lgb == ucp_gbRegionalIndicator && rgb == ucp_gbRegionalIndicator) - { - int ricount = 0; - PCRE2_SPTR bptr = Feptr - 1; -#ifdef SUPPORT_UNICODE - if (utf) BACKCHAR(bptr); -#endif - /* bptr is pointing to the left-hand character */ - - while (bptr > mb->start_subject) - { - bptr--; -#ifdef SUPPORT_UNICODE - if (utf) - { - BACKCHAR(bptr); - GETCHAR(fc, bptr); - } - else -#endif - fc = *bptr; - if (UCD_GRAPHBREAK(fc) != ucp_gbRegionalIndicator) break; - ricount++; - } - if ((ricount & 1) != 0) break; /* Grapheme break required */ - } - - /* If Extend follows E_Base[_GAZ] do not update lgb; this allows - any number of Extend before a following E_Modifier. */ - - if (rgb != ucp_gbExtend || - (lgb != ucp_gbE_Base && lgb != ucp_gbE_Base_GAZ)) - lgb = rgb; - - Feptr += len; - } + Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, utf, + NULL); } CHECK_PARTIAL(); Fecode++; @@ -2785,61 +2739,13 @@ fprintf(stderr, "++ op=%d\n", *Fecode); } else { - int lgb, rgb; GETCHARINCTEST(fc, Feptr); - lgb = UCD_GRAPHBREAK(fc); - while (Feptr < mb->end_subject) - { - int len = 1; - if (!utf) fc = *Feptr; else { GETCHARLEN(fc, Feptr, len); } - rgb = UCD_GRAPHBREAK(fc); - if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; - - /* Not breaking between Regional Indicators is allowed only if - there are an even number of preceding RIs. 
*/ - - if (lgb == ucp_gbRegionalIndicator && - rgb == ucp_gbRegionalIndicator) - { - int ricount = 0; - PCRE2_SPTR bptr = Feptr - 1; -#ifdef SUPPORT_UNICODE - if (utf) BACKCHAR(bptr); -#endif - /* bptr is pointing to the left-hand character */ - - while (bptr > mb->start_subject) - { - bptr--; -#ifdef SUPPORT_UNICODE - if (utf) - { - BACKCHAR(bptr); - GETCHAR(fc, bptr); - } - else -#endif - fc = *bptr; - if (UCD_GRAPHBREAK(fc) != ucp_gbRegionalIndicator) break; - ricount++; - } - if ((ricount & 1) != 0) break; /* Grapheme break required */ - } - - /* If Extend follows E_Base[_GAZ] do not update lgb; this allows - any number of Extend before a following E_Modifier. */ - - if (rgb != ucp_gbExtend || - (lgb != ucp_gbE_Base && lgb != ucp_gbE_Base_GAZ)) - lgb = rgb; - - Feptr += len; - } + Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, + mb->end_subject, utf, NULL); } CHECK_PARTIAL(); } } - else #endif /* SUPPORT_UNICODE */ @@ -3593,56 +3499,9 @@ fprintf(stderr, "++ op=%d\n", *Fecode); } else { - int lgb, rgb; GETCHARINCTEST(fc, Feptr); - lgb = UCD_GRAPHBREAK(fc); - while (Feptr < mb->end_subject) - { - int len = 1; - if (!utf) fc = *Feptr; else { GETCHARLEN(fc, Feptr, len); } - rgb = UCD_GRAPHBREAK(fc); - if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; - - /* Not breaking between Regional Indicators is allowed only if - there are an even number of preceding RIs. 
*/ - - if (lgb == ucp_gbRegionalIndicator && - rgb == ucp_gbRegionalIndicator) - { - int ricount = 0; - PCRE2_SPTR bptr = Feptr - 1; -#ifdef SUPPORT_UNICODE - if (utf) BACKCHAR(bptr); -#endif - /* bptr is pointing to the left-hand character */ - - while (bptr > mb->start_subject) - { - bptr--; -#ifdef SUPPORT_UNICODE - if (utf) - { - BACKCHAR(bptr); - GETCHAR(fc, bptr); - } - else -#endif - fc = *bptr; - if (UCD_GRAPHBREAK(fc) != ucp_gbRegionalIndicator) break; - ricount++; - } - if ((ricount & 1) != 0) break; /* Grapheme break required */ - } - - /* If Extend follows E_Base[_GAZ] do not update lgb; this allows - any number of Extend before a following E_Modifier. */ - - if (rgb != ucp_gbExtend || - (lgb != ucp_gbE_Base && lgb != ucp_gbE_Base_GAZ)) - lgb = rgb; - - Feptr += len; - } + Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, + utf, NULL); } CHECK_PARTIAL(); } @@ -4167,56 +4026,9 @@ fprintf(stderr, "++ op=%d\n", *Fecode); } else { - int lgb, rgb; GETCHARINCTEST(fc, Feptr); - lgb = UCD_GRAPHBREAK(fc); - while (Feptr < mb->end_subject) - { - int len = 1; - if (!utf) fc = *Feptr; else { GETCHARLEN(fc, Feptr, len); } - rgb = UCD_GRAPHBREAK(fc); - if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; - - /* Not breaking between Regional Indicators is allowed only if - there are an even number of preceding RIs. 
*/ - - if (lgb == ucp_gbRegionalIndicator && - rgb == ucp_gbRegionalIndicator) - { - int ricount = 0; - PCRE2_SPTR bptr = Feptr - 1; -#ifdef SUPPORT_UNICODE - if (utf) BACKCHAR(bptr); -#endif - /* bptr is pointing to the left-hand character */ - - while (bptr > mb->start_subject) - { - bptr--; -#ifdef SUPPORT_UNICODE - if (utf) - { - BACKCHAR(bptr); - GETCHAR(fc, bptr); - } - else -#endif - fc = *bptr; - if (UCD_GRAPHBREAK(fc) != ucp_gbRegionalIndicator) break; - ricount++; - } - if ((ricount & 1) != 0) break; /* Grapheme break required */ - } - - /* If Extend follows E_Base[_GAZ] do not update lgb; this allows - any number of Extend before a following E_Modifier. */ - - if (rgb != ucp_gbExtend || - (lgb != ucp_gbE_Base && lgb != ucp_gbE_Base_GAZ)) - lgb = rgb; - - Feptr += len; - } + Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, + utf, NULL); } CHECK_PARTIAL(); }