Update Script Run code to use the Script Extension property instead of the
Script property.
This commit is contained in:
parent
83726c359d
commit
4e7a204d18
|
@ -32,7 +32,7 @@ src/pcre2_chartables.c.dist are updated.
|
||||||
|
|
||||||
8. Implement the new Perl "script run" features (*script_run:...) and
|
8. Implement the new Perl "script run" features (*script_run:...) and
|
||||||
(*atomic_script_run:...) aka (*sr:...) and (*asr:...). At present, this is
|
(*atomic_script_run:...) aka (*sr:...) and (*asr:...). At present, this is
|
||||||
incomplete and not yet documented.
|
not yet documented.
|
||||||
|
|
||||||
|
|
||||||
Version 10.32 10-September-2018
|
Version 10.32 10-September-2018
|
||||||
|
|
|
@ -68,17 +68,26 @@ Arguments:
|
||||||
Returns: TRUE if this is a valid script run
|
Returns: TRUE if this is a valid script run
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#define SCRIPT_UNSET (-1)
|
/* These dummy values must be less than the negation of the largest offset in
|
||||||
#define SCRIPT_HANPENDING (-2)
|
the PRIV(ucd_script_sets) vector, which is held in a 16-bit field in UCD
|
||||||
#define SCRIPT_HANHIRAKATA (-3)
|
records (and is only likely to be a few hundred). */
|
||||||
#define SCRIPT_HANBOPOMOFO (-4)
|
|
||||||
#define SCRIPT_HANHANGUL (-5)
|
#define SCRIPT_UNSET (-99999)
|
||||||
|
#define SCRIPT_HANPENDING (-99998)
|
||||||
|
#define SCRIPT_HANHIRAKATA (-99997)
|
||||||
|
#define SCRIPT_HANBOPOMOFO (-99996)
|
||||||
|
#define SCRIPT_HANHANGUL (-99995)
|
||||||
|
#define SCRIPT_LIST (-99994)
|
||||||
|
|
||||||
|
#define INTERSECTION_LIST_SIZE 50
|
||||||
|
|
||||||
BOOL
|
BOOL
|
||||||
PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
|
PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
|
||||||
{
|
{
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
int require_script = SCRIPT_UNSET;
|
int require_script = SCRIPT_UNSET;
|
||||||
|
uint8_t intersection_list[INTERSECTION_LIST_SIZE];
|
||||||
|
const uint8_t *require_list = NULL;
|
||||||
uint32_t require_digitset = 0;
|
uint32_t require_digitset = 0;
|
||||||
uint32_t c;
|
uint32_t c;
|
||||||
|
|
||||||
|
@ -93,86 +102,290 @@ GETCHARINCTEST(c, ptr);
|
||||||
if (ptr >= endptr) return TRUE;
|
if (ptr >= endptr) return TRUE;
|
||||||
|
|
||||||
/* Scan strings of two or more characters, checking the Unicode characteristics
|
/* Scan strings of two or more characters, checking the Unicode characteristics
|
||||||
of each code point. */
|
of each code point. We make use of the Script Extensions property. There is
|
||||||
|
special code for scripts that can be combined with characters from the Han
|
||||||
|
Chinese script. This may be used in conjunction with four other scripts in
|
||||||
|
these combinations:
|
||||||
|
|
||||||
|
. Han with Hiragana and Katakana is allowed (for Japanese).
|
||||||
|
. Han with Bopomofo is allowed (for Taiwanese Mandarin).
|
||||||
|
. Han with Hangul is allowed (for Korean).
|
||||||
|
|
||||||
|
If the first significant character's script is one of the four, the required
|
||||||
|
script type is immediately known. However, if the first significant
|
||||||
|
character's script is Han, we have to keep checking for a non-Han character.
|
||||||
|
Hence the SCRIPT_HANPENDING state. */
|
||||||
|
|
||||||
for (;;)
|
for (;;)
|
||||||
{
|
{
|
||||||
const ucd_record *ucd = GET_UCD(c);
|
const ucd_record *ucd = GET_UCD(c);
|
||||||
uint32_t script = ucd->script;
|
int32_t scriptx = ucd->scriptx;
|
||||||
|
|
||||||
/* If the script is Unknown, the string is not a valid script run. Such
|
/* If the script extension is Unknown, the string is not a valid script run.
|
||||||
characters can only form script runs of length one. */
|
Such characters can only form script runs of length one. */
|
||||||
|
|
||||||
if (script == ucp_Unknown) return FALSE;
|
|
||||||
|
|
||||||
/* A character whose script is Inherited is always accepted, and plays no
|
if (scriptx == ucp_Unknown) return FALSE;
|
||||||
further part. A character whose script is Common is always accepted, but must
|
|
||||||
still be tested for a digit below. Otherwise, the character must match the
|
|
||||||
script of the first non-Inherited, non-Common character encountered. For most
|
|
||||||
scripts, the test is for the same script. However, the Han Chinese script may
|
|
||||||
be used in conjunction with four other scripts in these combinations:
|
|
||||||
|
|
||||||
. Han with Hiragana and Katakana is allowed (for Japanese).
|
/* A character whose script extension is Inherited is always accepted with
|
||||||
|
any script, and plays no further part in this testing. A character whose
|
||||||
|
script is Common is always accepted, but must still be tested for a digit
|
||||||
|
below. The scriptx value at this point is non-zero, because zero is
|
||||||
|
ucp_Unknown, tested for above. */
|
||||||
|
|
||||||
. Han with Bopomofo is allowed (for Taiwanese Mandarin).
|
if (scriptx != ucp_Inherited)
|
||||||
|
{
|
||||||
. Han with Hangul is allowed (for Korean).
|
if (scriptx != ucp_Common)
|
||||||
|
|
||||||
If the first significant character's script is one of the four, the required
|
|
||||||
script type is immediately known. However, if the first significant
|
|
||||||
character's script is Han, we have to keep checking for a non-Han character.
|
|
||||||
Hence the SCRIPT_HANPENDING state. */
|
|
||||||
|
|
||||||
if (script != ucp_Inherited)
|
|
||||||
{
|
|
||||||
if (script != ucp_Common) switch(require_script)
|
|
||||||
{
|
{
|
||||||
default:
|
/* If the script extension value is positive, the character is not a mark
|
||||||
if (script != (unsigned int)require_script) return FALSE;
|
that can be used with many scripts. In the simple case we either set or
|
||||||
break;
|
compare with the required script. However, handling the scripts that can
|
||||||
|
combine with Han is more complicated, as is the case when the previous
|
||||||
case SCRIPT_UNSET:
|
characters have been many-script marks. */
|
||||||
case SCRIPT_HANPENDING:
|
|
||||||
switch(script)
|
if (scriptx > 0)
|
||||||
{
|
{
|
||||||
case ucp_Han:
|
switch(require_script)
|
||||||
require_script = SCRIPT_HANPENDING;
|
{
|
||||||
break;
|
/* Either the first significant character (require_script unset) or
|
||||||
|
after only Han characters. */
|
||||||
case ucp_Hiragana:
|
|
||||||
case ucp_Katakana:
|
case SCRIPT_UNSET:
|
||||||
require_script = SCRIPT_HANHIRAKATA;
|
case SCRIPT_HANPENDING:
|
||||||
break;
|
switch(scriptx)
|
||||||
|
{
|
||||||
case ucp_Bopomofo:
|
case ucp_Han:
|
||||||
require_script = SCRIPT_HANBOPOMOFO;
|
require_script = SCRIPT_HANPENDING;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case ucp_Hangul:
|
case ucp_Hiragana:
|
||||||
require_script = SCRIPT_HANHANGUL;
|
case ucp_Katakana:
|
||||||
break;
|
require_script = SCRIPT_HANHIRAKATA;
|
||||||
|
break;
|
||||||
default:
|
|
||||||
if (require_script == SCRIPT_HANPENDING) return FALSE;
|
case ucp_Bopomofo:
|
||||||
require_script = script;
|
require_script = SCRIPT_HANBOPOMOFO;
|
||||||
break;
|
break;
|
||||||
}
|
|
||||||
break;
|
case ucp_Hangul:
|
||||||
|
require_script = SCRIPT_HANHANGUL;
|
||||||
case SCRIPT_HANHIRAKATA:
|
break;
|
||||||
if (script != ucp_Han && script != ucp_Hiragana && script != ucp_Katakana)
|
|
||||||
return FALSE;
|
/* Not a Han-related script. If expecting one, fail. Otherwise set
|
||||||
break;
|
the requirement to this script. */
|
||||||
|
|
||||||
case SCRIPT_HANBOPOMOFO:
|
default:
|
||||||
if (script != ucp_Han && script != ucp_Bopomofo) return FALSE;
|
if (require_script == SCRIPT_HANPENDING) return FALSE;
|
||||||
break;
|
require_script = scriptx;
|
||||||
|
break;
|
||||||
case SCRIPT_HANHANGUL:
|
}
|
||||||
if (script != ucp_Han && script != ucp_Hangul) return FALSE;
|
break;
|
||||||
break;
|
|
||||||
}
|
/* Previously encountered one of the "with Han" scripts. Check that
|
||||||
|
this character is appropriate. */
|
||||||
|
|
||||||
|
case SCRIPT_HANHIRAKATA:
|
||||||
|
if (scriptx != ucp_Han && scriptx != ucp_Hiragana &&
|
||||||
|
scriptx != ucp_Katakana)
|
||||||
|
return FALSE;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SCRIPT_HANBOPOMOFO:
|
||||||
|
if (scriptx != ucp_Han && scriptx != ucp_Bopomofo) return FALSE;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SCRIPT_HANHANGUL:
|
||||||
|
if (scriptx != ucp_Han && scriptx != ucp_Hangul) return FALSE;
|
||||||
|
break;
|
||||||
|
|
||||||
|
/* We have a list of scripts to check that is derived from one or
|
||||||
|
more previous characters. This is either one of the lists in
|
||||||
|
ucd_script_sets[] (for one previous character) or the intersection of
|
||||||
|
several lists for multiple characters. */
|
||||||
|
|
||||||
|
case SCRIPT_LIST:
|
||||||
|
{
|
||||||
|
const uint8_t *list;
|
||||||
|
for (list = require_list; *list != 0; list++)
|
||||||
|
{
|
||||||
|
if (*list == scriptx) break;
|
||||||
|
}
|
||||||
|
if (*list == 0) return FALSE;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The rest of the string must be in this script, but we have to
|
||||||
|
allow for the Han complications. */
|
||||||
|
|
||||||
|
switch(scriptx)
|
||||||
|
{
|
||||||
|
case ucp_Han:
|
||||||
|
require_script = SCRIPT_HANPENDING;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case ucp_Hiragana:
|
||||||
|
case ucp_Katakana:
|
||||||
|
require_script = SCRIPT_HANHIRAKATA;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case ucp_Bopomofo:
|
||||||
|
require_script = SCRIPT_HANBOPOMOFO;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case ucp_Hangul:
|
||||||
|
require_script = SCRIPT_HANHANGUL;
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
require_script = scriptx;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
/* This is the easy case when a single script is required. */
|
||||||
|
|
||||||
|
default:
|
||||||
|
if (scriptx != require_script) return FALSE;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} /* End of handling positive scriptx */
|
||||||
|
|
||||||
|
/* If scriptx is negative, this character is a mark-type character that
|
||||||
|
has a list of permitted scripts. */
|
||||||
|
|
||||||
|
else
|
||||||
|
{
|
||||||
|
uint32_t chspecial;
|
||||||
|
const uint8_t *clist, *rlist;
|
||||||
|
const uint8_t *list = PRIV(ucd_script_sets) - scriptx;
|
||||||
|
|
||||||
|
switch(require_script)
|
||||||
|
{
|
||||||
|
case SCRIPT_UNSET:
|
||||||
|
require_list = PRIV(ucd_script_sets) - scriptx;
|
||||||
|
require_script = SCRIPT_LIST;
|
||||||
|
break;
|
||||||
|
|
||||||
|
/* An inspection of the Unicode 11.0.0 files shows that there are the
|
||||||
|
following types of Script Extension list that involve the Han,
|
||||||
|
Bopomofo, Hiragana, Katakana, and Hangul scripts:
|
||||||
|
|
||||||
|
. Bopomofo + Han
|
||||||
|
. Han + Hiragana + Katakana
|
||||||
|
. Hiragana + Katakana
|
||||||
|
. Bopomofo + Hangul + Han + Hiragana + Katakana
|
||||||
|
|
||||||
|
The following code tries to make sense of this. */
|
||||||
|
|
||||||
|
#define FOUND_BOPOMOFO 1
|
||||||
|
#define FOUND_HIRAGANA 2
|
||||||
|
#define FOUND_KATAKANA 4
|
||||||
|
#define FOUND_HANGUL 8
|
||||||
|
|
||||||
|
case SCRIPT_HANPENDING:
|
||||||
|
chspecial = 0;
|
||||||
|
for (; *list != 0; list++)
|
||||||
|
{
|
||||||
|
switch (*list)
|
||||||
|
{
|
||||||
|
case ucp_Bopomofo: chspecial |= FOUND_BOPOMOFO; break;
|
||||||
|
case ucp_Hiragana: chspecial |= FOUND_HIRAGANA; break;
|
||||||
|
case ucp_Katakana: chspecial |= FOUND_KATAKANA; break;
|
||||||
|
case ucp_Hangul: chspecial |= FOUND_HANGUL; break;
|
||||||
|
default: break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (chspecial == 0) return FALSE;
|
||||||
|
|
||||||
|
if (chspecial == FOUND_BOPOMOFO)
|
||||||
|
{
|
||||||
|
require_script = SCRIPT_HANBOPOMOFO;
|
||||||
|
}
|
||||||
|
else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
|
||||||
|
{
|
||||||
|
require_script = SCRIPT_HANHIRAKATA;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Otherwise it must be allowed with all of them, so remain in
|
||||||
|
the pending state. */
|
||||||
|
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SCRIPT_HANHIRAKATA:
|
||||||
|
for (; *list != 0; list++)
|
||||||
|
{
|
||||||
|
if (*list == ucp_Hiragana || *list == ucp_Katakana) break;
|
||||||
|
}
|
||||||
|
if (*list == 0) return FALSE;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SCRIPT_HANBOPOMOFO:
|
||||||
|
for (; *list != 0; list++)
|
||||||
|
{
|
||||||
|
if (*list == ucp_Bopomofo) break;
|
||||||
|
}
|
||||||
|
if (*list == 0) return FALSE;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SCRIPT_HANHANGUL:
|
||||||
|
for (; *list != 0; list++)
|
||||||
|
{
|
||||||
|
if (*list == ucp_Hangul) break;
|
||||||
|
}
|
||||||
|
if (*list == 0) return FALSE;
|
||||||
|
break;
|
||||||
|
|
||||||
|
/* Previously encountered one or more characters that are allowed
|
||||||
|
with a list of scripts. Build the intersection of the required list
|
||||||
|
with this character's list in intersection_list[]. This code is
|
||||||
|
written so that it still works OK if the required list is already in
|
||||||
|
that vector. */
|
||||||
|
|
||||||
|
case SCRIPT_LIST:
|
||||||
|
{
|
||||||
|
int i = 0;
|
||||||
|
for (rlist = require_list; *rlist != 0; rlist++)
|
||||||
|
{
|
||||||
|
for (clist = list; *clist != 0; clist++)
|
||||||
|
{
|
||||||
|
if (*rlist == *clist)
|
||||||
|
{
|
||||||
|
intersection_list[i++] = *rlist;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (i == 0) return FALSE; /* No scripts in common */
|
||||||
|
|
||||||
|
/* If there's just one script in common, we can set it as the
|
||||||
|
unique required script. Otherwise, terminate the intersection list
|
||||||
|
and make it the required list. */
|
||||||
|
|
||||||
|
if (i == 1)
|
||||||
|
{
|
||||||
|
require_script = intersection_list[0];
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
intersection_list[i] = 0;
|
||||||
|
require_list = intersection_list;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
/* The previously set required script is a single script, not
|
||||||
|
Han-related. Check that it is in this character's list. */
|
||||||
|
|
||||||
|
default:
|
||||||
|
for (; *list != 0; list++)
|
||||||
|
{
|
||||||
|
if (*list == require_script) break;
|
||||||
|
}
|
||||||
|
if (*list == 0) return FALSE;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} /* End of handling negative scriptx */
|
||||||
|
} /* End of checking non-Common character */
|
||||||
|
|
||||||
/* The character is in an acceptable script. We must now ensure that all
|
/* The character is in an acceptable script. We must now ensure that all
|
||||||
decimal digits in the string come from the same set. Some scripts (e.g.
|
decimal digits in the string come from the same set. Some scripts (e.g.
|
||||||
Common, Arabic) have more than one set of decimal digits. This code does
|
Common, Arabic) have more than one set of decimal digits. This code does
|
||||||
|
@ -182,11 +395,11 @@ for (;;)
|
||||||
'9' characters in every set of 10 digits. Each set is identified by the
|
'9' characters in every set of 10 digits. Each set is identified by the
|
||||||
offset in the vector of its '9' character. An initial check of the first
|
offset in the vector of its '9' character. An initial check of the first
|
||||||
value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
|
value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
|
||||||
|
|
||||||
if (ucd->chartype == ucp_Nd)
|
if (ucd->chartype == ucp_Nd)
|
||||||
{
|
{
|
||||||
uint32_t digitset;
|
uint32_t digitset;
|
||||||
|
|
||||||
if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
|
if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
|
||||||
{
|
{
|
||||||
int mid;
|
int mid;
|
||||||
|
@ -203,9 +416,9 @@ for (;;)
|
||||||
if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
|
if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* A required value of 0 means "unset". */
|
/* A required value of 0 means "unset". */
|
||||||
|
|
||||||
if (require_digitset == 0) require_digitset = digitset;
|
if (require_digitset == 0) require_digitset = digitset;
|
||||||
else if (digitset != require_digitset) return FALSE;
|
else if (digitset != require_digitset) return FALSE;
|
||||||
} /* End digit handling */
|
} /* End digit handling */
|
||||||
|
|
|
@ -2394,6 +2394,22 @@
|
||||||
\x{1A80}\x{1a40}\x{1A90}\x{1a41} Tai Tham Hora digit, letter, Tham digit, letter
|
\x{1A80}\x{1a40}\x{1A90}\x{1a41} Tai Tham Hora digit, letter, Tham digit, letter
|
||||||
\= Expect no match
|
\= Expect no match
|
||||||
\x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
|
\x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
|
||||||
|
|
||||||
|
/^(*sr:\S*)/utf
|
||||||
|
\x{1cf4}\x{20f0}\x{900}\x{11305} [Dev,Gran,Kan] [Dev,Gran,Lat] Dev Gran
|
||||||
|
\x{1cf4}\x{20f0}\x{11305}\x{900} [Dev,Gran,Kan] [Dev,Gran,Lat] Gran Dev
|
||||||
|
\x{1cf4}\x{20f0}\x{900}ABC [Dev,Gran,Kan] [Dev,Gran,Lat] Dev Lat
|
||||||
|
\x{1cf4}\x{20f0}ABC [Dev,Gran,Kan] [Dev,Gran,Lat] Lat
|
||||||
|
\x{20f0}ABC [Dev,Gran,Lat] Lat
|
||||||
|
XYZ\x{20f0}ABC Lat [Dev,Gran,Lat] Lat
|
||||||
|
\x{a36}\x{a33}\x{900} [Dev,...] [Dev,...] Dev
|
||||||
|
\x{3001}\x{2e80}\x{3041}\x{30a1} [Bopo, Han, etc] Han Hira Kata
|
||||||
|
\x{3001}\x{30a1}\x{2e80}\x{3041} [Bopo, Han, etc] Kata Han Hira
|
||||||
|
\x{3001}\x{3105}\x{2e80}\x{1101} [Bopo, Han, etc] Bopomofo Han Hangul
|
||||||
|
\x{3105}\x{3001}\x{2e80}\x{1101} Bopomofo [Bopo, Han, etc] Han Hangul
|
||||||
|
\x{3031}\x{3041}\x{30a1}\x{2e80} [Hira Kata] Hira Kata Han
|
||||||
|
\x{060c}\x{06d4}\x{0600}\x{10d00}\x{0700} [Arab Rohg Syrc Thaa] [Arab Rohg] Arab Rohg Syrc
|
||||||
|
\x{060c}\x{06d4}\x{0700}\x{0600}\x{10d00} [Arab Rohg Syrc Thaa] [Arab Rohg] Syrc Arab Rohg
|
||||||
|
|
||||||
/(?<!)(*sr:)/
|
/(?<!)(*sr:)/
|
||||||
|
|
||||||
|
@ -2405,6 +2421,17 @@
|
||||||
/(?<=abc(?=X(*sr:BXY)CCC)XBXYCCC)./
|
/(?<=abc(?=X(*sr:BXY)CCC)XBXYCCC)./
|
||||||
abcXBXYCCC!
|
abcXBXYCCC!
|
||||||
|
|
||||||
|
/^(*sr:\S*)/utf
|
||||||
|
\x{10d00}\x{10d00}\x{06d4} Rohingya Rohingya Arabic-full-stop
|
||||||
|
\x{06d4}\x{10d00}\x{10d00} Arabic-full-stop Rohingya Rohingya
|
||||||
|
\x{10d00}\x{10d00}\x{0363} Rohingya Rohingya Inherited-extend-Latin
|
||||||
|
\x{0363}\x{10d00}\x{10d00} Inherited-extend-Latin Rohingya Rohingya
|
||||||
|
AB\x{0363} Latin Latin Inherited-extend-Latin
|
||||||
|
\x{0363}AB Inherited-extend-Latin Latin Latin
|
||||||
|
AB\x{1cf7} Latin Latin Common-extended-Beng
|
||||||
|
\x{1cf7}AB Common-extend-Beng Latin Latin
|
||||||
|
\x{1cf7}\x{0993} Common-extend-Beng Bengali
|
||||||
|
|
||||||
# Test loop breaking for empty string match
|
# Test loop breaking for empty string match
|
||||||
|
|
||||||
/^(*sr:A|)*BCD/utf
|
/^(*sr:A|)*BCD/utf
|
||||||
|
|
|
@ -2132,6 +2132,17 @@
|
||||||
\x{0904}12\x{0939} Devanagari Common-digits Devanagari
|
\x{0904}12\x{0939} Devanagari Common-digits Devanagari
|
||||||
A\x{ff10}\x{ff19}B Latin Common-notascii-digits Latin
|
A\x{ff10}\x{ff19}B Latin Common-notascii-digits Latin
|
||||||
A\x{1d7ce}\x{1d7cf}B Latin fancy-common-digits Latin
|
A\x{1d7ce}\x{1d7cf}B Latin fancy-common-digits Latin
|
||||||
|
|
||||||
|
# These ones involve non-ASCII but nevertheless Common digits. As of October
|
||||||
|
# 2018 even blead Perl wasn't handling all of these - but is going to.
|
||||||
|
|
||||||
|
/^(*sr:.{4})/utf
|
||||||
|
A\x{ff10}\x{ff19}B Latin Common-notascii-digits Latin
|
||||||
|
\x{ff10}\x{ff19}.. Common-notascii-digits Common Common
|
||||||
|
A\x{ff10}BC Latin Common-notascii-digit Latin Latin
|
||||||
|
A\x{1d7ce}\x{1d7cf}B Latin fancy-common-digits Latin
|
||||||
|
\x{1d7ce}\x{1d7cf},, fancy-common-digits Common Common
|
||||||
|
A\x{1d7ce}BC Latin fancy-common-digit Latin Latin
|
||||||
|
|
||||||
# -------
|
# -------
|
||||||
|
|
||||||
|
|
|
@ -3872,6 +3872,36 @@ No match
|
||||||
\= Expect no match
|
\= Expect no match
|
||||||
\x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
|
\x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
|
||||||
No match
|
No match
|
||||||
|
|
||||||
|
/^(*sr:\S*)/utf
|
||||||
|
\x{1cf4}\x{20f0}\x{900}\x{11305} [Dev,Gran,Kan] [Dev,Gran,Lat] Dev Gran
|
||||||
|
0: \x{1cf4}\x{20f0}\x{900}
|
||||||
|
\x{1cf4}\x{20f0}\x{11305}\x{900} [Dev,Gran,Kan] [Dev,Gran,Lat] Gran Dev
|
||||||
|
0: \x{1cf4}\x{20f0}\x{11305}
|
||||||
|
\x{1cf4}\x{20f0}\x{900}ABC [Dev,Gran,Kan] [Dev,Gran,Lat] Dev Lat
|
||||||
|
0: \x{1cf4}\x{20f0}\x{900}
|
||||||
|
\x{1cf4}\x{20f0}ABC [Dev,Gran,Kan] [Dev,Gran,Lat] Lat
|
||||||
|
0: \x{1cf4}\x{20f0}
|
||||||
|
\x{20f0}ABC [Dev,Gran,Lat] Lat
|
||||||
|
0: \x{20f0}ABC
|
||||||
|
XYZ\x{20f0}ABC Lat [Dev,Gran,Lat] Lat
|
||||||
|
0: XYZ\x{20f0}ABC
|
||||||
|
\x{a36}\x{a33}\x{900} [Dev,...] [Dev,...] Dev
|
||||||
|
0: \x{a36}\x{a33}
|
||||||
|
\x{3001}\x{2e80}\x{3041}\x{30a1} [Bopo, Han, etc] Han Hira Kata
|
||||||
|
0: \x{3001}\x{2e80}\x{3041}\x{30a1}
|
||||||
|
\x{3001}\x{30a1}\x{2e80}\x{3041} [Bopo, Han, etc] Kata Han Hira
|
||||||
|
0: \x{3001}\x{30a1}\x{2e80}\x{3041}
|
||||||
|
\x{3001}\x{3105}\x{2e80}\x{1101} [Bopo, Han, etc] Bopomofo Han Hangul
|
||||||
|
0: \x{3001}\x{3105}\x{2e80}
|
||||||
|
\x{3105}\x{3001}\x{2e80}\x{1101} Bopomofo [Bopo, Han, etc] Han Hangul
|
||||||
|
0: \x{3105}\x{3001}\x{2e80}
|
||||||
|
\x{3031}\x{3041}\x{30a1}\x{2e80} [Hira Kata] Hira Kata Han
|
||||||
|
0: \x{3031}\x{3041}\x{30a1}\x{2e80}
|
||||||
|
\x{060c}\x{06d4}\x{0600}\x{10d00}\x{0700} [Arab Rohg Syrc Thaa] [Arab Rohg] Arab Rohg Syrc
|
||||||
|
0: \x{60c}\x{6d4}\x{600}
|
||||||
|
\x{060c}\x{06d4}\x{0700}\x{0600}\x{10d00} [Arab Rohg Syrc Thaa] [Arab Rohg] Syrc Arab Rohg
|
||||||
|
0: \x{60c}\x{6d4}
|
||||||
|
|
||||||
/(?<!)(*sr:)/
|
/(?<!)(*sr:)/
|
||||||
|
|
||||||
|
@ -3885,6 +3915,26 @@ No match
|
||||||
abcXBXYCCC!
|
abcXBXYCCC!
|
||||||
0: !
|
0: !
|
||||||
|
|
||||||
|
/^(*sr:\S*)/utf
|
||||||
|
\x{10d00}\x{10d00}\x{06d4} Rohingya Rohingya Arabic-full-stop
|
||||||
|
0: \x{10d00}\x{10d00}\x{6d4}
|
||||||
|
\x{06d4}\x{10d00}\x{10d00} Arabic-full-stop Rohingya Rohingya
|
||||||
|
0: \x{6d4}\x{10d00}\x{10d00}
|
||||||
|
\x{10d00}\x{10d00}\x{0363} Rohingya Rohingya Inherited-extend-Latin
|
||||||
|
0: \x{10d00}\x{10d00}
|
||||||
|
\x{0363}\x{10d00}\x{10d00} Inherited-extend-Latin Rohingya Rohingya
|
||||||
|
0: \x{363}
|
||||||
|
AB\x{0363} Latin Latin Inherited-extend-Latin
|
||||||
|
0: AB\x{363}
|
||||||
|
\x{0363}AB Inherited-extend-Latin Latin Latin
|
||||||
|
0: \x{363}AB
|
||||||
|
AB\x{1cf7} Latin Latin Common-extended-Beng
|
||||||
|
0: AB
|
||||||
|
\x{1cf7}AB Common-extend-Beng Latin Latin
|
||||||
|
0: \x{1cf7}
|
||||||
|
\x{1cf7}\x{0993} Common-extend-Beng Bengali
|
||||||
|
0: \x{1cf7}\x{993}
|
||||||
|
|
||||||
# Test loop breaking for empty string match
|
# Test loop breaking for empty string match
|
||||||
|
|
||||||
/^(*sr:A|)*BCD/utf
|
/^(*sr:A|)*BCD/utf
|
||||||
|
|
|
@ -4865,6 +4865,23 @@ MK: ABC
|
||||||
0: A\x{ff10}\x{ff19}B
|
0: A\x{ff10}\x{ff19}B
|
||||||
A\x{1d7ce}\x{1d7cf}B Latin fancy-common-digits Latin
|
A\x{1d7ce}\x{1d7cf}B Latin fancy-common-digits Latin
|
||||||
0: A\x{1d7ce}\x{1d7cf}B
|
0: A\x{1d7ce}\x{1d7cf}B
|
||||||
|
|
||||||
|
# These ones involve non-ASCII but nevertheless Common digits. As of October
|
||||||
|
# 2018 even blead Perl wasn't handling all of these - but is going to.
|
||||||
|
|
||||||
|
/^(*sr:.{4})/utf
|
||||||
|
A\x{ff10}\x{ff19}B Latin Common-notascii-digits Latin
|
||||||
|
0: A\x{ff10}\x{ff19}B
|
||||||
|
\x{ff10}\x{ff19}.. Common-notascii-digits Common Common
|
||||||
|
0: \x{ff10}\x{ff19}..
|
||||||
|
A\x{ff10}BC Latin Common-notascii-digit Latin Latin
|
||||||
|
0: A\x{ff10}BC
|
||||||
|
A\x{1d7ce}\x{1d7cf}B Latin fancy-common-digits Latin
|
||||||
|
0: A\x{1d7ce}\x{1d7cf}B
|
||||||
|
\x{1d7ce}\x{1d7cf},, fancy-common-digits Common Common
|
||||||
|
0: \x{1d7ce}\x{1d7cf},,
|
||||||
|
A\x{1d7ce}BC Latin fancy-common-digit Latin Latin
|
||||||
|
0: A\x{1d7ce}BC
|
||||||
|
|
||||||
# -------
|
# -------
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue