Update script run code to work with new script extensions coding

This commit is contained in:
Philip Hazel 2021-12-31 16:06:05 +00:00
parent 6614b281bc
commit d888d36013
9 changed files with 290 additions and 347 deletions

View File

@ -117,8 +117,9 @@
# Conceptually, there is a table of records (of type ucd_record), one for each
# Unicode character. Each record contains the script number, script extension
# value, character type, grapheme break type, offset to caseless matching set,
# offset to the character's other case, and the bidi class/control. However, a
# real table covering all Unicode characters would be far too big. It can be
# offset to the character's other case, and the bidi class/control.
#
# A real table covering all Unicode characters would be far too big. It can be
# efficiently compressed by observing that many characters have the same
# record, and many blocks of characters (taking 128 characters in a block) have
# the same set of records as other blocks. This leads to a 2-stage lookup
@ -135,13 +136,20 @@
# in script runs all come from the same set. The first element in the vector
# contains the number of subsequent elements, which are in ascending order.
#
# The lists of scripts in script_names and script_abbrevs are partitioned into
# two groups. Scripts that appear in at least one character's script extension
# list come first, followed by "Unknown" and then all the rest. This sorting is
# done automatically in the GenerateCommon.py script. A script's number
# is its index in these lists.
#
# The ucd_script_sets vector contains bitmaps that represent lists of scripts
# for the Script Extensions properties of certain characters. Each bitmap
# consists of a fixed number of unsigned 32-bit numbers, enough to allocate
# a bit for every known script. A character with more than one script listed
# for its Script Extension property has a negative value in its record. This is
# the negated offset to the start of the relevant bitmap in the ucd_script_sets
# vector.
# for Script Extensions properties. Each bitmap consists of a fixed number of
# unsigned 32-bit numbers, enough to allocate a bit for every script that is
# used in any character's extension list, that is, enough for every script
# whose number is less than ucp_Unknown. A character's script extension value
# in its ucd record is an offset into the ucd_script_sets vector. The first
# bitmap has no bits set; characters that have no script extensions have zero
# as their script extensions value so that they use this map.
#
# The ucd_records table contains one instance of every unique record that is
# required. The ucd_stage1 table is indexed by a character's block number,
@ -157,15 +165,15 @@
#
# Example: lowercase "a" (U+0061) is in block 0
# lookup 0 in stage1 table yields 0
# lookup 97 (0x61) in the first table in stage2 yields 22
# record 22 is { 34, 5, 12, 0, -32, 34, 2, 0 }
# 34 = ucp_Latin => Latin script
# lookup 97 (0x61) in the first table in stage2 yields 23
# record 23 is { 20, 5, 12, 0, -32, 0, 9, 0 }
# 20 = ucp_Latin => Latin script
# 5 = ucp_Ll => Lower case letter
# 12 = ucp_gbOther => Grapheme break property "Other"
# 0 => Not part of a caseless set
# -32 (-0x20) => Other case is U+0041
# 34 = ucp_Latin => No special Script Extension property
# 2 = ucp_bidiL => Bidi class left-to-right
# 0 => No special Script Extension property
# 9 = ucp_bidiL => Bidi class left-to-right
# 0 => Dummy value, unused at present
#
# Almost all lowercase latin characters resolve to the same record. One or two
@ -174,35 +182,35 @@
#
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
# lookup 96 in stage1 table yields 91
# lookup 66 (0x42) in table 91 in stage2 yields 613
# record 613 is { 27, 7, 12, 0, 0, 27, 2, 0 }
# 27 = ucp_Hiragana => Hiragana script
# lookup 66 (0x42) in table 91 in stage2 yields 614
# record 614 is { 17, 7, 12, 0, 0, 0, 9, 0 }
# 17 = ucp_Hiragana => Hiragana script
# 7 = ucp_Lo => Other letter
# 12 = ucp_gbOther => Grapheme break property "Other"
# 0 => Not part of a caseless set
# 0 => No other case
# 27 = ucp_Hiragana => No special Script Extension property
# 2 = ucp_bidiL => Bidi class left-to-right
# 0 => No special Script Extension property
# 9 = ucp_bidiL => Bidi class left-to-right
# 0 => Dummy value, unused at present
#
# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
# lookup 57 in stage1 table yields 55
# lookup 80 (0x50) in table 55 in stage2 yields 485
# record 485 is { 28, 12, 3, 0, 0, -122, 19, 0 }
# 28 = ucp_Inherited => Script inherited from predecessor
# lookup 80 (0x50) in table 55 in stage2 yields 486
# record 486 is { 78, 12, 3, 0, 0, 138, 13, 0 }
# 78 = ucp_Inherited => Script inherited from predecessor
# 12 = ucp_Mn => Non-spacing mark
# 3 = ucp_gbExtend => Grapheme break property "Extend"
# 0 => Not part of a caseless set
# 0 => No other case
# -228 => Script Extension list offset = 228
# 138 => Script Extension list offset = 138
# 13 = ucp_bidiNSM => Bidi class non-spacing mark
# 0 => Dummy value, unused at present
#
# At offset 228 in the ucd_script_sets vector we find a bitmap with bits 3, 15,
# 29, and 107 set. This means that this character is expected to be used with
# At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8,
# 18, and 47 set. This means that this character is expected to be used with
# any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha.
#
# Philip Hazel, last updated 19 December 2021.
# Philip Hazel, last updated 31 December 2021.
##############################################################################
@ -775,7 +783,6 @@ f.write("""\
const uint32_t PRIV(ucd_script_sets)[] = {
""")
for d in script_lists:
bitwords = [0] * script_list_item_size
@ -797,8 +804,8 @@ f.write("""\
/* These are the main two-stage UCD tables. The fields in each record are:
script (8 bits), character type (8 bits), grapheme break property (8 bits),
offset to multichar other cases or zero (8 bits), offset to other case or zero
(32 bits, signed), script extension (16 bits, signed), bidi class (8 bits), and
a dummy 8-bit field to make the whole thing a multiple of 4 bytes. */
(32 bits, signed), script extension (8 bits), bidi class (8 bits), and a dummy
16-bit field to make the whole thing a multiple of 4 bytes. */
\n""")
write_records(records, record_size)

View File

@ -316,7 +316,7 @@ j = 0;
for (i = 0; i < PRIV(utt_size); i++)
{
const ucp_type_table *u = PRIV(utt) + i;
if (u->type == PT_SCX && u->value == script)
if ((u->type == PT_SCX || u->type == PT_SC) && u->value == script)
{
foundlist[j++] = i;
if (j >= 2) break;
@ -479,39 +479,17 @@ if (is_just_one && othercase != c)
}
}
if (scriptx != script)
{
printf(", [");
if (scriptx >= 0)
printf("%s", get_scriptname(scriptx));
else
if (scriptx != 0)
{
const char *sep = "";
/*
const uint8_t *p = PRIV(ucd_script_sets) - scriptx;
while (*p != 0)
{
printf("%s%s", sep, get_scriptname(*p++));
sep = ", ";
}
*/
const uint32_t *p = PRIV(ucd_script_sets) - scriptx;
for (int i = 0; i < ucp_Script_Count; i++)
{
int x = i/32;
int y = i%32;
if ((p[x] & (1u<<y)) != 0)
const uint32_t *p = PRIV(ucd_script_sets) + scriptx;
printf(", [");
for (int i = 0; i < ucp_Unknown; i++)
if (MAPBIT(p, i) != 0)
{
printf("%s%s", sep, get_scriptname(i));
sep = ", ";
}
}
}
printf("]");
}

View File

@ -1850,10 +1850,11 @@ typedef struct {
#define UCD_SCRIPTX(ch) GET_UCD(ch)->scriptx
/* The "scriptx" field gives an offset into a vector of 32-bit words that
form a bitmap representing a list of scripts. This macro tests for a
script in the map by number. */
form a bitmap representing a list of scripts. These macros test or set the bit
for a script in the map by number. */
/* MAPBIT yields a non-zero value (the masked bit, not necessarily 1) if the
bit for "script" is set in "map", and zero otherwise; MAPSET sets that bit. The
word index is script/32 and the bit position within the word is script%32. Note
that "script" is expanded twice in each macro, so it should not be an
expression with side effects. */
#define MAPBIT(map,script) ((map)[(script)/32]&(1u<<((script)%32)))
#define MAPSET(map,script) ((map)[(script)/32]|=(1u<<((script)%32)))
/* The "bidi" field has the 0x80 bit set if the character has the Bidi_Control
property. The remaining bits hold the bidi class, but as there are only 23

View File

@ -68,26 +68,26 @@ Arguments:
Returns: TRUE if this is a valid script run
*/
/* These dummy values must be less than the negation of the largest offset in
the PRIV(ucd_script_sets) vector, which is held in a 16-bit field in UCD
records (and is only likely to be a few hundred). */
/* These are states in the checking process. */
#define SCRIPT_UNSET (-99999)
#define SCRIPT_HANPENDING (-99998)
#define SCRIPT_HANHIRAKATA (-99997)
#define SCRIPT_HANBOPOMOFO (-99996)
#define SCRIPT_HANHANGUL (-99995)
#define SCRIPT_MAP (-99994)
enum { SCRIPT_UNSET, /* Requirement as yet unknown */
SCRIPT_MAP, /* Bitmap contains acceptable scripts */
SCRIPT_HANPENDING, /* Have had only Han characters */
SCRIPT_HANHIRAKATA, /* Expect Han, Hiragana, or Katakana */
SCRIPT_HANBOPOMOFO, /* Expect Han or Bopomofo */
SCRIPT_HANHANGUL /* Expect Han or Hangul */
};
#define MAPSIZE (ucp_Script_Count/32 + 1)
#define UCD_MAPSIZE (ucp_Unknown/32 + 1)
#define FULL_MAPSIZE (ucp_Script_Count/32 + 1)
BOOL
PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
{
#ifdef SUPPORT_UNICODE
int require_script = SCRIPT_UNSET;
uint32_t intersection_map[MAPSIZE];
const uint32_t *require_map = NULL;
uint32_t require_state = SCRIPT_UNSET;
uint32_t require_map[FULL_MAPSIZE];
uint32_t map[FULL_MAPSIZE];
uint32_t require_digitset = 0;
uint32_t c;
@ -101,11 +101,17 @@ if (ptr >= endptr) return TRUE;
GETCHARINCTEST(c, ptr);
if (ptr >= endptr) return TRUE;
/* Initialize the require map. This is a full-size bitmap that has a bit for
every script, as opposed to the maps in ucd_script_sets, which only have bits
for scripts less than ucp_Unknown - those that appear in script extension
lists. */
for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0;
/* Scan strings of two or more characters, checking the Unicode characteristics
of each code point. We make use of the Script Extensions property. There is
special code for scripts that can be combined with characters from the Han
Chinese script. This may be used in conjunction with four other scripts in
these combinations:
of each code point. There is special code for scripts that can be combined with
characters from the Han Chinese script. This may be used in conjunction with
four other scripts in these combinations:
. Han with Hiragana and Katakana is allowed (for Japanese).
. Han with Bopomofo is allowed (for Taiwanese Mandarin).
@ -119,146 +125,69 @@ Hence the SCRIPT_HANPENDING state. */
for (;;)
{
const ucd_record *ucd = GET_UCD(c);
int32_t scriptx = ucd->scriptx;
uint32_t script = ucd->script;
/* If the script extension is Unknown, the string is not a valid script run.
Such characters can only form script runs of length one. */
/* If the script is Unknown, the string is not a valid script run. Such
characters can only form script runs of length one (see test above). */
if (scriptx == ucp_Unknown) return FALSE;
if (script == ucp_Unknown) return FALSE;
/* A character whose script extension is Inherited is always accepted with
any script, and plays no further part in this testing. A character whose
script is Common is always accepted, but must still be tested for a digit
below. The scriptx value at this point is non-zero, because zero is
ucp_Unknown, tested for above. */
/* A character without any script extensions whose script is Inherited or
Common is always accepted with any script. If there are extensions, the
following processing happens for all scripts. */
if (scriptx != ucp_Inherited)
if (ucd->scriptx != 0 || (script != ucp_Inherited && script != ucp_Common))
{
if (scriptx != ucp_Common)
{
/* If the script extension value is positive, the character is not a mark
that can be used with many scripts. In the simple case we either set or
compare with the required script. However, handling the scripts that can
combine with Han are more complicated, as is the case when the previous
characters have been many-script marks. */
BOOL OK;
if (scriptx > 0)
/* Set up a full-sized map for this character that can include bits for all
scripts. Copy the scriptx map for this character (which covers those
scripts that appear in script extension lists), set the remaining values to
zero, and then, except for Common or Inherited, add this script's bit to
the map. */
memcpy(map, PRIV(ucd_script_sets) + ucd->scriptx, UCD_MAPSIZE * sizeof(uint32_t));
memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t));
if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script);
/* Handle the different checking states */
switch(require_state)
{
switch(require_script)
{
/* Either the first significant character (require_script unset) or
after only Han characters. */
/* First significant character - it might follow Common or Inherited
characters that do not have any script extensions. */
case SCRIPT_UNSET:
case SCRIPT_HANPENDING:
switch(scriptx)
switch(script)
{
case ucp_Han:
require_script = SCRIPT_HANPENDING;
require_state = SCRIPT_HANPENDING;
break;
case ucp_Hiragana:
case ucp_Katakana:
require_script = SCRIPT_HANHIRAKATA;
require_state = SCRIPT_HANHIRAKATA;
break;
case ucp_Bopomofo:
require_script = SCRIPT_HANBOPOMOFO;
require_state = SCRIPT_HANBOPOMOFO;
break;
case ucp_Hangul:
require_script = SCRIPT_HANHANGUL;
require_state = SCRIPT_HANHANGUL;
break;
/* Not a Han-related script. If expecting one, fail. Otherwise set
the requirement to this script. */
default:
if (require_script == SCRIPT_HANPENDING) return FALSE;
require_script = scriptx;
memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t));
require_state = SCRIPT_MAP;
break;
}
break;
/* Previously encountered one of the "with Han" scripts. Check that
this character is appropriate. */
case SCRIPT_HANHIRAKATA:
if (scriptx != ucp_Han && scriptx != ucp_Hiragana &&
scriptx != ucp_Katakana)
return FALSE;
break;
case SCRIPT_HANBOPOMOFO:
if (scriptx != ucp_Han && scriptx != ucp_Bopomofo) return FALSE;
break;
case SCRIPT_HANHANGUL:
if (scriptx != ucp_Han && scriptx != ucp_Hangul) return FALSE;
break;
/* We have a bitmap of scripts to check that is derived from one or
more previous characters. This is either one of the maps in
ucd_script_sets[] (for one previous character) or the intersection of
several maps for multiple characters. */
case SCRIPT_MAP:
if (MAPBIT(require_map, scriptx) == 0) return FALSE;
/* The rest of the string must be in this script, but we have to
allow for the Han complications. */
switch(scriptx)
{
case ucp_Han:
require_script = SCRIPT_HANPENDING;
break;
case ucp_Hiragana:
case ucp_Katakana:
require_script = SCRIPT_HANHIRAKATA;
break;
case ucp_Bopomofo:
require_script = SCRIPT_HANBOPOMOFO;
break;
case ucp_Hangul:
require_script = SCRIPT_HANHANGUL;
break;
default:
require_script = scriptx;
break;
}
break;
/* This is the easy case when a single script is required. */
default:
if (scriptx != require_script) return FALSE;
break;
}
} /* End of handing positive scriptx */
/* If scriptx is negative, this character is a mark-type character that
has a list of permitted scripts, which are encoded in a bitmap. */
else
{
uint32_t chspecial;
const uint32_t *map = PRIV(ucd_script_sets) - scriptx;
switch(require_script)
{
case SCRIPT_UNSET:
require_map = PRIV(ucd_script_sets) - scriptx;
require_script = SCRIPT_MAP;
break;
/* An inspection of the Unicode 11.0.0 files shows that there are the
following types of Script Extension list that involve the Han,
Bopomofo, Hiragana, Katakana, and Hangul scripts:
/* The first significant character was Han. An inspection of the Unicode
11.0.0 files shows that there are the following types of Script Extension
list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul
scripts:
. Bopomofo + Han
. Han + Hiragana + Katakana
@ -273,72 +202,93 @@ for (;;)
#define FOUND_HANGUL 8
case SCRIPT_HANPENDING:
chspecial = 0;
if (script != ucp_Han) /* Another Han does nothing */
{
uint32_t chspecial = 0;
if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO;
if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA;
if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA;
if (MAPBIT(map, ucp_Hangul) != 0) chspecial |= FOUND_HANGUL;
if (chspecial == 0) return FALSE;
if (chspecial == 0) return FALSE; /* Not allowed with Han */
if (chspecial == FOUND_BOPOMOFO)
{
require_script = SCRIPT_HANBOPOMOFO;
}
require_state = SCRIPT_HANBOPOMOFO;
else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
{
require_script = SCRIPT_HANHIRAKATA;
require_state = SCRIPT_HANHIRAKATA;
/* Otherwise this character must be allowed with all of them, so remain
in the pending state. */
}
/* Otherwise it must be allowed with all of them, so remain in
the pending state. */
break;
/* Previously encountered one of the "with Han" scripts. Check that
this character is appropriate. */
case SCRIPT_HANHIRAKATA:
if (MAPBIT(map, ucp_Hiragana) != 0) break;
if (MAPBIT(map, ucp_Katakana) != 0) break;
return FALSE;
if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) +
MAPBIT(map, ucp_Katakana) == 0) return FALSE;
break;
case SCRIPT_HANBOPOMOFO:
if (MAPBIT(map, ucp_Bopomofo) != 0) break;
return FALSE;
if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE;
break;
case SCRIPT_HANHANGUL:
if (MAPBIT(map, ucp_Hangul) != 0) break;
return FALSE;
if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE;
break;
/* Previously encountered one or more characters that are allowed
with a list of scripts. Build the intersection of the required list
with this character's list in intersection_map[]. */
/* Previously encountered one or more characters that are allowed with a
list of scripts. */
case SCRIPT_MAP:
for (int i = 0; i < MAPSIZE; i++)
intersection_map[i] = require_map[i] & map[i];
OK = FALSE;
/* If there's just one script in common, we could set it as the
unique required script. However, in the new bitmap arrangements,
finding the one script is expensive, so leave this out for now.
Otherwise, make the intersection map the required map. */
/*
if (onescript >= 0) require_script = onescript;
else require_map = intersection_map;
*/
require_map = intersection_map;
break;
/* The previously set required script is a single script, not
Han-related. Check that it is in this character's list. */
default:
if (MAPBIT(map, require_script) == 0) return FALSE;
for (int i = 0; i < FULL_MAPSIZE; i++)
{
if ((require_map[i] & map[i]) != 0)
{
OK = TRUE;
break;
}
} /* End of handling negative scriptx */
} /* End of checking non-Common character */
}
if (!OK) return FALSE;
/* The rest of the string must be in this script, but we have to
allow for the Han complications. */
switch(script)
{
case ucp_Han:
require_state = SCRIPT_HANPENDING;
break;
case ucp_Hiragana:
case ucp_Katakana:
require_state = SCRIPT_HANHIRAKATA;
break;
case ucp_Bopomofo:
require_state = SCRIPT_HANBOPOMOFO;
break;
case ucp_Hangul:
require_state = SCRIPT_HANHANGUL;
break;
/* Compute the intersection of the required list of scripts and the
allowed scripts for this character. */
default:
for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i];
break;
}
break;
}
} /* End checking character's script and extensions. */
/* The character is in an acceptable script. We must now ensure that all
decimal digits in the string come from the same set. Some scripts (e.g.
@ -376,7 +326,6 @@ for (;;)
if (require_digitset == 0) require_digitset = digitset;
else if (digitset != require_digitset) return FALSE;
} /* End digit handling */
} /* End checking non-Inherited character */
/* If we haven't yet got to the end, pick up the next character. */

View File

@ -237,8 +237,8 @@ const uint32_t PRIV(ucd_script_sets)[] = {
/* These are the main two-stage UCD tables. The fields in each record are:
script (8 bits), character type (8 bits), grapheme break property (8 bits),
offset to multichar other cases or zero (8 bits), offset to other case or zero
(32 bits, signed), script extension (16 bits, signed), bidi class (8 bits), and
a dummy 8-bit field to make the whole thing a multiple of 4 bytes. */
(32 bits, signed), script extension (8 bits), bidi class (8 bits), and a dummy
16-bit field to make the whole thing a multiple of 4 bytes. */
const ucd_record PRIV(ucd_records)[] = { /* 12588 bytes, record size 12 */
{ 73, 0, 2, 0, 0, 0, 3, 256, }, /* 0 */

23
testdata/testinput4 vendored
View File

@ -1138,23 +1138,27 @@
\= Expect no match
\x{2e7f}
/^\P{Katakana}+/utf
\x{3105}
\= Expect no match
\x{30ff}
/^[\p{Arabic}]/utf
\x{06e9}
\x{060b}
\= Expect no match
X\x{06e9}
#subject no_jit
/^\P{Katakana}+/utf
\x{3105}
\= Expect no match
\x{30ff}
/^[\P{Yi}]/utf
\x{2f800}
\= Expect no match
\x{a014}
\x{a4c6}
#subject -no_jit
/^\p{Any}X/utf
AXYZ
\x{1234}XYZ
@ -2640,4 +2644,13 @@
/[\p{taml}\p{sc:ugar}]+/utf
\x{0b82}\x{10380}
/^[\p{sc:Arabic}]/utf
\= Expect no match
\x{650}
\x{651}
\x{652}
\x{653}
\x{654}
\x{655}
# End of testinput4

9
testdata/testinput5 vendored
View File

@ -2073,15 +2073,6 @@
# More differences from Perl
/^[\p{Arabic}]/utf
\= Expect no match
\x{650}
\x{651}
\x{652}
\x{653}
\x{654}
\x{655}
/^\p{Common}/utf
\x{60c}
\x{61f}

33
testdata/testoutput4 vendored
View File

@ -1883,13 +1883,6 @@ No match
\x{2e7f}
No match
/^\P{Katakana}+/utf
\x{3105}
0: \x{3105}
\= Expect no match
\x{30ff}
No match
/^[\p{Arabic}]/utf
\x{06e9}
0: \x{6e9}
@ -1899,6 +1892,15 @@ No match
X\x{06e9}
No match
#subject no_jit
/^\P{Katakana}+/utf
\x{3105}
0: \x{3105}
\= Expect no match
\x{30ff}
No match
/^[\P{Yi}]/utf
\x{2f800}
0: \x{2f800}
@ -1908,6 +1910,8 @@ No match
\x{a4c6}
No match
#subject -no_jit
/^\p{Any}X/utf
AXYZ
0: AX
@ -4235,4 +4239,19 @@ No match
\x{0b82}\x{10380}
0: \x{b82}\x{10380}
/^[\p{sc:Arabic}]/utf
\= Expect no match
\x{650}
No match
\x{651}
No match
\x{652}
No match
\x{653}
No match
\x{654}
No match
\x{655}
No match
# End of testinput4

15
testdata/testoutput5 vendored
View File

@ -4722,21 +4722,6 @@ Callout 0: last capture = 1
# More differences from Perl
/^[\p{Arabic}]/utf
\= Expect no match
\x{650}
No match
\x{651}
No match
\x{652}
No match
\x{653}
No match
\x{654}
No match
\x{655}
No match
/^\p{Common}/utf
\x{60c}
0: \x{60c}