Update script run code to work with new script extensions coding
This commit is contained in:
parent
6614b281bc
commit
d888d36013
|
@ -117,8 +117,9 @@
|
|||
# Conceptually, there is a table of records (of type ucd_record), one for each
|
||||
# Unicode character. Each record contains the script number, script extension
|
||||
# value, character type, grapheme break type, offset to caseless matching set,
|
||||
# offset to the character's other case, and the bidi class/control. However, a
|
||||
# real table covering all Unicode characters would be far too big. It can be
|
||||
# offset to the character's other case, and the bidi class/control.
|
||||
#
|
||||
# A real table covering all Unicode characters would be far too big. It can be
|
||||
# efficiently compressed by observing that many characters have the same
|
||||
# record, and many blocks of characters (taking 128 characters in a block) have
|
||||
# the same set of records as other blocks. This leads to a 2-stage lookup
|
||||
|
@ -135,13 +136,20 @@
|
|||
# in script runs all come from the same set. The first element in the vector
|
||||
# contains the number of subsequent elements, which are in ascending order.
|
||||
#
|
||||
# The lists of scripts in script_names and script_abbrevs are partitioned into
|
||||
# two groups. Scripts that appear in at least one character's script extension
|
||||
# list come first, followed by "Unknown" and then all the rest. This sorting is
|
||||
# done automatically in the GenerateCommon.py script. A script's number
|
||||
# is its index in these lists.
|
||||
#
|
||||
# The ucd_script_sets vector contains bitmaps that represent lists of scripts
|
||||
# for the Script Extensions properties of certain characters. Each bitmap
|
||||
# consists of a fixed number of unsigned 32-bit numbers, enough to allocate
|
||||
# a bit for every known script. A character with more than one script listed
|
||||
# for its Script Extension property has a negative value in its record. This is
|
||||
# the negated offset to the start of the relevant bitmap in the ucd_script_sets
|
||||
# vector.
|
||||
# for Script Extensions properties. Each bitmap consists of a fixed number of
|
||||
# unsigned 32-bit numbers, enough to allocate a bit for every script that is
|
||||
# used in any character's extension list, that is, enough for every script
|
||||
# whose number is less than ucp_Unknown. A character's script extension value
|
||||
# in its ucd record is an offset into the ucd_script_sets vector. The first
|
||||
# bitmap has no bits set; characters that have no script extensions have zero
|
||||
# as their script extensions value so that they use this map.
|
||||
#
|
||||
# The ucd_records table contains one instance of every unique record that is
|
||||
# required. The ucd_stage1 table is indexed by a character's block number,
|
||||
|
@ -157,15 +165,15 @@
|
|||
#
|
||||
# Example: lowercase "a" (U+0061) is in block 0
|
||||
# lookup 0 in stage1 table yields 0
|
||||
# lookup 97 (0x61) in the first table in stage2 yields 22
|
||||
# record 22 is { 34, 5, 12, 0, -32, 34, 2, 0 }
|
||||
# 34 = ucp_Latin => Latin script
|
||||
# lookup 97 (0x61) in the first table in stage2 yields 23
|
||||
# record 23 is { 20, 5, 12, 0, -32, 0, 9, 0 }
|
||||
# 20 = ucp_Latin => Latin script
|
||||
# 5 = ucp_Ll => Lower case letter
|
||||
# 12 = ucp_gbOther => Grapheme break property "Other"
|
||||
# 0 => Not part of a caseless set
|
||||
# -32 (-0x20) => Other case is U+0041
|
||||
# 34 = ucp_Latin => No special Script Extension property
|
||||
# 2 = ucp_bidiL => Bidi class left-to-right
|
||||
# 0 => No special Script Extension property
|
||||
# 9 = ucp_bidiL => Bidi class left-to-right
|
||||
# 0 => Dummy value, unused at present
|
||||
#
|
||||
# Almost all lowercase Latin characters resolve to the same record. One or two
|
||||
|
@ -174,35 +182,35 @@
|
|||
#
|
||||
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
|
||||
# lookup 96 in stage1 table yields 91
|
||||
# lookup 66 (0x42) in table 91 in stage2 yields 613
|
||||
# record 613 is { 27, 7, 12, 0, 0, 27, 2, 0 }
|
||||
# 27 = ucp_Hiragana => Hiragana script
|
||||
# lookup 66 (0x42) in table 91 in stage2 yields 614
|
||||
# record 614 is { 17, 7, 12, 0, 0, 0, 9, 0 }
|
||||
# 17 = ucp_Hiragana => Hiragana script
|
||||
# 7 = ucp_Lo => Other letter
|
||||
# 12 = ucp_gbOther => Grapheme break property "Other"
|
||||
# 0 => Not part of a caseless set
|
||||
# 0 => No other case
|
||||
# 27 = ucp_Hiragana => No special Script Extension property
|
||||
# 2 = ucp_bidiL => Bidi class left-to-right
|
||||
# 0 => No special Script Extension property
|
||||
# 9 = ucp_bidiL => Bidi class left-to-right
|
||||
# 0 => Dummy value, unused at present
|
||||
#
|
||||
# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
|
||||
# lookup 57 in stage1 table yields 55
|
||||
# lookup 80 (0x50) in table 55 in stage2 yields 485
|
||||
# record 485 is { 28, 12, 3, 0, 0, -122, 19, 0 }
|
||||
# 28 = ucp_Inherited => Script inherited from predecessor
|
||||
# lookup 80 (0x50) in table 55 in stage2 yields 486
|
||||
# record 486 is { 78, 12, 3, 0, 0, 138, 13, 0 }
|
||||
# 78 = ucp_Inherited => Script inherited from predecessor
|
||||
# 12 = ucp_Mn => Non-spacing mark
|
||||
# 3 = ucp_gbExtend => Grapheme break property "Extend"
|
||||
# 0 => Not part of a caseless set
|
||||
# 0 => No other case
|
||||
# -228 => Script Extension list offset = 228
|
||||
# 138 => Script Extension list offset = 138
|
||||
# 13 = ucp_bidiNSM => Bidi class non-spacing mark
|
||||
# 0 => Dummy value, unused at present
|
||||
#
|
||||
# At offset 228 in the ucd_script_sets vector we find a bitmap with bits 3, 15,
|
||||
# 29, and 107 set. This means that this character is expected to be used with
|
||||
# At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8,
|
||||
# 18, and 47 set. This means that this character is expected to be used with
|
||||
# any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha.
|
||||
#
|
||||
# Philip Hazel, last updated 19 December 2021.
|
||||
# Philip Hazel, last updated 31 December 2021.
|
||||
##############################################################################
|
||||
|
||||
|
||||
|
@ -775,7 +783,6 @@ f.write("""\
|
|||
const uint32_t PRIV(ucd_script_sets)[] = {
|
||||
""")
|
||||
|
||||
|
||||
for d in script_lists:
|
||||
bitwords = [0] * script_list_item_size
|
||||
|
||||
|
@ -797,8 +804,8 @@ f.write("""\
|
|||
/* These are the main two-stage UCD tables. The fields in each record are:
|
||||
script (8 bits), character type (8 bits), grapheme break property (8 bits),
|
||||
offset to multichar other cases or zero (8 bits), offset to other case or zero
|
||||
(32 bits, signed), script extension (16 bits, signed), bidi class (8 bits), and
|
||||
a dummy 8-bit field to make the whole thing a multiple of 4 bytes. */
|
||||
(32 bits, signed), script extension (8 bits), bidi class (8 bits), and a dummy
|
||||
16-bit field to make the whole thing a multiple of 4 bytes. */
|
||||
\n""")
|
||||
|
||||
write_records(records, record_size)
|
||||
|
|
|
@ -316,7 +316,7 @@ j = 0;
|
|||
for (i = 0; i < PRIV(utt_size); i++)
|
||||
{
|
||||
const ucp_type_table *u = PRIV(utt) + i;
|
||||
if (u->type == PT_SCX && u->value == script)
|
||||
if ((u->type == PT_SCX || u->type == PT_SC) && u->value == script)
|
||||
{
|
||||
foundlist[j++] = i;
|
||||
if (j >= 2) break;
|
||||
|
@ -479,38 +479,16 @@ if (is_just_one && othercase != c)
|
|||
}
|
||||
}
|
||||
|
||||
if (scriptx != script)
|
||||
if (scriptx != 0)
|
||||
{
|
||||
const char *sep = "";
|
||||
const uint32_t *p = PRIV(ucd_script_sets) + scriptx;
|
||||
printf(", [");
|
||||
if (scriptx >= 0)
|
||||
printf("%s", get_scriptname(scriptx));
|
||||
else
|
||||
{
|
||||
const char *sep = "";
|
||||
|
||||
|
||||
/*
|
||||
const uint8_t *p = PRIV(ucd_script_sets) - scriptx;
|
||||
while (*p != 0)
|
||||
{
|
||||
printf("%s%s", sep, get_scriptname(*p++));
|
||||
sep = ", ";
|
||||
}
|
||||
*/
|
||||
|
||||
const uint32_t *p = PRIV(ucd_script_sets) - scriptx;
|
||||
for (int i = 0; i < ucp_Script_Count; i++)
|
||||
{
|
||||
int x = i/32;
|
||||
int y = i%32;
|
||||
|
||||
if ((p[x] & (1u<<y)) != 0)
|
||||
{
|
||||
printf("%s%s", sep, get_scriptname(i));
|
||||
sep = ", ";
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < ucp_Unknown; i++)
|
||||
if (MAPBIT(p, i) != 0)
|
||||
{
|
||||
printf("%s%s", sep, get_scriptname(i));
|
||||
sep = ", ";
|
||||
}
|
||||
printf("]");
|
||||
}
|
||||
|
|
|
@ -1850,10 +1850,11 @@ typedef struct {
|
|||
#define UCD_SCRIPTX(ch) GET_UCD(ch)->scriptx
|
||||
|
||||
/* The "scriptx" field gives an offset into a vector of 32-bit words that
|
||||
form a bitmap representing a list of scripts. This macro tests for a
|
||||
script in the map by number. */
|
||||
form a bitmap representing a list of scripts. These macros test or set the bit
|
||||
for a script in the map by number. */
|
||||
|
||||
#define MAPBIT(map,script) ((map)[(script)/32]&(1u<<((script)%32)))
|
||||
#define MAPSET(map,script) ((map)[(script)/32]|=(1u<<((script)%32)))
|
||||
|
||||
/* The "bidi" field has the 0x80 bit set if the character has the Bidi_Control
|
||||
property. The remaining bits hold the bidi class, but as there are only 23
|
||||
|
|
|
@ -68,26 +68,26 @@ Arguments:
|
|||
Returns: TRUE if this is a valid script run
|
||||
*/
|
||||
|
||||
/* These dummy values must be less than the negation of the largest offset in
|
||||
the PRIV(ucd_script_sets) vector, which is held in a 16-bit field in UCD
|
||||
records (and is only likely to be a few hundred). */
|
||||
/* These are states in the checking process. */
|
||||
|
||||
#define SCRIPT_UNSET (-99999)
|
||||
#define SCRIPT_HANPENDING (-99998)
|
||||
#define SCRIPT_HANHIRAKATA (-99997)
|
||||
#define SCRIPT_HANBOPOMOFO (-99996)
|
||||
#define SCRIPT_HANHANGUL (-99995)
|
||||
#define SCRIPT_MAP (-99994)
|
||||
enum { SCRIPT_UNSET, /* No significant character yet; requirement unknown */
|
||||
SCRIPT_MAP, /* Bitmap (require_map) contains acceptable scripts */
|
||||
SCRIPT_HANPENDING, /* Have had only Han characters */
|
||||
SCRIPT_HANHIRAKATA, /* Expect Han, Hiragana, or Katakana */
|
||||
SCRIPT_HANBOPOMOFO, /* Expect Han or Bopomofo */
|
||||
SCRIPT_HANHANGUL /* Expect Han or Hangul */
|
||||
};
|
||||
|
||||
#define MAPSIZE (ucp_Script_Count/32 + 1)
|
||||
#define UCD_MAPSIZE (ucp_Unknown/32 + 1)
|
||||
#define FULL_MAPSIZE (ucp_Script_Count/32 + 1)
|
||||
|
||||
BOOL
|
||||
PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
|
||||
{
|
||||
#ifdef SUPPORT_UNICODE
|
||||
int require_script = SCRIPT_UNSET;
|
||||
uint32_t intersection_map[MAPSIZE];
|
||||
const uint32_t *require_map = NULL;
|
||||
uint32_t require_state = SCRIPT_UNSET;
|
||||
uint32_t require_map[FULL_MAPSIZE];
|
||||
uint32_t map[FULL_MAPSIZE];
|
||||
uint32_t require_digitset = 0;
|
||||
uint32_t c;
|
||||
|
||||
|
@ -101,11 +101,17 @@ if (ptr >= endptr) return TRUE;
|
|||
GETCHARINCTEST(c, ptr);
|
||||
if (ptr >= endptr) return TRUE;
|
||||
|
||||
/* Initialize the require map. This is a full-size bitmap that has a bit for
|
||||
every script, as opposed to the maps in ucd_script_sets, which only have bits
|
||||
for scripts less than ucp_Unknown - those that appear in script extension
|
||||
lists. */
|
||||
|
||||
for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0;
|
||||
|
||||
/* Scan strings of two or more characters, checking the Unicode characteristics
|
||||
of each code point. We make use of the Script Extensions property. There is
|
||||
special code for scripts that can be combined with characters from the Han
|
||||
Chinese script. This may be used in conjunction with four other scripts in
|
||||
these combinations:
|
||||
of each code point. There is special code for scripts that can be combined with
|
||||
characters from the Han Chinese script. This may be used in conjunction with
|
||||
four other scripts in these combinations:
|
||||
|
||||
. Han with Hiragana and Katakana is allowed (for Japanese).
|
||||
. Han with Bopomofo is allowed (for Taiwanese Mandarin).
|
||||
|
@ -119,264 +125,207 @@ Hence the SCRIPT_HANPENDING state. */
|
|||
for (;;)
|
||||
{
|
||||
const ucd_record *ucd = GET_UCD(c);
|
||||
int32_t scriptx = ucd->scriptx;
|
||||
uint32_t script = ucd->script;
|
||||
|
||||
/* If the script extension is Unknown, the string is not a valid script run.
|
||||
Such characters can only form script runs of length one. */
|
||||
/* If the script is Unknown, the string is not a valid script run. Such
|
||||
characters can only form script runs of length one (see test above). */
|
||||
|
||||
if (scriptx == ucp_Unknown) return FALSE;
|
||||
if (script == ucp_Unknown) return FALSE;
|
||||
|
||||
/* A character whose script extension is Inherited is always accepted with
|
||||
any script, and plays no further part in this testing. A character whose
|
||||
script is Common is always accepted, but must still be tested for a digit
|
||||
below. The scriptx value at this point is non-zero, because zero is
|
||||
ucp_Unknown, tested for above. */
|
||||
/* A character without any script extensions whose script is Inherited or
|
||||
Common is always accepted with any script. If there are extensions, the
|
||||
following processing happens for all scripts. */
|
||||
|
||||
if (scriptx != ucp_Inherited)
|
||||
if (ucd->scriptx != 0 || (script != ucp_Inherited && script != ucp_Common))
|
||||
{
|
||||
if (scriptx != ucp_Common)
|
||||
BOOL OK;
|
||||
|
||||
/* Set up a full-sized map for this character that can include bits for all
|
||||
scripts. Copy the scriptx map for this character (which covers those
|
||||
scripts that appear in script extension lists), set the remaining values to
|
||||
zero, and then, except for Common or Inherited, add this script's bit to
|
||||
the map. */
|
||||
|
||||
memcpy(map, PRIV(ucd_script_sets) + ucd->scriptx, UCD_MAPSIZE * sizeof(uint32_t));
|
||||
memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t));
|
||||
if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script);
|
||||
|
||||
/* Handle the different checking states */
|
||||
|
||||
switch(require_state)
|
||||
{
|
||||
/* If the script extension value is positive, the character is not a mark
|
||||
that can be used with many scripts. In the simple case we either set or
|
||||
compare with the required script. However, handling the scripts that can
|
||||
combine with Han is more complicated, as is the case when the previous
|
||||
characters have been many-script marks. */
|
||||
/* First significant character - it might follow Common or Inherited
|
||||
characters that do not have any script extensions. */
|
||||
|
||||
if (scriptx > 0)
|
||||
case SCRIPT_UNSET:
|
||||
switch(script)
|
||||
{
|
||||
switch(require_script)
|
||||
{
|
||||
/* Either the first significant character (require_script unset) or
|
||||
after only Han characters. */
|
||||
case ucp_Han:
|
||||
require_state = SCRIPT_HANPENDING;
|
||||
break;
|
||||
|
||||
case SCRIPT_UNSET:
|
||||
case SCRIPT_HANPENDING:
|
||||
switch(scriptx)
|
||||
{
|
||||
case ucp_Han:
|
||||
require_script = SCRIPT_HANPENDING;
|
||||
break;
|
||||
case ucp_Hiragana:
|
||||
case ucp_Katakana:
|
||||
require_state = SCRIPT_HANHIRAKATA;
|
||||
break;
|
||||
|
||||
case ucp_Hiragana:
|
||||
case ucp_Katakana:
|
||||
require_script = SCRIPT_HANHIRAKATA;
|
||||
break;
|
||||
case ucp_Bopomofo:
|
||||
require_state = SCRIPT_HANBOPOMOFO;
|
||||
break;
|
||||
|
||||
case ucp_Bopomofo:
|
||||
require_script = SCRIPT_HANBOPOMOFO;
|
||||
break;
|
||||
case ucp_Hangul:
|
||||
require_state = SCRIPT_HANHANGUL;
|
||||
break;
|
||||
|
||||
case ucp_Hangul:
|
||||
require_script = SCRIPT_HANHANGUL;
|
||||
break;
|
||||
default:
|
||||
memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t));
|
||||
require_state = SCRIPT_MAP;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
/* Not a Han-related script. If expecting one, fail. Otherwise set
|
||||
the requirement to this script. */
|
||||
/* The first significant character was Han. An inspection of the Unicode
|
||||
11.0.0 files shows that there are the following types of Script Extension
|
||||
list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul
|
||||
scripts:
|
||||
|
||||
default:
|
||||
if (require_script == SCRIPT_HANPENDING) return FALSE;
|
||||
require_script = scriptx;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
. Bopomofo + Han
|
||||
. Han + Hiragana + Katakana
|
||||
. Hiragana + Katakana
|
||||
. Bopomofo + Hangul + Han + Hiragana + Katakana
|
||||
|
||||
/* Previously encountered one of the "with Han" scripts. Check that
|
||||
this character is appropriate. */
|
||||
|
||||
case SCRIPT_HANHIRAKATA:
|
||||
if (scriptx != ucp_Han && scriptx != ucp_Hiragana &&
|
||||
scriptx != ucp_Katakana)
|
||||
return FALSE;
|
||||
break;
|
||||
|
||||
case SCRIPT_HANBOPOMOFO:
|
||||
if (scriptx != ucp_Han && scriptx != ucp_Bopomofo) return FALSE;
|
||||
break;
|
||||
|
||||
case SCRIPT_HANHANGUL:
|
||||
if (scriptx != ucp_Han && scriptx != ucp_Hangul) return FALSE;
|
||||
break;
|
||||
|
||||
/* We have a bitmap of scripts to check that is derived from one or
|
||||
more previous characters. This is either one of the maps in
|
||||
ucd_script_sets[] (for one previous character) or the intersection of
|
||||
several maps for multiple characters. */
|
||||
|
||||
case SCRIPT_MAP:
|
||||
if (MAPBIT(require_map, scriptx) == 0) return FALSE;
|
||||
|
||||
/* The rest of the string must be in this script, but we have to
|
||||
allow for the Han complications. */
|
||||
|
||||
switch(scriptx)
|
||||
{
|
||||
case ucp_Han:
|
||||
require_script = SCRIPT_HANPENDING;
|
||||
break;
|
||||
|
||||
case ucp_Hiragana:
|
||||
case ucp_Katakana:
|
||||
require_script = SCRIPT_HANHIRAKATA;
|
||||
break;
|
||||
|
||||
case ucp_Bopomofo:
|
||||
require_script = SCRIPT_HANBOPOMOFO;
|
||||
break;
|
||||
|
||||
case ucp_Hangul:
|
||||
require_script = SCRIPT_HANHANGUL;
|
||||
break;
|
||||
|
||||
default:
|
||||
require_script = scriptx;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
/* This is the easy case when a single script is required. */
|
||||
|
||||
default:
|
||||
if (scriptx != require_script) return FALSE;
|
||||
break;
|
||||
}
|
||||
} /* End of handing positive scriptx */
|
||||
|
||||
/* If scriptx is negative, this character is a mark-type character that
|
||||
has a list of permitted scripts, which are encoded in a bitmap. */
|
||||
|
||||
else
|
||||
{
|
||||
uint32_t chspecial;
|
||||
const uint32_t *map = PRIV(ucd_script_sets) - scriptx;
|
||||
|
||||
switch(require_script)
|
||||
{
|
||||
case SCRIPT_UNSET:
|
||||
require_map = PRIV(ucd_script_sets) - scriptx;
|
||||
require_script = SCRIPT_MAP;
|
||||
break;
|
||||
|
||||
/* An inspection of the Unicode 11.0.0 files shows that there are the
|
||||
following types of Script Extension list that involve the Han,
|
||||
Bopomofo, Hiragana, Katakana, and Hangul scripts:
|
||||
|
||||
. Bopomofo + Han
|
||||
. Han + Hiragana + Katakana
|
||||
. Hiragana + Katakana
|
||||
. Bopomofo + Hangul + Han + Hiragana + Katakana
|
||||
|
||||
The following code tries to make sense of this. */
|
||||
The following code tries to make sense of this. */
|
||||
|
||||
#define FOUND_BOPOMOFO 1
|
||||
#define FOUND_HIRAGANA 2
|
||||
#define FOUND_KATAKANA 4
|
||||
#define FOUND_HANGUL 8
|
||||
|
||||
case SCRIPT_HANPENDING:
|
||||
chspecial = 0;
|
||||
|
||||
if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO;
|
||||
if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA;
|
||||
if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA;
|
||||
if (MAPBIT(map, ucp_Hangul) != 0) chspecial |= FOUND_HANGUL;
|
||||
|
||||
if (chspecial == 0) return FALSE;
|
||||
|
||||
if (chspecial == FOUND_BOPOMOFO)
|
||||
{
|
||||
require_script = SCRIPT_HANBOPOMOFO;
|
||||
}
|
||||
else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
|
||||
{
|
||||
require_script = SCRIPT_HANHIRAKATA;
|
||||
}
|
||||
|
||||
/* Otherwise it must be allowed with all of them, so remain in
|
||||
the pending state. */
|
||||
|
||||
break;
|
||||
|
||||
case SCRIPT_HANHIRAKATA:
|
||||
if (MAPBIT(map, ucp_Hiragana) != 0) break;
|
||||
if (MAPBIT(map, ucp_Katakana) != 0) break;
|
||||
return FALSE;
|
||||
|
||||
case SCRIPT_HANBOPOMOFO:
|
||||
if (MAPBIT(map, ucp_Bopomofo) != 0) break;
|
||||
return FALSE;
|
||||
|
||||
case SCRIPT_HANHANGUL:
|
||||
if (MAPBIT(map, ucp_Hangul) != 0) break;
|
||||
return FALSE;
|
||||
|
||||
/* Previously encountered one or more characters that are allowed
|
||||
with a list of scripts. Build the intersection of the required list
|
||||
with this character's list in intersection_map[]. */
|
||||
|
||||
case SCRIPT_MAP:
|
||||
for (int i = 0; i < MAPSIZE; i++)
|
||||
intersection_map[i] = require_map[i] & map[i];
|
||||
|
||||
/* If there's just one script in common, we could set it as the
|
||||
unique required script. However, in the new bitmap arrangements,
|
||||
finding the one script is expensive, so leave this out for now.
|
||||
Otherwise, make the intersection map the required map. */
|
||||
|
||||
/*
|
||||
if (onescript >= 0) require_script = onescript;
|
||||
else require_map = intersection_map;
|
||||
*/
|
||||
|
||||
require_map = intersection_map;
|
||||
break;
|
||||
|
||||
/* The previously set required script is a single script, not
|
||||
Han-related. Check that it is in this character's list. */
|
||||
|
||||
default:
|
||||
if (MAPBIT(map, require_script) == 0) return FALSE;
|
||||
break;
|
||||
}
|
||||
} /* End of handling negative scriptx */
|
||||
} /* End of checking non-Common character */
|
||||
|
||||
/* The character is in an acceptable script. We must now ensure that all
|
||||
decimal digits in the string come from the same set. Some scripts (e.g.
|
||||
Common, Arabic) have more than one set of decimal digits. This code does
|
||||
not allow mixing sets, even within the same script. The vector called
|
||||
PRIV(ucd_digit_sets)[] contains, in its first element, the number of
|
||||
following elements, and then, in ascending order, the code points of the
|
||||
'9' characters in every set of 10 digits. Each set is identified by the
|
||||
offset in the vector of its '9' character. An initial check of the first
|
||||
value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
|
||||
|
||||
if (ucd->chartype == ucp_Nd)
|
||||
{
|
||||
uint32_t digitset;
|
||||
|
||||
if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
|
||||
case SCRIPT_HANPENDING:
|
||||
if (script != ucp_Han) /* Another Han does nothing */
|
||||
{
|
||||
int mid;
|
||||
int bot = 1;
|
||||
int top = PRIV(ucd_digit_sets)[0];
|
||||
for (;;)
|
||||
uint32_t chspecial = 0;
|
||||
|
||||
if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO;
|
||||
if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA;
|
||||
if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA;
|
||||
if (MAPBIT(map, ucp_Hangul) != 0) chspecial |= FOUND_HANGUL;
|
||||
|
||||
if (chspecial == 0) return FALSE; /* Not allowed with Han */
|
||||
|
||||
if (chspecial == FOUND_BOPOMOFO)
|
||||
require_state = SCRIPT_HANBOPOMOFO;
|
||||
else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
|
||||
require_state = SCRIPT_HANHIRAKATA;
|
||||
|
||||
/* Otherwise this character must be allowed with all of them, so remain
|
||||
in the pending state. */
|
||||
}
|
||||
break;
|
||||
|
||||
/* Previously encountered one of the "with Han" scripts. Check that
|
||||
this character is appropriate. */
|
||||
|
||||
case SCRIPT_HANHIRAKATA:
|
||||
if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) +
|
||||
MAPBIT(map, ucp_Katakana) == 0) return FALSE;
|
||||
break;
|
||||
|
||||
case SCRIPT_HANBOPOMOFO:
|
||||
if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE;
|
||||
break;
|
||||
|
||||
case SCRIPT_HANHANGUL:
|
||||
if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE;
|
||||
break;
|
||||
|
||||
/* Previously encountered one or more characters that are allowed with a
|
||||
list of scripts. */
|
||||
|
||||
case SCRIPT_MAP:
|
||||
OK = FALSE;
|
||||
|
||||
for (int i = 0; i < FULL_MAPSIZE; i++)
|
||||
{
|
||||
if ((require_map[i] & map[i]) != 0)
|
||||
{
|
||||
if (top <= bot + 1) /* <= rather than == is paranoia */
|
||||
{
|
||||
digitset = top;
|
||||
break;
|
||||
}
|
||||
mid = (top + bot) / 2;
|
||||
if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
|
||||
OK = TRUE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* A required value of 0 means "unset". */
|
||||
if (!OK) return FALSE;
|
||||
|
||||
if (require_digitset == 0) require_digitset = digitset;
|
||||
else if (digitset != require_digitset) return FALSE;
|
||||
} /* End digit handling */
|
||||
} /* End checking non-Inherited character */
|
||||
/* The rest of the string must be in this script, but we have to
|
||||
allow for the Han complications. */
|
||||
|
||||
switch(script)
|
||||
{
|
||||
case ucp_Han:
|
||||
require_state = SCRIPT_HANPENDING;
|
||||
break;
|
||||
|
||||
case ucp_Hiragana:
|
||||
case ucp_Katakana:
|
||||
require_state = SCRIPT_HANHIRAKATA;
|
||||
break;
|
||||
|
||||
case ucp_Bopomofo:
|
||||
require_state = SCRIPT_HANBOPOMOFO;
|
||||
break;
|
||||
|
||||
case ucp_Hangul:
|
||||
require_state = SCRIPT_HANHANGUL;
|
||||
break;
|
||||
|
||||
/* Compute the intersection of the required list of scripts and the
|
||||
allowed scripts for this character. */
|
||||
|
||||
default:
|
||||
for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i];
|
||||
break;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
} /* End checking character's script and extensions. */
|
||||
|
||||
/* The character is in an acceptable script. We must now ensure that all
|
||||
decimal digits in the string come from the same set. Some scripts (e.g.
|
||||
Common, Arabic) have more than one set of decimal digits. This code does
|
||||
not allow mixing sets, even within the same script. The vector called
|
||||
PRIV(ucd_digit_sets)[] contains, in its first element, the number of
|
||||
following elements, and then, in ascending order, the code points of the
|
||||
'9' characters in every set of 10 digits. Each set is identified by the
|
||||
offset in the vector of its '9' character. An initial check of the first
|
||||
value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
|
||||
|
||||
if (ucd->chartype == ucp_Nd)
|
||||
{
|
||||
uint32_t digitset;
|
||||
|
||||
if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
|
||||
{
|
||||
int mid;
|
||||
int bot = 1;
|
||||
int top = PRIV(ucd_digit_sets)[0];
|
||||
for (;;)
|
||||
{
|
||||
if (top <= bot + 1) /* <= rather than == is paranoia */
|
||||
{
|
||||
digitset = top;
|
||||
break;
|
||||
}
|
||||
mid = (top + bot) / 2;
|
||||
if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
|
||||
}
|
||||
}
|
||||
|
||||
/* A required value of 0 means "unset". */
|
||||
|
||||
if (require_digitset == 0) require_digitset = digitset;
|
||||
else if (digitset != require_digitset) return FALSE;
|
||||
} /* End digit handling */
|
||||
|
||||
/* If we haven't yet got to the end, pick up the next character. */
|
||||
|
||||
|
|
|
@ -237,8 +237,8 @@ const uint32_t PRIV(ucd_script_sets)[] = {
|
|||
/* These are the main two-stage UCD tables. The fields in each record are:
|
||||
script (8 bits), character type (8 bits), grapheme break property (8 bits),
|
||||
offset to multichar other cases or zero (8 bits), offset to other case or zero
|
||||
(32 bits, signed), script extension (16 bits, signed), bidi class (8 bits), and
|
||||
a dummy 8-bit field to make the whole thing a multiple of 4 bytes. */
|
||||
(32 bits, signed), script extension (8 bits), bidi class (8 bits), and a dummy
|
||||
16-bit field to make the whole thing a multiple of 4 bytes. */
|
||||
|
||||
const ucd_record PRIV(ucd_records)[] = { /* 12588 bytes, record size 12 */
|
||||
{ 73, 0, 2, 0, 0, 0, 3, 256, }, /* 0 */
|
||||
|
|
|
@ -1138,23 +1138,27 @@
|
|||
\= Expect no match
|
||||
\x{2e7f}
|
||||
|
||||
/^\P{Katakana}+/utf
|
||||
\x{3105}
|
||||
\= Expect no match
|
||||
\x{30ff}
|
||||
|
||||
/^[\p{Arabic}]/utf
|
||||
\x{06e9}
|
||||
\x{060b}
|
||||
\= Expect no match
|
||||
X\x{06e9}
|
||||
|
||||
#subject no_jit
|
||||
|
||||
/^\P{Katakana}+/utf
|
||||
\x{3105}
|
||||
\= Expect no match
|
||||
\x{30ff}
|
||||
|
||||
/^[\P{Yi}]/utf
|
||||
\x{2f800}
|
||||
\= Expect no match
|
||||
\x{a014}
|
||||
\x{a4c6}
|
||||
|
||||
#subject -no_jit
|
||||
|
||||
/^\p{Any}X/utf
|
||||
AXYZ
|
||||
\x{1234}XYZ
|
||||
|
@ -2640,4 +2644,13 @@
|
|||
/[\p{taml}\p{sc:ugar}]+/utf
|
||||
\x{0b82}\x{10380}
|
||||
|
||||
/^[\p{sc:Arabic}]/utf
|
||||
\= Expect no match
|
||||
\x{650}
|
||||
\x{651}
|
||||
\x{652}
|
||||
\x{653}
|
||||
\x{654}
|
||||
\x{655}
|
||||
|
||||
# End of testinput4
|
||||
|
|
|
@ -2073,15 +2073,6 @@
|
|||
|
||||
# More differences from Perl
|
||||
|
||||
/^[\p{Arabic}]/utf
|
||||
\= Expect no match
|
||||
\x{650}
|
||||
\x{651}
|
||||
\x{652}
|
||||
\x{653}
|
||||
\x{654}
|
||||
\x{655}
|
||||
|
||||
/^\p{Common}/utf
|
||||
\x{60c}
|
||||
\x{61f}
|
||||
|
|
|
@ -1883,13 +1883,6 @@ No match
|
|||
\x{2e7f}
|
||||
No match
|
||||
|
||||
/^\P{Katakana}+/utf
|
||||
\x{3105}
|
||||
0: \x{3105}
|
||||
\= Expect no match
|
||||
\x{30ff}
|
||||
No match
|
||||
|
||||
/^[\p{Arabic}]/utf
|
||||
\x{06e9}
|
||||
0: \x{6e9}
|
||||
|
@ -1899,6 +1892,15 @@ No match
|
|||
X\x{06e9}
|
||||
No match
|
||||
|
||||
#subject no_jit
|
||||
|
||||
/^\P{Katakana}+/utf
|
||||
\x{3105}
|
||||
0: \x{3105}
|
||||
\= Expect no match
|
||||
\x{30ff}
|
||||
No match
|
||||
|
||||
/^[\P{Yi}]/utf
|
||||
\x{2f800}
|
||||
0: \x{2f800}
|
||||
|
@ -1908,6 +1910,8 @@ No match
|
|||
\x{a4c6}
|
||||
No match
|
||||
|
||||
#subject -no_jit
|
||||
|
||||
/^\p{Any}X/utf
|
||||
AXYZ
|
||||
0: AX
|
||||
|
@ -4235,4 +4239,19 @@ No match
|
|||
\x{0b82}\x{10380}
|
||||
0: \x{b82}\x{10380}
|
||||
|
||||
/^[\p{sc:Arabic}]/utf
|
||||
\= Expect no match
|
||||
\x{650}
|
||||
No match
|
||||
\x{651}
|
||||
No match
|
||||
\x{652}
|
||||
No match
|
||||
\x{653}
|
||||
No match
|
||||
\x{654}
|
||||
No match
|
||||
\x{655}
|
||||
No match
|
||||
|
||||
# End of testinput4
|
||||
|
|
|
@ -4722,21 +4722,6 @@ Callout 0: last capture = 1
|
|||
|
||||
# More differences from Perl
|
||||
|
||||
/^[\p{Arabic}]/utf
|
||||
\= Expect no match
|
||||
\x{650}
|
||||
No match
|
||||
\x{651}
|
||||
No match
|
||||
\x{652}
|
||||
No match
|
||||
\x{653}
|
||||
No match
|
||||
\x{654}
|
||||
No match
|
||||
\x{655}
|
||||
No match
|
||||
|
||||
/^\p{Common}/utf
|
||||
\x{60c}
|
||||
0: \x{60c}
|
||||
|
|
Loading…
Reference in New Issue