Add bidi class and control information to Unicode property data

This commit is contained in:
Philip Hazel 2021-12-05 17:44:59 +00:00
parent ba3d0edcbd
commit 823d4ac956
12 changed files with 9136 additions and 4482 deletions

View File

@ -15,15 +15,16 @@
#
# [python3] ./MultiStage2.py >../src/pcre2_ucd.c
#
# It requires six Unicode data tables: DerivedGeneralCategory.txt,
# GraphemeBreakProperty.txt, Scripts.txt, ScriptExtensions.txt,
# CaseFolding.txt, and emoji-data.txt. These must be in the
# maint/Unicode.tables subdirectory.
# It requires eight Unicode data tables: DerivedBidiClass.txt,
# DerivedGeneralCategory.txt, GraphemeBreakProperty.txt, PropList.txt,
# Scripts.txt, ScriptExtensions.txt, CaseFolding.txt, and emoji-data.txt. These
# must be in the maint/Unicode.tables subdirectory.
#
# DerivedGeneralCategory.txt is found in the "extracted" subdirectory of the
# Unicode database (UCD) on the Unicode web site; GraphemeBreakProperty.txt is
# in the "auxiliary" subdirectory. Scripts.txt, ScriptExtensions.txt, and
# CaseFolding.txt are directly in the UCD directory.
# DerivedBidiClass.txt and DerivedGeneralCategory.txt are in the "extracted"
# subdirectory of the Unicode database (UCD) on the Unicode web site;
# GraphemeBreakProperty.txt is in the "auxiliary" subdirectory. PropList.txt,
# Scripts.txt, ScriptExtensions.txt, and CaseFolding.txt are directly in the
# UCD directory.
#
# The emoji-data.txt file is found in the "emoji" subdirectory even though it
# is technically part of a different (but coordinated) standard as shown
@ -69,6 +70,10 @@
# Added code to add a Script Extensions field to records. This has increased
# their size from 8 to 12 bytes, only 10 of which are currently used.
#
# Added code to add a bidi class field to records by scanning the
# DerivedBidiClass.txt and PropList.txt files. This uses one of the two spare
# bytes, so now 11 out of 12 are in use.
#
# 01-March-2010: Updated list of scripts for Unicode 5.2.0
# 30-April-2011: Updated list of scripts for Unicode 6.0.0
# July-2012: Updated list of scripts for Unicode 6.1.0
@ -93,6 +98,8 @@
# 27-July-2019: Updated for Unicode 12.1.0
# 10-March-2020: Updated for Unicode 13.0.0
# PCRE2-10.39: Updated for Unicode 14.0.0
# 05-December-2021: Added code to scan DerivedBidiClass.txt for bidi class,
# and also PropList.txt for the Bidi_Control property
# ----------------------------------------------------------------------------
#
#
@ -100,14 +107,15 @@
# pcre2_internal.h. They look up Unicode character properties using short
# sequences of code that contain no branches, which makes for greater speed.
#
# Conceptually, there is a table of records (of type ucd_record), containing a
# script number, script extension value, character type, grapheme break type,
# offset to caseless matching set, offset to the character's other case, for
# every Unicode character. However, a real table covering all Unicode
# characters would be far too big. It can be efficiently compressed by
# observing that many characters have the same record, and many blocks of
# characters (taking 128 characters in a block) have the same set of records as
# other blocks. This leads to a 2-stage lookup process.
# Conceptually, there is a table of records (of type ucd_record), one for each
# Unicode character. Each record contains the script number, script extension
# value, character type, grapheme break type, offset to caseless matching set,
# offset to the character's other case, and the bidi class/control. However, a
# real table covering all Unicode characters would be far too big. It can be
# efficiently compressed by observing that many characters have the same
# record, and many blocks of characters (taking 128 characters in a block) have
# the same set of records as other blocks. This leads to a 2-stage lookup
# process.
#
# This script constructs six tables. The ucd_caseless_sets table contains
# lists of characters that all match each other caselessly. Each list is
@ -136,19 +144,20 @@
# the offset of a character within its own block, and the result is the index
# number of the required record in the ucd_records vector.
#
# The following examples are correct for the Unicode 11.0.0 database. Future
# The following examples are correct for the Unicode 14.0.0 database. Future
# updates may change the actual lookup values.
#
# Example: lowercase "a" (U+0061) is in block 0
# lookup 0 in stage1 table yields 0
# lookup 97 (0x61) in the first table in stage2 yields 17
# record 17 is { 34, 5, 12, 0, -32, 34, 0 }
# lookup 97 (0x61) in the first table in stage2 yields 22
# record 22 is { 34, 5, 12, 0, -32, 34, 2, 0 }
# 34 = ucp_Latin => Latin script
# 5 = ucp_Ll => Lower case letter
# 12 = ucp_gbOther => Grapheme break property "Other"
# 0 => Not part of a caseless set
# -32 (-0x20) => Other case is U+0041
# 34 = ucp_Latin => No special Script Extension property
# 2 = ucp_bidiL => Bidi class left-to-right
# 0 => Dummy value, unused at present
#
# Almost all lowercase latin characters resolve to the same record. One or two
@ -156,34 +165,36 @@
# example, k, K and the Kelvin symbol are such a set).
#
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
# lookup 96 in stage1 table yields 90
# lookup 66 (0x42) in table 90 in stage2 yields 564
# record 564 is { 27, 7, 12, 0, 0, 27, 0 }
# lookup 96 in stage1 table yields 91
# lookup 66 (0x42) in table 91 in stage2 yields 613
# record 613 is { 27, 7, 12, 0, 0, 27, 2, 0 }
# 27 = ucp_Hiragana => Hiragana script
# 7 = ucp_Lo => Other letter
# 12 = ucp_gbOther => Grapheme break property "Other"
# 0 => Not part of a caseless set
# 0 => No other case
# 27 = ucp_Hiragana => No special Script Extension property
# 2 = ucp_bidiL => Bidi class left-to-right
# 0 => Dummy value, unused at present
#
# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
# lookup 57 in stage1 table yields 55
# lookup 80 (0x50) in table 55 in stage2 yields 458
# record 458 is { 28, 12, 3, 0, 0, -101, 0 }
# lookup 80 (0x50) in table 55 in stage2 yields 485
# record 485 is { 28, 12, 3, 0, 0, -122, 19, 0 }
# 28 = ucp_Inherited => Script inherited from predecessor
# 12 = ucp_Mn => Non-spacing mark
# 3 = ucp_gbExtend => Grapheme break property "Extend"
# 0 => Not part of a caseless set
# 0 => No other case
# -101 => Script Extension list offset = 101
# -122 => Script Extension list offset = 122
# 19 = ucp_bidiNSM => Bidi class non-spacing mark
# 0 => Dummy value, unused at present
#
# At offset 122 in the ucd_script_sets vector we find the list 3, 15, 107, 29,
# and terminator 0. This means that this character is expected to be used with
# any of those scripts, which are Bengali, Devanagari, Grantha, and Kannada.
#
# Philip Hazel, 03 July 2008
# Philip Hazel, last updated 05 December 2021.
##############################################################################
@ -195,17 +206,21 @@ MAX_UNICODE = 0x110000
NOTACHAR = 0xffffffff
# Parse a line of Scripts.txt, GraphemeBreakProperty.txt or DerivedGeneralCategory.txt
# Parse a line of Scripts.txt, GraphemeBreakProperty.txt,
# DerivedBidiClass.txt or DerivedGeneralCategory.txt
def make_get_names(enum):
return lambda chardata: enum.index(chardata[1])
# Parse a line of CaseFolding.txt
def get_other_case(chardata):
if chardata[1] == 'C' or chardata[1] == 'S':
return int(chardata[2], 16) - int(chardata[0], 16)
return 0
# Parse a line of ScriptExtensions.txt
def get_script_extension(chardata):
this_script_list = list(chardata[1].split(' '))
if len(this_script_list) == 1:
@ -233,6 +248,7 @@ def get_script_extension(chardata):
return -return_value
# Read the whole table in memory, setting/checking the Unicode version
def read_table(file_name, get_value, default_value):
global unicode_version
@ -489,6 +505,14 @@ break_property_names = ['CR', 'LF', 'Control', 'Extend', 'Prepend',
'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT', 'Regional_Indicator', 'Other',
'ZWJ', 'Extended_Pictographic' ]
# BIDI class property names in the DerivedBidiClass.txt file
bidiclass_names = ['AL', 'AN', 'B', 'BN', 'CS', 'EN', 'ES', 'ET', 'FSI', 'L',
'LRE', 'LRI', 'LRO', 'NSM', 'ON', 'PDF', 'PDI', 'R', 'RLE', 'RLI', 'RLO',
'S', 'WS' ]
# Create the various tables
test_record_size()
unicode_version = ""
@ -496,6 +520,28 @@ script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names),
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_property_names), break_property_names.index('Other'))
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
bidi_class = read_table('Unicode.tables/DerivedBidiClass.txt', make_get_names(bidiclass_names), bidiclass_names.index('L'))
# The Bidi_Control property is a Y/N value, so needs only one bit. We scan the
# PropList.txt file and set the 0x80 bit in the bidi_class table.
file = open('Unicode.tables/PropList.txt', 'r', encoding='utf-8')
for line in file:
line = re.sub(r'#.*', '', line)
chardata = list(map(str.strip, line.split(';')))
if len(chardata) <= 1:
continue
if chardata[1] != "Bidi_Control":
continue
m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
char = int(m.group(1), 16)
if m.group(3) is None:
last = char
else:
last = int(m.group(3), 16)
for i in range(char, last + 1):
bidi_class[i] |= 0x80;
file.close()
# The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now
# we need to find the Extended_Pictographic property for emoji characters. This
@ -509,10 +555,8 @@ for line in file:
chardata = list(map(str.strip, line.split(';')))
if len(chardata) <= 1:
continue
if chardata[1] != "Extended_Pictographic":
continue
m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
char = int(m.group(1), 16)
if m.group(3) is None:
@ -542,12 +586,13 @@ for i in range(0, MAX_UNICODE):
if scriptx[i] == script_abbrevs_default:
scriptx[i] = script[i]
# With the addition of the new Script Extensions field, we need some padding
# to get the Unicode records up to 12 bytes (multiple of 4). Set a value
# greater than 255 to make the field 16 bits.
# With the addition of the Script Extensions field, we needed some padding to
# get the Unicode records up to 12 bytes (multiple of 4). Originally this was a
# 16-bit field and padding_dummy[0] was set to 256 to ensure this, but 8 bits
# are now used for the bidi class, so zero will do.
padding_dummy = [0] * MAX_UNICODE
padding_dummy[0] = 256
padding_dummy[0] = 0
# This block of code was added by PH in September 2012. I am not a Python
# programmer, so the style is probably dreadful, but it does the job. It scans
@ -622,7 +667,7 @@ for s in sets:
# Combine the tables
table, records = combine_tables(script, category, break_props,
caseless_offsets, other_case, scriptx, padding_dummy)
caseless_offsets, other_case, scriptx, bidi_class, padding_dummy)
record_size, record_struct = get_record_size_struct(list(records.keys()))
@ -673,7 +718,7 @@ print("a totally empty module because some compilers barf at that.")
print("Instead, just supply some small dummy tables. */")
print()
print("#ifndef SUPPORT_UNICODE")
print("const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0,0,0 }};")
print("const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0,0,0,0 }};")
print("const uint16_t PRIV(ucd_stage1)[] = {0};")
print("const uint16_t PRIV(ucd_stage2)[] = {0};")
print("const uint32_t PRIV(ucd_caseless_sets)[] = {0};")
@ -693,6 +738,7 @@ print(" ucp_gbOther, /* grapheme break property */")
print(" 0, /* case set */")
print(" 0, /* other case */")
print(" ucp_Unknown, /* script extension */")
print(" ucp_bidiL, /* bidi class */")
print(" 0, /* dummy filler */")
print(" }};")
print("#endif")
@ -775,8 +821,9 @@ print("\n};\n")
print("/* These are the main two-stage UCD tables. The fields in each record are:")
print("script (8 bits), character type (8 bits), grapheme break property (8 bits),")
print("offset to multichar other cases or zero (8 bits), offset to other case")
print("or zero (32 bits, signed), script extension (16 bits, signed), and a dummy")
print("16-bit field to make the whole thing a multiple of 4 bytes. */\n")
print("or zero (32 bits, signed), script extension (16 bits, signed), bidi class")
print("(8 bits), and a dummy 8-bit field to make the whole thing a multiple")
print("of 4 bytes. */\n")
print_records(records, record_size)
print_table(min_stage1, 'PRIV(ucd_stage1)')

View File

@ -23,7 +23,7 @@ GenerateUtt.py A Python script to generate part of the pcre2_tables.c file
ManyConfigTests A shell script that runs "configure, make, test" a number of
times with different configuration settings.
MultiStage2.py A Python script that generates the file pcre2_ucd.c from six
MultiStage2.py A Python script that generates the file pcre2_ucd.c from eight
Unicode data files, which are themselves downloaded from the
Unicode web site. Run this script in the "maint" directory.
The generated file is written to stdout. It contains the
@ -41,7 +41,8 @@ README This file.
Unicode.tables The files in this directory were downloaded from the Unicode
web site. They contain information about Unicode characters
and scripts. The ones used by the MultiStage2.py script are
CaseFolding.txt, DerivedGeneralCategory.txt, Scripts.txt,
CaseFolding.txt, DerivedBidiClass.txt,
DerivedGeneralCategory.txt, PropList.txt, Scripts.txt,
ScriptExtensions.txt, GraphemeBreakProperty.txt, and
emoji-data.txt. I've kept UnicodeData.txt (which is no longer
used by the script) because it is useful occasionally for
@ -439,4 +440,4 @@ years.
Philip Hazel
Email local part: Philip.Hazel
Email domain: gmail.com
Last updated: 26 August 2021
Last updated: 05 December 2021

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -2,7 +2,7 @@
* A program for testing the Unicode property table *
***************************************************/
/* Copyright (c) University of Cambridge 2008-2020 */
/* Copyright (c) University of Cambridge 2008-2021 */
/* Compile thus:
@ -19,33 +19,35 @@ I wrote it to help with debugging PCRE, and have added things that I found
useful, in a rather haphazard way. The code has never been seriously tidied or
checked for robustness, but it shouldn't now give compiler warnings.
There is only one option: "-s". If given, it applies only to the "findprop"
command. It causes the UTF-8 sequence of bytes that encode the character to be
output between angle brackets at the end of the line. On a UTF-8 terminal, this
There is only one option: "-s". If given, it applies only to the "findprop"
command. It causes the UTF-8 sequence of bytes that encode the character to be
output between angle brackets at the end of the line. On a UTF-8 terminal, this
will show the appropriate graphic for the code point.
If the command has arguments, they are concatenated into a buffer, separated by
spaces. If the first argument starts "U+" or consists entirely of hexadecimal
digits, "findprop" is inserted at the start. The buffer is then processed as a
single line file, after which the program exits. If there are no arguments, the
program reads commands line by line on stdin and writes output to stdout. The
program reads commands line by line on stdin and writes output to stdout. The
return code is always zero.
There are three commands:
"findprop" must be followed by a space-separated list of Unicode code points as
hex numbers, either without any prefix or starting with "U+". The output is one
line per character, giving its Unicode properties followed by its other case or
line per character, giving its Unicode properties followed by its other case or
cases if one or more exist, followed by its Script Extension list if it is not
just the same as the base script. This list is in square brackets. The
properties are:
Bidi control shown as '*' if true
Bidi class e.g. NSM (most common is L)
General type e.g. Letter
Specific type e.g. Upper case letter
Script e.g. Medefaidrin
Grapheme break type e.g. Extend (most common is Other)
"find" must be followed by a list of property names and their values. The
"find" must be followed by a list of property names and their values. The
values are case-sensitive. This finds characters that have those properties. If
multiple properties are listed, they must all be matched. Currently supported:
@ -56,6 +58,8 @@ multiple properties are listed, they must all be matched. Currently supported:
scripts must be present.
type <abbrev> The character's specific type (e.g. Lu or Nd) must match.
gbreak <name> The grapheme break property must match.
bidi <class> The character's bidi class must match.
bidi_control The character must be a bidi control character.
If a <name> or <abbrev> is preceded by !, the value must NOT be present. For
Script Extensions, there may be a mixture of positive and negative
@ -63,10 +67,10 @@ requirements. All must be satisfied.
Sequences of two or more characters are shown as ranges, for example
U+0041..U+004A. No more than 100 lines are output. If there are more
characters, the list ends with ...
characters, the list ends with ...
"list" must be followed by a property name (script, type, or gbreak). The
defined values for that property are listed. */
"list" must be followed by one of the property names script, type, gbreak, or
bidi. The defined values for that property are listed. */
#ifdef HAVE_CONFIG_H
@ -145,7 +149,7 @@ static const unsigned char *type_names[] = {
US"So", US"Other symbol",
US"Zl", US"Line separator",
US"Zp", US"Paragraph separator",
US"Zs", US"Space separator"
US"Zs", US"Space separator"
};
static const unsigned char *gb_names[] = {
@ -166,6 +170,31 @@ static const unsigned char *gb_names[] = {
US"Extended_Pictographic", US""
};
/* Bidi class names, stored as (abbreviation, description) pairs. The "list
bidi" command prints each pair. The "find bidi" command searches the
abbreviations and uses the pair index (array index / 2) as the class number
that is compared with UCD_BIDICLASS(), so the order of the entries here must
match the numbering of the bidi class values in the Unicode property tables. */

static const unsigned char *bd_names[] = {
  US"AL",   US"Arabic letter",
  US"AN",   US"Arabic number",
  US"B",    US"Paragraph separator",
  US"BN",   US"Boundary neutral",
  US"CS",   US"Common separator",
  US"EN",   US"European number",
  US"ES",   US"European separator",
  US"ET",   US"European terminator",
  US"FSI",  US"First string isolate",
  US"L",    US"Left-to-right",
  US"LRE",  US"Left-to-right embedding",
  US"LRI",  US"Left-to-right isolate",
  US"LRO",  US"Left-to-right override",
  US"NSM",  US"Non-spacing mark",
  US"ON",   US"Other neutral",
  US"PDF",  US"Pop directional format",
  US"PDI",  US"Pop directional isolate",
  US"R",    US"Right-to-left",
  US"RLE",  US"Right-to-left embedding",
  US"RLI",  US"Right-to-left isolate",
  US"RLO",  US"Right-to-left override",
  US"S",    US"Segment separator",
  US"WS",   US"White space"
};
static const unsigned int utf8_table1[] = {
0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
@ -235,14 +264,14 @@ const ucp_type_table *u;
for (i = 0; i < PRIV(utt_size); i++)
{
u = PRIV(utt) + i;
u = PRIV(utt) + i;
if (u->type == PT_SC && u->value == script) break;
}
if (i < PRIV(utt_size))
return PRIV(utt_names) + u->name_offset;
return "??";
}
}
/*************************************************
@ -257,12 +286,15 @@ int fulltype = UCD_CHARTYPE(c);
int script = UCD_SCRIPT(c);
int scriptx = UCD_SCRIPTX(c);
int gbprop = UCD_GRAPHBREAK(c);
int bidi = UCD_BIDICLASS(c);
int bidicontrol = UCD_BIDICONTROL(c);
unsigned int othercase = UCD_OTHERCASE(c);
int caseset = UCD_CASESET(c);
const unsigned char *fulltypename = US"??";
const unsigned char *typename = US"??";
const unsigned char *graphbreak = US"??";
const unsigned char *bidiclass = US"??";
const unsigned char *scriptname = CUS get_scriptname(script);
switch (type)
@ -332,7 +364,37 @@ switch(gbprop)
default: graphbreak = US"Unknown"; break;
}
printf("U+%04X %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak);
switch(bidi)
{
case ucp_bidiAL: bidiclass = US"AL "; break;
case ucp_bidiFSI: bidiclass = US"FSI"; break;
case ucp_bidiL: bidiclass = US"L "; break;
case ucp_bidiLRE: bidiclass = US"LRE"; break;
case ucp_bidiLRI: bidiclass = US"LRI"; break;
case ucp_bidiLRO: bidiclass = US"LRO"; break;
case ucp_bidiPDF: bidiclass = US"PDF"; break;
case ucp_bidiPDI: bidiclass = US"PDI"; break;
case ucp_bidiR: bidiclass = US"R "; break;
case ucp_bidiRLE: bidiclass = US"RLE"; break;
case ucp_bidiRLI: bidiclass = US"RLI"; break;
case ucp_bidiRLO: bidiclass = US"RLO"; break;
case ucp_bidiAN: bidiclass = US"AN "; break;
case ucp_bidiB: bidiclass = US"B "; break;
case ucp_bidiBN: bidiclass = US"BN "; break;
case ucp_bidiCS: bidiclass = US"CS "; break;
case ucp_bidiEN: bidiclass = US"EN "; break;
case ucp_bidiES: bidiclass = US"ES "; break;
case ucp_bidiET: bidiclass = US"ET "; break;
case ucp_bidiNSM: bidiclass = US"NSM"; break;
case ucp_bidiON: bidiclass = US"ON "; break;
case ucp_bidiS: bidiclass = US"S "; break;
case ucp_bidiWS: bidiclass = US"WS "; break;
default: bidiclass = US"???"; break;
}
printf("U+%04X %c%s %s: %s, %s, %s", c, bidicontrol? '*':' ', bidiclass,
typename, fulltypename, scriptname, graphbreak);
if (is_just_one && othercase != c)
{
printf(", U+%04X", othercase);
@ -341,9 +403,9 @@ if (is_just_one && othercase != c)
const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1;
while (*(++p) < NOTACHAR)
{
unsigned int d = *p;
unsigned int d = *p;
if (d != othercase && d != c) printf(", U+%04X", d);
}
}
}
}
@ -364,13 +426,13 @@ if (scriptx != script)
}
printf("]");
}
if (show_character && is_just_one)
{
unsigned char buffer[8];
size_t len = ord2utf8(c, buffer);
printf(", >%.*s<", (int)len, buffer);
}
printf(", >%.*s<", (int)len, buffer);
}
printf("\n");
}
@ -394,9 +456,12 @@ uint32_t i, c;
int script = -1;
int type = -1;
int gbreak = -1;
int bidiclass = -1;
BOOL bidicontrol = FALSE;
BOOL script_not = FALSE;
BOOL type_not = FALSE;
BOOL gbreak_not = FALSE;
BOOL bidiclass_not = FALSE;
BOOL hadrange = FALSE;
const ucd_record *ucd, *next_ucd;
const char *pad = " ";
@ -405,10 +470,12 @@ while (*s != 0)
{
unsigned int offset = 0;
BOOL scriptx_not = FALSE;
char *value_start;
for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
*t = 0;
while (isspace(*s)) s++;
value_start = s;
for (t = value; *s != 0 && !isspace(*s); s++) *t++ = *s;
*t = 0;
@ -426,11 +493,11 @@ while (*s != 0)
for (i = 0; i < PRIV(utt_size); i++)
{
const ucp_type_table *u = PRIV(utt) + i;
if (u->type == PT_SC && strcmp(CS(value + offset),
const ucp_type_table *u = PRIV(utt) + i;
if (u->type == PT_SC && strcmp(CS(value + offset),
PRIV(utt_names) + u->name_offset) == 0)
{
c = u->value;
c = u->value;
if (name[6] == 'x')
{
scriptx_list[scriptx_count++] = scriptx_not? (-c):c;
@ -516,6 +583,45 @@ while (*s != 0)
}
}
else if (strcmp(CS name, "bidi") == 0 ||
strcmp(CS name, "bidiclass") == 0 ||
strcmp(CS name, "bidi_class") == 0 )
{
if (bidiclass >= 0)
{
printf("** Only 1 bidi class value allowed\n");
return;
}
else
{
if (value[0] == '!')
{
bidiclass_not = TRUE;
offset = 1;
}
for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
{
if (strcmp(CS (value + offset), CS bd_names[i]) == 0)
{
bidiclass = i/2;
break;
}
}
if (i >= sizeof(bd_names)/sizeof(char *))
{
printf("** Unrecognized bidi class name \"%s\"\n", value);
return;
}
}
}
else if (strcmp(CS name, "bidi_control") == 0 ||
strcmp(CS name, "bidicontrol") == 0)
{
bidicontrol = TRUE;
s = value_start; /* No data */
}
else
{
printf("** Unrecognized property name \"%s\"\n", name);
@ -523,7 +629,8 @@ while (*s != 0)
}
}
if (script < 0 && scriptx_count == 0 && type < 0 && gbreak < 0)
if (script < 0 && scriptx_count == 0 && type < 0 && gbreak < 0 &&
bidiclass < 0 && !bidicontrol)
{
printf("** No properties specified\n");
return;
@ -608,6 +715,20 @@ for (c = 0; c <= 0x10ffff; c++)
}
}
if (bidiclass >= 0)
{
if (bidiclass_not)
{
if (bidiclass == UCD_BIDICLASS(c)) continue;
}
else
{
if (bidiclass != UCD_BIDICLASS(c)) continue;
}
}
if (bidicontrol && UCD_BIDICONTROL(c) == 0) continue;
/* All conditions are met. Look for runs. */
ucd = GET_UCD(c);
@ -663,9 +784,9 @@ if (strcmp(CS name, "findprop") == 0)
{
while (*s != 0)
{
unsigned int c;
unsigned int c;
unsigned char *endptr;
t = s;
t = s;
if (strncmp(CS t, "U+", 2) == 0) t += 2;
c = strtoul(CS t, CSS(&endptr), 16);
if (*endptr != 0 && !isspace(*endptr))
@ -673,13 +794,13 @@ if (strcmp(CS name, "findprop") == 0)
while (*endptr != 0 && !isspace(*endptr)) endptr++;
printf("** Invalid hex number: ignored \"%.*s\"\n", (int)(endptr-s), s);
}
else
else
{
if (c > 0x10ffff)
if (c > 0x10ffff)
printf("** U+%x is too big for a Unicode code point\n", c);
else
else
print_prop(c, TRUE);
}
}
s = endptr;
while (isspace(*s)) s++;
}
@ -689,7 +810,7 @@ else if (strcmp(CS name, "find") == 0)
{
find_chars(s);
}
else if (strcmp(CS name, "list") == 0)
{
while (*s != 0)
@ -698,38 +819,45 @@ else if (strcmp(CS name, "list") == 0)
for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
*t = 0;
while (isspace(*s)) s++;
if (strcmp(CS name, "script") == 0 || strcmp(CS name, "scripts") == 0)
{
for (i = 0; i < PRIV(utt_size); i++)
for (i = 0; i < PRIV(utt_size); i++)
if (PRIV(utt)[i].type == PT_SC)
printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
}
else if (strcmp(CS name, "type") == 0 || strcmp(CS name, "types") == 0)
{
for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
printf("%s %s\n", type_names[i], type_names[i+1]);
}
printf("%s %s\n", type_names[i], type_names[i+1]);
}
else if (strcmp(CS name, "gbreak") == 0 || strcmp(CS name, "gbreaks") == 0)
{
for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
{
if (gb_names[i+1][0] != 0)
if (gb_names[i+1][0] != 0)
printf("%-3s (%s)\n", gb_names[i], gb_names[i+1]);
else
else
printf("%s\n", gb_names[i]);
}
}
}
}
else
else if (strcmp(CS name, "bidi") == 0 ||
strcmp(CS name, "bidiclasses") == 0)
{
printf("** Unknown property \"%s\"\n", name);
for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
printf("%3s %s\n", bd_names[i], bd_names[i+1]);
}
else
{
printf("** Unknown property \"%s\"\n", name);
break;
}
}
}
}
}
}
else printf("** Unknown test command \"%s\"\n", name);
}
@ -751,32 +879,32 @@ if (argc > 1 && strcmp(argv[1], "-s") == 0)
{
show_character = TRUE;
first_arg++;
}
}
if (argc > first_arg)
{
int i;
BOOL hexfirst = TRUE;
char *arg = argv[first_arg];
BOOL hexfirst = TRUE;
char *arg = argv[first_arg];
unsigned char *s = buffer;
if (strncmp(arg, "U+", 2) != 0 && !isdigit(*arg))
if (strncmp(arg, "U+", 2) != 0 && !isdigit(*arg))
{
while (*arg != 0)
while (*arg != 0)
{
if (!isxdigit(*arg++)) { hexfirst = FALSE; break; }
}
}
if (!isxdigit(*arg++)) { hexfirst = FALSE; break; }
}
}
if (hexfirst)
{
strcpy(CS s, "findprop ");
s += 9;
}
for (i = first_arg; i < argc; i++)
{
s += sprintf(CS s, "%s ", argv[i]);
s += sprintf(CS s, "%s ", argv[i]);
}
process_command_line(buffer);
@ -812,7 +940,7 @@ for(;;)
if (fgets(CS buffer, sizeof(buffer), stdin) == NULL) break;
if (!interactive) printf("%s", buffer);
}
process_command_line(buffer);
}

View File

@ -46,3 +46,5 @@ findprop 32ff
findprop 1f16d
findprop U+10e93 U+10eaa
findprop 0602 202a 202b 202c 2068 2069 202d 202e 2067

View File

@ -4,3 +4,14 @@ find type Sk
find type Pd
find gbreak LVT
find script Old_Uyghur
find bidi PDF
find bidi CS
find bidi CS type Sm
find bidi B
find bidi FSI
find bidi PDI
find bidi RLI
find bidi RLO
find bidi S
find bidi WS
find bidi_control

View File

@ -1,398 +1,409 @@
findprop 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
U+0000 Control: Control, Common, Control
U+0001 Control: Control, Common, Control
U+0002 Control: Control, Common, Control
U+0003 Control: Control, Common, Control
U+0004 Control: Control, Common, Control
U+0005 Control: Control, Common, Control
U+0006 Control: Control, Common, Control
U+0007 Control: Control, Common, Control
U+0008 Control: Control, Common, Control
U+0009 Control: Control, Common, Control
U+000A Control: Control, Common, LF
U+000B Control: Control, Common, Control
U+000C Control: Control, Common, Control
U+000D Control: Control, Common, CR
U+000E Control: Control, Common, Control
U+000F Control: Control, Common, Control
U+0000 BN Control: Control, Common, Control
U+0001 BN Control: Control, Common, Control
U+0002 BN Control: Control, Common, Control
U+0003 BN Control: Control, Common, Control
U+0004 BN Control: Control, Common, Control
U+0005 BN Control: Control, Common, Control
U+0006 BN Control: Control, Common, Control
U+0007 BN Control: Control, Common, Control
U+0008 BN Control: Control, Common, Control
U+0009 S Control: Control, Common, Control
U+000A B Control: Control, Common, LF
U+000B S Control: Control, Common, Control
U+000C WS Control: Control, Common, Control
U+000D B Control: Control, Common, CR
U+000E BN Control: Control, Common, Control
U+000F BN Control: Control, Common, Control
findprop 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
U+0010 Control: Control, Common, Control
U+0011 Control: Control, Common, Control
U+0012 Control: Control, Common, Control
U+0013 Control: Control, Common, Control
U+0014 Control: Control, Common, Control
U+0015 Control: Control, Common, Control
U+0016 Control: Control, Common, Control
U+0017 Control: Control, Common, Control
U+0018 Control: Control, Common, Control
U+0019 Control: Control, Common, Control
U+001A Control: Control, Common, Control
U+001B Control: Control, Common, Control
U+001C Control: Control, Common, Control
U+001D Control: Control, Common, Control
U+001E Control: Control, Common, Control
U+001F Control: Control, Common, Control
U+0010 BN Control: Control, Common, Control
U+0011 BN Control: Control, Common, Control
U+0012 BN Control: Control, Common, Control
U+0013 BN Control: Control, Common, Control
U+0014 BN Control: Control, Common, Control
U+0015 BN Control: Control, Common, Control
U+0016 BN Control: Control, Common, Control
U+0017 BN Control: Control, Common, Control
U+0018 BN Control: Control, Common, Control
U+0019 BN Control: Control, Common, Control
U+001A BN Control: Control, Common, Control
U+001B BN Control: Control, Common, Control
U+001C B Control: Control, Common, Control
U+001D B Control: Control, Common, Control
U+001E B Control: Control, Common, Control
U+001F S Control: Control, Common, Control
findprop 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
U+0020 Separator: Space separator, Common, Other
U+0021 Punctuation: Other punctuation, Common, Other
U+0022 Punctuation: Other punctuation, Common, Other
U+0023 Punctuation: Other punctuation, Common, Other
U+0024 Symbol: Currency symbol, Common, Other
U+0025 Punctuation: Other punctuation, Common, Other
U+0026 Punctuation: Other punctuation, Common, Other
U+0027 Punctuation: Other punctuation, Common, Other
U+0028 Punctuation: Open punctuation, Common, Other
U+0029 Punctuation: Close punctuation, Common, Other
U+002A Punctuation: Other punctuation, Common, Other
U+002B Symbol: Mathematical symbol, Common, Other
U+002C Punctuation: Other punctuation, Common, Other
U+002D Punctuation: Dash punctuation, Common, Other
U+002E Punctuation: Other punctuation, Common, Other
U+002F Punctuation: Other punctuation, Common, Other
U+0020 WS Separator: Space separator, Common, Other
U+0021 ON Punctuation: Other punctuation, Common, Other
U+0022 ON Punctuation: Other punctuation, Common, Other
U+0023 ET Punctuation: Other punctuation, Common, Other
U+0024 ET Symbol: Currency symbol, Common, Other
U+0025 ET Punctuation: Other punctuation, Common, Other
U+0026 ON Punctuation: Other punctuation, Common, Other
U+0027 ON Punctuation: Other punctuation, Common, Other
U+0028 ON Punctuation: Open punctuation, Common, Other
U+0029 ON Punctuation: Close punctuation, Common, Other
U+002A ON Punctuation: Other punctuation, Common, Other
U+002B ES Symbol: Mathematical symbol, Common, Other
U+002C CS Punctuation: Other punctuation, Common, Other
U+002D ES Punctuation: Dash punctuation, Common, Other
U+002E CS Punctuation: Other punctuation, Common, Other
U+002F CS Punctuation: Other punctuation, Common, Other
findprop 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
U+0030 Number: Decimal number, Common, Other
U+0031 Number: Decimal number, Common, Other
U+0032 Number: Decimal number, Common, Other
U+0033 Number: Decimal number, Common, Other
U+0034 Number: Decimal number, Common, Other
U+0035 Number: Decimal number, Common, Other
U+0036 Number: Decimal number, Common, Other
U+0037 Number: Decimal number, Common, Other
U+0038 Number: Decimal number, Common, Other
U+0039 Number: Decimal number, Common, Other
U+003A Punctuation: Other punctuation, Common, Other
U+003B Punctuation: Other punctuation, Common, Other
U+003C Symbol: Mathematical symbol, Common, Other
U+003D Symbol: Mathematical symbol, Common, Other
U+003E Symbol: Mathematical symbol, Common, Other
U+003F Punctuation: Other punctuation, Common, Other
U+0030 EN Number: Decimal number, Common, Other
U+0031 EN Number: Decimal number, Common, Other
U+0032 EN Number: Decimal number, Common, Other
U+0033 EN Number: Decimal number, Common, Other
U+0034 EN Number: Decimal number, Common, Other
U+0035 EN Number: Decimal number, Common, Other
U+0036 EN Number: Decimal number, Common, Other
U+0037 EN Number: Decimal number, Common, Other
U+0038 EN Number: Decimal number, Common, Other
U+0039 EN Number: Decimal number, Common, Other
U+003A CS Punctuation: Other punctuation, Common, Other
U+003B ON Punctuation: Other punctuation, Common, Other
U+003C ON Symbol: Mathematical symbol, Common, Other
U+003D ON Symbol: Mathematical symbol, Common, Other
U+003E ON Symbol: Mathematical symbol, Common, Other
U+003F ON Punctuation: Other punctuation, Common, Other
findprop 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f
U+0040 Punctuation: Other punctuation, Common, Other
U+0041 Letter: Upper case letter, Latin, Other, U+0061
U+0042 Letter: Upper case letter, Latin, Other, U+0062
U+0043 Letter: Upper case letter, Latin, Other, U+0063
U+0044 Letter: Upper case letter, Latin, Other, U+0064
U+0045 Letter: Upper case letter, Latin, Other, U+0065
U+0046 Letter: Upper case letter, Latin, Other, U+0066
U+0047 Letter: Upper case letter, Latin, Other, U+0067
U+0048 Letter: Upper case letter, Latin, Other, U+0068
U+0049 Letter: Upper case letter, Latin, Other, U+0069
U+004A Letter: Upper case letter, Latin, Other, U+006A
U+004B Letter: Upper case letter, Latin, Other, U+006B, U+212A
U+004C Letter: Upper case letter, Latin, Other, U+006C
U+004D Letter: Upper case letter, Latin, Other, U+006D
U+004E Letter: Upper case letter, Latin, Other, U+006E
U+004F Letter: Upper case letter, Latin, Other, U+006F
U+0040 ON Punctuation: Other punctuation, Common, Other
U+0041 L Letter: Upper case letter, Latin, Other, U+0061
U+0042 L Letter: Upper case letter, Latin, Other, U+0062
U+0043 L Letter: Upper case letter, Latin, Other, U+0063
U+0044 L Letter: Upper case letter, Latin, Other, U+0064
U+0045 L Letter: Upper case letter, Latin, Other, U+0065
U+0046 L Letter: Upper case letter, Latin, Other, U+0066
U+0047 L Letter: Upper case letter, Latin, Other, U+0067
U+0048 L Letter: Upper case letter, Latin, Other, U+0068
U+0049 L Letter: Upper case letter, Latin, Other, U+0069
U+004A L Letter: Upper case letter, Latin, Other, U+006A
U+004B L Letter: Upper case letter, Latin, Other, U+006B, U+212A
U+004C L Letter: Upper case letter, Latin, Other, U+006C
U+004D L Letter: Upper case letter, Latin, Other, U+006D
U+004E L Letter: Upper case letter, Latin, Other, U+006E
U+004F L Letter: Upper case letter, Latin, Other, U+006F
findprop 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f
U+0050 Letter: Upper case letter, Latin, Other, U+0070
U+0051 Letter: Upper case letter, Latin, Other, U+0071
U+0052 Letter: Upper case letter, Latin, Other, U+0072
U+0053 Letter: Upper case letter, Latin, Other, U+0073, U+017F
U+0054 Letter: Upper case letter, Latin, Other, U+0074
U+0055 Letter: Upper case letter, Latin, Other, U+0075
U+0056 Letter: Upper case letter, Latin, Other, U+0076
U+0057 Letter: Upper case letter, Latin, Other, U+0077
U+0058 Letter: Upper case letter, Latin, Other, U+0078
U+0059 Letter: Upper case letter, Latin, Other, U+0079
U+005A Letter: Upper case letter, Latin, Other, U+007A
U+005B Punctuation: Open punctuation, Common, Other
U+005C Punctuation: Other punctuation, Common, Other
U+005D Punctuation: Close punctuation, Common, Other
U+005E Symbol: Modifier symbol, Common, Other
U+005F Punctuation: Connector punctuation, Common, Other
U+0050 L Letter: Upper case letter, Latin, Other, U+0070
U+0051 L Letter: Upper case letter, Latin, Other, U+0071
U+0052 L Letter: Upper case letter, Latin, Other, U+0072
U+0053 L Letter: Upper case letter, Latin, Other, U+0073, U+017F
U+0054 L Letter: Upper case letter, Latin, Other, U+0074
U+0055 L Letter: Upper case letter, Latin, Other, U+0075
U+0056 L Letter: Upper case letter, Latin, Other, U+0076
U+0057 L Letter: Upper case letter, Latin, Other, U+0077
U+0058 L Letter: Upper case letter, Latin, Other, U+0078
U+0059 L Letter: Upper case letter, Latin, Other, U+0079
U+005A L Letter: Upper case letter, Latin, Other, U+007A
U+005B ON Punctuation: Open punctuation, Common, Other
U+005C ON Punctuation: Other punctuation, Common, Other
U+005D ON Punctuation: Close punctuation, Common, Other
U+005E ON Symbol: Modifier symbol, Common, Other
U+005F ON Punctuation: Connector punctuation, Common, Other
findprop 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f
U+0060 Symbol: Modifier symbol, Common, Other
U+0061 Letter: Lower case letter, Latin, Other, U+0041
U+0062 Letter: Lower case letter, Latin, Other, U+0042
U+0063 Letter: Lower case letter, Latin, Other, U+0043
U+0064 Letter: Lower case letter, Latin, Other, U+0044
U+0065 Letter: Lower case letter, Latin, Other, U+0045
U+0066 Letter: Lower case letter, Latin, Other, U+0046
U+0067 Letter: Lower case letter, Latin, Other, U+0047
U+0068 Letter: Lower case letter, Latin, Other, U+0048
U+0069 Letter: Lower case letter, Latin, Other, U+0049
U+006A Letter: Lower case letter, Latin, Other, U+004A
U+006B Letter: Lower case letter, Latin, Other, U+004B, U+212A
U+006C Letter: Lower case letter, Latin, Other, U+004C
U+006D Letter: Lower case letter, Latin, Other, U+004D
U+006E Letter: Lower case letter, Latin, Other, U+004E
U+006F Letter: Lower case letter, Latin, Other, U+004F
U+0060 ON Symbol: Modifier symbol, Common, Other
U+0061 L Letter: Lower case letter, Latin, Other, U+0041
U+0062 L Letter: Lower case letter, Latin, Other, U+0042
U+0063 L Letter: Lower case letter, Latin, Other, U+0043
U+0064 L Letter: Lower case letter, Latin, Other, U+0044
U+0065 L Letter: Lower case letter, Latin, Other, U+0045
U+0066 L Letter: Lower case letter, Latin, Other, U+0046
U+0067 L Letter: Lower case letter, Latin, Other, U+0047
U+0068 L Letter: Lower case letter, Latin, Other, U+0048
U+0069 L Letter: Lower case letter, Latin, Other, U+0049
U+006A L Letter: Lower case letter, Latin, Other, U+004A
U+006B L Letter: Lower case letter, Latin, Other, U+004B, U+212A
U+006C L Letter: Lower case letter, Latin, Other, U+004C
U+006D L Letter: Lower case letter, Latin, Other, U+004D
U+006E L Letter: Lower case letter, Latin, Other, U+004E
U+006F L Letter: Lower case letter, Latin, Other, U+004F
findprop 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f
U+0070 Letter: Lower case letter, Latin, Other, U+0050
U+0071 Letter: Lower case letter, Latin, Other, U+0051
U+0072 Letter: Lower case letter, Latin, Other, U+0052
U+0073 Letter: Lower case letter, Latin, Other, U+0053, U+017F
U+0074 Letter: Lower case letter, Latin, Other, U+0054
U+0075 Letter: Lower case letter, Latin, Other, U+0055
U+0076 Letter: Lower case letter, Latin, Other, U+0056
U+0077 Letter: Lower case letter, Latin, Other, U+0057
U+0078 Letter: Lower case letter, Latin, Other, U+0058
U+0079 Letter: Lower case letter, Latin, Other, U+0059
U+007A Letter: Lower case letter, Latin, Other, U+005A
U+007B Punctuation: Open punctuation, Common, Other
U+007C Symbol: Mathematical symbol, Common, Other
U+007D Punctuation: Close punctuation, Common, Other
U+007E Symbol: Mathematical symbol, Common, Other
U+007F Control: Control, Common, Control
U+0070 L Letter: Lower case letter, Latin, Other, U+0050
U+0071 L Letter: Lower case letter, Latin, Other, U+0051
U+0072 L Letter: Lower case letter, Latin, Other, U+0052
U+0073 L Letter: Lower case letter, Latin, Other, U+0053, U+017F
U+0074 L Letter: Lower case letter, Latin, Other, U+0054
U+0075 L Letter: Lower case letter, Latin, Other, U+0055
U+0076 L Letter: Lower case letter, Latin, Other, U+0056
U+0077 L Letter: Lower case letter, Latin, Other, U+0057
U+0078 L Letter: Lower case letter, Latin, Other, U+0058
U+0079 L Letter: Lower case letter, Latin, Other, U+0059
U+007A L Letter: Lower case letter, Latin, Other, U+005A
U+007B ON Punctuation: Open punctuation, Common, Other
U+007C ON Symbol: Mathematical symbol, Common, Other
U+007D ON Punctuation: Close punctuation, Common, Other
U+007E ON Symbol: Mathematical symbol, Common, Other
U+007F BN Control: Control, Common, Control
findprop 80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f
U+0080 Control: Control, Common, Control
U+0081 Control: Control, Common, Control
U+0082 Control: Control, Common, Control
U+0083 Control: Control, Common, Control
U+0084 Control: Control, Common, Control
U+0085 Control: Control, Common, Control
U+0086 Control: Control, Common, Control
U+0087 Control: Control, Common, Control
U+0088 Control: Control, Common, Control
U+0089 Control: Control, Common, Control
U+008A Control: Control, Common, Control
U+008B Control: Control, Common, Control
U+008C Control: Control, Common, Control
U+008D Control: Control, Common, Control
U+008E Control: Control, Common, Control
U+008F Control: Control, Common, Control
U+0080 BN Control: Control, Common, Control
U+0081 BN Control: Control, Common, Control
U+0082 BN Control: Control, Common, Control
U+0083 BN Control: Control, Common, Control
U+0084 BN Control: Control, Common, Control
U+0085 B Control: Control, Common, Control
U+0086 BN Control: Control, Common, Control
U+0087 BN Control: Control, Common, Control
U+0088 BN Control: Control, Common, Control
U+0089 BN Control: Control, Common, Control
U+008A BN Control: Control, Common, Control
U+008B BN Control: Control, Common, Control
U+008C BN Control: Control, Common, Control
U+008D BN Control: Control, Common, Control
U+008E BN Control: Control, Common, Control
U+008F BN Control: Control, Common, Control
findprop 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f
U+0090 Control: Control, Common, Control
U+0091 Control: Control, Common, Control
U+0092 Control: Control, Common, Control
U+0093 Control: Control, Common, Control
U+0094 Control: Control, Common, Control
U+0095 Control: Control, Common, Control
U+0096 Control: Control, Common, Control
U+0097 Control: Control, Common, Control
U+0098 Control: Control, Common, Control
U+0099 Control: Control, Common, Control
U+009A Control: Control, Common, Control
U+009B Control: Control, Common, Control
U+009C Control: Control, Common, Control
U+009D Control: Control, Common, Control
U+009E Control: Control, Common, Control
U+009F Control: Control, Common, Control
U+0090 BN Control: Control, Common, Control
U+0091 BN Control: Control, Common, Control
U+0092 BN Control: Control, Common, Control
U+0093 BN Control: Control, Common, Control
U+0094 BN Control: Control, Common, Control
U+0095 BN Control: Control, Common, Control
U+0096 BN Control: Control, Common, Control
U+0097 BN Control: Control, Common, Control
U+0098 BN Control: Control, Common, Control
U+0099 BN Control: Control, Common, Control
U+009A BN Control: Control, Common, Control
U+009B BN Control: Control, Common, Control
U+009C BN Control: Control, Common, Control
U+009D BN Control: Control, Common, Control
U+009E BN Control: Control, Common, Control
U+009F BN Control: Control, Common, Control
findprop a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af
U+00A0 Separator: Space separator, Common, Other
U+00A1 Punctuation: Other punctuation, Common, Other
U+00A2 Symbol: Currency symbol, Common, Other
U+00A3 Symbol: Currency symbol, Common, Other
U+00A4 Symbol: Currency symbol, Common, Other
U+00A5 Symbol: Currency symbol, Common, Other
U+00A6 Symbol: Other symbol, Common, Other
U+00A7 Punctuation: Other punctuation, Common, Other
U+00A8 Symbol: Modifier symbol, Common, Other
U+00A9 Symbol: Other symbol, Common, Extended Pictographic
U+00AA Letter: Other letter, Latin, Other
U+00AB Punctuation: Initial punctuation, Common, Other
U+00AC Symbol: Mathematical symbol, Common, Other
U+00AD Control: Format, Common, Control
U+00AE Symbol: Other symbol, Common, Extended Pictographic
U+00AF Symbol: Modifier symbol, Common, Other
U+00A0 CS Separator: Space separator, Common, Other
U+00A1 ON Punctuation: Other punctuation, Common, Other
U+00A2 ET Symbol: Currency symbol, Common, Other
U+00A3 ET Symbol: Currency symbol, Common, Other
U+00A4 ET Symbol: Currency symbol, Common, Other
U+00A5 ET Symbol: Currency symbol, Common, Other
U+00A6 ON Symbol: Other symbol, Common, Other
U+00A7 ON Punctuation: Other punctuation, Common, Other
U+00A8 ON Symbol: Modifier symbol, Common, Other
U+00A9 ON Symbol: Other symbol, Common, Extended Pictographic
U+00AA L Letter: Other letter, Latin, Other
U+00AB ON Punctuation: Initial punctuation, Common, Other
U+00AC ON Symbol: Mathematical symbol, Common, Other
U+00AD BN Control: Format, Common, Control
U+00AE ON Symbol: Other symbol, Common, Extended Pictographic
U+00AF ON Symbol: Modifier symbol, Common, Other
findprop b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf
U+00B0 Symbol: Other symbol, Common, Other
U+00B1 Symbol: Mathematical symbol, Common, Other
U+00B2 Number: Other number, Common, Other
U+00B3 Number: Other number, Common, Other
U+00B4 Symbol: Modifier symbol, Common, Other
U+00B5 Letter: Lower case letter, Common, Other, U+03BC, U+039C
U+00B6 Punctuation: Other punctuation, Common, Other
U+00B7 Punctuation: Other punctuation, Common, Other
U+00B8 Symbol: Modifier symbol, Common, Other
U+00B9 Number: Other number, Common, Other
U+00BA Letter: Other letter, Latin, Other
U+00BB Punctuation: Final punctuation, Common, Other
U+00BC Number: Other number, Common, Other
U+00BD Number: Other number, Common, Other
U+00BE Number: Other number, Common, Other
U+00BF Punctuation: Other punctuation, Common, Other
U+00B0 ET Symbol: Other symbol, Common, Other
U+00B1 ET Symbol: Mathematical symbol, Common, Other
U+00B2 EN Number: Other number, Common, Other
U+00B3 EN Number: Other number, Common, Other
U+00B4 ON Symbol: Modifier symbol, Common, Other
U+00B5 L Letter: Lower case letter, Common, Other, U+03BC, U+039C
U+00B6 ON Punctuation: Other punctuation, Common, Other
U+00B7 ON Punctuation: Other punctuation, Common, Other
U+00B8 ON Symbol: Modifier symbol, Common, Other
U+00B9 EN Number: Other number, Common, Other
U+00BA L Letter: Other letter, Latin, Other
U+00BB ON Punctuation: Final punctuation, Common, Other
U+00BC ON Number: Other number, Common, Other
U+00BD ON Number: Other number, Common, Other
U+00BE ON Number: Other number, Common, Other
U+00BF ON Punctuation: Other punctuation, Common, Other
findprop c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf
U+00C0 Letter: Upper case letter, Latin, Other, U+00E0
U+00C1 Letter: Upper case letter, Latin, Other, U+00E1
U+00C2 Letter: Upper case letter, Latin, Other, U+00E2
U+00C3 Letter: Upper case letter, Latin, Other, U+00E3
U+00C4 Letter: Upper case letter, Latin, Other, U+00E4
U+00C5 Letter: Upper case letter, Latin, Other, U+00E5, U+212B
U+00C6 Letter: Upper case letter, Latin, Other, U+00E6
U+00C7 Letter: Upper case letter, Latin, Other, U+00E7
U+00C8 Letter: Upper case letter, Latin, Other, U+00E8
U+00C9 Letter: Upper case letter, Latin, Other, U+00E9
U+00CA Letter: Upper case letter, Latin, Other, U+00EA
U+00CB Letter: Upper case letter, Latin, Other, U+00EB
U+00CC Letter: Upper case letter, Latin, Other, U+00EC
U+00CD Letter: Upper case letter, Latin, Other, U+00ED
U+00CE Letter: Upper case letter, Latin, Other, U+00EE
U+00CF Letter: Upper case letter, Latin, Other, U+00EF
U+00C0 L Letter: Upper case letter, Latin, Other, U+00E0
U+00C1 L Letter: Upper case letter, Latin, Other, U+00E1
U+00C2 L Letter: Upper case letter, Latin, Other, U+00E2
U+00C3 L Letter: Upper case letter, Latin, Other, U+00E3
U+00C4 L Letter: Upper case letter, Latin, Other, U+00E4
U+00C5 L Letter: Upper case letter, Latin, Other, U+00E5, U+212B
U+00C6 L Letter: Upper case letter, Latin, Other, U+00E6
U+00C7 L Letter: Upper case letter, Latin, Other, U+00E7
U+00C8 L Letter: Upper case letter, Latin, Other, U+00E8
U+00C9 L Letter: Upper case letter, Latin, Other, U+00E9
U+00CA L Letter: Upper case letter, Latin, Other, U+00EA
U+00CB L Letter: Upper case letter, Latin, Other, U+00EB
U+00CC L Letter: Upper case letter, Latin, Other, U+00EC
U+00CD L Letter: Upper case letter, Latin, Other, U+00ED
U+00CE L Letter: Upper case letter, Latin, Other, U+00EE
U+00CF L Letter: Upper case letter, Latin, Other, U+00EF
findprop d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df
U+00D0 Letter: Upper case letter, Latin, Other, U+00F0
U+00D1 Letter: Upper case letter, Latin, Other, U+00F1
U+00D2 Letter: Upper case letter, Latin, Other, U+00F2
U+00D3 Letter: Upper case letter, Latin, Other, U+00F3
U+00D4 Letter: Upper case letter, Latin, Other, U+00F4
U+00D5 Letter: Upper case letter, Latin, Other, U+00F5
U+00D6 Letter: Upper case letter, Latin, Other, U+00F6
U+00D7 Symbol: Mathematical symbol, Common, Other
U+00D8 Letter: Upper case letter, Latin, Other, U+00F8
U+00D9 Letter: Upper case letter, Latin, Other, U+00F9
U+00DA Letter: Upper case letter, Latin, Other, U+00FA
U+00DB Letter: Upper case letter, Latin, Other, U+00FB
U+00DC Letter: Upper case letter, Latin, Other, U+00FC
U+00DD Letter: Upper case letter, Latin, Other, U+00FD
U+00DE Letter: Upper case letter, Latin, Other, U+00FE
U+00DF Letter: Lower case letter, Latin, Other, U+1E9E
U+00D0 L Letter: Upper case letter, Latin, Other, U+00F0
U+00D1 L Letter: Upper case letter, Latin, Other, U+00F1
U+00D2 L Letter: Upper case letter, Latin, Other, U+00F2
U+00D3 L Letter: Upper case letter, Latin, Other, U+00F3
U+00D4 L Letter: Upper case letter, Latin, Other, U+00F4
U+00D5 L Letter: Upper case letter, Latin, Other, U+00F5
U+00D6 L Letter: Upper case letter, Latin, Other, U+00F6
U+00D7 ON Symbol: Mathematical symbol, Common, Other
U+00D8 L Letter: Upper case letter, Latin, Other, U+00F8
U+00D9 L Letter: Upper case letter, Latin, Other, U+00F9
U+00DA L Letter: Upper case letter, Latin, Other, U+00FA
U+00DB L Letter: Upper case letter, Latin, Other, U+00FB
U+00DC L Letter: Upper case letter, Latin, Other, U+00FC
U+00DD L Letter: Upper case letter, Latin, Other, U+00FD
U+00DE L Letter: Upper case letter, Latin, Other, U+00FE
U+00DF L Letter: Lower case letter, Latin, Other, U+1E9E
findprop e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef
U+00E0 Letter: Lower case letter, Latin, Other, U+00C0
U+00E1 Letter: Lower case letter, Latin, Other, U+00C1
U+00E2 Letter: Lower case letter, Latin, Other, U+00C2
U+00E3 Letter: Lower case letter, Latin, Other, U+00C3
U+00E4 Letter: Lower case letter, Latin, Other, U+00C4
U+00E5 Letter: Lower case letter, Latin, Other, U+00C5, U+212B
U+00E6 Letter: Lower case letter, Latin, Other, U+00C6
U+00E7 Letter: Lower case letter, Latin, Other, U+00C7
U+00E8 Letter: Lower case letter, Latin, Other, U+00C8
U+00E9 Letter: Lower case letter, Latin, Other, U+00C9
U+00EA Letter: Lower case letter, Latin, Other, U+00CA
U+00EB Letter: Lower case letter, Latin, Other, U+00CB
U+00EC Letter: Lower case letter, Latin, Other, U+00CC
U+00ED Letter: Lower case letter, Latin, Other, U+00CD
U+00EE Letter: Lower case letter, Latin, Other, U+00CE
U+00EF Letter: Lower case letter, Latin, Other, U+00CF
U+00E0 L Letter: Lower case letter, Latin, Other, U+00C0
U+00E1 L Letter: Lower case letter, Latin, Other, U+00C1
U+00E2 L Letter: Lower case letter, Latin, Other, U+00C2
U+00E3 L Letter: Lower case letter, Latin, Other, U+00C3
U+00E4 L Letter: Lower case letter, Latin, Other, U+00C4
U+00E5 L Letter: Lower case letter, Latin, Other, U+00C5, U+212B
U+00E6 L Letter: Lower case letter, Latin, Other, U+00C6
U+00E7 L Letter: Lower case letter, Latin, Other, U+00C7
U+00E8 L Letter: Lower case letter, Latin, Other, U+00C8
U+00E9 L Letter: Lower case letter, Latin, Other, U+00C9
U+00EA L Letter: Lower case letter, Latin, Other, U+00CA
U+00EB L Letter: Lower case letter, Latin, Other, U+00CB
U+00EC L Letter: Lower case letter, Latin, Other, U+00CC
U+00ED L Letter: Lower case letter, Latin, Other, U+00CD
U+00EE L Letter: Lower case letter, Latin, Other, U+00CE
U+00EF L Letter: Lower case letter, Latin, Other, U+00CF
findprop f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff
U+00F0 Letter: Lower case letter, Latin, Other, U+00D0
U+00F1 Letter: Lower case letter, Latin, Other, U+00D1
U+00F2 Letter: Lower case letter, Latin, Other, U+00D2
U+00F3 Letter: Lower case letter, Latin, Other, U+00D3
U+00F4 Letter: Lower case letter, Latin, Other, U+00D4
U+00F5 Letter: Lower case letter, Latin, Other, U+00D5
U+00F6 Letter: Lower case letter, Latin, Other, U+00D6
U+00F7 Symbol: Mathematical symbol, Common, Other
U+00F8 Letter: Lower case letter, Latin, Other, U+00D8
U+00F9 Letter: Lower case letter, Latin, Other, U+00D9
U+00FA Letter: Lower case letter, Latin, Other, U+00DA
U+00FB Letter: Lower case letter, Latin, Other, U+00DB
U+00FC Letter: Lower case letter, Latin, Other, U+00DC
U+00FD Letter: Lower case letter, Latin, Other, U+00DD
U+00FE Letter: Lower case letter, Latin, Other, U+00DE
U+00FF Letter: Lower case letter, Latin, Other, U+0178
U+00F0 L Letter: Lower case letter, Latin, Other, U+00D0
U+00F1 L Letter: Lower case letter, Latin, Other, U+00D1
U+00F2 L Letter: Lower case letter, Latin, Other, U+00D2
U+00F3 L Letter: Lower case letter, Latin, Other, U+00D3
U+00F4 L Letter: Lower case letter, Latin, Other, U+00D4
U+00F5 L Letter: Lower case letter, Latin, Other, U+00D5
U+00F6 L Letter: Lower case letter, Latin, Other, U+00D6
U+00F7 ON Symbol: Mathematical symbol, Common, Other
U+00F8 L Letter: Lower case letter, Latin, Other, U+00D8
U+00F9 L Letter: Lower case letter, Latin, Other, U+00D9
U+00FA L Letter: Lower case letter, Latin, Other, U+00DA
U+00FB L Letter: Lower case letter, Latin, Other, U+00DB
U+00FC L Letter: Lower case letter, Latin, Other, U+00DC
U+00FD L Letter: Lower case letter, Latin, Other, U+00DD
U+00FE L Letter: Lower case letter, Latin, Other, U+00DE
U+00FF L Letter: Lower case letter, Latin, Other, U+0178
findprop 0100 0101 0102 0103 0104 0105 0106
U+0100 Letter: Upper case letter, Latin, Other, U+0101
U+0101 Letter: Lower case letter, Latin, Other, U+0100
U+0102 Letter: Upper case letter, Latin, Other, U+0103
U+0103 Letter: Lower case letter, Latin, Other, U+0102
U+0104 Letter: Upper case letter, Latin, Other, U+0105
U+0105 Letter: Lower case letter, Latin, Other, U+0104
U+0106 Letter: Upper case letter, Latin, Other, U+0107
U+0100 L Letter: Upper case letter, Latin, Other, U+0101
U+0101 L Letter: Lower case letter, Latin, Other, U+0100
U+0102 L Letter: Upper case letter, Latin, Other, U+0103
U+0103 L Letter: Lower case letter, Latin, Other, U+0102
U+0104 L Letter: Upper case letter, Latin, Other, U+0105
U+0105 L Letter: Lower case letter, Latin, Other, U+0104
U+0106 L Letter: Upper case letter, Latin, Other, U+0107
findprop ffe0 ffe1 ffe2 ffe3 ffe4 ffe5 ffe6 ffe7
U+FFE0 Symbol: Currency symbol, Common, Other
U+FFE1 Symbol: Currency symbol, Common, Other
U+FFE2 Symbol: Mathematical symbol, Common, Other
U+FFE3 Symbol: Modifier symbol, Common, Other
U+FFE4 Symbol: Other symbol, Common, Other
U+FFE5 Symbol: Currency symbol, Common, Other
U+FFE6 Symbol: Currency symbol, Common, Other
U+FFE7 Control: Unassigned, Unknown, Other
U+FFE0 ET Symbol: Currency symbol, Common, Other
U+FFE1 ET Symbol: Currency symbol, Common, Other
U+FFE2 ON Symbol: Mathematical symbol, Common, Other
U+FFE3 ON Symbol: Modifier symbol, Common, Other
U+FFE4 ON Symbol: Other symbol, Common, Other
U+FFE5 ET Symbol: Currency symbol, Common, Other
U+FFE6 ET Symbol: Currency symbol, Common, Other
U+FFE7 L Control: Unassigned, Unknown, Other
findprop ffe8 ffe9 ffea ffeb ffec ffed ffee ffef
U+FFE8 Symbol: Other symbol, Common, Other
U+FFE9 Symbol: Mathematical symbol, Common, Other
U+FFEA Symbol: Mathematical symbol, Common, Other
U+FFEB Symbol: Mathematical symbol, Common, Other
U+FFEC Symbol: Mathematical symbol, Common, Other
U+FFED Symbol: Other symbol, Common, Other
U+FFEE Symbol: Other symbol, Common, Other
U+FFEF Control: Unassigned, Unknown, Other
U+FFE8 ON Symbol: Other symbol, Common, Other
U+FFE9 ON Symbol: Mathematical symbol, Common, Other
U+FFEA ON Symbol: Mathematical symbol, Common, Other
U+FFEB ON Symbol: Mathematical symbol, Common, Other
U+FFEC ON Symbol: Mathematical symbol, Common, Other
U+FFED ON Symbol: Other symbol, Common, Other
U+FFEE ON Symbol: Other symbol, Common, Other
U+FFEF L Control: Unassigned, Unknown, Other
findprop fff8 fff9 fffa fffb fffc fffd fffe ffff
U+FFF8 Control: Unassigned, Unknown, Control
U+FFF9 Control: Format, Common, Control
U+FFFA Control: Format, Common, Control
U+FFFB Control: Format, Common, Control
U+FFFC Symbol: Other symbol, Common, Other
U+FFFD Symbol: Other symbol, Common, Other
U+FFFE Control: Unassigned, Unknown, Other
U+FFFF Control: Unassigned, Unknown, Other
U+FFF8 BN Control: Unassigned, Unknown, Control
U+FFF9 ON Control: Format, Common, Control
U+FFFA ON Control: Format, Common, Control
U+FFFB ON Control: Format, Common, Control
U+FFFC ON Symbol: Other symbol, Common, Other
U+FFFD ON Symbol: Other symbol, Common, Other
U+FFFE BN Control: Unassigned, Unknown, Other
U+FFFF BN Control: Unassigned, Unknown, Other
findprop 10000 10001 e01ef f0000 100000
U+10000 Letter: Other letter, Linear_B, Other
U+10001 Letter: Other letter, Linear_B, Other
U+E01EF Mark: Non-spacing mark, Inherited, Extend
U+F0000 Control: Private use, Unknown, Other
U+100000 Control: Private use, Unknown, Other
U+10000 L Letter: Other letter, Linear_B, Other
U+10001 L Letter: Other letter, Linear_B, Other
U+E01EF NSM Mark: Non-spacing mark, Inherited, Extend
U+F0000 L Control: Private use, Unknown, Other
U+100000 L Control: Private use, Unknown, Other
findprop 1b00 12000 7c0 a840 10900
U+1B00 Mark: Non-spacing mark, Balinese, Extend
U+12000 Letter: Other letter, Cuneiform, Other
U+07C0 Number: Decimal number, Nko, Other
U+A840 Letter: Other letter, Phags_Pa, Other
U+10900 Letter: Other letter, Phoenician, Other
U+1B00 NSM Mark: Non-spacing mark, Balinese, Extend
U+12000 L Letter: Other letter, Cuneiform, Other
U+07C0 R Number: Decimal number, Nko, Other
U+A840 L Letter: Other letter, Phags_Pa, Other
U+10900 R Letter: Other letter, Phoenician, Other
findprop 1d79 a77d
U+1D79 Letter: Lower case letter, Latin, Other, U+A77D
U+A77D Letter: Upper case letter, Latin, Other, U+1D79
U+1D79 L Letter: Lower case letter, Latin, Other, U+A77D
U+A77D L Letter: Upper case letter, Latin, Other, U+1D79
findprop 0800 083e a4d0 a4f7 aa80 aadf
U+0800 Letter: Other letter, Samaritan, Other
U+083E Punctuation: Other punctuation, Samaritan, Other
U+A4D0 Letter: Other letter, Lisu, Other
U+A4F7 Letter: Other letter, Lisu, Other
U+AA80 Letter: Other letter, Tai_Viet, Other
U+AADF Punctuation: Other punctuation, Tai_Viet, Other
U+0800 R Letter: Other letter, Samaritan, Other
U+083E R Punctuation: Other punctuation, Samaritan, Other
U+A4D0 L Letter: Other letter, Lisu, Other
U+A4F7 L Letter: Other letter, Lisu, Other
U+AA80 L Letter: Other letter, Tai_Viet, Other
U+AADF L Punctuation: Other punctuation, Tai_Viet, Other
findprop 10b00 10b35 13000 1342e 10840 10855
U+10B00 Letter: Other letter, Avestan, Other
U+10B35 Letter: Other letter, Avestan, Other
U+13000 Letter: Other letter, Egyptian_Hieroglyphs, Other
U+1342E Letter: Other letter, Egyptian_Hieroglyphs, Other
U+10840 Letter: Other letter, Imperial_Aramaic, Other
U+10855 Letter: Other letter, Imperial_Aramaic, Other
U+10B00 R Letter: Other letter, Avestan, Other
U+10B35 R Letter: Other letter, Avestan, Other
U+13000 L Letter: Other letter, Egyptian_Hieroglyphs, Other
U+1342E L Letter: Other letter, Egyptian_Hieroglyphs, Other
U+10840 R Letter: Other letter, Imperial_Aramaic, Other
U+10855 R Letter: Other letter, Imperial_Aramaic, Other
findprop 11100 1113c 11680 116c0
U+11100 Mark: Non-spacing mark, Chakma, Extend
U+1113C Number: Decimal number, Chakma, Other
U+11680 Letter: Other letter, Takri, Other
U+116C0 Number: Decimal number, Takri, Other
U+11100 NSM Mark: Non-spacing mark, Chakma, Extend
U+1113C L Number: Decimal number, Chakma, Other
U+11680 L Letter: Other letter, Takri, Other
U+116C0 L Number: Decimal number, Takri, Other
findprop 0d 0a 0e 0711 1b04 1111 1169 11fe ae4c ad89
U+000D Control: Control, Common, CR
U+000A Control: Control, Common, LF
U+000E Control: Control, Common, Control
U+0711 Mark: Non-spacing mark, Syriac, Extend
U+1B04 Mark: Spacing mark, Balinese, SpacingMark
U+1111 Letter: Other letter, Hangul, Hangul syllable type L
U+1169 Letter: Other letter, Hangul, Hangul syllable type V
U+11FE Letter: Other letter, Hangul, Hangul syllable type T
U+AE4C Letter: Other letter, Hangul, Hangul syllable type LV
U+AD89 Letter: Other letter, Hangul, Hangul syllable type LVT
U+000D B Control: Control, Common, CR
U+000A B Control: Control, Common, LF
U+000E BN Control: Control, Common, Control
U+0711 NSM Mark: Non-spacing mark, Syriac, Extend
U+1B04 L Mark: Spacing mark, Balinese, SpacingMark
U+1111 L Letter: Other letter, Hangul, Hangul syllable type L
U+1169 L Letter: Other letter, Hangul, Hangul syllable type V
U+11FE L Letter: Other letter, Hangul, Hangul syllable type T
U+AE4C L Letter: Other letter, Hangul, Hangul syllable type LV
U+AD89 L Letter: Other letter, Hangul, Hangul syllable type LVT
findprop 118a0 11ac7 16ad0
U+118A0 Letter: Upper case letter, Warang_Citi, Other, U+118C0
U+11AC7 Letter: Other letter, Pau_Cin_Hau, Other
U+16AD0 Letter: Other letter, Bassa_Vah, Other
U+118A0 L Letter: Upper case letter, Warang_Citi, Other, U+118C0
U+11AC7 L Letter: Other letter, Pau_Cin_Hau, Other
U+16AD0 L Letter: Other letter, Bassa_Vah, Other
findprop 11700 14400 108e0 11280 1d800
U+11700 Letter: Other letter, Ahom, Other
U+14400 Letter: Other letter, Anatolian_Hieroglyphs, Other
U+108E0 Letter: Other letter, Hatran, Other
U+11280 Letter: Other letter, Multani, Other
U+1D800 Symbol: Other symbol, SignWriting, Other
U+11700 L Letter: Other letter, Ahom, Other
U+14400 L Letter: Other letter, Anatolian_Hieroglyphs, Other
U+108E0 R Letter: Other letter, Hatran, Other
U+11280 L Letter: Other letter, Multani, Other
U+1D800 L Symbol: Other symbol, SignWriting, Other
findprop 11800 1e903 11da9 10d27 11ee0 16e48 10f27 10f30
U+11800 Letter: Other letter, Dogra, Other
U+1E903 Letter: Upper case letter, Adlam, Other, U+1E925
U+11DA9 Number: Decimal number, Gunjala_Gondi, Other
U+10D27 Mark: Non-spacing mark, Hanifi_Rohingya, Extend
U+11EE0 Letter: Other letter, Makasar, Other
U+16E48 Letter: Upper case letter, Medefaidrin, Other, U+16E68
U+10F27 Letter: Other letter, Old_Sogdian, Other
U+10F30 Letter: Other letter, Sogdian, Other
U+11800 L Letter: Other letter, Dogra, Other
U+1E903 R Letter: Upper case letter, Adlam, Other, U+1E925
U+11DA9 L Number: Decimal number, Gunjala_Gondi, Other
U+10D27 NSM Mark: Non-spacing mark, Hanifi_Rohingya, Extend
U+11EE0 L Letter: Other letter, Makasar, Other
U+16E48 L Letter: Upper case letter, Medefaidrin, Other, U+16E68
U+10F27 R Letter: Other letter, Old_Sogdian, Other
U+10F30 AL Letter: Other letter, Sogdian, Other
findprop a836 a833 1cf4 20f0 1cd0
U+A836 Symbol: Other symbol, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kaithi, Mahajani, Modi, Khudawadi, Takri, Tirhuta]
U+A833 Number: Other number, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kannada, Kaithi, Mahajani, Modi, Nandinagari, Khudawadi, Takri, Tirhuta]
U+1CF4 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Kannada]
U+20F0 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Latin]
U+1CD0 Mark: Non-spacing mark, Inherited, Extend, [Bengali, Devanagari, Grantha, Kannada]
U+A836 L Symbol: Other symbol, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kaithi, Mahajani, Modi, Khudawadi, Takri, Tirhuta]
U+A833 L Number: Other number, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kannada, Kaithi, Mahajani, Modi, Nandinagari, Khudawadi, Takri, Tirhuta]
U+1CF4 NSM Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Kannada]
U+20F0 NSM Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Latin]
U+1CD0 NSM Mark: Non-spacing mark, Inherited, Extend, [Bengali, Devanagari, Grantha, Kannada]
findprop 32ff
U+32FF Symbol: Other symbol, Common, Other, [Han]
U+32FF L Symbol: Other symbol, Common, Other, [Han]
findprop 1f16d
U+1F16D Symbol: Other symbol, Common, Extended Pictographic
U+1F16D ON Symbol: Other symbol, Common, Extended Pictographic
findprop U+10e93 U+10eaa
U+10E93 Letter: Other letter, Yezidi, Other
U+10EAA Control: Unassigned, Unknown, Other
U+10E93 R Letter: Other letter, Yezidi, Other
U+10EAA R Control: Unassigned, Unknown, Other
findprop 0602 202a 202b 202c 2068 2069 202d 202e 2067
U+0602 AN Control: Format, Arabic, Prepend
U+202A *LRE Control: Format, Common, Control
U+202B *RLE Control: Format, Common, Control
U+202C *PDF Control: Format, Common, Control
U+2068 *FSI Control: Format, Common, Control
U+2069 *PDI Control: Format, Common, Control
U+202D *LRO Control: Format, Common, Control
U+202E *RLO Control: Format, Common, Control
U+2067 *RLI Control: Format, Common, Control

View File

@ -1,196 +1,253 @@
find script Han
U+2E80..U+2E99 Symbol: Other symbol, Han, Other
U+2E9B..U+2EF3 Symbol: Other symbol, Han, Other
U+2F00..U+2FD5 Symbol: Other symbol, Han, Other
U+3005 Letter: Modifier letter, Han, Other
U+3007 Number: Letter number, Han, Other
U+3021..U+3029 Number: Letter number, Han, Other
U+3038..U+303A Number: Letter number, Han, Other
U+303B Letter: Modifier letter, Han, Other
U+3400..U+4DBF Letter: Other letter, Han, Other
U+4E00..U+9FFF Letter: Other letter, Han, Other
U+F900..U+FA6D Letter: Other letter, Han, Other
U+FA70..U+FAD9 Letter: Other letter, Han, Other
U+16FE2 Punctuation: Other punctuation, Han, Other
U+16FE3 Letter: Modifier letter, Han, Other
U+16FF0..U+16FF1 Mark: Spacing mark, Han, SpacingMark
U+20000..U+2A6DF Letter: Other letter, Han, Other
U+2A700..U+2B738 Letter: Other letter, Han, Other
U+2B740..U+2B81D Letter: Other letter, Han, Other
U+2B820..U+2CEA1 Letter: Other letter, Han, Other
U+2CEB0..U+2EBE0 Letter: Other letter, Han, Other
U+2F800..U+2FA1D Letter: Other letter, Han, Other
U+30000..U+3134A Letter: Other letter, Han, Other
U+2E80..U+2E99 ON Symbol: Other symbol, Han, Other
U+2E9B..U+2EF3 ON Symbol: Other symbol, Han, Other
U+2F00..U+2FD5 ON Symbol: Other symbol, Han, Other
U+3005 L Letter: Modifier letter, Han, Other
U+3007 L Number: Letter number, Han, Other
U+3021..U+3029 L Number: Letter number, Han, Other
U+3038..U+303A L Number: Letter number, Han, Other
U+303B L Letter: Modifier letter, Han, Other
U+3400..U+4DBF L Letter: Other letter, Han, Other
U+4E00..U+9FFF L Letter: Other letter, Han, Other
U+F900..U+FA6D L Letter: Other letter, Han, Other
U+FA70..U+FAD9 L Letter: Other letter, Han, Other
U+16FE2 ON Punctuation: Other punctuation, Han, Other
U+16FE3 L Letter: Modifier letter, Han, Other
U+16FF0..U+16FF1 L Mark: Spacing mark, Han, SpacingMark
U+20000..U+2A6DF L Letter: Other letter, Han, Other
U+2A700..U+2B738 L Letter: Other letter, Han, Other
U+2B740..U+2B81D L Letter: Other letter, Han, Other
U+2B820..U+2CEA1 L Letter: Other letter, Han, Other
U+2CEB0..U+2EBE0 L Letter: Other letter, Han, Other
U+2F800..U+2FA1D L Letter: Other letter, Han, Other
U+30000..U+3134A L Letter: Other letter, Han, Other
find type Pe script Common scriptx Hangul
U+3009 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
U+300B Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
U+300D Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
U+300F Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
U+3011 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
U+3015 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
U+3017 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
U+3019 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
U+301B Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
U+301E..U+301F Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana]
U+FF63 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
U+3009 ON Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
U+300B ON Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
U+300D ON Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
U+300F ON Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
U+3011 ON Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
U+3015 ON Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
U+3017 ON Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
U+3019 ON Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
U+301B ON Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
U+301E..U+301F ON Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana]
U+FF63 ON Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
find type Sk
U+005E Symbol: Modifier symbol, Common, Other
U+0060 Symbol: Modifier symbol, Common, Other
U+00A8 Symbol: Modifier symbol, Common, Other
U+00AF Symbol: Modifier symbol, Common, Other
U+00B4 Symbol: Modifier symbol, Common, Other
U+00B8 Symbol: Modifier symbol, Common, Other
U+02C2..U+02C5 Symbol: Modifier symbol, Common, Other
U+02D2..U+02DF Symbol: Modifier symbol, Common, Other
U+02E5..U+02E9 Symbol: Modifier symbol, Common, Other
U+02EA..U+02EB Symbol: Modifier symbol, Bopomofo, Other
U+02ED Symbol: Modifier symbol, Common, Other
U+02EF..U+02FF Symbol: Modifier symbol, Common, Other
U+0375 Symbol: Modifier symbol, Greek, Other
U+0384 Symbol: Modifier symbol, Greek, Other
U+0385 Symbol: Modifier symbol, Common, Other
U+0888 Symbol: Modifier symbol, Arabic, Other
U+1FBD Symbol: Modifier symbol, Greek, Other
U+1FBF..U+1FC1 Symbol: Modifier symbol, Greek, Other
U+1FCD..U+1FCF Symbol: Modifier symbol, Greek, Other
U+1FDD..U+1FDF Symbol: Modifier symbol, Greek, Other
U+1FED..U+1FEF Symbol: Modifier symbol, Greek, Other
U+1FFD..U+1FFE Symbol: Modifier symbol, Greek, Other
U+309B..U+309C Symbol: Modifier symbol, Common, Other, [Hiragana, Katakana]
U+A700..U+A707 Symbol: Modifier symbol, Common, Other, [Han, Latin]
U+A708..U+A716 Symbol: Modifier symbol, Common, Other
U+A720..U+A721 Symbol: Modifier symbol, Common, Other
U+A789..U+A78A Symbol: Modifier symbol, Common, Other
U+AB5B Symbol: Modifier symbol, Common, Other
U+AB6A..U+AB6B Symbol: Modifier symbol, Common, Other
U+FBB2..U+FBC2 Symbol: Modifier symbol, Arabic, Other
U+FF3E Symbol: Modifier symbol, Common, Other
U+FF40 Symbol: Modifier symbol, Common, Other
U+FFE3 Symbol: Modifier symbol, Common, Other
U+1F3FB..U+1F3FF Symbol: Modifier symbol, Common, Extend
U+005E ON Symbol: Modifier symbol, Common, Other
U+0060 ON Symbol: Modifier symbol, Common, Other
U+00A8 ON Symbol: Modifier symbol, Common, Other
U+00AF ON Symbol: Modifier symbol, Common, Other
U+00B4 ON Symbol: Modifier symbol, Common, Other
U+00B8 ON Symbol: Modifier symbol, Common, Other
U+02C2..U+02C5 ON Symbol: Modifier symbol, Common, Other
U+02D2..U+02DF ON Symbol: Modifier symbol, Common, Other
U+02E5..U+02E9 ON Symbol: Modifier symbol, Common, Other
U+02EA..U+02EB ON Symbol: Modifier symbol, Bopomofo, Other
U+02ED ON Symbol: Modifier symbol, Common, Other
U+02EF..U+02FF ON Symbol: Modifier symbol, Common, Other
U+0375 ON Symbol: Modifier symbol, Greek, Other
U+0384 ON Symbol: Modifier symbol, Greek, Other
U+0385 ON Symbol: Modifier symbol, Common, Other
U+0888 AL Symbol: Modifier symbol, Arabic, Other
U+1FBD ON Symbol: Modifier symbol, Greek, Other
U+1FBF..U+1FC1 ON Symbol: Modifier symbol, Greek, Other
U+1FCD..U+1FCF ON Symbol: Modifier symbol, Greek, Other
U+1FDD..U+1FDF ON Symbol: Modifier symbol, Greek, Other
U+1FED..U+1FEF ON Symbol: Modifier symbol, Greek, Other
U+1FFD..U+1FFE ON Symbol: Modifier symbol, Greek, Other
U+309B..U+309C ON Symbol: Modifier symbol, Common, Other, [Hiragana, Katakana]
U+A700..U+A707 ON Symbol: Modifier symbol, Common, Other, [Han, Latin]
U+A708..U+A716 ON Symbol: Modifier symbol, Common, Other
U+A720..U+A721 ON Symbol: Modifier symbol, Common, Other
U+A789..U+A78A L Symbol: Modifier symbol, Common, Other
U+AB5B L Symbol: Modifier symbol, Common, Other
U+AB6A..U+AB6B ON Symbol: Modifier symbol, Common, Other
U+FBB2..U+FBC2 AL Symbol: Modifier symbol, Arabic, Other
U+FF3E ON Symbol: Modifier symbol, Common, Other
U+FF40 ON Symbol: Modifier symbol, Common, Other
U+FFE3 ON Symbol: Modifier symbol, Common, Other
U+1F3FB..U+1F3FF ON Symbol: Modifier symbol, Common, Extend
find type Pd
U+002D Punctuation: Dash punctuation, Common, Other
U+058A Punctuation: Dash punctuation, Armenian, Other
U+05BE Punctuation: Dash punctuation, Hebrew, Other
U+1400 Punctuation: Dash punctuation, Canadian_Aboriginal, Other
U+1806 Punctuation: Dash punctuation, Mongolian, Other
U+2010..U+2015 Punctuation: Dash punctuation, Common, Other
U+2E17 Punctuation: Dash punctuation, Common, Other
U+2E1A Punctuation: Dash punctuation, Common, Other
U+2E3A..U+2E3B Punctuation: Dash punctuation, Common, Other
U+2E40 Punctuation: Dash punctuation, Common, Other
U+2E5D Punctuation: Dash punctuation, Common, Other
U+301C Punctuation: Dash punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana]
U+3030 Punctuation: Dash punctuation, Common, Extended Pictographic, [Bopomofo, Hangul, Han, Hiragana, Katakana]
U+30A0 Punctuation: Dash punctuation, Common, Other, [Hiragana, Katakana]
U+FE31..U+FE32 Punctuation: Dash punctuation, Common, Other
U+FE58 Punctuation: Dash punctuation, Common, Other
U+FE63 Punctuation: Dash punctuation, Common, Other
U+FF0D Punctuation: Dash punctuation, Common, Other
U+10EAD Punctuation: Dash punctuation, Yezidi, Other
U+002D ES Punctuation: Dash punctuation, Common, Other
U+058A ON Punctuation: Dash punctuation, Armenian, Other
U+05BE R Punctuation: Dash punctuation, Hebrew, Other
U+1400 ON Punctuation: Dash punctuation, Canadian_Aboriginal, Other
U+1806 ON Punctuation: Dash punctuation, Mongolian, Other
U+2010..U+2015 ON Punctuation: Dash punctuation, Common, Other
U+2E17 ON Punctuation: Dash punctuation, Common, Other
U+2E1A ON Punctuation: Dash punctuation, Common, Other
U+2E3A..U+2E3B ON Punctuation: Dash punctuation, Common, Other
U+2E40 ON Punctuation: Dash punctuation, Common, Other
U+2E5D ON Punctuation: Dash punctuation, Common, Other
U+301C ON Punctuation: Dash punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana]
U+3030 ON Punctuation: Dash punctuation, Common, Extended Pictographic, [Bopomofo, Hangul, Han, Hiragana, Katakana]
U+30A0 ON Punctuation: Dash punctuation, Common, Other, [Hiragana, Katakana]
U+FE31..U+FE32 ON Punctuation: Dash punctuation, Common, Other
U+FE58 ON Punctuation: Dash punctuation, Common, Other
U+FE63 ES Punctuation: Dash punctuation, Common, Other
U+FF0D ES Punctuation: Dash punctuation, Common, Other
U+10EAD R Punctuation: Dash punctuation, Yezidi, Other
find gbreak LVT
U+AC01..U+AC1B Letter: Other letter, Hangul, Hangul syllable type LVT
U+AC1D..U+AC37 Letter: Other letter, Hangul, Hangul syllable type LVT
U+AC39..U+AC53 Letter: Other letter, Hangul, Hangul syllable type LVT
U+AC55..U+AC6F Letter: Other letter, Hangul, Hangul syllable type LVT
U+AC71..U+AC8B Letter: Other letter, Hangul, Hangul syllable type LVT
U+AC8D..U+ACA7 Letter: Other letter, Hangul, Hangul syllable type LVT
U+ACA9..U+ACC3 Letter: Other letter, Hangul, Hangul syllable type LVT
U+ACC5..U+ACDF Letter: Other letter, Hangul, Hangul syllable type LVT
U+ACE1..U+ACFB Letter: Other letter, Hangul, Hangul syllable type LVT
U+ACFD..U+AD17 Letter: Other letter, Hangul, Hangul syllable type LVT
U+AD19..U+AD33 Letter: Other letter, Hangul, Hangul syllable type LVT
U+AD35..U+AD4F Letter: Other letter, Hangul, Hangul syllable type LVT
U+AD51..U+AD6B Letter: Other letter, Hangul, Hangul syllable type LVT
U+AD6D..U+AD87 Letter: Other letter, Hangul, Hangul syllable type LVT
U+AD89..U+ADA3 Letter: Other letter, Hangul, Hangul syllable type LVT
U+ADA5..U+ADBF Letter: Other letter, Hangul, Hangul syllable type LVT
U+ADC1..U+ADDB Letter: Other letter, Hangul, Hangul syllable type LVT
U+ADDD..U+ADF7 Letter: Other letter, Hangul, Hangul syllable type LVT
U+ADF9..U+AE13 Letter: Other letter, Hangul, Hangul syllable type LVT
U+AE15..U+AE2F Letter: Other letter, Hangul, Hangul syllable type LVT
U+AE31..U+AE4B Letter: Other letter, Hangul, Hangul syllable type LVT
U+AE4D..U+AE67 Letter: Other letter, Hangul, Hangul syllable type LVT
U+AE69..U+AE83 Letter: Other letter, Hangul, Hangul syllable type LVT
U+AE85..U+AE9F Letter: Other letter, Hangul, Hangul syllable type LVT
U+AEA1..U+AEBB Letter: Other letter, Hangul, Hangul syllable type LVT
U+AEBD..U+AED7 Letter: Other letter, Hangul, Hangul syllable type LVT
U+AED9..U+AEF3 Letter: Other letter, Hangul, Hangul syllable type LVT
U+AEF5..U+AF0F Letter: Other letter, Hangul, Hangul syllable type LVT
U+AF11..U+AF2B Letter: Other letter, Hangul, Hangul syllable type LVT
U+AF2D..U+AF47 Letter: Other letter, Hangul, Hangul syllable type LVT
U+AF49..U+AF63 Letter: Other letter, Hangul, Hangul syllable type LVT
U+AF65..U+AF7F Letter: Other letter, Hangul, Hangul syllable type LVT
U+AF81..U+AF9B Letter: Other letter, Hangul, Hangul syllable type LVT
U+AF9D..U+AFB7 Letter: Other letter, Hangul, Hangul syllable type LVT
U+AFB9..U+AFD3 Letter: Other letter, Hangul, Hangul syllable type LVT
U+AFD5..U+AFEF Letter: Other letter, Hangul, Hangul syllable type LVT
U+AFF1..U+B00B Letter: Other letter, Hangul, Hangul syllable type LVT
U+B00D..U+B027 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B029..U+B043 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B045..U+B05F Letter: Other letter, Hangul, Hangul syllable type LVT
U+B061..U+B07B Letter: Other letter, Hangul, Hangul syllable type LVT
U+B07D..U+B097 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B099..U+B0B3 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B0B5..U+B0CF Letter: Other letter, Hangul, Hangul syllable type LVT
U+B0D1..U+B0EB Letter: Other letter, Hangul, Hangul syllable type LVT
U+B0ED..U+B107 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B109..U+B123 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B125..U+B13F Letter: Other letter, Hangul, Hangul syllable type LVT
U+B141..U+B15B Letter: Other letter, Hangul, Hangul syllable type LVT
U+B15D..U+B177 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B179..U+B193 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B195..U+B1AF Letter: Other letter, Hangul, Hangul syllable type LVT
U+B1B1..U+B1CB Letter: Other letter, Hangul, Hangul syllable type LVT
U+B1CD..U+B1E7 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B1E9..U+B203 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B205..U+B21F Letter: Other letter, Hangul, Hangul syllable type LVT
U+B221..U+B23B Letter: Other letter, Hangul, Hangul syllable type LVT
U+B23D..U+B257 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B259..U+B273 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B275..U+B28F Letter: Other letter, Hangul, Hangul syllable type LVT
U+B291..U+B2AB Letter: Other letter, Hangul, Hangul syllable type LVT
U+B2AD..U+B2C7 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B2C9..U+B2E3 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B2E5..U+B2FF Letter: Other letter, Hangul, Hangul syllable type LVT
U+B301..U+B31B Letter: Other letter, Hangul, Hangul syllable type LVT
U+B31D..U+B337 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B339..U+B353 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B355..U+B36F Letter: Other letter, Hangul, Hangul syllable type LVT
U+B371..U+B38B Letter: Other letter, Hangul, Hangul syllable type LVT
U+B38D..U+B3A7 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B3A9..U+B3C3 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B3C5..U+B3DF Letter: Other letter, Hangul, Hangul syllable type LVT
U+B3E1..U+B3FB Letter: Other letter, Hangul, Hangul syllable type LVT
U+B3FD..U+B417 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B419..U+B433 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B435..U+B44F Letter: Other letter, Hangul, Hangul syllable type LVT
U+B451..U+B46B Letter: Other letter, Hangul, Hangul syllable type LVT
U+B46D..U+B487 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B489..U+B4A3 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B4A5..U+B4BF Letter: Other letter, Hangul, Hangul syllable type LVT
U+B4C1..U+B4DB Letter: Other letter, Hangul, Hangul syllable type LVT
U+B4DD..U+B4F7 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B4F9..U+B513 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B515..U+B52F Letter: Other letter, Hangul, Hangul syllable type LVT
U+B531..U+B54B Letter: Other letter, Hangul, Hangul syllable type LVT
U+B54D..U+B567 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B569..U+B583 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B585..U+B59F Letter: Other letter, Hangul, Hangul syllable type LVT
U+B5A1..U+B5BB Letter: Other letter, Hangul, Hangul syllable type LVT
U+B5BD..U+B5D7 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B5D9..U+B5F3 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B5F5..U+B60F Letter: Other letter, Hangul, Hangul syllable type LVT
U+B611..U+B62B Letter: Other letter, Hangul, Hangul syllable type LVT
U+B62D..U+B647 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B649..U+B663 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B665..U+B67F Letter: Other letter, Hangul, Hangul syllable type LVT
U+B681..U+B69B Letter: Other letter, Hangul, Hangul syllable type LVT
U+B69D..U+B6B7 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B6B9..U+B6D3 Letter: Other letter, Hangul, Hangul syllable type LVT
U+B6D5..U+B6EF Letter: Other letter, Hangul, Hangul syllable type LVT
U+AC01..U+AC1B L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AC1D..U+AC37 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AC39..U+AC53 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AC55..U+AC6F L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AC71..U+AC8B L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AC8D..U+ACA7 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+ACA9..U+ACC3 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+ACC5..U+ACDF L Letter: Other letter, Hangul, Hangul syllable type LVT
U+ACE1..U+ACFB L Letter: Other letter, Hangul, Hangul syllable type LVT
U+ACFD..U+AD17 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AD19..U+AD33 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AD35..U+AD4F L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AD51..U+AD6B L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AD6D..U+AD87 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AD89..U+ADA3 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+ADA5..U+ADBF L Letter: Other letter, Hangul, Hangul syllable type LVT
U+ADC1..U+ADDB L Letter: Other letter, Hangul, Hangul syllable type LVT
U+ADDD..U+ADF7 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+ADF9..U+AE13 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AE15..U+AE2F L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AE31..U+AE4B L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AE4D..U+AE67 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AE69..U+AE83 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AE85..U+AE9F L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AEA1..U+AEBB L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AEBD..U+AED7 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AED9..U+AEF3 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AEF5..U+AF0F L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AF11..U+AF2B L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AF2D..U+AF47 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AF49..U+AF63 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AF65..U+AF7F L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AF81..U+AF9B L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AF9D..U+AFB7 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AFB9..U+AFD3 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AFD5..U+AFEF L Letter: Other letter, Hangul, Hangul syllable type LVT
U+AFF1..U+B00B L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B00D..U+B027 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B029..U+B043 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B045..U+B05F L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B061..U+B07B L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B07D..U+B097 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B099..U+B0B3 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B0B5..U+B0CF L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B0D1..U+B0EB L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B0ED..U+B107 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B109..U+B123 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B125..U+B13F L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B141..U+B15B L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B15D..U+B177 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B179..U+B193 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B195..U+B1AF L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B1B1..U+B1CB L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B1CD..U+B1E7 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B1E9..U+B203 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B205..U+B21F L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B221..U+B23B L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B23D..U+B257 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B259..U+B273 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B275..U+B28F L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B291..U+B2AB L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B2AD..U+B2C7 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B2C9..U+B2E3 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B2E5..U+B2FF L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B301..U+B31B L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B31D..U+B337 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B339..U+B353 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B355..U+B36F L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B371..U+B38B L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B38D..U+B3A7 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B3A9..U+B3C3 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B3C5..U+B3DF L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B3E1..U+B3FB L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B3FD..U+B417 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B419..U+B433 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B435..U+B44F L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B451..U+B46B L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B46D..U+B487 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B489..U+B4A3 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B4A5..U+B4BF L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B4C1..U+B4DB L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B4DD..U+B4F7 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B4F9..U+B513 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B515..U+B52F L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B531..U+B54B L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B54D..U+B567 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B569..U+B583 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B585..U+B59F L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B5A1..U+B5BB L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B5BD..U+B5D7 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B5D9..U+B5F3 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B5F5..U+B60F L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B611..U+B62B L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B62D..U+B647 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B649..U+B663 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B665..U+B67F L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B681..U+B69B L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B69D..U+B6B7 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B6B9..U+B6D3 L Letter: Other letter, Hangul, Hangul syllable type LVT
U+B6D5..U+B6EF L Letter: Other letter, Hangul, Hangul syllable type LVT
...
find script Old_Uyghur
U+10F70..U+10F81 Letter: Other letter, Old_Uyghur, Other
U+10F82..U+10F85 Mark: Non-spacing mark, Old_Uyghur, Extend
U+10F86..U+10F89 Punctuation: Other punctuation, Old_Uyghur, Other
U+10F70..U+10F81 R Letter: Other letter, Old_Uyghur, Other
U+10F82..U+10F85 NSM Mark: Non-spacing mark, Old_Uyghur, Extend
U+10F86..U+10F89 R Punctuation: Other punctuation, Old_Uyghur, Other
find bidi PDF
U+202C *PDF Control: Format, Common, Control
find bidi CS
U+002C CS Punctuation: Other punctuation, Common, Other
U+002E..U+002F CS Punctuation: Other punctuation, Common, Other
U+003A CS Punctuation: Other punctuation, Common, Other
U+00A0 CS Separator: Space separator, Common, Other
U+060C CS Punctuation: Other punctuation, Common, Other, [Arabic, Nko, Hanifi_Rohingya, Syriac, Thaana, Yezidi]
U+202F CS Separator: Space separator, Common, Other, [Latin, Mongolian]
U+2044 CS Symbol: Mathematical symbol, Common, Other
U+FE50 CS Punctuation: Other punctuation, Common, Other
U+FE52 CS Punctuation: Other punctuation, Common, Other
U+FE55 CS Punctuation: Other punctuation, Common, Other
U+FF0C CS Punctuation: Other punctuation, Common, Other
U+FF0E..U+FF0F CS Punctuation: Other punctuation, Common, Other
U+FF1A CS Punctuation: Other punctuation, Common, Other
find bidi CS type Sm
U+2044 CS Symbol: Mathematical symbol, Common, Other
find bidi B
U+000A B Control: Control, Common, LF
U+000D B Control: Control, Common, CR
U+001C..U+001E B Control: Control, Common, Control
U+0085 B Control: Control, Common, Control
U+2029 B Separator: Paragraph separator, Common, Control
find bidi FSI
U+2068 *FSI Control: Format, Common, Control
find bidi PDI
U+2069 *PDI Control: Format, Common, Control
find bidi RLI
U+2067 *RLI Control: Format, Common, Control
find bidi RLO
U+202E *RLO Control: Format, Common, Control
find bidi S
U+0009 S Control: Control, Common, Control
U+000B S Control: Control, Common, Control
U+001F S Control: Control, Common, Control
find bidi WS
U+000C WS Control: Control, Common, Control
U+0020 WS Separator: Space separator, Common, Other
U+1680 WS Separator: Space separator, Ogham, Other
U+2000..U+200A WS Separator: Space separator, Common, Other
U+2028 WS Separator: Line separator, Common, Control
U+205F WS Separator: Space separator, Common, Other
U+3000 WS Separator: Space separator, Common, Other
find bidi_control
U+061C *AL Control: Format, Arabic, Control, [Arabic, Syriac, Thaana]
U+200E *L Control: Format, Common, Control
U+200F *R Control: Format, Common, Control
U+202A *LRE Control: Format, Common, Control
U+202B *RLE Control: Format, Common, Control
U+202C *PDF Control: Format, Common, Control
U+202D *LRO Control: Format, Common, Control
U+202E *RLO Control: Format, Common, Control
U+2066 *LRI Control: Format, Common, Control
U+2067 *RLI Control: Format, Common, Control
U+2068 *FSI Control: Format, Common, Control
U+2069 *PDI Control: Format, Common, Control

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2020 University of Cambridge
New API code Copyright (c) 2016-2021 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -1798,7 +1798,8 @@ typedef struct {
uint8_t caseset; /* offset to multichar other cases or zero */
int32_t other_case; /* offset to other case, or zero if none */
int16_t scriptx; /* script extension value */
int16_t dummy; /* spare - to round to multiple of 4 bytes */
uint8_t bidi; /* bidi class and control flag */
uint8_t dummy; /* spare - to round to multiple of 4 bytes */
} ucd_record;
/* UCD access macros */
@ -1823,6 +1824,13 @@ typedef struct {
/* UCD_OTHERCASE gives the other case of ch by adding the signed other_case
offset from the character's UCD record to the code point; an offset of zero
therefore yields ch itself. The argument is parenthesized before the cast so
that an operator expression (for example a conditional expression) is converted
as a whole. Note that ch is evaluated twice, so it must not have side effects. */

#define UCD_OTHERCASE(ch) ((uint32_t)((int)(ch) + (int)(GET_UCD(ch)->other_case)))

/* UCD_SCRIPTX gives the script extension value from the character's UCD
record. */

#define UCD_SCRIPTX(ch) (GET_UCD(ch)->scriptx)
/* Layout of the "bidi" byte in a UCD record: the top bit (0x80) is set when
the character has the Bidi_Control property, and the low five bits (0x1f) hold
the bidi class. Five bits are sufficient for the 23 classes, which leaves two
bits unused and available for future use. UCD_BIDICLASS yields the class
number; UCD_BIDICONTROL yields 0x80 when the flag is set and zero otherwise. */

#define UCD_BIDICLASS(ch)    ((GET_UCD(ch)->bidi) & 0x1fu)
#define UCD_BIDICONTROL(ch)  ((GET_UCD(ch)->bidi) & 0x80u)
/* Header for serialized pcre2 codes. */
typedef struct pcre2_serialized_data {

File diff suppressed because it is too large Load Diff

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2018 University of Cambridge
New API code Copyright (c) 2016-2021 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -100,6 +100,34 @@ enum {
ucp_Zs /* Space separator */
};
/* Bidi class values. The enumerators are listed in alphabetical order of the
class abbreviation and are numbered consecutively from zero, so the largest
value (22) fits in five bits. */

enum {
  ucp_bidiAL  =  0,   /* Arabic letter */
  ucp_bidiAN  =  1,   /* Arabic number */
  ucp_bidiB   =  2,   /* Paragraph separator */
  ucp_bidiBN  =  3,   /* Boundary neutral */
  ucp_bidiCS  =  4,   /* Common separator */
  ucp_bidiEN  =  5,   /* European number */
  ucp_bidiES  =  6,   /* European separator */
  ucp_bidiET  =  7,   /* European terminator */
  ucp_bidiFSI =  8,   /* First strong isolate */
  ucp_bidiL   =  9,   /* Left to right */
  ucp_bidiLRE = 10,   /* Left to right embedding */
  ucp_bidiLRI = 11,   /* Left to right isolate */
  ucp_bidiLRO = 12,   /* Left to right override */
  ucp_bidiNSM = 13,   /* Non-spacing mark */
  ucp_bidiON  = 14,   /* Other neutral */
  ucp_bidiPDF = 15,   /* Pop directional format */
  ucp_bidiPDI = 16,   /* Pop directional isolate */
  ucp_bidiR   = 17,   /* Right to left */
  ucp_bidiRLE = 18,   /* Right to left embedding */
  ucp_bidiRLI = 19,   /* Right to left isolate */
  ucp_bidiRLO = 20,   /* Right to left override */
  ucp_bidiS   = 21,   /* Segment separator */
  ucp_bidiWS  = 22    /* White space */
};
/* These are grapheme break properties. The Extended Pictographic property
comes from the emoji-data.txt file. */