Tidies and updates to maintenance programs utf8 and ucptest.
This commit is contained in:
parent
c472f3f91a
commit
9cebee7e75
19
maint/README
19
maint/README
|
@ -54,10 +54,12 @@ Unicode.tables The files in this directory were downloaded from the Unicode
|
||||||
ucptest.c A short C program for testing the Unicode property macros
|
ucptest.c A short C program for testing the Unicode property macros
|
||||||
that do lookups in the pcre2_ucd.c data, mainly useful after
|
that do lookups in the pcre2_ucd.c data, mainly useful after
|
||||||
rebuilding the Unicode property table. Compile and run this in
|
rebuilding the Unicode property table. Compile and run this in
|
||||||
the "maint" directory (see comments at its head).
|
the "maint" directory (see comments at its head). This program
|
||||||
|
can also be used to find characters with specific properties.
|
||||||
|
|
||||||
ucptestdata A directory containing two files, testinput1 and testoutput1,
|
ucptestdata A directory containing four files, testinput{1,2} and
|
||||||
to use in conjunction with the ucptest program.
|
testoutput{1,2}, for use in conjunction with the ucptest
|
||||||
|
program.
|
||||||
|
|
||||||
utf8.c A short, freestanding C program for converting a Unicode code
|
utf8.c A short, freestanding C program for converting a Unicode code
|
||||||
point into a sequence of bytes in the UTF-8 encoding, and vice
|
point into a sequence of bytes in the UTF-8 encoding, and vice
|
||||||
|
@ -65,7 +67,7 @@ utf8.c A short, freestanding C program for converting a Unicode code
|
||||||
outputs a list of the equivalent UTF-8 bytes. If its argument
|
outputs a list of the equivalent UTF-8 bytes. If its argument
|
||||||
is a sequence of concatenated UTF-8 bytes (e.g. e188b4) it
|
is a sequence of concatenated UTF-8 bytes (e.g. e188b4) it
|
||||||
treats them as a UTF-8 character and outputs the equivalent
|
treats them as a UTF-8 character and outputs the equivalent
|
||||||
code point in hex.
|
code point in hex. See comments at its head for details.
|
||||||
|
|
||||||
|
|
||||||
Updating to a new Unicode release
|
Updating to a new Unicode release
|
||||||
|
@ -96,9 +98,10 @@ lists of scripts.
|
||||||
|
|
||||||
The ucptest program can be compiled and used to check that the new tables in
|
The ucptest program can be compiled and used to check that the new tables in
|
||||||
pcre2_ucd.c work properly, using the data files in ucptestdata to check a
|
pcre2_ucd.c work properly, using the data files in ucptestdata to check a
|
||||||
number of test characters. The source file ucptest.c should also be updated
|
number of test characters. It used to be necessary to update the source
|
||||||
whenever new Unicode script names are added, and adding a few tests for new
|
ucptest.c whenever new Unicode scripts were added, but this is no longer
|
||||||
scripts is a good idea.
|
required because that program now uses the lists in the PCRE2 source. However,
|
||||||
|
adding a few tests for new scripts to the files in ucptestdata is a good idea.
|
||||||
|
|
||||||
|
|
||||||
Preparing for a PCRE2 release
|
Preparing for a PCRE2 release
|
||||||
|
@ -437,4 +440,4 @@ very sensible; some are rather wacky. Some have been on this list for years.
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
Email local part: ph10
|
Email local part: ph10
|
||||||
Email domain: cam.ac.uk
|
Email domain: cam.ac.uk
|
||||||
Last updated: 03 June 2019
|
Last updated: 01 April 2020
|
||||||
|
|
633
maint/ucptest.c
633
maint/ucptest.c
|
@ -16,36 +16,58 @@
|
||||||
/* This is a hacked-up program for testing the Unicode properties tables of
|
/* This is a hacked-up program for testing the Unicode properties tables of
|
||||||
PCRE2. It can also be used for finding characters with certain properties.
|
PCRE2. It can also be used for finding characters with certain properties.
|
||||||
I wrote it to help with debugging PCRE, and have added things that I found
|
I wrote it to help with debugging PCRE, and have added things that I found
|
||||||
useful, in a rather haphazard way. The code has never been "tidied" or checked
|
useful, in a rather haphazard way. The code has never been seriously tidied or
|
||||||
for robustness.
|
checked for robustness, but it shouldn't now give compiler warnings.
|
||||||
|
|
||||||
If there are arguments, they are a list of hexadecimal code points whose
|
There is only one option: "-s". If given, it applies only to the "findprop"
|
||||||
properties are to be output. Otherwise, the program expects to read commands on
|
command. It causes the UTF-8 sequence of bytes that encode the character to be
|
||||||
stdin, and it writes output to stdout. There are two commands:
|
output between angle brackets at the end of the line. On a UTF-8 terminal, this
|
||||||
|
will show the appropriate graphic for the code point.
|
||||||
|
|
||||||
"findprop" must be followed by a list of Unicode code points as hex numbers
|
If the command has arguments, they are concatenated into a buffer, separated by
|
||||||
(without any prefixes). The output is one line per character, giving its
|
spaces. If the first argument starts "U+" or consists entirely of hexadecimal
|
||||||
Unicode properties followed by its other case if there is one, followed by its
|
digits, "findprop" is inserted at the start. The buffer is then processed as a
|
||||||
Script Extension list if it is not just the same as the base script.
|
single line file, after which the program exits. If there are no arguments, the
|
||||||
|
program reads commands line by line on stdin and writes output to stdout. The
|
||||||
|
return code is always zero.
|
||||||
|
|
||||||
"find" must be followed by a list of property names and their values. This
|
There are three commands:
|
||||||
finds characters that have those properties. If multiple properties are listed,
|
|
||||||
they must all be matched. Currently supported:
|
"findprop" must be followed by a space-separated list of Unicode code points as
|
||||||
|
hex numbers, either without any prefix or starting with "U+". The output is one
|
||||||
|
line per character, giving its Unicode properties followed by its other case or
|
||||||
|
cases if one or more exist, followed by its Script Extension list if it is not
|
||||||
|
just the same as the base script. This list is in square brackets. The
|
||||||
|
properties are:
|
||||||
|
|
||||||
|
General type e.g. Letter
|
||||||
|
Specific type e.g. Upper case letter
|
||||||
|
Script e.g. Medefaidrin
|
||||||
|
Grapheme break type e.g. Extend (most common is Other)
|
||||||
|
|
||||||
|
"find" must be followed by a list of property names and their values. The
|
||||||
|
values are case-sensitive. This finds characters that have those properties. If
|
||||||
|
multiple properties are listed, they must all be matched. Currently supported:
|
||||||
|
|
||||||
script <name> The character must have this script property. Only one
|
script <name> The character must have this script property. Only one
|
||||||
such script may be given.
|
such script may be given.
|
||||||
scriptx <name> This script must be in the character's Script Extension
|
scriptx <name> This script must be in the character's Script Extension
|
||||||
property list. If this is used many times, all the given
|
property list. If this is used many times, all the given
|
||||||
scripts must be present.
|
scripts must be present.
|
||||||
type <abbrev> The character's type (e.g. Lu or Nd) must match.
|
type <abbrev> The character's specific type (e.g. Lu or Nd) must match.
|
||||||
gbreak <name> The grapheme break property must match.
|
gbreak <name> The grapheme break property must match.
|
||||||
|
|
||||||
If a <name> or <abbrev> is preceded by !, the value must NOT be present. For
|
If a <name> or <abbrev> is preceded by !, the value must NOT be present. For
|
||||||
Script Extensions, there may be a mixture of positive and negative
|
Script Extensions, there may be a mixture of positive and negative
|
||||||
requirements. All must be satisfied.
|
requirements. All must be satisfied.
|
||||||
|
|
||||||
No more than 100 characters are output. If there are more, the list ends with
|
Sequences of two or more characters are shown as ranges, for example
|
||||||
... */
|
U+0041..U+004A. No more than 100 lines are output. If there are more
|
||||||
|
characters, the list ends with ...
|
||||||
|
|
||||||
|
"list" must be followed by a property name (script, type, or gbreak). The
|
||||||
|
defined values for that property are listed. */
|
||||||
|
|
||||||
|
|
||||||
#ifdef HAVE_CONFIG_H
|
#ifdef HAVE_CONFIG_H
|
||||||
#include "../src/config.h"
|
#include "../src/config.h"
|
||||||
|
@ -91,228 +113,99 @@ No more than 100 characters are output. If there are more, the list ends with
|
||||||
|
|
||||||
/* -------------------------------------------------------------------*/
|
/* -------------------------------------------------------------------*/
|
||||||
|
|
||||||
|
static BOOL show_character = FALSE;
|
||||||
|
|
||||||
const unsigned char *script_names[] = {
|
static const unsigned char *type_names[] = {
|
||||||
US"Unknown",
|
US"Cc", US"Control",
|
||||||
US"Arabic",
|
US"Cf", US"Format",
|
||||||
US"Armenian",
|
US"Cn", US"Unassigned",
|
||||||
US"Bengali",
|
US"Co", US"Private use",
|
||||||
US"Bopomofo",
|
US"Cs", US"Surrogate",
|
||||||
US"Braille",
|
US"Ll", US"Lower case letter",
|
||||||
US"Buginese",
|
US"Lm", US"Modifier letter",
|
||||||
US"Buhid",
|
US"Lo", US"Other letter",
|
||||||
US"Canadian_Aboriginal",
|
US"Lt", US"Title case letter",
|
||||||
US"Cherokee",
|
US"Lu", US"Upper case letter",
|
||||||
US"Common",
|
US"Mc", US"Spacing mark",
|
||||||
US"Coptic",
|
US"Me", US"Enclosing mark",
|
||||||
US"Cypriot",
|
US"Mn", US"Non-spacing mark",
|
||||||
US"Cyrillic",
|
US"Nd", US"Decimal number",
|
||||||
US"Deseret",
|
US"Nl", US"Letter number",
|
||||||
US"Devanagari",
|
US"No", US"Other number",
|
||||||
US"Ethiopic",
|
US"Pc", US"Connector punctuation",
|
||||||
US"Georgian",
|
US"Pd", US"Dash punctuation",
|
||||||
US"Glagolitic",
|
US"Pe", US"Close punctuation",
|
||||||
US"Gothic",
|
US"Pf", US"Final punctuation",
|
||||||
US"Greek",
|
US"Pi", US"Initial punctuation",
|
||||||
US"Gujarati",
|
US"Po", US"Other punctuation",
|
||||||
US"Gurmukhi",
|
US"Ps", US"Open punctuation",
|
||||||
US"Han",
|
US"Sc", US"Currency symbol",
|
||||||
US"Hangul",
|
US"Sk", US"Modifier symbol",
|
||||||
US"Hanunoo",
|
US"Sm", US"Mathematical symbol",
|
||||||
US"Hebrew",
|
US"So", US"Other symbol",
|
||||||
US"Hiragana",
|
US"Zl", US"Line separator",
|
||||||
US"Inherited",
|
US"Zp", US"Paragraph separator",
|
||||||
US"Kannada",
|
US"Zs", US"Space separator"
|
||||||
US"Katakana",
|
|
||||||
US"Kharoshthi",
|
|
||||||
US"Khmer",
|
|
||||||
US"Lao",
|
|
||||||
US"Latin",
|
|
||||||
US"Limbu",
|
|
||||||
US"Linear_B",
|
|
||||||
US"Malayalam",
|
|
||||||
US"Mongolian",
|
|
||||||
US"Myanmar",
|
|
||||||
US"New_Tai_Lue",
|
|
||||||
US"Ogham",
|
|
||||||
US"Old_Italic",
|
|
||||||
US"Old_Persian",
|
|
||||||
US"Oriya",
|
|
||||||
US"Osmanya",
|
|
||||||
US"Runic",
|
|
||||||
US"Shavian",
|
|
||||||
US"Sinhala",
|
|
||||||
US"Syloti_Nagri",
|
|
||||||
US"Syriac",
|
|
||||||
US"Tagalog",
|
|
||||||
US"Tagbanwa",
|
|
||||||
US"Tai_Le",
|
|
||||||
US"Tamil",
|
|
||||||
US"Telugu",
|
|
||||||
US"Thaana",
|
|
||||||
US"Thai",
|
|
||||||
US"Tibetan",
|
|
||||||
US"Tifinagh",
|
|
||||||
US"Ugaritic",
|
|
||||||
US"Yi",
|
|
||||||
/* New for Unicode 5.0: */
|
|
||||||
US"Balinese",
|
|
||||||
US"Cuneiform",
|
|
||||||
US"Nko",
|
|
||||||
US"Phags_Pa",
|
|
||||||
US"Phoenician",
|
|
||||||
/* New for Unicode 5.1: */
|
|
||||||
US"Carian",
|
|
||||||
US"Cham",
|
|
||||||
US"Kayah_Li",
|
|
||||||
US"Lepcha",
|
|
||||||
US"Lycian",
|
|
||||||
US"Lydian",
|
|
||||||
US"Ol_Chiki",
|
|
||||||
US"Rejang",
|
|
||||||
US"Saurashtra",
|
|
||||||
US"Sundanese",
|
|
||||||
US"Vai",
|
|
||||||
/* New for Unicode 5.2: */
|
|
||||||
US"Avestan",
|
|
||||||
US"Bamum",
|
|
||||||
US"Egyptian_Hieroglyphs",
|
|
||||||
US"Imperial_Aramaic",
|
|
||||||
US"Inscriptional_Pahlavi",
|
|
||||||
US"Inscriptional_Parthian",
|
|
||||||
US"Javanese",
|
|
||||||
US"Kaithi",
|
|
||||||
US"Lisu",
|
|
||||||
US"Meetei_Mayek",
|
|
||||||
US"Old_South_Arabian",
|
|
||||||
US"Old_Turkic",
|
|
||||||
US"Samaritan",
|
|
||||||
US"Tai_Tham",
|
|
||||||
US"Tai_Viet",
|
|
||||||
/* New for Unicode 6.0.0 */
|
|
||||||
US"Batak",
|
|
||||||
US"Brahmi",
|
|
||||||
US"Mandaic",
|
|
||||||
/* New for Unicode 6.1.0 */
|
|
||||||
US"Chakma",
|
|
||||||
US"Meroitic_Cursive",
|
|
||||||
US"Meroitic_Hieroglyphs",
|
|
||||||
US"Miao",
|
|
||||||
US"Sharada",
|
|
||||||
US"Sora Sompent",
|
|
||||||
US"Takri",
|
|
||||||
/* New for Unicode 7.0.0 */
|
|
||||||
US"Bassa_Vah",
|
|
||||||
US"Caucasian_Albanian",
|
|
||||||
US"Duployan",
|
|
||||||
US"Elbasan",
|
|
||||||
US"Grantha",
|
|
||||||
US"Khojki",
|
|
||||||
US"Khudawadi",
|
|
||||||
US"Linear_A",
|
|
||||||
US"Mahajani",
|
|
||||||
US"Manichaean",
|
|
||||||
US"Mende_Kikakui",
|
|
||||||
US"Modi",
|
|
||||||
US"Mro",
|
|
||||||
US"Nabataean",
|
|
||||||
US"Old_North_Arabian",
|
|
||||||
US"Old_Permic",
|
|
||||||
US"Pahawh_Hmong",
|
|
||||||
US"Palmyrene",
|
|
||||||
US"Psalter_Pahlavi",
|
|
||||||
US"Pau_Cin_Hau",
|
|
||||||
US"Siddham",
|
|
||||||
US"Tirhuta",
|
|
||||||
US"Warang_Citi",
|
|
||||||
/* New for Unicode 8.0.0 */
|
|
||||||
US"Ahom",
|
|
||||||
US"Anatolian_Hieroglyphs",
|
|
||||||
US"Hatran",
|
|
||||||
US"Multani",
|
|
||||||
US"Old_Hungarian",
|
|
||||||
US"SignWriting",
|
|
||||||
/* New for Unicode 10.0.0 (no update since 8.0.0) */
|
|
||||||
US"Adlam",
|
|
||||||
US"Bhaiksuki",
|
|
||||||
US"Marchen",
|
|
||||||
US"Newa",
|
|
||||||
US"Osage",
|
|
||||||
US"Tangut",
|
|
||||||
US"Masaram_Gondi",
|
|
||||||
US"Nushu",
|
|
||||||
US"Soyombo",
|
|
||||||
US"Zanabazar_Square",
|
|
||||||
/* New for Unicode 11.0.0 */
|
|
||||||
US"Dogra",
|
|
||||||
US"Gunjala_Gondi",
|
|
||||||
US"Hanifi_Rohingya",
|
|
||||||
US"Makasar",
|
|
||||||
US"Medefaidrin",
|
|
||||||
US"Old_Sogdian",
|
|
||||||
US"Sogdian",
|
|
||||||
/* New for Unicode 12.0.0 */
|
|
||||||
US"Elymaic",
|
|
||||||
US"Nandinagari",
|
|
||||||
US"Nyiakeng_Puachue_Hmong",
|
|
||||||
US"Wancho",
|
|
||||||
/* New for Unicode 13.0.0 */
|
|
||||||
US"Chorasmian",
|
|
||||||
US"Dives_Akuru",
|
|
||||||
US"Khitan_Small_Script",
|
|
||||||
US"Yezidi"
|
|
||||||
};
|
};
|
||||||
|
|
||||||
const unsigned char *type_names[] = {
|
static const unsigned char *gb_names[] = {
|
||||||
US"Cc",
|
US"CR", US"carriage return",
|
||||||
US"Cf",
|
US"LF", US"linefeed",
|
||||||
US"Cn",
|
US"Control", US"",
|
||||||
US"Co",
|
US"Extend", US"",
|
||||||
US"Cs",
|
US"Prepend", US"",
|
||||||
US"Ll",
|
US"SpacingMark", US"",
|
||||||
US"Lm",
|
US"L", US"Hangul syllable type L",
|
||||||
US"Lo",
|
US"V", US"Hangul syllable type V",
|
||||||
US"Lt",
|
US"T", US"Hangul syllable type T",
|
||||||
US"Lu",
|
US"LV", US"Hangul syllable type LV",
|
||||||
US"Mc",
|
US"LVT", US"Hangul syllable type LVT",
|
||||||
US"Me",
|
US"RegionalIndicator", US"",
|
||||||
US"Mn",
|
US"Other", US"",
|
||||||
US"Nd",
|
US"ZWJ", US"zero width joiner",
|
||||||
US"Nl",
|
US"Extended_Pictographic", US""
|
||||||
US"No",
|
|
||||||
US"Pc",
|
|
||||||
US"Pd",
|
|
||||||
US"Pe",
|
|
||||||
US"Pf",
|
|
||||||
US"Pi",
|
|
||||||
US"Po",
|
|
||||||
US"Ps",
|
|
||||||
US"Sc",
|
|
||||||
US"Sk",
|
|
||||||
US"Sm",
|
|
||||||
US"So",
|
|
||||||
US"Zl",
|
|
||||||
US"Zp",
|
|
||||||
US"Zs"
|
|
||||||
};
|
};
|
||||||
|
|
||||||
const unsigned char *gb_names[] = {
|
|
||||||
US"CR",
|
static const unsigned int utf8_table1[] = {
|
||||||
US"LF",
|
0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
|
||||||
US"Control",
|
|
||||||
US"Extend",
|
static const int utf8_table2[] = {
|
||||||
US"Prepend",
|
0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
|
||||||
US"SpacingMark",
|
|
||||||
US"L",
|
|
||||||
US"V",
|
/*************************************************
|
||||||
US"T",
|
* Convert character value to UTF-8 *
|
||||||
US"LV",
|
*************************************************/
|
||||||
US"LVT",
|
|
||||||
US"RegionalIndicator",
|
/* This function takes an unsigned integer value in the range 0 -
|
||||||
US"Other",
|
0x7fffffff and encodes it as a UTF-8 character in 1 to 6 bytes.
|
||||||
US"ZWJ",
|
|
||||||
US"Extended_Pictographic"
|
Arguments:
|
||||||
};
|
cvalue the character value
|
||||||
|
buffer pointer to buffer for result - at least 6 bytes long
|
||||||
|
|
||||||
|
Returns: number of bytes placed in the buffer
|
||||||
|
0 if input code point is too big
|
||||||
|
*/
|
||||||
|
|
||||||
|
static size_t
|
||||||
|
ord2utf8(unsigned int cvalue, unsigned char *buffer)
|
||||||
|
{
|
||||||
|
size_t i, j;
|
||||||
|
for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
|
||||||
|
if (cvalue <= utf8_table1[i]) break;
|
||||||
|
if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
|
||||||
|
buffer += i;
|
||||||
|
for (j = i; j > 0; j--)
|
||||||
|
{
|
||||||
|
*buffer-- = 0x80 | (cvalue & 0x3f);
|
||||||
|
cvalue >>= 6;
|
||||||
|
}
|
||||||
|
*buffer = utf8_table2[i] | cvalue;
|
||||||
|
return i + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*************************************************
|
/*************************************************
|
||||||
|
@ -330,28 +223,47 @@ return isatty(fileno(stdin));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*************************************************
|
||||||
|
* Get script name from ucp ident *
|
||||||
|
*************************************************/
|
||||||
|
|
||||||
|
static const char *
|
||||||
|
get_scriptname(int script)
|
||||||
|
{
|
||||||
|
size_t i;
|
||||||
|
const ucp_type_table *u;
|
||||||
|
|
||||||
|
for (i = 0; i < PRIV(utt_size); i++)
|
||||||
|
{
|
||||||
|
u = PRIV(utt) + i;
|
||||||
|
if (u->type == PT_SC && u->value == script) break;
|
||||||
|
}
|
||||||
|
if (i < PRIV(utt_size))
|
||||||
|
return PRIV(utt_names) + u->name_offset;
|
||||||
|
|
||||||
|
return "??";
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*************************************************
|
/*************************************************
|
||||||
* Print Unicode property info for a char *
|
* Print Unicode property info for a char *
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
|
||||||
static void
|
static void
|
||||||
print_prop(int c)
|
print_prop(unsigned int c, BOOL is_just_one)
|
||||||
{
|
{
|
||||||
int type = UCD_CATEGORY(c);
|
int type = UCD_CATEGORY(c);
|
||||||
int fulltype = UCD_CHARTYPE(c);
|
int fulltype = UCD_CHARTYPE(c);
|
||||||
int script = UCD_SCRIPT(c);
|
int script = UCD_SCRIPT(c);
|
||||||
int scriptx = UCD_SCRIPTX(c);
|
int scriptx = UCD_SCRIPTX(c);
|
||||||
int gbprop = UCD_GRAPHBREAK(c);
|
int gbprop = UCD_GRAPHBREAK(c);
|
||||||
int othercase = UCD_OTHERCASE(c);
|
unsigned int othercase = UCD_OTHERCASE(c);
|
||||||
int caseset = UCD_CASESET(c);
|
int caseset = UCD_CASESET(c);
|
||||||
|
|
||||||
const unsigned char *fulltypename = US"??";
|
const unsigned char *fulltypename = US"??";
|
||||||
const unsigned char *typename = US"??";
|
const unsigned char *typename = US"??";
|
||||||
const unsigned char *scriptname = US"??";
|
|
||||||
const unsigned char *graphbreak = US"??";
|
const unsigned char *graphbreak = US"??";
|
||||||
|
const unsigned char *scriptname = CUS get_scriptname(script);
|
||||||
if (script < sizeof(script_names)/sizeof(char *))
|
|
||||||
scriptname = script_names[script];
|
|
||||||
|
|
||||||
switch (type)
|
switch (type)
|
||||||
{
|
{
|
||||||
|
@ -420,15 +332,18 @@ switch(gbprop)
|
||||||
default: graphbreak = US"Unknown"; break;
|
default: graphbreak = US"Unknown"; break;
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("%04x %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak);
|
printf("U+%04X %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak);
|
||||||
if (othercase != c)
|
if (is_just_one && othercase != c)
|
||||||
{
|
{
|
||||||
printf(", %04x", othercase);
|
printf(", U+%04X", othercase);
|
||||||
if (caseset != 0)
|
if (caseset != 0)
|
||||||
{
|
{
|
||||||
const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1;
|
const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1;
|
||||||
while (*(++p) < NOTACHAR)
|
while (*(++p) < NOTACHAR)
|
||||||
if (*p != othercase && *p != c) printf(", %04x", *p);
|
{
|
||||||
|
unsigned int d = *p;
|
||||||
|
if (d != othercase && d != c) printf(", U+%04X", d);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -436,26 +351,27 @@ if (scriptx != script)
|
||||||
{
|
{
|
||||||
printf(", [");
|
printf(", [");
|
||||||
if (scriptx >= 0)
|
if (scriptx >= 0)
|
||||||
{
|
printf("%s", get_scriptname(scriptx));
|
||||||
scriptname = (scriptx >= sizeof(script_names)/sizeof(char *))?
|
|
||||||
US"??" : script_names[scriptx];
|
|
||||||
printf("%s", scriptname);
|
|
||||||
}
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
char *sep = "";
|
const char *sep = "";
|
||||||
const uint8_t *p = PRIV(ucd_script_sets) - scriptx;
|
const uint8_t *p = PRIV(ucd_script_sets) - scriptx;
|
||||||
while (*p != 0)
|
while (*p != 0)
|
||||||
{
|
{
|
||||||
scriptname = (*p >= sizeof(script_names)/sizeof(char *))?
|
printf("%s%s", sep, get_scriptname(*p++));
|
||||||
US"??" : script_names[*p++];
|
|
||||||
printf("%s%s", sep, scriptname);
|
|
||||||
sep = ", ";
|
sep = ", ";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
printf("]");
|
printf("]");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (show_character && is_just_one)
|
||||||
|
{
|
||||||
|
unsigned char buffer[8];
|
||||||
|
size_t len = ord2utf8(c, buffer);
|
||||||
|
printf(", >%.*s<", (int)len, buffer);
|
||||||
|
}
|
||||||
|
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -483,7 +399,7 @@ BOOL type_not = FALSE;
|
||||||
BOOL gbreak_not = FALSE;
|
BOOL gbreak_not = FALSE;
|
||||||
BOOL hadrange = FALSE;
|
BOOL hadrange = FALSE;
|
||||||
const ucd_record *ucd, *next_ucd;
|
const ucd_record *ucd, *next_ucd;
|
||||||
const char *pad = " ";
|
const char *pad = " ";
|
||||||
|
|
||||||
while (*s != 0)
|
while (*s != 0)
|
||||||
{
|
{
|
||||||
|
@ -508,17 +424,20 @@ while (*s != 0)
|
||||||
offset = 1;
|
offset = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i = 0; i < sizeof(script_names)/sizeof(char *); i++)
|
for (i = 0; i < PRIV(utt_size); i++)
|
||||||
{
|
{
|
||||||
if (strcmp(CS value + offset, script_names[i]) == 0)
|
const ucp_type_table *u = PRIV(utt) + i;
|
||||||
|
if (u->type == PT_SC && strcmp(CS(value + offset),
|
||||||
|
PRIV(utt_names) + u->name_offset) == 0)
|
||||||
{
|
{
|
||||||
|
c = u->value;
|
||||||
if (name[6] == 'x')
|
if (name[6] == 'x')
|
||||||
{
|
{
|
||||||
scriptx_list[scriptx_count++] = scriptx_not? (-i):i;
|
scriptx_list[scriptx_count++] = scriptx_not? (-c):c;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (script < 0) script = i; else
|
if (script < 0) script = c; else
|
||||||
{
|
{
|
||||||
printf("** Only 1 script value allowed\n");
|
printf("** Only 1 script value allowed\n");
|
||||||
return;
|
return;
|
||||||
|
@ -528,9 +447,9 @@ while (*s != 0)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i >= sizeof(script_names)/sizeof(char *))
|
if (i >= PRIV(utt_size))
|
||||||
{
|
{
|
||||||
printf("** Unrecognized script name '%s'\n", value);
|
printf("** Unrecognized script name \"%s\"\n", value);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -550,17 +469,17 @@ while (*s != 0)
|
||||||
offset = 1;
|
offset = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i = 0; i < sizeof(type_names)/sizeof(char *); i++)
|
for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
|
||||||
{
|
{
|
||||||
if (strcmp(CS (value + offset), type_names[i]) == 0)
|
if (strcmp(CS (value + offset), CS type_names[i]) == 0)
|
||||||
{
|
{
|
||||||
type = i;
|
type = i/2;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (i >= sizeof(type_names)/sizeof(char *))
|
if (i >= sizeof(type_names)/sizeof(char *))
|
||||||
{
|
{
|
||||||
printf("** Unrecognized type name '%s'\n", value);
|
printf("** Unrecognized type name \"%s\"\n", value);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -581,17 +500,17 @@ while (*s != 0)
|
||||||
offset = 1;
|
offset = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i = 0; i < sizeof(gb_names)/sizeof(char *); i++)
|
for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
|
||||||
{
|
{
|
||||||
if (strcmp(CS (value + offset), gb_names[i]) == 0)
|
if (strcmp(CS (value + offset), CS gb_names[i]) == 0)
|
||||||
{
|
{
|
||||||
gbreak = i;
|
gbreak = i/2;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (i >= sizeof(gb_names)/sizeof(char *))
|
if (i >= sizeof(gb_names)/sizeof(char *))
|
||||||
{
|
{
|
||||||
printf("** Unrecognized gbreak name '%s'\n", value);
|
printf("** Unrecognized gbreak name \"%s\"\n", value);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -599,7 +518,7 @@ while (*s != 0)
|
||||||
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
printf("** Unrecognized property name '%s'\n", name);
|
printf("** Unrecognized property name \"%s\"\n", name);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -617,7 +536,7 @@ for (c = 0; c <= 0x10ffff; c++)
|
||||||
if (scriptx_count > 0)
|
if (scriptx_count > 0)
|
||||||
{
|
{
|
||||||
const uint8_t *char_scriptx = NULL;
|
const uint8_t *char_scriptx = NULL;
|
||||||
int found = 0;
|
unsigned int found = 0;
|
||||||
int scriptx = UCD_SCRIPTX(c);
|
int scriptx = UCD_SCRIPTX(c);
|
||||||
|
|
||||||
if (scriptx < 0) char_scriptx = PRIV(ucd_script_sets) - scriptx;
|
if (scriptx < 0) char_scriptx = PRIV(ucd_script_sets) - scriptx;
|
||||||
|
@ -701,13 +620,13 @@ for (c = 0; c <= 0x10ffff; c++)
|
||||||
|
|
||||||
if (--i > c)
|
if (--i > c)
|
||||||
{
|
{
|
||||||
printf("%04x..", c);
|
printf("U+%04X..", c);
|
||||||
c = i;
|
c = i;
|
||||||
hadrange = TRUE;
|
hadrange = TRUE;
|
||||||
}
|
}
|
||||||
else if (hadrange) printf("%s", pad);
|
else if (hadrange) printf("%s", pad);
|
||||||
|
|
||||||
print_prop(c);
|
print_prop(c, FALSE);
|
||||||
if (c >= 0x100000) pad = " ";
|
if (c >= 0x100000) pad = " ";
|
||||||
else if (c >= 0x10000) pad = " ";
|
else if (c >= 0x10000) pad = " ";
|
||||||
count++;
|
count++;
|
||||||
|
@ -722,6 +641,101 @@ if (count == 0) printf("No characters found\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*************************************************
|
||||||
|
* Process command line *
|
||||||
|
*************************************************/
|
||||||
|
|
||||||
|
static void
|
||||||
|
process_command_line(unsigned char *buffer)
|
||||||
|
{
|
||||||
|
unsigned char *s, *t;
|
||||||
|
unsigned char name[24];
|
||||||
|
|
||||||
|
s = buffer;
|
||||||
|
while (isspace(*s)) s++;
|
||||||
|
if (*s == 0) return;
|
||||||
|
|
||||||
|
for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
|
||||||
|
*t = 0;
|
||||||
|
while (isspace(*s)) s++;
|
||||||
|
|
||||||
|
if (strcmp(CS name, "findprop") == 0)
|
||||||
|
{
|
||||||
|
while (*s != 0)
|
||||||
|
{
|
||||||
|
unsigned int c;
|
||||||
|
unsigned char *endptr;
|
||||||
|
t = s;
|
||||||
|
if (strncmp(CS t, "U+", 2) == 0) t += 2;
|
||||||
|
c = strtoul(CS t, CSS(&endptr), 16);
|
||||||
|
if (*endptr != 0 && !isspace(*endptr))
|
||||||
|
{
|
||||||
|
while (*endptr != 0 && !isspace(*endptr)) endptr++;
|
||||||
|
printf("** Invalid hex number: ignored \"%.*s\"\n", (int)(endptr-s), s);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (c > 0x10ffff)
|
||||||
|
printf("** U+%x is too big for a Unicode code point\n", c);
|
||||||
|
else
|
||||||
|
print_prop(c, TRUE);
|
||||||
|
}
|
||||||
|
s = endptr;
|
||||||
|
while (isspace(*s)) s++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
else if (strcmp(CS name, "find") == 0)
|
||||||
|
{
|
||||||
|
find_chars(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
else if (strcmp(CS name, "list") == 0)
|
||||||
|
{
|
||||||
|
while (*s != 0)
|
||||||
|
{
|
||||||
|
size_t i;
|
||||||
|
for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
|
||||||
|
*t = 0;
|
||||||
|
while (isspace(*s)) s++;
|
||||||
|
|
||||||
|
if (strcmp(CS name, "script") == 0 || strcmp(CS name, "scripts") == 0)
|
||||||
|
{
|
||||||
|
for (i = 0; i < PRIV(utt_size); i++)
|
||||||
|
if (PRIV(utt)[i].type == PT_SC)
|
||||||
|
printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
|
||||||
|
}
|
||||||
|
|
||||||
|
else if (strcmp(CS name, "type") == 0 || strcmp(CS name, "types") == 0)
|
||||||
|
{
|
||||||
|
for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
|
||||||
|
printf("%s %s\n", type_names[i], type_names[i+1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
else if (strcmp(CS name, "gbreak") == 0 || strcmp(CS name, "gbreaks") == 0)
|
||||||
|
{
|
||||||
|
for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
|
||||||
|
{
|
||||||
|
if (gb_names[i+1][0] != 0)
|
||||||
|
printf("%-3s (%s)\n", gb_names[i], gb_names[i+1]);
|
||||||
|
else
|
||||||
|
printf("%s\n", gb_names[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
else
|
||||||
|
{
|
||||||
|
printf("** Unknown property \"%s\"\n", name);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
else printf("** Unknown test command \"%s\"\n", name);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*************************************************
|
/*************************************************
|
||||||
* Main program *
|
* Main program *
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
@ -730,19 +744,42 @@ int
|
||||||
main(int argc, char **argv)
|
main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
BOOL interactive;
|
BOOL interactive;
|
||||||
|
int first_arg = 1;
|
||||||
unsigned char buffer[1024];
|
unsigned char buffer[1024];
|
||||||
|
|
||||||
if (argc > 1)
|
if (argc > 1 && strcmp(argv[1], "-s") == 0)
|
||||||
|
{
|
||||||
|
show_character = TRUE;
|
||||||
|
first_arg++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argc > first_arg)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
for (i = 1; i < argc; i++)
|
BOOL hexfirst = TRUE;
|
||||||
|
char *arg = argv[first_arg];
|
||||||
|
unsigned char *s = buffer;
|
||||||
|
|
||||||
|
if (strncmp(arg, "U+", 2) != 0 && !isdigit(*arg))
|
||||||
{
|
{
|
||||||
unsigned char *endptr;
|
while (*arg != 0)
|
||||||
int c = strtoul(argv[i], CSS(&endptr), 16);
|
{
|
||||||
if (*endptr != 0)
|
if (!isxdigit(*arg++)) { hexfirst = FALSE; break; }
|
||||||
printf("** Hex number expected; ignored '%s'\n", argv[i]);
|
}
|
||||||
else print_prop(c);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (hexfirst)
|
||||||
|
{
|
||||||
|
strcpy(CS s, "findprop ");
|
||||||
|
s += 9;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = first_arg; i < argc; i++)
|
||||||
|
{
|
||||||
|
s += sprintf(CS s, "%s ", argv[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
process_command_line(buffer);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -754,17 +791,14 @@ if (interactive) using_history();
|
||||||
|
|
||||||
for(;;)
|
for(;;)
|
||||||
{
|
{
|
||||||
unsigned char name[24];
|
|
||||||
unsigned char *s, *t;
|
|
||||||
|
|
||||||
#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
|
#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
|
||||||
if (interactive)
|
if (interactive)
|
||||||
{
|
{
|
||||||
size_t len;
|
size_t len;
|
||||||
s = readline("> ");
|
unsigned char *s = US readline("> ");
|
||||||
if (s == NULL) break;
|
if (s == NULL) break;
|
||||||
len = strlen(s);
|
len = strlen(CS s);
|
||||||
if (len > 0) add_history(s);
|
if (len > 0) add_history(CS s);
|
||||||
memcpy(buffer, s, len);
|
memcpy(buffer, s, len);
|
||||||
buffer[len] = '\n';
|
buffer[len] = '\n';
|
||||||
buffer[len+1] = 0;
|
buffer[len+1] = 0;
|
||||||
|
@ -779,38 +813,7 @@ for(;;)
|
||||||
if (!interactive) printf("%s", buffer);
|
if (!interactive) printf("%s", buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
s = buffer;
|
process_command_line(buffer);
|
||||||
while (isspace(*s)) s++;
|
|
||||||
if (*s == 0) continue;
|
|
||||||
|
|
||||||
for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
|
|
||||||
*t = 0;
|
|
||||||
while (isspace(*s)) s++;
|
|
||||||
|
|
||||||
if (strcmp(CS name, "findprop") == 0)
|
|
||||||
{
|
|
||||||
while (*s != 0)
|
|
||||||
{
|
|
||||||
unsigned char *endptr;
|
|
||||||
int c = strtoul(CS s, CSS(&endptr), 16);
|
|
||||||
|
|
||||||
if (*endptr != 0 && !isspace(*endptr))
|
|
||||||
{
|
|
||||||
while (*endptr != 0 && !isspace(*endptr)) endptr++;
|
|
||||||
printf("** Hex number expected; ignored '%.*s'\n", endptr-s, s);
|
|
||||||
}
|
|
||||||
else print_prop(c);
|
|
||||||
s = endptr;
|
|
||||||
while (isspace(*s)) s++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
else if (strcmp(CS name, "find") == 0)
|
|
||||||
{
|
|
||||||
find_chars(s);
|
|
||||||
}
|
|
||||||
|
|
||||||
else printf("** Unknown test command %s\n", name);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (interactive) printf("\n");
|
if (interactive) printf("\n");
|
||||||
|
|
|
@ -45,4 +45,4 @@ findprop 32ff
|
||||||
|
|
||||||
findprop 1f16d
|
findprop 1f16d
|
||||||
|
|
||||||
findprop 10e93 10eaa
|
findprop U+10e93 U+10eaa
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
find script Han
|
||||||
|
find type Pe script Common scriptx Hangul
|
||||||
|
find type Sk
|
||||||
|
find type Pd
|
||||||
|
find gbreak LVT
|
|
@ -1,398 +1,398 @@
|
||||||
findprop 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
|
findprop 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
|
||||||
0000 Control: Control, Common, Control
|
U+0000 Control: Control, Common, Control
|
||||||
0001 Control: Control, Common, Control
|
U+0001 Control: Control, Common, Control
|
||||||
0002 Control: Control, Common, Control
|
U+0002 Control: Control, Common, Control
|
||||||
0003 Control: Control, Common, Control
|
U+0003 Control: Control, Common, Control
|
||||||
0004 Control: Control, Common, Control
|
U+0004 Control: Control, Common, Control
|
||||||
0005 Control: Control, Common, Control
|
U+0005 Control: Control, Common, Control
|
||||||
0006 Control: Control, Common, Control
|
U+0006 Control: Control, Common, Control
|
||||||
0007 Control: Control, Common, Control
|
U+0007 Control: Control, Common, Control
|
||||||
0008 Control: Control, Common, Control
|
U+0008 Control: Control, Common, Control
|
||||||
0009 Control: Control, Common, Control
|
U+0009 Control: Control, Common, Control
|
||||||
000a Control: Control, Common, LF
|
U+000A Control: Control, Common, LF
|
||||||
000b Control: Control, Common, Control
|
U+000B Control: Control, Common, Control
|
||||||
000c Control: Control, Common, Control
|
U+000C Control: Control, Common, Control
|
||||||
000d Control: Control, Common, CR
|
U+000D Control: Control, Common, CR
|
||||||
000e Control: Control, Common, Control
|
U+000E Control: Control, Common, Control
|
||||||
000f Control: Control, Common, Control
|
U+000F Control: Control, Common, Control
|
||||||
findprop 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
|
findprop 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
|
||||||
0010 Control: Control, Common, Control
|
U+0010 Control: Control, Common, Control
|
||||||
0011 Control: Control, Common, Control
|
U+0011 Control: Control, Common, Control
|
||||||
0012 Control: Control, Common, Control
|
U+0012 Control: Control, Common, Control
|
||||||
0013 Control: Control, Common, Control
|
U+0013 Control: Control, Common, Control
|
||||||
0014 Control: Control, Common, Control
|
U+0014 Control: Control, Common, Control
|
||||||
0015 Control: Control, Common, Control
|
U+0015 Control: Control, Common, Control
|
||||||
0016 Control: Control, Common, Control
|
U+0016 Control: Control, Common, Control
|
||||||
0017 Control: Control, Common, Control
|
U+0017 Control: Control, Common, Control
|
||||||
0018 Control: Control, Common, Control
|
U+0018 Control: Control, Common, Control
|
||||||
0019 Control: Control, Common, Control
|
U+0019 Control: Control, Common, Control
|
||||||
001a Control: Control, Common, Control
|
U+001A Control: Control, Common, Control
|
||||||
001b Control: Control, Common, Control
|
U+001B Control: Control, Common, Control
|
||||||
001c Control: Control, Common, Control
|
U+001C Control: Control, Common, Control
|
||||||
001d Control: Control, Common, Control
|
U+001D Control: Control, Common, Control
|
||||||
001e Control: Control, Common, Control
|
U+001E Control: Control, Common, Control
|
||||||
001f Control: Control, Common, Control
|
U+001F Control: Control, Common, Control
|
||||||
findprop 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
|
findprop 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
|
||||||
0020 Separator: Space separator, Common, Other
|
U+0020 Separator: Space separator, Common, Other
|
||||||
0021 Punctuation: Other punctuation, Common, Other
|
U+0021 Punctuation: Other punctuation, Common, Other
|
||||||
0022 Punctuation: Other punctuation, Common, Other
|
U+0022 Punctuation: Other punctuation, Common, Other
|
||||||
0023 Punctuation: Other punctuation, Common, Other
|
U+0023 Punctuation: Other punctuation, Common, Other
|
||||||
0024 Symbol: Currency symbol, Common, Other
|
U+0024 Symbol: Currency symbol, Common, Other
|
||||||
0025 Punctuation: Other punctuation, Common, Other
|
U+0025 Punctuation: Other punctuation, Common, Other
|
||||||
0026 Punctuation: Other punctuation, Common, Other
|
U+0026 Punctuation: Other punctuation, Common, Other
|
||||||
0027 Punctuation: Other punctuation, Common, Other
|
U+0027 Punctuation: Other punctuation, Common, Other
|
||||||
0028 Punctuation: Open punctuation, Common, Other
|
U+0028 Punctuation: Open punctuation, Common, Other
|
||||||
0029 Punctuation: Close punctuation, Common, Other
|
U+0029 Punctuation: Close punctuation, Common, Other
|
||||||
002a Punctuation: Other punctuation, Common, Other
|
U+002A Punctuation: Other punctuation, Common, Other
|
||||||
002b Symbol: Mathematical symbol, Common, Other
|
U+002B Symbol: Mathematical symbol, Common, Other
|
||||||
002c Punctuation: Other punctuation, Common, Other
|
U+002C Punctuation: Other punctuation, Common, Other
|
||||||
002d Punctuation: Dash punctuation, Common, Other
|
U+002D Punctuation: Dash punctuation, Common, Other
|
||||||
002e Punctuation: Other punctuation, Common, Other
|
U+002E Punctuation: Other punctuation, Common, Other
|
||||||
002f Punctuation: Other punctuation, Common, Other
|
U+002F Punctuation: Other punctuation, Common, Other
|
||||||
findprop 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
|
findprop 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
|
||||||
0030 Number: Decimal number, Common, Other
|
U+0030 Number: Decimal number, Common, Other
|
||||||
0031 Number: Decimal number, Common, Other
|
U+0031 Number: Decimal number, Common, Other
|
||||||
0032 Number: Decimal number, Common, Other
|
U+0032 Number: Decimal number, Common, Other
|
||||||
0033 Number: Decimal number, Common, Other
|
U+0033 Number: Decimal number, Common, Other
|
||||||
0034 Number: Decimal number, Common, Other
|
U+0034 Number: Decimal number, Common, Other
|
||||||
0035 Number: Decimal number, Common, Other
|
U+0035 Number: Decimal number, Common, Other
|
||||||
0036 Number: Decimal number, Common, Other
|
U+0036 Number: Decimal number, Common, Other
|
||||||
0037 Number: Decimal number, Common, Other
|
U+0037 Number: Decimal number, Common, Other
|
||||||
0038 Number: Decimal number, Common, Other
|
U+0038 Number: Decimal number, Common, Other
|
||||||
0039 Number: Decimal number, Common, Other
|
U+0039 Number: Decimal number, Common, Other
|
||||||
003a Punctuation: Other punctuation, Common, Other
|
U+003A Punctuation: Other punctuation, Common, Other
|
||||||
003b Punctuation: Other punctuation, Common, Other
|
U+003B Punctuation: Other punctuation, Common, Other
|
||||||
003c Symbol: Mathematical symbol, Common, Other
|
U+003C Symbol: Mathematical symbol, Common, Other
|
||||||
003d Symbol: Mathematical symbol, Common, Other
|
U+003D Symbol: Mathematical symbol, Common, Other
|
||||||
003e Symbol: Mathematical symbol, Common, Other
|
U+003E Symbol: Mathematical symbol, Common, Other
|
||||||
003f Punctuation: Other punctuation, Common, Other
|
U+003F Punctuation: Other punctuation, Common, Other
|
||||||
findprop 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f
|
findprop 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f
|
||||||
0040 Punctuation: Other punctuation, Common, Other
|
U+0040 Punctuation: Other punctuation, Common, Other
|
||||||
0041 Letter: Upper case letter, Latin, Other, 0061
|
U+0041 Letter: Upper case letter, Latin, Other, U+0061
|
||||||
0042 Letter: Upper case letter, Latin, Other, 0062
|
U+0042 Letter: Upper case letter, Latin, Other, U+0062
|
||||||
0043 Letter: Upper case letter, Latin, Other, 0063
|
U+0043 Letter: Upper case letter, Latin, Other, U+0063
|
||||||
0044 Letter: Upper case letter, Latin, Other, 0064
|
U+0044 Letter: Upper case letter, Latin, Other, U+0064
|
||||||
0045 Letter: Upper case letter, Latin, Other, 0065
|
U+0045 Letter: Upper case letter, Latin, Other, U+0065
|
||||||
0046 Letter: Upper case letter, Latin, Other, 0066
|
U+0046 Letter: Upper case letter, Latin, Other, U+0066
|
||||||
0047 Letter: Upper case letter, Latin, Other, 0067
|
U+0047 Letter: Upper case letter, Latin, Other, U+0067
|
||||||
0048 Letter: Upper case letter, Latin, Other, 0068
|
U+0048 Letter: Upper case letter, Latin, Other, U+0068
|
||||||
0049 Letter: Upper case letter, Latin, Other, 0069
|
U+0049 Letter: Upper case letter, Latin, Other, U+0069
|
||||||
004a Letter: Upper case letter, Latin, Other, 006a
|
U+004A Letter: Upper case letter, Latin, Other, U+006A
|
||||||
004b Letter: Upper case letter, Latin, Other, 006b, 212a
|
U+004B Letter: Upper case letter, Latin, Other, U+006B, U+212A
|
||||||
004c Letter: Upper case letter, Latin, Other, 006c
|
U+004C Letter: Upper case letter, Latin, Other, U+006C
|
||||||
004d Letter: Upper case letter, Latin, Other, 006d
|
U+004D Letter: Upper case letter, Latin, Other, U+006D
|
||||||
004e Letter: Upper case letter, Latin, Other, 006e
|
U+004E Letter: Upper case letter, Latin, Other, U+006E
|
||||||
004f Letter: Upper case letter, Latin, Other, 006f
|
U+004F Letter: Upper case letter, Latin, Other, U+006F
|
||||||
findprop 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f
|
findprop 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f
|
||||||
0050 Letter: Upper case letter, Latin, Other, 0070
|
U+0050 Letter: Upper case letter, Latin, Other, U+0070
|
||||||
0051 Letter: Upper case letter, Latin, Other, 0071
|
U+0051 Letter: Upper case letter, Latin, Other, U+0071
|
||||||
0052 Letter: Upper case letter, Latin, Other, 0072
|
U+0052 Letter: Upper case letter, Latin, Other, U+0072
|
||||||
0053 Letter: Upper case letter, Latin, Other, 0073, 017f
|
U+0053 Letter: Upper case letter, Latin, Other, U+0073, U+017F
|
||||||
0054 Letter: Upper case letter, Latin, Other, 0074
|
U+0054 Letter: Upper case letter, Latin, Other, U+0074
|
||||||
0055 Letter: Upper case letter, Latin, Other, 0075
|
U+0055 Letter: Upper case letter, Latin, Other, U+0075
|
||||||
0056 Letter: Upper case letter, Latin, Other, 0076
|
U+0056 Letter: Upper case letter, Latin, Other, U+0076
|
||||||
0057 Letter: Upper case letter, Latin, Other, 0077
|
U+0057 Letter: Upper case letter, Latin, Other, U+0077
|
||||||
0058 Letter: Upper case letter, Latin, Other, 0078
|
U+0058 Letter: Upper case letter, Latin, Other, U+0078
|
||||||
0059 Letter: Upper case letter, Latin, Other, 0079
|
U+0059 Letter: Upper case letter, Latin, Other, U+0079
|
||||||
005a Letter: Upper case letter, Latin, Other, 007a
|
U+005A Letter: Upper case letter, Latin, Other, U+007A
|
||||||
005b Punctuation: Open punctuation, Common, Other
|
U+005B Punctuation: Open punctuation, Common, Other
|
||||||
005c Punctuation: Other punctuation, Common, Other
|
U+005C Punctuation: Other punctuation, Common, Other
|
||||||
005d Punctuation: Close punctuation, Common, Other
|
U+005D Punctuation: Close punctuation, Common, Other
|
||||||
005e Symbol: Modifier symbol, Common, Other
|
U+005E Symbol: Modifier symbol, Common, Other
|
||||||
005f Punctuation: Connector punctuation, Common, Other
|
U+005F Punctuation: Connector punctuation, Common, Other
|
||||||
findprop 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f
|
findprop 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f
|
||||||
0060 Symbol: Modifier symbol, Common, Other
|
U+0060 Symbol: Modifier symbol, Common, Other
|
||||||
0061 Letter: Lower case letter, Latin, Other, 0041
|
U+0061 Letter: Lower case letter, Latin, Other, U+0041
|
||||||
0062 Letter: Lower case letter, Latin, Other, 0042
|
U+0062 Letter: Lower case letter, Latin, Other, U+0042
|
||||||
0063 Letter: Lower case letter, Latin, Other, 0043
|
U+0063 Letter: Lower case letter, Latin, Other, U+0043
|
||||||
0064 Letter: Lower case letter, Latin, Other, 0044
|
U+0064 Letter: Lower case letter, Latin, Other, U+0044
|
||||||
0065 Letter: Lower case letter, Latin, Other, 0045
|
U+0065 Letter: Lower case letter, Latin, Other, U+0045
|
||||||
0066 Letter: Lower case letter, Latin, Other, 0046
|
U+0066 Letter: Lower case letter, Latin, Other, U+0046
|
||||||
0067 Letter: Lower case letter, Latin, Other, 0047
|
U+0067 Letter: Lower case letter, Latin, Other, U+0047
|
||||||
0068 Letter: Lower case letter, Latin, Other, 0048
|
U+0068 Letter: Lower case letter, Latin, Other, U+0048
|
||||||
0069 Letter: Lower case letter, Latin, Other, 0049
|
U+0069 Letter: Lower case letter, Latin, Other, U+0049
|
||||||
006a Letter: Lower case letter, Latin, Other, 004a
|
U+006A Letter: Lower case letter, Latin, Other, U+004A
|
||||||
006b Letter: Lower case letter, Latin, Other, 004b, 212a
|
U+006B Letter: Lower case letter, Latin, Other, U+004B, U+212A
|
||||||
006c Letter: Lower case letter, Latin, Other, 004c
|
U+006C Letter: Lower case letter, Latin, Other, U+004C
|
||||||
006d Letter: Lower case letter, Latin, Other, 004d
|
U+006D Letter: Lower case letter, Latin, Other, U+004D
|
||||||
006e Letter: Lower case letter, Latin, Other, 004e
|
U+006E Letter: Lower case letter, Latin, Other, U+004E
|
||||||
006f Letter: Lower case letter, Latin, Other, 004f
|
U+006F Letter: Lower case letter, Latin, Other, U+004F
|
||||||
findprop 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f
|
findprop 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f
|
||||||
0070 Letter: Lower case letter, Latin, Other, 0050
|
U+0070 Letter: Lower case letter, Latin, Other, U+0050
|
||||||
0071 Letter: Lower case letter, Latin, Other, 0051
|
U+0071 Letter: Lower case letter, Latin, Other, U+0051
|
||||||
0072 Letter: Lower case letter, Latin, Other, 0052
|
U+0072 Letter: Lower case letter, Latin, Other, U+0052
|
||||||
0073 Letter: Lower case letter, Latin, Other, 0053, 017f
|
U+0073 Letter: Lower case letter, Latin, Other, U+0053, U+017F
|
||||||
0074 Letter: Lower case letter, Latin, Other, 0054
|
U+0074 Letter: Lower case letter, Latin, Other, U+0054
|
||||||
0075 Letter: Lower case letter, Latin, Other, 0055
|
U+0075 Letter: Lower case letter, Latin, Other, U+0055
|
||||||
0076 Letter: Lower case letter, Latin, Other, 0056
|
U+0076 Letter: Lower case letter, Latin, Other, U+0056
|
||||||
0077 Letter: Lower case letter, Latin, Other, 0057
|
U+0077 Letter: Lower case letter, Latin, Other, U+0057
|
||||||
0078 Letter: Lower case letter, Latin, Other, 0058
|
U+0078 Letter: Lower case letter, Latin, Other, U+0058
|
||||||
0079 Letter: Lower case letter, Latin, Other, 0059
|
U+0079 Letter: Lower case letter, Latin, Other, U+0059
|
||||||
007a Letter: Lower case letter, Latin, Other, 005a
|
U+007A Letter: Lower case letter, Latin, Other, U+005A
|
||||||
007b Punctuation: Open punctuation, Common, Other
|
U+007B Punctuation: Open punctuation, Common, Other
|
||||||
007c Symbol: Mathematical symbol, Common, Other
|
U+007C Symbol: Mathematical symbol, Common, Other
|
||||||
007d Punctuation: Close punctuation, Common, Other
|
U+007D Punctuation: Close punctuation, Common, Other
|
||||||
007e Symbol: Mathematical symbol, Common, Other
|
U+007E Symbol: Mathematical symbol, Common, Other
|
||||||
007f Control: Control, Common, Control
|
U+007F Control: Control, Common, Control
|
||||||
|
|
||||||
findprop 80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f
|
findprop 80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f
|
||||||
0080 Control: Control, Common, Control
|
U+0080 Control: Control, Common, Control
|
||||||
0081 Control: Control, Common, Control
|
U+0081 Control: Control, Common, Control
|
||||||
0082 Control: Control, Common, Control
|
U+0082 Control: Control, Common, Control
|
||||||
0083 Control: Control, Common, Control
|
U+0083 Control: Control, Common, Control
|
||||||
0084 Control: Control, Common, Control
|
U+0084 Control: Control, Common, Control
|
||||||
0085 Control: Control, Common, Control
|
U+0085 Control: Control, Common, Control
|
||||||
0086 Control: Control, Common, Control
|
U+0086 Control: Control, Common, Control
|
||||||
0087 Control: Control, Common, Control
|
U+0087 Control: Control, Common, Control
|
||||||
0088 Control: Control, Common, Control
|
U+0088 Control: Control, Common, Control
|
||||||
0089 Control: Control, Common, Control
|
U+0089 Control: Control, Common, Control
|
||||||
008a Control: Control, Common, Control
|
U+008A Control: Control, Common, Control
|
||||||
008b Control: Control, Common, Control
|
U+008B Control: Control, Common, Control
|
||||||
008c Control: Control, Common, Control
|
U+008C Control: Control, Common, Control
|
||||||
008d Control: Control, Common, Control
|
U+008D Control: Control, Common, Control
|
||||||
008e Control: Control, Common, Control
|
U+008E Control: Control, Common, Control
|
||||||
008f Control: Control, Common, Control
|
U+008F Control: Control, Common, Control
|
||||||
findprop 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f
|
findprop 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f
|
||||||
0090 Control: Control, Common, Control
|
U+0090 Control: Control, Common, Control
|
||||||
0091 Control: Control, Common, Control
|
U+0091 Control: Control, Common, Control
|
||||||
0092 Control: Control, Common, Control
|
U+0092 Control: Control, Common, Control
|
||||||
0093 Control: Control, Common, Control
|
U+0093 Control: Control, Common, Control
|
||||||
0094 Control: Control, Common, Control
|
U+0094 Control: Control, Common, Control
|
||||||
0095 Control: Control, Common, Control
|
U+0095 Control: Control, Common, Control
|
||||||
0096 Control: Control, Common, Control
|
U+0096 Control: Control, Common, Control
|
||||||
0097 Control: Control, Common, Control
|
U+0097 Control: Control, Common, Control
|
||||||
0098 Control: Control, Common, Control
|
U+0098 Control: Control, Common, Control
|
||||||
0099 Control: Control, Common, Control
|
U+0099 Control: Control, Common, Control
|
||||||
009a Control: Control, Common, Control
|
U+009A Control: Control, Common, Control
|
||||||
009b Control: Control, Common, Control
|
U+009B Control: Control, Common, Control
|
||||||
009c Control: Control, Common, Control
|
U+009C Control: Control, Common, Control
|
||||||
009d Control: Control, Common, Control
|
U+009D Control: Control, Common, Control
|
||||||
009e Control: Control, Common, Control
|
U+009E Control: Control, Common, Control
|
||||||
009f Control: Control, Common, Control
|
U+009F Control: Control, Common, Control
|
||||||
findprop a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af
|
findprop a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af
|
||||||
00a0 Separator: Space separator, Common, Other
|
U+00A0 Separator: Space separator, Common, Other
|
||||||
00a1 Punctuation: Other punctuation, Common, Other
|
U+00A1 Punctuation: Other punctuation, Common, Other
|
||||||
00a2 Symbol: Currency symbol, Common, Other
|
U+00A2 Symbol: Currency symbol, Common, Other
|
||||||
00a3 Symbol: Currency symbol, Common, Other
|
U+00A3 Symbol: Currency symbol, Common, Other
|
||||||
00a4 Symbol: Currency symbol, Common, Other
|
U+00A4 Symbol: Currency symbol, Common, Other
|
||||||
00a5 Symbol: Currency symbol, Common, Other
|
U+00A5 Symbol: Currency symbol, Common, Other
|
||||||
00a6 Symbol: Other symbol, Common, Other
|
U+00A6 Symbol: Other symbol, Common, Other
|
||||||
00a7 Punctuation: Other punctuation, Common, Other
|
U+00A7 Punctuation: Other punctuation, Common, Other
|
||||||
00a8 Symbol: Modifier symbol, Common, Other
|
U+00A8 Symbol: Modifier symbol, Common, Other
|
||||||
00a9 Symbol: Other symbol, Common, Extended Pictographic
|
U+00A9 Symbol: Other symbol, Common, Extended Pictographic
|
||||||
00aa Letter: Other letter, Latin, Other
|
U+00AA Letter: Other letter, Latin, Other
|
||||||
00ab Punctuation: Initial punctuation, Common, Other
|
U+00AB Punctuation: Initial punctuation, Common, Other
|
||||||
00ac Symbol: Mathematical symbol, Common, Other
|
U+00AC Symbol: Mathematical symbol, Common, Other
|
||||||
00ad Control: Format, Common, Control
|
U+00AD Control: Format, Common, Control
|
||||||
00ae Symbol: Other symbol, Common, Extended Pictographic
|
U+00AE Symbol: Other symbol, Common, Extended Pictographic
|
||||||
00af Symbol: Modifier symbol, Common, Other
|
U+00AF Symbol: Modifier symbol, Common, Other
|
||||||
findprop b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf
|
findprop b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf
|
||||||
00b0 Symbol: Other symbol, Common, Other
|
U+00B0 Symbol: Other symbol, Common, Other
|
||||||
00b1 Symbol: Mathematical symbol, Common, Other
|
U+00B1 Symbol: Mathematical symbol, Common, Other
|
||||||
00b2 Number: Other number, Common, Other
|
U+00B2 Number: Other number, Common, Other
|
||||||
00b3 Number: Other number, Common, Other
|
U+00B3 Number: Other number, Common, Other
|
||||||
00b4 Symbol: Modifier symbol, Common, Other
|
U+00B4 Symbol: Modifier symbol, Common, Other
|
||||||
00b5 Letter: Lower case letter, Common, Other, 03bc, 039c
|
U+00B5 Letter: Lower case letter, Common, Other, U+03BC, U+039C
|
||||||
00b6 Punctuation: Other punctuation, Common, Other
|
U+00B6 Punctuation: Other punctuation, Common, Other
|
||||||
00b7 Punctuation: Other punctuation, Common, Other
|
U+00B7 Punctuation: Other punctuation, Common, Other
|
||||||
00b8 Symbol: Modifier symbol, Common, Other
|
U+00B8 Symbol: Modifier symbol, Common, Other
|
||||||
00b9 Number: Other number, Common, Other
|
U+00B9 Number: Other number, Common, Other
|
||||||
00ba Letter: Other letter, Latin, Other
|
U+00BA Letter: Other letter, Latin, Other
|
||||||
00bb Punctuation: Final punctuation, Common, Other
|
U+00BB Punctuation: Final punctuation, Common, Other
|
||||||
00bc Number: Other number, Common, Other
|
U+00BC Number: Other number, Common, Other
|
||||||
00bd Number: Other number, Common, Other
|
U+00BD Number: Other number, Common, Other
|
||||||
00be Number: Other number, Common, Other
|
U+00BE Number: Other number, Common, Other
|
||||||
00bf Punctuation: Other punctuation, Common, Other
|
U+00BF Punctuation: Other punctuation, Common, Other
|
||||||
findprop c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf
|
findprop c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf
|
||||||
00c0 Letter: Upper case letter, Latin, Other, 00e0
|
U+00C0 Letter: Upper case letter, Latin, Other, U+00E0
|
||||||
00c1 Letter: Upper case letter, Latin, Other, 00e1
|
U+00C1 Letter: Upper case letter, Latin, Other, U+00E1
|
||||||
00c2 Letter: Upper case letter, Latin, Other, 00e2
|
U+00C2 Letter: Upper case letter, Latin, Other, U+00E2
|
||||||
00c3 Letter: Upper case letter, Latin, Other, 00e3
|
U+00C3 Letter: Upper case letter, Latin, Other, U+00E3
|
||||||
00c4 Letter: Upper case letter, Latin, Other, 00e4
|
U+00C4 Letter: Upper case letter, Latin, Other, U+00E4
|
||||||
00c5 Letter: Upper case letter, Latin, Other, 00e5, 212b
|
U+00C5 Letter: Upper case letter, Latin, Other, U+00E5, U+212B
|
||||||
00c6 Letter: Upper case letter, Latin, Other, 00e6
|
U+00C6 Letter: Upper case letter, Latin, Other, U+00E6
|
||||||
00c7 Letter: Upper case letter, Latin, Other, 00e7
|
U+00C7 Letter: Upper case letter, Latin, Other, U+00E7
|
||||||
00c8 Letter: Upper case letter, Latin, Other, 00e8
|
U+00C8 Letter: Upper case letter, Latin, Other, U+00E8
|
||||||
00c9 Letter: Upper case letter, Latin, Other, 00e9
|
U+00C9 Letter: Upper case letter, Latin, Other, U+00E9
|
||||||
00ca Letter: Upper case letter, Latin, Other, 00ea
|
U+00CA Letter: Upper case letter, Latin, Other, U+00EA
|
||||||
00cb Letter: Upper case letter, Latin, Other, 00eb
|
U+00CB Letter: Upper case letter, Latin, Other, U+00EB
|
||||||
00cc Letter: Upper case letter, Latin, Other, 00ec
|
U+00CC Letter: Upper case letter, Latin, Other, U+00EC
|
||||||
00cd Letter: Upper case letter, Latin, Other, 00ed
|
U+00CD Letter: Upper case letter, Latin, Other, U+00ED
|
||||||
00ce Letter: Upper case letter, Latin, Other, 00ee
|
U+00CE Letter: Upper case letter, Latin, Other, U+00EE
|
||||||
00cf Letter: Upper case letter, Latin, Other, 00ef
|
U+00CF Letter: Upper case letter, Latin, Other, U+00EF
|
||||||
findprop d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df
|
findprop d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df
|
||||||
00d0 Letter: Upper case letter, Latin, Other, 00f0
|
U+00D0 Letter: Upper case letter, Latin, Other, U+00F0
|
||||||
00d1 Letter: Upper case letter, Latin, Other, 00f1
|
U+00D1 Letter: Upper case letter, Latin, Other, U+00F1
|
||||||
00d2 Letter: Upper case letter, Latin, Other, 00f2
|
U+00D2 Letter: Upper case letter, Latin, Other, U+00F2
|
||||||
00d3 Letter: Upper case letter, Latin, Other, 00f3
|
U+00D3 Letter: Upper case letter, Latin, Other, U+00F3
|
||||||
00d4 Letter: Upper case letter, Latin, Other, 00f4
|
U+00D4 Letter: Upper case letter, Latin, Other, U+00F4
|
||||||
00d5 Letter: Upper case letter, Latin, Other, 00f5
|
U+00D5 Letter: Upper case letter, Latin, Other, U+00F5
|
||||||
00d6 Letter: Upper case letter, Latin, Other, 00f6
|
U+00D6 Letter: Upper case letter, Latin, Other, U+00F6
|
||||||
00d7 Symbol: Mathematical symbol, Common, Other
|
U+00D7 Symbol: Mathematical symbol, Common, Other
|
||||||
00d8 Letter: Upper case letter, Latin, Other, 00f8
|
U+00D8 Letter: Upper case letter, Latin, Other, U+00F8
|
||||||
00d9 Letter: Upper case letter, Latin, Other, 00f9
|
U+00D9 Letter: Upper case letter, Latin, Other, U+00F9
|
||||||
00da Letter: Upper case letter, Latin, Other, 00fa
|
U+00DA Letter: Upper case letter, Latin, Other, U+00FA
|
||||||
00db Letter: Upper case letter, Latin, Other, 00fb
|
U+00DB Letter: Upper case letter, Latin, Other, U+00FB
|
||||||
00dc Letter: Upper case letter, Latin, Other, 00fc
|
U+00DC Letter: Upper case letter, Latin, Other, U+00FC
|
||||||
00dd Letter: Upper case letter, Latin, Other, 00fd
|
U+00DD Letter: Upper case letter, Latin, Other, U+00FD
|
||||||
00de Letter: Upper case letter, Latin, Other, 00fe
|
U+00DE Letter: Upper case letter, Latin, Other, U+00FE
|
||||||
00df Letter: Lower case letter, Latin, Other, 1e9e
|
U+00DF Letter: Lower case letter, Latin, Other, U+1E9E
|
||||||
findprop e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef
|
findprop e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef
|
||||||
00e0 Letter: Lower case letter, Latin, Other, 00c0
|
U+00E0 Letter: Lower case letter, Latin, Other, U+00C0
|
||||||
00e1 Letter: Lower case letter, Latin, Other, 00c1
|
U+00E1 Letter: Lower case letter, Latin, Other, U+00C1
|
||||||
00e2 Letter: Lower case letter, Latin, Other, 00c2
|
U+00E2 Letter: Lower case letter, Latin, Other, U+00C2
|
||||||
00e3 Letter: Lower case letter, Latin, Other, 00c3
|
U+00E3 Letter: Lower case letter, Latin, Other, U+00C3
|
||||||
00e4 Letter: Lower case letter, Latin, Other, 00c4
|
U+00E4 Letter: Lower case letter, Latin, Other, U+00C4
|
||||||
00e5 Letter: Lower case letter, Latin, Other, 00c5, 212b
|
U+00E5 Letter: Lower case letter, Latin, Other, U+00C5, U+212B
|
||||||
00e6 Letter: Lower case letter, Latin, Other, 00c6
|
U+00E6 Letter: Lower case letter, Latin, Other, U+00C6
|
||||||
00e7 Letter: Lower case letter, Latin, Other, 00c7
|
U+00E7 Letter: Lower case letter, Latin, Other, U+00C7
|
||||||
00e8 Letter: Lower case letter, Latin, Other, 00c8
|
U+00E8 Letter: Lower case letter, Latin, Other, U+00C8
|
||||||
00e9 Letter: Lower case letter, Latin, Other, 00c9
|
U+00E9 Letter: Lower case letter, Latin, Other, U+00C9
|
||||||
00ea Letter: Lower case letter, Latin, Other, 00ca
|
U+00EA Letter: Lower case letter, Latin, Other, U+00CA
|
||||||
00eb Letter: Lower case letter, Latin, Other, 00cb
|
U+00EB Letter: Lower case letter, Latin, Other, U+00CB
|
||||||
00ec Letter: Lower case letter, Latin, Other, 00cc
|
U+00EC Letter: Lower case letter, Latin, Other, U+00CC
|
||||||
00ed Letter: Lower case letter, Latin, Other, 00cd
|
U+00ED Letter: Lower case letter, Latin, Other, U+00CD
|
||||||
00ee Letter: Lower case letter, Latin, Other, 00ce
|
U+00EE Letter: Lower case letter, Latin, Other, U+00CE
|
||||||
00ef Letter: Lower case letter, Latin, Other, 00cf
|
U+00EF Letter: Lower case letter, Latin, Other, U+00CF
|
||||||
findprop f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff
|
findprop f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff
|
||||||
00f0 Letter: Lower case letter, Latin, Other, 00d0
|
U+00F0 Letter: Lower case letter, Latin, Other, U+00D0
|
||||||
00f1 Letter: Lower case letter, Latin, Other, 00d1
|
U+00F1 Letter: Lower case letter, Latin, Other, U+00D1
|
||||||
00f2 Letter: Lower case letter, Latin, Other, 00d2
|
U+00F2 Letter: Lower case letter, Latin, Other, U+00D2
|
||||||
00f3 Letter: Lower case letter, Latin, Other, 00d3
|
U+00F3 Letter: Lower case letter, Latin, Other, U+00D3
|
||||||
00f4 Letter: Lower case letter, Latin, Other, 00d4
|
U+00F4 Letter: Lower case letter, Latin, Other, U+00D4
|
||||||
00f5 Letter: Lower case letter, Latin, Other, 00d5
|
U+00F5 Letter: Lower case letter, Latin, Other, U+00D5
|
||||||
00f6 Letter: Lower case letter, Latin, Other, 00d6
|
U+00F6 Letter: Lower case letter, Latin, Other, U+00D6
|
||||||
00f7 Symbol: Mathematical symbol, Common, Other
|
U+00F7 Symbol: Mathematical symbol, Common, Other
|
||||||
00f8 Letter: Lower case letter, Latin, Other, 00d8
|
U+00F8 Letter: Lower case letter, Latin, Other, U+00D8
|
||||||
00f9 Letter: Lower case letter, Latin, Other, 00d9
|
U+00F9 Letter: Lower case letter, Latin, Other, U+00D9
|
||||||
00fa Letter: Lower case letter, Latin, Other, 00da
|
U+00FA Letter: Lower case letter, Latin, Other, U+00DA
|
||||||
00fb Letter: Lower case letter, Latin, Other, 00db
|
U+00FB Letter: Lower case letter, Latin, Other, U+00DB
|
||||||
00fc Letter: Lower case letter, Latin, Other, 00dc
|
U+00FC Letter: Lower case letter, Latin, Other, U+00DC
|
||||||
00fd Letter: Lower case letter, Latin, Other, 00dd
|
U+00FD Letter: Lower case letter, Latin, Other, U+00DD
|
||||||
00fe Letter: Lower case letter, Latin, Other, 00de
|
U+00FE Letter: Lower case letter, Latin, Other, U+00DE
|
||||||
00ff Letter: Lower case letter, Latin, Other, 0178
|
U+00FF Letter: Lower case letter, Latin, Other, U+0178
|
||||||
|
|
||||||
findprop 0100 0101 0102 0103 0104 0105 0106
|
findprop 0100 0101 0102 0103 0104 0105 0106
|
||||||
0100 Letter: Upper case letter, Latin, Other, 0101
|
U+0100 Letter: Upper case letter, Latin, Other, U+0101
|
||||||
0101 Letter: Lower case letter, Latin, Other, 0100
|
U+0101 Letter: Lower case letter, Latin, Other, U+0100
|
||||||
0102 Letter: Upper case letter, Latin, Other, 0103
|
U+0102 Letter: Upper case letter, Latin, Other, U+0103
|
||||||
0103 Letter: Lower case letter, Latin, Other, 0102
|
U+0103 Letter: Lower case letter, Latin, Other, U+0102
|
||||||
0104 Letter: Upper case letter, Latin, Other, 0105
|
U+0104 Letter: Upper case letter, Latin, Other, U+0105
|
||||||
0105 Letter: Lower case letter, Latin, Other, 0104
|
U+0105 Letter: Lower case letter, Latin, Other, U+0104
|
||||||
0106 Letter: Upper case letter, Latin, Other, 0107
|
U+0106 Letter: Upper case letter, Latin, Other, U+0107
|
||||||
|
|
||||||
findprop ffe0 ffe1 ffe2 ffe3 ffe4 ffe5 ffe6 ffe7
|
findprop ffe0 ffe1 ffe2 ffe3 ffe4 ffe5 ffe6 ffe7
|
||||||
ffe0 Symbol: Currency symbol, Common, Other
|
U+FFE0 Symbol: Currency symbol, Common, Other
|
||||||
ffe1 Symbol: Currency symbol, Common, Other
|
U+FFE1 Symbol: Currency symbol, Common, Other
|
||||||
ffe2 Symbol: Mathematical symbol, Common, Other
|
U+FFE2 Symbol: Mathematical symbol, Common, Other
|
||||||
ffe3 Symbol: Modifier symbol, Common, Other
|
U+FFE3 Symbol: Modifier symbol, Common, Other
|
||||||
ffe4 Symbol: Other symbol, Common, Other
|
U+FFE4 Symbol: Other symbol, Common, Other
|
||||||
ffe5 Symbol: Currency symbol, Common, Other
|
U+FFE5 Symbol: Currency symbol, Common, Other
|
||||||
ffe6 Symbol: Currency symbol, Common, Other
|
U+FFE6 Symbol: Currency symbol, Common, Other
|
||||||
ffe7 Control: Unassigned, Unknown, Other
|
U+FFE7 Control: Unassigned, Unknown, Other
|
||||||
findprop ffe8 ffe9 ffea ffeb ffec ffed ffee ffef
|
findprop ffe8 ffe9 ffea ffeb ffec ffed ffee ffef
|
||||||
ffe8 Symbol: Other symbol, Common, Other
|
U+FFE8 Symbol: Other symbol, Common, Other
|
||||||
ffe9 Symbol: Mathematical symbol, Common, Other
|
U+FFE9 Symbol: Mathematical symbol, Common, Other
|
||||||
ffea Symbol: Mathematical symbol, Common, Other
|
U+FFEA Symbol: Mathematical symbol, Common, Other
|
||||||
ffeb Symbol: Mathematical symbol, Common, Other
|
U+FFEB Symbol: Mathematical symbol, Common, Other
|
||||||
ffec Symbol: Mathematical symbol, Common, Other
|
U+FFEC Symbol: Mathematical symbol, Common, Other
|
||||||
ffed Symbol: Other symbol, Common, Other
|
U+FFED Symbol: Other symbol, Common, Other
|
||||||
ffee Symbol: Other symbol, Common, Other
|
U+FFEE Symbol: Other symbol, Common, Other
|
||||||
ffef Control: Unassigned, Unknown, Other
|
U+FFEF Control: Unassigned, Unknown, Other
|
||||||
findprop fff8 fff9 fffa fffb fffc fffd fffe ffff
|
findprop fff8 fff9 fffa fffb fffc fffd fffe ffff
|
||||||
fff8 Control: Unassigned, Unknown, Control
|
U+FFF8 Control: Unassigned, Unknown, Control
|
||||||
fff9 Control: Format, Common, Control
|
U+FFF9 Control: Format, Common, Control
|
||||||
fffa Control: Format, Common, Control
|
U+FFFA Control: Format, Common, Control
|
||||||
fffb Control: Format, Common, Control
|
U+FFFB Control: Format, Common, Control
|
||||||
fffc Symbol: Other symbol, Common, Other
|
U+FFFC Symbol: Other symbol, Common, Other
|
||||||
fffd Symbol: Other symbol, Common, Other
|
U+FFFD Symbol: Other symbol, Common, Other
|
||||||
fffe Control: Unassigned, Unknown, Other
|
U+FFFE Control: Unassigned, Unknown, Other
|
||||||
ffff Control: Unassigned, Unknown, Other
|
U+FFFF Control: Unassigned, Unknown, Other
|
||||||
findprop 10000 10001 e01ef f0000 100000
|
findprop 10000 10001 e01ef f0000 100000
|
||||||
10000 Letter: Other letter, Linear_B, Other
|
U+10000 Letter: Other letter, Linear_B, Other
|
||||||
10001 Letter: Other letter, Linear_B, Other
|
U+10001 Letter: Other letter, Linear_B, Other
|
||||||
e01ef Mark: Non-spacing mark, Inherited, Extend
|
U+E01EF Mark: Non-spacing mark, Inherited, Extend
|
||||||
f0000 Control: Private use, Unknown, Other
|
U+F0000 Control: Private use, Unknown, Other
|
||||||
100000 Control: Private use, Unknown, Other
|
U+100000 Control: Private use, Unknown, Other
|
||||||
|
|
||||||
findprop 1b00 12000 7c0 a840 10900
|
findprop 1b00 12000 7c0 a840 10900
|
||||||
1b00 Mark: Non-spacing mark, Balinese, Extend
|
U+1B00 Mark: Non-spacing mark, Balinese, Extend
|
||||||
12000 Letter: Other letter, Cuneiform, Other
|
U+12000 Letter: Other letter, Cuneiform, Other
|
||||||
07c0 Number: Decimal number, Nko, Other
|
U+07C0 Number: Decimal number, Nko, Other
|
||||||
a840 Letter: Other letter, Phags_Pa, Other
|
U+A840 Letter: Other letter, Phags_Pa, Other
|
||||||
10900 Letter: Other letter, Phoenician, Other
|
U+10900 Letter: Other letter, Phoenician, Other
|
||||||
findprop 1d79 a77d
|
findprop 1d79 a77d
|
||||||
1d79 Letter: Lower case letter, Latin, Other, a77d
|
U+1D79 Letter: Lower case letter, Latin, Other, U+A77D
|
||||||
a77d Letter: Upper case letter, Latin, Other, 1d79
|
U+A77D Letter: Upper case letter, Latin, Other, U+1D79
|
||||||
|
|
||||||
findprop 0800 083e a4d0 a4f7 aa80 aadf
|
findprop 0800 083e a4d0 a4f7 aa80 aadf
|
||||||
0800 Letter: Other letter, Samaritan, Other
|
U+0800 Letter: Other letter, Samaritan, Other
|
||||||
083e Punctuation: Other punctuation, Samaritan, Other
|
U+083E Punctuation: Other punctuation, Samaritan, Other
|
||||||
a4d0 Letter: Other letter, Lisu, Other
|
U+A4D0 Letter: Other letter, Lisu, Other
|
||||||
a4f7 Letter: Other letter, Lisu, Other
|
U+A4F7 Letter: Other letter, Lisu, Other
|
||||||
aa80 Letter: Other letter, Tai_Viet, Other
|
U+AA80 Letter: Other letter, Tai_Viet, Other
|
||||||
aadf Punctuation: Other punctuation, Tai_Viet, Other
|
U+AADF Punctuation: Other punctuation, Tai_Viet, Other
|
||||||
findprop 10b00 10b35 13000 1342e 10840 10855
|
findprop 10b00 10b35 13000 1342e 10840 10855
|
||||||
10b00 Letter: Other letter, Avestan, Other
|
U+10B00 Letter: Other letter, Avestan, Other
|
||||||
10b35 Letter: Other letter, Avestan, Other
|
U+10B35 Letter: Other letter, Avestan, Other
|
||||||
13000 Letter: Other letter, Egyptian_Hieroglyphs, Other
|
U+13000 Letter: Other letter, Egyptian_Hieroglyphs, Other
|
||||||
1342e Letter: Other letter, Egyptian_Hieroglyphs, Other
|
U+1342E Letter: Other letter, Egyptian_Hieroglyphs, Other
|
||||||
10840 Letter: Other letter, Imperial_Aramaic, Other
|
U+10840 Letter: Other letter, Imperial_Aramaic, Other
|
||||||
10855 Letter: Other letter, Imperial_Aramaic, Other
|
U+10855 Letter: Other letter, Imperial_Aramaic, Other
|
||||||
|
|
||||||
findprop 11100 1113c 11680 116c0
|
findprop 11100 1113c 11680 116c0
|
||||||
11100 Mark: Non-spacing mark, Chakma, Extend
|
U+11100 Mark: Non-spacing mark, Chakma, Extend
|
||||||
1113c Number: Decimal number, Chakma, Other
|
U+1113C Number: Decimal number, Chakma, Other
|
||||||
11680 Letter: Other letter, Takri, Other
|
U+11680 Letter: Other letter, Takri, Other
|
||||||
116c0 Number: Decimal number, Takri, Other
|
U+116C0 Number: Decimal number, Takri, Other
|
||||||
|
|
||||||
findprop 0d 0a 0e 0711 1b04 1111 1169 11fe ae4c ad89
|
findprop 0d 0a 0e 0711 1b04 1111 1169 11fe ae4c ad89
|
||||||
000d Control: Control, Common, CR
|
U+000D Control: Control, Common, CR
|
||||||
000a Control: Control, Common, LF
|
U+000A Control: Control, Common, LF
|
||||||
000e Control: Control, Common, Control
|
U+000E Control: Control, Common, Control
|
||||||
0711 Mark: Non-spacing mark, Syriac, Extend
|
U+0711 Mark: Non-spacing mark, Syriac, Extend
|
||||||
1b04 Mark: Spacing mark, Balinese, SpacingMark
|
U+1B04 Mark: Spacing mark, Balinese, SpacingMark
|
||||||
1111 Letter: Other letter, Hangul, Hangul syllable type L
|
U+1111 Letter: Other letter, Hangul, Hangul syllable type L
|
||||||
1169 Letter: Other letter, Hangul, Hangul syllable type V
|
U+1169 Letter: Other letter, Hangul, Hangul syllable type V
|
||||||
11fe Letter: Other letter, Hangul, Hangul syllable type T
|
U+11FE Letter: Other letter, Hangul, Hangul syllable type T
|
||||||
ae4c Letter: Other letter, Hangul, Hangul syllable type LV
|
U+AE4C Letter: Other letter, Hangul, Hangul syllable type LV
|
||||||
ad89 Letter: Other letter, Hangul, Hangul syllable type LVT
|
U+AD89 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
|
||||||
findprop 118a0 11ac7 16ad0
|
findprop 118a0 11ac7 16ad0
|
||||||
118a0 Letter: Upper case letter, Warang_Citi, Other, 118c0
|
U+118A0 Letter: Upper case letter, Warang_Citi, Other, U+118C0
|
||||||
11ac7 Letter: Other letter, Pau_Cin_Hau, Other
|
U+11AC7 Letter: Other letter, Pau_Cin_Hau, Other
|
||||||
16ad0 Letter: Other letter, Bassa_Vah, Other
|
U+16AD0 Letter: Other letter, Bassa_Vah, Other
|
||||||
|
|
||||||
findprop 11700 14400 108e0 11280 1d800
|
findprop 11700 14400 108e0 11280 1d800
|
||||||
11700 Letter: Other letter, Ahom, Other
|
U+11700 Letter: Other letter, Ahom, Other
|
||||||
14400 Letter: Other letter, Anatolian_Hieroglyphs, Other
|
U+14400 Letter: Other letter, Anatolian_Hieroglyphs, Other
|
||||||
108e0 Letter: Other letter, Hatran, Other
|
U+108E0 Letter: Other letter, Hatran, Other
|
||||||
11280 Letter: Other letter, Multani, Other
|
U+11280 Letter: Other letter, Multani, Other
|
||||||
1d800 Symbol: Other symbol, SignWriting, Other
|
U+1D800 Symbol: Other symbol, SignWriting, Other
|
||||||
|
|
||||||
findprop 11800 1e903 11da9 10d27 11ee0 16e48 10f27 10f30
|
findprop 11800 1e903 11da9 10d27 11ee0 16e48 10f27 10f30
|
||||||
11800 Letter: Other letter, Dogra, Other
|
U+11800 Letter: Other letter, Dogra, Other
|
||||||
1e903 Letter: Upper case letter, Adlam, Other, 1e925
|
U+1E903 Letter: Upper case letter, Adlam, Other, U+1E925
|
||||||
11da9 Number: Decimal number, Gunjala_Gondi, Other
|
U+11DA9 Number: Decimal number, Gunjala_Gondi, Other
|
||||||
10d27 Mark: Non-spacing mark, Hanifi_Rohingya, Extend
|
U+10D27 Mark: Non-spacing mark, Hanifi_Rohingya, Extend
|
||||||
11ee0 Letter: Other letter, Makasar, Other
|
U+11EE0 Letter: Other letter, Makasar, Other
|
||||||
16e48 Letter: Upper case letter, Medefaidrin, Other, 16e68
|
U+16E48 Letter: Upper case letter, Medefaidrin, Other, U+16E68
|
||||||
10f27 Letter: Other letter, Old_Sogdian, Other
|
U+10F27 Letter: Other letter, Old_Sogdian, Other
|
||||||
10f30 Letter: Other letter, Sogdian, Other
|
U+10F30 Letter: Other letter, Sogdian, Other
|
||||||
|
|
||||||
findprop a836 a833 1cf4 20f0 1cd0
|
findprop a836 a833 1cf4 20f0 1cd0
|
||||||
a836 Symbol: Other symbol, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kaithi, Mahajani, Modi, Khudawadi, Takri, Tirhuta]
|
U+A836 Symbol: Other symbol, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kaithi, Mahajani, Modi, Khudawadi, Takri, Tirhuta]
|
||||||
a833 Number: Other number, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kannada, Kaithi, Mahajani, Modi, Nandinagari, Khudawadi, Takri, Tirhuta]
|
U+A833 Number: Other number, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kannada, Kaithi, Mahajani, Modi, Nandinagari, Khudawadi, Takri, Tirhuta]
|
||||||
1cf4 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Kannada]
|
U+1CF4 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Kannada]
|
||||||
20f0 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Latin]
|
U+20F0 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Latin]
|
||||||
1cd0 Mark: Non-spacing mark, Inherited, Extend, [Bengali, Devanagari, Grantha, Kannada]
|
U+1CD0 Mark: Non-spacing mark, Inherited, Extend, [Bengali, Devanagari, Grantha, Kannada]
|
||||||
|
|
||||||
findprop 32ff
|
findprop 32ff
|
||||||
32ff Symbol: Other symbol, Common, Other, [Han]
|
U+32FF Symbol: Other symbol, Common, Other, [Han]
|
||||||
|
|
||||||
findprop 1f16d
|
findprop 1f16d
|
||||||
1f16d Symbol: Other symbol, Common, Extended Pictographic
|
U+1F16D Symbol: Other symbol, Common, Extended Pictographic
|
||||||
|
|
||||||
findprop 10e93 10eaa
|
findprop U+10e93 U+10eaa
|
||||||
10e93 Letter: Other letter, Yezidi, Other
|
U+10E93 Letter: Other letter, Yezidi, Other
|
||||||
10eaa Control: Unassigned, Unknown, Other
|
U+10EAA Control: Unassigned, Unknown, Other
|
||||||
|
|
|
@ -0,0 +1,188 @@
|
||||||
|
find script Han
|
||||||
|
U+2E80..U+2E99 Symbol: Other symbol, Han, Other
|
||||||
|
U+2E9B..U+2EF3 Symbol: Other symbol, Han, Other
|
||||||
|
U+2F00..U+2FD5 Symbol: Other symbol, Han, Other
|
||||||
|
U+3005 Letter: Modifier letter, Han, Other
|
||||||
|
U+3007 Number: Letter number, Han, Other
|
||||||
|
U+3021..U+3029 Number: Letter number, Han, Other
|
||||||
|
U+3038..U+303A Number: Letter number, Han, Other
|
||||||
|
U+303B Letter: Modifier letter, Han, Other
|
||||||
|
U+3400..U+4DBF Letter: Other letter, Han, Other
|
||||||
|
U+4E00..U+9FFC Letter: Other letter, Han, Other
|
||||||
|
U+F900..U+FA6D Letter: Other letter, Han, Other
|
||||||
|
U+FA70..U+FAD9 Letter: Other letter, Han, Other
|
||||||
|
U+16FF0..U+16FF1 Mark: Spacing mark, Han, SpacingMark
|
||||||
|
U+20000..U+2A6DD Letter: Other letter, Han, Other
|
||||||
|
U+2A700..U+2B734 Letter: Other letter, Han, Other
|
||||||
|
U+2B740..U+2B81D Letter: Other letter, Han, Other
|
||||||
|
U+2B820..U+2CEA1 Letter: Other letter, Han, Other
|
||||||
|
U+2CEB0..U+2EBE0 Letter: Other letter, Han, Other
|
||||||
|
U+2F800..U+2FA1D Letter: Other letter, Han, Other
|
||||||
|
U+30000..U+3134A Letter: Other letter, Han, Other
|
||||||
|
find type Pe script Common scriptx Hangul
|
||||||
|
U+3009 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||||
|
U+300B Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||||
|
U+300D Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||||
|
U+300F Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||||
|
U+3011 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||||
|
U+3015 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||||
|
U+3017 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||||
|
U+3019 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||||
|
U+301B Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||||
|
U+301E..U+301F Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana]
|
||||||
|
U+FF63 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||||
|
find type Sk
|
||||||
|
U+005E Symbol: Modifier symbol, Common, Other
|
||||||
|
U+0060 Symbol: Modifier symbol, Common, Other
|
||||||
|
U+00A8 Symbol: Modifier symbol, Common, Other
|
||||||
|
U+00AF Symbol: Modifier symbol, Common, Other
|
||||||
|
U+00B4 Symbol: Modifier symbol, Common, Other
|
||||||
|
U+00B8 Symbol: Modifier symbol, Common, Other
|
||||||
|
U+02C2..U+02C5 Symbol: Modifier symbol, Common, Other
|
||||||
|
U+02D2..U+02DF Symbol: Modifier symbol, Common, Other
|
||||||
|
U+02E5..U+02E9 Symbol: Modifier symbol, Common, Other
|
||||||
|
U+02EA..U+02EB Symbol: Modifier symbol, Bopomofo, Other
|
||||||
|
U+02ED Symbol: Modifier symbol, Common, Other
|
||||||
|
U+02EF..U+02FF Symbol: Modifier symbol, Common, Other
|
||||||
|
U+0375 Symbol: Modifier symbol, Greek, Other
|
||||||
|
U+0384 Symbol: Modifier symbol, Greek, Other
|
||||||
|
U+0385 Symbol: Modifier symbol, Common, Other
|
||||||
|
U+1FBD Symbol: Modifier symbol, Greek, Other
|
||||||
|
U+1FBF..U+1FC1 Symbol: Modifier symbol, Greek, Other
|
||||||
|
U+1FCD..U+1FCF Symbol: Modifier symbol, Greek, Other
|
||||||
|
U+1FDD..U+1FDF Symbol: Modifier symbol, Greek, Other
|
||||||
|
U+1FED..U+1FEF Symbol: Modifier symbol, Greek, Other
|
||||||
|
U+1FFD..U+1FFE Symbol: Modifier symbol, Greek, Other
|
||||||
|
U+309B..U+309C Symbol: Modifier symbol, Common, Other, [Hiragana, Katakana]
|
||||||
|
U+A700..U+A707 Symbol: Modifier symbol, Common, Other, [Han, Latin]
|
||||||
|
U+A708..U+A716 Symbol: Modifier symbol, Common, Other
|
||||||
|
U+A720..U+A721 Symbol: Modifier symbol, Common, Other
|
||||||
|
U+A789..U+A78A Symbol: Modifier symbol, Common, Other
|
||||||
|
U+AB5B Symbol: Modifier symbol, Common, Other
|
||||||
|
U+AB6A..U+AB6B Symbol: Modifier symbol, Common, Other
|
||||||
|
U+FBB2..U+FBC1 Symbol: Modifier symbol, Arabic, Other
|
||||||
|
U+FF3E Symbol: Modifier symbol, Common, Other
|
||||||
|
U+FF40 Symbol: Modifier symbol, Common, Other
|
||||||
|
U+FFE3 Symbol: Modifier symbol, Common, Other
|
||||||
|
U+1F3FB..U+1F3FF Symbol: Modifier symbol, Common, Extend
|
||||||
|
find type Pd
|
||||||
|
U+002D Punctuation: Dash punctuation, Common, Other
|
||||||
|
U+058A Punctuation: Dash punctuation, Armenian, Other
|
||||||
|
U+05BE Punctuation: Dash punctuation, Hebrew, Other
|
||||||
|
U+1400 Punctuation: Dash punctuation, Canadian_Aboriginal, Other
|
||||||
|
U+1806 Punctuation: Dash punctuation, Mongolian, Other
|
||||||
|
U+2010..U+2015 Punctuation: Dash punctuation, Common, Other
|
||||||
|
U+2E17 Punctuation: Dash punctuation, Common, Other
|
||||||
|
U+2E1A Punctuation: Dash punctuation, Common, Other
|
||||||
|
U+2E3A..U+2E3B Punctuation: Dash punctuation, Common, Other
|
||||||
|
U+2E40 Punctuation: Dash punctuation, Common, Other
|
||||||
|
U+301C Punctuation: Dash punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana]
|
||||||
|
U+3030 Punctuation: Dash punctuation, Common, Extended Pictographic, [Bopomofo, Hangul, Han, Hiragana, Katakana]
|
||||||
|
U+30A0 Punctuation: Dash punctuation, Common, Other, [Hiragana, Katakana]
|
||||||
|
U+FE31..U+FE32 Punctuation: Dash punctuation, Common, Other
|
||||||
|
U+FE58 Punctuation: Dash punctuation, Common, Other
|
||||||
|
U+FE63 Punctuation: Dash punctuation, Common, Other
|
||||||
|
U+FF0D Punctuation: Dash punctuation, Common, Other
|
||||||
|
U+10EAD Punctuation: Dash punctuation, Yezidi, Other
|
||||||
|
find gbreak LVT
|
||||||
|
U+AC01..U+AC1B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AC1D..U+AC37 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AC39..U+AC53 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AC55..U+AC6F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AC71..U+AC8B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AC8D..U+ACA7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+ACA9..U+ACC3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+ACC5..U+ACDF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+ACE1..U+ACFB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+ACFD..U+AD17 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AD19..U+AD33 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AD35..U+AD4F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AD51..U+AD6B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AD6D..U+AD87 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AD89..U+ADA3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+ADA5..U+ADBF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+ADC1..U+ADDB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+ADDD..U+ADF7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+ADF9..U+AE13 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AE15..U+AE2F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AE31..U+AE4B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AE4D..U+AE67 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AE69..U+AE83 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AE85..U+AE9F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AEA1..U+AEBB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AEBD..U+AED7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AED9..U+AEF3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AEF5..U+AF0F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AF11..U+AF2B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AF2D..U+AF47 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AF49..U+AF63 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AF65..U+AF7F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AF81..U+AF9B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AF9D..U+AFB7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AFB9..U+AFD3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AFD5..U+AFEF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+AFF1..U+B00B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B00D..U+B027 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B029..U+B043 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B045..U+B05F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B061..U+B07B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B07D..U+B097 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B099..U+B0B3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B0B5..U+B0CF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B0D1..U+B0EB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B0ED..U+B107 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B109..U+B123 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B125..U+B13F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B141..U+B15B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B15D..U+B177 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B179..U+B193 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B195..U+B1AF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B1B1..U+B1CB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B1CD..U+B1E7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B1E9..U+B203 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B205..U+B21F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B221..U+B23B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B23D..U+B257 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B259..U+B273 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B275..U+B28F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B291..U+B2AB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B2AD..U+B2C7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B2C9..U+B2E3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B2E5..U+B2FF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B301..U+B31B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B31D..U+B337 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B339..U+B353 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B355..U+B36F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B371..U+B38B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B38D..U+B3A7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B3A9..U+B3C3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B3C5..U+B3DF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B3E1..U+B3FB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B3FD..U+B417 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B419..U+B433 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B435..U+B44F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B451..U+B46B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B46D..U+B487 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B489..U+B4A3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B4A5..U+B4BF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B4C1..U+B4DB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B4DD..U+B4F7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B4F9..U+B513 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B515..U+B52F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B531..U+B54B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B54D..U+B567 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B569..U+B583 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B585..U+B59F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B5A1..U+B5BB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B5BD..U+B5D7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B5D9..U+B5F3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B5F5..U+B60F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B611..U+B62B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B62D..U+B647 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B649..U+B663 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B665..U+B67F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B681..U+B69B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B69D..U+B6B7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B6B9..U+B6D3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
U+B6D5..U+B6EF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||||
|
...
|
227
maint/utf8.c
227
maint/utf8.c
|
@ -1,20 +1,28 @@
|
||||||
/* A test program for converting characters to UTF-8 and vice versa. Note that
|
/****************************************************
|
||||||
this program conforms to the original definition of UTF-8, which allows
|
* PCRE maintainers' helper program: UTF-8 converter *
|
||||||
codepoints up to 7fffffff. The more recent definition limits the validity of
|
****************************************************/
|
||||||
UTF-8 codepoints to a maximum of 10ffffff.
|
|
||||||
|
|
||||||
The arguments are either single codepoint values, written as 0xhhhh, for
|
/* This is a test program for converting character code points to UTF-8 and
|
||||||
conversion to UTF-8, or sequences of hex values, written without 0x and
|
vice versa. Note that this program conforms to the original definition of
|
||||||
|
UTF-8, which allows codepoints up to 7fffffff. The more recent definition
|
||||||
|
limits the validity of Unicode UTF-8 codepoints to a maximum of 10ffff, and
|
||||||
|
forbids the "surrogate" code points. This program now gives warnings for these
|
||||||
|
invalid code points.
|
||||||
|
|
||||||
|
The arguments are either single code point values written as U+hh.. or 0xhh..
|
||||||
|
for conversion to UTF-8, or sequences of hex values, written without 0x and
|
||||||
optionally including spaces (but such arguments must be quoted), for conversion
|
optionally including spaces (but such arguments must be quoted), for conversion
|
||||||
from UTF-8 to codepoints. For example:
|
from UTF-8 to codepoints. For example:
|
||||||
|
|
||||||
./utf8 0x1234
|
./utf8 0x1234
|
||||||
0x00001234 => e1 88 b4
|
U+00001234 => e1 88 b4
|
||||||
|
|
||||||
./utf8 "e1 88 b4"
|
./utf8 "e1 88 b4"
|
||||||
0x00001234 <= e1 88 b4
|
U+00001234 <= e1 88 b4
|
||||||
|
|
||||||
In the second case, a number of characters can be present in one argument:
|
In the second case, a number of UTF-8 characters can be present in one
|
||||||
|
argument. In other words, each such argument is interpreted (after ignoring
|
||||||
|
spaces) as a string of UTF-8 bytes representing a string of characters:
|
||||||
|
|
||||||
./utf8 "65 e188b4 77"
|
./utf8 "65 e188b4 77"
|
||||||
0x00000065 <= 65
|
0x00000065 <= 65
|
||||||
|
@ -23,7 +31,16 @@ In the second case, a number of characters can be present in one argument:
|
||||||
|
|
||||||
If the option -s is given, the sequence of UTF-8 bytes is written out between
|
If the option -s is given, the sequence of UTF-8 bytes is written out between
|
||||||
angle brackets at the end of the line. On a UTF-8 terminal, this will show the
|
angle brackets at the end of the line. On a UTF-8 terminal, this will show the
|
||||||
appropriate graphic for the codepoint. */
|
appropriate graphic for the code point.
|
||||||
|
|
||||||
|
Errors provoke error messages, but the program carries on with the next
|
||||||
|
argument. The return code is always zero.
|
||||||
|
|
||||||
|
Philip Hazel
|
||||||
|
Original creation date: unknown
|
||||||
|
Code extended and tidied to avoid compiler warnings: 26 March 2020
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
@ -41,7 +58,7 @@ appropriate graphic for the codepoint. */
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
static const int utf8_table1[] = {
|
static const unsigned int utf8_table1[] = {
|
||||||
0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
|
0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
|
||||||
|
|
||||||
static const int utf8_table2[] = {
|
static const int utf8_table2[] = {
|
||||||
|
@ -50,38 +67,29 @@ static const int utf8_table2[] = {
|
||||||
static const int utf8_table3[] = {
|
static const int utf8_table3[] = {
|
||||||
0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
|
0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
|
||||||
|
|
||||||
static const unsigned char utf8_table4[] = {
|
|
||||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
||||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
||||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
|
||||||
3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,6 };
|
|
||||||
|
|
||||||
|
|
||||||
/*************************************************
|
/*************************************************
|
||||||
* Convert character value to UTF-8 *
|
* Convert character value to UTF-8 *
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
|
||||||
/* This function takes an integer value in the range 0 - 0x7fffffff
|
/* This function takes an unsigned long integer value in the range 0 -
|
||||||
and encodes it as a UTF-8 character in 1 to 6 bytes.
|
0x7fffffff and encodes it as a UTF-8 character in 1 to 6 bytes.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
cvalue the character value
|
cvalue the character value
|
||||||
buffer pointer to buffer for result - at least 6 bytes long
|
buffer pointer to buffer for result - at least 6 bytes long
|
||||||
|
|
||||||
Returns: number of characters placed in the buffer
|
Returns: number of bytes placed in the buffer
|
||||||
-1 if input character is negative
|
0 if input code point is too big
|
||||||
0 if input character is positive but too big (only when
|
|
||||||
int is longer than 32 bits)
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
int
|
static size_t
|
||||||
ord2utf8(int cvalue, unsigned char *buffer)
|
ord2utf8(unsigned long int cvalue, unsigned char *buffer)
|
||||||
{
|
{
|
||||||
register int i, j;
|
size_t i, j;
|
||||||
for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
|
for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
|
||||||
if (cvalue <= utf8_table1[i]) break;
|
if (cvalue <= utf8_table1[i]) break;
|
||||||
if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
|
if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
|
||||||
if (cvalue < 0) return -1;
|
|
||||||
buffer += i;
|
buffer += i;
|
||||||
for (j = i; j > 0; j--)
|
for (j = i; j > 0; j--)
|
||||||
{
|
{
|
||||||
|
@ -98,32 +106,59 @@ return i + 1;
|
||||||
* Convert UTF-8 string to value *
|
* Convert UTF-8 string to value *
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
|
||||||
/* This function takes one or more bytes that represents a UTF-8 character,
|
/* This function takes one or more bytes that represent a UTF-8 character from
|
||||||
and returns the value of the character.
|
the start of a string of bytes. It returns the value of the character, or the
|
||||||
|
offset of a malformation. For an overlong encoding that works but is not the
|
||||||
|
correct (shortest) one, the error offset is just after the last byte.
|
||||||
|
|
||||||
Argument:
|
Argument:
|
||||||
buffer a pointer to the byte vector
|
buffer a pointer to the byte vector
|
||||||
vptr a pointer to an int to receive the value
|
buffend a pointer to the end of the buffer
|
||||||
|
vptr a pointer to a variable to receive the value
|
||||||
|
lenptr a pointer to a variable to receive the offset when error detected
|
||||||
|
|
||||||
Returns: > 0 => the number of bytes consumed
|
Returns: > 0 => the number of bytes consumed
|
||||||
-6 to 0 => malformed UTF-8 character at offset = (-return)
|
0 => invalid UTF-8: first byte missing 0x40 bit
|
||||||
|
-1 => invalid UTF-8: first byte has too many high-order 1-bits
|
||||||
|
-2 => incomplete sequence at end of string
|
||||||
|
-3 => incomplete sequence within string
|
||||||
|
-4 => overlong code sequence
|
||||||
*/
|
*/
|
||||||
|
|
||||||
int
|
static int
|
||||||
utf82ord(unsigned char *buffer, int *vptr)
|
utf82ord(unsigned char *buffer, unsigned char *buffend,
|
||||||
|
long unsigned int *vptr, int *lenptr)
|
||||||
{
|
{
|
||||||
int c = *buffer++;
|
unsigned int c = *buffer++;
|
||||||
int d = c;
|
unsigned int d = c;
|
||||||
int i, j, s;
|
int i, j, s;
|
||||||
|
|
||||||
for (i = -1; i < 6; i++) /* i is number of additional bytes */
|
/* Check for an ASCII character, or find the number of additional bytes in a
|
||||||
|
multibyte character. */
|
||||||
|
|
||||||
|
for (i = -1; i < 6; i++)
|
||||||
{
|
{
|
||||||
if ((d & 0x80) == 0) break;
|
if ((d & 0x80) == 0) break;
|
||||||
d <<= 1;
|
d <<= 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i == -1) { *vptr = c; return 1; } /* ascii character */
|
switch (i)
|
||||||
if (i == 0 || i == 6) return 0; /* invalid UTF-8 */
|
{
|
||||||
|
case -1: /* ASCII character; first byte does not have 0x80 bit */
|
||||||
|
*vptr = c;
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
case 0: /* First byte has 0x80 but is missing 0x40 bit */
|
||||||
|
*lenptr = 0;
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
case 6:
|
||||||
|
*lenptr = 0; /* Too many high bits */
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
/* i now has a value in the range 1-5 */
|
/* i now has a value in the range 1-5 */
|
||||||
|
|
||||||
|
@ -132,32 +167,46 @@ d = (c & utf8_table3[i]) << s;
|
||||||
|
|
||||||
for (j = 0; j < i; j++)
|
for (j = 0; j < i; j++)
|
||||||
{
|
{
|
||||||
|
if (buffer >= buffend)
|
||||||
|
{
|
||||||
|
*lenptr = j + 1;
|
||||||
|
return -2;
|
||||||
|
}
|
||||||
c = *buffer++;
|
c = *buffer++;
|
||||||
if ((c & 0xc0) != 0x80) return -(j+1);
|
if ((c & 0xc0) != 0x80)
|
||||||
|
{
|
||||||
|
*lenptr = j + 1;
|
||||||
|
return -3;
|
||||||
|
}
|
||||||
s -= 6;
|
s -= 6;
|
||||||
d |= (c & 0x3f) << s;
|
d |= (c & 0x3f) << s;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Check that encoding was the correct unique one */
|
/* Valid UTF-8 syntax */
|
||||||
|
|
||||||
for (j = 0; j < sizeof(utf8_table1)/sizeof(int); j++)
|
*vptr = d;
|
||||||
|
|
||||||
|
/* Check that encoding was the correct one, not overlong */
|
||||||
|
|
||||||
|
for (j = 0; j < (int)(sizeof(utf8_table1)/sizeof(int)); j++)
|
||||||
if (d <= utf8_table1[j]) break;
|
if (d <= utf8_table1[j]) break;
|
||||||
if (j != i) return -(i+1);
|
if (j != i)
|
||||||
|
{
|
||||||
|
*lenptr = i + 1;
|
||||||
|
return -4;
|
||||||
|
}
|
||||||
|
|
||||||
/* Valid value */
|
/* Valid value */
|
||||||
|
|
||||||
*vptr = d;
|
return i + 1;
|
||||||
return i+1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*************************************************
|
/*************************************************
|
||||||
* Main Program *
|
* Main Program *
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
|
||||||
|
|
||||||
int
|
int
|
||||||
main(int argc, char **argv)
|
main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
|
@ -174,31 +223,43 @@ if (argc > 1 && strcmp(argv[1], "-s") == 0)
|
||||||
for (; i < argc; i++)
|
for (; i < argc; i++)
|
||||||
{
|
{
|
||||||
char *x = argv[i];
|
char *x = argv[i];
|
||||||
if (strncmp(x, "0x", 2) == 0)
|
char *endptr;
|
||||||
|
if (strncmp(x, "0x", 2) == 0 || strncmp(x, "U+", 2) == 0)
|
||||||
{
|
{
|
||||||
int j;
|
size_t rc, j;
|
||||||
int d = strtol(x+2, NULL, 16);
|
unsigned long int d = strtoul(x+2, &endptr, 16);
|
||||||
int rc = ord2utf8(d, buffer);
|
if (*endptr != 0)
|
||||||
printf("0x%08x => ", d);
|
{
|
||||||
if (rc <= 0) printf("*** Error %d ***", rc); else
|
printf("** Invalid hex number %s\n", x);
|
||||||
|
continue; /* With next argument */
|
||||||
|
}
|
||||||
|
rc = ord2utf8(d, buffer);
|
||||||
|
printf("U+%08lx => ", d);
|
||||||
|
if (rc == 0)
|
||||||
|
printf("** Code point greater than 0x7fffffff cannot be encoded");
|
||||||
|
else
|
||||||
{
|
{
|
||||||
for (j = 0; j < rc; j++) printf("%02x ", buffer[j]);
|
for (j = 0; j < rc; j++) printf("%02x ", buffer[j]);
|
||||||
if (show)
|
if (show)
|
||||||
{
|
{
|
||||||
printf(">");
|
printf(">");
|
||||||
for (j = 0; j < rc; j++) printf("%c", buffer[j]);
|
for (j = 0; j < rc; j++) printf("%c", buffer[j]);
|
||||||
printf("<");
|
printf("< ");
|
||||||
}
|
}
|
||||||
|
if (d >= 0xd800 && d <= 0xdfff)
|
||||||
|
printf("** Invalid Unicode (surrogate)");
|
||||||
|
else if (d > 0x10ffff)
|
||||||
|
printf("** Invalid Unicode (greater than U+10ffff)");
|
||||||
}
|
}
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
int d, rc;
|
unsigned char *bptr;
|
||||||
int j = 0;
|
unsigned char *buffend;
|
||||||
|
int len = 0;
|
||||||
int y = 0;
|
int y = 0;
|
||||||
int z = 0;
|
int z = 0;
|
||||||
unsigned char *bptr;
|
|
||||||
|
|
||||||
for (;;)
|
for (;;)
|
||||||
{
|
{
|
||||||
|
@ -206,28 +267,35 @@ for (; i < argc; i++)
|
||||||
if (*x == 0 && !z) break;
|
if (*x == 0 && !z) break;
|
||||||
if (!isxdigit(*x))
|
if (!isxdigit(*x))
|
||||||
{
|
{
|
||||||
printf("Malformed hex string: %s\n", argv[i]);
|
printf("** Malformed hex string: %s\n", argv[i]);
|
||||||
j = -1;
|
len = -1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
y = y * 16 + tolower(*x) - ((isdigit(*x))? '0' : 'W');
|
y = y * 16 + tolower(*x) - ((isdigit(*x))? '0' : 'W');
|
||||||
x++;
|
x++;
|
||||||
if (z)
|
if (z)
|
||||||
{
|
{
|
||||||
buffer[j++] = y;
|
buffer[len++] = y;
|
||||||
y = 0;
|
y = 0;
|
||||||
}
|
}
|
||||||
z ^= 1;
|
z ^= 1;
|
||||||
}
|
}
|
||||||
buffer[j] = 0;
|
|
||||||
bptr = buffer;
|
|
||||||
|
|
||||||
while (*bptr != 0)
|
if (len < 0) continue; /* With next argument after malformation */
|
||||||
|
|
||||||
|
bptr = buffer;
|
||||||
|
buffend = buffer + len;
|
||||||
|
|
||||||
|
while (bptr < buffend)
|
||||||
{
|
{
|
||||||
rc = utf82ord(bptr, &d);
|
unsigned long int d;
|
||||||
|
int j;
|
||||||
|
int offset;
|
||||||
|
int rc = utf82ord(bptr, buffend, &d, &offset);
|
||||||
|
|
||||||
if (rc > 0)
|
if (rc > 0)
|
||||||
{
|
{
|
||||||
printf("0x%08x <= ", d);
|
printf("U+%08lx <= ", d);
|
||||||
for (j = 0; j < rc; j++) printf("%02x ", bptr[j]);
|
for (j = 0; j < rc; j++) printf("%02x ", bptr[j]);
|
||||||
if (show)
|
if (show)
|
||||||
{
|
{
|
||||||
|
@ -238,16 +306,41 @@ for (; i < argc; i++)
|
||||||
printf("\n");
|
printf("\n");
|
||||||
bptr += rc;
|
bptr += rc;
|
||||||
}
|
}
|
||||||
|
else if (rc == -4)
|
||||||
|
{
|
||||||
|
printf("U+%08lx <= ", d);
|
||||||
|
for (j = 0; j < offset; j++) printf("%02x ", bptr[j]);
|
||||||
|
printf("** Overlong UTF-8 sequence\n");
|
||||||
|
bptr += offset;
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
printf("Malformed UTF-8 at offset %d <= ", -rc);
|
switch (rc)
|
||||||
while (*bptr != 0) printf("%02x ", *bptr++);
|
{
|
||||||
|
case 0: printf("** First byte missing 0x40 bit");
|
||||||
|
break;
|
||||||
|
|
||||||
|
case -1: printf("** First byte has too many high-order bits");
|
||||||
|
break;
|
||||||
|
|
||||||
|
case -2: printf("** Incomplete UTF-8 sequence at end of string");
|
||||||
|
break;
|
||||||
|
|
||||||
|
case -3: printf("** Incomplete UTF-8 sequence");
|
||||||
|
break;
|
||||||
|
|
||||||
|
default: printf("** Unexpected return %d from utf82ord()", rc);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
printf(" at offset %d in string ", offset);
|
||||||
|
while (bptr < buffend) printf("%02x ", *bptr++);
|
||||||
printf("\n");
|
printf("\n");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue