Upgrade the ucptest program (used only by maintainer) and script run tests.
This commit is contained in:
parent
0fc5cda13b
commit
1c4dc562e4
783
maint/ucptest.c
783
maint/ucptest.c
|
@ -7,14 +7,42 @@
|
|||
/* Compile thus:
|
||||
gcc -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8 -o ucptest \
|
||||
ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c
|
||||
Add -lreadline or -ledit if required.
|
||||
*/
|
||||
|
||||
/* If there are arguments, they are a list of hexadecimal code points whose
|
||||
/* This is a hacked-up program for testing the Unicode properties tables of
|
||||
PCRE2. It can also be used for finding characters with certain properties.
|
||||
I wrote it to help with debugging PCRE, and have added things that I found
|
||||
useful, in a rather haphazard way. The code has never been "tidied" or checked
|
||||
for robustness.
|
||||
|
||||
If there are arguments, they are a list of hexadecimal code points whose
|
||||
properties are to be output. Otherwise, the program expects to read commands on
|
||||
stdin, and it writes output to stdout. There is only one command, "findprop",
|
||||
followed by a list of Unicode code points as hex numbers (without any
|
||||
prefixes). The output is one line per character, giving its Unicode properties
|
||||
followed by its other case if there is one. */
|
||||
stdin, and it writes output to stdout. There are two commands:
|
||||
|
||||
"findprop" must be followed by a list of Unicode code points as hex numbers
|
||||
(without any prefixes). The output is one line per character, giving its
|
||||
Unicode properties followed by its other case if there is one, followed by its
|
||||
Script Extension list if it is not just the same as the base script.
|
||||
|
||||
"find" must be followed by a list of property names and their values. This
|
||||
finds characters that have those properties. If multiple properties are listed,
|
||||
they must all be matched. Currently supported:
|
||||
|
||||
script <name> The character must have this script property. Only one
|
||||
such script may be given.
|
||||
scriptx <name> This script must be in the character's Script Extension
|
||||
property list. If this is used many times, all the given
|
||||
scripts must be present.
|
||||
type <abbrev> The character's type (e.g. Lu or Nd) must match.
|
||||
gbreak <name> The grapheme break property must match.
|
||||
|
||||
If a <name> or <abbrev> is preceded by !, the value must NOT be present. For
|
||||
Script Extensions, there may be a mixture of positive and negative
|
||||
requirements. All must be satisfied.
|
||||
|
||||
No more than 100 characters are output. If there are more, the list ends with
|
||||
... */
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "../src/config.h"
|
||||
|
@ -31,6 +59,22 @@ followed by its other case if there is one. */
|
|||
#include "../src/pcre2_internal.h"
|
||||
#include "../src/pcre2_ucp.h"
|
||||
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
|
||||
#if defined(SUPPORT_LIBREADLINE)
|
||||
#include <readline/readline.h>
|
||||
#include <readline/history.h>
|
||||
#else
|
||||
#if defined(HAVE_EDITLINE_READLINE_H)
|
||||
#include <editline/readline.h>
|
||||
#else
|
||||
#include <readline/readline.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
/* -------------------------------------------------------------------*/
|
||||
|
@ -45,185 +89,234 @@ followed by its other case if there is one. */
|
|||
/* -------------------------------------------------------------------*/
|
||||
|
||||
|
||||
const unsigned char *script_names[] = {
|
||||
US"Unknown",
|
||||
US"Arabic",
|
||||
US"Armenian",
|
||||
US"Bengali",
|
||||
US"Bopomofo",
|
||||
US"Braille",
|
||||
US"Buginese",
|
||||
US"Buhid",
|
||||
US"Canadian_Aboriginal",
|
||||
US"Cherokee",
|
||||
US"Common",
|
||||
US"Coptic",
|
||||
US"Cypriot",
|
||||
US"Cyrillic",
|
||||
US"Deseret",
|
||||
US"Devanagari",
|
||||
US"Ethiopic",
|
||||
US"Georgian",
|
||||
US"Glagolitic",
|
||||
US"Gothic",
|
||||
US"Greek",
|
||||
US"Gujarati",
|
||||
US"Gurmukhi",
|
||||
US"Han",
|
||||
US"Hangul",
|
||||
US"Hanunoo",
|
||||
US"Hebrew",
|
||||
US"Hiragana",
|
||||
US"Inherited",
|
||||
US"Kannada",
|
||||
US"Katakana",
|
||||
US"Kharoshthi",
|
||||
US"Khmer",
|
||||
US"Lao",
|
||||
US"Latin",
|
||||
US"Limbu",
|
||||
US"Linear_B",
|
||||
US"Malayalam",
|
||||
US"Mongolian",
|
||||
US"Myanmar",
|
||||
US"New_Tai_Lue",
|
||||
US"Ogham",
|
||||
US"Old_Italic",
|
||||
US"Old_Persian",
|
||||
US"Oriya",
|
||||
US"Osmanya",
|
||||
US"Runic",
|
||||
US"Shavian",
|
||||
US"Sinhala",
|
||||
US"Syloti_Nagri",
|
||||
US"Syriac",
|
||||
US"Tagalog",
|
||||
US"Tagbanwa",
|
||||
US"Tai_Le",
|
||||
US"Tamil",
|
||||
US"Telugu",
|
||||
US"Thaana",
|
||||
US"Thai",
|
||||
US"Tibetan",
|
||||
US"Tifinagh",
|
||||
US"Ugaritic",
|
||||
US"Yi",
|
||||
/* New for Unicode 5.0: */
|
||||
US"Balinese",
|
||||
US"Cuneiform",
|
||||
US"Nko",
|
||||
US"Phags_Pa",
|
||||
US"Phoenician",
|
||||
/* New for Unicode 5.1: */
|
||||
US"Carian",
|
||||
US"Cham",
|
||||
US"Kayah_Li",
|
||||
US"Lepcha",
|
||||
US"Lycian",
|
||||
US"Lydian",
|
||||
US"Ol_Chiki",
|
||||
US"Rejang",
|
||||
US"Saurashtra",
|
||||
US"Sundanese",
|
||||
US"Vai",
|
||||
/* New for Unicode 5.2: */
|
||||
US"Avestan",
|
||||
US"Bamum",
|
||||
US"Egyptian_Hieroglyphs",
|
||||
US"Imperial_Aramaic",
|
||||
US"Inscriptional_Pahlavi",
|
||||
US"Inscriptional_Parthian",
|
||||
US"Javanese",
|
||||
US"Kaithi",
|
||||
US"Lisu",
|
||||
US"Meetei_Mayek",
|
||||
US"Old_South_Arabian",
|
||||
US"Old_Turkic",
|
||||
US"Samaritan",
|
||||
US"Tai_Tham",
|
||||
US"Tai_Viet",
|
||||
/* New for Unicode 6.0.0 */
|
||||
US"Batak",
|
||||
US"Brahmi",
|
||||
US"Mandaic",
|
||||
/* New for Unicode 6.1.0 */
|
||||
US"Chakma",
|
||||
US"Meroitic_Cursive",
|
||||
US"Meroitic_Hieroglyphs",
|
||||
US"Miao",
|
||||
US"Sharada",
|
||||
US"Sora Sompent",
|
||||
US"Takri",
|
||||
/* New for Unicode 7.0.0 */
|
||||
US"Bassa_Vah",
|
||||
US"Caucasian_Albanian",
|
||||
US"Duployan",
|
||||
US"Elbasan",
|
||||
US"Grantha",
|
||||
US"Khojki",
|
||||
US"Khudawadi",
|
||||
US"Linear_A",
|
||||
US"Mahajani",
|
||||
US"Manichaean",
|
||||
US"Mende_Kikakui",
|
||||
US"Modi",
|
||||
US"Mro",
|
||||
US"Nabataean",
|
||||
US"Old_North_Arabian",
|
||||
US"Old_Permic",
|
||||
US"Pahawh_Hmong",
|
||||
US"Palmyrene",
|
||||
US"Psalter_Pahlavi",
|
||||
US"Pau_Cin_Hau",
|
||||
US"Siddham",
|
||||
US"Tirhuta",
|
||||
US"Warang_Citi",
|
||||
/* New for Unicode 8.0.0 */
|
||||
US"Ahom",
|
||||
US"Anatolian_Hieroglyphs",
|
||||
US"Hatran",
|
||||
US"Multani",
|
||||
US"Old_Hungarian",
|
||||
US"SignWriting",
|
||||
/* New for Unicode 10.0.0 (no update since 8.0.0) */
|
||||
US"Adlam",
|
||||
US"Bhaiksuki",
|
||||
US"Marchen",
|
||||
US"Newa",
|
||||
US"Osage",
|
||||
US"Tangut",
|
||||
US"Masaram_Gondi",
|
||||
US"Nushu",
|
||||
US"Soyombo",
|
||||
US"Zanabazar_Square",
|
||||
/* New for Unicode 11.0.0 */
|
||||
US"Dogra",
|
||||
US"Gunjala_Gondi",
|
||||
US"Hanifi_Rohingya",
|
||||
US"Makasar",
|
||||
US"Medefaidrin",
|
||||
US"Old_Sogdian",
|
||||
US"Sogdian"
|
||||
};
|
||||
|
||||
const unsigned char *type_names[] = {
|
||||
US"Cc",
|
||||
US"Cf",
|
||||
US"Cn",
|
||||
US"Co",
|
||||
US"Cs",
|
||||
US"Ll",
|
||||
US"Lm",
|
||||
US"Lo",
|
||||
US"Lt",
|
||||
US"Lu",
|
||||
US"Mc",
|
||||
US"Me",
|
||||
US"Mn",
|
||||
US"Nd",
|
||||
US"Nl",
|
||||
US"No",
|
||||
US"Pc",
|
||||
US"Pd",
|
||||
US"Pe",
|
||||
US"Pf",
|
||||
US"Pi",
|
||||
US"Po",
|
||||
US"Ps",
|
||||
US"Sc",
|
||||
US"Sk",
|
||||
US"Sm",
|
||||
US"So",
|
||||
US"Zl",
|
||||
US"Zp",
|
||||
US"Zs"
|
||||
};
|
||||
|
||||
const unsigned char *gb_names[] = {
|
||||
US"CR",
|
||||
US"LF",
|
||||
US"Control",
|
||||
US"Extend",
|
||||
US"Prepend",
|
||||
US"SpacingMark",
|
||||
US"L",
|
||||
US"V",
|
||||
US"T",
|
||||
US"LV",
|
||||
US"LVT",
|
||||
US"RegionalIndicator",
|
||||
US"Other",
|
||||
US"ZWJ",
|
||||
US"Extended_Pictographic"
|
||||
};
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Find a script name *
|
||||
* Test for interaction *
|
||||
*************************************************/
|
||||
|
||||
static unsigned char *
|
||||
find_script_name(int script)
|
||||
static BOOL
|
||||
is_stdin_tty(void)
|
||||
{
|
||||
switch(script)
|
||||
{
|
||||
default: return US"??";
|
||||
case ucp_Unknown: return US"Unknown";
|
||||
case ucp_Arabic: return US"Arabic";
|
||||
case ucp_Armenian: return US"Armenian";
|
||||
case ucp_Balinese: return US"Balinese";
|
||||
case ucp_Bengali: return US"Bengali";
|
||||
case ucp_Bopomofo: return US"Bopomofo";
|
||||
case ucp_Braille: return US"Braille";
|
||||
case ucp_Buginese: return US"Buginese";
|
||||
case ucp_Buhid: return US"Buhid";
|
||||
case ucp_Canadian_Aboriginal: return US"Canadian_Aboriginal";
|
||||
case ucp_Cherokee: return US"Cherokee";
|
||||
case ucp_Common: return US"Common";
|
||||
case ucp_Coptic: return US"Coptic";
|
||||
case ucp_Cuneiform: return US"Cuneiform";
|
||||
case ucp_Cypriot: return US"Cypriot";
|
||||
case ucp_Cyrillic: return US"Cyrillic";
|
||||
case ucp_Deseret: return US"Deseret";
|
||||
case ucp_Devanagari: return US"Devanagari";
|
||||
case ucp_Ethiopic: return US"Ethiopic";
|
||||
case ucp_Georgian: return US"Georgian";
|
||||
case ucp_Glagolitic: return US"Glagolitic";
|
||||
case ucp_Gothic: return US"Gothic";
|
||||
case ucp_Greek: return US"Greek";
|
||||
case ucp_Gujarati: return US"Gujarati";
|
||||
case ucp_Gurmukhi: return US"Gurmukhi";
|
||||
case ucp_Han: return US"Han";
|
||||
case ucp_Hangul: return US"Hangul";
|
||||
case ucp_Hanunoo: return US"Hanunoo";
|
||||
case ucp_Hebrew: return US"Hebrew";
|
||||
case ucp_Hiragana: return US"Hiragana";
|
||||
case ucp_Inherited: return US"Inherited";
|
||||
case ucp_Kannada: return US"Kannada";
|
||||
case ucp_Katakana: return US"Katakana";
|
||||
case ucp_Kharoshthi: return US"Kharoshthi";
|
||||
case ucp_Khmer: return US"Khmer";
|
||||
case ucp_Lao: return US"Lao";
|
||||
case ucp_Latin: return US"Latin";
|
||||
case ucp_Limbu: return US"Limbu";
|
||||
case ucp_Linear_B: return US"Linear_B";
|
||||
case ucp_Malayalam: return US"Malayalam";
|
||||
case ucp_Mongolian: return US"Mongolian";
|
||||
case ucp_Myanmar: return US"Myanmar";
|
||||
case ucp_New_Tai_Lue: return US"New_Tai_Lue";
|
||||
case ucp_Nko: return US"Nko";
|
||||
case ucp_Ogham: return US"Ogham";
|
||||
case ucp_Old_Italic: return US"Old_Italic";
|
||||
case ucp_Old_Persian: return US"Old_Persian";
|
||||
case ucp_Oriya: return US"Oriya";
|
||||
case ucp_Osmanya: return US"Osmanya";
|
||||
case ucp_Phags_Pa: return US"Phags_Pa";
|
||||
case ucp_Phoenician: return US"Phoenician";
|
||||
case ucp_Runic: return US"Runic";
|
||||
case ucp_Shavian: return US"Shavian";
|
||||
case ucp_Sinhala: return US"Sinhala";
|
||||
case ucp_Syloti_Nagri: return US"Syloti_Nagri";
|
||||
case ucp_Syriac: return US"Syriac";
|
||||
case ucp_Tagalog: return US"Tagalog";
|
||||
case ucp_Tagbanwa: return US"Tagbanwa";
|
||||
case ucp_Tai_Le: return US"Tai_Le";
|
||||
case ucp_Tamil: return US"Tamil";
|
||||
case ucp_Telugu: return US"Telugu";
|
||||
case ucp_Thaana: return US"Thaana";
|
||||
case ucp_Thai: return US"Thai";
|
||||
case ucp_Tibetan: return US"Tibetan";
|
||||
case ucp_Tifinagh: return US"Tifinagh";
|
||||
case ucp_Ugaritic: return US"Ugaritic";
|
||||
case ucp_Yi: return US"Yi";
|
||||
/* New for Unicode 5.1: */
|
||||
case ucp_Carian: return US"Carian";
|
||||
case ucp_Cham: return US"Cham";
|
||||
case ucp_Kayah_Li: return US"Kayah_Li";
|
||||
case ucp_Lepcha: return US"Lepcha";
|
||||
case ucp_Lycian: return US"Lycian";
|
||||
case ucp_Lydian: return US"Lydian";
|
||||
case ucp_Ol_Chiki: return US"Ol_Chiki";
|
||||
case ucp_Rejang: return US"Rejang";
|
||||
case ucp_Saurashtra: return US"Saurashtra";
|
||||
case ucp_Sundanese: return US"Sundanese";
|
||||
case ucp_Vai: return US"Vai";
|
||||
/* New for Unicode 5.2: */
|
||||
case ucp_Avestan: return US"Avestan";
|
||||
case ucp_Bamum: return US"Bamum";
|
||||
case ucp_Egyptian_Hieroglyphs: return US"Egyptian_Hieroglyphs";
|
||||
case ucp_Imperial_Aramaic: return US"Imperial_Aramaic";
|
||||
case ucp_Inscriptional_Pahlavi: return US"Inscriptional_Pahlavi";
|
||||
case ucp_Inscriptional_Parthian: return US"Inscriptional_Parthian";
|
||||
case ucp_Javanese: return US"Javanese";
|
||||
case ucp_Kaithi: return US"Kaithi";
|
||||
case ucp_Lisu: return US"Lisu";
|
||||
case ucp_Meetei_Mayek: return US"Meetei_Mayek";
|
||||
case ucp_Old_South_Arabian: return US"Old_South_Arabian";
|
||||
case ucp_Old_Turkic: return US"Old_Turkic";
|
||||
case ucp_Samaritan: return US"Samaritan";
|
||||
case ucp_Tai_Tham: return US"Tai_Tham";
|
||||
case ucp_Tai_Viet: return US"Tai_Viet";
|
||||
/* New for Unicode 6.0.0 */
|
||||
case ucp_Batak: return US"Batak";
|
||||
case ucp_Brahmi: return US"Brahmi";
|
||||
case ucp_Mandaic: return US"Mandaic";
|
||||
|
||||
/* New for Unicode 6.1.0 */
|
||||
case ucp_Chakma: return US"Chakma";
|
||||
case ucp_Meroitic_Cursive: return US"Meroitic_Cursive";
|
||||
case ucp_Meroitic_Hieroglyphs: return US"Meroitic_Hieroglyphs";
|
||||
case ucp_Miao: return US"Miao";
|
||||
case ucp_Sharada: return US"Sharada";
|
||||
case ucp_Sora_Sompeng: return US"Sora Sompent";
|
||||
case ucp_Takri: return US"Takri";
|
||||
|
||||
/* New for Unicode 7.0.0 */
|
||||
case ucp_Bassa_Vah: return US"Bassa_Vah";
|
||||
case ucp_Caucasian_Albanian: return US"Caucasian_Albanian";
|
||||
case ucp_Duployan: return US"Duployan";
|
||||
case ucp_Elbasan: return US"Elbasan";
|
||||
case ucp_Grantha: return US"Grantha";
|
||||
case ucp_Khojki: return US"Khojki";
|
||||
case ucp_Khudawadi: return US"Khudawadi";
|
||||
case ucp_Linear_A: return US"Linear_A";
|
||||
case ucp_Mahajani: return US"Mahajani";
|
||||
case ucp_Manichaean: return US"Manichaean";
|
||||
case ucp_Mende_Kikakui: return US"Mende_Kikakui";
|
||||
case ucp_Modi: return US"Modi";
|
||||
case ucp_Mro: return US"Mro";
|
||||
case ucp_Nabataean: return US"Nabataean";
|
||||
case ucp_Old_North_Arabian: return US"Old_North_Arabian";
|
||||
case ucp_Old_Permic: return US"Old_Permic";
|
||||
case ucp_Pahawh_Hmong: return US"Pahawh_Hmong";
|
||||
case ucp_Palmyrene: return US"Palmyrene";
|
||||
case ucp_Psalter_Pahlavi: return US"Psalter_Pahlavi";
|
||||
case ucp_Pau_Cin_Hau: return US"Pau_Cin_Hau";
|
||||
case ucp_Siddham: return US"Siddham";
|
||||
case ucp_Tirhuta: return US"Tirhuta";
|
||||
case ucp_Warang_Citi: return US"Warang_Citi";
|
||||
|
||||
/* New for Unicode 8.0.0 */
|
||||
case ucp_Ahom: return US"Ahom";
|
||||
case ucp_Anatolian_Hieroglyphs: return US"Anatolian_Hieroglyphs";
|
||||
case ucp_Hatran: return US"Hatran";
|
||||
case ucp_Multani: return US"Multani";
|
||||
case ucp_Old_Hungarian: return US"Old_Hungarian";
|
||||
case ucp_SignWriting: return US"SignWriting";
|
||||
|
||||
/* New for Unicode 10.0.0 (no update since 8.0.0) */
|
||||
case ucp_Adlam: return US"Adlam";
|
||||
case ucp_Bhaiksuki: return US"Bhaiksuki";
|
||||
case ucp_Marchen: return US"Marchen";
|
||||
case ucp_Newa: return US"Newa";
|
||||
case ucp_Osage: return US"Osage";
|
||||
case ucp_Tangut: return US"Tangut";
|
||||
case ucp_Masaram_Gondi: return US"Masaram_Gondi";
|
||||
case ucp_Nushu: return US"Nushu";
|
||||
case ucp_Soyombo: return US"Soyombo";
|
||||
case ucp_Zanabazar_Square: return US"Zanabazar_Square";
|
||||
|
||||
/* New for Unicode 11.0.0 */
|
||||
case ucp_Dogra: return US"Dogra";
|
||||
case ucp_Gunjala_Gondi: return US"Gunjala_Gondi";
|
||||
case ucp_Hanifi_Rohingya: return US"Hanifi_Rohingya";
|
||||
case ucp_Makasar: return US"Makasar";
|
||||
case ucp_Medefaidrin: return US"Medefaidrin";
|
||||
case ucp_Old_Sogdian: return US"Old_Sogdian";
|
||||
case ucp_Sogdian: return US"Sogdian";
|
||||
}
|
||||
#if defined WIN32
|
||||
return _isatty(_fileno(stdin));
|
||||
#else
|
||||
return isatty(fileno(stdin));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Print Unicode property info for a char *
|
||||
*************************************************/
|
||||
|
@ -239,11 +332,13 @@ int gbprop = UCD_GRAPHBREAK(c);
|
|||
int othercase = UCD_OTHERCASE(c);
|
||||
int caseset = UCD_CASESET(c);
|
||||
|
||||
unsigned char *fulltypename = US"??";
|
||||
unsigned char *typename = US"??";
|
||||
unsigned char *graphbreak = US"??";
|
||||
const unsigned char *fulltypename = US"??";
|
||||
const unsigned char *typename = US"??";
|
||||
const unsigned char *scriptname = US"??";
|
||||
const unsigned char *graphbreak = US"??";
|
||||
|
||||
unsigned char *scriptname = find_script_name(script);
|
||||
if (script < sizeof(script_names)/sizeof(char *))
|
||||
scriptname = script_names[script];
|
||||
|
||||
switch (type)
|
||||
{
|
||||
|
@ -327,13 +422,21 @@ if (othercase != c)
|
|||
if (scriptx != script)
|
||||
{
|
||||
printf(", [");
|
||||
if (scriptx >= 0) printf("%s", find_script_name(scriptx)); else
|
||||
if (scriptx >= 0)
|
||||
{
|
||||
scriptname = (scriptx >= sizeof(script_names)/sizeof(char *))?
|
||||
US"??" : script_names[scriptx];
|
||||
printf("%s", scriptname);
|
||||
}
|
||||
else
|
||||
{
|
||||
char *sep = "";
|
||||
const uint8_t *p = PRIV(ucd_script_sets) - scriptx;
|
||||
while (*p != 0)
|
||||
{
|
||||
printf("%s%s", sep, find_script_name(*p++));
|
||||
scriptname = (*p >= sizeof(script_names)/sizeof(char *))?
|
||||
US"??" : script_names[*p++];
|
||||
printf("%s%s", sep, scriptname);
|
||||
sep = ", ";
|
||||
}
|
||||
}
|
||||
|
@ -345,6 +448,267 @@ printf("\n");
|
|||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Find character(s) with given property/ies *
|
||||
*************************************************/
|
||||
|
||||
static void
|
||||
find_chars(unsigned char *s)
|
||||
{
|
||||
unsigned char name[24];
|
||||
unsigned char value[24];
|
||||
unsigned char *t;
|
||||
unsigned int count= 0;
|
||||
int scriptx_list[24];
|
||||
unsigned int scriptx_count = 0;
|
||||
uint32_t i, c;
|
||||
int script = -1;
|
||||
int type = -1;
|
||||
int gbreak = -1;
|
||||
BOOL script_not = FALSE;
|
||||
BOOL type_not = FALSE;
|
||||
BOOL gbreak_not = FALSE;
|
||||
BOOL hadrange = FALSE;
|
||||
const ucd_record *ucd, *next_ucd;
|
||||
const char *pad = " ";
|
||||
|
||||
while (*s != 0)
|
||||
{
|
||||
unsigned int offset = 0;
|
||||
BOOL scriptx_not = FALSE;
|
||||
|
||||
for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
|
||||
*t = 0;
|
||||
while (isspace(*s)) s++;
|
||||
|
||||
for (t = value; *s != 0 && !isspace(*s); s++) *t++ = *s;
|
||||
*t = 0;
|
||||
while (isspace(*s)) s++;
|
||||
|
||||
if (strcmp(CS name, "script") == 0 ||
|
||||
strcmp(CS name, "scriptx") == 0)
|
||||
{
|
||||
if (value[0] == '!')
|
||||
{
|
||||
if (name[6] == 'x') scriptx_not = TRUE;
|
||||
else script_not = TRUE;
|
||||
offset = 1;
|
||||
}
|
||||
|
||||
for (i = 0; i < sizeof(script_names)/sizeof(char *); i++)
|
||||
{
|
||||
if (strcmp(CS value + offset, script_names[i]) == 0)
|
||||
{
|
||||
if (name[6] == 'x')
|
||||
{
|
||||
scriptx_list[scriptx_count++] = scriptx_not? (-i):i;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (script < 0) script = i; else
|
||||
{
|
||||
printf("** Only 1 script value allowed\n");
|
||||
return;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (i >= sizeof(script_names)/sizeof(char *))
|
||||
{
|
||||
printf("** Unrecognized script name '%s'\n", value);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
else if (strcmp(CS name, "type") == 0)
|
||||
{
|
||||
if (type >= 0)
|
||||
{
|
||||
printf("** Only 1 type value allowed\n");
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (value[0] == '!')
|
||||
{
|
||||
type_not = TRUE;
|
||||
offset = 1;
|
||||
}
|
||||
|
||||
for (i = 0; i < sizeof(type_names)/sizeof(char *); i++)
|
||||
{
|
||||
if (strcmp(CS (value + offset), type_names[i]) == 0)
|
||||
{
|
||||
type = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i >= sizeof(type_names)/sizeof(char *))
|
||||
{
|
||||
printf("** Unrecognized type name '%s'\n", value);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
else if (strcmp(CS name, "gbreak") == 0)
|
||||
{
|
||||
if (gbreak >= 0)
|
||||
{
|
||||
printf("** Only 1 grapheme break value allowed\n");
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (value[0] == '!')
|
||||
{
|
||||
gbreak_not = TRUE;
|
||||
offset = 1;
|
||||
}
|
||||
|
||||
for (i = 0; i < sizeof(gb_names)/sizeof(char *); i++)
|
||||
{
|
||||
if (strcmp(CS (value + offset), gb_names[i]) == 0)
|
||||
{
|
||||
gbreak = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i >= sizeof(gb_names)/sizeof(char *))
|
||||
{
|
||||
printf("** Unrecognized gbreak name '%s'\n", value);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
printf("** Unrecognized property name '%s'\n", name);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (script < 0 && scriptx_count == 0 && type < 0 && gbreak < 0)
|
||||
{
|
||||
printf("** No properties specified\n");
|
||||
return;
|
||||
}
|
||||
|
||||
for (c = 0; c <= 0x10ffff; c++)
|
||||
{
|
||||
if (script >= 0 && (script == UCD_SCRIPT(c)) == script_not) continue;
|
||||
|
||||
if (scriptx_count > 0)
|
||||
{
|
||||
const uint8_t *char_scriptx = NULL;
|
||||
int found = 0;
|
||||
int scriptx = UCD_SCRIPTX(c);
|
||||
|
||||
if (scriptx < 0) char_scriptx = PRIV(ucd_script_sets) - scriptx;
|
||||
|
||||
for (i = 0; i < scriptx_count; i++)
|
||||
{
|
||||
/* Positive requirment */
|
||||
if (scriptx_list[i] >= 0)
|
||||
{
|
||||
if (scriptx >= 0)
|
||||
{
|
||||
if (scriptx == scriptx_list[i]) found++;
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
const uint8_t *p;
|
||||
for (p = char_scriptx; *p != 0; p++)
|
||||
{
|
||||
if (scriptx_list[i] == *p)
|
||||
{
|
||||
found++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Negative requirement */
|
||||
else
|
||||
{
|
||||
if (scriptx >= 0)
|
||||
{
|
||||
if (scriptx != -scriptx_list[i]) found++;
|
||||
}
|
||||
else
|
||||
{
|
||||
const uint8_t *p;
|
||||
for (p = char_scriptx; *p != 0; p++)
|
||||
if (-scriptx_list[i] == *p) break;
|
||||
if (*p == 0) found++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (found != scriptx_count) continue;
|
||||
}
|
||||
|
||||
if (type >= 0)
|
||||
{
|
||||
if (type_not)
|
||||
{
|
||||
if (type == UCD_CHARTYPE(c)) continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (type != UCD_CHARTYPE(c)) continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (gbreak >= 0)
|
||||
{
|
||||
if (gbreak_not)
|
||||
{
|
||||
if (gbreak == UCD_GRAPHBREAK(c)) continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (gbreak != UCD_GRAPHBREAK(c)) continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* All conditions are met. Look for runs. */
|
||||
|
||||
ucd = GET_UCD(c);
|
||||
|
||||
for (i = c + 1; i < 0x10ffff; i++)
|
||||
{
|
||||
next_ucd = GET_UCD(i);
|
||||
if (memcmp(ucd, next_ucd, sizeof(ucd_record)) != 0) break;
|
||||
}
|
||||
|
||||
if (--i > c)
|
||||
{
|
||||
printf("%04x..", c);
|
||||
c = i;
|
||||
hadrange = TRUE;
|
||||
}
|
||||
else if (hadrange) printf("%s", pad);
|
||||
|
||||
print_prop(c);
|
||||
if (c >= 0x100000) pad = " ";
|
||||
else if (c >= 0x10000) pad = " ";
|
||||
count++;
|
||||
if (count >= 100)
|
||||
{
|
||||
printf("...\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (count == 0) printf("No characters found\n");
|
||||
}
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Main program *
|
||||
*************************************************/
|
||||
|
@ -352,6 +716,7 @@ printf("\n");
|
|||
int
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
BOOL interactive;
|
||||
unsigned char buffer[1024];
|
||||
|
||||
if (argc > 1)
|
||||
|
@ -361,17 +726,46 @@ if (argc > 1)
|
|||
{
|
||||
unsigned char *endptr;
|
||||
int c = strtoul(argv[i], CSS(&endptr), 16);
|
||||
print_prop(c);
|
||||
if (*endptr != 0)
|
||||
printf("** Hex number expected; ignored '%s'\n", argv[i]);
|
||||
else print_prop(c);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)
|
||||
interactive = is_stdin_tty();
|
||||
|
||||
#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
|
||||
if (interactive) using_history();
|
||||
#endif
|
||||
|
||||
for(;;)
|
||||
{
|
||||
unsigned char name[24];
|
||||
unsigned char *s, *t;
|
||||
|
||||
printf("%s", buffer);
|
||||
#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
|
||||
if (interactive)
|
||||
{
|
||||
size_t len;
|
||||
s = readline("> ");
|
||||
if (s == NULL) break;
|
||||
len = strlen(s);
|
||||
if (len > 0) add_history(s);
|
||||
memcpy(buffer, s, len);
|
||||
buffer[len] = '\n';
|
||||
buffer[len+1] = 0;
|
||||
free(s);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
|
||||
{
|
||||
if (interactive) printf("> ");
|
||||
if (fgets(CS buffer, sizeof(buffer), stdin) == NULL) break;
|
||||
if (!interactive) printf("%s", buffer);
|
||||
}
|
||||
|
||||
s = buffer;
|
||||
while (isspace(*s)) s++;
|
||||
if (*s == 0) continue;
|
||||
|
@ -386,15 +780,32 @@ while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)
|
|||
{
|
||||
unsigned char *endptr;
|
||||
int c = strtoul(CS s, CSS(&endptr), 16);
|
||||
print_prop(c);
|
||||
|
||||
if (*endptr != 0 && !isspace(*endptr))
|
||||
{
|
||||
while (*endptr != 0 && !isspace(*endptr)) endptr++;
|
||||
printf("** Hex number expected; ignored '%.*s'\n", endptr-s, s);
|
||||
}
|
||||
else print_prop(c);
|
||||
s = endptr;
|
||||
while (isspace(*s)) s++;
|
||||
}
|
||||
}
|
||||
|
||||
else printf("Unknown test command %s\n", name);
|
||||
else if (strcmp(CS name, "find") == 0)
|
||||
{
|
||||
find_chars(s);
|
||||
}
|
||||
|
||||
else printf("** Unknown test command %s\n", name);
|
||||
}
|
||||
|
||||
if (interactive) printf("\n");
|
||||
|
||||
#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
|
||||
if (interactive) clear_history();
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -2432,6 +2432,8 @@
|
|||
AB\x{1cf7} Latin Latin Common-extended-Beng
|
||||
\x{1cf7}AB Common-extend-Beng Latin Latin
|
||||
\x{1cf7}\x{0993} Common-extend-Beng Bengali
|
||||
A\x{1abe}BC Test enclosing mark
|
||||
\x{0370}\x{1abe}\x{0371} Which can occur with any script (Greek here)
|
||||
|
||||
# Test loop breaking for empty string match
|
||||
|
||||
|
|
|
@ -3936,6 +3936,10 @@ No match
|
|||
0: \x{1cf7}
|
||||
\x{1cf7}\x{0993} Common-extend-Beng Bengali
|
||||
0: \x{1cf7}\x{993}
|
||||
A\x{1abe}BC Test enclosing mark
|
||||
0: A\x{1abe}BC
|
||||
\x{0370}\x{1abe}\x{0371} Which can occur with any script (Greek here)
|
||||
0: \x{370}\x{1abe}\x{371}
|
||||
|
||||
# Test loop breaking for empty string match
|
||||
|
||||
|
|
Loading…
Reference in New Issue