From 1c4dc562e4d75c418dc78e4b86307fb7061ce0f8 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Sun, 14 Oct 2018 14:27:16 +0000 Subject: [PATCH] Upgrade the ucptest program (used only by maintainer) and script run tests. --- maint/ucptest.c | 817 ++++++++++++++++++++++++++++++++----------- testdata/testinput4 | 2 + testdata/testoutput4 | 4 + 3 files changed, 620 insertions(+), 203 deletions(-) diff --git a/maint/ucptest.c b/maint/ucptest.c index 0ffb34a..720160c 100644 --- a/maint/ucptest.c +++ b/maint/ucptest.c @@ -7,14 +7,42 @@ /* Compile thus: gcc -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8 -o ucptest \ ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c + Add -lreadline or -ledit if required. */ -/* If there are arguments, they are a list of hexadecimal code points whose +/* This is a hacked-up program for testing the Unicode properties tables of +PCRE2. It can also be used for finding characters with certain properties. +I wrote it to help with debugging PCRE, and have added things that I found +useful, in a rather haphazard way. The code has never been "tidied" or checked +for robustness. + +If there are arguments, they are a list of hexadecimal code points whose properties are to be output. Otherwise, the program expects to read commands on -stdin, and it writes output to stdout. There is only one command, "findprop", -followed by a list of Unicode code points as hex numbers (without any -prefixes). The output is one line per character, giving its Unicode properties -followed by its other case if there is one. */ +stdin, and it writes output to stdout. There are two commands: + +"findprop" must be followed by a list of Unicode code points as hex numbers +(without any prefixes). The output is one line per character, giving its +Unicode properties followed by its other case if there is one, followed by its +Script Extension list if it is not just the same as the base script. + +"find" must be followed by a list of property names and their values. 
This +finds characters that have those properties. If multiple properties are listed, +they must all be matched. Currently supported: + + script <name> The character must have this script property. Only one + such script may be given. + scriptx <name> This script must be in the character's Script Extension + property list. If this is used many times, all the given + scripts must be present. + type <abbrev> The character's type (e.g. Lu or Nd) must match. + gbreak <name> The grapheme break property must match. + +If a <name> or <abbrev> is preceded by !, the value must NOT be present. For +Script Extensions, there may be a mixture of positive and negative +requirements. All must be satisfied. + +No more than 100 characters are output. If there are more, the list ends with +... */ #ifdef HAVE_CONFIG_H #include "../src/config.h" @@ -31,6 +59,22 @@ followed by its other case if there is one. */ #include "../src/pcre2_internal.h" #include "../src/pcre2_ucp.h" +#ifdef HAVE_UNISTD_H +#include <unistd.h> +#endif + +#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT) +#if defined(SUPPORT_LIBREADLINE) +#include <readline/readline.h> +#include <readline/history.h> +#else +#if defined(HAVE_EDITLINE_READLINE_H) +#include <editline/readline.h> +#else +#include <readline/readline.h> +#endif +#endif +#endif /* -------------------------------------------------------------------*/ @@ -45,185 +89,234 @@ followed by its other case if there is one.
*/ /* -------------------------------------------------------------------*/ +const unsigned char *script_names[] = { /* indexed by script number, see print_prop() */ + US"Unknown", + US"Arabic", + US"Armenian", + US"Bengali", + US"Bopomofo", + US"Braille", + US"Buginese", + US"Buhid", + US"Canadian_Aboriginal", + US"Cherokee", + US"Common", + US"Coptic", + US"Cypriot", + US"Cyrillic", + US"Deseret", + US"Devanagari", + US"Ethiopic", + US"Georgian", + US"Glagolitic", + US"Gothic", + US"Greek", + US"Gujarati", + US"Gurmukhi", + US"Han", + US"Hangul", + US"Hanunoo", + US"Hebrew", + US"Hiragana", + US"Inherited", + US"Kannada", + US"Katakana", + US"Kharoshthi", + US"Khmer", + US"Lao", + US"Latin", + US"Limbu", + US"Linear_B", + US"Malayalam", + US"Mongolian", + US"Myanmar", + US"New_Tai_Lue", + US"Ogham", + US"Old_Italic", + US"Old_Persian", + US"Oriya", + US"Osmanya", + US"Runic", + US"Shavian", + US"Sinhala", + US"Syloti_Nagri", + US"Syriac", + US"Tagalog", + US"Tagbanwa", + US"Tai_Le", + US"Tamil", + US"Telugu", + US"Thaana", + US"Thai", + US"Tibetan", + US"Tifinagh", + US"Ugaritic", + US"Yi", + /* New for Unicode 5.0: */ + US"Balinese", + US"Cuneiform", + US"Nko", + US"Phags_Pa", + US"Phoenician", + /* New for Unicode 5.1: */ + US"Carian", + US"Cham", + US"Kayah_Li", + US"Lepcha", + US"Lycian", + US"Lydian", + US"Ol_Chiki", + US"Rejang", + US"Saurashtra", + US"Sundanese", + US"Vai", + /* New for Unicode 5.2: */ + US"Avestan", + US"Bamum", + US"Egyptian_Hieroglyphs", + US"Imperial_Aramaic", + US"Inscriptional_Pahlavi", + US"Inscriptional_Parthian", + US"Javanese", + US"Kaithi", + US"Lisu", + US"Meetei_Mayek", + US"Old_South_Arabian", + US"Old_Turkic", + US"Samaritan", + US"Tai_Tham", + US"Tai_Viet", + /* New for Unicode 6.0.0 */ + US"Batak", + US"Brahmi", + US"Mandaic", + /* New for Unicode 6.1.0 */ + US"Chakma", + US"Meroitic_Cursive", + US"Meroitic_Hieroglyphs", + US"Miao", + US"Sharada", + US"Sora_Sompeng", + US"Takri", + /* New for Unicode 7.0.0 */ + US"Bassa_Vah", + US"Caucasian_Albanian", + US"Duployan", +
US"Elbasan", + US"Grantha", + US"Khojki", + US"Khudawadi", + US"Linear_A", + US"Mahajani", + US"Manichaean", + US"Mende_Kikakui", + US"Modi", + US"Mro", + US"Nabataean", + US"Old_North_Arabian", + US"Old_Permic", + US"Pahawh_Hmong", + US"Palmyrene", + US"Psalter_Pahlavi", + US"Pau_Cin_Hau", + US"Siddham", + US"Tirhuta", + US"Warang_Citi", + /* New for Unicode 8.0.0 */ + US"Ahom", + US"Anatolian_Hieroglyphs", + US"Hatran", + US"Multani", + US"Old_Hungarian", + US"SignWriting", + /* New for Unicode 10.0.0 (no update since 8.0.0) */ + US"Adlam", + US"Bhaiksuki", + US"Marchen", + US"Newa", + US"Osage", + US"Tangut", + US"Masaram_Gondi", + US"Nushu", + US"Soyombo", + US"Zanabazar_Square", + /* New for Unicode 11.0.0 */ + US"Dogra", + US"Gunjala_Gondi", + US"Hanifi_Rohingya", + US"Makasar", + US"Medefaidrin", + US"Old_Sogdian", + US"Sogdian" +}; + +const unsigned char *type_names[] = { + US"Cc", + US"Cf", + US"Cn", + US"Co", + US"Cs", + US"Ll", + US"Lm", + US"Lo", + US"Lt", + US"Lu", + US"Mc", + US"Me", + US"Mn", + US"Nd", + US"Nl", + US"No", + US"Pc", + US"Pd", + US"Pe", + US"Pf", + US"Pi", + US"Po", + US"Ps", + US"Sc", + US"Sk", + US"Sm", + US"So", + US"Zl", + US"Zp", + US"Zs" +}; + +const unsigned char *gb_names[] = { + US"CR", + US"LF", + US"Control", + US"Extend", + US"Prepend", + US"SpacingMark", + US"L", + US"V", + US"T", + US"LV", + US"LVT", + US"RegionalIndicator", + US"Other", + US"ZWJ", + US"Extended_Pictographic" +}; /************************************************* -* Find a script name * +* Test for interaction * *************************************************/ -static unsigned char * -find_script_name(int script) +static BOOL +is_stdin_tty(void) { -switch(script) - { - default: return US"??"; - case ucp_Unknown: return US"Unknown"; - case ucp_Arabic: return US"Arabic"; - case ucp_Armenian: return US"Armenian"; - case ucp_Balinese: return US"Balinese"; - case ucp_Bengali: return US"Bengali"; - case ucp_Bopomofo: return US"Bopomofo"; - case ucp_Braille: 
return US"Braille"; - case ucp_Buginese: return US"Buginese"; - case ucp_Buhid: return US"Buhid"; - case ucp_Canadian_Aboriginal: return US"Canadian_Aboriginal"; - case ucp_Cherokee: return US"Cherokee"; - case ucp_Common: return US"Common"; - case ucp_Coptic: return US"Coptic"; - case ucp_Cuneiform: return US"Cuneiform"; - case ucp_Cypriot: return US"Cypriot"; - case ucp_Cyrillic: return US"Cyrillic"; - case ucp_Deseret: return US"Deseret"; - case ucp_Devanagari: return US"Devanagari"; - case ucp_Ethiopic: return US"Ethiopic"; - case ucp_Georgian: return US"Georgian"; - case ucp_Glagolitic: return US"Glagolitic"; - case ucp_Gothic: return US"Gothic"; - case ucp_Greek: return US"Greek"; - case ucp_Gujarati: return US"Gujarati"; - case ucp_Gurmukhi: return US"Gurmukhi"; - case ucp_Han: return US"Han"; - case ucp_Hangul: return US"Hangul"; - case ucp_Hanunoo: return US"Hanunoo"; - case ucp_Hebrew: return US"Hebrew"; - case ucp_Hiragana: return US"Hiragana"; - case ucp_Inherited: return US"Inherited"; - case ucp_Kannada: return US"Kannada"; - case ucp_Katakana: return US"Katakana"; - case ucp_Kharoshthi: return US"Kharoshthi"; - case ucp_Khmer: return US"Khmer"; - case ucp_Lao: return US"Lao"; - case ucp_Latin: return US"Latin"; - case ucp_Limbu: return US"Limbu"; - case ucp_Linear_B: return US"Linear_B"; - case ucp_Malayalam: return US"Malayalam"; - case ucp_Mongolian: return US"Mongolian"; - case ucp_Myanmar: return US"Myanmar"; - case ucp_New_Tai_Lue: return US"New_Tai_Lue"; - case ucp_Nko: return US"Nko"; - case ucp_Ogham: return US"Ogham"; - case ucp_Old_Italic: return US"Old_Italic"; - case ucp_Old_Persian: return US"Old_Persian"; - case ucp_Oriya: return US"Oriya"; - case ucp_Osmanya: return US"Osmanya"; - case ucp_Phags_Pa: return US"Phags_Pa"; - case ucp_Phoenician: return US"Phoenician"; - case ucp_Runic: return US"Runic"; - case ucp_Shavian: return US"Shavian"; - case ucp_Sinhala: return US"Sinhala"; - case ucp_Syloti_Nagri: return US"Syloti_Nagri"; - case 
ucp_Syriac: return US"Syriac"; - case ucp_Tagalog: return US"Tagalog"; - case ucp_Tagbanwa: return US"Tagbanwa"; - case ucp_Tai_Le: return US"Tai_Le"; - case ucp_Tamil: return US"Tamil"; - case ucp_Telugu: return US"Telugu"; - case ucp_Thaana: return US"Thaana"; - case ucp_Thai: return US"Thai"; - case ucp_Tibetan: return US"Tibetan"; - case ucp_Tifinagh: return US"Tifinagh"; - case ucp_Ugaritic: return US"Ugaritic"; - case ucp_Yi: return US"Yi"; - /* New for Unicode 5.1: */ - case ucp_Carian: return US"Carian"; - case ucp_Cham: return US"Cham"; - case ucp_Kayah_Li: return US"Kayah_Li"; - case ucp_Lepcha: return US"Lepcha"; - case ucp_Lycian: return US"Lycian"; - case ucp_Lydian: return US"Lydian"; - case ucp_Ol_Chiki: return US"Ol_Chiki"; - case ucp_Rejang: return US"Rejang"; - case ucp_Saurashtra: return US"Saurashtra"; - case ucp_Sundanese: return US"Sundanese"; - case ucp_Vai: return US"Vai"; - /* New for Unicode 5.2: */ - case ucp_Avestan: return US"Avestan"; - case ucp_Bamum: return US"Bamum"; - case ucp_Egyptian_Hieroglyphs: return US"Egyptian_Hieroglyphs"; - case ucp_Imperial_Aramaic: return US"Imperial_Aramaic"; - case ucp_Inscriptional_Pahlavi: return US"Inscriptional_Pahlavi"; - case ucp_Inscriptional_Parthian: return US"Inscriptional_Parthian"; - case ucp_Javanese: return US"Javanese"; - case ucp_Kaithi: return US"Kaithi"; - case ucp_Lisu: return US"Lisu"; - case ucp_Meetei_Mayek: return US"Meetei_Mayek"; - case ucp_Old_South_Arabian: return US"Old_South_Arabian"; - case ucp_Old_Turkic: return US"Old_Turkic"; - case ucp_Samaritan: return US"Samaritan"; - case ucp_Tai_Tham: return US"Tai_Tham"; - case ucp_Tai_Viet: return US"Tai_Viet"; - /* New for Unicode 6.0.0 */ - case ucp_Batak: return US"Batak"; - case ucp_Brahmi: return US"Brahmi"; - case ucp_Mandaic: return US"Mandaic"; - - /* New for Unicode 6.1.0 */ - case ucp_Chakma: return US"Chakma"; - case ucp_Meroitic_Cursive: return US"Meroitic_Cursive"; - case ucp_Meroitic_Hieroglyphs: return 
US"Meroitic_Hieroglyphs"; - case ucp_Miao: return US"Miao"; - case ucp_Sharada: return US"Sharada"; - case ucp_Sora_Sompeng: return US"Sora Sompent"; - case ucp_Takri: return US"Takri"; - - /* New for Unicode 7.0.0 */ - case ucp_Bassa_Vah: return US"Bassa_Vah"; - case ucp_Caucasian_Albanian: return US"Caucasian_Albanian"; - case ucp_Duployan: return US"Duployan"; - case ucp_Elbasan: return US"Elbasan"; - case ucp_Grantha: return US"Grantha"; - case ucp_Khojki: return US"Khojki"; - case ucp_Khudawadi: return US"Khudawadi"; - case ucp_Linear_A: return US"Linear_A"; - case ucp_Mahajani: return US"Mahajani"; - case ucp_Manichaean: return US"Manichaean"; - case ucp_Mende_Kikakui: return US"Mende_Kikakui"; - case ucp_Modi: return US"Modi"; - case ucp_Mro: return US"Mro"; - case ucp_Nabataean: return US"Nabataean"; - case ucp_Old_North_Arabian: return US"Old_North_Arabian"; - case ucp_Old_Permic: return US"Old_Permic"; - case ucp_Pahawh_Hmong: return US"Pahawh_Hmong"; - case ucp_Palmyrene: return US"Palmyrene"; - case ucp_Psalter_Pahlavi: return US"Psalter_Pahlavi"; - case ucp_Pau_Cin_Hau: return US"Pau_Cin_Hau"; - case ucp_Siddham: return US"Siddham"; - case ucp_Tirhuta: return US"Tirhuta"; - case ucp_Warang_Citi: return US"Warang_Citi"; - - /* New for Unicode 8.0.0 */ - case ucp_Ahom: return US"Ahom"; - case ucp_Anatolian_Hieroglyphs: return US"Anatolian_Hieroglyphs"; - case ucp_Hatran: return US"Hatran"; - case ucp_Multani: return US"Multani"; - case ucp_Old_Hungarian: return US"Old_Hungarian"; - case ucp_SignWriting: return US"SignWriting"; - - /* New for Unicode 10.0.0 (no update since 8.0.0) */ - case ucp_Adlam: return US"Adlam"; - case ucp_Bhaiksuki: return US"Bhaiksuki"; - case ucp_Marchen: return US"Marchen"; - case ucp_Newa: return US"Newa"; - case ucp_Osage: return US"Osage"; - case ucp_Tangut: return US"Tangut"; - case ucp_Masaram_Gondi: return US"Masaram_Gondi"; - case ucp_Nushu: return US"Nushu"; - case ucp_Soyombo: return US"Soyombo"; - case 
ucp_Zanabazar_Square: return US"Zanabazar_Square"; - - /* New for Unicode 11.0.0 */ - case ucp_Dogra: return US"Dogra"; - case ucp_Gunjala_Gondi: return US"Gunjala_Gondi"; - case ucp_Hanifi_Rohingya: return US"Hanifi_Rohingya"; - case ucp_Makasar: return US"Makasar"; - case ucp_Medefaidrin: return US"Medefaidrin"; - case ucp_Old_Sogdian: return US"Old_Sogdian"; - case ucp_Sogdian: return US"Sogdian"; - } +#if defined WIN32 +return _isatty(_fileno(stdin)); +#else +return isatty(fileno(stdin)); +#endif } - /************************************************* * Print Unicode property info for a char * *************************************************/ @@ -239,11 +332,13 @@ int gbprop = UCD_GRAPHBREAK(c); int othercase = UCD_OTHERCASE(c); int caseset = UCD_CASESET(c); -unsigned char *fulltypename = US"??"; -unsigned char *typename = US"??"; -unsigned char *graphbreak = US"??"; +const unsigned char *fulltypename = US"??"; +const unsigned char *typename = US"??"; +const unsigned char *scriptname = US"??"; +const unsigned char *graphbreak = US"??"; -unsigned char *scriptname = find_script_name(script); +if (script < sizeof(script_names)/sizeof(char *)) + scriptname = script_names[script]; switch (type) { @@ -289,7 +384,7 @@ switch (fulltype) case ucp_Zp: fulltypename = US"Paragraph separator"; break; case ucp_Zs: fulltypename = US"Space separator"; break; } - + switch(gbprop) { case ucp_gbCR: graphbreak = US"CR"; break; @@ -308,12 +403,12 @@ switch(gbprop) case ucp_gbOther: graphbreak = US"Other"; break; case ucp_gbZWJ: graphbreak = US"Zero Width Joiner"; break; case ucp_gbExtended_Pictographic: - graphbreak = US"Extended Pictographic"; break; - default: graphbreak = US"Unknown"; break; + graphbreak = US"Extended Pictographic"; break; + default: graphbreak = US"Unknown"; break; } - + printf("%04x %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak); -if (othercase != c) +if (othercase != c) { printf(", %04x", othercase); if (caseset != 0) @@ -321,30 +416,299 
@@ if (othercase != c) const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1; while (*(++p) < NOTACHAR) if (*p != othercase && *p != c) printf(", %04x", *p); - } - } - + } + } + if (scriptx != script) { - printf(", ["); - if (scriptx >= 0) printf("%s", find_script_name(scriptx)); else + printf(", ["); + if (scriptx >= 0) { - char *sep = ""; + scriptname = (scriptx >= sizeof(script_names)/sizeof(char *))? + US"??" : script_names[scriptx]; + printf("%s", scriptname); + } + else + { + char *sep = ""; const uint8_t *p = PRIV(ucd_script_sets) - scriptx; while (*p != 0) { - printf("%s%s", sep, find_script_name(*p++)); - sep = ", "; - } - } + unsigned int sx = *p++; /* always advance, even past an unknown script number */ + scriptname = (sx >= sizeof(script_names)/sizeof(char *))? US"??" : script_names[sx]; + printf("%s%s", sep, scriptname); + sep = ", "; + } + } printf("]"); - } - + } + printf("\n"); } +/************************************************* +* Find character(s) with given property/ies * +*************************************************/ + +static void +find_chars(unsigned char *s) +{ +unsigned char name[24]; +unsigned char value[24]; +unsigned char *t; +unsigned int count= 0; +int scriptx_list[24]; +unsigned int scriptx_count = 0; +uint32_t i, c; +int script = -1; +int type = -1; +int gbreak = -1; +BOOL script_not = FALSE; +BOOL type_not = FALSE; +BOOL gbreak_not = FALSE; +BOOL hadrange = FALSE; +const ucd_record *ucd, *next_ucd; +const char *pad = "      "; /* as wide as a "%04x.." range prefix */ + +while (*s != 0) + { + unsigned int offset = 0; + BOOL scriptx_not = FALSE; + + for (t = name; *s != 0 && !isspace(*s); s++) if (t < name + sizeof(name) - 1) *t++ = *s; /* truncate, never overrun */ + *t = 0; + while (isspace(*s)) s++; + + for (t = value; *s != 0 && !isspace(*s); s++) if (t < value + sizeof(value) - 1) *t++ = *s; /* truncate, never overrun */ + *t = 0; + while (isspace(*s)) s++; + + if (strcmp(CS name, "script") == 0 || + strcmp(CS name, "scriptx") == 0) + { + if (value[0] == '!') + { + if (name[6] == 'x') scriptx_not = TRUE; + else script_not = TRUE; + offset = 1; + } + + for (i = 0; i < sizeof(script_names)/sizeof(char *); i++) + { + if (strcmp(CS value + offset, script_names[i]) == 0)
+ { + if (name[6] == 'x') + { + scriptx_list[scriptx_count++] = scriptx_not? -(int)i : (int)i; /* negative entry = script must be absent */ + } + else + { + if (script < 0) script = i; else + { + printf("** Only 1 script value allowed\n"); + return; + } + } + break; + } + } + + if (i >= sizeof(script_names)/sizeof(char *)) + { + printf("** Unrecognized script name '%s'\n", value); + return; + } + } + + else if (strcmp(CS name, "type") == 0) + { + if (type >= 0) + { + printf("** Only 1 type value allowed\n"); + return; + } + else + { + if (value[0] == '!') + { + type_not = TRUE; + offset = 1; + } + + for (i = 0; i < sizeof(type_names)/sizeof(char *); i++) + { + if (strcmp(CS (value + offset), type_names[i]) == 0) + { + type = i; + break; + } + } + if (i >= sizeof(type_names)/sizeof(char *)) + { + printf("** Unrecognized type name '%s'\n", value); + return; + } + } + } + + else if (strcmp(CS name, "gbreak") == 0) + { + if (gbreak >= 0) + { + printf("** Only 1 grapheme break value allowed\n"); + return; + } + else + { + if (value[0] == '!') + { + gbreak_not = TRUE; + offset = 1; + } + + for (i = 0; i < sizeof(gb_names)/sizeof(char *); i++) + { + if (strcmp(CS (value + offset), gb_names[i]) == 0) + { + gbreak = i; + break; + } + } + if (i >= sizeof(gb_names)/sizeof(char *)) + { + printf("** Unrecognized gbreak name '%s'\n", value); + return; + } + } + } + + else + { + printf("** Unrecognized property name '%s'\n", name); + return; + } + } + +if (script < 0 && scriptx_count == 0 && type < 0 && gbreak < 0) + { + printf("** No properties specified\n"); + return; + } + +for (c = 0; c <= 0x10ffff; c++) + { + if (script >= 0 && (script == UCD_SCRIPT(c)) == script_not) continue; + + if (scriptx_count > 0) + { + const uint8_t *char_scriptx = NULL; + int found = 0; + int scriptx = UCD_SCRIPTX(c); + + if (scriptx < 0) char_scriptx = PRIV(ucd_script_sets) - scriptx; + + for (i = 0; i < scriptx_count; i++) + { + /* Positive requirement */ + if (scriptx_list[i] >= 0) + { + if (scriptx >= 0) + { + if (scriptx == scriptx_list[i]) found++;
+ } + + else + { + const uint8_t *p; + for (p = char_scriptx; *p != 0; p++) + { + if (scriptx_list[i] == *p) + { + found++; + break; + } + } + } + } + /* Negative requirement */ + else + { + if (scriptx >= 0) + { + if (scriptx != -scriptx_list[i]) found++; + } + else + { + const uint8_t *p; + for (p = char_scriptx; *p != 0; p++) + if (-scriptx_list[i] == *p) break; + if (*p == 0) found++; + } + } + } + + if (found != scriptx_count) continue; + } + + if (type >= 0) + { + if (type_not) + { + if (type == UCD_CHARTYPE(c)) continue; + } + else + { + if (type != UCD_CHARTYPE(c)) continue; + } + } + + if (gbreak >= 0) + { + if (gbreak_not) + { + if (gbreak == UCD_GRAPHBREAK(c)) continue; + } + else + { + if (gbreak != UCD_GRAPHBREAK(c)) continue; + } + } + + /* All conditions are met. Look for runs. */ + + ucd = GET_UCD(c); + + for (i = c + 1; i <= 0x10ffff; i++) /* inclusive, so the last code point can end a run */ + { + next_ucd = GET_UCD(i); + if (memcmp(ucd, next_ucd, sizeof(ucd_record)) != 0) break; + } + + if (--i > c) + { + printf("%04x..", c); + c = i; + hadrange = TRUE; + } + else if (hadrange) printf("%s", pad); + + print_prop(c); + if (c >= 0x100000) pad = "        "; /* 6 hex digits + ".." */ + else if (c >= 0x10000) pad = "       "; /* 5 hex digits + ".." */ + count++; + if (count >= 100) + { + printf("...\n"); + break; + } + } + +if (count == 0) printf("No characters found\n"); +} + + /************************************************* * Main program * *************************************************/ @@ -352,6 +716,7 @@ printf("\n"); int main(int argc, char **argv) { +BOOL interactive; unsigned char buffer[1024]; if (argc > 1) @@ -359,19 +724,48 @@ if (argc > 1) int i; for (i = 1; i < argc; i++) { - unsigned char *endptr; + unsigned char *endptr; int c = strtoul(argv[i], CSS(&endptr), 16); - print_prop(c); + if (*endptr != 0) + printf("** Hex number expected; ignored '%s'\n", argv[i]); + else print_prop(c); } return 0; - } + } -while (fgets(CS buffer, sizeof(buffer), stdin) != NULL) +interactive = is_stdin_tty(); + +#if defined(SUPPORT_LIBREADLINE) ||
defined(SUPPORT_LIBEDIT) +if (interactive) using_history(); +#endif + +for(;;) { unsigned char name[24]; unsigned char *s, *t; - printf("%s", buffer); +#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT) + if (interactive) + { + size_t len; + s = readline("> "); + if (s == NULL) break; + len = strlen(s); + if (len > 0) add_history(s); + if (len > sizeof(buffer) - 2) len = sizeof(buffer) - 2; /* leave room for "\n" and NUL */ + memcpy(buffer, s, len); + memcpy(buffer + len, "\n", 2); /* appends newline and terminator */ + free(s); + } + else +#endif + + { + if (interactive) printf("> "); + if (fgets(CS buffer, sizeof(buffer), stdin) == NULL) break; + if (!interactive) printf("%s", buffer); + } + s = buffer; while (isspace(*s)) s++; if (*s == 0) continue; @@ -386,15 +780,32 @@ while (fgets(CS buffer, sizeof(buffer), stdin) != NULL) { unsigned char *endptr; int c = strtoul(CS s, CSS(&endptr), 16); - print_prop(c); + + if (*endptr != 0 && !isspace(*endptr)) + { + while (*endptr != 0 && !isspace(*endptr)) endptr++; + printf("** Hex number expected; ignored '%.*s'\n", (int)(endptr-s), s); /* %.* takes an int */ + } + else print_prop(c); s = endptr; while (isspace(*s)) s++; } } - else printf("Unknown test command %s\n", name); + else if (strcmp(CS name, "find") == 0) + { + find_chars(s); + } + + else printf("** Unknown test command %s\n", name); } +if (interactive) printf("\n"); + +#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT) +if (interactive) clear_history(); +#endif + return 0; } diff --git a/testdata/testinput4 b/testdata/testinput4 index 9ab4191..c8e9d69 100644 --- a/testdata/testinput4 +++ b/testdata/testinput4 @@ -2432,6 +2432,8 @@ AB\x{1cf7} Latin Latin Common-extended-Beng \x{1cf7}AB Common-extend-Beng Latin Latin \x{1cf7}\x{0993} Common-extend-Beng Bengali + A\x{1abe}BC Test enclosing mark + \x{0370}\x{1abe}\x{0371} Which can occur with any script (Greek here) # Test loop breaking for empty string match diff --git a/testdata/testoutput4 b/testdata/testoutput4 index b16e7b9..bef3111 100644 --- a/testdata/testoutput4 +++ b/testdata/testoutput4 @@ -3936,6 +3936,10 @@ No match 0: \x{1cf7}
\x{1cf7}\x{0993} Common-extend-Beng Bengali 0: \x{1cf7}\x{993} + A\x{1abe}BC Test enclosing mark + 0: A\x{1abe}BC + \x{0370}\x{1abe}\x{0371} Which can occur with any script (Greek here) + 0: \x{370}\x{1abe}\x{371} # Test loop breaking for empty string match