Upgrade the ucptest program (used only by the maintainer) and the script run tests.

This commit is contained in:
Philip.Hazel 2018-10-14 14:27:16 +00:00
parent 0fc5cda13b
commit 1c4dc562e4
3 changed files with 620 additions and 203 deletions

View File

@ -7,14 +7,42 @@
/* Compile thus: /* Compile thus:
gcc -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8 -o ucptest \ gcc -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8 -o ucptest \
ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c
Add -lreadline or -ledit if required.
*/ */
/* If there are arguments, they are a list of hexadecimal code points whose /* This is a hacked-up program for testing the Unicode properties tables of
PCRE2. It can also be used for finding characters with certain properties.
I wrote it to help with debugging PCRE, and have added things that I found
useful, in a rather haphazard way. The code has never been "tidied" or checked
for robustness.
If there are arguments, they are a list of hexadecimal code points whose
properties are to be output. Otherwise, the program expects to read commands on properties are to be output. Otherwise, the program expects to read commands on
stdin, and it writes output to stdout. There is only one command, "findprop", stdin, and it writes output to stdout. There are two commands:
followed by a list of Unicode code points as hex numbers (without any
prefixes). The output is one line per character, giving its Unicode properties "findprop" must be followed by a list of Unicode code points as hex numbers
followed by its other case if there is one. */ (without any prefixes). The output is one line per character, giving its
Unicode properties followed by its other case if there is one, followed by its
Script Extension list if it is not just the same as the base script.
"find" must be followed by a list of property names and their values. This
finds characters that have those properties. If multiple properties are listed,
they must all be matched. Currently supported:
script <name> The character must have this script property. Only one
such script may be given.
scriptx <name> This script must be in the character's Script Extension
property list. If this is used many times, all the given
scripts must be present.
type <abbrev> The character's type (e.g. Lu or Nd) must match.
gbreak <name> The grapheme break property must match.
If a <name> or <abbrev> is preceded by !, the value must NOT be present. For
Script Extensions, there may be a mixture of positive and negative
requirements. All must be satisfied.
No more than 100 characters are output. If there are more, the list ends with
... */
#ifdef HAVE_CONFIG_H #ifdef HAVE_CONFIG_H
#include "../src/config.h" #include "../src/config.h"
@ -31,6 +59,22 @@ followed by its other case if there is one. */
#include "../src/pcre2_internal.h" #include "../src/pcre2_internal.h"
#include "../src/pcre2_ucp.h" #include "../src/pcre2_ucp.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
#if defined(SUPPORT_LIBREADLINE)
#include <readline/readline.h>
#include <readline/history.h>
#else
#if defined(HAVE_EDITLINE_READLINE_H)
#include <editline/readline.h>
#else
#include <readline/readline.h>
#endif
#endif
#endif
/* -------------------------------------------------------------------*/ /* -------------------------------------------------------------------*/
@ -45,185 +89,234 @@ followed by its other case if there is one. */
/* -------------------------------------------------------------------*/ /* -------------------------------------------------------------------*/
/* Table of script names. The position of each name in this table is used as
the script's numeric value: find_chars() compares the index of a matched name
directly with UCD_SCRIPT(c) and with the entries of a Script Extension list,
and print_prop() indexes this table by those same values. The order therefore
must not be changed. Names must not contain white space, because find_chars()
splits its input into white-space-delimited words. */

const unsigned char *script_names[] = {
  US"Unknown",
  US"Arabic",
  US"Armenian",
  US"Bengali",
  US"Bopomofo",
  US"Braille",
  US"Buginese",
  US"Buhid",
  US"Canadian_Aboriginal",
  US"Cherokee",
  US"Common",
  US"Coptic",
  US"Cypriot",
  US"Cyrillic",
  US"Deseret",
  US"Devanagari",
  US"Ethiopic",
  US"Georgian",
  US"Glagolitic",
  US"Gothic",
  US"Greek",
  US"Gujarati",
  US"Gurmukhi",
  US"Han",
  US"Hangul",
  US"Hanunoo",
  US"Hebrew",
  US"Hiragana",
  US"Inherited",
  US"Kannada",
  US"Katakana",
  US"Kharoshthi",
  US"Khmer",
  US"Lao",
  US"Latin",
  US"Limbu",
  US"Linear_B",
  US"Malayalam",
  US"Mongolian",
  US"Myanmar",
  US"New_Tai_Lue",
  US"Ogham",
  US"Old_Italic",
  US"Old_Persian",
  US"Oriya",
  US"Osmanya",
  US"Runic",
  US"Shavian",
  US"Sinhala",
  US"Syloti_Nagri",
  US"Syriac",
  US"Tagalog",
  US"Tagbanwa",
  US"Tai_Le",
  US"Tamil",
  US"Telugu",
  US"Thaana",
  US"Thai",
  US"Tibetan",
  US"Tifinagh",
  US"Ugaritic",
  US"Yi",
  /* New for Unicode 5.0: */
  US"Balinese",
  US"Cuneiform",
  US"Nko",
  US"Phags_Pa",
  US"Phoenician",
  /* New for Unicode 5.1: */
  US"Carian",
  US"Cham",
  US"Kayah_Li",
  US"Lepcha",
  US"Lycian",
  US"Lydian",
  US"Ol_Chiki",
  US"Rejang",
  US"Saurashtra",
  US"Sundanese",
  US"Vai",
  /* New for Unicode 5.2: */
  US"Avestan",
  US"Bamum",
  US"Egyptian_Hieroglyphs",
  US"Imperial_Aramaic",
  US"Inscriptional_Pahlavi",
  US"Inscriptional_Parthian",
  US"Javanese",
  US"Kaithi",
  US"Lisu",
  US"Meetei_Mayek",
  US"Old_South_Arabian",
  US"Old_Turkic",
  US"Samaritan",
  US"Tai_Tham",
  US"Tai_Viet",
  /* New for Unicode 6.0.0 */
  US"Batak",
  US"Brahmi",
  US"Mandaic",
  /* New for Unicode 6.1.0 */
  US"Chakma",
  US"Meroitic_Cursive",
  US"Meroitic_Hieroglyphs",
  US"Miao",
  US"Sharada",
  US"Sora_Sompeng",   /* Was misspelt "Sora Sompent", which could never be matched */
  US"Takri",
  /* New for Unicode 7.0.0 */
  US"Bassa_Vah",
  US"Caucasian_Albanian",
  US"Duployan",
  US"Elbasan",
  US"Grantha",
  US"Khojki",
  US"Khudawadi",
  US"Linear_A",
  US"Mahajani",
  US"Manichaean",
  US"Mende_Kikakui",
  US"Modi",
  US"Mro",
  US"Nabataean",
  US"Old_North_Arabian",
  US"Old_Permic",
  US"Pahawh_Hmong",
  US"Palmyrene",
  US"Psalter_Pahlavi",
  US"Pau_Cin_Hau",
  US"Siddham",
  US"Tirhuta",
  US"Warang_Citi",
  /* New for Unicode 8.0.0 */
  US"Ahom",
  US"Anatolian_Hieroglyphs",
  US"Hatran",
  US"Multani",
  US"Old_Hungarian",
  US"SignWriting",
  /* New for Unicode 10.0.0 (no update since 8.0.0) */
  US"Adlam",
  US"Bhaiksuki",
  US"Marchen",
  US"Newa",
  US"Osage",
  US"Tangut",
  US"Masaram_Gondi",
  US"Nushu",
  US"Soyombo",
  US"Zanabazar_Square",
  /* New for Unicode 11.0.0 */
  US"Dogra",
  US"Gunjala_Gondi",
  US"Hanifi_Rohingya",
  US"Makasar",
  US"Medefaidrin",
  US"Old_Sogdian",
  US"Sogdian"
};
/* Two-letter abbreviations for character types. A name's position in this
table is the numeric value that find_chars() compares with UCD_CHARTYPE(c), so
the order is significant. The rows below merely group the entries by their
first letter; the sequence is unchanged. */

const unsigned char *type_names[] = {
  US"Cc", US"Cf", US"Cn", US"Co", US"Cs",
  US"Ll", US"Lm", US"Lo", US"Lt", US"Lu",
  US"Mc", US"Me", US"Mn",
  US"Nd", US"Nl", US"No",
  US"Pc", US"Pd", US"Pe", US"Pf", US"Pi", US"Po", US"Ps",
  US"Sc", US"Sk", US"Sm", US"So",
  US"Zl", US"Zp", US"Zs"
};
/* Names of the grapheme break property values. A name's position in this
table is the numeric value that find_chars() compares with UCD_GRAPHBREAK(c),
so the order is significant. */

const unsigned char *gb_names[] = {
  US"CR", US"LF", US"Control",
  US"Extend", US"Prepend", US"SpacingMark",
  US"L", US"V", US"T", US"LV", US"LVT",
  US"RegionalIndicator",
  US"Other",
  US"ZWJ",
  US"Extended_Pictographic"
};
/************************************************* /*************************************************
* Find a script name * * Test for interaction *
*************************************************/ *************************************************/
static unsigned char * static BOOL
find_script_name(int script) is_stdin_tty(void)
{ {
switch(script) #if defined WIN32
{ return _isatty(_fileno(stdin));
default: return US"??"; #else
case ucp_Unknown: return US"Unknown"; return isatty(fileno(stdin));
case ucp_Arabic: return US"Arabic"; #endif
case ucp_Armenian: return US"Armenian";
case ucp_Balinese: return US"Balinese";
case ucp_Bengali: return US"Bengali";
case ucp_Bopomofo: return US"Bopomofo";
case ucp_Braille: return US"Braille";
case ucp_Buginese: return US"Buginese";
case ucp_Buhid: return US"Buhid";
case ucp_Canadian_Aboriginal: return US"Canadian_Aboriginal";
case ucp_Cherokee: return US"Cherokee";
case ucp_Common: return US"Common";
case ucp_Coptic: return US"Coptic";
case ucp_Cuneiform: return US"Cuneiform";
case ucp_Cypriot: return US"Cypriot";
case ucp_Cyrillic: return US"Cyrillic";
case ucp_Deseret: return US"Deseret";
case ucp_Devanagari: return US"Devanagari";
case ucp_Ethiopic: return US"Ethiopic";
case ucp_Georgian: return US"Georgian";
case ucp_Glagolitic: return US"Glagolitic";
case ucp_Gothic: return US"Gothic";
case ucp_Greek: return US"Greek";
case ucp_Gujarati: return US"Gujarati";
case ucp_Gurmukhi: return US"Gurmukhi";
case ucp_Han: return US"Han";
case ucp_Hangul: return US"Hangul";
case ucp_Hanunoo: return US"Hanunoo";
case ucp_Hebrew: return US"Hebrew";
case ucp_Hiragana: return US"Hiragana";
case ucp_Inherited: return US"Inherited";
case ucp_Kannada: return US"Kannada";
case ucp_Katakana: return US"Katakana";
case ucp_Kharoshthi: return US"Kharoshthi";
case ucp_Khmer: return US"Khmer";
case ucp_Lao: return US"Lao";
case ucp_Latin: return US"Latin";
case ucp_Limbu: return US"Limbu";
case ucp_Linear_B: return US"Linear_B";
case ucp_Malayalam: return US"Malayalam";
case ucp_Mongolian: return US"Mongolian";
case ucp_Myanmar: return US"Myanmar";
case ucp_New_Tai_Lue: return US"New_Tai_Lue";
case ucp_Nko: return US"Nko";
case ucp_Ogham: return US"Ogham";
case ucp_Old_Italic: return US"Old_Italic";
case ucp_Old_Persian: return US"Old_Persian";
case ucp_Oriya: return US"Oriya";
case ucp_Osmanya: return US"Osmanya";
case ucp_Phags_Pa: return US"Phags_Pa";
case ucp_Phoenician: return US"Phoenician";
case ucp_Runic: return US"Runic";
case ucp_Shavian: return US"Shavian";
case ucp_Sinhala: return US"Sinhala";
case ucp_Syloti_Nagri: return US"Syloti_Nagri";
case ucp_Syriac: return US"Syriac";
case ucp_Tagalog: return US"Tagalog";
case ucp_Tagbanwa: return US"Tagbanwa";
case ucp_Tai_Le: return US"Tai_Le";
case ucp_Tamil: return US"Tamil";
case ucp_Telugu: return US"Telugu";
case ucp_Thaana: return US"Thaana";
case ucp_Thai: return US"Thai";
case ucp_Tibetan: return US"Tibetan";
case ucp_Tifinagh: return US"Tifinagh";
case ucp_Ugaritic: return US"Ugaritic";
case ucp_Yi: return US"Yi";
/* New for Unicode 5.1: */
case ucp_Carian: return US"Carian";
case ucp_Cham: return US"Cham";
case ucp_Kayah_Li: return US"Kayah_Li";
case ucp_Lepcha: return US"Lepcha";
case ucp_Lycian: return US"Lycian";
case ucp_Lydian: return US"Lydian";
case ucp_Ol_Chiki: return US"Ol_Chiki";
case ucp_Rejang: return US"Rejang";
case ucp_Saurashtra: return US"Saurashtra";
case ucp_Sundanese: return US"Sundanese";
case ucp_Vai: return US"Vai";
/* New for Unicode 5.2: */
case ucp_Avestan: return US"Avestan";
case ucp_Bamum: return US"Bamum";
case ucp_Egyptian_Hieroglyphs: return US"Egyptian_Hieroglyphs";
case ucp_Imperial_Aramaic: return US"Imperial_Aramaic";
case ucp_Inscriptional_Pahlavi: return US"Inscriptional_Pahlavi";
case ucp_Inscriptional_Parthian: return US"Inscriptional_Parthian";
case ucp_Javanese: return US"Javanese";
case ucp_Kaithi: return US"Kaithi";
case ucp_Lisu: return US"Lisu";
case ucp_Meetei_Mayek: return US"Meetei_Mayek";
case ucp_Old_South_Arabian: return US"Old_South_Arabian";
case ucp_Old_Turkic: return US"Old_Turkic";
case ucp_Samaritan: return US"Samaritan";
case ucp_Tai_Tham: return US"Tai_Tham";
case ucp_Tai_Viet: return US"Tai_Viet";
/* New for Unicode 6.0.0 */
case ucp_Batak: return US"Batak";
case ucp_Brahmi: return US"Brahmi";
case ucp_Mandaic: return US"Mandaic";
/* New for Unicode 6.1.0 */
case ucp_Chakma: return US"Chakma";
case ucp_Meroitic_Cursive: return US"Meroitic_Cursive";
case ucp_Meroitic_Hieroglyphs: return US"Meroitic_Hieroglyphs";
case ucp_Miao: return US"Miao";
case ucp_Sharada: return US"Sharada";
case ucp_Sora_Sompeng: return US"Sora Sompent";
case ucp_Takri: return US"Takri";
/* New for Unicode 7.0.0 */
case ucp_Bassa_Vah: return US"Bassa_Vah";
case ucp_Caucasian_Albanian: return US"Caucasian_Albanian";
case ucp_Duployan: return US"Duployan";
case ucp_Elbasan: return US"Elbasan";
case ucp_Grantha: return US"Grantha";
case ucp_Khojki: return US"Khojki";
case ucp_Khudawadi: return US"Khudawadi";
case ucp_Linear_A: return US"Linear_A";
case ucp_Mahajani: return US"Mahajani";
case ucp_Manichaean: return US"Manichaean";
case ucp_Mende_Kikakui: return US"Mende_Kikakui";
case ucp_Modi: return US"Modi";
case ucp_Mro: return US"Mro";
case ucp_Nabataean: return US"Nabataean";
case ucp_Old_North_Arabian: return US"Old_North_Arabian";
case ucp_Old_Permic: return US"Old_Permic";
case ucp_Pahawh_Hmong: return US"Pahawh_Hmong";
case ucp_Palmyrene: return US"Palmyrene";
case ucp_Psalter_Pahlavi: return US"Psalter_Pahlavi";
case ucp_Pau_Cin_Hau: return US"Pau_Cin_Hau";
case ucp_Siddham: return US"Siddham";
case ucp_Tirhuta: return US"Tirhuta";
case ucp_Warang_Citi: return US"Warang_Citi";
/* New for Unicode 8.0.0 */
case ucp_Ahom: return US"Ahom";
case ucp_Anatolian_Hieroglyphs: return US"Anatolian_Hieroglyphs";
case ucp_Hatran: return US"Hatran";
case ucp_Multani: return US"Multani";
case ucp_Old_Hungarian: return US"Old_Hungarian";
case ucp_SignWriting: return US"SignWriting";
/* New for Unicode 10.0.0 (no update since 8.0.0) */
case ucp_Adlam: return US"Adlam";
case ucp_Bhaiksuki: return US"Bhaiksuki";
case ucp_Marchen: return US"Marchen";
case ucp_Newa: return US"Newa";
case ucp_Osage: return US"Osage";
case ucp_Tangut: return US"Tangut";
case ucp_Masaram_Gondi: return US"Masaram_Gondi";
case ucp_Nushu: return US"Nushu";
case ucp_Soyombo: return US"Soyombo";
case ucp_Zanabazar_Square: return US"Zanabazar_Square";
/* New for Unicode 11.0.0 */
case ucp_Dogra: return US"Dogra";
case ucp_Gunjala_Gondi: return US"Gunjala_Gondi";
case ucp_Hanifi_Rohingya: return US"Hanifi_Rohingya";
case ucp_Makasar: return US"Makasar";
case ucp_Medefaidrin: return US"Medefaidrin";
case ucp_Old_Sogdian: return US"Old_Sogdian";
case ucp_Sogdian: return US"Sogdian";
}
} }
/************************************************* /*************************************************
* Print Unicode property info for a char * * Print Unicode property info for a char *
*************************************************/ *************************************************/
@ -239,11 +332,13 @@ int gbprop = UCD_GRAPHBREAK(c);
int othercase = UCD_OTHERCASE(c); int othercase = UCD_OTHERCASE(c);
int caseset = UCD_CASESET(c); int caseset = UCD_CASESET(c);
unsigned char *fulltypename = US"??"; const unsigned char *fulltypename = US"??";
unsigned char *typename = US"??"; const unsigned char *typename = US"??";
unsigned char *graphbreak = US"??"; const unsigned char *scriptname = US"??";
const unsigned char *graphbreak = US"??";
unsigned char *scriptname = find_script_name(script); if (script < sizeof(script_names)/sizeof(char *))
scriptname = script_names[script];
switch (type) switch (type)
{ {
@ -327,13 +422,21 @@ if (othercase != c)
if (scriptx != script) if (scriptx != script)
{ {
printf(", ["); printf(", [");
if (scriptx >= 0) printf("%s", find_script_name(scriptx)); else if (scriptx >= 0)
{
scriptname = (scriptx >= sizeof(script_names)/sizeof(char *))?
US"??" : script_names[scriptx];
printf("%s", scriptname);
}
else
{ {
char *sep = ""; char *sep = "";
const uint8_t *p = PRIV(ucd_script_sets) - scriptx; const uint8_t *p = PRIV(ucd_script_sets) - scriptx;
while (*p != 0) while (*p != 0)
{ {
printf("%s%s", sep, find_script_name(*p++)); scriptname = (*p >= sizeof(script_names)/sizeof(char *))?
US"??" : script_names[*p++];
printf("%s%s", sep, scriptname);
sep = ", "; sep = ", ";
} }
} }
@ -345,6 +448,267 @@ printf("\n");
/*************************************************
* Find character(s) with given property/ies *
*************************************************/
/* Copy the next white-space-delimited word from *sptr into buf, whose capacity
is bufsize bytes including the terminating zero. White space following the word
is then skipped, and *sptr is left pointing at the next word or at the
terminating zero of the input.

Returns TRUE if the whole word fitted into buf. Otherwise the copy is truncated
(but still zero-terminated), the remainder of the word is skipped, and FALSE is
returned. */

static BOOL
copy_next_word(unsigned char **sptr, unsigned char *buf, size_t bufsize)
{
unsigned char *s = *sptr;
size_t n = 0;
BOOL fitted = TRUE;

for (; *s != 0 && !isspace(*s); s++)
  {
  if (n + 1 < bufsize) buf[n++] = *s; else fitted = FALSE;
  }
buf[n] = 0;

while (isspace(*s)) s++;
*sptr = s;
return fitted;
}


/* Find and print the characters that satisfy a list of property requirements.

Argument:
  s   a zero-terminated string of "<property> <value>" pairs separated by
      white space. The recognized properties are "script", "scriptx", "type"
      and "gbreak". A value that starts with '!' is a negative requirement.
      Only one "script", one "type" and one "gbreak" may be given; "scriptx"
      may be repeated up to the capacity of the local list.

Every code point from 0 to 0x10ffff is tested, and those that meet all the
requirements are passed to print_prop(). Consecutive code points that follow a
matching one and have an identical ucd_record are shown as a single
"first..last" range. Output stops after 100 lines, with a final "..." line.
Errors in the argument are reported on stdout with a "** " prefix and nothing
is searched. */

static void
find_chars(unsigned char *s)
{
unsigned char name[24];
unsigned char value[24];
unsigned int count = 0;
int scriptx_list[24];           /* Script values for scriptx requirements */
BOOL scriptx_negated[24];       /* TRUE when the matching entry had '!' */
unsigned int scriptx_count = 0;
uint32_t i, c;
int script = -1;
int type = -1;
int gbreak = -1;
BOOL script_not = FALSE;
BOOL type_not = FALSE;
BOOL gbreak_not = FALSE;
BOOL hadrange = FALSE;
const ucd_record *ucd, *next_ucd;
const char *pad = " ";

/* Parse the requirements. */

while (*s != 0)
  {
  unsigned int offset = 0;
  BOOL scriptx_not = FALSE;

  /* The copies are bounded; an over-long word is an error rather than being
  silently truncated, because a truncated word might equal a valid name. */

  if (!copy_next_word(&s, name, sizeof(name)) ||
      !copy_next_word(&s, value, sizeof(value)))
    {
    printf("** Property name or value too long\n");
    return;
    }

  if (strcmp(CS name, "script") == 0 ||
      strcmp(CS name, "scriptx") == 0)
    {
    BOOL is_scriptx = name[6] == 'x';

    if (value[0] == '!')
      {
      if (is_scriptx) scriptx_not = TRUE; else script_not = TRUE;
      offset = 1;
      }

    for (i = 0; i < sizeof(script_names)/sizeof(char *); i++)
      {
      if (strcmp(CS value + offset, (const char *)script_names[i]) != 0)
        continue;

      if (is_scriptx)
        {
        if (scriptx_count >= sizeof(scriptx_list)/sizeof(scriptx_list[0]))
          {
          printf("** Too many scriptx values\n");
          return;
          }

        /* The negation is kept in a separate list. Encoding it as a negative
        script value does not work for script 0, which has no negative. */

        scriptx_list[scriptx_count] = (int)i;
        scriptx_negated[scriptx_count++] = scriptx_not;
        }
      else
        {
        if (script >= 0)
          {
          printf("** Only 1 script value allowed\n");
          return;
          }
        script = (int)i;
        }
      break;
      }

    if (i >= sizeof(script_names)/sizeof(char *))
      {
      printf("** Unrecognized script name '%s'\n", value);
      return;
      }
    }

  else if (strcmp(CS name, "type") == 0)
    {
    if (type >= 0)
      {
      printf("** Only 1 type value allowed\n");
      return;
      }

    if (value[0] == '!')
      {
      type_not = TRUE;
      offset = 1;
      }

    for (i = 0; i < sizeof(type_names)/sizeof(char *); i++)
      {
      if (strcmp(CS (value + offset), (const char *)type_names[i]) == 0)
        {
        type = (int)i;
        break;
        }
      }

    if (i >= sizeof(type_names)/sizeof(char *))
      {
      printf("** Unrecognized type name '%s'\n", value);
      return;
      }
    }

  else if (strcmp(CS name, "gbreak") == 0)
    {
    if (gbreak >= 0)
      {
      printf("** Only 1 grapheme break value allowed\n");
      return;
      }

    if (value[0] == '!')
      {
      gbreak_not = TRUE;
      offset = 1;
      }

    for (i = 0; i < sizeof(gb_names)/sizeof(char *); i++)
      {
      if (strcmp(CS (value + offset), (const char *)gb_names[i]) == 0)
        {
        gbreak = (int)i;
        break;
        }
      }

    if (i >= sizeof(gb_names)/sizeof(char *))
      {
      printf("** Unrecognized gbreak name '%s'\n", value);
      return;
      }
    }

  else
    {
    printf("** Unrecognized property name '%s'\n", name);
    return;
    }
  }

if (script < 0 && scriptx_count == 0 && type < 0 && gbreak < 0)
  {
  printf("** No properties specified\n");
  return;
  }

/* Scan all code points. */

for (c = 0; c <= 0x10ffff; c++)
  {
  /* For each single-valued property, skip the character when "it matches"
  has the same truth value as "a match is forbidden". */

  if (script >= 0 && (script == UCD_SCRIPT(c)) == script_not) continue;

  if (scriptx_count > 0)
    {
    const uint8_t *char_scriptx = NULL;
    BOOL satisfied = TRUE;
    int scriptx = UCD_SCRIPTX(c);

    /* A negative value selects a zero-terminated list of script values in
    ucd_script_sets; a non-negative value is itself a single script value. */

    if (scriptx < 0) char_scriptx = PRIV(ucd_script_sets) - scriptx;

    for (i = 0; i < scriptx_count; i++)
      {
      BOOL present;

      if (scriptx >= 0) present = (scriptx == scriptx_list[i]);
      else
        {
        const uint8_t *p = char_scriptx;
        while (*p != 0 && *p != scriptx_list[i]) p++;
        present = (*p != 0);
        }

      /* A positive requirement needs the script to be present; a negative
      requirement needs it to be absent. */

      if (present == scriptx_negated[i])
        {
        satisfied = FALSE;
        break;
        }
      }

    if (!satisfied) continue;
    }

  if (type >= 0 && (type == UCD_CHARTYPE(c)) == type_not) continue;
  if (gbreak >= 0 && (gbreak == UCD_GRAPHBREAK(c)) == gbreak_not) continue;

  /* All conditions are met. Look for runs: following code points whose
  ucd_record is identical to this one. The scan includes 0x10ffff so that the
  final code point can be part of a run. */

  ucd = GET_UCD(c);

  for (i = c + 1; i <= 0x10ffff; i++)
    {
    next_ucd = GET_UCD(i);
    if (memcmp(ucd, next_ucd, sizeof(ucd_record)) != 0) break;
    }

  if (--i > c)
    {
    printf("%04x..", c);
    c = i;
    hadrange = TRUE;
    }
  else if (hadrange) printf("%s", pad);

  print_prop(c);

  /* NOTE(review): the pad strings are presumably meant to be as wide as the
  "xxxx.." range prefix for the current number of hex digits, so that single
  code points line up with range ends - confirm the literal widths. */

  if (c >= 0x100000) pad = " ";
  else if (c >= 0x10000) pad = " ";

  count++;
  if (count >= 100)
    {
    printf("...\n");
    break;
    }
  }

if (count == 0) printf("No characters found\n");
}
/************************************************* /*************************************************
* Main program * * Main program *
*************************************************/ *************************************************/
@ -352,6 +716,7 @@ printf("\n");
int int
main(int argc, char **argv) main(int argc, char **argv)
{ {
BOOL interactive;
unsigned char buffer[1024]; unsigned char buffer[1024];
if (argc > 1) if (argc > 1)
@ -361,17 +726,46 @@ if (argc > 1)
{ {
unsigned char *endptr; unsigned char *endptr;
int c = strtoul(argv[i], CSS(&endptr), 16); int c = strtoul(argv[i], CSS(&endptr), 16);
print_prop(c); if (*endptr != 0)
printf("** Hex number expected; ignored '%s'\n", argv[i]);
else print_prop(c);
} }
return 0; return 0;
} }
while (fgets(CS buffer, sizeof(buffer), stdin) != NULL) interactive = is_stdin_tty();
#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
if (interactive) using_history();
#endif
for(;;)
{ {
unsigned char name[24]; unsigned char name[24];
unsigned char *s, *t; unsigned char *s, *t;
printf("%s", buffer); #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
if (interactive)
{
size_t len;
s = readline("> ");
if (s == NULL) break;
len = strlen(s);
if (len > 0) add_history(s);
memcpy(buffer, s, len);
buffer[len] = '\n';
buffer[len+1] = 0;
free(s);
}
else
#endif
{
if (interactive) printf("> ");
if (fgets(CS buffer, sizeof(buffer), stdin) == NULL) break;
if (!interactive) printf("%s", buffer);
}
s = buffer; s = buffer;
while (isspace(*s)) s++; while (isspace(*s)) s++;
if (*s == 0) continue; if (*s == 0) continue;
@ -386,15 +780,32 @@ while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)
{ {
unsigned char *endptr; unsigned char *endptr;
int c = strtoul(CS s, CSS(&endptr), 16); int c = strtoul(CS s, CSS(&endptr), 16);
print_prop(c);
if (*endptr != 0 && !isspace(*endptr))
{
while (*endptr != 0 && !isspace(*endptr)) endptr++;
printf("** Hex number expected; ignored '%.*s'\n", endptr-s, s);
}
else print_prop(c);
s = endptr; s = endptr;
while (isspace(*s)) s++; while (isspace(*s)) s++;
} }
} }
else printf("Unknown test command %s\n", name); else if (strcmp(CS name, "find") == 0)
{
find_chars(s);
} }
else printf("** Unknown test command %s\n", name);
}
if (interactive) printf("\n");
#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
if (interactive) clear_history();
#endif
return 0; return 0;
} }

2
testdata/testinput4 vendored
View File

@ -2432,6 +2432,8 @@
AB\x{1cf7} Latin Latin Common-extended-Beng AB\x{1cf7} Latin Latin Common-extended-Beng
\x{1cf7}AB Common-extend-Beng Latin Latin \x{1cf7}AB Common-extend-Beng Latin Latin
\x{1cf7}\x{0993} Common-extend-Beng Bengali \x{1cf7}\x{0993} Common-extend-Beng Bengali
A\x{1abe}BC Test enclosing mark
\x{0370}\x{1abe}\x{0371} Which can occur with any script (Greek here)
# Test loop breaking for empty string match # Test loop breaking for empty string match

View File

@ -3936,6 +3936,10 @@ No match
0: \x{1cf7} 0: \x{1cf7}
\x{1cf7}\x{0993} Common-extend-Beng Bengali \x{1cf7}\x{0993} Common-extend-Beng Bengali
0: \x{1cf7}\x{993} 0: \x{1cf7}\x{993}
A\x{1abe}BC Test enclosing mark
0: A\x{1abe}BC
\x{0370}\x{1abe}\x{0371} Which can occur with any script (Greek here)
0: \x{370}\x{1abe}\x{371}
# Test loop breaking for empty string match # Test loop breaking for empty string match