2014-05-13 13:20:03 +02:00
|
|
|
/***************************************************
|
|
|
|
* A program for testing the Unicode property table *
|
|
|
|
***************************************************/
|
|
|
|
|
2019-07-30 19:59:42 +02:00
|
|
|
/* Copyright (c) University of Cambridge 2008-2019 */
|
2014-05-13 13:20:03 +02:00
|
|
|
|
|
|
|
/* Compile thus:
|
2019-07-30 19:59:42 +02:00
|
|
|
|
2014-05-13 13:20:03 +02:00
|
|
|
gcc -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8 -o ucptest \
|
|
|
|
ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c
|
2019-07-30 19:59:42 +02:00
|
|
|
|
|
|
|
Add -lreadline or -ledit if PCRE2 was configured with readline or libedit
|
|
|
|
support in pcre2test.
|
2014-05-13 13:20:03 +02:00
|
|
|
*/
|
|
|
|
|
2018-10-14 16:27:16 +02:00
|
|
|
/* This is a hacked-up program for testing the Unicode properties tables of
|
|
|
|
PCRE2. It can also be used for finding characters with certain properties.
|
|
|
|
I wrote it to help with debugging PCRE, and have added things that I found
|
|
|
|
useful, in a rather haphazard way. The code has never been "tidied" or checked
|
|
|
|
for robustness.
|
|
|
|
|
|
|
|
If there are arguments, they are a list of hexadecimal code points whose
|
2018-10-06 19:39:52 +02:00
|
|
|
properties are to be output. Otherwise, the program expects to read commands on
|
2018-10-14 16:27:16 +02:00
|
|
|
stdin, and it writes output to stdout. There are two commands:
|
|
|
|
|
|
|
|
"findprop" must be followed by a list of Unicode code points as hex numbers
|
|
|
|
(without any prefixes). The output is one line per character, giving its
|
|
|
|
Unicode properties followed by its other case if there is one, followed by its
|
|
|
|
Script Extension list if it is not just the same as the base script.
|
|
|
|
|
|
|
|
"find" must be followed by a list of property names and their values. This
|
|
|
|
finds characters that have those properties. If multiple properties are listed,
|
|
|
|
they must all be matched. Currently supported:
|
|
|
|
|
|
|
|
script <name> The character must have this script property. Only one
|
|
|
|
such script may be given.
|
|
|
|
scriptx <name> This script must be in the character's Script Extension
|
|
|
|
property list. If this is used many times, all the given
|
|
|
|
scripts must be present.
|
|
|
|
type <abbrev> The character's type (e.g. Lu or Nd) must match.
|
|
|
|
gbreak <name> The grapheme break property must match.
|
|
|
|
|
|
|
|
If a <name> or <abbrev> is preceded by !, the value must NOT be present. For
|
|
|
|
Script Extensions, there may be a mixture of positive and negative
|
|
|
|
requirements. All must be satisfied.
|
|
|
|
|
|
|
|
No more than 100 characters are output. If there are more, the list ends with
|
|
|
|
... */
|
2014-05-13 13:20:03 +02:00
|
|
|
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
|
|
#include "../src/config.h"
|
|
|
|
#endif
|
|
|
|
|
2014-09-19 09:43:39 +02:00
|
|
|
#ifndef SUPPORT_UNICODE
|
|
|
|
#define SUPPORT_UNICODE
|
2014-05-13 13:20:03 +02:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#include <ctype.h>
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <string.h>
|
|
|
|
#include "../src/pcre2_internal.h"
|
|
|
|
#include "../src/pcre2_ucp.h"
|
|
|
|
|
2018-10-14 16:27:16 +02:00
|
|
|
#ifdef HAVE_UNISTD_H
|
|
|
|
#include <unistd.h>
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
|
|
|
|
#if defined(SUPPORT_LIBREADLINE)
|
|
|
|
#include <readline/readline.h>
|
|
|
|
#include <readline/history.h>
|
|
|
|
#else
|
|
|
|
#if defined(HAVE_EDITLINE_READLINE_H)
|
|
|
|
#include <editline/readline.h>
|
|
|
|
#else
|
|
|
|
#include <readline/readline.h>
|
|
|
|
#endif
|
|
|
|
#endif
|
|
|
|
#endif
|
2014-05-13 13:20:03 +02:00
|
|
|
|
|
|
|
|
|
|
|
/* -------------------------------------------------------------------*/
|
|
|
|
|
|
|
|
#define CS (char *)
|
|
|
|
#define CCS (const char *)
|
|
|
|
#define CSS (char **)
|
|
|
|
#define US (unsigned char *)
|
|
|
|
#define CUS (const unsigned char *)
|
|
|
|
#define USS (unsigned char **)
|
|
|
|
|
|
|
|
/* -------------------------------------------------------------------*/
|
|
|
|
|
|
|
|
|
2018-10-14 16:27:16 +02:00
|
|
|
const unsigned char *script_names[] = {
|
|
|
|
US"Unknown",
|
|
|
|
US"Arabic",
|
|
|
|
US"Armenian",
|
|
|
|
US"Bengali",
|
|
|
|
US"Bopomofo",
|
|
|
|
US"Braille",
|
|
|
|
US"Buginese",
|
|
|
|
US"Buhid",
|
|
|
|
US"Canadian_Aboriginal",
|
|
|
|
US"Cherokee",
|
|
|
|
US"Common",
|
|
|
|
US"Coptic",
|
|
|
|
US"Cypriot",
|
|
|
|
US"Cyrillic",
|
|
|
|
US"Deseret",
|
|
|
|
US"Devanagari",
|
|
|
|
US"Ethiopic",
|
|
|
|
US"Georgian",
|
|
|
|
US"Glagolitic",
|
|
|
|
US"Gothic",
|
|
|
|
US"Greek",
|
|
|
|
US"Gujarati",
|
|
|
|
US"Gurmukhi",
|
|
|
|
US"Han",
|
|
|
|
US"Hangul",
|
|
|
|
US"Hanunoo",
|
|
|
|
US"Hebrew",
|
|
|
|
US"Hiragana",
|
|
|
|
US"Inherited",
|
|
|
|
US"Kannada",
|
|
|
|
US"Katakana",
|
|
|
|
US"Kharoshthi",
|
|
|
|
US"Khmer",
|
|
|
|
US"Lao",
|
|
|
|
US"Latin",
|
|
|
|
US"Limbu",
|
|
|
|
US"Linear_B",
|
|
|
|
US"Malayalam",
|
|
|
|
US"Mongolian",
|
|
|
|
US"Myanmar",
|
|
|
|
US"New_Tai_Lue",
|
|
|
|
US"Ogham",
|
|
|
|
US"Old_Italic",
|
|
|
|
US"Old_Persian",
|
|
|
|
US"Oriya",
|
|
|
|
US"Osmanya",
|
|
|
|
US"Runic",
|
|
|
|
US"Shavian",
|
|
|
|
US"Sinhala",
|
|
|
|
US"Syloti_Nagri",
|
|
|
|
US"Syriac",
|
|
|
|
US"Tagalog",
|
|
|
|
US"Tagbanwa",
|
|
|
|
US"Tai_Le",
|
|
|
|
US"Tamil",
|
|
|
|
US"Telugu",
|
|
|
|
US"Thaana",
|
|
|
|
US"Thai",
|
|
|
|
US"Tibetan",
|
|
|
|
US"Tifinagh",
|
|
|
|
US"Ugaritic",
|
|
|
|
US"Yi",
|
|
|
|
/* New for Unicode 5.0: */
|
|
|
|
US"Balinese",
|
|
|
|
US"Cuneiform",
|
|
|
|
US"Nko",
|
|
|
|
US"Phags_Pa",
|
|
|
|
US"Phoenician",
|
2018-10-06 19:39:52 +02:00
|
|
|
/* New for Unicode 5.1: */
|
2018-10-14 16:27:16 +02:00
|
|
|
US"Carian",
|
|
|
|
US"Cham",
|
|
|
|
US"Kayah_Li",
|
|
|
|
US"Lepcha",
|
|
|
|
US"Lycian",
|
|
|
|
US"Lydian",
|
|
|
|
US"Ol_Chiki",
|
|
|
|
US"Rejang",
|
|
|
|
US"Saurashtra",
|
|
|
|
US"Sundanese",
|
|
|
|
US"Vai",
|
2018-10-06 19:39:52 +02:00
|
|
|
/* New for Unicode 5.2: */
|
2018-10-14 16:27:16 +02:00
|
|
|
US"Avestan",
|
|
|
|
US"Bamum",
|
|
|
|
US"Egyptian_Hieroglyphs",
|
|
|
|
US"Imperial_Aramaic",
|
|
|
|
US"Inscriptional_Pahlavi",
|
|
|
|
US"Inscriptional_Parthian",
|
|
|
|
US"Javanese",
|
|
|
|
US"Kaithi",
|
|
|
|
US"Lisu",
|
|
|
|
US"Meetei_Mayek",
|
|
|
|
US"Old_South_Arabian",
|
|
|
|
US"Old_Turkic",
|
|
|
|
US"Samaritan",
|
|
|
|
US"Tai_Tham",
|
|
|
|
US"Tai_Viet",
|
2018-10-06 19:39:52 +02:00
|
|
|
/* New for Unicode 6.0.0 */
|
2018-10-14 16:27:16 +02:00
|
|
|
US"Batak",
|
|
|
|
US"Brahmi",
|
|
|
|
US"Mandaic",
|
2018-10-06 19:39:52 +02:00
|
|
|
/* New for Unicode 6.1.0 */
|
2018-10-14 16:27:16 +02:00
|
|
|
US"Chakma",
|
|
|
|
US"Meroitic_Cursive",
|
|
|
|
US"Meroitic_Hieroglyphs",
|
|
|
|
US"Miao",
|
|
|
|
US"Sharada",
|
|
|
|
US"Sora Sompent",
|
|
|
|
US"Takri",
|
2018-10-06 19:39:52 +02:00
|
|
|
/* New for Unicode 7.0.0 */
|
2018-10-14 16:27:16 +02:00
|
|
|
US"Bassa_Vah",
|
|
|
|
US"Caucasian_Albanian",
|
|
|
|
US"Duployan",
|
|
|
|
US"Elbasan",
|
|
|
|
US"Grantha",
|
|
|
|
US"Khojki",
|
|
|
|
US"Khudawadi",
|
|
|
|
US"Linear_A",
|
|
|
|
US"Mahajani",
|
|
|
|
US"Manichaean",
|
|
|
|
US"Mende_Kikakui",
|
|
|
|
US"Modi",
|
|
|
|
US"Mro",
|
|
|
|
US"Nabataean",
|
|
|
|
US"Old_North_Arabian",
|
|
|
|
US"Old_Permic",
|
|
|
|
US"Pahawh_Hmong",
|
|
|
|
US"Palmyrene",
|
|
|
|
US"Psalter_Pahlavi",
|
|
|
|
US"Pau_Cin_Hau",
|
|
|
|
US"Siddham",
|
|
|
|
US"Tirhuta",
|
|
|
|
US"Warang_Citi",
|
2018-10-06 19:39:52 +02:00
|
|
|
/* New for Unicode 8.0.0 */
|
2018-10-14 16:27:16 +02:00
|
|
|
US"Ahom",
|
|
|
|
US"Anatolian_Hieroglyphs",
|
|
|
|
US"Hatran",
|
|
|
|
US"Multani",
|
|
|
|
US"Old_Hungarian",
|
|
|
|
US"SignWriting",
|
2018-10-06 19:39:52 +02:00
|
|
|
/* New for Unicode 10.0.0 (no update since 8.0.0) */
|
2018-10-14 16:27:16 +02:00
|
|
|
US"Adlam",
|
|
|
|
US"Bhaiksuki",
|
|
|
|
US"Marchen",
|
|
|
|
US"Newa",
|
|
|
|
US"Osage",
|
|
|
|
US"Tangut",
|
|
|
|
US"Masaram_Gondi",
|
|
|
|
US"Nushu",
|
|
|
|
US"Soyombo",
|
|
|
|
US"Zanabazar_Square",
|
|
|
|
/* New for Unicode 11.0.0 */
|
|
|
|
US"Dogra",
|
|
|
|
US"Gunjala_Gondi",
|
|
|
|
US"Hanifi_Rohingya",
|
|
|
|
US"Makasar",
|
|
|
|
US"Medefaidrin",
|
|
|
|
US"Old_Sogdian",
|
2019-07-29 17:32:36 +02:00
|
|
|
US"Sogdian",
|
|
|
|
/* New for Unicode 12.0.0 */
|
|
|
|
US"Elymaic",
|
|
|
|
US"Nandinagari",
|
|
|
|
US"Nyiakeng_Puachue_Hmong",
|
|
|
|
US"Wancho"
|
2018-10-14 16:27:16 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
const unsigned char *type_names[] = {
|
|
|
|
US"Cc",
|
|
|
|
US"Cf",
|
|
|
|
US"Cn",
|
|
|
|
US"Co",
|
|
|
|
US"Cs",
|
|
|
|
US"Ll",
|
|
|
|
US"Lm",
|
|
|
|
US"Lo",
|
|
|
|
US"Lt",
|
|
|
|
US"Lu",
|
|
|
|
US"Mc",
|
|
|
|
US"Me",
|
|
|
|
US"Mn",
|
|
|
|
US"Nd",
|
|
|
|
US"Nl",
|
|
|
|
US"No",
|
|
|
|
US"Pc",
|
|
|
|
US"Pd",
|
|
|
|
US"Pe",
|
|
|
|
US"Pf",
|
|
|
|
US"Pi",
|
|
|
|
US"Po",
|
|
|
|
US"Ps",
|
|
|
|
US"Sc",
|
|
|
|
US"Sk",
|
|
|
|
US"Sm",
|
|
|
|
US"So",
|
|
|
|
US"Zl",
|
|
|
|
US"Zp",
|
|
|
|
US"Zs"
|
|
|
|
};
|
|
|
|
|
|
|
|
const unsigned char *gb_names[] = {
|
|
|
|
US"CR",
|
|
|
|
US"LF",
|
|
|
|
US"Control",
|
|
|
|
US"Extend",
|
|
|
|
US"Prepend",
|
|
|
|
US"SpacingMark",
|
|
|
|
US"L",
|
|
|
|
US"V",
|
|
|
|
US"T",
|
|
|
|
US"LV",
|
|
|
|
US"LVT",
|
|
|
|
US"RegionalIndicator",
|
|
|
|
US"Other",
|
|
|
|
US"ZWJ",
|
|
|
|
US"Extended_Pictographic"
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
/*************************************************
|
|
|
|
* Test for interaction *
|
|
|
|
*************************************************/
|
2018-10-06 19:39:52 +02:00
|
|
|
|
2018-10-14 16:27:16 +02:00
|
|
|
static BOOL
|
|
|
|
is_stdin_tty(void)
|
|
|
|
{
|
|
|
|
#if defined WIN32
|
|
|
|
return _isatty(_fileno(stdin));
|
|
|
|
#else
|
|
|
|
return isatty(fileno(stdin));
|
|
|
|
#endif
|
|
|
|
}
|
2018-10-06 19:39:52 +02:00
|
|
|
|
|
|
|
|
2014-05-13 13:20:03 +02:00
|
|
|
/*************************************************
|
|
|
|
* Print Unicode property info for a char *
|
|
|
|
*************************************************/
|
|
|
|
|
|
|
|
static void
|
|
|
|
print_prop(int c)
|
|
|
|
{
|
|
|
|
int type = UCD_CATEGORY(c);
|
|
|
|
int fulltype = UCD_CHARTYPE(c);
|
|
|
|
int script = UCD_SCRIPT(c);
|
2018-10-06 19:39:52 +02:00
|
|
|
int scriptx = UCD_SCRIPTX(c);
|
2014-05-13 13:20:03 +02:00
|
|
|
int gbprop = UCD_GRAPHBREAK(c);
|
|
|
|
int othercase = UCD_OTHERCASE(c);
|
|
|
|
int caseset = UCD_CASESET(c);
|
|
|
|
|
2018-10-14 16:27:16 +02:00
|
|
|
const unsigned char *fulltypename = US"??";
|
|
|
|
const unsigned char *typename = US"??";
|
|
|
|
const unsigned char *scriptname = US"??";
|
|
|
|
const unsigned char *graphbreak = US"??";
|
2014-05-13 13:20:03 +02:00
|
|
|
|
2018-10-14 16:27:16 +02:00
|
|
|
if (script < sizeof(script_names)/sizeof(char *))
|
|
|
|
scriptname = script_names[script];
|
2018-10-06 19:39:52 +02:00
|
|
|
|
2014-05-13 13:20:03 +02:00
|
|
|
switch (type)
|
|
|
|
{
|
|
|
|
case ucp_C: typename = US"Control"; break;
|
|
|
|
case ucp_L: typename = US"Letter"; break;
|
|
|
|
case ucp_M: typename = US"Mark"; break;
|
|
|
|
case ucp_N: typename = US"Number"; break;
|
|
|
|
case ucp_P: typename = US"Punctuation"; break;
|
|
|
|
case ucp_S: typename = US"Symbol"; break;
|
|
|
|
case ucp_Z: typename = US"Separator"; break;
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (fulltype)
|
|
|
|
{
|
|
|
|
case ucp_Cc: fulltypename = US"Control"; break;
|
|
|
|
case ucp_Cf: fulltypename = US"Format"; break;
|
|
|
|
case ucp_Cn: fulltypename = US"Unassigned"; break;
|
|
|
|
case ucp_Co: fulltypename = US"Private use"; break;
|
|
|
|
case ucp_Cs: fulltypename = US"Surrogate"; break;
|
|
|
|
case ucp_Ll: fulltypename = US"Lower case letter"; break;
|
|
|
|
case ucp_Lm: fulltypename = US"Modifier letter"; break;
|
|
|
|
case ucp_Lo: fulltypename = US"Other letter"; break;
|
|
|
|
case ucp_Lt: fulltypename = US"Title case letter"; break;
|
|
|
|
case ucp_Lu: fulltypename = US"Upper case letter"; break;
|
|
|
|
case ucp_Mc: fulltypename = US"Spacing mark"; break;
|
|
|
|
case ucp_Me: fulltypename = US"Enclosing mark"; break;
|
|
|
|
case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
|
|
|
|
case ucp_Nd: fulltypename = US"Decimal number"; break;
|
|
|
|
case ucp_Nl: fulltypename = US"Letter number"; break;
|
|
|
|
case ucp_No: fulltypename = US"Other number"; break;
|
|
|
|
case ucp_Pc: fulltypename = US"Connector punctuation"; break;
|
|
|
|
case ucp_Pd: fulltypename = US"Dash punctuation"; break;
|
|
|
|
case ucp_Pe: fulltypename = US"Close punctuation"; break;
|
|
|
|
case ucp_Pf: fulltypename = US"Final punctuation"; break;
|
|
|
|
case ucp_Pi: fulltypename = US"Initial punctuation"; break;
|
|
|
|
case ucp_Po: fulltypename = US"Other punctuation"; break;
|
|
|
|
case ucp_Ps: fulltypename = US"Open punctuation"; break;
|
|
|
|
case ucp_Sc: fulltypename = US"Currency symbol"; break;
|
|
|
|
case ucp_Sk: fulltypename = US"Modifier symbol"; break;
|
|
|
|
case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
|
|
|
|
case ucp_So: fulltypename = US"Other symbol"; break;
|
|
|
|
case ucp_Zl: fulltypename = US"Line separator"; break;
|
|
|
|
case ucp_Zp: fulltypename = US"Paragraph separator"; break;
|
|
|
|
case ucp_Zs: fulltypename = US"Space separator"; break;
|
|
|
|
}
|
2018-10-14 16:27:16 +02:00
|
|
|
|
2014-05-13 13:20:03 +02:00
|
|
|
switch(gbprop)
|
|
|
|
{
|
|
|
|
case ucp_gbCR: graphbreak = US"CR"; break;
|
|
|
|
case ucp_gbLF: graphbreak = US"LF"; break;
|
|
|
|
case ucp_gbControl: graphbreak = US"Control"; break;
|
|
|
|
case ucp_gbExtend: graphbreak = US"Extend"; break;
|
|
|
|
case ucp_gbPrepend: graphbreak = US"Prepend"; break;
|
|
|
|
case ucp_gbSpacingMark: graphbreak = US"SpacingMark"; break;
|
|
|
|
case ucp_gbL: graphbreak = US"Hangul syllable type L"; break;
|
|
|
|
case ucp_gbV: graphbreak = US"Hangul syllable type V"; break;
|
|
|
|
case ucp_gbT: graphbreak = US"Hangul syllable type T"; break;
|
|
|
|
case ucp_gbLV: graphbreak = US"Hangul syllable type LV"; break;
|
|
|
|
case ucp_gbLVT: graphbreak = US"Hangul syllable type LVT"; break;
|
2018-07-07 18:10:29 +02:00
|
|
|
case ucp_gbRegionalIndicator:
|
|
|
|
graphbreak = US"Regional Indicator"; break;
|
2014-05-13 13:20:03 +02:00
|
|
|
case ucp_gbOther: graphbreak = US"Other"; break;
|
2018-07-07 18:10:29 +02:00
|
|
|
case ucp_gbZWJ: graphbreak = US"Zero Width Joiner"; break;
|
|
|
|
case ucp_gbExtended_Pictographic:
|
2018-10-14 16:27:16 +02:00
|
|
|
graphbreak = US"Extended Pictographic"; break;
|
|
|
|
default: graphbreak = US"Unknown"; break;
|
2014-05-13 13:20:03 +02:00
|
|
|
}
|
2018-10-14 16:27:16 +02:00
|
|
|
|
2014-05-13 13:20:03 +02:00
|
|
|
printf("%04x %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak);
|
2018-10-14 16:27:16 +02:00
|
|
|
if (othercase != c)
|
2014-05-13 13:20:03 +02:00
|
|
|
{
|
|
|
|
printf(", %04x", othercase);
|
|
|
|
if (caseset != 0)
|
|
|
|
{
|
|
|
|
const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1;
|
|
|
|
while (*(++p) < NOTACHAR)
|
|
|
|
if (*p != othercase && *p != c) printf(", %04x", *p);
|
2018-10-14 16:27:16 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-10-06 19:39:52 +02:00
|
|
|
if (scriptx != script)
|
|
|
|
{
|
2018-10-14 16:27:16 +02:00
|
|
|
printf(", [");
|
|
|
|
if (scriptx >= 0)
|
2018-10-06 19:39:52 +02:00
|
|
|
{
|
2018-10-14 16:27:16 +02:00
|
|
|
scriptname = (scriptx >= sizeof(script_names)/sizeof(char *))?
|
|
|
|
US"??" : script_names[scriptx];
|
|
|
|
printf("%s", scriptname);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
char *sep = "";
|
2018-10-06 19:39:52 +02:00
|
|
|
const uint8_t *p = PRIV(ucd_script_sets) - scriptx;
|
|
|
|
while (*p != 0)
|
|
|
|
{
|
2018-10-14 16:27:16 +02:00
|
|
|
scriptname = (*p >= sizeof(script_names)/sizeof(char *))?
|
|
|
|
US"??" : script_names[*p++];
|
|
|
|
printf("%s%s", sep, scriptname);
|
|
|
|
sep = ", ";
|
|
|
|
}
|
|
|
|
}
|
2018-10-06 19:39:52 +02:00
|
|
|
printf("]");
|
2018-10-14 16:27:16 +02:00
|
|
|
}
|
|
|
|
|
2014-05-13 13:20:03 +02:00
|
|
|
printf("\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
2018-10-14 16:27:16 +02:00
|
|
|
/*************************************************
|
|
|
|
* Find character(s) with given property/ies *
|
|
|
|
*************************************************/
|
|
|
|
|
|
|
|
static void
|
|
|
|
find_chars(unsigned char *s)
|
|
|
|
{
|
|
|
|
unsigned char name[24];
|
|
|
|
unsigned char value[24];
|
|
|
|
unsigned char *t;
|
|
|
|
unsigned int count= 0;
|
|
|
|
int scriptx_list[24];
|
|
|
|
unsigned int scriptx_count = 0;
|
|
|
|
uint32_t i, c;
|
|
|
|
int script = -1;
|
|
|
|
int type = -1;
|
|
|
|
int gbreak = -1;
|
|
|
|
BOOL script_not = FALSE;
|
|
|
|
BOOL type_not = FALSE;
|
|
|
|
BOOL gbreak_not = FALSE;
|
|
|
|
BOOL hadrange = FALSE;
|
|
|
|
const ucd_record *ucd, *next_ucd;
|
|
|
|
const char *pad = " ";
|
|
|
|
|
|
|
|
while (*s != 0)
|
|
|
|
{
|
|
|
|
unsigned int offset = 0;
|
|
|
|
BOOL scriptx_not = FALSE;
|
|
|
|
|
|
|
|
for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
|
|
|
|
*t = 0;
|
|
|
|
while (isspace(*s)) s++;
|
|
|
|
|
|
|
|
for (t = value; *s != 0 && !isspace(*s); s++) *t++ = *s;
|
|
|
|
*t = 0;
|
|
|
|
while (isspace(*s)) s++;
|
|
|
|
|
|
|
|
if (strcmp(CS name, "script") == 0 ||
|
|
|
|
strcmp(CS name, "scriptx") == 0)
|
|
|
|
{
|
|
|
|
if (value[0] == '!')
|
|
|
|
{
|
|
|
|
if (name[6] == 'x') scriptx_not = TRUE;
|
|
|
|
else script_not = TRUE;
|
|
|
|
offset = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < sizeof(script_names)/sizeof(char *); i++)
|
|
|
|
{
|
|
|
|
if (strcmp(CS value + offset, script_names[i]) == 0)
|
|
|
|
{
|
|
|
|
if (name[6] == 'x')
|
|
|
|
{
|
|
|
|
scriptx_list[scriptx_count++] = scriptx_not? (-i):i;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (script < 0) script = i; else
|
|
|
|
{
|
|
|
|
printf("** Only 1 script value allowed\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (i >= sizeof(script_names)/sizeof(char *))
|
|
|
|
{
|
|
|
|
printf("** Unrecognized script name '%s'\n", value);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
else if (strcmp(CS name, "type") == 0)
|
|
|
|
{
|
|
|
|
if (type >= 0)
|
|
|
|
{
|
|
|
|
printf("** Only 1 type value allowed\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (value[0] == '!')
|
|
|
|
{
|
|
|
|
type_not = TRUE;
|
|
|
|
offset = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < sizeof(type_names)/sizeof(char *); i++)
|
|
|
|
{
|
|
|
|
if (strcmp(CS (value + offset), type_names[i]) == 0)
|
|
|
|
{
|
|
|
|
type = i;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (i >= sizeof(type_names)/sizeof(char *))
|
|
|
|
{
|
|
|
|
printf("** Unrecognized type name '%s'\n", value);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
else if (strcmp(CS name, "gbreak") == 0)
|
|
|
|
{
|
|
|
|
if (gbreak >= 0)
|
|
|
|
{
|
|
|
|
printf("** Only 1 grapheme break value allowed\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (value[0] == '!')
|
|
|
|
{
|
|
|
|
gbreak_not = TRUE;
|
|
|
|
offset = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < sizeof(gb_names)/sizeof(char *); i++)
|
|
|
|
{
|
|
|
|
if (strcmp(CS (value + offset), gb_names[i]) == 0)
|
|
|
|
{
|
|
|
|
gbreak = i;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (i >= sizeof(gb_names)/sizeof(char *))
|
|
|
|
{
|
|
|
|
printf("** Unrecognized gbreak name '%s'\n", value);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
else
|
|
|
|
{
|
|
|
|
printf("** Unrecognized property name '%s'\n", name);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (script < 0 && scriptx_count == 0 && type < 0 && gbreak < 0)
|
|
|
|
{
|
|
|
|
printf("** No properties specified\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (c = 0; c <= 0x10ffff; c++)
|
|
|
|
{
|
|
|
|
if (script >= 0 && (script == UCD_SCRIPT(c)) == script_not) continue;
|
|
|
|
|
|
|
|
if (scriptx_count > 0)
|
|
|
|
{
|
|
|
|
const uint8_t *char_scriptx = NULL;
|
|
|
|
int found = 0;
|
|
|
|
int scriptx = UCD_SCRIPTX(c);
|
|
|
|
|
|
|
|
if (scriptx < 0) char_scriptx = PRIV(ucd_script_sets) - scriptx;
|
|
|
|
|
|
|
|
for (i = 0; i < scriptx_count; i++)
|
|
|
|
{
|
|
|
|
/* Positive requirment */
|
|
|
|
if (scriptx_list[i] >= 0)
|
|
|
|
{
|
|
|
|
if (scriptx >= 0)
|
|
|
|
{
|
|
|
|
if (scriptx == scriptx_list[i]) found++;
|
|
|
|
}
|
|
|
|
|
|
|
|
else
|
|
|
|
{
|
|
|
|
const uint8_t *p;
|
|
|
|
for (p = char_scriptx; *p != 0; p++)
|
|
|
|
{
|
|
|
|
if (scriptx_list[i] == *p)
|
|
|
|
{
|
|
|
|
found++;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* Negative requirement */
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (scriptx >= 0)
|
|
|
|
{
|
|
|
|
if (scriptx != -scriptx_list[i]) found++;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
const uint8_t *p;
|
|
|
|
for (p = char_scriptx; *p != 0; p++)
|
|
|
|
if (-scriptx_list[i] == *p) break;
|
|
|
|
if (*p == 0) found++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (found != scriptx_count) continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (type >= 0)
|
|
|
|
{
|
|
|
|
if (type_not)
|
|
|
|
{
|
|
|
|
if (type == UCD_CHARTYPE(c)) continue;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (type != UCD_CHARTYPE(c)) continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (gbreak >= 0)
|
|
|
|
{
|
|
|
|
if (gbreak_not)
|
|
|
|
{
|
|
|
|
if (gbreak == UCD_GRAPHBREAK(c)) continue;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (gbreak != UCD_GRAPHBREAK(c)) continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* All conditions are met. Look for runs. */
|
|
|
|
|
|
|
|
ucd = GET_UCD(c);
|
|
|
|
|
|
|
|
for (i = c + 1; i < 0x10ffff; i++)
|
|
|
|
{
|
|
|
|
next_ucd = GET_UCD(i);
|
|
|
|
if (memcmp(ucd, next_ucd, sizeof(ucd_record)) != 0) break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (--i > c)
|
|
|
|
{
|
|
|
|
printf("%04x..", c);
|
|
|
|
c = i;
|
|
|
|
hadrange = TRUE;
|
|
|
|
}
|
|
|
|
else if (hadrange) printf("%s", pad);
|
|
|
|
|
|
|
|
print_prop(c);
|
|
|
|
if (c >= 0x100000) pad = " ";
|
|
|
|
else if (c >= 0x10000) pad = " ";
|
|
|
|
count++;
|
|
|
|
if (count >= 100)
|
|
|
|
{
|
|
|
|
printf("...\n");
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (count == 0) printf("No characters found\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2014-05-13 13:20:03 +02:00
|
|
|
/*************************************************
|
|
|
|
* Main program *
|
|
|
|
*************************************************/
|
|
|
|
|
|
|
|
int
|
2018-10-06 19:39:52 +02:00
|
|
|
main(int argc, char **argv)
|
2014-05-13 13:20:03 +02:00
|
|
|
{
|
2018-10-14 16:27:16 +02:00
|
|
|
BOOL interactive;
|
2014-05-13 13:20:03 +02:00
|
|
|
unsigned char buffer[1024];
|
2018-10-06 19:39:52 +02:00
|
|
|
|
|
|
|
if (argc > 1)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
for (i = 1; i < argc; i++)
|
|
|
|
{
|
2018-10-14 16:27:16 +02:00
|
|
|
unsigned char *endptr;
|
2018-10-06 19:39:52 +02:00
|
|
|
int c = strtoul(argv[i], CSS(&endptr), 16);
|
2018-10-14 16:27:16 +02:00
|
|
|
if (*endptr != 0)
|
|
|
|
printf("** Hex number expected; ignored '%s'\n", argv[i]);
|
|
|
|
else print_prop(c);
|
2018-10-06 19:39:52 +02:00
|
|
|
}
|
|
|
|
return 0;
|
2018-10-14 16:27:16 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
interactive = is_stdin_tty();
|
2018-10-06 19:39:52 +02:00
|
|
|
|
2018-10-14 16:27:16 +02:00
|
|
|
#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
|
|
|
|
if (interactive) using_history();
|
|
|
|
#endif
|
|
|
|
|
|
|
|
for(;;)
|
2014-05-13 13:20:03 +02:00
|
|
|
{
|
|
|
|
unsigned char name[24];
|
|
|
|
unsigned char *s, *t;
|
|
|
|
|
2018-10-14 16:27:16 +02:00
|
|
|
#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
|
|
|
|
if (interactive)
|
|
|
|
{
|
|
|
|
size_t len;
|
|
|
|
s = readline("> ");
|
|
|
|
if (s == NULL) break;
|
|
|
|
len = strlen(s);
|
|
|
|
if (len > 0) add_history(s);
|
|
|
|
memcpy(buffer, s, len);
|
|
|
|
buffer[len] = '\n';
|
|
|
|
buffer[len+1] = 0;
|
|
|
|
free(s);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
#endif
|
|
|
|
|
|
|
|
{
|
|
|
|
if (interactive) printf("> ");
|
|
|
|
if (fgets(CS buffer, sizeof(buffer), stdin) == NULL) break;
|
|
|
|
if (!interactive) printf("%s", buffer);
|
|
|
|
}
|
|
|
|
|
2014-05-13 13:20:03 +02:00
|
|
|
s = buffer;
|
|
|
|
while (isspace(*s)) s++;
|
|
|
|
if (*s == 0) continue;
|
|
|
|
|
|
|
|
for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
|
|
|
|
*t = 0;
|
|
|
|
while (isspace(*s)) s++;
|
|
|
|
|
|
|
|
if (strcmp(CS name, "findprop") == 0)
|
|
|
|
{
|
|
|
|
while (*s != 0)
|
|
|
|
{
|
|
|
|
unsigned char *endptr;
|
|
|
|
int c = strtoul(CS s, CSS(&endptr), 16);
|
2018-10-14 16:27:16 +02:00
|
|
|
|
|
|
|
if (*endptr != 0 && !isspace(*endptr))
|
|
|
|
{
|
|
|
|
while (*endptr != 0 && !isspace(*endptr)) endptr++;
|
|
|
|
printf("** Hex number expected; ignored '%.*s'\n", endptr-s, s);
|
|
|
|
}
|
|
|
|
else print_prop(c);
|
2014-05-13 13:20:03 +02:00
|
|
|
s = endptr;
|
|
|
|
while (isspace(*s)) s++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-10-14 16:27:16 +02:00
|
|
|
else if (strcmp(CS name, "find") == 0)
|
|
|
|
{
|
|
|
|
find_chars(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
else printf("** Unknown test command %s\n", name);
|
2014-05-13 13:20:03 +02:00
|
|
|
}
|
|
|
|
|
2018-10-14 16:27:16 +02:00
|
|
|
if (interactive) printf("\n");
|
|
|
|
|
|
|
|
#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
|
|
|
|
if (interactive) clear_history();
|
|
|
|
#endif
|
|
|
|
|
2014-05-13 13:20:03 +02:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* End */
|