Add support for 4-character script abbreviations
This commit is contained in:
parent
af2637ee5e
commit
7713f33e46
|
@ -58,6 +58,9 @@ of applications treat NULL/0 in this way.
|
|||
the same as \p{scx:scriptname} because this change happened in Perl at
|
||||
release 5.26.
|
||||
|
||||
(d) The standard Unicode 4-letter abbreviations for script names are now
|
||||
recognized.
|
||||
|
||||
18. The Python scripts in the maint directory have been refactored. There are
|
||||
now three scripts that generate pcre2_ucd.c, pcre2_ucp.h, and pcre2_ucptables.c
|
||||
(which is #included by pcre2_tables.c). The data lists that used to be
|
||||
|
|
|
@ -810,171 +810,170 @@ interpretation at release 5.26 and PCRE2 changed at release 10.40.
|
|||
Unassigned characters (and in non-UTF 32-bit mode, characters with code points
|
||||
greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not
|
||||
part of an identified script are lumped together as "Common". The current list
|
||||
of scripts is:
|
||||
of script names and their 4-letter abbreviations is:
|
||||
</P>
|
||||
<P>
|
||||
Adlam,
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
Balinese,
|
||||
Bamum,
|
||||
Bassa_Vah,
|
||||
Batak,
|
||||
Bengali,
|
||||
Bhaiksuki,
|
||||
Bopomofo,
|
||||
Brahmi,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Caucasian_Albanian,
|
||||
Chakma,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Chorasmian,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cypro_Minoan,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Dives_Akuru,
|
||||
Dogra,
|
||||
Duployan,
|
||||
Egyptian_Hieroglyphs,
|
||||
Elbasan,
|
||||
Elymaic,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Grantha,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gunjala_Gondi,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanifi_Rohingya,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
Inherited,
|
||||
Inscriptional_Pahlavi,
|
||||
Inscriptional_Parthian,
|
||||
Javanese,
|
||||
Kaithi,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khitan_Small_Script,
|
||||
Khmer,
|
||||
Khojki,
|
||||
Khudawadi,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_A,
|
||||
Linear_B,
|
||||
Lisu,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Mahajani,
|
||||
Makasar,
|
||||
Malayalam,
|
||||
Mandaic,
|
||||
Manichaean,
|
||||
Marchen,
|
||||
Masaram_Gondi,
|
||||
Medefaidrin,
|
||||
Meetei_Mayek,
|
||||
Mende_Kikakui,
|
||||
Meroitic_Cursive,
|
||||
Meroitic_Hieroglyphs,
|
||||
Miao,
|
||||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
Nandinagari,
|
||||
New_Tai_Lue,
|
||||
Newa,
|
||||
Nko,
|
||||
Nushu,
|
||||
Nyakeng_Puachue_Hmong,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
Old_Persian,
|
||||
Old_Sogdian,
|
||||
Old_South_Arabian,
|
||||
Old_Turkic,
|
||||
Old_Uyghur,
|
||||
Oriya,
|
||||
Osage,
|
||||
Osmanya,
|
||||
Pahawh_Hmong,
|
||||
Palmyrene,
|
||||
Pau_Cin_Hau,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Psalter_Pahlavi,
|
||||
Rejang,
|
||||
Runic,
|
||||
Samaritan,
|
||||
Saurashtra,
|
||||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sogdian,
|
||||
Sora_Sompeng,
|
||||
Soyombo,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tai_Tham,
|
||||
Tai_Viet,
|
||||
Takri,
|
||||
Tamil,
|
||||
Tangsa,
|
||||
Tangut,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Tirhuta,
|
||||
Toto,
|
||||
Ugaritic,
|
||||
Unknown,
|
||||
Vai,
|
||||
Vithkuqi,
|
||||
Wancho,
|
||||
Warang_Citi,
|
||||
Yezidi,
|
||||
Yi,
|
||||
Zanabazar_Square.
|
||||
Adlam (Adlm),
|
||||
Ahom (Ahom),
|
||||
Anatolian_Hieroglyphs (Hluw),
|
||||
Arabic (Arab),
|
||||
Armenian (Armn),
|
||||
Avestan (Avst),
|
||||
Balinese (Bali),
|
||||
Bamum (Bamu),
|
||||
Bassa_Vah (Bass),
|
||||
Batak (Batk),
|
||||
Bengali (Beng),
|
||||
Bhaiksuki (Bhks),
|
||||
Bopomofo (Bopo),
|
||||
Brahmi (Brah),
|
||||
Braille (Brai),
|
||||
Buginese (Bugi),
|
||||
Buhid (Buhd),
|
||||
Canadian_Aboriginal (Cans),
|
||||
Carian (Cari),
|
||||
Caucasian_Albanian (Aghb),
|
||||
Chakma (Cakm),
|
||||
Cham (Cham),
|
||||
Cherokee (Cher),
|
||||
Chorasmian (Chrs),
|
||||
Common (Zyyy),
|
||||
Coptic (Copt),
|
||||
Cuneiform (Xsux),
|
||||
Cypriot (Cprt),
|
||||
Cypro_Minoan (Cpmn),
|
||||
Cyrillic (Cyrl),
|
||||
Deseret (Dsrt),
|
||||
Devanagari (Deva),
|
||||
Dives_Akuru (Diak),
|
||||
Dogra (Dogr),
|
||||
Duployan (Dupl),
|
||||
Egyptian_Hieroglyphs (Egyp),
|
||||
Elbasan (Elba),
|
||||
Elymaic (Elym),
|
||||
Ethiopic (Ethi),
|
||||
Georgian (Geor),
|
||||
Glagolitic (Glag),
|
||||
Gothic (Goth),
|
||||
Grantha (Gran),
|
||||
Greek (Grek),
|
||||
Gujarati (Gujr),
|
||||
Gunjala_Gondi (Gong),
|
||||
Gurmukhi (Guru),
|
||||
Han (Hani),
|
||||
Hangul (Hang),
|
||||
Hanifi_Rohingya (Rohg),
|
||||
Hanunoo (Hano),
|
||||
Hatran (Hatr),
|
||||
Hebrew (Hebr),
|
||||
Hiragana (Hira),
|
||||
Imperial_Aramaic (Armi),
|
||||
Inherited (Zinh),
|
||||
Inscriptional_Pahlavi (Phli),
|
||||
Inscriptional_Parthian (Prti),
|
||||
Javanese (Java),
|
||||
Kaithi (Kthi),
|
||||
Kannada (Knda),
|
||||
Katakana (Kana),
|
||||
Kayah_Li (Kali),
|
||||
Kharoshthi (Khar),
|
||||
Khitan_Small_Script (Kits),
|
||||
Khmer (Khmr),
|
||||
Khojki (Khoj),
|
||||
Khudawadi (Sind),
|
||||
Lao (Laoo),
|
||||
Latin (Latn),
|
||||
Lepcha (Lepc),
|
||||
Limbu (Limb),
|
||||
Linear_A (Lina),
|
||||
Linear_B (Linb),
|
||||
Lisu (Lisu),
|
||||
Lycian (Lyci),
|
||||
Lydian (Lydi),
|
||||
Mahajani (Mahj),
|
||||
Makasar (Maka),
|
||||
Malayalam (Mlym),
|
||||
Mandaic (Mand),
|
||||
Manichaean (Mani),
|
||||
Marchen (Marc),
|
||||
Masaram_Gondi (Gonm),
|
||||
Medefaidrin (Medf),
|
||||
Meetei_Mayek (Mtei),
|
||||
Mende_Kikakui (Mend),
|
||||
Meroitic_Cursive (Merc),
|
||||
Meroitic_Hieroglyphs (Mero),
|
||||
Miao (Plrd),
|
||||
Modi (Modi),
|
||||
Mongolian (Mong),
|
||||
Mro (Mroo),
|
||||
Multani (Mult),
|
||||
Myanmar (Mymr),
|
||||
Nabataean (Nbat),
|
||||
Nandinagari (Nand),
|
||||
New_Tai_Lue (Talu),
|
||||
Newa (Newa),
|
||||
Nko (Nkoo),
|
||||
Nushu (Nshu),
|
||||
Nyiakeng_Puachue_Hmong (Hmnp),
|
||||
Ogham (Ogam),
|
||||
Ol_Chiki (Olck),
|
||||
Old_Hungarian (Hung),
|
||||
Old_Italic (Ital),
|
||||
Old_North_Arabian (Narb),
|
||||
Old_Permic (Perm),
|
||||
Old_Persian (Xpeo),
|
||||
Old_Sogdian (Sogo),
|
||||
Old_South_Arabian (Sarb),
|
||||
Old_Turkic (Orkh),
|
||||
Old_Uyghur (Ougr),
|
||||
Oriya (Orya),
|
||||
Osage (Osge),
|
||||
Osmanya (Osma),
|
||||
Pahawh_Hmong (Hmng),
|
||||
Palmyrene (Palm),
|
||||
Pau_Cin_Hau (Pauc),
|
||||
Phags_Pa (Phag),
|
||||
Phoenician (Phnx),
|
||||
Psalter_Pahlavi (Phlp),
|
||||
Rejang (Rjng),
|
||||
Runic (Runr),
|
||||
Samaritan (Samr),
|
||||
Saurashtra (Saur),
|
||||
Sharada (Shrd),
|
||||
Shavian (Shaw),
|
||||
Siddham (Sidd),
|
||||
SignWriting (Sgnw),
|
||||
Sinhala (Sinh),
|
||||
Sogdian (Sogd),
|
||||
Sora_Sompeng (Sora),
|
||||
Soyombo (Soyo),
|
||||
Sundanese (Sund),
|
||||
Syloti_Nagri (Sylo),
|
||||
Syriac (Syrc),
|
||||
Tagalog (Tglg),
|
||||
Tagbanwa (Tagb),
|
||||
Tai_Le (Tale),
|
||||
Tai_Tham (Lana),
|
||||
Tai_Viet (Tavt),
|
||||
Takri (Takr),
|
||||
Tamil (Taml),
|
||||
Tangsa (Tnsa),
|
||||
Tangut (Tang),
|
||||
Telugu (Telu),
|
||||
Thaana (Thaa),
|
||||
Thai (Thai),
|
||||
Tibetan (Tibt),
|
||||
Tifinagh (Tfng),
|
||||
Tirhuta (Tirh),
|
||||
Toto (Toto),
|
||||
Ugaritic (Ugar),
|
||||
Vai (Vaii),
|
||||
Vithkuqi (Vith),
|
||||
Wancho (Wcho),
|
||||
Warang_Citi (Wara),
|
||||
Yezidi (Yezi),
|
||||
Yi (Yiii),
|
||||
Zanabazar_Square (Zanb).
|
||||
</P>
|
||||
<P>
|
||||
Each character has exactly one Unicode general category property, specified by
|
||||
|
@ -3909,7 +3908,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC32" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 22 December 2021
|
||||
Last updated: 28 December 2021
|
||||
<br>
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -23,12 +23,12 @@ please consult the man page, in case the conversion went wrong.
|
|||
<br><a name="SEC1" href="#TOC1">SAVING AND RE-USING PRECOMPILED PCRE2 PATTERNS</a><br>
|
||||
<P>
|
||||
<b>int32_t pcre2_serialize_decode(pcre2_code **<i>codes</i>,</b>
|
||||
<b> int32_t <i>number_of_codes</i>, const uint32_t *<i>bytes</i>,</b>
|
||||
<b> int32_t <i>number_of_codes</i>, const uint8_t *<i>bytes</i>,</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>int32_t pcre2_serialize_encode(pcre2_code **<i>codes</i>,</b>
|
||||
<b> int32_t <i>number_of_codes</i>, uint32_t **<i>serialized_bytes</i>,</b>
|
||||
<b>int32_t pcre2_serialize_encode(const pcre2_code **<i>codes</i>,</b>
|
||||
<b> int32_t <i>number_of_codes</i>, uint8_t **<i>serialized_bytes</i>,</b>
|
||||
<b> PCRE2_SIZE *<i>serialized_size</i>, pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
|
@ -154,7 +154,6 @@ management functions for the decoded patterns. If this argument is NULL,
|
|||
<b>malloc()</b> and <b>free()</b> are used. After deserialization, the byte
|
||||
stream is no longer needed and can be discarded.
|
||||
<pre>
|
||||
int32_t number_of_codes;
|
||||
pcre2_code *list_of_codes[2];
|
||||
uint8_t *bytes = <serialized data>;
|
||||
int32_t number_of_codes =
|
||||
|
|
|
@ -207,171 +207,172 @@ at release 5.18.
|
|||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">SCRIPT MATCHING WITH \p AND \P</a><br>
|
||||
<P>
|
||||
The following script names are recognized in \p{sc:...} or \p{scx:...} items,
|
||||
or on their own with \p (and also \P of course):
|
||||
The following script names and their 4-letter abbreviations are recognized in
|
||||
\p{sc:...} or \p{scx:...} items, or on their own with \p (and also \P of
|
||||
course):
|
||||
</P>
|
||||
<P>
|
||||
Adlam,
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
Balinese,
|
||||
Bamum,
|
||||
Bassa_Vah,
|
||||
Batak,
|
||||
Bengali,
|
||||
Bhaiksuki,
|
||||
Bopomofo,
|
||||
Brahmi,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Caucasian_Albanian,
|
||||
Chakma,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Chorasmian,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cypro_Minoan,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Dives_Akuru,
|
||||
Dogra,
|
||||
Duployan,
|
||||
Egyptian_Hieroglyphs,
|
||||
Elbasan,
|
||||
Elymaic,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Grantha,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gunjala_Gondi,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanifi_Rohingya,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
Inherited,
|
||||
Inscriptional_Pahlavi,
|
||||
Inscriptional_Parthian,
|
||||
Javanese,
|
||||
Kaithi,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khitan_Small_Script,
|
||||
Khmer,
|
||||
Khojki,
|
||||
Khudawadi,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_A,
|
||||
Linear_B,
|
||||
Lisu,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Mahajani,
|
||||
Makasar,
|
||||
Malayalam,
|
||||
Mandaic,
|
||||
Manichaean,
|
||||
Marchen,
|
||||
Masaram_Gondi,
|
||||
Medefaidrin,
|
||||
Meetei_Mayek,
|
||||
Mende_Kikakui,
|
||||
Meroitic_Cursive,
|
||||
Meroitic_Hieroglyphs,
|
||||
Miao,
|
||||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
Nandinagari,
|
||||
New_Tai_Lue,
|
||||
Newa,
|
||||
Nko,
|
||||
Nushu,
|
||||
Nyakeng_Puachue_Hmong,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
Old_Persian,
|
||||
Old_Sogdian,
|
||||
Old_South_Arabian,
|
||||
Old_Turkic,
|
||||
Old_Uyghur,
|
||||
Oriya,
|
||||
Osage,
|
||||
Osmanya,
|
||||
Pahawh_Hmong,
|
||||
Palmyrene,
|
||||
Pau_Cin_Hau,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Psalter_Pahlavi,
|
||||
Rejang,
|
||||
Runic,
|
||||
Samaritan,
|
||||
Saurashtra,
|
||||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sogdian,
|
||||
Sora_Sompeng,
|
||||
Soyombo,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tai_Tham,
|
||||
Tai_Viet,
|
||||
Takri,
|
||||
Tamil,
|
||||
Tangsa,
|
||||
Tangut,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Tirhuta,
|
||||
Toto,
|
||||
Ugaritic,
|
||||
Vai,
|
||||
Vithkuqi,
|
||||
Wancho,
|
||||
Warang_Citi,
|
||||
Yezidi,
|
||||
Yi,
|
||||
Zanabazar_Square.
|
||||
Adlam (Adlm),
|
||||
Ahom (Ahom),
|
||||
Anatolian_Hieroglyphs (Hluw),
|
||||
Arabic (Arab),
|
||||
Armenian (Armn),
|
||||
Avestan (Avst),
|
||||
Balinese (Bali),
|
||||
Bamum (Bamu),
|
||||
Bassa_Vah (Bass),
|
||||
Batak (Batk),
|
||||
Bengali (Beng),
|
||||
Bhaiksuki (Bhks),
|
||||
Bopomofo (Bopo),
|
||||
Brahmi (Brah),
|
||||
Braille (Brai),
|
||||
Buginese (Bugi),
|
||||
Buhid (Buhd),
|
||||
Canadian_Aboriginal (Cans),
|
||||
Carian (Cari),
|
||||
Caucasian_Albanian (Aghb),
|
||||
Chakma (Cakm),
|
||||
Cham (Cham),
|
||||
Cherokee (Cher),
|
||||
Chorasmian (Chrs),
|
||||
Common (Zyyy),
|
||||
Coptic (Copt),
|
||||
Cuneiform (Xsux),
|
||||
Cypriot (Cprt),
|
||||
Cypro_Minoan (Cpmn),
|
||||
Cyrillic (Cyrl),
|
||||
Deseret (Dsrt),
|
||||
Devanagari (Deva),
|
||||
Dives_Akuru (Diak),
|
||||
Dogra (Dogr),
|
||||
Duployan (Dupl),
|
||||
Egyptian_Hieroglyphs (Egyp),
|
||||
Elbasan (Elba),
|
||||
Elymaic (Elym),
|
||||
Ethiopic (Ethi),
|
||||
Georgian (Geor),
|
||||
Glagolitic (Glag),
|
||||
Gothic (Goth),
|
||||
Grantha (Gran),
|
||||
Greek (Grek),
|
||||
Gujarati (Gujr),
|
||||
Gunjala_Gondi (Gong),
|
||||
Gurmukhi (Guru),
|
||||
Han (Hani),
|
||||
Hangul (Hang),
|
||||
Hanifi_Rohingya (Rohg),
|
||||
Hanunoo (Hano),
|
||||
Hatran (Hatr),
|
||||
Hebrew (Hebr),
|
||||
Hiragana (Hira),
|
||||
Imperial_Aramaic (Armi),
|
||||
Inherited (Zinh),
|
||||
Inscriptional_Pahlavi (Phli),
|
||||
Inscriptional_Parthian (Prti),
|
||||
Javanese (Java),
|
||||
Kaithi (Kthi),
|
||||
Kannada (Knda),
|
||||
Katakana (Kana),
|
||||
Kayah_Li (Kali),
|
||||
Kharoshthi (Khar),
|
||||
Khitan_Small_Script (Kits),
|
||||
Khmer (Khmr),
|
||||
Khojki (Khoj),
|
||||
Khudawadi (Sind),
|
||||
Lao (Laoo),
|
||||
Latin (Latn),
|
||||
Lepcha (Lepc),
|
||||
Limbu (Limb),
|
||||
Linear_A (Lina),
|
||||
Linear_B (Linb),
|
||||
Lisu (Lisu),
|
||||
Lycian (Lyci),
|
||||
Lydian (Lydi),
|
||||
Mahajani (Mahj),
|
||||
Makasar (Maka),
|
||||
Malayalam (Mlym),
|
||||
Mandaic (Mand),
|
||||
Manichaean (Mani),
|
||||
Marchen (Marc),
|
||||
Masaram_Gondi (Gonm),
|
||||
Medefaidrin (Medf),
|
||||
Meetei_Mayek (Mtei),
|
||||
Mende_Kikakui (Mend),
|
||||
Meroitic_Cursive (Merc),
|
||||
Meroitic_Hieroglyphs (Mero),
|
||||
Miao (Plrd),
|
||||
Modi (Modi),
|
||||
Mongolian (Mong),
|
||||
Mro (Mroo),
|
||||
Multani (Mult),
|
||||
Myanmar (Mymr),
|
||||
Nabataean (Nbat),
|
||||
Nandinagari (Nand),
|
||||
New_Tai_Lue (Talu),
|
||||
Newa (Newa),
|
||||
Nko (Nkoo),
|
||||
Nushu (Nshu),
|
||||
Nyiakeng_Puachue_Hmong (Hmnp),
|
||||
Ogham (Ogam),
|
||||
Ol_Chiki (Olck),
|
||||
Old_Hungarian (Hung),
|
||||
Old_Italic (Ital),
|
||||
Old_North_Arabian (Narb),
|
||||
Old_Permic (Perm),
|
||||
Old_Persian (Xpeo),
|
||||
Old_Sogdian (Sogo),
|
||||
Old_South_Arabian (Sarb),
|
||||
Old_Turkic (Orkh),
|
||||
Old_Uyghur (Ougr),
|
||||
Oriya (Orya),
|
||||
Osage (Osge),
|
||||
Osmanya (Osma),
|
||||
Pahawh_Hmong (Hmng),
|
||||
Palmyrene (Palm),
|
||||
Pau_Cin_Hau (Pauc),
|
||||
Phags_Pa (Phag),
|
||||
Phoenician (Phnx),
|
||||
Psalter_Pahlavi (Phlp),
|
||||
Rejang (Rjng),
|
||||
Runic (Runr),
|
||||
Samaritan (Samr),
|
||||
Saurashtra (Saur),
|
||||
Sharada (Shrd),
|
||||
Shavian (Shaw),
|
||||
Siddham (Sidd),
|
||||
SignWriting (Sgnw),
|
||||
Sinhala (Sinh),
|
||||
Sogdian (Sogd),
|
||||
Sora_Sompeng (Sora),
|
||||
Soyombo (Soyo),
|
||||
Sundanese (Sund),
|
||||
Syloti_Nagri (Sylo),
|
||||
Syriac (Syrc),
|
||||
Tagalog (Tglg),
|
||||
Tagbanwa (Tagb),
|
||||
Tai_Le (Tale),
|
||||
Tai_Tham (Lana),
|
||||
Tai_Viet (Tavt),
|
||||
Takri (Takr),
|
||||
Tamil (Taml),
|
||||
Tangsa (Tnsa),
|
||||
Tangut (Tang),
|
||||
Telugu (Telu),
|
||||
Thaana (Thaa),
|
||||
Thai (Thai),
|
||||
Tibetan (Tibt),
|
||||
Tifinagh (Tfng),
|
||||
Tirhuta (Tirh),
|
||||
Toto (Toto),
|
||||
Ugaritic (Ugar),
|
||||
Vai (Vaii),
|
||||
Vithkuqi (Vith),
|
||||
Wancho (Wcho),
|
||||
Warang_Citi (Wara),
|
||||
Yezidi (Yezi),
|
||||
Yi (Yiii),
|
||||
Zanabazar_Square (Zanb).
|
||||
</P>
|
||||
<br><a name="SEC8" href="#TOC1">BIDI_PROPERTIES FOR \p AND \P</a><br>
|
||||
<P>
|
||||
|
@ -743,7 +744,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 22 December 2021
|
||||
Last updated: 28 December 2021
|
||||
<br>
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
|
|
153
doc/pcre2.txt
153
doc/pcre2.txt
|
@ -6920,33 +6920,51 @@ BACKSLASH
|
|||
Unassigned characters (and in non-UTF 32-bit mode, characters with code
|
||||
points greater than 0x10FFFF) are assigned the "Unknown" script. Others
|
||||
that are not part of an identified script are lumped together as "Com-
|
||||
mon". The current list of scripts is:
|
||||
mon". The current list of script names and their 4-letter abbreviations
|
||||
is:
|
||||
|
||||
Adlam, Ahom, Anatolian_Hieroglyphs, Arabic, Armenian, Avestan, Bali-
|
||||
nese, Bamum, Bassa_Vah, Batak, Bengali, Bhaiksuki, Bopomofo, Brahmi,
|
||||
Braille, Buginese, Buhid, Canadian_Aboriginal, Carian, Caucasian_Alba-
|
||||
nian, Chakma, Cham, Cherokee, Chorasmian, Common, Coptic, Cuneiform,
|
||||
Cypriot, Cypro_Minoan, Cyrillic, Deseret, Devanagari, Dives_Akuru, Do-
|
||||
gra, Duployan, Egyptian_Hieroglyphs, Elbasan, Elymaic, Ethiopic, Geor-
|
||||
gian, Glagolitic, Gothic, Grantha, Greek, Gujarati, Gunjala_Gondi, Gur-
|
||||
mukhi, Han, Hangul, Hanifi_Rohingya, Hanunoo, Hatran, Hebrew, Hiragana,
|
||||
Imperial_Aramaic, Inherited, Inscriptional_Pahlavi, Inscrip-
|
||||
tional_Parthian, Javanese, Kaithi, Kannada, Katakana, Kayah_Li,
|
||||
Kharoshthi, Khitan_Small_Script, Khmer, Khojki, Khudawadi, Lao, Latin,
|
||||
Lepcha, Limbu, Linear_A, Linear_B, Lisu, Lycian, Lydian, Mahajani,
|
||||
Makasar, Malayalam, Mandaic, Manichaean, Marchen, Masaram_Gondi, Mede-
|
||||
faidrin, Meetei_Mayek, Mende_Kikakui, Meroitic_Cursive, Meroitic_Hiero-
|
||||
glyphs, Miao, Modi, Mongolian, Mro, Multani, Myanmar, Nabataean, Nandi-
|
||||
nagari, New_Tai_Lue, Newa, Nko, Nushu, Nyakeng_Puachue_Hmong, Ogham,
|
||||
Ol_Chiki, Old_Hungarian, Old_Italic, Old_North_Arabian, Old_Permic,
|
||||
Old_Persian, Old_Sogdian, Old_South_Arabian, Old_Turkic, Old_Uyghur,
|
||||
Oriya, Osage, Osmanya, Pahawh_Hmong, Palmyrene, Pau_Cin_Hau, Phags_Pa,
|
||||
Phoenician, Psalter_Pahlavi, Rejang, Runic, Samaritan, Saurashtra,
|
||||
Sharada, Shavian, Siddham, SignWriting, Sinhala, Sogdian, Sora_Sompeng,
|
||||
Soyombo, Sundanese, Syloti_Nagri, Syriac, Tagalog, Tagbanwa, Tai_Le,
|
||||
Tai_Tham, Tai_Viet, Takri, Tamil, Tangsa, Tangut, Telugu, Thaana, Thai,
|
||||
Tibetan, Tifinagh, Tirhuta, Toto, Ugaritic, Unknown, Vai, Vithkuqi,
|
||||
Wancho, Warang_Citi, Yezidi, Yi, Zanabazar_Square.
|
||||
Adlam (Adlm), Ahom (Ahom), Anatolian_Hieroglyphs (Hluw), Arabic (Arab),
|
||||
Armenian (Armn), Avestan (Avst), Balinese (Bali), Bamum (Bamu),
|
||||
Bassa_Vah (Bass), Batak (Batk), Bengali (Beng), Bhaiksuki (Bhks), Bopo-
|
||||
mofo (Bopo), Brahmi (Brah), Braille (Brai), Buginese (Bugi), Buhid
|
||||
(Buhd), Canadian_Aboriginal (Cans), Carian (Cari), Caucasian_Albanian
|
||||
(Aghb), Chakma (Cakm), Cham (Cham), Cherokee (Cher), Chorasmian (Chrs),
|
||||
Common (Zyyy), Coptic (Copt), Cuneiform (Xsux), Cypriot (Cprt),
|
||||
Cypro_Minoan (Cpmn), Cyrillic (Cyrl), Deseret (Dsrt), Devanagari
|
||||
(Deva), Dives_Akuru (Diak), Dogra (Dogr), Duployan (Dupl), Egyptian_Hi-
|
||||
eroglyphs (Egyp), Elbasan (Elba), Elymaic (Elym), Ethiopic (Ethi),
|
||||
Georgian (Geor), Glagolitic (Glag), Gothic (Goth), Grantha (Gran),
|
||||
Greek (Grek), Gujarati (Gujr), Gunjala_Gondi (Gong), Gurmukhi (Guru),
|
||||
Han (Hani), Hangul (Hang), Hanifi_Rohingya (Rohg), Hanunoo (Hano), Ha-
|
||||
tran (Hatr), Hebrew (Hebr), Hiragana (Hira), Imperial_Aramaic (Armi),
|
||||
Inherited (Zinh), Inscriptional_Pahlavi (Phli), Inscriptional_Parthian
|
||||
(Prti), Javanese (Java), Kaithi (Kthi), Kannada (Knda), Katakana
|
||||
(Kana), Kayah_Li (Kali), Kharoshthi (Khar), Khitan_Small_Script (Kits),
|
||||
Khmer (Khmr), Khojki (Khoj), Khudawadi (Sind), Lao (Laoo), Latin
|
||||
(Latn), Lepcha (Lepc), Limbu (Limb), Linear_A (Lina), Linear_B (Linb),
|
||||
Lisu (Lisu), Lycian (Lyci), Lydian (Lydi), Mahajani (Mahj), Makasar
|
||||
(Maka), Malayalam (Mlym), Mandaic (Mand), Manichaean (Mani), Marchen
|
||||
(Marc), Masaram_Gondi (Gonm), Medefaidrin (Medf), Meetei_Mayek (Mtei),
|
||||
Mende_Kikakui (Mend), Meroitic_Cursive (Merc), Meroitic_Hieroglyphs
|
||||
(Mero), Miao (Plrd), Modi (Modi), Mongolian (Mong), Mro (Mroo), Multani
|
||||
(Mult), Myanmar (Mymr), Nabataean (Nbat), Nandinagari (Nand),
|
||||
New_Tai_Lue (Talu), Newa (Newa), Nko (Nkoo), Nushu (Nshu), Nyiak-
|
||||
eng_Puachue_Hmong (Hmnp), Ogham (Ogam), Ol_Chiki (Olck), Old_Hungarian
|
||||
(Hung), Old_Italic (Ital), Old_North_Arabian (Narb), Old_Permic (Perm),
|
||||
Old_Persian (Xpeo), Old_Sogdian (Sogo), Old_South_Arabian (Sarb),
|
||||
Old_Turkic (Orkh), Old_Uyghur (Ougr), Oriya (Orya), Osage (Osge), Os-
|
||||
manya (Osma), Pahawh_Hmong (Hmng), Palmyrene (Palm), Pau_Cin_Hau
|
||||
(Pauc), Phags_Pa (Phag), Phoenician (Phnx), Psalter_Pahlavi (Phlp), Re-
|
||||
jang (Rjng), Runic (Runr), Samaritan (Samr), Saurashtra (Saur), Sharada
|
||||
(Shrd), Shavian (Shaw), Siddham (Sidd), SignWriting (Sgnw), Sinhala
|
||||
(Sinh), Sogdian (Sogd), Sora_Sompeng (Sora), Soyombo (Soyo), Sundanese
|
||||
(Sund), Syloti_Nagri (Sylo), Syriac (Syrc), Tagalog (Tglg), Tagbanwa
|
||||
(Tagb), Tai_Le (Tale), Tai_Tham (Lana), Tai_Viet (Tavt), Takri (Takr),
|
||||
Tamil (Taml), Tangsa (Tnsa), Tangut (Tang), Telugu (Telu), Thaana
|
||||
(Thaa), Thai (Thai), Tibetan (Tibt), Tifinagh (Tfng), Tirhuta (Tirh),
|
||||
Toto (Toto), Ugaritic (Ugar), Vai (Vaii), Vithkuqi (Vith), Wancho
|
||||
(Wcho), Warang_Citi (Wara), Yezidi (Yezi), Yi (Yiii), Zanabazar_Square
|
||||
(Zanb).
|
||||
|
||||
Each character has exactly one Unicode general category property, spec-
|
||||
ified by a two-letter abbreviation. For compatibility with Perl, nega-
|
||||
|
@ -9707,7 +9725,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 22 December 2021
|
||||
Last updated: 28 December 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
@ -10379,11 +10397,11 @@ NAME
|
|||
SAVING AND RE-USING PRECOMPILED PCRE2 PATTERNS
|
||||
|
||||
int32_t pcre2_serialize_decode(pcre2_code **codes,
|
||||
int32_t number_of_codes, const uint32_t *bytes,
|
||||
int32_t number_of_codes, const uint8_t *bytes,
|
||||
pcre2_general_context *gcontext);
|
||||
|
||||
int32_t pcre2_serialize_encode(pcre2_code **codes,
|
||||
int32_t number_of_codes, uint32_t **serialized_bytes,
|
||||
int32_t pcre2_serialize_encode(const pcre2_code **codes,
|
||||
int32_t number_of_codes, uint8_t **serialized_bytes,
|
||||
PCRE2_SIZE *serialized_size, pcre2_general_context *gcontext);
|
||||
|
||||
void pcre2_serialize_free(uint8_t *bytes);
|
||||
|
@ -10507,7 +10525,6 @@ RE-USING PRECOMPILED PATTERNS
|
|||
If this argument is NULL, malloc() and free() are used. After deserial-
|
||||
ization, the byte stream is no longer needed and can be discarded.
|
||||
|
||||
int32_t number_of_codes;
|
||||
pcre2_code *list_of_codes[2];
|
||||
uint8_t *bytes = <serialized data>;
|
||||
int32_t number_of_codes =
|
||||
|
@ -10724,34 +10741,52 @@ PCRE2 SPECIAL CATEGORY PROPERTIES FOR \p and \P
|
|||
|
||||
SCRIPT MATCHING WITH \p AND \P
|
||||
|
||||
The following script names are recognized in \p{sc:...} or \p{scx:...}
|
||||
items, or on their own with \p (and also \P of course):
|
||||
The following script names and their 4-letter abbreviations are recog-
|
||||
nized in \p{sc:...} or \p{scx:...} items, or on their own with \p (and
|
||||
also \P of course):
|
||||
|
||||
Adlam, Ahom, Anatolian_Hieroglyphs, Arabic, Armenian, Avestan, Bali-
|
||||
nese, Bamum, Bassa_Vah, Batak, Bengali, Bhaiksuki, Bopomofo, Brahmi,
|
||||
Braille, Buginese, Buhid, Canadian_Aboriginal, Carian, Caucasian_Alba-
|
||||
nian, Chakma, Cham, Cherokee, Chorasmian, Common, Coptic, Cuneiform,
|
||||
Cypriot, Cypro_Minoan, Cyrillic, Deseret, Devanagari, Dives_Akuru, Do-
|
||||
gra, Duployan, Egyptian_Hieroglyphs, Elbasan, Elymaic, Ethiopic, Geor-
|
||||
gian, Glagolitic, Gothic, Grantha, Greek, Gujarati, Gunjala_Gondi, Gur-
|
||||
mukhi, Han, Hangul, Hanifi_Rohingya, Hanunoo, Hatran, Hebrew, Hiragana,
|
||||
Imperial_Aramaic, Inherited, Inscriptional_Pahlavi, Inscrip-
|
||||
tional_Parthian, Javanese, Kaithi, Kannada, Katakana, Kayah_Li,
|
||||
Kharoshthi, Khitan_Small_Script, Khmer, Khojki, Khudawadi, Lao, Latin,
|
||||
Lepcha, Limbu, Linear_A, Linear_B, Lisu, Lycian, Lydian, Mahajani,
|
||||
Makasar, Malayalam, Mandaic, Manichaean, Marchen, Masaram_Gondi, Mede-
|
||||
faidrin, Meetei_Mayek, Mende_Kikakui, Meroitic_Cursive, Meroitic_Hiero-
|
||||
glyphs, Miao, Modi, Mongolian, Mro, Multani, Myanmar, Nabataean, Nandi-
|
||||
nagari, New_Tai_Lue, Newa, Nko, Nushu, Nyakeng_Puachue_Hmong, Ogham,
|
||||
Ol_Chiki, Old_Hungarian, Old_Italic, Old_North_Arabian, Old_Permic,
|
||||
Old_Persian, Old_Sogdian, Old_South_Arabian, Old_Turkic, Old_Uyghur,
|
||||
Oriya, Osage, Osmanya, Pahawh_Hmong, Palmyrene, Pau_Cin_Hau, Phags_Pa,
|
||||
Phoenician, Psalter_Pahlavi, Rejang, Runic, Samaritan, Saurashtra,
|
||||
Sharada, Shavian, Siddham, SignWriting, Sinhala, Sogdian, Sora_Sompeng,
|
||||
Soyombo, Sundanese, Syloti_Nagri, Syriac, Tagalog, Tagbanwa, Tai_Le,
|
||||
Tai_Tham, Tai_Viet, Takri, Tamil, Tangsa, Tangut, Telugu, Thaana, Thai,
|
||||
Tibetan, Tifinagh, Tirhuta, Toto, Ugaritic, Vai, Vithkuqi, Wancho,
|
||||
Warang_Citi, Yezidi, Yi, Zanabazar_Square.
|
||||
Adlam (Adlm), Ahom (Ahom), Anatolian_Hieroglyphs (Hluw), Arabic (Arab),
|
||||
Armenian (Armn), Avestan (Avst), Balinese (Bali), Bamum (Bamu),
|
||||
Bassa_Vah (Bass), Batak (Batk), Bengali (Beng), Bhaiksuki (Bhks), Bopo-
|
||||
mofo (Bopo), Brahmi (Brah), Braille (Brai), Buginese (Bugi), Buhid
|
||||
(Buhd), Canadian_Aboriginal (Cans), Carian (Cari), Caucasian_Albanian
|
||||
(Aghb), Chakma (Cakm), Cham (Cham), Cherokee (Cher), Chorasmian (Chrs),
|
||||
Common (Zyyy), Coptic (Copt), Cuneiform (Xsux), Cypriot (Cprt),
|
||||
Cypro_Minoan (Cpmn), Cyrillic (Cyrl), Deseret (Dsrt), Devanagari
|
||||
(Deva), Dives_Akuru (Diak), Dogra (Dogr), Duployan (Dupl), Egyptian_Hi-
|
||||
eroglyphs (Egyp), Elbasan (Elba), Elymaic (Elym), Ethiopic (Ethi),
|
||||
Georgian (Geor), Glagolitic (Glag), Gothic (Goth), Grantha (Gran),
|
||||
Greek (Grek), Gujarati (Gujr), Gunjala_Gondi (Gong), Gurmukhi (Guru),
|
||||
Han (Hani), Hangul (Hang), Hanifi_Rohingya (Rohg), Hanunoo (Hano), Ha-
|
||||
tran (Hatr), Hebrew (Hebr), Hiragana (Hira), Imperial_Aramaic (Armi),
|
||||
Inherited (Zinh), Inscriptional_Pahlavi (Phli), Inscriptional_Parthian
|
||||
(Prti), Javanese (Java), Kaithi (Kthi), Kannada (Knda), Katakana
|
||||
(Kana), Kayah_Li (Kali), Kharoshthi (Khar), Khitan_Small_Script (Kits),
|
||||
Khmer (Khmr), Khojki (Khoj), Khudawadi (Sind), Lao (Laoo), Latin
|
||||
(Latn), Lepcha (Lepc), Limbu (Limb), Linear_A (Lina), Linear_B (Linb),
|
||||
Lisu (Lisu), Lycian (Lyci), Lydian (Lydi), Mahajani (Mahj), Makasar
|
||||
(Maka), Malayalam (Mlym), Mandaic (Mand), Manichaean (Mani), Marchen
|
||||
(Marc), Masaram_Gondi (Gonm), Medefaidrin (Medf), Meetei_Mayek (Mtei),
|
||||
Mende_Kikakui (Mend), Meroitic_Cursive (Merc), Meroitic_Hieroglyphs
|
||||
(Mero), Miao (Plrd), Modi (Modi), Mongolian (Mong), Mro (Mroo), Multani
|
||||
(Mult), Myanmar (Mymr), Nabataean (Nbat), Nandinagari (Nand),
|
||||
New_Tai_Lue (Talu), Newa (Newa), Nko (Nkoo), Nushu (Nshu), Nyiak-
|
||||
eng_Puachue_Hmong (Hmnp), Ogham (Ogam), Ol_Chiki (Olck), Old_Hungarian
|
||||
(Hung), Old_Italic (Ital), Old_North_Arabian (Narb), Old_Permic (Perm),
|
||||
Old_Persian (Xpeo), Old_Sogdian (Sogo), Old_South_Arabian (Sarb),
|
||||
Old_Turkic (Orkh), Old_Uyghur (Ougr), Oriya (Orya), Osage (Osge), Os-
|
||||
manya (Osma), Pahawh_Hmong (Hmng), Palmyrene (Palm), Pau_Cin_Hau
|
||||
(Pauc), Phags_Pa (Phag), Phoenician (Phnx), Psalter_Pahlavi (Phlp), Re-
|
||||
jang (Rjng), Runic (Runr), Samaritan (Samr), Saurashtra (Saur), Sharada
|
||||
(Shrd), Shavian (Shaw), Siddham (Sidd), SignWriting (Sgnw), Sinhala
|
||||
(Sinh), Sogdian (Sogd), Sora_Sompeng (Sora), Soyombo (Soyo), Sundanese
|
||||
(Sund), Syloti_Nagri (Sylo), Syriac (Syrc), Tagalog (Tglg), Tagbanwa
|
||||
(Tagb), Tai_Le (Tale), Tai_Tham (Lana), Tai_Viet (Tavt), Takri (Takr),
|
||||
Tamil (Taml), Tangsa (Tnsa), Tangut (Tang), Telugu (Telu), Thaana
|
||||
(Thaa), Thai (Thai), Tibetan (Tibt), Tifinagh (Tfng), Tirhuta (Tirh),
|
||||
Toto (Toto), Ugaritic (Ugar), Vai (Vaii), Vithkuqi (Vith), Wancho
|
||||
(Wcho), Warang_Citi (Wara), Yezidi (Yezi), Yi (Yiii), Zanabazar_Square
|
||||
(Zanb).
|
||||
|
||||
|
||||
BIDI_PROPERTIES FOR \p AND \P
|
||||
|
@ -11117,7 +11152,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 22 December 2021
|
||||
Last updated: 28 December 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2PATTERN 3 "22 December 2021" "PCRE2 10.40"
|
||||
.TH PCRE2PATTERN 3 "28 December 2021" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
||||
|
@ -807,170 +807,169 @@ interpretation at release 5.26 and PCRE2 changed at release 10.40.
|
|||
Unassigned characters (and in non-UTF 32-bit mode, characters with code points
|
||||
greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not
|
||||
part of an identified script are lumped together as "Common". The current list
|
||||
of scripts is:
|
||||
of script names and their 4-letter abbreviations is:
|
||||
.P
|
||||
Adlam,
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
Balinese,
|
||||
Bamum,
|
||||
Bassa_Vah,
|
||||
Batak,
|
||||
Bengali,
|
||||
Bhaiksuki,
|
||||
Bopomofo,
|
||||
Brahmi,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Caucasian_Albanian,
|
||||
Chakma,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Chorasmian,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cypro_Minoan,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Dives_Akuru,
|
||||
Dogra,
|
||||
Duployan,
|
||||
Egyptian_Hieroglyphs,
|
||||
Elbasan,
|
||||
Elymaic,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Grantha,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gunjala_Gondi,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanifi_Rohingya,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
Inherited,
|
||||
Inscriptional_Pahlavi,
|
||||
Inscriptional_Parthian,
|
||||
Javanese,
|
||||
Kaithi,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khitan_Small_Script,
|
||||
Khmer,
|
||||
Khojki,
|
||||
Khudawadi,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_A,
|
||||
Linear_B,
|
||||
Lisu,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Mahajani,
|
||||
Makasar,
|
||||
Malayalam,
|
||||
Mandaic,
|
||||
Manichaean,
|
||||
Marchen,
|
||||
Masaram_Gondi,
|
||||
Medefaidrin,
|
||||
Meetei_Mayek,
|
||||
Mende_Kikakui,
|
||||
Meroitic_Cursive,
|
||||
Meroitic_Hieroglyphs,
|
||||
Miao,
|
||||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
Nandinagari,
|
||||
New_Tai_Lue,
|
||||
Newa,
|
||||
Nko,
|
||||
Nushu,
|
||||
Nyakeng_Puachue_Hmong,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
Old_Persian,
|
||||
Old_Sogdian,
|
||||
Old_South_Arabian,
|
||||
Old_Turkic,
|
||||
Old_Uyghur,
|
||||
Oriya,
|
||||
Osage,
|
||||
Osmanya,
|
||||
Pahawh_Hmong,
|
||||
Palmyrene,
|
||||
Pau_Cin_Hau,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Psalter_Pahlavi,
|
||||
Rejang,
|
||||
Runic,
|
||||
Samaritan,
|
||||
Saurashtra,
|
||||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sogdian,
|
||||
Sora_Sompeng,
|
||||
Soyombo,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tai_Tham,
|
||||
Tai_Viet,
|
||||
Takri,
|
||||
Tamil,
|
||||
Tangsa,
|
||||
Tangut,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Tirhuta,
|
||||
Toto,
|
||||
Ugaritic,
|
||||
Unknown,
|
||||
Vai,
|
||||
Vithkuqi,
|
||||
Wancho,
|
||||
Warang_Citi,
|
||||
Yezidi,
|
||||
Yi,
|
||||
Zanabazar_Square.
|
||||
Adlam (Adlm),
|
||||
Ahom (Ahom),
|
||||
Anatolian_Hieroglyphs (Hluw),
|
||||
Arabic (Arab),
|
||||
Armenian (Armn),
|
||||
Avestan (Avst),
|
||||
Balinese (Bali),
|
||||
Bamum (Bamu),
|
||||
Bassa_Vah (Bass),
|
||||
Batak (Batk),
|
||||
Bengali (Beng),
|
||||
Bhaiksuki (Bhks),
|
||||
Bopomofo (Bopo),
|
||||
Brahmi (Brah),
|
||||
Braille (Brai),
|
||||
Buginese (Bugi),
|
||||
Buhid (Buhd),
|
||||
Canadian_Aboriginal (Cans),
|
||||
Carian (Cari),
|
||||
Caucasian_Albanian (Aghb),
|
||||
Chakma (Cakm),
|
||||
Cham (Cham),
|
||||
Cherokee (Cher),
|
||||
Chorasmian (Chrs),
|
||||
Common (Zyyy),
|
||||
Coptic (Copt),
|
||||
Cuneiform (Xsux),
|
||||
Cypriot (Cprt),
|
||||
Cypro_Minoan (Cpmn),
|
||||
Cyrillic (Cyrl),
|
||||
Deseret (Dsrt),
|
||||
Devanagari (Deva),
|
||||
Dives_Akuru (Diak),
|
||||
Dogra (Dogr),
|
||||
Duployan (Dupl),
|
||||
Egyptian_Hieroglyphs (Egyp),
|
||||
Elbasan (Elba),
|
||||
Elymaic (Elym),
|
||||
Ethiopic (Ethi),
|
||||
Georgian (Geor),
|
||||
Glagolitic (Glag),
|
||||
Gothic (Goth),
|
||||
Grantha (Gran),
|
||||
Greek (Grek),
|
||||
Gujarati (Gujr),
|
||||
Gunjala_Gondi (Gong),
|
||||
Gurmukhi (Guru),
|
||||
Han (Hani),
|
||||
Hangul (Hang),
|
||||
Hanifi_Rohingya (Rohg),
|
||||
Hanunoo (Hano),
|
||||
Hatran (Hatr),
|
||||
Hebrew (Hebr),
|
||||
Hiragana (Hira),
|
||||
Imperial_Aramaic (Armi),
|
||||
Inherited (Zinh),
|
||||
Inscriptional_Pahlavi (Phli),
|
||||
Inscriptional_Parthian (Prti),
|
||||
Javanese (Java),
|
||||
Kaithi (Kthi),
|
||||
Kannada (Knda),
|
||||
Katakana (Kana),
|
||||
Kayah_Li (Kali),
|
||||
Kharoshthi (Khar),
|
||||
Khitan_Small_Script (Kits),
|
||||
Khmer (Khmr),
|
||||
Khojki (Khoj),
|
||||
Khudawadi (Sind),
|
||||
Lao (Laoo),
|
||||
Latin (Latn),
|
||||
Lepcha (Lepc),
|
||||
Limbu (Limb),
|
||||
Linear_A (Lina),
|
||||
Linear_B (Linb),
|
||||
Lisu (Lisu),
|
||||
Lycian (Lyci),
|
||||
Lydian (Lydi),
|
||||
Mahajani (Mahj),
|
||||
Makasar (Maka),
|
||||
Malayalam (Mlym),
|
||||
Mandaic (Mand),
|
||||
Manichaean (Mani),
|
||||
Marchen (Marc),
|
||||
Masaram_Gondi (Gonm),
|
||||
Medefaidrin (Medf),
|
||||
Meetei_Mayek (Mtei),
|
||||
Mende_Kikakui (Mend),
|
||||
Meroitic_Cursive (Merc),
|
||||
Meroitic_Hieroglyphs (Mero),
|
||||
Miao (Plrd),
|
||||
Modi (Modi),
|
||||
Mongolian (Mong),
|
||||
Mro (Mroo),
|
||||
Multani (Mult),
|
||||
Myanmar (Mymr),
|
||||
Nabataean (Nbat),
|
||||
Nandinagari (Nand),
|
||||
New_Tai_Lue (Talu),
|
||||
Newa (Newa),
|
||||
Nko (Nkoo),
|
||||
Nushu (Nshu),
|
||||
Nyiakeng_Puachue_Hmong (Hmnp),
|
||||
Ogham (Ogam),
|
||||
Ol_Chiki (Olck),
|
||||
Old_Hungarian (Hung),
|
||||
Old_Italic (Ital),
|
||||
Old_North_Arabian (Narb),
|
||||
Old_Permic (Perm),
|
||||
Old_Persian (Xpeo),
|
||||
Old_Sogdian (Sogo),
|
||||
Old_South_Arabian (Sarb),
|
||||
Old_Turkic (Orkh),
|
||||
Old_Uyghur (Ougr),
|
||||
Oriya (Orya),
|
||||
Osage (Osge),
|
||||
Osmanya (Osma),
|
||||
Pahawh_Hmong (Hmng),
|
||||
Palmyrene (Palm),
|
||||
Pau_Cin_Hau (Pauc),
|
||||
Phags_Pa (Phag),
|
||||
Phoenician (Phnx),
|
||||
Psalter_Pahlavi (Phlp),
|
||||
Rejang (Rjng),
|
||||
Runic (Runr),
|
||||
Samaritan (Samr),
|
||||
Saurashtra (Saur),
|
||||
Sharada (Shrd),
|
||||
Shavian (Shaw),
|
||||
Siddham (Sidd),
|
||||
SignWriting (Sgnw),
|
||||
Sinhala (Sinh),
|
||||
Sogdian (Sogd),
|
||||
Sora_Sompeng (Sora),
|
||||
Soyombo (Soyo),
|
||||
Sundanese (Sund),
|
||||
Syloti_Nagri (Sylo),
|
||||
Syriac (Syrc),
|
||||
Tagalog (Tglg),
|
||||
Tagbanwa (Tagb),
|
||||
Tai_Le (Tale),
|
||||
Tai_Tham (Lana),
|
||||
Tai_Viet (Tavt),
|
||||
Takri (Takr),
|
||||
Tamil (Taml),
|
||||
Tangsa (Tnsa),
|
||||
Tangut (Tang),
|
||||
Telugu (Telu),
|
||||
Thaana (Thaa),
|
||||
Thai (Thai),
|
||||
Tibetan (Tibt),
|
||||
Tifinagh (Tfng),
|
||||
Tirhuta (Tirh),
|
||||
Toto (Toto),
|
||||
Ugaritic (Ugar),
|
||||
Vai (Vaii),
|
||||
Vithkuqi (Vith),
|
||||
Wancho (Wcho),
|
||||
Warang_Citi (Wara),
|
||||
Yezidi (Yezi),
|
||||
Yi (Yiii),
|
||||
Zanabazar_Square (Zanb).
|
||||
.P
|
||||
Each character has exactly one Unicode general category property, specified by
|
||||
a two-letter abbreviation. For compatibility with Perl, negation can be
|
||||
|
@ -3956,6 +3955,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 22 December 2021
|
||||
Last updated: 28 December 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2SYNTAX 3 "22 December 2021" "PCRE2 10.40"
|
||||
.TH PCRE2SYNTAX 3 "28 December 2021" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY"
|
||||
|
@ -175,170 +175,171 @@ at release 5.18.
|
|||
.SH "SCRIPT MATCHING WITH \ep AND \eP"
|
||||
.rs
|
||||
.sp
|
||||
The following script names are recognized in \ep{sc:...} or \ep{scx:...} items,
|
||||
or on their own with \ep (and also \eP of course):
|
||||
The following script names and their 4-letter abbreviations are recognized in
|
||||
\ep{sc:...} or \ep{scx:...} items, or on their own with \ep (and also \eP of
|
||||
course):
|
||||
.P
|
||||
Adlam,
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
Balinese,
|
||||
Bamum,
|
||||
Bassa_Vah,
|
||||
Batak,
|
||||
Bengali,
|
||||
Bhaiksuki,
|
||||
Bopomofo,
|
||||
Brahmi,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Caucasian_Albanian,
|
||||
Chakma,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Chorasmian,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cypro_Minoan,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Dives_Akuru,
|
||||
Dogra,
|
||||
Duployan,
|
||||
Egyptian_Hieroglyphs,
|
||||
Elbasan,
|
||||
Elymaic,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Grantha,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gunjala_Gondi,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanifi_Rohingya,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
Inherited,
|
||||
Inscriptional_Pahlavi,
|
||||
Inscriptional_Parthian,
|
||||
Javanese,
|
||||
Kaithi,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khitan_Small_Script,
|
||||
Khmer,
|
||||
Khojki,
|
||||
Khudawadi,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_A,
|
||||
Linear_B,
|
||||
Lisu,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Mahajani,
|
||||
Makasar,
|
||||
Malayalam,
|
||||
Mandaic,
|
||||
Manichaean,
|
||||
Marchen,
|
||||
Masaram_Gondi,
|
||||
Medefaidrin,
|
||||
Meetei_Mayek,
|
||||
Mende_Kikakui,
|
||||
Meroitic_Cursive,
|
||||
Meroitic_Hieroglyphs,
|
||||
Miao,
|
||||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
Nandinagari,
|
||||
New_Tai_Lue,
|
||||
Newa,
|
||||
Nko,
|
||||
Nushu,
|
||||
Nyakeng_Puachue_Hmong,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
Old_Persian,
|
||||
Old_Sogdian,
|
||||
Old_South_Arabian,
|
||||
Old_Turkic,
|
||||
Old_Uyghur,
|
||||
Oriya,
|
||||
Osage,
|
||||
Osmanya,
|
||||
Pahawh_Hmong,
|
||||
Palmyrene,
|
||||
Pau_Cin_Hau,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Psalter_Pahlavi,
|
||||
Rejang,
|
||||
Runic,
|
||||
Samaritan,
|
||||
Saurashtra,
|
||||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sogdian,
|
||||
Sora_Sompeng,
|
||||
Soyombo,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tai_Tham,
|
||||
Tai_Viet,
|
||||
Takri,
|
||||
Tamil,
|
||||
Tangsa,
|
||||
Tangut,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Tirhuta,
|
||||
Toto,
|
||||
Ugaritic,
|
||||
Vai,
|
||||
Vithkuqi,
|
||||
Wancho,
|
||||
Warang_Citi,
|
||||
Yezidi,
|
||||
Yi,
|
||||
Zanabazar_Square.
|
||||
Adlam (Adlm),
|
||||
Ahom (Ahom),
|
||||
Anatolian_Hieroglyphs (Hluw),
|
||||
Arabic (Arab),
|
||||
Armenian (Armn),
|
||||
Avestan (Avst),
|
||||
Balinese (Bali),
|
||||
Bamum (Bamu),
|
||||
Bassa_Vah (Bass),
|
||||
Batak (Batk),
|
||||
Bengali (Beng),
|
||||
Bhaiksuki (Bhks),
|
||||
Bopomofo (Bopo),
|
||||
Brahmi (Brah),
|
||||
Braille (Brai),
|
||||
Buginese (Bugi),
|
||||
Buhid (Buhd),
|
||||
Canadian_Aboriginal (Cans),
|
||||
Carian (Cari),
|
||||
Caucasian_Albanian (Aghb),
|
||||
Chakma (Cakm),
|
||||
Cham (Cham),
|
||||
Cherokee (Cher),
|
||||
Chorasmian (Chrs),
|
||||
Common (Zyyy),
|
||||
Coptic (Copt),
|
||||
Cuneiform (Xsux),
|
||||
Cypriot (Cprt),
|
||||
Cypro_Minoan (Cpmn),
|
||||
Cyrillic (Cyrl),
|
||||
Deseret (Dsrt),
|
||||
Devanagari (Deva),
|
||||
Dives_Akuru (Diak),
|
||||
Dogra (Dogr),
|
||||
Duployan (Dupl),
|
||||
Egyptian_Hieroglyphs (Egyp),
|
||||
Elbasan (Elba),
|
||||
Elymaic (Elym),
|
||||
Ethiopic (Ethi),
|
||||
Georgian (Geor),
|
||||
Glagolitic (Glag),
|
||||
Gothic (Goth),
|
||||
Grantha (Gran),
|
||||
Greek (Grek),
|
||||
Gujarati (Gujr),
|
||||
Gunjala_Gondi (Gong),
|
||||
Gurmukhi (Guru),
|
||||
Han (Hani),
|
||||
Hangul (Hang),
|
||||
Hanifi_Rohingya (Rohg),
|
||||
Hanunoo (Hano),
|
||||
Hatran (Hatr),
|
||||
Hebrew (Hebr),
|
||||
Hiragana (Hira),
|
||||
Imperial_Aramaic (Armi),
|
||||
Inherited (Zinh),
|
||||
Inscriptional_Pahlavi (Phli),
|
||||
Inscriptional_Parthian (Prti),
|
||||
Javanese (Java),
|
||||
Kaithi (Kthi),
|
||||
Kannada (Knda),
|
||||
Katakana (Kana),
|
||||
Kayah_Li (Kali),
|
||||
Kharoshthi (Khar),
|
||||
Khitan_Small_Script (Kits),
|
||||
Khmer (Khmr),
|
||||
Khojki (Khoj),
|
||||
Khudawadi (Sind),
|
||||
Lao (Laoo),
|
||||
Latin (Latn),
|
||||
Lepcha (Lepc),
|
||||
Limbu (Limb),
|
||||
Linear_A (Lina),
|
||||
Linear_B (Linb),
|
||||
Lisu (Lisu),
|
||||
Lycian (Lyci),
|
||||
Lydian (Lydi),
|
||||
Mahajani (Mahj),
|
||||
Makasar (Maka),
|
||||
Malayalam (Mlym),
|
||||
Mandaic (Mand),
|
||||
Manichaean (Mani),
|
||||
Marchen (Marc),
|
||||
Masaram_Gondi (Gonm),
|
||||
Medefaidrin (Medf),
|
||||
Meetei_Mayek (Mtei),
|
||||
Mende_Kikakui (Mend),
|
||||
Meroitic_Cursive (Merc),
|
||||
Meroitic_Hieroglyphs (Mero),
|
||||
Miao (Plrd),
|
||||
Modi (Modi),
|
||||
Mongolian (Mong),
|
||||
Mro (Mroo),
|
||||
Multani (Mult),
|
||||
Myanmar (Mymr),
|
||||
Nabataean (Nbat),
|
||||
Nandinagari (Nand),
|
||||
New_Tai_Lue (Talu),
|
||||
Newa (Newa),
|
||||
Nko (Nkoo),
|
||||
Nushu (Nshu),
|
||||
Nyiakeng_Puachue_Hmong (Hmnp),
|
||||
Ogham (Ogam),
|
||||
Ol_Chiki (Olck),
|
||||
Old_Hungarian (Hung),
|
||||
Old_Italic (Ital),
|
||||
Old_North_Arabian (Narb),
|
||||
Old_Permic (Perm),
|
||||
Old_Persian (Xpeo),
|
||||
Old_Sogdian (Sogo),
|
||||
Old_South_Arabian (Sarb),
|
||||
Old_Turkic (Orkh),
|
||||
Old_Uyghur (Ougr),
|
||||
Oriya (Orya),
|
||||
Osage (Osge),
|
||||
Osmanya (Osma),
|
||||
Pahawh_Hmong (Hmng),
|
||||
Palmyrene (Palm),
|
||||
Pau_Cin_Hau (Pauc),
|
||||
Phags_Pa (Phag),
|
||||
Phoenician (Phnx),
|
||||
Psalter_Pahlavi (Phlp),
|
||||
Rejang (Rjng),
|
||||
Runic (Runr),
|
||||
Samaritan (Samr),
|
||||
Saurashtra (Saur),
|
||||
Sharada (Shrd),
|
||||
Shavian (Shaw),
|
||||
Siddham (Sidd),
|
||||
SignWriting (Sgnw),
|
||||
Sinhala (Sinh),
|
||||
Sogdian (Sogd),
|
||||
Sora_Sompeng (Sora),
|
||||
Soyombo (Soyo),
|
||||
Sundanese (Sund),
|
||||
Syloti_Nagri (Sylo),
|
||||
Syriac (Syrc),
|
||||
Tagalog (Tglg),
|
||||
Tagbanwa (Tagb),
|
||||
Tai_Le (Tale),
|
||||
Tai_Tham (Lana),
|
||||
Tai_Viet (Tavt),
|
||||
Takri (Takr),
|
||||
Tamil (Taml),
|
||||
Tangsa (Tnsa),
|
||||
Tangut (Tang),
|
||||
Telugu (Telu),
|
||||
Thaana (Thaa),
|
||||
Thai (Thai),
|
||||
Tibetan (Tibt),
|
||||
Tifinagh (Tfng),
|
||||
Tirhuta (Tirh),
|
||||
Toto (Toto),
|
||||
Ugaritic (Ugar),
|
||||
Vai (Vaii),
|
||||
Vithkuqi (Vith),
|
||||
Wancho (Wcho),
|
||||
Warang_Citi (Wara),
|
||||
Yezidi (Yezi),
|
||||
Yi (Yiii),
|
||||
Zanabazar_Square (Zanb).
|
||||
.
|
||||
.
|
||||
.SH "BIDI_PROPERTIES FOR \ep AND \eP"
|
||||
|
@ -727,6 +728,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 22 December 2021
|
||||
Last updated: 28 December 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -38,8 +38,11 @@
|
|||
# accordance with Unicode's "loose matching" rules, which Perl observes.
|
||||
# Changed default script type from PT_SC to PT_SCX, 18-December-2021
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
#
|
||||
# Note subsequent changes here:
|
||||
#
|
||||
# 27-December-2021: Added support for 4-letter script abbreviations.
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
# Import common data lists and functions
|
||||
|
@ -79,15 +82,27 @@ def stdnames(x):
|
|||
return y
|
||||
|
||||
std_script_names = stdnames(script_names)
|
||||
std_script_abbrevs = stdnames(script_abbrevs)
|
||||
std_category_names = stdnames(category_names)
|
||||
std_general_category_names = stdnames(general_category_names)
|
||||
std_bidi_class_names = stdnames(bidi_class_names)
|
||||
|
||||
# Create the table, starting with the Unicode script, category and bidi class
|
||||
# names. We keep both the standardized name and the original, because the
|
||||
# latter is used for the ucp_xx names.
|
||||
# latter is used for the ucp_xx names. NOTE: for the script abbreviations, we
|
||||
# still use the full original names.
|
||||
|
||||
utt_table = list(zip(std_script_names, script_names, ['PT_SCX'] * len(script_names)))
|
||||
utt_table += list(zip(std_script_abbrevs, script_names, ['PT_SCX'] * len(script_abbrevs)))
|
||||
|
||||
# At least one script abbreviation is the same as the full name of the script,
|
||||
# so we must remove duplicates. It doesn't matter if this operation changes the
|
||||
# order, because we are going to sort the list later.
|
||||
|
||||
utt_table = list(set(utt_table))
|
||||
|
||||
# Add the remaining property lists
|
||||
|
||||
utt_table += list(zip(std_category_names, category_names, ['PT_PC'] * len(category_names)))
|
||||
utt_table += list(zip(std_general_category_names, general_category_names, ['PT_GC'] * len(general_category_names)))
|
||||
utt_table += list(zip(std_bidi_class_names, bidi_class_names, ['PT_BIDICL'] * len(bidi_class_names)))
|
||||
|
|
|
@ -299,22 +299,48 @@ return isatty(fileno(stdin));
|
|||
* Get script name from ucp ident *
|
||||
*************************************************/
|
||||
|
||||
/* The utt table contains both the full script names and the 4-letter
|
||||
abbreviations. So search for both and use the longer if two are found, unless
|
||||
the first one is only 3 characters (some scripts have 3-character names). If
|
||||
this were not just a test program it might be worth making some kind of reverse
|
||||
index. */
|
||||
|
||||
static const char *
|
||||
get_scriptname(int script)
|
||||
{
|
||||
size_t i;
|
||||
const ucp_type_table *u;
|
||||
size_t i, j, len;
|
||||
size_t foundlist[2];
|
||||
const char *yield;
|
||||
|
||||
j = 0;
|
||||
for (i = 0; i < PRIV(utt_size); i++)
|
||||
{
|
||||
u = PRIV(utt) + i;
|
||||
if (u->type == PT_SCX && u->value == script) break;
|
||||
const ucp_type_table *u = PRIV(utt) + i;
|
||||
if (u->type == PT_SCX && u->value == script)
|
||||
{
|
||||
foundlist[j++] = i;
|
||||
if (j >= 2) break;
|
||||
}
|
||||
}
|
||||
|
||||
if (i < PRIV(utt_size))
|
||||
return PRIV(utt_names) + u->name_offset;
|
||||
if (j == 0) return "??";
|
||||
|
||||
return "??";
|
||||
yield = NULL;
|
||||
len = 0;
|
||||
|
||||
for (i = 0; i < j; i++)
|
||||
{
|
||||
const char *s = PRIV(utt_names) + (PRIV(utt) + foundlist[i])->name_offset;
|
||||
size_t sl = strlen(s);
|
||||
if (sl > len)
|
||||
{
|
||||
yield = s;
|
||||
if (sl == 3) break;
|
||||
len = sl;
|
||||
}
|
||||
}
|
||||
|
||||
return yield;
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -15,3 +15,4 @@ find bidi RLO
|
|||
find bidi S
|
||||
find bidi WS
|
||||
find bidi_control
|
||||
find script bopo
|
||||
|
|
|
@ -218,3 +218,7 @@ U+2066 *LRI Control: Format, common, Control
|
|||
U+2067 *RLI Control: Format, common, Control
|
||||
U+2068 *FSI Control: Format, common, Control
|
||||
U+2069 *PDI Control: Format, common, Control
|
||||
find script bopo
|
||||
U+02EA..U+02EB ON Symbol: Modifier symbol, bopomofo, Other
|
||||
U+3105..U+312F L Letter: Other letter, bopomofo, Other
|
||||
U+31A0..U+31BF L Letter: Other letter, bopomofo, Other
|
||||
|
|
|
@ -230,21 +230,48 @@ for (; len > 0; len--)
|
|||
/* When there is no UTF/UCP support, the table of names does not exist. This
|
||||
function should not be called in such configurations, because a pattern that
|
||||
tries to use Unicode properties won't compile. Rather than put lots of #ifdefs
|
||||
into the main code, however, we just put one into this function. */
|
||||
into the main code, however, we just put one into this function.
|
||||
|
||||
Now that the table contains both full script names and their 4-character
|
||||
abbreviations, we do some fiddling to try to get the full name, which is either
|
||||
the longer of two found names, or a 3-character name. */
|
||||
|
||||
static const char *
|
||||
get_ucpname(unsigned int ptype, unsigned int pvalue)
|
||||
{
|
||||
#ifdef SUPPORT_UNICODE
|
||||
int i;
|
||||
int count = 0;
|
||||
const char *yield = "??";
|
||||
size_t len = 0;
|
||||
|
||||
if (ptype == PT_SC) ptype = PT_SCX; /* Table has scx values */
|
||||
for (i = PRIV(utt_size) - 1; i >= 0; i--)
|
||||
|
||||
for (int i = PRIV(utt_size) - 1; i >= 0; i--)
|
||||
{
|
||||
if (ptype == PRIV(utt)[i].type && pvalue == PRIV(utt)[i].value) break;
|
||||
const ucp_type_table *u = PRIV(utt) + i;
|
||||
|
||||
if (ptype == u->type && pvalue == u->value)
|
||||
{
|
||||
const char *s = PRIV(utt_names) + u->name_offset;
|
||||
size_t sl = strlen(s);
|
||||
|
||||
if (sl == 3)
|
||||
{
|
||||
yield = s;
|
||||
break;
|
||||
}
|
||||
|
||||
return (i >= 0)? PRIV(utt_names) + PRIV(utt)[i].name_offset : "??";
|
||||
if (sl > len)
|
||||
{
|
||||
yield = s;
|
||||
len = sl;
|
||||
}
|
||||
|
||||
if (++count >= 2) break;
|
||||
}
|
||||
}
|
||||
|
||||
return yield;
|
||||
|
||||
#else /* No UTF support */
|
||||
(void)ptype;
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -2641,4 +2641,7 @@
|
|||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
/[\p{taml}\p{sc:ugar}]+/utf
|
||||
\x{0b82}\x{10380}
|
||||
|
||||
# End of testinput4
|
||||
|
|
|
@ -4235,4 +4235,8 @@ No match
|
|||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
/[\p{taml}\p{sc:ugar}]+/utf
|
||||
\x{0b82}\x{10380}
|
||||
0: \x{b82}\x{10380}
|
||||
|
||||
# End of testinput4
|
||||
|
|
Loading…
Reference in New Issue