Add support for 4-character script abbreviations
This commit is contained in:
parent
af2637ee5e
commit
7713f33e46
|
@ -58,6 +58,9 @@ of applications treat NULL/0 in this way.
|
||||||
the same as \p{scx:scriptname} because this change happened in Perl at
|
the same as \p{scx:scriptname} because this change happened in Perl at
|
||||||
release 5.26.
|
release 5.26.
|
||||||
|
|
||||||
|
(d) The standard Unicode 4-letter abbreviations for script names are now
|
||||||
|
recognized.
|
||||||
|
|
||||||
18. The Python scripts in the maint directory have been refactored. There are
|
18. The Python scripts in the maint directory have been refactored. There are
|
||||||
now three scripts that generate pcre2_ucd.c, pcre2_ucp.h, and pcre2_ucptables.c
|
now three scripts that generate pcre2_ucd.c, pcre2_ucp.h, and pcre2_ucptables.c
|
||||||
(which is #included by pcre2_tables.c). The data lists that used to be
|
(which is #included by pcre2_tables.c). The data lists that used to be
|
||||||
|
|
|
@ -810,171 +810,170 @@ interpretation at release 5.26 and PCRE2 changed at release 10.40.
|
||||||
Unassigned characters (and in non-UTF 32-bit mode, characters with code points
|
Unassigned characters (and in non-UTF 32-bit mode, characters with code points
|
||||||
greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not
|
greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not
|
||||||
part of an identified script are lumped together as "Common". The current list
|
part of an identified script are lumped together as "Common". The current list
|
||||||
of scripts is:
|
of script names and their 4-letter abbreviations is:
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
Adlam,
|
Adlam (Adlm),
|
||||||
Ahom,
|
Ahom (Ahom),
|
||||||
Anatolian_Hieroglyphs,
|
Anatolian_Hieroglyphs (Hluw),
|
||||||
Arabic,
|
Arabic (Arab),
|
||||||
Armenian,
|
Armenian (Armn),
|
||||||
Avestan,
|
Avestan (Avst),
|
||||||
Balinese,
|
Balinese (Bali),
|
||||||
Bamum,
|
Bamum (Bamu),
|
||||||
Bassa_Vah,
|
Bassa_Vah (Bass),
|
||||||
Batak,
|
Batak (Batk),
|
||||||
Bengali,
|
Bengali (Beng),
|
||||||
Bhaiksuki,
|
Bhaiksuki (Bhks),
|
||||||
Bopomofo,
|
Bopomofo (Bopo),
|
||||||
Brahmi,
|
Brahmi (Brah),
|
||||||
Braille,
|
Braille (Brai),
|
||||||
Buginese,
|
Buginese (Bugi),
|
||||||
Buhid,
|
Buhid (Buhd),
|
||||||
Canadian_Aboriginal,
|
Canadian_Aboriginal (Cans),
|
||||||
Carian,
|
Carian (Cari),
|
||||||
Caucasian_Albanian,
|
Caucasian_Albanian (Aghb),
|
||||||
Chakma,
|
Chakma (Cakm),
|
||||||
Cham,
|
Cham (Cham),
|
||||||
Cherokee,
|
Cherokee (Cher),
|
||||||
Chorasmian,
|
Chorasmian (Chrs),
|
||||||
Common,
|
Common (Zyyy),
|
||||||
Coptic,
|
Coptic (Copt),
|
||||||
Cuneiform,
|
Cuneiform (Xsux),
|
||||||
Cypriot,
|
Cypriot (Cprt),
|
||||||
Cypro_Minoan,
|
Cypro_Minoan (Cpmn),
|
||||||
Cyrillic,
|
Cyrillic (Cyrl),
|
||||||
Deseret,
|
Deseret (Dsrt),
|
||||||
Devanagari,
|
Devanagari (Deva),
|
||||||
Dives_Akuru,
|
Dives_Akuru (Diak),
|
||||||
Dogra,
|
Dogra (Dogr),
|
||||||
Duployan,
|
Duployan (Dupl),
|
||||||
Egyptian_Hieroglyphs,
|
Egyptian_Hieroglyphs (Egyp),
|
||||||
Elbasan,
|
Elbasan (Elba),
|
||||||
Elymaic,
|
Elymaic (Elym),
|
||||||
Ethiopic,
|
Ethiopic (Ethi),
|
||||||
Georgian,
|
Georgian (Geor),
|
||||||
Glagolitic,
|
Glagolitic (Glag),
|
||||||
Gothic,
|
Gothic (Goth),
|
||||||
Grantha,
|
Grantha (Gran),
|
||||||
Greek,
|
Greek (Grek),
|
||||||
Gujarati,
|
Gujarati (Gujr),
|
||||||
Gunjala_Gondi,
|
Gunjala_Gondi (Gong),
|
||||||
Gurmukhi,
|
Gurmukhi (Guru),
|
||||||
Han,
|
Han (Hani),
|
||||||
Hangul,
|
Hangul (Hang),
|
||||||
Hanifi_Rohingya,
|
Hanifi_Rohingya (Rohg),
|
||||||
Hanunoo,
|
Hanunoo (Hano),
|
||||||
Hatran,
|
Hatran (Hatr),
|
||||||
Hebrew,
|
Hebrew (Hebr),
|
||||||
Hiragana,
|
Hiragana (Hira),
|
||||||
Imperial_Aramaic,
|
Imperial_Aramaic (Armi),
|
||||||
Inherited,
|
Inherited (Zinh),
|
||||||
Inscriptional_Pahlavi,
|
Inscriptional_Pahlavi (Phli),
|
||||||
Inscriptional_Parthian,
|
Inscriptional_Parthian (Prti),
|
||||||
Javanese,
|
Javanese (Java),
|
||||||
Kaithi,
|
Kaithi (Kthi),
|
||||||
Kannada,
|
Kannada (Knda),
|
||||||
Katakana,
|
Katakana (Kana),
|
||||||
Kayah_Li,
|
Kayah_Li (Kali),
|
||||||
Kharoshthi,
|
Kharoshthi (Khar),
|
||||||
Khitan_Small_Script,
|
Khitan_Small_Script (Kits),
|
||||||
Khmer,
|
Khmer (Khmr),
|
||||||
Khojki,
|
Khojki (Khoj),
|
||||||
Khudawadi,
|
Khudawadi (Sind),
|
||||||
Lao,
|
Lao (Laoo),
|
||||||
Latin,
|
Latin (Latn),
|
||||||
Lepcha,
|
Lepcha (Lepc),
|
||||||
Limbu,
|
Limbu (Limb),
|
||||||
Linear_A,
|
Linear_A (Lina),
|
||||||
Linear_B,
|
Linear_B (Linb),
|
||||||
Lisu,
|
Lisu (Lisu),
|
||||||
Lycian,
|
Lycian (Lyci),
|
||||||
Lydian,
|
Lydian (Lydi),
|
||||||
Mahajani,
|
Mahajani (Mahj),
|
||||||
Makasar,
|
Makasar (Maka),
|
||||||
Malayalam,
|
Malayalam (Mlym),
|
||||||
Mandaic,
|
Mandaic (Mand),
|
||||||
Manichaean,
|
Manichaean (Mani),
|
||||||
Marchen,
|
Marchen (Marc),
|
||||||
Masaram_Gondi,
|
Masaram_Gondi (Gonm),
|
||||||
Medefaidrin,
|
Medefaidrin (Medf),
|
||||||
Meetei_Mayek,
|
Meetei_Mayek (Mtei),
|
||||||
Mende_Kikakui,
|
Mende_Kikakui (Mend),
|
||||||
Meroitic_Cursive,
|
Meroitic_Cursive (Merc),
|
||||||
Meroitic_Hieroglyphs,
|
Meroitic_Hieroglyphs (Mero),
|
||||||
Miao,
|
Miao (Plrd),
|
||||||
Modi,
|
Modi (Modi),
|
||||||
Mongolian,
|
Mongolian (Mong),
|
||||||
Mro,
|
Mro (Mroo),
|
||||||
Multani,
|
Multani (Mult),
|
||||||
Myanmar,
|
Myanmar (Mymr),
|
||||||
Nabataean,
|
Nabataean (Nbat),
|
||||||
Nandinagari,
|
Nandinagari (Nand),
|
||||||
New_Tai_Lue,
|
New_Tai_Lue (Talu),
|
||||||
Newa,
|
Newa (Newa),
|
||||||
Nko,
|
Nko (Nkoo),
|
||||||
Nushu,
|
Nushu (Nshu),
|
||||||
Nyakeng_Puachue_Hmong,
|
Nyiakeng_Puachue_Hmong (Hmnp),
|
||||||
Ogham,
|
Ogham (Ogam),
|
||||||
Ol_Chiki,
|
Ol_Chiki (Olck),
|
||||||
Old_Hungarian,
|
Old_Hungarian (Hung),
|
||||||
Old_Italic,
|
Old_Italic (Ital),
|
||||||
Old_North_Arabian,
|
Old_North_Arabian (Narb),
|
||||||
Old_Permic,
|
Old_Permic (Perm),
|
||||||
Old_Persian,
|
Old_Persian (Xpeo),
|
||||||
Old_Sogdian,
|
Old_Sogdian (Sogo),
|
||||||
Old_South_Arabian,
|
Old_South_Arabian (Sarb),
|
||||||
Old_Turkic,
|
Old_Turkic (Orkh),
|
||||||
Old_Uyghur,
|
Old_Uyghur (Ougr),
|
||||||
Oriya,
|
Oriya (Orya),
|
||||||
Osage,
|
Osage (Osge),
|
||||||
Osmanya,
|
Osmanya (Osma),
|
||||||
Pahawh_Hmong,
|
Pahawh_Hmong (Hmng),
|
||||||
Palmyrene,
|
Palmyrene (Palm),
|
||||||
Pau_Cin_Hau,
|
Pau_Cin_Hau (Pauc),
|
||||||
Phags_Pa,
|
Phags_Pa (Phag),
|
||||||
Phoenician,
|
Phoenician (Phnx),
|
||||||
Psalter_Pahlavi,
|
Psalter_Pahlavi (Phlp),
|
||||||
Rejang,
|
Rejang (Rjng),
|
||||||
Runic,
|
Runic (Runr),
|
||||||
Samaritan,
|
Samaritan (Samr),
|
||||||
Saurashtra,
|
Saurashtra (Saur),
|
||||||
Sharada,
|
Sharada (Shrd),
|
||||||
Shavian,
|
Shavian (Shaw),
|
||||||
Siddham,
|
Siddham (Sidd),
|
||||||
SignWriting,
|
SignWriting (Sgnw),
|
||||||
Sinhala,
|
Sinhala (Sinh),
|
||||||
Sogdian,
|
Sogdian (Sogd),
|
||||||
Sora_Sompeng,
|
Sora_Sompeng (Sora),
|
||||||
Soyombo,
|
Soyombo (Soyo),
|
||||||
Sundanese,
|
Sundanese (Sund),
|
||||||
Syloti_Nagri,
|
Syloti_Nagri (Sylo),
|
||||||
Syriac,
|
Syriac (Syrc),
|
||||||
Tagalog,
|
Tagalog (Tglg),
|
||||||
Tagbanwa,
|
Tagbanwa (Tagb),
|
||||||
Tai_Le,
|
Tai_Le (Tale),
|
||||||
Tai_Tham,
|
Tai_Tham (Lana),
|
||||||
Tai_Viet,
|
Tai_Viet (Tavt),
|
||||||
Takri,
|
Takri (Takr),
|
||||||
Tamil,
|
Tamil (Taml),
|
||||||
Tangsa,
|
Tangsa (Tnsa),
|
||||||
Tangut,
|
Tangut (Tang),
|
||||||
Telugu,
|
Telugu (Telu),
|
||||||
Thaana,
|
Thaana (Thaa),
|
||||||
Thai,
|
Thai (Thai),
|
||||||
Tibetan,
|
Tibetan (Tibt),
|
||||||
Tifinagh,
|
Tifinagh (Tfng),
|
||||||
Tirhuta,
|
Tirhuta (Tirh),
|
||||||
Toto,
|
Toto (Toto),
|
||||||
Ugaritic,
|
Ugaritic (Ugar),
|
||||||
Unknown,
|
Vai (Vaii),
|
||||||
Vai,
|
Vithkuqi (Vith),
|
||||||
Vithkuqi,
|
Wancho (Wcho),
|
||||||
Wancho,
|
Warang_Citi (Wara),
|
||||||
Warang_Citi,
|
Yezidi (Yezi),
|
||||||
Yezidi,
|
Yi (Yiii),
|
||||||
Yi,
|
Zanabazar_Square (Zanb).
|
||||||
Zanabazar_Square.
|
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
Each character has exactly one Unicode general category property, specified by
|
Each character has exactly one Unicode general category property, specified by
|
||||||
|
@ -3909,7 +3908,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC32" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC32" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 22 December 2021
|
Last updated: 28 December 2021
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2021 University of Cambridge.
|
Copyright © 1997-2021 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -23,12 +23,12 @@ please consult the man page, in case the conversion went wrong.
|
||||||
<br><a name="SEC1" href="#TOC1">SAVING AND RE-USING PRECOMPILED PCRE2 PATTERNS</a><br>
|
<br><a name="SEC1" href="#TOC1">SAVING AND RE-USING PRECOMPILED PCRE2 PATTERNS</a><br>
|
||||||
<P>
|
<P>
|
||||||
<b>int32_t pcre2_serialize_decode(pcre2_code **<i>codes</i>,</b>
|
<b>int32_t pcre2_serialize_decode(pcre2_code **<i>codes</i>,</b>
|
||||||
<b> int32_t <i>number_of_codes</i>, const uint32_t *<i>bytes</i>,</b>
|
<b> int32_t <i>number_of_codes</i>, const uint8_t *<i>bytes</i>,</b>
|
||||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
<b>int32_t pcre2_serialize_encode(pcre2_code **<i>codes</i>,</b>
|
<b>int32_t pcre2_serialize_encode(const pcre2_code **<i>codes</i>,</b>
|
||||||
<b> int32_t <i>number_of_codes</i>, uint32_t **<i>serialized_bytes</i>,</b>
|
<b> int32_t <i>number_of_codes</i>, uint8_t **<i>serialized_bytes</i>,</b>
|
||||||
<b> PCRE2_SIZE *<i>serialized_size</i>, pcre2_general_context *<i>gcontext</i>);</b>
|
<b> PCRE2_SIZE *<i>serialized_size</i>, pcre2_general_context *<i>gcontext</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
|
@ -154,7 +154,6 @@ management functions for the decoded patterns. If this argument is NULL,
|
||||||
<b>malloc()</b> and <b>free()</b> are used. After deserialization, the byte
|
<b>malloc()</b> and <b>free()</b> are used. After deserialization, the byte
|
||||||
stream is no longer needed and can be discarded.
|
stream is no longer needed and can be discarded.
|
||||||
<pre>
|
<pre>
|
||||||
int32_t number_of_codes;
|
|
||||||
pcre2_code *list_of_codes[2];
|
pcre2_code *list_of_codes[2];
|
||||||
uint8_t *bytes = <serialized data>;
|
uint8_t *bytes = <serialized data>;
|
||||||
int32_t number_of_codes =
|
int32_t number_of_codes =
|
||||||
|
|
|
@ -207,171 +207,172 @@ at release 5.18.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC7" href="#TOC1">SCRIPT MATCHING WITH \p AND \P</a><br>
|
<br><a name="SEC7" href="#TOC1">SCRIPT MATCHING WITH \p AND \P</a><br>
|
||||||
<P>
|
<P>
|
||||||
The following script names are recognized in \p{sc:...} or \p{scx:...} items,
|
The following script names and their 4-letter abbreviations are recognized in
|
||||||
or on their own with \p (and also \P of course):
|
\p{sc:...} or \p{scx:...} items, or on their own with \p (and also \P of
|
||||||
|
course):
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
Adlam,
|
Adlam (Adlm),
|
||||||
Ahom,
|
Ahom (Ahom),
|
||||||
Anatolian_Hieroglyphs,
|
Anatolian_Hieroglyphs (Hluw),
|
||||||
Arabic,
|
Arabic (Arab),
|
||||||
Armenian,
|
Armenian (Armn),
|
||||||
Avestan,
|
Avestan (Avst),
|
||||||
Balinese,
|
Balinese (Bali),
|
||||||
Bamum,
|
Bamum (Bamu),
|
||||||
Bassa_Vah,
|
Bassa_Vah (Bass),
|
||||||
Batak,
|
Batak (Batk),
|
||||||
Bengali,
|
Bengali (Beng),
|
||||||
Bhaiksuki,
|
Bhaiksuki (Bhks),
|
||||||
Bopomofo,
|
Bopomofo (Bopo),
|
||||||
Brahmi,
|
Brahmi (Brah),
|
||||||
Braille,
|
Braille (Brai),
|
||||||
Buginese,
|
Buginese (Bugi),
|
||||||
Buhid,
|
Buhid (Buhd),
|
||||||
Canadian_Aboriginal,
|
Canadian_Aboriginal (Cans),
|
||||||
Carian,
|
Carian (Cari),
|
||||||
Caucasian_Albanian,
|
Caucasian_Albanian (Aghb),
|
||||||
Chakma,
|
Chakma (Cakm),
|
||||||
Cham,
|
Cham (Cham),
|
||||||
Cherokee,
|
Cherokee (Cher),
|
||||||
Chorasmian,
|
Chorasmian (Chrs),
|
||||||
Common,
|
Common (Zyyy),
|
||||||
Coptic,
|
Coptic (Copt),
|
||||||
Cuneiform,
|
Cuneiform (Xsux),
|
||||||
Cypriot,
|
Cypriot (Cprt),
|
||||||
Cypro_Minoan,
|
Cypro_Minoan (Cpmn),
|
||||||
Cyrillic,
|
Cyrillic (Cyrl),
|
||||||
Deseret,
|
Deseret (Dsrt),
|
||||||
Devanagari,
|
Devanagari (Deva),
|
||||||
Dives_Akuru,
|
Dives_Akuru (Diak),
|
||||||
Dogra,
|
Dogra (Dogr),
|
||||||
Duployan,
|
Duployan (Dupl),
|
||||||
Egyptian_Hieroglyphs,
|
Egyptian_Hieroglyphs (Egyp),
|
||||||
Elbasan,
|
Elbasan (Elba),
|
||||||
Elymaic,
|
Elymaic (Elym),
|
||||||
Ethiopic,
|
Ethiopic (Ethi),
|
||||||
Georgian,
|
Georgian (Geor),
|
||||||
Glagolitic,
|
Glagolitic (Glag),
|
||||||
Gothic,
|
Gothic (Goth),
|
||||||
Grantha,
|
Grantha (Gran),
|
||||||
Greek,
|
Greek (Grek),
|
||||||
Gujarati,
|
Gujarati (Gujr),
|
||||||
Gunjala_Gondi,
|
Gunjala_Gondi (Gong),
|
||||||
Gurmukhi,
|
Gurmukhi (Guru),
|
||||||
Han,
|
Han (Hani),
|
||||||
Hangul,
|
Hangul (Hang),
|
||||||
Hanifi_Rohingya,
|
Hanifi_Rohingya (Rohg),
|
||||||
Hanunoo,
|
Hanunoo (Hano),
|
||||||
Hatran,
|
Hatran (Hatr),
|
||||||
Hebrew,
|
Hebrew (Hebr),
|
||||||
Hiragana,
|
Hiragana (Hira),
|
||||||
Imperial_Aramaic,
|
Imperial_Aramaic (Armi),
|
||||||
Inherited,
|
Inherited (Zinh),
|
||||||
Inscriptional_Pahlavi,
|
Inscriptional_Pahlavi (Phli),
|
||||||
Inscriptional_Parthian,
|
Inscriptional_Parthian (Prti),
|
||||||
Javanese,
|
Javanese (Java),
|
||||||
Kaithi,
|
Kaithi (Kthi),
|
||||||
Kannada,
|
Kannada (Knda),
|
||||||
Katakana,
|
Katakana (Kana),
|
||||||
Kayah_Li,
|
Kayah_Li (Kali),
|
||||||
Kharoshthi,
|
Kharoshthi (Khar),
|
||||||
Khitan_Small_Script,
|
Khitan_Small_Script (Kits),
|
||||||
Khmer,
|
Khmer (Khmr),
|
||||||
Khojki,
|
Khojki (Khoj),
|
||||||
Khudawadi,
|
Khudawadi (Sind),
|
||||||
Lao,
|
Lao (Laoo),
|
||||||
Latin,
|
Latin (Latn),
|
||||||
Lepcha,
|
Lepcha (Lepc),
|
||||||
Limbu,
|
Limbu (Limb),
|
||||||
Linear_A,
|
Linear_A (Lina),
|
||||||
Linear_B,
|
Linear_B (Linb),
|
||||||
Lisu,
|
Lisu (Lisu),
|
||||||
Lycian,
|
Lycian (Lyci),
|
||||||
Lydian,
|
Lydian (Lydi),
|
||||||
Mahajani,
|
Mahajani (Mahj),
|
||||||
Makasar,
|
Makasar (Maka),
|
||||||
Malayalam,
|
Malayalam (Mlym),
|
||||||
Mandaic,
|
Mandaic (Mand),
|
||||||
Manichaean,
|
Manichaean (Mani),
|
||||||
Marchen,
|
Marchen (Marc),
|
||||||
Masaram_Gondi,
|
Masaram_Gondi (Gonm),
|
||||||
Medefaidrin,
|
Medefaidrin (Medf),
|
||||||
Meetei_Mayek,
|
Meetei_Mayek (Mtei),
|
||||||
Mende_Kikakui,
|
Mende_Kikakui (Mend),
|
||||||
Meroitic_Cursive,
|
Meroitic_Cursive (Merc),
|
||||||
Meroitic_Hieroglyphs,
|
Meroitic_Hieroglyphs (Mero),
|
||||||
Miao,
|
Miao (Plrd),
|
||||||
Modi,
|
Modi (Modi),
|
||||||
Mongolian,
|
Mongolian (Mong),
|
||||||
Mro,
|
Mro (Mroo),
|
||||||
Multani,
|
Multani (Mult),
|
||||||
Myanmar,
|
Myanmar (Mymr),
|
||||||
Nabataean,
|
Nabataean (Nbat),
|
||||||
Nandinagari,
|
Nandinagari (Nand),
|
||||||
New_Tai_Lue,
|
New_Tai_Lue (Talu),
|
||||||
Newa,
|
Newa (Newa),
|
||||||
Nko,
|
Nko (Nkoo),
|
||||||
Nushu,
|
Nushu (Nshu),
|
||||||
Nyakeng_Puachue_Hmong,
|
Nyiakeng_Puachue_Hmong (Hmnp),
|
||||||
Ogham,
|
Ogham (Ogam),
|
||||||
Ol_Chiki,
|
Ol_Chiki (Olck),
|
||||||
Old_Hungarian,
|
Old_Hungarian (Hung),
|
||||||
Old_Italic,
|
Old_Italic (Ital),
|
||||||
Old_North_Arabian,
|
Old_North_Arabian (Narb),
|
||||||
Old_Permic,
|
Old_Permic (Perm),
|
||||||
Old_Persian,
|
Old_Persian (Xpeo),
|
||||||
Old_Sogdian,
|
Old_Sogdian (Sogo),
|
||||||
Old_South_Arabian,
|
Old_South_Arabian (Sarb),
|
||||||
Old_Turkic,
|
Old_Turkic (Orkh),
|
||||||
Old_Uyghur,
|
Old_Uyghur (Ougr),
|
||||||
Oriya,
|
Oriya (Orya),
|
||||||
Osage,
|
Osage (Osge),
|
||||||
Osmanya,
|
Osmanya (Osma),
|
||||||
Pahawh_Hmong,
|
Pahawh_Hmong (Hmng),
|
||||||
Palmyrene,
|
Palmyrene (Palm),
|
||||||
Pau_Cin_Hau,
|
Pau_Cin_Hau (Pauc),
|
||||||
Phags_Pa,
|
Phags_Pa (Phag),
|
||||||
Phoenician,
|
Phoenician (Phnx),
|
||||||
Psalter_Pahlavi,
|
Psalter_Pahlavi (Phlp),
|
||||||
Rejang,
|
Rejang (Rjng),
|
||||||
Runic,
|
Runic (Runr),
|
||||||
Samaritan,
|
Samaritan (Samr),
|
||||||
Saurashtra,
|
Saurashtra (Saur),
|
||||||
Sharada,
|
Sharada (Shrd),
|
||||||
Shavian,
|
Shavian (Shaw),
|
||||||
Siddham,
|
Siddham (Sidd),
|
||||||
SignWriting,
|
SignWriting (Sgnw),
|
||||||
Sinhala,
|
Sinhala (Sinh),
|
||||||
Sogdian,
|
Sogdian (Sogd),
|
||||||
Sora_Sompeng,
|
Sora_Sompeng (Sora),
|
||||||
Soyombo,
|
Soyombo (Soyo),
|
||||||
Sundanese,
|
Sundanese (Sund),
|
||||||
Syloti_Nagri,
|
Syloti_Nagri (Sylo),
|
||||||
Syriac,
|
Syriac (Syrc),
|
||||||
Tagalog,
|
Tagalog (Tglg),
|
||||||
Tagbanwa,
|
Tagbanwa (Tagb),
|
||||||
Tai_Le,
|
Tai_Le (Tale),
|
||||||
Tai_Tham,
|
Tai_Tham (Lana),
|
||||||
Tai_Viet,
|
Tai_Viet (Tavt),
|
||||||
Takri,
|
Takri (Takr),
|
||||||
Tamil,
|
Tamil (Taml),
|
||||||
Tangsa,
|
Tangsa (Tnsa),
|
||||||
Tangut,
|
Tangut (Tang),
|
||||||
Telugu,
|
Telugu (Telu),
|
||||||
Thaana,
|
Thaana (Thaa),
|
||||||
Thai,
|
Thai (Thai),
|
||||||
Tibetan,
|
Tibetan (Tibt),
|
||||||
Tifinagh,
|
Tifinagh (Tfng),
|
||||||
Tirhuta,
|
Tirhuta (Tirh),
|
||||||
Toto,
|
Toto (Toto),
|
||||||
Ugaritic,
|
Ugaritic (Ugar),
|
||||||
Vai,
|
Vai (Vaii),
|
||||||
Vithkuqi,
|
Vithkuqi (Vith),
|
||||||
Wancho,
|
Wancho (Wcho),
|
||||||
Warang_Citi,
|
Warang_Citi (Wara),
|
||||||
Yezidi,
|
Yezidi (Yezi),
|
||||||
Yi,
|
Yi (Yiii),
|
||||||
Zanabazar_Square.
|
Zanabazar_Square (Zanb).
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC8" href="#TOC1">BIDI_PROPERTIES FOR \p AND \P</a><br>
|
<br><a name="SEC8" href="#TOC1">BIDI_PROPERTIES FOR \p AND \P</a><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -743,7 +744,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 22 December 2021
|
Last updated: 28 December 2021
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2021 University of Cambridge.
|
Copyright © 1997-2021 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
153
doc/pcre2.txt
153
doc/pcre2.txt
|
@ -6920,33 +6920,51 @@ BACKSLASH
|
||||||
Unassigned characters (and in non-UTF 32-bit mode, characters with code
|
Unassigned characters (and in non-UTF 32-bit mode, characters with code
|
||||||
points greater than 0x10FFFF) are assigned the "Unknown" script. Others
|
points greater than 0x10FFFF) are assigned the "Unknown" script. Others
|
||||||
that are not part of an identified script are lumped together as "Com-
|
that are not part of an identified script are lumped together as "Com-
|
||||||
mon". The current list of scripts is:
|
mon". The current list of script names and their 4-letter abbreviations
|
||||||
|
is:
|
||||||
|
|
||||||
Adlam, Ahom, Anatolian_Hieroglyphs, Arabic, Armenian, Avestan, Bali-
|
Adlam (Adlm), Ahom (Ahom), Anatolian_Hieroglyphs (Hluw), Arabic (Arab),
|
||||||
nese, Bamum, Bassa_Vah, Batak, Bengali, Bhaiksuki, Bopomofo, Brahmi,
|
Armenian (Armn), Avestan (Avst), Balinese (Bali), Bamum (Bamu),
|
||||||
Braille, Buginese, Buhid, Canadian_Aboriginal, Carian, Caucasian_Alba-
|
Bassa_Vah (Bass), Batak (Batk), Bengali (Beng), Bhaiksuki (Bhks), Bopo-
|
||||||
nian, Chakma, Cham, Cherokee, Chorasmian, Common, Coptic, Cuneiform,
|
mofo (Bopo), Brahmi (Brah), Braille (Brai), Buginese (Bugi), Buhid
|
||||||
Cypriot, Cypro_Minoan, Cyrillic, Deseret, Devanagari, Dives_Akuru, Do-
|
(Buhd), Canadian_Aboriginal (Cans), Carian (Cari), Caucasian_Albanian
|
||||||
gra, Duployan, Egyptian_Hieroglyphs, Elbasan, Elymaic, Ethiopic, Geor-
|
(Aghb), Chakma (Cakm), Cham (Cham), Cherokee (Cher), Chorasmian (Chrs),
|
||||||
gian, Glagolitic, Gothic, Grantha, Greek, Gujarati, Gunjala_Gondi, Gur-
|
Common (Zyyy), Coptic (Copt), Cuneiform (Xsux), Cypriot (Cprt),
|
||||||
mukhi, Han, Hangul, Hanifi_Rohingya, Hanunoo, Hatran, Hebrew, Hiragana,
|
Cypro_Minoan (Cpmn), Cyrillic (Cyrl), Deseret (Dsrt), Devanagari
|
||||||
Imperial_Aramaic, Inherited, Inscriptional_Pahlavi, Inscrip-
|
(Deva), Dives_Akuru (Diak), Dogra (Dogr), Duployan (Dupl), Egyptian_Hi-
|
||||||
tional_Parthian, Javanese, Kaithi, Kannada, Katakana, Kayah_Li,
|
eroglyphs (Egyp), Elbasan (Elba), Elymaic (Elym), Ethiopic (Ethi),
|
||||||
Kharoshthi, Khitan_Small_Script, Khmer, Khojki, Khudawadi, Lao, Latin,
|
Georgian (Geor), Glagolitic (Glag), Gothic (Goth), Grantha (Gran),
|
||||||
Lepcha, Limbu, Linear_A, Linear_B, Lisu, Lycian, Lydian, Mahajani,
|
Greek (Grek), Gujarati (Gujr), Gunjala_Gondi (Gong), Gurmukhi (Guru),
|
||||||
Makasar, Malayalam, Mandaic, Manichaean, Marchen, Masaram_Gondi, Mede-
|
Han (Hani), Hangul (Hang), Hanifi_Rohingya (Rohg), Hanunoo (Hano), Ha-
|
||||||
faidrin, Meetei_Mayek, Mende_Kikakui, Meroitic_Cursive, Meroitic_Hiero-
|
tran (Hatr), Hebrew (Hebr), Hiragana (Hira), Imperial_Aramaic (Armi),
|
||||||
glyphs, Miao, Modi, Mongolian, Mro, Multani, Myanmar, Nabataean, Nandi-
|
Inherited (Zinh), Inscriptional_Pahlavi (Phli), Inscriptional_Parthian
|
||||||
nagari, New_Tai_Lue, Newa, Nko, Nushu, Nyakeng_Puachue_Hmong, Ogham,
|
(Prti), Javanese (Java), Kaithi (Kthi), Kannada (Knda), Katakana
|
||||||
Ol_Chiki, Old_Hungarian, Old_Italic, Old_North_Arabian, Old_Permic,
|
(Kana), Kayah_Li (Kali), Kharoshthi (Khar), Khitan_Small_Script (Kits),
|
||||||
Old_Persian, Old_Sogdian, Old_South_Arabian, Old_Turkic, Old_Uyghur,
|
Khmer (Khmr), Khojki (Khoj), Khudawadi (Sind), Lao (Laoo), Latin
|
||||||
Oriya, Osage, Osmanya, Pahawh_Hmong, Palmyrene, Pau_Cin_Hau, Phags_Pa,
|
(Latn), Lepcha (Lepc), Limbu (Limb), Linear_A (Lina), Linear_B (Linb),
|
||||||
Phoenician, Psalter_Pahlavi, Rejang, Runic, Samaritan, Saurashtra,
|
Lisu (Lisu), Lycian (Lyci), Lydian (Lydi), Mahajani (Mahj), Makasar
|
||||||
Sharada, Shavian, Siddham, SignWriting, Sinhala, Sogdian, Sora_Sompeng,
|
(Maka), Malayalam (Mlym), Mandaic (Mand), Manichaean (Mani), Marchen
|
||||||
Soyombo, Sundanese, Syloti_Nagri, Syriac, Tagalog, Tagbanwa, Tai_Le,
|
(Marc), Masaram_Gondi (Gonm), Medefaidrin (Medf), Meetei_Mayek (Mtei),
|
||||||
Tai_Tham, Tai_Viet, Takri, Tamil, Tangsa, Tangut, Telugu, Thaana, Thai,
|
Mende_Kikakui (Mend), Meroitic_Cursive (Merc), Meroitic_Hieroglyphs
|
||||||
Tibetan, Tifinagh, Tirhuta, Toto, Ugaritic, Unknown, Vai, Vithkuqi,
|
(Mero), Miao (Plrd), Modi (Modi), Mongolian (Mong), Mro (Mroo), Multani
|
||||||
Wancho, Warang_Citi, Yezidi, Yi, Zanabazar_Square.
|
(Mult), Myanmar (Mymr), Nabataean (Nbat), Nandinagari (Nand),
|
||||||
|
New_Tai_Lue (Talu), Newa (Newa), Nko (Nkoo), Nushu (Nshu), Nyiak-
|
||||||
|
eng_Puachue_Hmong (Hmnp), Ogham (Ogam), Ol_Chiki (Olck), Old_Hungarian
|
||||||
|
(Hung), Old_Italic (Ital), Old_North_Arabian (Narb), Old_Permic (Perm),
|
||||||
|
Old_Persian (Xpeo), Old_Sogdian (Sogo), Old_South_Arabian (Sarb),
|
||||||
|
Old_Turkic (Orkh), Old_Uyghur (Ougr), Oriya (Orya), Osage (Osge), Os-
|
||||||
|
manya (Osma), Pahawh_Hmong (Hmng), Palmyrene (Palm), Pau_Cin_Hau
|
||||||
|
(Pauc), Phags_Pa (Phag), Phoenician (Phnx), Psalter_Pahlavi (Phlp), Re-
|
||||||
|
jang (Rjng), Runic (Runr), Samaritan (Samr), Saurashtra (Saur), Sharada
|
||||||
|
(Shrd), Shavian (Shaw), Siddham (Sidd), SignWriting (Sgnw), Sinhala
|
||||||
|
(Sinh), Sogdian (Sogd), Sora_Sompeng (Sora), Soyombo (Soyo), Sundanese
|
||||||
|
(Sund), Syloti_Nagri (Sylo), Syriac (Syrc), Tagalog (Tglg), Tagbanwa
|
||||||
|
(Tagb), Tai_Le (Tale), Tai_Tham (Lana), Tai_Viet (Tavt), Takri (Takr),
|
||||||
|
Tamil (Taml), Tangsa (Tnsa), Tangut (Tang), Telugu (Telu), Thaana
|
||||||
|
(Thaa), Thai (Thai), Tibetan (Tibt), Tifinagh (Tfng), Tirhuta (Tirh),
|
||||||
|
Toto (Toto), Ugaritic (Ugar), Vai (Vaii), Vithkuqi (Vith), Wancho
|
||||||
|
(Wcho), Warang_Citi (Wara), Yezidi (Yezi), Yi (Yiii), Zanabazar_Square
|
||||||
|
(Zanb).
|
||||||
|
|
||||||
Each character has exactly one Unicode general category property, spec-
|
Each character has exactly one Unicode general category property, spec-
|
||||||
ified by a two-letter abbreviation. For compatibility with Perl, nega-
|
ified by a two-letter abbreviation. For compatibility with Perl, nega-
|
||||||
|
@ -9707,7 +9725,7 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 22 December 2021
|
Last updated: 28 December 2021
|
||||||
Copyright (c) 1997-2021 University of Cambridge.
|
Copyright (c) 1997-2021 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@ -10379,11 +10397,11 @@ NAME
|
||||||
SAVING AND RE-USING PRECOMPILED PCRE2 PATTERNS
|
SAVING AND RE-USING PRECOMPILED PCRE2 PATTERNS
|
||||||
|
|
||||||
int32_t pcre2_serialize_decode(pcre2_code **codes,
|
int32_t pcre2_serialize_decode(pcre2_code **codes,
|
||||||
int32_t number_of_codes, const uint32_t *bytes,
|
int32_t number_of_codes, const uint8_t *bytes,
|
||||||
pcre2_general_context *gcontext);
|
pcre2_general_context *gcontext);
|
||||||
|
|
||||||
int32_t pcre2_serialize_encode(pcre2_code **codes,
|
int32_t pcre2_serialize_encode(const pcre2_code **codes,
|
||||||
int32_t number_of_codes, uint32_t **serialized_bytes,
|
int32_t number_of_codes, uint8_t **serialized_bytes,
|
||||||
PCRE2_SIZE *serialized_size, pcre2_general_context *gcontext);
|
PCRE2_SIZE *serialized_size, pcre2_general_context *gcontext);
|
||||||
|
|
||||||
void pcre2_serialize_free(uint8_t *bytes);
|
void pcre2_serialize_free(uint8_t *bytes);
|
||||||
|
@ -10507,7 +10525,6 @@ RE-USING PRECOMPILED PATTERNS
|
||||||
If this argument is NULL, malloc() and free() are used. After deserial-
|
If this argument is NULL, malloc() and free() are used. After deserial-
|
||||||
ization, the byte stream is no longer needed and can be discarded.
|
ization, the byte stream is no longer needed and can be discarded.
|
||||||
|
|
||||||
int32_t number_of_codes;
|
|
||||||
pcre2_code *list_of_codes[2];
|
pcre2_code *list_of_codes[2];
|
||||||
uint8_t *bytes = <serialized data>;
|
uint8_t *bytes = <serialized data>;
|
||||||
int32_t number_of_codes =
|
int32_t number_of_codes =
|
||||||
|
@ -10724,34 +10741,52 @@ PCRE2 SPECIAL CATEGORY PROPERTIES FOR \p and \P
|
||||||
|
|
||||||
SCRIPT MATCHING WITH \p AND \P
|
SCRIPT MATCHING WITH \p AND \P
|
||||||
|
|
||||||
The following script names are recognized in \p{sc:...} or \p{scx:...}
|
The following script names and their 4-letter abbreviations are recog-
|
||||||
items, or on their own with \p (and also \P of course):
|
nized in \p{sc:...} or \p{scx:...} items, or on their own with \p (and
|
||||||
|
also \P of course):
|
||||||
|
|
||||||
Adlam, Ahom, Anatolian_Hieroglyphs, Arabic, Armenian, Avestan, Bali-
|
Adlam (Adlm), Ahom (Ahom), Anatolian_Hieroglyphs (Hluw), Arabic (Arab),
|
||||||
nese, Bamum, Bassa_Vah, Batak, Bengali, Bhaiksuki, Bopomofo, Brahmi,
|
Armenian (Armn), Avestan (Avst), Balinese (Bali), Bamum (Bamu),
|
||||||
Braille, Buginese, Buhid, Canadian_Aboriginal, Carian, Caucasian_Alba-
|
Bassa_Vah (Bass), Batak (Batk), Bengali (Beng), Bhaiksuki (Bhks), Bopo-
|
||||||
nian, Chakma, Cham, Cherokee, Chorasmian, Common, Coptic, Cuneiform,
|
mofo (Bopo), Brahmi (Brah), Braille (Brai), Buginese (Bugi), Buhid
|
||||||
Cypriot, Cypro_Minoan, Cyrillic, Deseret, Devanagari, Dives_Akuru, Do-
|
(Buhd), Canadian_Aboriginal (Cans), Carian (Cari), Caucasian_Albanian
|
||||||
gra, Duployan, Egyptian_Hieroglyphs, Elbasan, Elymaic, Ethiopic, Geor-
|
(Aghb), Chakma (Cakm), Cham (Cham), Cherokee (Cher), Chorasmian (Chrs),
|
||||||
gian, Glagolitic, Gothic, Grantha, Greek, Gujarati, Gunjala_Gondi, Gur-
|
Common (Zyyy), Coptic (Copt), Cuneiform (Xsux), Cypriot (Cprt),
|
||||||
mukhi, Han, Hangul, Hanifi_Rohingya, Hanunoo, Hatran, Hebrew, Hiragana,
|
Cypro_Minoan (Cpmn), Cyrillic (Cyrl), Deseret (Dsrt), Devanagari
|
||||||
Imperial_Aramaic, Inherited, Inscriptional_Pahlavi, Inscrip-
|
(Deva), Dives_Akuru (Diak), Dogra (Dogr), Duployan (Dupl), Egyptian_Hi-
|
||||||
tional_Parthian, Javanese, Kaithi, Kannada, Katakana, Kayah_Li,
|
eroglyphs (Egyp), Elbasan (Elba), Elymaic (Elym), Ethiopic (Ethi),
|
||||||
Kharoshthi, Khitan_Small_Script, Khmer, Khojki, Khudawadi, Lao, Latin,
|
Georgian (Geor), Glagolitic (Glag), Gothic (Goth), Grantha (Gran),
|
||||||
Lepcha, Limbu, Linear_A, Linear_B, Lisu, Lycian, Lydian, Mahajani,
|
Greek (Grek), Gujarati (Gujr), Gunjala_Gondi (Gong), Gurmukhi (Guru),
|
||||||
Makasar, Malayalam, Mandaic, Manichaean, Marchen, Masaram_Gondi, Mede-
|
Han (Hani), Hangul (Hang), Hanifi_Rohingya (Rohg), Hanunoo (Hano), Ha-
|
||||||
faidrin, Meetei_Mayek, Mende_Kikakui, Meroitic_Cursive, Meroitic_Hiero-
|
tran (Hatr), Hebrew (Hebr), Hiragana (Hira), Imperial_Aramaic (Armi),
|
||||||
glyphs, Miao, Modi, Mongolian, Mro, Multani, Myanmar, Nabataean, Nandi-
|
Inherited (Zinh), Inscriptional_Pahlavi (Phli), Inscriptional_Parthian
|
||||||
nagari, New_Tai_Lue, Newa, Nko, Nushu, Nyakeng_Puachue_Hmong, Ogham,
|
(Prti), Javanese (Java), Kaithi (Kthi), Kannada (Knda), Katakana
|
||||||
Ol_Chiki, Old_Hungarian, Old_Italic, Old_North_Arabian, Old_Permic,
|
(Kana), Kayah_Li (Kali), Kharoshthi (Khar), Khitan_Small_Script (Kits),
|
||||||
Old_Persian, Old_Sogdian, Old_South_Arabian, Old_Turkic, Old_Uyghur,
|
Khmer (Khmr), Khojki (Khoj), Khudawadi (Sind), Lao (Laoo), Latin
|
||||||
Oriya, Osage, Osmanya, Pahawh_Hmong, Palmyrene, Pau_Cin_Hau, Phags_Pa,
|
(Latn), Lepcha (Lepc), Limbu (Limb), Linear_A (Lina), Linear_B (Linb),
|
||||||
Phoenician, Psalter_Pahlavi, Rejang, Runic, Samaritan, Saurashtra,
|
Lisu (Lisu), Lycian (Lyci), Lydian (Lydi), Mahajani (Mahj), Makasar
|
||||||
Sharada, Shavian, Siddham, SignWriting, Sinhala, Sogdian, Sora_Sompeng,
|
(Maka), Malayalam (Mlym), Mandaic (Mand), Manichaean (Mani), Marchen
|
||||||
Soyombo, Sundanese, Syloti_Nagri, Syriac, Tagalog, Tagbanwa, Tai_Le,
|
(Marc), Masaram_Gondi (Gonm), Medefaidrin (Medf), Meetei_Mayek (Mtei),
|
||||||
Tai_Tham, Tai_Viet, Takri, Tamil, Tangsa, Tangut, Telugu, Thaana, Thai,
|
Mende_Kikakui (Mend), Meroitic_Cursive (Merc), Meroitic_Hieroglyphs
|
||||||
Tibetan, Tifinagh, Tirhuta, Toto, Ugaritic, Vai, Vithkuqi, Wancho,
|
(Mero), Miao (Plrd), Modi (Modi), Mongolian (Mong), Mro (Mroo), Multani
|
||||||
Warang_Citi, Yezidi, Yi, Zanabazar_Square.
|
(Mult), Myanmar (Mymr), Nabataean (Nbat), Nandinagari (Nand),
|
||||||
|
New_Tai_Lue (Talu), Newa (Newa), Nko (Nkoo), Nushu (Nshu), Nyiak-
|
||||||
|
eng_Puachue_Hmong (Hmnp), Ogham (Ogam), Ol_Chiki (Olck), Old_Hungarian
|
||||||
|
(Hung), Old_Italic (Ital), Old_North_Arabian (Narb), Old_Permic (Perm),
|
||||||
|
Old_Persian (Xpeo), Old_Sogdian (Sogo), Old_South_Arabian (Sarb),
|
||||||
|
Old_Turkic (Orkh), Old_Uyghur (Ougr), Oriya (Orya), Osage (Osge), Os-
|
||||||
|
manya (Osma), Pahawh_Hmong (Hmng), Palmyrene (Palm), Pau_Cin_Hau
|
||||||
|
(Pauc), Phags_Pa (Phag), Phoenician (Phnx), Psalter_Pahlavi (Phlp), Re-
|
||||||
|
jang (Rjng), Runic (Runr), Samaritan (Samr), Saurashtra (Saur), Sharada
|
||||||
|
(Shrd), Shavian (Shaw), Siddham (Sidd), SignWriting (Sgnw), Sinhala
|
||||||
|
(Sinh), Sogdian (Sogd), Sora_Sompeng (Sora), Soyombo (Soyo), Sundanese
|
||||||
|
(Sund), Syloti_Nagri (Sylo), Syriac (Syrc), Tagalog (Tglg), Tagbanwa
|
||||||
|
(Tagb), Tai_Le (Tale), Tai_Tham (Lana), Tai_Viet (Tavt), Takri (Takr),
|
||||||
|
Tamil (Taml), Tangsa (Tnsa), Tangut (Tang), Telugu (Telu), Thaana
|
||||||
|
(Thaa), Thai (Thai), Tibetan (Tibt), Tifinagh (Tfng), Tirhuta (Tirh),
|
||||||
|
Toto (Toto), Ugaritic (Ugar), Vai (Vaii), Vithkuqi (Vith), Wancho
|
||||||
|
(Wcho), Warang_Citi (Wara), Yezidi (Yezi), Yi (Yiii), Zanabazar_Square
|
||||||
|
(Zanb).
|
||||||
|
|
||||||
|
|
||||||
BIDI_PROPERTIES FOR \p AND \P
|
BIDI_PROPERTIES FOR \p AND \P
|
||||||
|
@ -11117,7 +11152,7 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 22 December 2021
|
Last updated: 28 December 2021
|
||||||
Copyright (c) 1997-2021 University of Cambridge.
|
Copyright (c) 1997-2021 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2PATTERN 3 "22 December 2021" "PCRE2 10.40"
|
.TH PCRE2PATTERN 3 "28 December 2021" "PCRE2 10.40"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
||||||
|
@ -807,170 +807,169 @@ interpretation at release 5.26 and PCRE2 changed at release 10.40.
|
||||||
Unassigned characters (and in non-UTF 32-bit mode, characters with code points
|
Unassigned characters (and in non-UTF 32-bit mode, characters with code points
|
||||||
greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not
|
greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not
|
||||||
part of an identified script are lumped together as "Common". The current list
|
part of an identified script are lumped together as "Common". The current list
|
||||||
of scripts is:
|
of script names and their 4-letter abbreviations is:
|
||||||
.P
|
.P
|
||||||
Adlam,
|
Adlam (Adlm),
|
||||||
Ahom,
|
Ahom (Ahom),
|
||||||
Anatolian_Hieroglyphs,
|
Anatolian_Hieroglyphs (Hluw),
|
||||||
Arabic,
|
Arabic (Arab),
|
||||||
Armenian,
|
Armenian (Armn),
|
||||||
Avestan,
|
Avestan (Avst),
|
||||||
Balinese,
|
Balinese (Bali),
|
||||||
Bamum,
|
Bamum (Bamu),
|
||||||
Bassa_Vah,
|
Bassa_Vah (Bass),
|
||||||
Batak,
|
Batak (Batk),
|
||||||
Bengali,
|
Bengali (Beng),
|
||||||
Bhaiksuki,
|
Bhaiksuki (Bhks),
|
||||||
Bopomofo,
|
Bopomofo (Bopo),
|
||||||
Brahmi,
|
Brahmi (Brah),
|
||||||
Braille,
|
Braille (Brai),
|
||||||
Buginese,
|
Buginese (Bugi),
|
||||||
Buhid,
|
Buhid (Buhd),
|
||||||
Canadian_Aboriginal,
|
Canadian_Aboriginal (Cans),
|
||||||
Carian,
|
Carian (Cari),
|
||||||
Caucasian_Albanian,
|
Caucasian_Albanian (Aghb),
|
||||||
Chakma,
|
Chakma (Cakm),
|
||||||
Cham,
|
Cham (Cham),
|
||||||
Cherokee,
|
Cherokee (Cher),
|
||||||
Chorasmian,
|
Chorasmian (Chrs),
|
||||||
Common,
|
Common (Zyyy),
|
||||||
Coptic,
|
Coptic (Copt),
|
||||||
Cuneiform,
|
Cuneiform (Xsux),
|
||||||
Cypriot,
|
Cypriot (Cprt),
|
||||||
Cypro_Minoan,
|
Cypro_Minoan (Cpmn),
|
||||||
Cyrillic,
|
Cyrillic (Cyrl),
|
||||||
Deseret,
|
Deseret (Dsrt),
|
||||||
Devanagari,
|
Devanagari (Deva),
|
||||||
Dives_Akuru,
|
Dives_Akuru (Diak),
|
||||||
Dogra,
|
Dogra (Dogr),
|
||||||
Duployan,
|
Duployan (Dupl),
|
||||||
Egyptian_Hieroglyphs,
|
Egyptian_Hieroglyphs (Egyp),
|
||||||
Elbasan,
|
Elbasan (Elba),
|
||||||
Elymaic,
|
Elymaic (Elym),
|
||||||
Ethiopic,
|
Ethiopic (Ethi),
|
||||||
Georgian,
|
Georgian (Geor),
|
||||||
Glagolitic,
|
Glagolitic (Glag),
|
||||||
Gothic,
|
Gothic (Goth),
|
||||||
Grantha,
|
Grantha (Gran),
|
||||||
Greek,
|
Greek (Grek),
|
||||||
Gujarati,
|
Gujarati (Gujr),
|
||||||
Gunjala_Gondi,
|
Gunjala_Gondi (Gong),
|
||||||
Gurmukhi,
|
Gurmukhi (Guru),
|
||||||
Han,
|
Han (Hani),
|
||||||
Hangul,
|
Hangul (Hang),
|
||||||
Hanifi_Rohingya,
|
Hanifi_Rohingya (Rohg),
|
||||||
Hanunoo,
|
Hanunoo (Hano),
|
||||||
Hatran,
|
Hatran (Hatr),
|
||||||
Hebrew,
|
Hebrew (Hebr),
|
||||||
Hiragana,
|
Hiragana (Hira),
|
||||||
Imperial_Aramaic,
|
Imperial_Aramaic (Armi),
|
||||||
Inherited,
|
Inherited (Zinh),
|
||||||
Inscriptional_Pahlavi,
|
Inscriptional_Pahlavi (Phli),
|
||||||
Inscriptional_Parthian,
|
Inscriptional_Parthian (Prti),
|
||||||
Javanese,
|
Javanese (Java),
|
||||||
Kaithi,
|
Kaithi (Kthi),
|
||||||
Kannada,
|
Kannada (Knda),
|
||||||
Katakana,
|
Katakana (Kana),
|
||||||
Kayah_Li,
|
Kayah_Li (Kali),
|
||||||
Kharoshthi,
|
Kharoshthi (Khar),
|
||||||
Khitan_Small_Script,
|
Khitan_Small_Script (Kits),
|
||||||
Khmer,
|
Khmer (Khmr),
|
||||||
Khojki,
|
Khojki (Khoj),
|
||||||
Khudawadi,
|
Khudawadi (Sind),
|
||||||
Lao,
|
Lao (Laoo),
|
||||||
Latin,
|
Latin (Latn),
|
||||||
Lepcha,
|
Lepcha (Lepc),
|
||||||
Limbu,
|
Limbu (Limb),
|
||||||
Linear_A,
|
Linear_A (Lina),
|
||||||
Linear_B,
|
Linear_B (Linb),
|
||||||
Lisu,
|
Lisu (Lisu),
|
||||||
Lycian,
|
Lycian (Lyci),
|
||||||
Lydian,
|
Lydian (Lydi),
|
||||||
Mahajani,
|
Mahajani (Mahj),
|
||||||
Makasar,
|
Makasar (Maka),
|
||||||
Malayalam,
|
Malayalam (Mlym),
|
||||||
Mandaic,
|
Mandaic (Mand),
|
||||||
Manichaean,
|
Manichaean (Mani),
|
||||||
Marchen,
|
Marchen (Marc),
|
||||||
Masaram_Gondi,
|
Masaram_Gondi (Gonm),
|
||||||
Medefaidrin,
|
Medefaidrin (Medf),
|
||||||
Meetei_Mayek,
|
Meetei_Mayek (Mtei),
|
||||||
Mende_Kikakui,
|
Mende_Kikakui (Mend),
|
||||||
Meroitic_Cursive,
|
Meroitic_Cursive (Merc),
|
||||||
Meroitic_Hieroglyphs,
|
Meroitic_Hieroglyphs (Mero),
|
||||||
Miao,
|
Miao (Miao),
|
||||||
Modi,
|
Modi (Modi),
|
||||||
Mongolian,
|
Mongolian (Mong),
|
||||||
Mro,
|
Mro (Mroo),
|
||||||
Multani,
|
Multani (Mult),
|
||||||
Myanmar,
|
Myanmar (Mymr),
|
||||||
Nabataean,
|
Nabataean (Nbat),
|
||||||
Nandinagari,
|
Nandinagari (Nand),
|
||||||
New_Tai_Lue,
|
New_Tai_Lue (Talu),
|
||||||
Newa,
|
Newa (Newa),
|
||||||
Nko,
|
Nko (Nkoo),
|
||||||
Nushu,
|
Nushu (Nshu),
|
||||||
Nyakeng_Puachue_Hmong,
|
Nyiakeng_Puachue_Hmong (Hmnp),
|
||||||
Ogham,
|
Ogham (Ogam),
|
||||||
Ol_Chiki,
|
Ol_Chiki (Olck),
|
||||||
Old_Hungarian,
|
Old_Hungarian (Hung),
|
||||||
Old_Italic,
|
Old_Italic (Ital),
|
||||||
Old_North_Arabian,
|
Old_North_Arabian (Narb),
|
||||||
Old_Permic,
|
Old_Permic (Perm),
|
||||||
Old_Persian,
|
Old_Persian (Xpeo),
|
||||||
Old_Sogdian,
|
Old_Sogdian (Sogo),
|
||||||
Old_South_Arabian,
|
Old_South_Arabian (Sarb),
|
||||||
Old_Turkic,
|
Old_Turkic (Orkh),
|
||||||
Old_Uyghur,
|
Old_Uyghur (Ougr),
|
||||||
Oriya,
|
Oriya (Orya),
|
||||||
Osage,
|
Osage (Osge),
|
||||||
Osmanya,
|
Osmanya (Osma),
|
||||||
Pahawh_Hmong,
|
Pahawh_Hmong (Hmng),
|
||||||
Palmyrene,
|
Palmyrene (Palm),
|
||||||
Pau_Cin_Hau,
|
Pau_Cin_Hau (Pauc),
|
||||||
Phags_Pa,
|
Phags_Pa (Phag),
|
||||||
Phoenician,
|
Phoenician (Phnx),
|
||||||
Psalter_Pahlavi,
|
Psalter_Pahlavi (Phlp),
|
||||||
Rejang,
|
Rejang (Rjng),
|
||||||
Runic,
|
Runic (Runr),
|
||||||
Samaritan,
|
Samaritan (Samr),
|
||||||
Saurashtra,
|
Saurashtra (Saur),
|
||||||
Sharada,
|
Sharada (Shrd),
|
||||||
Shavian,
|
Shavian (Shaw),
|
||||||
Siddham,
|
Siddham (Sidd),
|
||||||
SignWriting,
|
SignWriting (Sgnw),
|
||||||
Sinhala,
|
Sinhala (Sinh),
|
||||||
Sogdian,
|
Sogdian (Sogd),
|
||||||
Sora_Sompeng,
|
Sora_Sompeng (Sora),
|
||||||
Soyombo,
|
Soyombo (Soyo),
|
||||||
Sundanese,
|
Sundanese (Sund),
|
||||||
Syloti_Nagri,
|
Syloti_Nagri (Sylo),
|
||||||
Syriac,
|
Syriac (Syrc),
|
||||||
Tagalog,
|
Tagalog (Tglg),
|
||||||
Tagbanwa,
|
Tagbanwa (Tagb),
|
||||||
Tai_Le,
|
Tai_Le (Tale),
|
||||||
Tai_Tham,
|
Tai_Tham (Lana),
|
||||||
Tai_Viet,
|
Tai_Viet (Tavt),
|
||||||
Takri,
|
Takri (Takr),
|
||||||
Tamil,
|
Tamil (Taml),
|
||||||
Tangsa,
|
Tangsa (Tnsa),
|
||||||
Tangut,
|
Tangut (Tang),
|
||||||
Telugu,
|
Telugu (Telu),
|
||||||
Thaana,
|
Thaana (Thaa),
|
||||||
Thai,
|
Thai (Thai),
|
||||||
Tibetan,
|
Tibetan (Tibt),
|
||||||
Tifinagh,
|
Tifinagh (Tfng),
|
||||||
Tirhuta,
|
Tirhuta (Tirh),
|
||||||
Toto,
|
Toto (Toto),
|
||||||
Ugaritic,
|
Ugaritic (Ugar),
|
||||||
Unknown,
|
Vai (Vaii),
|
||||||
Vai,
|
Vithkuqi (Vith),
|
||||||
Vithkuqi,
|
Wancho (Wcho),
|
||||||
Wancho,
|
Warang_Citi (Wara),
|
||||||
Warang_Citi,
|
Yezidi (Yezi),
|
||||||
Yezidi,
|
Yi (Yiii),
|
||||||
Yi,
|
Zanabazar_Square (Zanb).
|
||||||
Zanabazar_Square.
|
|
||||||
.P
|
.P
|
||||||
Each character has exactly one Unicode general category property, specified by
|
Each character has exactly one Unicode general category property, specified by
|
||||||
a two-letter abbreviation. For compatibility with Perl, negation can be
|
a two-letter abbreviation. For compatibility with Perl, negation can be
|
||||||
|
@ -3956,6 +3955,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 22 December 2021
|
Last updated: 28 December 2021
|
||||||
Copyright (c) 1997-2021 University of Cambridge.
|
Copyright (c) 1997-2021 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2SYNTAX 3 "22 December 2021" "PCRE2 10.40"
|
.TH PCRE2SYNTAX 3 "28 December 2021" "PCRE2 10.40"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY"
|
.SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY"
|
||||||
|
@ -175,170 +175,171 @@ at release 5.18.
|
||||||
.SH "SCRIPT MATCHING WITH \ep AND \eP"
|
.SH "SCRIPT MATCHING WITH \ep AND \eP"
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
The following script names are recognized in \ep{sc:...} or \ep{scx:...} items,
|
The following script names and their 4-letter abbreviations are recognized in
|
||||||
or on their own with \ep (and also \eP of course):
|
\ep{sc:...} or \ep{scx:...} items, or on their own with \ep (and also \eP of
|
||||||
|
course):
|
||||||
.P
|
.P
|
||||||
Adlam,
|
Adlam (Adlm),
|
||||||
Ahom,
|
Ahom (Ahom),
|
||||||
Anatolian_Hieroglyphs,
|
Anatolian_Hieroglyphs (Hluw),
|
||||||
Arabic,
|
Arabic (Arab),
|
||||||
Armenian,
|
Armenian (Armn),
|
||||||
Avestan,
|
Avestan (Avst),
|
||||||
Balinese,
|
Balinese (Bali),
|
||||||
Bamum,
|
Bamum (Bamu),
|
||||||
Bassa_Vah,
|
Bassa_Vah (Bass),
|
||||||
Batak,
|
Batak (Batk),
|
||||||
Bengali,
|
Bengali (Beng),
|
||||||
Bhaiksuki,
|
Bhaiksuki (Bhks),
|
||||||
Bopomofo,
|
Bopomofo (Bopo),
|
||||||
Brahmi,
|
Brahmi (Brah),
|
||||||
Braille,
|
Braille (Brai),
|
||||||
Buginese,
|
Buginese (Bugi),
|
||||||
Buhid,
|
Buhid (Buhd),
|
||||||
Canadian_Aboriginal,
|
Canadian_Aboriginal (Cans),
|
||||||
Carian,
|
Carian (Cari),
|
||||||
Caucasian_Albanian,
|
Caucasian_Albanian (Aghb),
|
||||||
Chakma,
|
Chakma (Cakm),
|
||||||
Cham,
|
Cham (Cham),
|
||||||
Cherokee,
|
Cherokee (Cher),
|
||||||
Chorasmian,
|
Chorasmian (Chrs),
|
||||||
Common,
|
Common (Zyyy),
|
||||||
Coptic,
|
Coptic (Copt),
|
||||||
Cuneiform,
|
Cuneiform (Xsux),
|
||||||
Cypriot,
|
Cypriot (Cprt),
|
||||||
Cypro_Minoan,
|
Cypro_Minoan (Cpmn),
|
||||||
Cyrillic,
|
Cyrillic (Cyrl),
|
||||||
Deseret,
|
Deseret (Dsrt),
|
||||||
Devanagari,
|
Devanagari (Deva),
|
||||||
Dives_Akuru,
|
Dives_Akuru (Diak),
|
||||||
Dogra,
|
Dogra (Dogr),
|
||||||
Duployan,
|
Duployan (Dupl),
|
||||||
Egyptian_Hieroglyphs,
|
Egyptian_Hieroglyphs (Egyp),
|
||||||
Elbasan,
|
Elbasan (Elba),
|
||||||
Elymaic,
|
Elymaic (Elym),
|
||||||
Ethiopic,
|
Ethiopic (Ethi),
|
||||||
Georgian,
|
Georgian (Geor),
|
||||||
Glagolitic,
|
Glagolitic (Glag),
|
||||||
Gothic,
|
Gothic (Goth),
|
||||||
Grantha,
|
Grantha (Gran),
|
||||||
Greek,
|
Greek (Grek),
|
||||||
Gujarati,
|
Gujarati (Gujr),
|
||||||
Gunjala_Gondi,
|
Gunjala_Gondi (Gong),
|
||||||
Gurmukhi,
|
Gurmukhi (Guru),
|
||||||
Han,
|
Han (Hani),
|
||||||
Hangul,
|
Hangul (Hang),
|
||||||
Hanifi_Rohingya,
|
Hanifi_Rohingya (Rohg),
|
||||||
Hanunoo,
|
Hanunoo (Hano),
|
||||||
Hatran,
|
Hatran (Hatr),
|
||||||
Hebrew,
|
Hebrew (Hebr),
|
||||||
Hiragana,
|
Hiragana (Hira),
|
||||||
Imperial_Aramaic,
|
Imperial_Aramaic (Armi),
|
||||||
Inherited,
|
Inherited (Zinh),
|
||||||
Inscriptional_Pahlavi,
|
Inscriptional_Pahlavi (Phli),
|
||||||
Inscriptional_Parthian,
|
Inscriptional_Parthian (Prti),
|
||||||
Javanese,
|
Javanese (Java),
|
||||||
Kaithi,
|
Kaithi (Kthi),
|
||||||
Kannada,
|
Kannada (Knda),
|
||||||
Katakana,
|
Katakana (Kana),
|
||||||
Kayah_Li,
|
Kayah_Li (Kali),
|
||||||
Kharoshthi,
|
Kharoshthi (Khar),
|
||||||
Khitan_Small_Script,
|
Khitan_Small_Script (Kits),
|
||||||
Khmer,
|
Khmer (Khmr),
|
||||||
Khojki,
|
Khojki (Khoj),
|
||||||
Khudawadi,
|
Khudawadi (Sind),
|
||||||
Lao,
|
Lao (Laoo),
|
||||||
Latin,
|
Latin (Latn),
|
||||||
Lepcha,
|
Lepcha (Lepc),
|
||||||
Limbu,
|
Limbu (Limb),
|
||||||
Linear_A,
|
Linear_A (Lina),
|
||||||
Linear_B,
|
Linear_B (Linb),
|
||||||
Lisu,
|
Lisu (Lisu),
|
||||||
Lycian,
|
Lycian (Lyci),
|
||||||
Lydian,
|
Lydian (Lydi),
|
||||||
Mahajani,
|
Mahajani (Mahj),
|
||||||
Makasar,
|
Makasar (Maka),
|
||||||
Malayalam,
|
Malayalam (Mlym),
|
||||||
Mandaic,
|
Mandaic (Mand),
|
||||||
Manichaean,
|
Manichaean (Mani),
|
||||||
Marchen,
|
Marchen (Marc),
|
||||||
Masaram_Gondi,
|
Masaram_Gondi (Gonm),
|
||||||
Medefaidrin,
|
Medefaidrin (Medf),
|
||||||
Meetei_Mayek,
|
Meetei_Mayek (Mtei),
|
||||||
Mende_Kikakui,
|
Mende_Kikakui (Mend),
|
||||||
Meroitic_Cursive,
|
Meroitic_Cursive (Merc),
|
||||||
Meroitic_Hieroglyphs,
|
Meroitic_Hieroglyphs (Mero),
|
||||||
Miao,
|
Miao (Miao),
|
||||||
Modi,
|
Modi (Modi),
|
||||||
Mongolian,
|
Mongolian (Mong),
|
||||||
Mro,
|
Mro (Mroo),
|
||||||
Multani,
|
Multani (Mult),
|
||||||
Myanmar,
|
Myanmar (Mymr),
|
||||||
Nabataean,
|
Nabataean (Nbat),
|
||||||
Nandinagari,
|
Nandinagari (Nand),
|
||||||
New_Tai_Lue,
|
New_Tai_Lue (Talu),
|
||||||
Newa,
|
Newa (Newa),
|
||||||
Nko,
|
Nko (Nkoo),
|
||||||
Nushu,
|
Nushu (Nshu),
|
||||||
Nyakeng_Puachue_Hmong,
|
Nyiakeng_Puachue_Hmong (Hmnp),
|
||||||
Ogham,
|
Ogham (Ogam),
|
||||||
Ol_Chiki,
|
Ol_Chiki (Olck),
|
||||||
Old_Hungarian,
|
Old_Hungarian (Hung),
|
||||||
Old_Italic,
|
Old_Italic (Ital),
|
||||||
Old_North_Arabian,
|
Old_North_Arabian (Narb),
|
||||||
Old_Permic,
|
Old_Permic (Perm),
|
||||||
Old_Persian,
|
Old_Persian (Xpeo),
|
||||||
Old_Sogdian,
|
Old_Sogdian (Sogo),
|
||||||
Old_South_Arabian,
|
Old_South_Arabian (Sarb),
|
||||||
Old_Turkic,
|
Old_Turkic (Orkh),
|
||||||
Old_Uyghur,
|
Old_Uyghur (Ougr),
|
||||||
Oriya,
|
Oriya (Orya),
|
||||||
Osage,
|
Osage (Osge),
|
||||||
Osmanya,
|
Osmanya (Osma),
|
||||||
Pahawh_Hmong,
|
Pahawh_Hmong (Hmng),
|
||||||
Palmyrene,
|
Palmyrene (Palm),
|
||||||
Pau_Cin_Hau,
|
Pau_Cin_Hau (Pauc),
|
||||||
Phags_Pa,
|
Phags_Pa (Phag),
|
||||||
Phoenician,
|
Phoenician (Phnx),
|
||||||
Psalter_Pahlavi,
|
Psalter_Pahlavi (Phlp),
|
||||||
Rejang,
|
Rejang (Rjng),
|
||||||
Runic,
|
Runic (Runr),
|
||||||
Samaritan,
|
Samaritan (Samr),
|
||||||
Saurashtra,
|
Saurashtra (Saur),
|
||||||
Sharada,
|
Sharada (Shrd),
|
||||||
Shavian,
|
Shavian (Shaw),
|
||||||
Siddham,
|
Siddham (Sidd),
|
||||||
SignWriting,
|
SignWriting (Sgnw),
|
||||||
Sinhala,
|
Sinhala (Sinh),
|
||||||
Sogdian,
|
Sogdian (Sogd),
|
||||||
Sora_Sompeng,
|
Sora_Sompeng (Sora),
|
||||||
Soyombo,
|
Soyombo (Soyo),
|
||||||
Sundanese,
|
Sundanese (Sund),
|
||||||
Syloti_Nagri,
|
Syloti_Nagri (Sylo),
|
||||||
Syriac,
|
Syriac (Syrc),
|
||||||
Tagalog,
|
Tagalog (Tglg),
|
||||||
Tagbanwa,
|
Tagbanwa (Tagb),
|
||||||
Tai_Le,
|
Tai_Le (Tale),
|
||||||
Tai_Tham,
|
Tai_Tham (Lana),
|
||||||
Tai_Viet,
|
Tai_Viet (Tavt),
|
||||||
Takri,
|
Takri (Takr),
|
||||||
Tamil,
|
Tamil (Taml),
|
||||||
Tangsa,
|
Tangsa (Tnsa),
|
||||||
Tangut,
|
Tangut (Tang),
|
||||||
Telugu,
|
Telugu (Telu),
|
||||||
Thaana,
|
Thaana (Thaa),
|
||||||
Thai,
|
Thai (Thai),
|
||||||
Tibetan,
|
Tibetan (Tibt),
|
||||||
Tifinagh,
|
Tifinagh (Tfng),
|
||||||
Tirhuta,
|
Tirhuta (Tirh),
|
||||||
Toto,
|
Toto (Toto),
|
||||||
Ugaritic,
|
Ugaritic (Ugar),
|
||||||
Vai,
|
Vai (Vaii),
|
||||||
Vithkuqi,
|
Vithkuqi (Vith),
|
||||||
Wancho,
|
Wancho (Wcho),
|
||||||
Warang_Citi,
|
Warang_Citi (Wara),
|
||||||
Yezidi,
|
Yezidi (Yezi),
|
||||||
Yi,
|
Yi (Yiii),
|
||||||
Zanabazar_Square.
|
Zanabazar_Square (Zanb).
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
.SH "BIDI_PROPERTIES FOR \ep AND \eP"
|
.SH "BIDI_PROPERTIES FOR \ep AND \eP"
|
||||||
|
@ -727,6 +728,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 22 December 2021
|
Last updated: 28 December 2021
|
||||||
Copyright (c) 1997-2021 University of Cambridge.
|
Copyright (c) 1997-2021 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -38,8 +38,11 @@
|
||||||
# accordance with Unicode's "loose matching" rules, which Perl observes.
|
# accordance with Unicode's "loose matching" rules, which Perl observes.
|
||||||
# Changed default script type from PT_SC to PT_SCX, 18-December-2021
|
# Changed default script type from PT_SC to PT_SCX, 18-December-2021
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
#
|
||||||
# Note subsequent changes here:
|
# Note subsequent changes here:
|
||||||
|
#
|
||||||
|
# 27-December-2021: Added support for 4-letter script abbreviations.
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
# Import common data lists and functions
|
# Import common data lists and functions
|
||||||
|
@ -79,15 +82,27 @@ def stdnames(x):
|
||||||
return y
|
return y
|
||||||
|
|
||||||
std_script_names = stdnames(script_names)
|
std_script_names = stdnames(script_names)
|
||||||
|
std_script_abbrevs = stdnames(script_abbrevs)
|
||||||
std_category_names = stdnames(category_names)
|
std_category_names = stdnames(category_names)
|
||||||
std_general_category_names = stdnames(general_category_names)
|
std_general_category_names = stdnames(general_category_names)
|
||||||
std_bidi_class_names = stdnames(bidi_class_names)
|
std_bidi_class_names = stdnames(bidi_class_names)
|
||||||
|
|
||||||
# Create the table, starting with the Unicode script, category and bidi class
|
# Create the table, starting with the Unicode script, category and bidi class
|
||||||
# names. We keep both the standardized name and the original, because the
|
# names. We keep both the standardized name and the original, because the
|
||||||
# latter is used for the ucp_xx names.
|
# latter is used for the ucp_xx names. NOTE: for the script abbreviations, we
|
||||||
|
# still use the full original names.
|
||||||
|
|
||||||
utt_table = list(zip(std_script_names, script_names, ['PT_SCX'] * len(script_names)))
|
utt_table = list(zip(std_script_names, script_names, ['PT_SCX'] * len(script_names)))
|
||||||
|
utt_table += list(zip(std_script_abbrevs, script_names, ['PT_SCX'] * len(script_abbrevs)))
|
||||||
|
|
||||||
|
# At least one script abbreviation is the same as the full name of the script,
|
||||||
|
# so we must remove duplicates. It doesn't matter if this operation changes the
|
||||||
|
# order, because we are going to sort the list later.
|
||||||
|
|
||||||
|
utt_table = list(set(utt_table))
|
||||||
|
|
||||||
|
# Add the remaining property lists
|
||||||
|
|
||||||
utt_table += list(zip(std_category_names, category_names, ['PT_PC'] * len(category_names)))
|
utt_table += list(zip(std_category_names, category_names, ['PT_PC'] * len(category_names)))
|
||||||
utt_table += list(zip(std_general_category_names, general_category_names, ['PT_GC'] * len(general_category_names)))
|
utt_table += list(zip(std_general_category_names, general_category_names, ['PT_GC'] * len(general_category_names)))
|
||||||
utt_table += list(zip(std_bidi_class_names, bidi_class_names, ['PT_BIDICL'] * len(bidi_class_names)))
|
utt_table += list(zip(std_bidi_class_names, bidi_class_names, ['PT_BIDICL'] * len(bidi_class_names)))
|
||||||
|
|
|
@ -299,22 +299,48 @@ return isatty(fileno(stdin));
|
||||||
* Get script name from ucp ident *
|
* Get script name from ucp ident *
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
|
||||||
|
/* The utt table contains both the full script names and the 4-letter
|
||||||
|
abbreviations. So search for both and use the longer if two are found, unless
|
||||||
|
the first one is only 3 characters (some scripts have 3-character names). If
|
||||||
|
this were not just a test program it might be worth making some kind of reverse
|
||||||
|
index. */
|
||||||
|
|
||||||
static const char *
|
static const char *
|
||||||
get_scriptname(int script)
|
get_scriptname(int script)
|
||||||
{
|
{
|
||||||
size_t i;
|
size_t i, j, len;
|
||||||
const ucp_type_table *u;
|
size_t foundlist[2];
|
||||||
|
const char *yield;
|
||||||
|
|
||||||
|
j = 0;
|
||||||
for (i = 0; i < PRIV(utt_size); i++)
|
for (i = 0; i < PRIV(utt_size); i++)
|
||||||
{
|
{
|
||||||
u = PRIV(utt) + i;
|
const ucp_type_table *u = PRIV(utt) + i;
|
||||||
if (u->type == PT_SCX && u->value == script) break;
|
if (u->type == PT_SCX && u->value == script)
|
||||||
|
{
|
||||||
|
foundlist[j++] = i;
|
||||||
|
if (j >= 2) break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i < PRIV(utt_size))
|
if (j == 0) return "??";
|
||||||
return PRIV(utt_names) + u->name_offset;
|
|
||||||
|
|
||||||
return "??";
|
yield = NULL;
|
||||||
|
len = 0;
|
||||||
|
|
||||||
|
for (i = 0; i < j; i++)
|
||||||
|
{
|
||||||
|
const char *s = PRIV(utt_names) + (PRIV(utt) + foundlist[i])->name_offset;
|
||||||
|
size_t sl = strlen(s);
|
||||||
|
if (sl > len)
|
||||||
|
{
|
||||||
|
yield = s;
|
||||||
|
if (sl == 3) break;
|
||||||
|
len = sl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return yield;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -15,3 +15,4 @@ find bidi RLO
|
||||||
find bidi S
|
find bidi S
|
||||||
find bidi WS
|
find bidi WS
|
||||||
find bidi_control
|
find bidi_control
|
||||||
|
find script bopo
|
||||||
|
|
|
@ -218,3 +218,7 @@ U+2066 *LRI Control: Format, common, Control
|
||||||
U+2067 *RLI Control: Format, common, Control
|
U+2067 *RLI Control: Format, common, Control
|
||||||
U+2068 *FSI Control: Format, common, Control
|
U+2068 *FSI Control: Format, common, Control
|
||||||
U+2069 *PDI Control: Format, common, Control
|
U+2069 *PDI Control: Format, common, Control
|
||||||
|
find script bopo
|
||||||
|
U+02EA..U+02EB ON Symbol: Modifier symbol, bopomofo, Other
|
||||||
|
U+3105..U+312F L Letter: Other letter, bopomofo, Other
|
||||||
|
U+31A0..U+31BF L Letter: Other letter, bopomofo, Other
|
||||||
|
|
|
@ -230,21 +230,48 @@ for (; len > 0; len--)
|
||||||
/* When there is no UTF/UCP support, the table of names does not exist. This
|
/* When there is no UTF/UCP support, the table of names does not exist. This
|
||||||
function should not be called in such configurations, because a pattern that
|
function should not be called in such configurations, because a pattern that
|
||||||
tries to use Unicode properties won't compile. Rather than put lots of #ifdefs
|
tries to use Unicode properties won't compile. Rather than put lots of #ifdefs
|
||||||
into the main code, however, we just put one into this function. */
|
into the main code, however, we just put one into this function.
|
||||||
|
|
||||||
|
Now that the table contains both full script names and their 4-character
|
||||||
|
abbreviations, we do some fiddling to try to get the full name, which is either
|
||||||
|
the longer of two found names, or a 3-character name. */
|
||||||
|
|
||||||
static const char *
|
static const char *
|
||||||
get_ucpname(unsigned int ptype, unsigned int pvalue)
|
get_ucpname(unsigned int ptype, unsigned int pvalue)
|
||||||
{
|
{
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
int i;
|
int count = 0;
|
||||||
|
const char *yield = "??";
|
||||||
|
size_t len = 0;
|
||||||
|
|
||||||
if (ptype == PT_SC) ptype = PT_SCX; /* Table has scx values */
|
if (ptype == PT_SC) ptype = PT_SCX; /* Table has scx values */
|
||||||
for (i = PRIV(utt_size) - 1; i >= 0; i--)
|
|
||||||
|
for (int i = PRIV(utt_size) - 1; i >= 0; i--)
|
||||||
{
|
{
|
||||||
if (ptype == PRIV(utt)[i].type && pvalue == PRIV(utt)[i].value) break;
|
const ucp_type_table *u = PRIV(utt) + i;
|
||||||
|
|
||||||
|
if (ptype == u->type && pvalue == u->value)
|
||||||
|
{
|
||||||
|
const char *s = PRIV(utt_names) + u->name_offset;
|
||||||
|
size_t sl = strlen(s);
|
||||||
|
|
||||||
|
if (sl == 3)
|
||||||
|
{
|
||||||
|
yield = s;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sl > len)
|
||||||
|
{
|
||||||
|
yield = s;
|
||||||
|
len = sl;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (++count >= 2) break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return (i >= 0)? PRIV(utt_names) + PRIV(utt)[i].name_offset : "??";
|
return yield;
|
||||||
|
|
||||||
#else /* No UTF support */
|
#else /* No UTF support */
|
||||||
(void)ptype;
|
(void)ptype;
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -2641,4 +2641,7 @@
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/[\p{taml}\p{sc:ugar}]+/utf
|
||||||
|
\x{0b82}\x{10380}
|
||||||
|
|
||||||
# End of testinput4
|
# End of testinput4
|
||||||
|
|
|
@ -4235,4 +4235,8 @@ No match
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/[\p{taml}\p{sc:ugar}]+/utf
|
||||||
|
\x{0b82}\x{10380}
|
||||||
|
0: \x{b82}\x{10380}
|
||||||
|
|
||||||
# End of testinput4
|
# End of testinput4
|
||||||
|
|
Loading…
Reference in New Issue