2012-01-20 02:28:31 +01:00
|
|
|
#!/usr/bin/python
|
|
|
|
|
|
|
|
import sys
|
|
|
|
import re
|
|
|
|
import unicodedata
|
|
|
|
|
|
|
|
# Abbreviations for long character names.  pretty_name () looks its result up
# here and, when the name is a key, substitutes the short form.
shorthands = {
    "ZERO WIDTH NON-JOINER": "ZWNJ",
    "ZERO WIDTH JOINER": "ZWJ",
    "NARROW NO-BREAK SPACE": "NNBSP",
    "COMBINING GRAPHEME JOINER": "CGJ",
    "LEFT-TO-RIGHT MARK": "LRM",
    "RIGHT-TO-LEFT MARK": "RLM",
    "LEFT-TO-RIGHT EMBEDDING": "LRE",
    "RIGHT-TO-LEFT EMBEDDING": "RLE",
    "POP DIRECTIONAL FORMATTING": "PDF",
    "LEFT-TO-RIGHT OVERRIDE": "LRO",
    "RIGHT-TO-LEFT OVERRIDE": "RLO",
}
|
|
|
|
def pretty_name (x):
    """Return a short, human-friendly name for the single character x.

    The character's Unicode name is looked up and then trimmed:

    * everything up to and including " LETTER " is dropped;
    * "<prefix> VOWEL SIGN <NAME>" becomes "<NAME>-MATRA";
    * everything up to and including " SIGN " or " COMBINING " is dropped;
    * any virama becomes "HALANT";
    * a result that is a key of `shorthands` is replaced by its abbreviation.

    Returns "XXX" when the character has no Unicode name.
    """
    try:
        s = unicodedata.name (x)
    except ValueError:
        # unicodedata.name () raises ValueError for unnamed characters.
        return "XXX"
    s = re.sub (".* LETTER ", "", s)
    s = re.sub (".* VOWEL SIGN (.*)", r"\1-MATRA", s)
    s = re.sub (".* SIGN ", "", s)
    s = re.sub (".* COMBINING ", "", s)
    # Names such as "DEVANAGARI SIGN VIRAMA" have already been reduced to a
    # bare "VIRAMA" by the SIGN rule above, so the "<prefix> " part has to be
    # optional; requiring it would leave those viramas untranslated.
    if re.match ("(.* )?VIRAMA", s):
        s = "HALANT"
    return shorthands.get (s, s)
|
|
|
|
|
|
|
|
|
|
|
|
def pretty_names (s):
    """Translate a sequence of hexadecimal code points into readable names.

    s is text holding hexadecimal code points separated by commas, spaces
    or newlines.  The characters '<', '+', '>' and backslash are discarded
    and every 'u' / 'U' is treated as a separator, so spellings such as
    "U+0915", "u0915" or "<U+0915>" are all accepted.

    Returns the pretty_name () of each code point joined with " + "; the
    result is an empty string when s contains no code points.

    Raises ValueError if a token is not a hexadecimal number or lies
    outside the range the interpreter can turn into a character.
    """
    # Pick the code-point-to-character builtin at call time so the function
    # works on both Python 2 (unichr) and Python 3 (chr).
    try:
        to_char = unichr
    except NameError:
        to_char = chr
    s = re.sub (r"[<+>\\]", "", s)
    s = re.sub (r"[uU]", " ", s)
    chars = [to_char (int (x, 16)) for x in re.split ('[, \n]', s) if x]
    return ' + '.join (pretty_name (c) for c in chars)
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # With --stdin every input line is translated on its own; otherwise all
    # command-line arguments are joined with commas and translated as one
    # sequence.  print is called with a single parenthesised argument so the
    # script runs unchanged on both Python 2 and Python 3.
    if '--stdin' in sys.argv:
        sys.argv.remove ('--stdin')
        for line in sys.stdin:
            print (pretty_names (line))
    else:
        print (pretty_names (','.join (sys.argv[1:])))
|