520 lines
13 KiB
Python
520 lines
13 KiB
Python
#!/usr/bin/python
|
|
|
|
import sys, os, re, difflib, unicodedata, errno, cgi
|
|
from itertools import *
|
|
|
|
# Line-prefix markers, one per compared input in order: lines from the first
# input are tagged '-', from the second '+', and so on.
diff_symbols = "-+=*&^%$#@!~/"

# Highlight colour names, indexed the same way as the first diff_symbols.
diff_colors = ["red", "green", "blue"]
|
|
|
|
class ColorFormatter:
    """Namespace of output formatters used to highlight text.

    Every nested formatter class exposes the same four static methods:
    start_color (c), end_color (), escape (s) and newline ().
    """

    class Null:
        """Formatter that adds no markup at all."""

        @staticmethod
        def start_color (c): return ''

        @staticmethod
        def end_color (): return ''

        @staticmethod
        def escape (s): return s

        @staticmethod
        def newline (): return '\n'

    class ANSI:
        """Formatter that highlights with ANSI terminal escape sequences."""

        @staticmethod
        def start_color (c):
            """Return the escape sequence for colour name c.

            Raises KeyError for a name other than 'red', 'green' or 'blue'.
            """
            return {
                'red': '\033[41;37;1m',
                'green': '\033[42;37;1m',
                'blue': '\033[44;37;1m',
            }[c]

        @staticmethod
        def end_color ():
            return '\033[m'

        @staticmethod
        def escape (s): return s

        @staticmethod
        def newline (): return '\n'

    class HTML:
        """Formatter that highlights with HTML <span> elements."""

        @staticmethod
        def start_color (c):
            return '<span style="background:%s">' % c

        @staticmethod
        def end_color ():
            return '</span>'

        @staticmethod
        def escape (s):
            """Return s with '&', '<' and '>' replaced by HTML entities."""
            # cgi.escape () was removed in Python 3.8; html.escape () is its
            # replacement.  quote=False keeps the old behaviour of leaving
            # quote characters untouched.
            try:
                from html import escape as escape_markup
            except ImportError: # Python 2
                from cgi import escape as escape_markup
            return escape_markup (s, quote=False)

        @staticmethod
        def newline (): return '<br/>\n'

    @staticmethod
    def Auto (argv = [], out = sys.stdout):
        """Pick a formatter class from command-line flags.

        Recognised flags are removed from argv in place (first occurrence
        only).  When several are present, the one checked last wins; the
        check order is --format, --format=ansi, --format=html, --no-format.
        Without any flag the ANSI formatter is returned.  The out parameter
        is accepted but not used.
        """
        flags = (
            ("--format", ColorFormatter.ANSI),
            ("--format=ansi", ColorFormatter.ANSI),
            ("--format=html", ColorFormatter.HTML),
            ("--no-format", ColorFormatter.Null),
        )
        chosen = ColorFormatter.ANSI
        for flag, formatter in flags:
            if flag in argv:
                argv.remove (flag)
                chosen = formatter
        return chosen
|
|
|
|
|
|
class DiffColorizer:
    """Highlights the tokens that differ between paired diff lines."""

    # Splits a line into word tokens, each optionally followed by a single
    # non-word character.  (The character class used to read "a-za-z",
    # repeating the lower-case range, so upper-case letters were treated as
    # separators.)
    diff_regex = re.compile ('([a-zA-Z0-9_]*)([^a-zA-Z0-9_]?)')

    def __init__ (self, formatter, colors=diff_colors, symbols=diff_symbols):
        """Store the formatter, highlight colours and line-prefix symbols.

        formatter must provide start_color (c), end_color (), escape (s)
        and newline (), like the ColorFormatter classes.
        """
        self.formatter = formatter
        self.colors = colors
        self.symbols = symbols

    def _end_open_colors (self, oo, st):
        """Close any colour span still open on either side (in place)."""
        for i in range (2):
            if st[i]:
                oo[i] += self.formatter.end_color ()
                st[i] = False

    def colorize_lines (self, lines):
        """Return the two lines with their differing tokens highlighted.

        lines is a pair [first, second]; a missing side may be None or
        empty.  The result is a list of formatted lines, each prefixed with
        its symbol and terminated by the formatter's newline; a side with
        no content is omitted.
        """
        lines = (l if l else '' for l in lines)
        # One token per line, so difflib can compare token sequences.
        ss = [self.diff_regex.sub (r'\1\n\2\n', l).splitlines (True) for l in lines]
        oo = ["", ""]        # output accumulated for each side
        st = [False, False]  # whether a colour span is open on each side
        for l in difflib.Differ ().compare (*ss):
            tag = l[0]
            if tag == '?':
                # difflib's intraline hint lines carry no content.
                continue
            if tag == ' ':
                # Token common to both sides: end highlighting, copy to both.
                self._end_open_colors (oo, st)
                oo = [o + self.formatter.escape (l[2:]) for o in oo]
                continue
            if tag in self.symbols:
                i = self.symbols.index (tag)
                if not st[i]:
                    oo[i] += self.formatter.start_color (self.colors[i])
                    st[i] = True
                oo[i] += self.formatter.escape (l[2:])
                continue
        self._end_open_colors (oo, st)
        # Drop the token separators inserted above.
        oo = [o.replace ('\n', '') for o in oo]
        return [s1+s2+self.formatter.newline () for (s1,s2) in zip (self.symbols, oo) if s2]

    def colorize_diff (self, f):
        """Yield the lines of diff f with paired changes highlighted.

        Lines starting with one of the first two symbols are buffered and
        passed to colorize_lines () once both sides are present or the same
        side repeats.  All other lines, including lines that start with a
        later symbol, are escaped and passed through.
        """
        # colorize_lines () only handles two sides, so only the first two
        # symbols are treated as diff markers.  Indexing the full symbol set
        # used to raise IndexError for lines starting with e.g. '='.
        markers = self.symbols[:2]
        lines = [None, None]
        for l in f:
            if not l or l[0] not in markers:
                if any (lines):
                    # Flush first so output keeps the input order.
                    for line in self.colorize_lines (lines):
                        yield line
                    lines = [None, None]
                yield self.formatter.escape (l).replace ('\n', self.formatter.newline ())
                continue
            i = markers.index (l[0])
            if lines[i]:
                # Same side seen twice in a row: flush what is buffered.
                for line in self.colorize_lines (lines):
                    yield line
                lines = [None, None]
            lines[i] = l[1:]
            if all (lines):
                # Both sides present: flush the pair.
                for line in self.colorize_lines (lines):
                    yield line
                lines = [None, None]
        if any (lines):
            # Flush whatever is left at end of input.
            for line in self.colorize_lines (lines):
                yield line
|
|
|
|
|
|
class ZipDiffer:
    """Compares several line iterables position by position."""

    @staticmethod
    def diff_files (files, symbols=diff_symbols):
        """Write a line-by-line comparison of files to standard output.

        files is an iterable of line iterables (e.g. open files).  Lines
        at the same position are compared; when they are all equal the line
        is written once prefixed with a space, otherwise each input's line
        is written prefixed with symbols[i], i being the input's position.
        An input that has run out of lines contributes nothing.

        On an I/O error other than a broken pipe, a message is written to
        standard error and the process exits with status 1.
        """
        # itertools.izip_longest was renamed zip_longest in Python 3.
        try:
            from itertools import izip_longest as zip_longest
        except ImportError:
            from itertools import zip_longest

        files = tuple (files) # in case it's a generator, copy it
        try:
            for lines in zip_longest (*files):
                if all (lines[0] == line for line in lines[1:]):
                    sys.stdout.writelines ([" ", lines[0]])
                    continue

                for i, l in enumerate (lines):
                    if l:
                        sys.stdout.writelines ([symbols[i], l])
        except IOError as e:
            if e.errno != errno.EPIPE:
                sys.stderr.write ("%s: %s: %s\n" % (sys.argv[0], e.filename, e.strerror))
                sys.exit (1)
|
|
|
|
|
|
class DiffFilters:
    """Generators that filter the lines of a diff."""

    @staticmethod
    def filter_failures (f):
        """Yield only the lines of f that belong to failed test cases.

        Lines are grouped with DiffHelpers.separate_test_cases () and a
        group is kept when DiffHelpers.test_passed () rejects it.
        """
        for _key, group in DiffHelpers.separate_test_cases (f):
            # The group is a one-shot iterator; keep a copy so it can be
            # both inspected and re-emitted.
            case = list (group)
            if DiffHelpers.test_passed (case):
                continue
            for line in case:
                yield line
|
|
|
|
class Stat:
    """Running tally of tests: how many were added and their summed freq."""

    def __init__ (self):
        self.count = self.freq = 0

    def add (self, test):
        """Account for one more test, adding its freq to the total."""
        self.freq = self.freq + test.freq
        self.count = self.count + 1
|
|
|
|
class Stats:
    """Pass/fail tallies for a set of tests, with simple statistics."""

    def __init__ (self):
        self.passed = Stat ()
        self.failed = Stat ()
        self.total = Stat ()

    def add (self, test):
        """Record test in the total and in the passed or failed tally."""
        self.total.add (test)
        if test.passed:
            self.passed.add (test)
        else:
            self.failed.add (test)

    def mean (self):
        """Return the fraction of recorded tests that passed.

        Raises ZeroDivisionError when no test has been recorded.
        """
        return float (self.passed.count) / self.total.count

    def variance (self):
        """Return (pass fraction) * (fail fraction).

        Raises ZeroDivisionError when no test has been recorded.
        """
        total = self.total.count
        return (float (self.passed.count) / total) * \
               (float (self.failed.count) / total)

    def stddev (self):
        """Return the square root of variance ()."""
        return self.variance () ** .5

    def zscore (self, population):
        """Calculate the standard score.

        Population is the Stats for population.
        Self is Stats for sample.
        Returns larger absolute value if sample is highly unlikely to be random.
        Anything outside of -3..+3 is very unlikely to be random.

        When the population has zero standard deviation (every test passed
        or every test failed) the score is 0.0 if the means agree and an
        infinity of the matching sign otherwise, instead of raising
        ZeroDivisionError.

        See: http://en.wikipedia.org/wiki/Standard_score"""

        difference = self.mean () - population.mean ()
        deviation = population.stddev ()
        if deviation == 0:
            if difference == 0:
                return 0.0
            return float ('inf') if difference > 0 else float ('-inf')
        return difference / deviation
|
|
|
|
|
|
|
|
|
|
class DiffSinks:
    """Consumers that read a diff and print a summary to standard output."""

    @staticmethod
    def print_stat (f):
        """Print how many test cases in f passed and failed.

        Test cases are delimited by DiffHelpers.separate_test_cases () and
        judged by DiffHelpers.test_passed ().  For empty input the failure
        percentage is reported as 0 rather than dividing by zero.
        """
        passed = 0
        failed = 0
        # XXX port to Stats, but that would really slow us down here
        for key, lines in DiffHelpers.separate_test_cases (f):
            if DiffHelpers.test_passed (lines):
                passed += 1
            else:
                failed += 1
        total = passed + failed
        percent = 100. * failed / total if total else 0.
        sys.stdout.write ("%d out of %d tests passed. %d failed (%g%%)\n" % (passed, total, failed, percent))

    @staticmethod
    def print_ngrams (f, ns=(1,2,3)):
        """Print a z-score line for each frequently failing code point n-gram.

        Every test case in f is parsed into a Test; for each n in ns, the
        n-grams of the test's code points are tallied in a Stats.  N-grams
        seen in at least 30 failed tests are printed with their z-score
        against the statistics of all tests.
        """
        gens = tuple (Ngram.generator (n) for n in ns)
        allstats = Stats ()
        allgrams = {}
        for key, lines in DiffHelpers.separate_test_cases (f):
            test = Test (lines)
            allstats.add (test)

            # A test without a <...> line has unicodes == None; treat it as
            # contributing no n-grams instead of failing to iterate.
            unicodes = test.unicodes or ()
            for gen in gens:
                for ngram in gen (unicodes):
                    if ngram not in allgrams:
                        allgrams[ngram] = Stats ()
                    allgrams[ngram].add (test)

        for ngram, stats in allgrams.items ():
            if stats.failed.count < 30: # for statistical reasons
                continue
            sys.stdout.write ("zscore: %9f failed: %6d passed: %6d ngram: <%s>\n" % (stats.zscore (allstats), stats.failed.count, stats.passed.count, ','.join ("U+%04X" % u for u in ngram)))
|
|
|
|
|
|
|
|
class Test:
    """One test case parsed from the diff lines that share an identifier.

    Attributes set by the constructor:
      freq        always 1
      passed      False as soon as any line does not start with a space
      identifier  text between the marker and the first colon, if any
      text        payload of a "(...)" line
      unicodes    payload of a "<...>" line, parsed by Unicode.parse ()
      glyphs      payload of a "[...]" line
    """

    def __init__ (self, lines):
        self.freq = 1
        self.passed = True
        self.identifier = None
        self.text = None
        self.unicodes = None
        self.glyphs = None
        for l in lines:
            if not l:
                continue
            if l[0] != ' ':
                self.passed = False
            i = 1
            if ':' in l:
                i = l.index (':')
                if not self.identifier:
                    self.identifier = l[1:i]
                i = i + 2 # Skip colon and space
            # Index of the closing bracket: last character, not counting a
            # trailing newline.
            j = -1
            if l[j] == '\n':
                j -= 1
            # Slices instead of indexing, so short lines do not raise.
            brackets = l[i:i+1] + l[j:][:1]
            # Payload between the brackets.  The end used to be a fixed -2,
            # which cut one character too many when the line had no
            # trailing newline.
            l = l[i+1:j]
            if brackets == '()':
                self.text = l
            elif brackets == '<>':
                self.unicodes = Unicode.parse (l)
            elif brackets == '[]':
                # XXX we don't handle failed tests here
                self.glyphs = l
|
|
|
|
|
|
class DiffHelpers:
    """Helpers for grouping and judging the lines of a test diff."""

    @staticmethod
    def separate_test_cases (f):
        '''Reads lines from f, and if the lines have identifiers, ie.
        have a colon character, groups them by identifier,
        yielding lists of all lines with the same identifier.

        Each item is a (key, group) pair from itertools.groupby (), so only
        consecutive lines are grouped and each group is a one-shot iterator.
        A line without a colon after its first character is its own key.'''

        def identifier (l):
            if ':' in l[1:]:
                # Search from index 1 to match the test above; searching
                # from 0 returned an empty key when the line started with
                # a colon.
                return l[1:l.index (':', 1)]
            return l
        return groupby (f, key=identifier)

    @staticmethod
    def test_passed (lines):
        """Return True if the lines of one test case count as a pass.

        A case passes when every line starts with a space, or when any
        '+' line contains one of the marker strings listed below.
        """
        lines = list (lines)
        # XXX This is a hack, but does the job for now.
        markers = ("space+0|space+0", "uni25CC", "dottedcircle",
                   "glyph0", "gid0", "notdef")
        for l in lines:
            if l[:1] == '+' and any (marker in l for marker in markers):
                return True
        return all (l[:1] == ' ' for l in lines)
|
|
|
|
|
|
class FilterHelpers:
    """Adapters that turn a line filter into a function printing its output."""

    @staticmethod
    def filter_printer_function (filter_callback):
        """Return a function that prints each item of filter_callback (f).

        Every item is written to standard output followed by a newline.
        """
        def printer (f):
            for line in filter_callback (f):
                # write () rather than the Python 2 print statement, so
                # the module also parses under Python 3.
                sys.stdout.write ("%s\n" % (line,))
        return printer

    @staticmethod
    def filter_printer_function_no_newline (filter_callback):
        """Return a function that writes each item of filter_callback (f).

        Items are written to standard output as they are, with nothing
        appended.
        """
        def printer (f):
            for line in filter_callback (f):
                sys.stdout.writelines ([line])
        return printer
|
|
|
|
|
|
class Ngram:
    """Factory for sliding-window n-gram generators."""

    @staticmethod
    def generator (n):
        """Return a generator function producing the n-grams of an iterable.

        The returned function yields every run of n consecutive items as a
        tuple, and carries n in its .n attribute.
        """

        def gen (f):
            window = []
            for item in f:
                window.append (item)
                if len (window) != n:
                    continue
                yield tuple (window)
                # Slide the window forward by one item.
                del window[0]

        gen.n = n
        return gen
|
|
|
|
|
|
class UtilMains:
    """Ready-made main () bodies for small command-line utilities."""

    @staticmethod
    def _exit_on_io_error (e):
        """Report IOError e on standard error and exit with status 1.

        A broken pipe (EPIPE) is ignored silently.
        """
        if e.errno != errno.EPIPE:
            sys.stderr.write ("%s: %s: %s\n" % (sys.argv[0], e.filename, e.strerror))
            sys.exit (1)

    @staticmethod
    def process_multiple_files (callback, mnemonic = "FILE"):
        """Call callback with an open file for each command-line argument.

        Without arguments, standard input ('-') is processed.  With --help
        anywhere on the command line, a usage line is printed and the
        process exits with status 1.  Files opened here are closed once the
        callback returns.
        """

        if "--help" in sys.argv:
            sys.stdout.write ("Usage: %s %s...\n" % (sys.argv[0], mnemonic))
            sys.exit (1)

        try:
            files = sys.argv[1:] if len (sys.argv) > 1 else ['-']
            for s in files:
                f = FileHelpers.open_file_or_stdin (s)
                try:
                    callback (f)
                finally:
                    # Do not leak the handle; standard input stays open.
                    if f is not sys.stdin:
                        f.close ()
        except IOError as e:
            UtilMains._exit_on_io_error (e)

    @staticmethod
    def process_multiple_args (callback, mnemonic):
        """Call callback with each command-line argument in turn.

        With no arguments, or with --help, a usage line is printed and the
        process exits with status 1.
        """

        if len (sys.argv) == 1 or "--help" in sys.argv:
            sys.stdout.write ("Usage: %s %s...\n" % (sys.argv[0], mnemonic))
            sys.exit (1)

        try:
            for s in sys.argv[1:]:
                callback (s)
        except IOError as e:
            UtilMains._exit_on_io_error (e)

    @staticmethod
    def filter_multiple_strings_or_stdin (callback, mnemonic, \
                                          separator = " ", \
                                          concat_separator = False):
        """Print callback applied to the arguments, or to each stdin line.

        With arguments, callback is applied to each one and the results
        are printed on one line joined by separator; if concat_separator
        is given, the arguments are first joined with it into a single
        string.  Without arguments, each line of standard input (minus its
        trailing newline) is passed to callback and the result printed.
        With --help, usage is printed and the process exits with status 1.
        """

        if "--help" in sys.argv:
            sys.stdout.write ("Usage:\n %s %s...\nor:\n %s\n\nWhen called with no arguments, input is read from standard input.\n" \
                              % (sys.argv[0], mnemonic, sys.argv[0]))
            sys.exit (1)

        try:
            if len (sys.argv) == 1:
                while (1):
                    # readline () rather than iterating over the file, so
                    # each line is handled as soon as it arrives.
                    line = sys.stdin.readline ()
                    if not len (line):
                        break
                    if line[-1] == '\n':
                        line = line[:-1]
                    sys.stdout.write ("%s\n" % (callback (line),))
            else:
                args = sys.argv[1:]
                if concat_separator != False:
                    args = [concat_separator.join (args)]
                sys.stdout.write ("%s\n" % separator.join (callback (x) for x in (args)))
        except IOError as e:
            UtilMains._exit_on_io_error (e)
|
|
|
|
|
|
class Unicode:
    """Conversions between text and "U+XXXX" code point notation."""

    @staticmethod
    def _to_char (codepoint):
        """Return the one-character text string for an integer code point."""
        try:
            return unichr (codepoint)
        except NameError: # Python 3 has no unichr
            return chr (codepoint)

    @staticmethod
    def _native (text):
        """Return text as the interpreter's native str type.

        On Python 2 that means UTF-8 encoded bytes, which is what the public
        methods have always returned there; on Python 3 text is returned
        unchanged.
        """
        if str is bytes:
            return text.encode ('utf-8')
        return text

    @staticmethod
    def decode (s):
        """Return the code points of s as a comma-separated "U+XXXX" list.

        s may be UTF-8 encoded bytes or already-decoded text.
        """
        if isinstance (s, bytes):
            s = s.decode ('utf-8')
        return ','.join ("U+%04X" % ord (u) for u in s)

    @staticmethod
    def parse (s):
        """Return the list of integers for the hexadecimal numbers in s.

        "0x"/"0X" prefixes and the characters < + > , ; & # \\ x X u U,
        newlines and spaces all act as separators.  Raises ValueError for
        a token that is not hexadecimal.
        """
        s = re.sub (r"0[xX]", " ", s)
        s = re.sub (r"[<+>,;&#\\xXuU\n ]", " ", s)
        return [int (x, 16) for x in s.split (' ') if len (x)]

    @staticmethod
    def encode (s):
        """Return the text made of the code points listed in s.

        s is parsed with parse ().
        """
        text = u''.join (Unicode._to_char (x) for x in Unicode.parse (s))
        return Unicode._native (text)

    shorthands = {
        "ZERO WIDTH NON-JOINER": "ZWNJ",
        "ZERO WIDTH JOINER": "ZWJ",
        "NARROW NO-BREAK SPACE": "NNBSP",
        "COMBINING GRAPHEME JOINER": "CGJ",
        "LEFT-TO-RIGHT MARK": "LRM",
        "RIGHT-TO-LEFT MARK": "RLM",
        "LEFT-TO-RIGHT EMBEDDING": "LRE",
        "RIGHT-TO-LEFT EMBEDDING": "RLE",
        "POP DIRECTIONAL FORMATTING": "PDF",
        "LEFT-TO-RIGHT OVERRIDE": "LRO",
        "RIGHT-TO-LEFT OVERRIDE": "RLO",
    }

    @staticmethod
    def pretty_name (u):
        """Return a short display name for the single character u.

        The name from unicodedata is shortened by dropping everything up to
        " LETTER ", " SIGN " or " COMBINING ", turning "... VOWEL SIGN X"
        into "X-MATRA", and applying the shorthands table.  Characters
        without a name give "XXX".
        """
        try:
            s = unicodedata.name (u)
        except ValueError:
            return "XXX"
        s = re.sub (".* LETTER ", "", s)
        s = re.sub (".* VOWEL SIGN (.*)", r"\1-MATRA", s)
        s = re.sub (".* SIGN ", "", s)
        s = re.sub (".* COMBINING ", "", s)
        # NOTE(review): names of the form "... SIGN VIRAMA" have already
        # been cut down to "VIRAMA" above and so do not match here; confirm
        # whether those were meant to become "HALANT" as well.
        if re.match (".* VIRAMA", s):
            s = "HALANT"
        if s in Unicode.shorthands:
            s = Unicode.shorthands[s]
        return s

    @staticmethod
    def pretty_names (s):
        """Return the pretty names of the code points in s, joined by " + ".

        Code points are hexadecimal numbers separated by commas, spaces or
        newlines, optionally decorated with < > + \\ u U or "0x".
        """
        s = re.sub (r"[<+>\\uU]", " ", s)
        s = re.sub (r"0[xX]", " ", s)
        s = [Unicode._to_char (int (x, 16)) for x in re.split ('[, \n]', s) if len (x)]
        return Unicode._native (u' + '.join (Unicode.pretty_name (x) for x in s))
|
|
|
|
|
|
class FileHelpers:
    """Small helpers for opening input files."""

    @staticmethod
    def open_file_or_stdin (f):
        """Return sys.stdin if f is '-', otherwise f opened for reading.

        Raises IOError if the file cannot be opened.
        """
        if f == '-':
            return sys.stdin
        # open () instead of the file () builtin, which Python 3 removed.
        return open (f)
|
|
|
|
|
|
class Manifest:
    """Reads and generates per-directory MANIFEST files."""

    @staticmethod
    def read (s, strict = True):
        """Yield the files reachable from path s.

        A plain file yields its own normalized path.  A directory is
        expanded through its MANIFEST file, which lists one entry per line
        relative to the directory; entries are read recursively (always in
        strict mode).  Blank MANIFEST lines are ignored.

        If s does not exist, or is a directory whose MANIFEST cannot be
        opened, then with strict true a message is written to standard error
        and the process exits with status 1; otherwise nothing is yielded.
        """

        if not os.path.exists (s):
            if strict:
                sys.stderr.write ("%s: %s does not exist\n" % (sys.argv[0], s))
                sys.exit (1)
            return

        s = os.path.normpath (s)

        if not os.path.isdir (s):
            yield s
            return

        manifest = os.path.join (s, "MANIFEST")
        try:
            # The with block closes the handle; it used to be left open.
            with open (manifest) as m:
                items = [x.strip () for x in m.readlines ()]
        except IOError:
            if strict:
                sys.stderr.write ("%s: %s does not exist\n" % (sys.argv[0], manifest))
                sys.exit (1)
            return

        for f in items:
            if not f:
                # A blank line would join back to the directory itself and
                # recurse without end.
                continue
            for p in Manifest.read (os.path.join (s, f)):
                yield p

    @staticmethod
    def update_recursive (s):
        """(Re)generate a MANIFEST file in s and every directory below it.

        Each MANIFEST lists the directory's files, then its subdirectories,
        both sorted, one name per line.  Entries named MANIFEST, README,
        LICENSE, COPYING, AUTHORS, SOURCES or ChangeLog are left out, and
        directories with those names are not descended into.  Symbolic
        links to directories are followed.  A progress line is printed for
        every file generated.
        """

        # os.walk () already visits every subdirectory.  The explicit
        # recursion that used to follow regenerated each directory once per
        # ancestor level, so it has been dropped.
        for dirpath, dirnames, filenames in os.walk (s, followlinks=True):

            for f in ["MANIFEST", "README", "LICENSE", "COPYING", "AUTHORS", "SOURCES", "ChangeLog"]:
                # Removing from dirnames in place also prunes the walk.
                if f in dirnames:
                    dirnames.remove (f)
                if f in filenames:
                    filenames.remove (f)
            dirnames.sort ()
            filenames.sort ()
            ms = os.path.join (dirpath, "MANIFEST")
            sys.stdout.write (" GEN %s\n" % ms)
            with open (ms, "w") as m:
                for f in filenames:
                    m.write ("%s\n" % f)
                for f in dirnames:
                    m.write ("%s\n" % f)
|
|
|
|
# This module only provides helpers; running it directly does nothing.
if __name__ == "__main__":
    pass
|