Add hb-diff-ngrams

This commit is contained in:
Behdad Esfahbod 2012-05-09 09:54:54 +02:00
parent 178e6dce01
commit 2214a03900
3 changed files with 72 additions and 5 deletions

View File

@ -13,6 +13,7 @@ EXTRA_DIST += \
hb-diff \ hb-diff \
hb-diff-colorize \ hb-diff-colorize \
hb-diff-filter-failures \ hb-diff-filter-failures \
hb-diff-ngrams \
hb-diff-stat \ hb-diff-stat \
hb-manifest-read \ hb-manifest-read \
hb-manifest-update \ hb-manifest-update \

5
test/shaping/hb-diff-ngrams Executable file
View File

@ -0,0 +1,5 @@
#!/usr/bin/python
# Command-line entry point for the n-gram report: hands
# DiffSinks.print_ngrams to UtilMains.process_multiple_files.
# Both names are pulled in by the star import from hb_test_tools.
# NOTE(review): presumably the callback is run once per input file named on
# the command line -- confirm against UtilMains in hb_test_tools.
from hb_test_tools import *
UtilMains.process_multiple_files (DiffSinks.print_ngrams)

View File

@ -155,12 +155,60 @@ class DiffFilters:
if not DiffHelpers.test_passed (lines): if not DiffHelpers.test_passed (lines):
for l in lines: yield l for l in lines: yield l
class Stat:
    """Running tally: number of tests recorded and the sum of their freq."""

    def __init__ (self):
        # Both tallies start empty.
        self.count = self.freq = 0

    def add (self, test):
        """Record one test: bump count by one and add test.freq to freq."""
        self.count = self.count + 1
        self.freq = self.freq + test.freq
class Stats:
    """Pass/fail tallies for a group of tests.

    Holds three Stat accumulators -- passed, failed and total -- and derives
    the pass rate, its spread, and a standard score from their counts.
    """

    def __init__ (self):
        self.passed = Stat ()
        self.failed = Stat ()
        self.total = Stat ()

    def add (self, test):
        """Record test in total, and in passed or failed per test.passed."""
        self.total.add (test)
        if test.passed:
            self.passed.add (test)
        else:
            self.failed.add (test)

    def mean (self):
        """Return the fraction of recorded tests that passed.

        Raises ZeroDivisionError if no test has been recorded."""
        return float (self.passed.count) / self.total.count

    def variance (self):
        """Return the pass fraction multiplied by the fail fraction.

        Raises ZeroDivisionError if no test has been recorded."""
        return (float (self.passed.count) / self.total.count) * \
               (float (self.failed.count) / self.total.count)

    def stddev (self):
        """Return the square root of variance ()."""
        return self.variance () ** .5

    def zscore (self, population):
        """Calculate the standard score.
        Population is the Stats for population.
        Self is Stats for sample.
        Returns larger absolute value if sample is highly unlikely to be random.
        Anything outside of -3..+3 is very unlikely to be random.
        See: http://en.wikipedia.org/wiki/Standard_score

        If the population has zero spread (every test passed, or every test
        failed) the score is 0.0 when the sample has the same pass rate, and
        signed infinity otherwise, instead of raising ZeroDivisionError."""
        deviation = self.mean () - population.mean ()
        spread = population.stddev ()
        if spread == 0:
            # Uniform population: dividing would be 0/0 or x/0.
            if deviation == 0:
                return 0.0
            return float ("inf") if deviation > 0 else float ("-inf")
        return deviation / spread
class DiffSinks: class DiffSinks:
@staticmethod @staticmethod
def print_stat (f): def print_stat (f):
passed = 0 passed = 0
failed = 0 failed = 0
# XXX port to Stats, but that would really slow us down here
for key, lines in DiffHelpers.separate_test_cases (f): for key, lines in DiffHelpers.separate_test_cases (f):
if DiffHelpers.test_passed (lines): if DiffHelpers.test_passed (lines):
passed += 1 passed += 1
@ -172,21 +220,34 @@ class DiffSinks:
@staticmethod @staticmethod
def print_ngrams (f, ns=(1,2,3)): def print_ngrams (f, ns=(1,2,3)):
gens = tuple (Ngram.generator (n) for n in ns) gens = tuple (Ngram.generator (n) for n in ns)
allstats = Stats ()
allgrams = {}
for key, lines in DiffHelpers.separate_test_cases (f): for key, lines in DiffHelpers.separate_test_cases (f):
test = Test (lines) test = Test (lines)
unicodes = test.unicodes allstats.add (test)
del test
for gen in gens: for gen in gens:
print "Printing %d-grams:" % gen.n for ngram in gen (test.unicodes):
for ngram in gen (unicodes): if ngram not in allgrams:
print ngram allgrams[ngram] = Stats ()
allgrams[ngram].add (test)
importantgrams = {}
for ngram, stats in allgrams.iteritems ():
if stats.failed.count >= 30: # for statistical reasons
importantgrams[ngram] = stats
allgrams = importantgrams
del importantgrams
for ngram, stats in allgrams.iteritems ():
print "zscore: %9f failed: %6d passed: %6d ngram: <%s>" % (stats.zscore (allstats), stats.failed.count, stats.passed.count, ','.join ("U+%04X" % u for u in ngram))
class Test: class Test:
def __init__ (self, lines): def __init__ (self, lines):
self.freq = 1
self.passed = True self.passed = True
self.identifier = None self.identifier = None
self.text = None self.text = None