Add hb-diff-ngrams
This commit is contained in:
parent
178e6dce01
commit
2214a03900
|
@ -13,6 +13,7 @@ EXTRA_DIST += \
|
||||||
hb-diff \
|
hb-diff \
|
||||||
hb-diff-colorize \
|
hb-diff-colorize \
|
||||||
hb-diff-filter-failures \
|
hb-diff-filter-failures \
|
||||||
|
hb-diff-ngrams \
|
||||||
hb-diff-stat \
|
hb-diff-stat \
|
||||||
hb-manifest-read \
|
hb-manifest-read \
|
||||||
hb-manifest-update \
|
hb-manifest-update \
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
#!/usr/bin/python
|
||||||
|
|
||||||
|
from hb_test_tools import *
|
||||||
|
|
||||||
|
# Script entry point: hand the n-gram reporting sink to the shared driver.
# NOTE(review): presumably process_multiple_files invokes the callback once
# per input file named on the command line -- confirm in hb_test_tools.
UtilMains.process_multiple_files (DiffSinks.print_ngrams)
|
|
@ -155,12 +155,60 @@ class DiffFilters:
|
||||||
if not DiffHelpers.test_passed (lines):
|
if not DiffHelpers.test_passed (lines):
|
||||||
for l in lines: yield l
|
for l in lines: yield l
|
||||||
|
|
||||||
|
class Stat:
    """Running tally: how many tests were added and their summed freq."""

    def __init__ (self):
        # Both counters start from zero.
        self.count = self.freq = 0

    def add (self, test):
        """Account for one more test, accumulating its freq attribute."""
        self.freq = self.freq + test.freq
        self.count = self.count + 1
|
||||||
|
|
||||||
|
class Stats:
    """Pass/fail tallies for a group of tests.

    Holds three Stat counters: one for passing tests, one for failing
    tests, and one for every test added.  The statistical helpers treat
    each test as a pass/fail indicator and work on the counts only.
    """

    def __init__ (self):
        self.passed = Stat ()
        self.failed = Stat ()
        self.total = Stat ()

    def add (self, test):
        """Record test in the total and in passed or failed per test.passed."""
        self.total.add (test)
        if test.passed:
            self.passed.add (test)
        else:
            self.failed.add (test)

    def mean (self):
        """Return the fraction of recorded tests that passed.

        Returns 0.0 when nothing has been recorded, rather than raising
        ZeroDivisionError.
        """
        if not self.total.count:
            return 0.0
        return float (self.passed.count) / self.total.count

    def variance (self):
        """Return pass fraction times fail fraction.

        Returns 0.0 when nothing has been recorded, rather than raising
        ZeroDivisionError.
        """
        if not self.total.count:
            return 0.0
        return (float (self.passed.count) / self.total.count) * \
               (float (self.failed.count) / self.total.count)

    def stddev (self):
        """Return the square root of variance ()."""
        return self.variance () ** .5

    def zscore (self, population):
        """Calculate the standard score.
           Population is the Stats for population.
           Self is Stats for sample.
           Returns larger absolute value if sample is highly unlikely to be random.
           Anything outside of -3..+3 is very unlikely to be random.
           See: http://en.wikipedia.org/wiki/Standard_score

           When the population has no spread (every test passed, or every
           test failed, or it is empty) the division is undefined: 0.0 is
           returned if the sample mean equals the population mean, and a
           signed infinity otherwise."""

        delta = self.mean () - population.mean ()
        sigma = population.stddev ()
        if sigma == 0:
            # Avoid ZeroDivisionError on a uniform population.
            if delta == 0:
                return 0.0
            if delta > 0:
                return float ("inf")
            return float ("-inf")
        return delta / sigma
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class DiffSinks:
|
class DiffSinks:
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def print_stat (f):
|
def print_stat (f):
|
||||||
passed = 0
|
passed = 0
|
||||||
failed = 0
|
failed = 0
|
||||||
|
# XXX port to Stats, but that would really slow us down here
|
||||||
for key, lines in DiffHelpers.separate_test_cases (f):
|
for key, lines in DiffHelpers.separate_test_cases (f):
|
||||||
if DiffHelpers.test_passed (lines):
|
if DiffHelpers.test_passed (lines):
|
||||||
passed += 1
|
passed += 1
|
||||||
|
@ -172,21 +220,34 @@ class DiffSinks:
|
||||||
@staticmethod
def print_ngrams (f, ns=(1,2,3)):
    """Print a z-score line for every n-gram that fails often enough.

    f is passed to DiffHelpers.separate_test_cases; each group of lines
    it yields is turned into a Test.  ns lists the n-gram sizes to
    generate from each test's unicodes.  Every test is tallied into an
    overall Stats and into the Stats of each n-gram it produces.
    """
    generators = tuple (Ngram.generator (n) for n in ns)
    overall = Stats ()
    per_ngram = {}
    for key, lines in DiffHelpers.separate_test_cases (f):
        test = Test (lines)
        overall.add (test)

        for generator in generators:
            for ngram in generator (test.unicodes):
                stats = per_ngram.get (ngram)
                if stats is None:
                    stats = per_ngram[ngram] = Stats ()
                stats.add (test)

    # Keep only n-grams with at least 30 failures, for statistical reasons.
    significant = dict ((ngram, stats)
                        for ngram, stats in per_ngram.iteritems ()
                        if stats.failed.count >= 30)
    del per_ngram

    for ngram, stats in significant.iteritems ():
        score = stats.zscore (overall)
        codepoints = ','.join ("U+%04X" % u for u in ngram)
        print ("zscore: %9f failed: %6d passed: %6d ngram: <%s>" % (score, stats.failed.count, stats.passed.count, codepoints))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class Test:
|
class Test:
|
||||||
|
|
||||||
def __init__ (self, lines):
|
def __init__ (self, lines):
|
||||||
|
self.freq = 1
|
||||||
self.passed = True
|
self.passed = True
|
||||||
self.identifier = None
|
self.identifier = None
|
||||||
self.text = None
|
self.text = None
|
||||||
|
|
Loading…
Reference in New Issue