Add N-gram generator

This commit is contained in:
Behdad Esfahbod 2012-05-09 08:57:29 +02:00
parent 98669ceb77
commit 178e6dce01
1 changed file with 70 additions and 2 deletions

View File

@ -169,6 +169,53 @@ class DiffSinks:
total = passed + failed total = passed + failed
print "%d out of %d tests passed. %d failed (%g%%)" % (passed, total, failed, 100. * failed / total) print "%d out of %d tests passed. %d failed (%g%%)" % (passed, total, failed, 100. * failed / total)
@staticmethod
def print_ngrams (f, ns=(1,2,3)):
    """Print the n-grams of the code points of every test case read from *f*.

    f  -- input handed unchanged to DiffHelpers.separate_test_cases, which
          is iterated as (key, lines) pairs.
    ns -- iterable of n-gram sizes; one generator is built per size.

    For each test case and each size, a "Printing N-grams:" header is
    printed followed by one tuple per line.
    """
    gens = tuple (Ngram.generator (n) for n in ns)
    for key, lines in DiffHelpers.separate_test_cases (f):
        # Only the code points are needed, so the Test object is not kept.
        # Test.unicodes stays None when the case has no '<...>' line;
        # fall back to an empty sequence instead of failing on iteration.
        unicodes = Test (lines).unicodes or ()
        for gen in gens:
            # Single-argument parenthesised print emits the same output
            # whether it is parsed as a statement or as a function call.
            print ("Printing %d-grams:" % gen.n)
            for ngram in gen (unicodes):
                print (ngram)
class Test:
    """One test case parsed from the lines of a test-result diff.

    Each line starts with a one-character symbol, optionally followed by
    "identifier: ", then a payload wrapped in a pair of brackets.

    Attributes set by the constructor:
      passed     -- False once any line starts with a symbol other than ' '.
      identifier -- text between the symbol and the first ':' of a line,
                    taken from the first line that yields a non-empty value;
                    None if there is none.
      text       -- payload of the last '(...)' line, or None.
      unicodes   -- Unicode.parse of the payload of the last '<...>' line,
                    or None.
      glyphs     -- payload of the last '[...]' line, or None.
    """

    def __init__ (self, lines):
        """Parse *lines*, an iterable of strings, into the attributes above."""
        self.passed = True
        self.identifier = None
        self.text = None
        self.unicodes = None
        self.glyphs = None
        for l in lines:
            if not l.strip ():
                # Blank line: nothing to parse, and indexing it would fail.
                continue
            symbol = l[0]
            if symbol != ' ':
                self.passed = False
            i = 1
            if ':' in l:
                i = l.index (':')
                if not self.identifier:
                    self.identifier = l[1:i]
                i = i + 2 # Skip colon and space
            # j addresses the closing bracket: the last character, or the
            # one before it when the line still carries its newline.
            j = -1
            if l[j] == '\n':
                j -= 1
            if i >= len (l) + j:
                # Too short to hold both an opening and a closing bracket.
                continue
            brackets = l[i] + l[j]
            # Slice up to j rather than a fixed -2 so a line without a
            # trailing newline keeps the last character of its payload.
            l = l[i+1:j]
            if brackets == '()':
                self.text = l
            elif brackets == '<>':
                self.unicodes = Unicode.parse (l)
            elif brackets == '[]':
                # XXX we don't handle failed tests here
                self.glyphs = l
class DiffHelpers: class DiffHelpers:
@staticmethod @staticmethod
@ -205,6 +252,23 @@ class FilterHelpers:
return printer return printer
class Ngram:
    """Namespace for building sliding-window n-gram generators."""

    @staticmethod
    def generator (n):
        """Return a generator function producing runs of *n* consecutive items.

        The returned function takes a single iterable and yields one tuple
        per window position. Its ``n`` attribute records the window size
        it was built for.
        """
        def gen (f):
            window = []
            for item in f:
                window.append (item)
                if len (window) != n:
                    continue
                yield tuple (window)
                # Slide forward by discarding the oldest item.
                del window[0]

        gen.n = n
        return gen
class UtilMains: class UtilMains:
@staticmethod @staticmethod
@ -276,10 +340,14 @@ class Unicode:
return '<' + u','.join ("U+%04X" % ord (u) for u in unicode (s, 'utf-8')).encode ('utf-8') + '>' return '<' + u','.join ("U+%04X" % ord (u) for u in unicode (s, 'utf-8')).encode ('utf-8') + '>'
@staticmethod
def parse (s):
    """Return the list of integer code points written out in *s*.

    Accepts hexadecimal values decorated with any mix of '<', '>', '+',
    ',', backslash, 'u'/'U' and '0x'/'0X' markers, e.g. "<U+0041,U+0042>"
    or "0x41 0x42". Raises ValueError if a remaining token is not
    hexadecimal.
    """
    s = re.sub (r"[<+>,\\uU\n ]", " ", s)
    s = re.sub (r"0[xX]", " ", s)
    # split () with no argument separates on any run of whitespace, so
    # tab-separated values work and no empty tokens need filtering out.
    return [int (x, 16) for x in s.split ()]
@staticmethod
def encode (s):
    """Return the UTF-8 byte string for the code points written out in *s*.

    *s* is handed to Unicode.parse; each resulting integer is turned into
    a character and the joined text is encoded as UTF-8.
    """
    codepoints = Unicode.parse (s)
    text = u''.join (unichr (cp) for cp in codepoints)
    return text.encode ('utf-8')
shorthands = { shorthands = {
"ZERO WIDTH NON-JOINER": "ZWNJ", "ZERO WIDTH NON-JOINER": "ZWNJ",