Add N-gram generator
This commit is contained in:
parent
98669ceb77
commit
178e6dce01
|
@ -169,6 +169,53 @@ class DiffSinks:
|
||||||
total = passed + failed
|
total = passed + failed
|
||||||
print "%d out of %d tests passed. %d failed (%g%%)" % (passed, total, failed, 100. * failed / total)
|
print "%d out of %d tests passed. %d failed (%g%%)" % (passed, total, failed, 100. * failed / total)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def print_ngrams (f, ns=(1,2,3)):
|
||||||
|
gens = tuple (Ngram.generator (n) for n in ns)
|
||||||
|
for key, lines in DiffHelpers.separate_test_cases (f):
|
||||||
|
test = Test (lines)
|
||||||
|
unicodes = test.unicodes
|
||||||
|
del test
|
||||||
|
|
||||||
|
for gen in gens:
|
||||||
|
print "Printing %d-grams:" % gen.n
|
||||||
|
for ngram in gen (unicodes):
|
||||||
|
print ngram
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class Test:
|
||||||
|
|
||||||
|
def __init__ (self, lines):
|
||||||
|
self.passed = True
|
||||||
|
self.identifier = None
|
||||||
|
self.text = None
|
||||||
|
self.unicodes = None
|
||||||
|
self.glyphs = None
|
||||||
|
for l in lines:
|
||||||
|
symbol = l[0]
|
||||||
|
if symbol != ' ':
|
||||||
|
self.passed = False
|
||||||
|
i = 1
|
||||||
|
if ':' in l:
|
||||||
|
i = l.index (':')
|
||||||
|
if not self.identifier:
|
||||||
|
self.identifier = l[1:i]
|
||||||
|
i = i + 2 # Skip colon and space
|
||||||
|
j = -1
|
||||||
|
if l[j] == '\n':
|
||||||
|
j -= 1
|
||||||
|
brackets = l[i] + l[j]
|
||||||
|
l = l[i+1:-2]
|
||||||
|
if brackets == '()':
|
||||||
|
self.text = l
|
||||||
|
elif brackets == '<>':
|
||||||
|
self.unicodes = Unicode.parse (l)
|
||||||
|
elif brackets == '[]':
|
||||||
|
# XXX we don't handle failed tests here
|
||||||
|
self.glyphs = l
|
||||||
|
|
||||||
|
|
||||||
class DiffHelpers:
|
class DiffHelpers:
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -205,6 +252,23 @@ class FilterHelpers:
|
||||||
return printer
|
return printer
|
||||||
|
|
||||||
|
|
||||||
|
class Ngram:
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def generator (n):
|
||||||
|
|
||||||
|
def gen (f):
|
||||||
|
l = []
|
||||||
|
for x in f:
|
||||||
|
l.append (x)
|
||||||
|
if len (l) == n:
|
||||||
|
yield tuple (l)
|
||||||
|
l[:1] = []
|
||||||
|
|
||||||
|
gen.n = n
|
||||||
|
return gen
|
||||||
|
|
||||||
|
|
||||||
class UtilMains:
|
class UtilMains:
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -276,10 +340,14 @@ class Unicode:
|
||||||
return '<' + u','.join ("U+%04X" % ord (u) for u in unicode (s, 'utf-8')).encode ('utf-8') + '>'
|
return '<' + u','.join ("U+%04X" % ord (u) for u in unicode (s, 'utf-8')).encode ('utf-8') + '>'
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def encode (s):
|
def parse (s):
|
||||||
s = re.sub (r"[<+>,\\uU\n ]", " ", s)
|
s = re.sub (r"[<+>,\\uU\n ]", " ", s)
|
||||||
s = re.sub (r"0[xX]", " ", s)
|
s = re.sub (r"0[xX]", " ", s)
|
||||||
return u''.join (unichr (int (x, 16)) for x in s.split (' ') if len (x)).encode ('utf-8')
|
return [int (x, 16) for x in s.split (' ') if len (x)]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def encode (s):
|
||||||
|
return u''.join (unichr (x) for x in Unicode.parse (s)).encode ('utf-8')
|
||||||
|
|
||||||
shorthands = {
|
shorthands = {
|
||||||
"ZERO WIDTH NON-JOINER": "ZWNJ",
|
"ZERO WIDTH NON-JOINER": "ZWNJ",
|
||||||
|
|
Loading…
Reference in New Issue