2010-03-04 13:13:53 +01:00
|
|
|
#!/usr/bin/perl
|
|
|
|
# convert TeX (Patgen) hyphenation patterns to Libhnj format
|
|
|
|
# (A utility for finding substring embeddings in patterns)
|
|
|
|
# usage: substrings.pl inputfile outputfile [encoding]
|
|
|
|
|
|
|
|
if (!defined $ARGV[1]) {
|
|
|
|
print "" .
|
|
|
|
"substrings.pl - convert TeX (Patgen) hyphenation patterns to Libhnj format\n" .
|
|
|
|
"(A utility for finding substring embeddings in patterns)\n" .
|
|
|
|
"usage: substrings.pl infile outfile [encoding [lefthyphenmin [righthyphenmin]]]\n";
|
|
|
|
exit 1;
|
|
|
|
}
|
|
|
|
$fn = $ARGV[0];
|
|
|
|
if (!-e $fn) { $fn = "hyphen.us"; }
|
|
|
|
open HYPH, $fn;
|
|
|
|
open OUT, ">$ARGV[1]";
|
|
|
|
$encoding = $ARGV[2];
|
|
|
|
$lhmin = $ARGV[3];
|
|
|
|
$rhmin = $ARGV[4];
|
|
|
|
if (defined $encoding) { print OUT "$encoding\n"; }
|
|
|
|
if (defined $lhmin) { print OUT "LEFTHYPHENMIN $lhmin\n"; }
|
|
|
|
if (defined $rhmin) { print OUT "RIGHTHYPHENMIN $rhmin\n"; }
|
|
|
|
|
2016-05-04 14:49:15 +02:00
|
|
|
# keep header
|
2010-03-04 13:13:53 +01:00
|
|
|
while (<HYPH>)
|
|
|
|
{
|
2016-05-04 14:49:15 +02:00
|
|
|
if (/^%/) {
|
|
|
|
print OUT "$_";
|
|
|
|
} else {
|
|
|
|
last;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
while (<HYPH>)
|
|
|
|
{
|
|
|
|
if (/^%/) {
|
2016-05-04 14:44:46 +02:00
|
|
|
#comment, ignore
|
2016-05-04 14:49:15 +02:00
|
|
|
} else {
|
|
|
|
$_ =~ s/%.*$//g;
|
|
|
|
for (split ' ', $_) {
|
|
|
|
if (/^(.+)\/([^,]+),([0-9]+),([0-9]+)$/) {
|
|
|
|
$origpat = $1;
|
|
|
|
$pat = $1;
|
|
|
|
$repl = $2;
|
|
|
|
$beg = $3;
|
|
|
|
$len = $4;
|
|
|
|
$pat =~ s/\d//g;
|
|
|
|
if ($origpat eq $pat) {
|
|
|
|
print "error - missing hyphenation point: $_";
|
|
|
|
exit 1;
|
|
|
|
}
|
|
|
|
push @patlist, $pat;
|
|
|
|
$pattab{$pat} = $origpat;
|
|
|
|
$repltab{$pat} = $repl;
|
|
|
|
$replbeg{$pat} = $beg - 1;
|
|
|
|
$repllen{$pat} = $len;
|
|
|
|
} elsif (/^(.+)\/(.+)$/) {
|
|
|
|
$origpat = $1;
|
|
|
|
$pat = $1;
|
|
|
|
$repl = $2;
|
|
|
|
$pat =~ s/\d//g;
|
|
|
|
if ($origpat eq $pat) {
|
|
|
|
print "error - missing hyphenation point: $_";
|
|
|
|
exit 1;
|
|
|
|
}
|
|
|
|
push @patlist, $pat;
|
|
|
|
$pattab{$pat} = $origpat;
|
|
|
|
$repltab{$pat} = $repl;
|
|
|
|
$replbeg{$pat} = 0;
|
|
|
|
$repllen{$pat} = enclen($pat);
|
|
|
|
} elsif (/^(.+)$/) {
|
|
|
|
$origpat = $1;
|
|
|
|
$pat = $1;
|
|
|
|
$pat =~ s/\d//g;
|
|
|
|
push @patlist, $pat;
|
|
|
|
$pattab{$pat} = $origpat;
|
|
|
|
}
|
2010-03-04 13:13:53 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
foreach $pat (@patlist) {
|
|
|
|
$patsize = length $pat;
|
|
|
|
for $i (0..$patsize - 1) {
|
2016-05-04 14:44:46 +02:00
|
|
|
for $j (1..$patsize - $i) {
|
|
|
|
$subpat = substr ($pat, $i, $j);
|
|
|
|
if (defined $pattab{$subpat}) {
|
|
|
|
print "$pattab{$subpat} is embedded in $pattab{$pat}\n";
|
|
|
|
$newpat = substr $pat, 0, $i + $j;
|
|
|
|
if (!defined $newpattab{$newpat}) {
|
|
|
|
$newpattab{$newpat} =
|
|
|
|
substr ($pat, 0, $i).$pattab{$subpat};
|
|
|
|
$ss = substr $pat, 0, $i;
|
|
|
|
print "$ss+$pattab{$subpat}\n";
|
|
|
|
push @newpatlist, $newpat;
|
|
|
|
if (defined $repltab{$subpat}) {
|
2010-03-04 13:13:53 +01:00
|
|
|
$begcorr = (($pat =~ /^[.]/) && !($subpat =~ /^[.]/)) ? 1 : 0;
|
|
|
|
$newrepltab{$newpat} = $repltab{$subpat};
|
|
|
|
$newreplbeg{$newpat} = $replbeg{$subpat} + enclen($ss) - $begcorr;
|
|
|
|
$newrepllen{$newpat} = $repllen{$subpat};
|
|
|
|
}
|
2016-05-04 14:44:46 +02:00
|
|
|
} else {
|
|
|
|
$tmp = $newpattab{$newpat};
|
|
|
|
$newpattab{$newpat} =
|
|
|
|
combine ($newpattab{$newpat}, $pattab{$subpat});
|
|
|
|
print "$tmp + $pattab{$subpat} -> $newpattab{$newpat}\n";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2010-03-04 13:13:53 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
foreach $pat (@newpatlist) {
|
|
|
|
if (defined $newrepltab{$pat}) {
|
|
|
|
print OUT $newpattab{$pat}."/".$newrepltab{$pat}.",".($newreplbeg{$pat}+1).",".$newrepllen{$pat}."\n";
|
|
|
|
} else {
|
|
|
|
print OUT $newpattab{$pat}."\n";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#convert 'n1im' to 0n1i0m0 expresed as a list
|
|
|
|
sub expand {
|
|
|
|
my ($pat) = @_;
|
|
|
|
my $last = '.';
|
|
|
|
my @exp = ();
|
|
|
|
|
|
|
|
foreach $c (split (//, $pat)) {
|
2016-05-04 14:44:46 +02:00
|
|
|
if ($last =~ /[\D]/ && $c =~ /[\D]/) {
|
|
|
|
push @exp, 0;
|
|
|
|
}
|
|
|
|
push @exp, $c;
|
|
|
|
$last = $c;
|
2010-03-04 13:13:53 +01:00
|
|
|
}
|
|
|
|
if ($last =~ /[\D]/) {
|
2016-05-04 14:44:46 +02:00
|
|
|
push @exp, 0;
|
2010-03-04 13:13:53 +01:00
|
|
|
}
|
|
|
|
return @exp;
|
|
|
|
}
|
|
|
|
|
|
|
|
# Combine two patterns, i.e. .ad4der + a2d becomes .a2d4der
|
|
|
|
# The second pattern needs to be a substring of the first (modulo digits)
|
|
|
|
sub combine {
|
|
|
|
my @exp = expand shift;
|
|
|
|
my @subexp = expand shift;
|
|
|
|
my $pat1, $pat2;
|
|
|
|
my $i;
|
|
|
|
|
|
|
|
$pat1 = join ('', map { $_ =~ /\d/ ? () : $_ } @exp);
|
|
|
|
$pat2 = join ('', map { $_ =~ /\d/ ? () : $_ } @subexp);
|
|
|
|
|
|
|
|
$begcorr = ($pat1 =~ /^[.]/) ? 1 : 0;
|
|
|
|
|
|
|
|
for $i (0..length ($pat1) - length ($pat2)) {
|
2016-05-04 14:44:46 +02:00
|
|
|
if (substr ($pat1, $i, length $pat2) eq $subpat) {
|
|
|
|
for ($j = 0; $j < @subexp; $j += 2) {
|
|
|
|
if ($subexp[$j] > $exp[2 * $i + $j]) {
|
|
|
|
$exp[2 * $i + $j] = $subexp[$j];
|
2010-03-04 13:13:53 +01:00
|
|
|
if (defined $newrepltab{$pat2} && !defined $newrepltab{$pat1}) {
|
|
|
|
$ss = substr ($pat1, 0, $i);
|
|
|
|
$newrepltab{$pat1} = $newrepltab{$pat2};
|
|
|
|
$newreplbeg{$pat1} = $newreplbeg{$pat2} + enclen($ss) - $begcorr;
|
|
|
|
$newrepllen{$pat1} = $newrepllen{$pat2};
|
|
|
|
}
|
2016-05-04 14:44:46 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
print ("$pat1 includes $pat2 at pos $i\n");
|
|
|
|
}
|
2010-03-04 13:13:53 +01:00
|
|
|
}
|
|
|
|
return join ('', map { $_ eq '0' ? () : $_ } @exp);
|
|
|
|
}
|
|
|
|
|
|
|
|
# 8 bit or UTF-8 character length (calculating right start position for discretionary hyphenation)
|
|
|
|
sub enclen {
|
|
|
|
my $nonchar = 0;
|
|
|
|
my $len = length($_[0]);
|
|
|
|
if ($encoding eq "UTF-8") {
|
|
|
|
# length of an UTF-8 string equals to the count of the characters not started with '10' bits
|
|
|
|
for ($i = 0; $i < $len; $i++) {
|
|
|
|
if ((ord(substr($_[0], $i, 1)) >> 6) == 2) { $nonchar++; }
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return $len - $nonchar;
|
|
|
|
}
|