#!/usr/bin/perl # convert TeX (Patgen) hyphenation patterns to Libhnj format # (A utility for finding substring embeddings in patterns) # usage: substrings.pl inputfile outputfile [encoding] if (!defined $ARGV[1]) { print "" . "substrings.pl - convert TeX (Patgen) hyphenation patterns to Libhnj format\n" . "(A utility for finding substring embeddings in patterns)\n" . "usage: substrings.pl infile outfile [encoding [lefthyphenmin [righthyphenmin]]]\n"; exit 1; } $fn = $ARGV[0]; if (!-e $fn) { $fn = "hyphen.us"; } open HYPH, $fn; open OUT, ">$ARGV[1]"; $encoding = $ARGV[2]; $lhmin = $ARGV[3]; $rhmin = $ARGV[4]; if (defined $encoding) { print OUT "$encoding\n"; } if (defined $lhmin) { print OUT "LEFTHYPHENMIN $lhmin\n"; } if (defined $rhmin) { print OUT "RIGHTHYPHENMIN $rhmin\n"; } # keep header while () { if (/^%/) { print OUT "$_"; } else { last; } } while () { if (/^%/) { #comment, ignore } else { $_ =~ s/%.*$//g; for (split ' ', $_) { if (/^(.+)\/([^,]+),([0-9]+),([0-9]+)$/) { $origpat = $1; $pat = $1; $repl = $2; $beg = $3; $len = $4; $pat =~ s/\d//g; if ($origpat eq $pat) { print "error - missing hyphenation point: $_"; exit 1; } push @patlist, $pat; $pattab{$pat} = $origpat; $repltab{$pat} = $repl; $replbeg{$pat} = $beg - 1; $repllen{$pat} = $len; } elsif (/^(.+)\/(.+)$/) { $origpat = $1; $pat = $1; $repl = $2; $pat =~ s/\d//g; if ($origpat eq $pat) { print "error - missing hyphenation point: $_"; exit 1; } push @patlist, $pat; $pattab{$pat} = $origpat; $repltab{$pat} = $repl; $replbeg{$pat} = 0; $repllen{$pat} = enclen($pat); } elsif (/^(.+)$/) { $origpat = $1; $pat = $1; $pat =~ s/\d//g; push @patlist, $pat; $pattab{$pat} = $origpat; } } } } foreach $pat (@patlist) { $patsize = length $pat; for $i (0..$patsize - 1) { for $j (1..$patsize - $i) { $subpat = substr ($pat, $i, $j); if (defined $pattab{$subpat}) { print "$pattab{$subpat} is embedded in $pattab{$pat}\n"; $newpat = substr $pat, 0, $i + $j; if (!defined $newpattab{$newpat}) { $newpattab{$newpat} = substr ($pat, 0, $i).$pattab{$subpat}; $ss = substr $pat, 0, $i; print "$ss+$pattab{$subpat}\n"; push @newpatlist, $newpat; if (defined $repltab{$subpat}) { $begcorr = (($pat =~ /^[.]/) && !($subpat =~ /^[.]/)) ? 1 : 0; $newrepltab{$newpat} = $repltab{$subpat}; $newreplbeg{$newpat} = $replbeg{$subpat} + enclen($ss) - $begcorr; $newrepllen{$newpat} = $repllen{$subpat}; } } else { $tmp = $newpattab{$newpat}; $newpattab{$newpat} = combine ($newpattab{$newpat}, $pattab{$subpat}); print "$tmp + $pattab{$subpat} -> $newpattab{$newpat}\n"; } } } } } foreach $pat (@newpatlist) { if (defined $newrepltab{$pat}) { print OUT $newpattab{$pat}."/".$newrepltab{$pat}.",".($newreplbeg{$pat}+1).",".$newrepllen{$pat}."\n"; } else { print OUT $newpattab{$pat}."\n"; } } #convert 'n1im' to 0n1i0m0 expresed as a list sub expand { my ($pat) = @_; my $last = '.'; my @exp = (); foreach $c (split (//, $pat)) { if ($last =~ /[\D]/ && $c =~ /[\D]/) { push @exp, 0; } push @exp, $c; $last = $c; } if ($last =~ /[\D]/) { push @exp, 0; } return @exp; } # Combine two patterns, i.e. .ad4der + a2d becomes .a2d4der # The second pattern needs to be a substring of the first (modulo digits) sub combine { my @exp = expand shift; my @subexp = expand shift; my $pat1, $pat2; my $i; $pat1 = join ('', map { $_ =~ /\d/ ? () : $_ } @exp); $pat2 = join ('', map { $_ =~ /\d/ ? () : $_ } @subexp); $begcorr = ($pat1 =~ /^[.]/) ? 1 : 0; for $i (0..length ($pat1) - length ($pat2)) { if (substr ($pat1, $i, length $pat2) eq $subpat) { for ($j = 0; $j < @subexp; $j += 2) { if ($subexp[$j] > $exp[2 * $i + $j]) { $exp[2 * $i + $j] = $subexp[$j]; if (defined $newrepltab{$pat2} && !defined $newrepltab{$pat1}) { $ss = substr ($pat1, 0, $i); $newrepltab{$pat1} = $newrepltab{$pat2}; $newreplbeg{$pat1} = $newreplbeg{$pat2} + enclen($ss) - $begcorr; $newrepllen{$pat1} = $newrepllen{$pat2}; } } } print ("$pat1 includes $pat2 at pos $i\n"); } } return join ('', map { $_ eq '0' ? () : $_ } @exp); } # 8 bit or UTF-8 character length (calculating right start position for discretionary hyphenation) sub enclen { my $nonchar = 0; my $len = length($_[0]); if ($encoding eq "UTF-8") { # length of an UTF-8 string equals to the count of the characters not started with '10' bits for ($i = 0; $i < $len; $i++) { if ((ord(substr($_[0], $i, 1)) >> 6) == 2) { $nonchar++; } } } return $len - $nonchar; }