hyphen/substrings.c

263 lines
6.3 KiB
C

//
// A utility for finding substring embeddings in patterns
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#define MAXPATHS (256*1024)
//
//
static void die(
const char*msg
) {
fprintf(stderr,"%s\n",msg);
exit(1);
}
// Finds the index of an entry, only used on xxx_key arrays
// Caveat: the table has to be sorted
static int find_in(
char *tab[],
int max,
const char *pat
) {
int left=0, right=max-1;
while (left <= right) {
int mid = ((right-left)/2)+left;
int v = strcmp(pat,tab[mid]);
if (v>0) {
left = mid + 1;
} else if (v<0) {
right = mid -1;
} else {
return mid;
}
}
return -1;
}
// used by partition (which is used by qsort_arr)
//
static void swap2(
char *a[],
char *b[],
int i,
int j
) {
if (i==j) return;
char*v;
v=a[i]; a[i]=a[j]; a[j]=v;
v=b[i]; b[i]=b[j]; b[j]=v;
}
// used by qsort_arr
//
static int partition(
char *a[],
char *b[],
int left,
int right,
int p
) {
const char *pivotValue = a[p];
int i;
swap2(a,b,p,right); // Move pivot to end
p = left;
for (i=left; i<right; i++) {
if (strcmp(a[i],pivotValue)<=0) {
swap2(a,b,p,i);
p++;
}
}
swap2(a,b,right,p); // Move pivot to its final place
return p;
}
//
//
static void qsort_arr(
char *a[],
char *b[],
int left,
int right
) {
while (right > left) {
int p = left + (right-left)/2; //select a pivot
p = partition(a,b, left, right, p);
if ((p-1) - left < right - (p+1)) {
qsort_arr(a,b, left, p-1);
left = p+1;
} else {
qsort_arr(a,b, p+1, right);
right = p-1;
}
}
}
// Removes extra '0' entries from the string
//
static char* compact(
char *expr
) {
int l=strlen(expr);
int i,j;
for (i=0,j=0; i<l; i++) {
if (expr[i]!='0') expr[j++] = expr[i];
}
expr[j]=0;
return expr;
}
// convert 'n1im' to 0n1i0m0 expressed as a string
//
static void expand(
char *expr,
const char *pat,
int l
) {
int el = 0;
char last = '.';
int i;
for (i=0; i<l; i++) {
char c = pat[i];
if ( (last<'0' || last>'9')
&& (c <'0' || c >'9')
) {
expr[el++] = '0';
}
expr[el++] = c;
last = c;
}
if (last<'0' || last>'9') expr[el++] = '0';
expr[el]=0;
}
// Combine two patterns, i.e. .ad4der + a2d becomes .a2d4der
// The second pattern needs to be a right side match of the first
// (modulo digits)
static char *combine(
char *expr,
const char *subexpr
) {
int l1 = strlen(expr);
int l2 = strlen(subexpr);
int off = l1-l2;
int j;
// this works also for utf8 sequences because the substring is identical
// to the last substring-length bytes of expr except for the (single byte)
// hyphenation encoders
for (j=0; j<l2; j++) {
if (subexpr[j]>expr[off+j]) {
expr[off+j] = subexpr[j];
}
}
return expr;
}
static char *pattab_key[MAXPATHS];
static char *pattab_val[MAXPATHS];
static char *newpattab_key[MAXPATHS];
static char *newpattab_val[MAXPATHS];
int main(int argc, const char* argv[]) {
FILE *in, *out;
int patterns = 0;
int newpatterns = 0;
char format[132]; // 64+65+newline+zero+spare
int p;
if (argc!=3) die("Usage: <orig-file> <new-file>\n");
if ((in = fopen(argv[1],"r"))==NULL) die("Could not read input");
if ((out = fopen(argv[2],"w"))==NULL) die("Could not create output");
// read all patterns and split in pure text (_key) & expanded patterns (_val)
while(fgets(format,132,in) != NULL) {
int l = strlen(format);
if (format[l-1]=='\n') { l--; format[l]=0; } // Chomp
if (format[0]=='%' || format[0]==0) {
// skip
} else {
if (format[l-1]=='%') {
l--;
format[l] = 0; // remove '%'
}
int i,j;
char *pat = (char*) malloc(l+1);
char *org = (char*) malloc(l*2+1);
if (pat==NULL || org==NULL) die("not enough memory");
expand(org,format,l);
// remove hyphenation encoders (digits) from pat
for (i=0,j=0; i<l; i++) {
// odd, but utf-8 proof
char c = format[i];
if (c<'0' || c>'9') pat[j++]=c;
}
pat[j]=0;
pattab_key[patterns] = pat;
pattab_val[patterns++] = org;
if (patterns>MAXPATHS) die("to many base patterns");
}
}
fclose(in);
// As we use binairy search, make sure it is sorted
qsort_arr(pattab_key,pattab_val,0,patterns-1);
for (p=0; p<patterns; p++) {
char *pat = pattab_key[p];
int patsize = strlen(pat);
int j,l;
for (l=1; l<=patsize; l++) {
for (j=1; j<=l; j++) {
int i = l-j;
int subpat_ndx;
char subpat[132];
strncpy(subpat,pat+i,j); subpat[j]=0;
if ((subpat_ndx = find_in(pattab_key,patterns,subpat))>=0) {
int newpat_ndx;
char *newpat=malloc(l+1);
if (newpat==NULL) die("not enough memory");
//printf("%s is embedded in %s\n",pattab_val[subpat_ndx],pattab_val[p]);
strncpy(newpat, pat+0,l); newpat[l]=0;
if ((newpat_ndx = find_in(newpattab_key,newpatterns,newpat))<0) {
char *neworg = malloc(132); // TODO: compute exact length
if (neworg==NULL) die("not enough memory");
expand(neworg,newpat,l);
newpattab_key[newpatterns] = newpat;
newpattab_val[newpatterns++] = combine(neworg,pattab_val[subpat_ndx]);
if (newpatterns>MAXPATHS) die("to many new patterns");
//printf("%*.*s|%*.*s[%s] (%s|%s) = %s\n",i,i,pat,j,j,pat+i,pat+i+j,pattab_val[p],pattab_val[subpat_ndx],neworg);
} else {
free(newpat);
newpattab_val[newpat_ndx] = combine(
newpattab_val[newpat_ndx], pattab_val[subpat_ndx] );
}
}
}
}
}
/* for some tiny extra speed, one could forget the free()s
* as the memory is freed anyway on exit().
* However, the gain is minimal and now the code can be cleanly
* incorporated into other code */
for (p=0; p<newpatterns; p++) {
fprintf(out,"%s\n",compact(newpattab_val[p]));
free(newpattab_key[p]);
free(newpattab_val[p]);
}
fclose(out);
for (p=0; p<patterns; p++) {
free(pattab_key[p]);
free(pattab_val[p]);
}
return 0;
}