203 lines
5.2 KiB
C
203 lines
5.2 KiB
C
#include <string.h>
|
|
#include <stdlib.h>
|
|
#include <stdio.h>
|
|
|
|
#include "hyphen.h"
|
|
#include "csutil.h"
|
|
|
|
#define BUFSIZE 1000
|
|
|
|
/*
 * Print the command-line usage summary to stderr.
 *
 * The syntax line lists the two options that main() actually recognises
 * ("-o" and "-d").
 */
void help(void) {
    fprintf(stderr,"correct syntax is:\n");
    fprintf(stderr,"example [-o | -d] hyphen_dictionary_file file_of_words_to_check\n");
    fprintf(stderr,"-o = use old algorithm (without non-standard hyphenation)\n");
    fprintf(stderr,"-d = hyphenation with listing of the possible hyphenations\n");
}
|
|
|
|
/*
 * Get a pointer to the nth 8-bit or UTF-8 character of the word.
 *
 * word: NUL-terminated string to index into.
 * n:    zero-based character index; values <= 0 return word itself.
 * utf8: when non-zero, bytes of the form 10xxxxxx (UTF-8 continuation
 *       bytes) are skipped so that each step advances one whole character;
 *       when zero, every byte counts as one character.
 *
 * Returns a pointer into word.  If word holds fewer than n characters the
 * walk stops at the terminating NUL instead of running past the end of
 * the string.
 */
char * hindex(char * word, int n, int utf8) {
    int j = 0;
    /* the NUL test keeps an oversized n from stepping outside the string */
    while (j < n && *word != '\0') {
        j++;
        word++;
        /* in UTF-8 mode, swallow the trailing bytes of a multi-byte character */
        while (utf8 && ((((unsigned char) *word) >> 6) == 2)) word++;
    }
    return word;
}
|
|
|
|
/* list possible hyphenations with -dd option (example for the usage of the hyphenate2() function) */
|
|
void single_hyphenations(char * word, char * hyphen, char ** rep, int * pos, int * cut, int utf8) {
|
|
int i, k, j = 0;
|
|
char r;
|
|
for (i = 0; (i + 1) < strlen(word); i++) {
|
|
if (utf8 && ((((unsigned char) word[i]) >> 6) == 2)) continue;
|
|
if ((hyphen[j] & 1)) {
|
|
if (rep && rep[j]) {
|
|
k = hindex(word, j - pos[j] + 1, utf8) - word;
|
|
r = word[k];
|
|
word[k] = 0;
|
|
printf(" - %s%s", word, rep[j]);
|
|
word[k] = r;
|
|
printf("%s\n", hindex(word + k, cut[j], utf8));
|
|
} else {
|
|
k = hindex(word, j + 1, utf8) - word;
|
|
r = word[k];
|
|
word[k] = 0;
|
|
printf(" - %s=", word);
|
|
word[k] = r;
|
|
printf("%s\n", word + k);
|
|
}
|
|
}
|
|
j++;
|
|
}
|
|
}
|
|
|
|
int
|
|
main(int argc, char** argv)
|
|
{
|
|
|
|
HyphenDict *dict;
|
|
int df;
|
|
int wtc;
|
|
FILE* wtclst;
|
|
int k, n, i, j, c;
|
|
char buf[BUFSIZE + 1];
|
|
int nHyphCount;
|
|
char *hyphens;
|
|
char *lcword;
|
|
char *hyphword;
|
|
char hword[BUFSIZE * 2];
|
|
int arg = 1;
|
|
int optd = 1;
|
|
int optdd = 0;
|
|
char ** rep;
|
|
int * pos;
|
|
int * cut;
|
|
|
|
/* first parse the command line options */
|
|
/* arg1 - hyphen dictionary file, arg2 - file of words to check */
|
|
|
|
if (argv[arg]) {
|
|
if (strcmp(argv[arg], "-o") == 0) {
|
|
optd = 0;
|
|
arg++;
|
|
}
|
|
if (argv[arg] && strcmp(argv[arg], "-d") == 0) {
|
|
optd = 1;
|
|
optdd = 1;
|
|
arg++;
|
|
}
|
|
}
|
|
|
|
if (argv[arg]) {
|
|
df = arg++;
|
|
} else {
|
|
help();
|
|
exit(1);
|
|
}
|
|
|
|
if (argv[arg]) {
|
|
wtc = arg++;
|
|
} else {
|
|
help();
|
|
exit(1);
|
|
}
|
|
|
|
/* load the hyphenation dictionary */
|
|
if ((dict = hnj_hyphen_load(argv[df])) == NULL) {
|
|
fprintf(stderr, "Couldn't find file %s\n", argv[df]);
|
|
fflush(stderr);
|
|
exit(1);
|
|
}
|
|
|
|
/* open the words to check list */
|
|
wtclst = fopen(argv[wtc],"r");
|
|
if (!wtclst) {
|
|
fprintf(stderr,"Error - could not open file of words to check\n");
|
|
exit(1);
|
|
}
|
|
|
|
|
|
/* now read each word from the wtc file */
|
|
while(fgets(buf,BUFSIZE,wtclst) != NULL) {
|
|
k = strlen(buf);
|
|
if (buf[k - 1] == '\n') buf[k - 1] = '\0';
|
|
if (*buf && buf[k - 2] == '\r') buf[k-- - 2] = '\0';
|
|
|
|
/* set aside some buffers to hold lower cased */
|
|
/* and hyphen information */
|
|
lcword = (char *) malloc(k+1);
|
|
hyphens = (char *)malloc(k+5);
|
|
if (dict->utf8) {
|
|
strcpy(lcword, buf);
|
|
} else {
|
|
enmkallsmall(lcword,buf,dict->cset);
|
|
}
|
|
|
|
/* first remove any trailing periods */
|
|
n = k-1;
|
|
while((n >=0) && (lcword[n] == '.')) n--;
|
|
n++;
|
|
|
|
/* now actually try to hyphenate the word */
|
|
|
|
rep = NULL;
|
|
pos = NULL;
|
|
cut = NULL;
|
|
hword[0] = '\0';
|
|
|
|
if ((!optd && hnj_hyphen_hyphenate(dict, lcword, n-1, hyphens)) ||
|
|
(optd && hnj_hyphen_hyphenate2(dict, lcword, n-1, hyphens, hword, &rep, &pos, &cut))) {
|
|
free(hyphens);
|
|
free(lcword);
|
|
fprintf(stderr, "hyphenation error\n");
|
|
exit(1);
|
|
}
|
|
|
|
if (!optd) {
|
|
/* now backfill hyphens[] for any removed periods */
|
|
for (c = n; c < k; c++) hyphens[c] = '0';
|
|
hyphens[k] = '\0';
|
|
|
|
/* now create a new char string showing hyphenation positions */
|
|
/* count the hyphens and allocate space for the new hypehanted string */
|
|
nHyphCount = 0;
|
|
for (i = 0; i < n; i++)
|
|
if (hyphens[i]&1)
|
|
nHyphCount++;
|
|
hyphword = (char *) malloc(k+1+nHyphCount);
|
|
j = 0;
|
|
for (i = 0; i < n; i++) {
|
|
hyphword[j++] = buf[i];
|
|
if (hyphens[i]&1) {
|
|
hyphword[j++] = '-';
|
|
}
|
|
}
|
|
hyphword[j] = '\0';
|
|
fprintf(stdout,"%s\n",hyphword);
|
|
fflush(stdout);
|
|
free(hyphword);
|
|
} else {
|
|
/* fprintf(stderr, "vasz: %s", hyphens); */
|
|
fprintf(stdout,"%s\n", hword);
|
|
|
|
|
|
if (optdd) single_hyphenations(lcword, hyphens, rep, pos, cut, dict->utf8);
|
|
if (rep) {
|
|
for (i = 0; i < n - 1; i++) {
|
|
if (rep[i]) free(rep[i]);
|
|
}
|
|
free(rep);
|
|
free(pos);
|
|
free(cut);
|
|
}
|
|
}
|
|
free(hyphens);
|
|
free(lcword);
|
|
}
|
|
|
|
fclose(wtclst);
|
|
hnj_hyphen_free(dict);
|
|
return 0;
|
|
}
|