Add hnj_hyphen_load_data, to load dictionaries from a raw in-memory data block
This commit is contained in:
parent
e359ee4ddd
commit
dfa8212e06
159
hyphen.c
159
hyphen.c
|
@ -523,6 +523,165 @@ for (k = 0; k < 2; k++) {
|
||||||
return dict[0];
|
return dict[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * fgets() work-alike over an in-memory buffer, used by hnj_hyphen_load_data.
 *
 * Reads one line from *stream (length *streamLen, need not be
 * NUL-terminated) into s: at most n-1 characters are copied, reading
 * stops after the first '\n' (which is kept, as fgets does), and s is
 * always NUL-terminated on success. *stream is advanced and *streamLen
 * reduced by the number of bytes consumed, so repeated calls walk
 * through the data block.
 *
 * Returns s on success, or NULL when the buffer is exhausted (EOF).
 */
static char *hnj_hyphen_load_data_fgets(char * s, int n, const char **stream, size_t *streamLen) {
  const char *ptr = *stream;
  size_t pos = 0;
  /* copy up to n-1 characters, stopping before a newline */
  while (n > 1 && pos < *streamLen && ptr[pos] != '\n') {
    ++ pos;
    -- n;
  }
  /* consume the newline too, like fgets; testing ptr[pos] (not ptr[0])
     also makes a blank line yield "\n" and advance the stream, instead
     of being misreported as EOF with the stream stuck in place */
  if (n > 1 && pos < *streamLen && ptr[pos] == '\n') {
    ++ pos;
    -- n;
  }
  if (pos > 0) {
    strncpy(s, *stream, pos);
    s[pos] = '\0'; /* strncpy does not terminate; n > 1 guarantees room */
    *stream = (*stream + pos);
    *streamLen = (*streamLen - pos);
    return s;
  }
  return NULL;
}
|
||||||
|
|
||||||
|
/*
 * Load a hyphenation dictionary from an in-memory data block (same text
 * format as the .dic files read by hnj_hyphen_load_file; an optional
 * second pattern level may follow a NEXTLEVEL keyword line).
 *
 * fdata: raw dictionary text (first line is the character set name,
 *        e.g. "UTF-8"; subsequent lines are patterns)
 * flen:  number of bytes in fdata
 *
 * Returns a newly allocated HyphenDict; free with hnj_hyphen_free.
 * NOTE(review): hnj_malloc results are used unchecked here — presumably
 * hnj_malloc aborts on failure; confirm against its definition.
 */
HyphenDict *
hnj_hyphen_load_data (const char *fdata, size_t flen)
{
  HyphenDict *dict[2];
  HashTab *hashtab;
  char buf[MAX_CHARS];
  int nextlevel = 0;
  int i, j, k;
  HashEntry *e;
  int state_num = 0;
  /* loading one or two dictionaries (separated by NEXTLEVEL keyword) */
  for (k = 0; k < 2; k++) {
    /* hash table maps pattern-prefix strings to state numbers while the
       trie/FSM for level k is being built */
    hashtab = hnj_hash_new ();
#ifdef VERBOSE
    global[k] = hashtab;
#endif
    /* state 0 is the start state (empty prefix) */
    hnj_hash_insert (hashtab, "", 0);
    dict[k] = (HyphenDict *) hnj_malloc (sizeof(HyphenDict));
    dict[k]->num_states = 1;
    dict[k]->states = (HyphenState *) hnj_malloc (sizeof(HyphenState));
    dict[k]->states[0].match = NULL;
    dict[k]->states[0].repl = NULL;
    dict[k]->states[0].fallback_state = -1;
    dict[k]->states[0].num_trans = 0;
    dict[k]->states[0].trans = NULL;
    dict[k]->nextlevel = NULL;
    dict[k]->lhmin = 0;
    dict[k]->rhmin = 0;
    dict[k]->clhmin = 0;
    dict[k]->crhmin = 0;
    dict[k]->nohyphen = NULL;
    dict[k]->nohyphenl = 0;

    /* read in character set info */
    if (k == 0) {
      /* pre-zero so the (possibly unterminated) line read below still
         leaves cset a valid C string */
      for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
      if (hnj_hyphen_load_data_fgets(dict[k]->cset, sizeof(dict[k]->cset), &fdata, &flen) != NULL) {
        /* strip CR/LF so "UTF-8\n" compares equal to "UTF-8" */
        for (i=0;i<MAX_NAME;i++)
          if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
            dict[k]->cset[i] = 0;
      } else {
        dict[k]->cset[0] = 0;
      }
      dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
    } else {
      /* second level inherits the first level's character set */
      strncpy(dict[k]->cset, dict[0]->cset, sizeof(dict[k]->cset)-1);
      dict[k]->cset[sizeof(dict[k]->cset)-1] = '\0';
      dict[k]->utf8 = dict[0]->utf8;
    }

    if (k == 0 || nextlevel) {
      /* read pattern lines until EOF or a NEXTLEVEL marker;
         '%' starts a comment line */
      while (hnj_hyphen_load_data_fgets (buf, sizeof(buf), &fdata, &flen) != NULL) {
        if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
          nextlevel = 1;
          break;
        } else if (buf[0] != '%') hnj_hyphen_load_line(buf, dict[k], hashtab);
      }
    } else if (k == 1) {
      /* no NEXTLEVEL in the data: synthesize a default first level.
         default first level: hyphen and ASCII apostrophe */
      if (!dict[0]->utf8) hnj_hyphen_load_line("NOHYPHEN ',-\n", dict[k], hashtab);
      else hnj_hyphen_load_line("NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99,-\n", dict[k], hashtab);
      strncpy(buf, "1-1\n", MAX_CHARS-1); /* buf rewritten by hnj_hyphen_load here */
      buf[MAX_CHARS-1] = '\0';
      hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */
      hnj_hyphen_load_line("1'1\n", dict[k], hashtab); /* ASCII apostrophe */
      if (dict[0]->utf8) {
        hnj_hyphen_load_line("1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */
        hnj_hyphen_load_line("1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */
      }
    }

    /* Could do unioning of matches here (instead of the preprocessor script).
       If we did, the pseudocode would look something like this:

       foreach state in the hash table
          foreach i = [1..length(state) - 1]
             state to check is substr (state, i)
             look it up
             if found, and if there is a match, union the match in.

       It's also possible to avoid the quadratic blowup by doing the
       search in order of increasing state string sizes - then you
       can break the loop after finding the first match.

       This step should be optional in any case - if there is a
       preprocessed rule table, it's always faster to use that.
     */

    /* put in the fallback states */
    for (i = 0; i < HASH_SIZE; i++)
      for (e = hashtab->entries[i]; e; e = e->next)
        {
          /* find the longest proper suffix of the key that is itself a
             state; lookup of "" (state 0) always succeeds, so the loop
             terminates */
          if (*(e->key)) for (j = 1; 1; j++)
            {
              state_num = hnj_hash_lookup (hashtab, e->key + j);
              if (state_num >= 0)
                break;
            }
          /* KBH: FIXME state 0 fallback_state should always be -1? */
          if (e->val)
            dict[k]->states[e->val].fallback_state = state_num;
        }
#ifdef VERBOSE
    for (i = 0; i < HASH_SIZE; i++)
      for (e = hashtab->entries[i]; e; e = e->next)
        {
          printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
                  dict[k]->states[e->val].fallback_state);
          for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
            printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
                    dict[k]->states[e->val].trans[j].new_state);
        }
#endif

#ifndef VERBOSE
    hnj_hash_free (hashtab);
#endif
    state_num = 0;
  }
  if (nextlevel) dict[0]->nextlevel = dict[1];
  else {
    /* single-level input: the loaded patterns (dict[0]) become the word
       level chained behind the synthesized default first level (dict[1]) */
    dict[1] -> nextlevel = dict[0];
    dict[1]->lhmin = dict[0]->lhmin;
    dict[1]->rhmin = dict[0]->rhmin;
    /* compound-word minima default to the plain minima, or 3 */
    dict[1]->clhmin = (dict[0]->clhmin) ? dict[0]->clhmin : ((dict[0]->lhmin) ? dict[0]->lhmin : 3);
    dict[1]->crhmin = (dict[0]->crhmin) ? dict[0]->crhmin : ((dict[0]->rhmin) ? dict[0]->rhmin : 3);
#ifdef VERBOSE
    HashTab *r = global[0];
    global[0] = global[1];
    global[1] = r;
#endif
    return dict[1];
  }
  return dict[0];
}
|
||||||
|
|
||||||
void hnj_hyphen_free (HyphenDict *dict)
|
void hnj_hyphen_free (HyphenDict *dict)
|
||||||
{
|
{
|
||||||
int state_num;
|
int state_num;
|
||||||
|
|
1
hyphen.h
1
hyphen.h
|
@ -97,6 +97,7 @@ struct _HyphenTrans {
|
||||||
|
|
||||||
HyphenDict *hnj_hyphen_load (const char *fn);
|
HyphenDict *hnj_hyphen_load (const char *fn);
|
||||||
HyphenDict *hnj_hyphen_load_file (FILE *f);
|
HyphenDict *hnj_hyphen_load_file (FILE *f);
|
||||||
|
HyphenDict *hnj_hyphen_load_data (const char *fdata, size_t flen);
|
||||||
void hnj_hyphen_free (HyphenDict *dict);
|
void hnj_hyphen_free (HyphenDict *dict);
|
||||||
|
|
||||||
/* obsolete, use hnj_hyphen_hyphenate2() or *hyphenate3() functions) */
|
/* obsolete, use hnj_hyphen_hyphenate2() or *hyphenate3() functions) */
|
||||||
|
|
Loading…
Reference in New Issue