Added hnj_hyphen_load_data, to load dicts from raw data block

This commit is contained in:
SBKarr 2016-08-15 16:53:04 +08:00
parent e359ee4ddd
commit dfa8212e06
2 changed files with 188 additions and 28 deletions

203
hyphen.c
View File

@ -3,8 +3,8 @@
*/ */
/* LibHnj - a library for high quality hyphenation and justification /* LibHnj - a library for high quality hyphenation and justification
* Copyright (C) 1998 Raph Levien, * Copyright (C) 1998 Raph Levien,
* (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),
* (C) 2001 Peter Novodvorsky (nidd@cs.msu.su) * (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
* (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo) * (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo)
* *
@ -19,8 +19,8 @@
* Library General Public License for more details. * Library General Public License for more details.
* *
* You should have received a copy of the GNU Library General Public * You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the * License along with this library; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307 USA. * Boston, MA 02111-1307 USA.
*/ */
@ -281,7 +281,7 @@ void hnj_hyphen_load_line(char * buf, HyphenDict * dict, HashTab * hashtab) {
} }
} }
return; return;
} }
j = 0; j = 0;
pattern[j] = '0'; pattern[j] = '0';
repl = strchr(buf, '/'); repl = strchr(buf, '/');
@ -296,7 +296,7 @@ void hnj_hyphen_load_line(char * buf, HyphenDict * dict, HashTab * hashtab) {
if (index2) { if (index2) {
*index2 = '\0'; *index2 = '\0';
replindex = (signed char) atoi(index + 1) - 1; replindex = (signed char) atoi(index + 1) - 1;
replcut = (signed char) atoi(index2 + 1); replcut = (signed char) atoi(index2 + 1);
} }
} else { } else {
hnj_strchomp(repl + 1); hnj_strchomp(repl + 1);
@ -397,7 +397,7 @@ hnj_hyphen_load_file (FILE *f)
HashEntry *e; HashEntry *e;
int state_num = 0; int state_num = 0;
// loading one or two dictionaries (separated by NEXTLEVEL keyword) // loading one or two dictionaries (separated by NEXTLEVEL keyword)
for (k = 0; k < 2; k++) { for (k = 0; k < 2; k++) {
hashtab = hnj_hash_new (); hashtab = hnj_hash_new ();
#ifdef VERBOSE #ifdef VERBOSE
global[k] = hashtab; global[k] = hashtab;
@ -480,7 +480,166 @@ for (k = 0; k < 2; k++) {
for (e = hashtab->entries[i]; e; e = e->next) for (e = hashtab->entries[i]; e; e = e->next)
{ {
if (*(e->key)) for (j = 1; 1; j++) if (*(e->key)) for (j = 1; 1; j++)
{ {
state_num = hnj_hash_lookup (hashtab, e->key + j);
if (state_num >= 0)
break;
}
/* KBH: FIXME state 0 fallback_state should always be -1? */
if (e->val)
dict[k]->states[e->val].fallback_state = state_num;
}
#ifdef VERBOSE
for (i = 0; i < HASH_SIZE; i++)
for (e = hashtab->entries[i]; e; e = e->next)
{
printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
dict[k]->states[e->val].fallback_state);
for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
dict[k]->states[e->val].trans[j].new_state);
}
#endif
#ifndef VERBOSE
hnj_hash_free (hashtab);
#endif
state_num = 0;
}
if (nextlevel) dict[0]->nextlevel = dict[1];
else {
dict[1] -> nextlevel = dict[0];
dict[1]->lhmin = dict[0]->lhmin;
dict[1]->rhmin = dict[0]->rhmin;
dict[1]->clhmin = (dict[0]->clhmin) ? dict[0]->clhmin : ((dict[0]->lhmin) ? dict[0]->lhmin : 3);
dict[1]->crhmin = (dict[0]->crhmin) ? dict[0]->crhmin : ((dict[0]->rhmin) ? dict[0]->rhmin : 3);
#ifdef VERBOSE
HashTab *r = global[0];
global[0] = global[1];
global[1] = r;
#endif
return dict[1];
}
return dict[0];
}
/*
 * fgets()-like line reader over an in-memory data block.
 *
 * Copies at most n - 1 bytes from *stream into s, stopping after the first
 * '\n' (which is kept in the output, as fgets() does) or when the block is
 * exhausted, and always NUL-terminates s.  On success *stream is advanced
 * past the consumed bytes and *streamLen is reduced by the same amount.
 *
 *   s          destination buffer of at least n bytes
 *   n          capacity of s, including room for the terminating NUL
 *   stream     in/out: current read position inside the data block
 *   streamLen  in/out: number of bytes still available at *stream
 *
 * Returns s when at least one byte was consumed.  Returns NULL when the
 * block is exhausted or when n leaves no room for data (n <= 1); in that
 * case s, *stream and *streamLen are left untouched.
 */
static char *hnj_hyphen_load_data_fgets(char * s, int n, const char **stream, size_t *streamLen) {
  const char *ptr = *stream;
  size_t avail = *streamLen;
  size_t limit;
  size_t pos = 0;

  /* Check the remaining length before touching ptr: an exhausted block
     must not be dereferenced at all. */
  if (n <= 1 || avail == 0)
    return NULL;

  /* Never copy more than fits in s (minus the NUL) or than is available. */
  limit = (size_t) (n - 1);
  if (limit > avail)
    limit = avail;

  while (pos < limit && ptr[pos] != '\n')
    ++pos;

  /* The scan stopped before the limit, so it stopped on '\n': keep the
     newline.  This also makes a blank line yield "\n" instead of being
     mistaken for end of data. */
  if (pos < limit)
    ++pos;

  memcpy(s, ptr, pos);
  s[pos] = '\0'; /* pos <= n - 1, so the terminator always fits */

  *stream = ptr + pos;
  *streamLen = avail - pos;
  return s;
}
HyphenDict *
hnj_hyphen_load_data (const char *fdata, size_t flen)
{
HyphenDict *dict[2];
HashTab *hashtab;
char buf[MAX_CHARS];
int nextlevel = 0;
int i, j, k;
HashEntry *e;
int state_num = 0;
// loading one or two dictionaries (separated by NEXTLEVEL keyword)
for (k = 0; k < 2; k++) {
hashtab = hnj_hash_new ();
#ifdef VERBOSE
global[k] = hashtab;
#endif
hnj_hash_insert (hashtab, "", 0);
dict[k] = (HyphenDict *) hnj_malloc (sizeof(HyphenDict));
dict[k]->num_states = 1;
dict[k]->states = (HyphenState *) hnj_malloc (sizeof(HyphenState));
dict[k]->states[0].match = NULL;
dict[k]->states[0].repl = NULL;
dict[k]->states[0].fallback_state = -1;
dict[k]->states[0].num_trans = 0;
dict[k]->states[0].trans = NULL;
dict[k]->nextlevel = NULL;
dict[k]->lhmin = 0;
dict[k]->rhmin = 0;
dict[k]->clhmin = 0;
dict[k]->crhmin = 0;
dict[k]->nohyphen = NULL;
dict[k]->nohyphenl = 0;
/* read in character set info */
if (k == 0) {
for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
if (hnj_hyphen_load_data_fgets(dict[k]->cset, sizeof(dict[k]->cset), &fdata, &flen) != NULL) {
for (i=0;i<MAX_NAME;i++)
if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
dict[k]->cset[i] = 0;
} else {
dict[k]->cset[0] = 0;
}
dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
} else {
strncpy(dict[k]->cset, dict[0]->cset, sizeof(dict[k]->cset)-1);
dict[k]->cset[sizeof(dict[k]->cset)-1] = '\0';
dict[k]->utf8 = dict[0]->utf8;
}
if (k == 0 || nextlevel) {
while (hnj_hyphen_load_data_fgets (buf, sizeof(buf), &fdata, &flen) != NULL) {
if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
nextlevel = 1;
break;
} else if (buf[0] != '%') hnj_hyphen_load_line(buf, dict[k], hashtab);
}
} else if (k == 1) {
/* default first level: hyphen and ASCII apostrophe */
if (!dict[0]->utf8) hnj_hyphen_load_line("NOHYPHEN ',-\n", dict[k], hashtab);
else hnj_hyphen_load_line("NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99,-\n", dict[k], hashtab);
strncpy(buf, "1-1\n", MAX_CHARS-1); // buf rewritten by hnj_hyphen_load here
buf[MAX_CHARS-1] = '\0';
hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */
hnj_hyphen_load_line("1'1\n", dict[k], hashtab); /* ASCII apostrophe */
if (dict[0]->utf8) {
hnj_hyphen_load_line("1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */
hnj_hyphen_load_line("1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */
}
}
/* Could do unioning of matches here (instead of the preprocessor script).
If we did, the pseudocode would look something like this:
foreach state in the hash table
foreach i = [1..length(state) - 1]
state to check is substr (state, i)
look it up
if found, and if there is a match, union the match in.
It's also possible to avoid the quadratic blowup by doing the
search in order of increasing state string sizes - then you
can break the loop after finding the first match.
This step should be optional in any case - if there is a
preprocessed rule table, it's always faster to use that.
*/
/* put in the fallback states */
for (i = 0; i < HASH_SIZE; i++)
for (e = hashtab->entries[i]; e; e = e->next)
{
if (*(e->key)) for (j = 1; 1; j++)
{
state_num = hnj_hash_lookup (hashtab, e->key + j); state_num = hnj_hash_lookup (hashtab, e->key + j);
if (state_num >= 0) if (state_num >= 0)
break; break;
@ -597,7 +756,7 @@ int hnj_hyphen_hyphenate (HyphenDict *dict,
/* KBH: FIXME shouldn't this be as follows? */ /* KBH: FIXME shouldn't this be as follows? */
state = 0; state = 0;
goto try_next_letter; goto try_next_letter;
} }
#ifdef VERBOSE #ifdef VERBOSE
char *state_str; char *state_str;
@ -669,8 +828,8 @@ int hnj_hyphen_hyphenate (HyphenDict *dict,
hyphens[word_size] = '\0'; hyphens[word_size] = '\0';
hnj_free (prep_word); hnj_free (prep_word);
return 0; return 0;
} }
/* Unicode ligature length */ /* Unicode ligature length */
@ -783,7 +942,7 @@ int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
int offset; int offset;
int * matchlen; int * matchlen;
int * matchindex; int * matchindex;
char ** matchrepl; char ** matchrepl;
int isrepl = 0; int isrepl = 0;
int nHyphCount; int nHyphCount;
@ -795,7 +954,7 @@ int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
j = 0; j = 0;
prep_word[j++] = '.'; prep_word[j++] = '.';
for (i = 0; i < word_size; i++) { for (i = 0; i < word_size; i++) {
if (word[i] <= '9' && word[i] >= '0') { if (word[i] <= '9' && word[i] >= '0') {
prep_word[j++] = '.'; prep_word[j++] = '.';
@ -810,7 +969,7 @@ int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
prep_word[j] = '\0'; prep_word[j] = '\0';
for (i = 0; i < j; i++) for (i = 0; i < j; i++)
hyphens[i] = '0'; hyphens[i] = '0';
#ifdef VERBOSE #ifdef VERBOSE
printf ("prep_word = %s\n", prep_word); printf ("prep_word = %s\n", prep_word);
@ -829,7 +988,7 @@ int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
/* KBH: FIXME shouldn't this be as follows? */ /* KBH: FIXME shouldn't this be as follows? */
state = 0; state = 0;
goto try_next_letter; goto try_next_letter;
} }
#ifdef VERBOSE #ifdef VERBOSE
char *state_str; char *state_str;
@ -892,7 +1051,7 @@ int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
} }
} }
} }
} }
/* KBH: we need this to make sure we keep looking in a word */ /* KBH: we need this to make sure we keep looking in a word */
@ -926,7 +1085,7 @@ int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
nHyphCount++; nHyphCount++;
j = 0; j = 0;
for (i = 0; i < word_size; i++) { for (i = 0; i < word_size; i++) {
if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) { if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) {
if (rep && pos && cut) { if (rep && pos && cut) {
if (!*rep) if (!*rep)
*rep = (char **) calloc(word_size, sizeof(char *)); *rep = (char **) calloc(word_size, sizeof(char *));
@ -961,7 +1120,7 @@ int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
cut2 = (int*) hnj_malloc (word_size * sizeof(int)); cut2 = (int*) hnj_malloc (word_size * sizeof(int));
hyphens2 = (char*) hnj_malloc (word_size + 3); hyphens2 = (char*) hnj_malloc (word_size + 3);
for (i = 0; i < word_size; i++) rep2[i] = NULL; for (i = 0; i < word_size; i++) rep2[i] = NULL;
for (i = 0; i < word_size; i++) if for (i = 0; i < word_size; i++) if
(hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) { (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
if (i - begin > 0) { if (i - begin > 0) {
int hyph = 0; int hyph = 0;
@ -1009,7 +1168,7 @@ int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
begin = i + 1; begin = i + 1;
for (j = 0; j < word_size; j++) rep2[j] = NULL; for (j = 0; j < word_size; j++) rep2[j] = NULL;
} }
// non-compound // non-compound
if (begin == 0) { if (begin == 0) {
hnj_hyphen_hyph_(dict->nextlevel, word, word_size, hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
@ -1019,7 +1178,7 @@ int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens, if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
rep, pos, cut, crhmin); rep, pos, cut, crhmin);
} }
free(rep2); free(rep2);
free(cut2); free(cut2);
free(pos2); free(pos2);
@ -1053,7 +1212,7 @@ int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
} }
k = i - l + 1; k = i - l + 1;
l = k + (*cut)[i]; l = k + (*cut)[i];
(*cut)[j] = 0; (*cut)[j] = 0;
for (; k < l; k++) { for (; k < l; k++) {
if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++; if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++;
} }
@ -1073,7 +1232,7 @@ int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
} }
/* get the word with all possible hyphenations (output: hyphword) */ /* get the word with all possible hyphenations (output: hyphword) */
void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens, void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens,
char * hyphword, char *** rep, int ** pos, int ** cut) char * hyphword, char *** rep, int ** pos, int ** cut)
{ {
int hyphenslen = l + 5; int hyphenslen = l + 5;

View File

@ -10,7 +10,7 @@
* to use it in OpenOffice.org. * to use it in OpenOffice.org.
* *
* Non-standard and compound word hyphenation support by László Németh. * Non-standard and compound word hyphenation support by László Németh.
* *
* License is the original LibHnj license: * License is the original LibHnj license:
* *
* LibHnj is dual licensed under LGPL and MPL. Boilerplate for both * LibHnj is dual licensed under LGPL and MPL. Boilerplate for both
@ -31,8 +31,8 @@
* Library General Public License for more details. * Library General Public License for more details.
* *
* You should have received a copy of the GNU Library General Public * You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the * License along with this library; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307 USA. * Boston, MA 02111-1307 USA.
*/ */
@ -97,6 +97,7 @@ struct _HyphenTrans {
HyphenDict *hnj_hyphen_load (const char *fn); HyphenDict *hnj_hyphen_load (const char *fn);
HyphenDict *hnj_hyphen_load_file (FILE *f); HyphenDict *hnj_hyphen_load_file (FILE *f);
HyphenDict *hnj_hyphen_load_data (const char *fdata, size_t flen);
void hnj_hyphen_free (HyphenDict *dict); void hnj_hyphen_free (HyphenDict *dict);
/* obsolete, use hnj_hyphen_hyphenate2() or *hyphenate3() functions) */ /* obsolete, use hnj_hyphen_hyphenate2() or *hyphenate3() functions) */
@ -110,11 +111,11 @@ int hnj_hyphen_hyphenate (HyphenDict *dict,
(It supports Catalan, Dutch, German, Hungarian, Norwegian, Swedish (It supports Catalan, Dutch, German, Hungarian, Norwegian, Swedish
etc. orthography, see documentation.) etc. orthography, see documentation.)
input data: input data:
word: input word word: input word
word_size: byte length of the input word word_size: byte length of the input word
hyphens: allocated character buffer (size = word_size + 5) hyphens: allocated character buffer (size = word_size + 5)
hyphenated_word: allocated character buffer (size ~ word_size * 2) or NULL hyphenated_word: allocated character buffer (size ~ word_size * 2) or NULL
rep, pos, cut: pointers (point to the allocated and _zeroed_ buffers rep, pos, cut: pointers (point to the allocated and _zeroed_ buffers
@ -147,7 +148,7 @@ int hnj_hyphen_hyphenate (HyphenDict *dict,
int * cut = NULL; int * cut = NULL;
char hyphens[MAXWORDLEN]; char hyphens[MAXWORDLEN];
hnj_hyphen_hyphenate2(dict, "example", 7, hyphens, NULL, &rep, &pos, &cut); hnj_hyphen_hyphenate2(dict, "example", 7, hyphens, NULL, &rep, &pos, &cut);
See example in the source distribution. See example in the source distribution.
*/ */