diff --git a/hyphen.c b/hyphen.c index 4954dbd..ea61969 100644 --- a/hyphen.c +++ b/hyphen.c @@ -3,8 +3,8 @@ */ /* LibHnj - a library for high quality hyphenation and justification - * Copyright (C) 1998 Raph Levien, - * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), + * Copyright (C) 1998 Raph Levien, + * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), * (C) 2001 Peter Novodvorsky (nidd@cs.msu.su) * (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo) * @@ -19,8 +19,8 @@ * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public - * License along with this library; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Boston, MA 02111-1307 USA. */ @@ -281,7 +281,7 @@ void hnj_hyphen_load_line(char * buf, HyphenDict * dict, HashTab * hashtab) { } } return; - } + } j = 0; pattern[j] = '0'; repl = strchr(buf, '/'); @@ -296,7 +296,7 @@ void hnj_hyphen_load_line(char * buf, HyphenDict * dict, HashTab * hashtab) { if (index2) { *index2 = '\0'; replindex = (signed char) atoi(index + 1) - 1; - replcut = (signed char) atoi(index2 + 1); + replcut = (signed char) atoi(index2 + 1); } } else { hnj_strchomp(repl + 1); @@ -397,7 +397,7 @@ hnj_hyphen_load_file (FILE *f) HashEntry *e; int state_num = 0; // loading one or two dictionaries (separated by NEXTLEVEL keyword) -for (k = 0; k < 2; k++) { +for (k = 0; k < 2; k++) { hashtab = hnj_hash_new (); #ifdef VERBOSE global[k] = hashtab; @@ -480,7 +480,166 @@ for (k = 0; k < 2; k++) { for (e = hashtab->entries[i]; e; e = e->next) { if (*(e->key)) for (j = 1; 1; j++) - { + { + state_num = hnj_hash_lookup (hashtab, e->key + j); + if (state_num >= 0) + break; + } + /* KBH: FIXME state 0 fallback_state should always be -1? 
/*
 * fgets() work-alike that reads from an in-memory buffer instead of a FILE.
 *
 * Copies at most n - 1 bytes from *stream into s, stopping after the first
 * '\n' (which is kept, as fgets() does) or at the end of the data, and
 * always NUL-terminates s.  On success *stream is advanced past the bytes
 * consumed and *streamLen is reduced by the same amount.
 *
 *   s          destination buffer of at least n bytes
 *   n          size of s in bytes, including room for the terminator
 *   stream     in/out: current read position in the data
 *   streamLen  in/out: number of unread bytes remaining at *stream
 *
 * Returns s when at least one byte was read.  Returns NULL, leaving the
 * stream position untouched, when no data is left or when n < 2 leaves no
 * room to store a byte plus the terminator.
 */
static char *hnj_hyphen_load_data_fgets(char * s, int n, const char **stream, size_t *streamLen) {
  const char *ptr;
  size_t limit;
  size_t pos = 0;

  if (s == NULL || stream == NULL || *stream == NULL || streamLen == NULL || n < 2)
    return NULL;

  ptr = *stream;

  /* Never read past the remaining data nor past the space left in s
     once the terminator is accounted for. */
  limit = (size_t) (n - 1);
  if (limit > *streamLen)
    limit = *streamLen;

  while (pos < limit && ptr[pos] != '\n')
    ++pos;

  /* The scan stopped early, so ptr[pos] is the newline: keep it in the
     output.  This also makes a blank line come back as "\n" rather than
     being mistaken for end of data. */
  if (pos < limit)
    ++pos;

  if (pos == 0)
    return NULL; /* end of data */

  memcpy(s, ptr, pos);
  s[pos] = '\0'; /* callers treat s as a C string */

  *stream = ptr + pos;
  *streamLen -= pos;
  return s;
}
dict[k]->states = (HyphenState *) hnj_malloc (sizeof(HyphenState)); + dict[k]->states[0].match = NULL; + dict[k]->states[0].repl = NULL; + dict[k]->states[0].fallback_state = -1; + dict[k]->states[0].num_trans = 0; + dict[k]->states[0].trans = NULL; + dict[k]->nextlevel = NULL; + dict[k]->lhmin = 0; + dict[k]->rhmin = 0; + dict[k]->clhmin = 0; + dict[k]->crhmin = 0; + dict[k]->nohyphen = NULL; + dict[k]->nohyphenl = 0; + + /* read in character set info */ + if (k == 0) { + for (i=0;icset[i]= 0; + if (hnj_hyphen_load_data_fgets(dict[k]->cset, sizeof(dict[k]->cset), &fdata, &flen) != NULL) { + for (i=0;icset[i] == '\r') || (dict[k]->cset[i] == '\n')) + dict[k]->cset[i] = 0; + } else { + dict[k]->cset[0] = 0; + } + dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0); + } else { + strncpy(dict[k]->cset, dict[0]->cset, sizeof(dict[k]->cset)-1); + dict[k]->cset[sizeof(dict[k]->cset)-1] = '\0'; + dict[k]->utf8 = dict[0]->utf8; + } + + if (k == 0 || nextlevel) { + while (hnj_hyphen_load_data_fgets (buf, sizeof(buf), &fdata, &flen) != NULL) { + if (strncmp(buf, "NEXTLEVEL", 9) == 0) { + nextlevel = 1; + break; + } else if (buf[0] != '%') hnj_hyphen_load_line(buf, dict[k], hashtab); + } + } else if (k == 1) { + /* default first level: hyphen and ASCII apostrophe */ + if (!dict[0]->utf8) hnj_hyphen_load_line("NOHYPHEN ',-\n", dict[k], hashtab); + else hnj_hyphen_load_line("NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99,-\n", dict[k], hashtab); + strncpy(buf, "1-1\n", MAX_CHARS-1); // buf rewritten by hnj_hyphen_load here + buf[MAX_CHARS-1] = '\0'; + hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */ + hnj_hyphen_load_line("1'1\n", dict[k], hashtab); /* ASCII apostrophe */ + if (dict[0]->utf8) { + hnj_hyphen_load_line("1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */ + hnj_hyphen_load_line("1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */ + } + } + + /* Could do unioning of matches here (instead of the preprocessor script). 
+ If we did, the pseudocode would look something like this: + + foreach state in the hash table + foreach i = [1..length(state) - 1] + state to check is substr (state, i) + look it up + if found, and if there is a match, union the match in. + + It's also possible to avoid the quadratic blowup by doing the + search in order of increasing state string sizes - then you + can break the loop after finding the first match. + + This step should be optional in any case - if there is a + preprocessed rule table, it's always faster to use that. + +*/ + + /* put in the fallback states */ + for (i = 0; i < HASH_SIZE; i++) + for (e = hashtab->entries[i]; e; e = e->next) + { + if (*(e->key)) for (j = 1; 1; j++) + { state_num = hnj_hash_lookup (hashtab, e->key + j); if (state_num >= 0) break; @@ -597,7 +756,7 @@ int hnj_hyphen_hyphenate (HyphenDict *dict, /* KBH: FIXME shouldn't this be as follows? */ state = 0; goto try_next_letter; - } + } #ifdef VERBOSE char *state_str; @@ -669,8 +828,8 @@ int hnj_hyphen_hyphenate (HyphenDict *dict, hyphens[word_size] = '\0'; hnj_free (prep_word); - - return 0; + + return 0; } /* Unicode ligature length */ @@ -783,7 +942,7 @@ int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size, int offset; int * matchlen; int * matchindex; - char ** matchrepl; + char ** matchrepl; int isrepl = 0; int nHyphCount; @@ -795,7 +954,7 @@ int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size, j = 0; prep_word[j++] = '.'; - + for (i = 0; i < word_size; i++) { if (word[i] <= '9' && word[i] >= '0') { prep_word[j++] = '.'; @@ -810,7 +969,7 @@ int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size, prep_word[j] = '\0'; for (i = 0; i < j; i++) - hyphens[i] = '0'; + hyphens[i] = '0'; #ifdef VERBOSE printf ("prep_word = %s\n", prep_word); @@ -829,7 +988,7 @@ int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size, /* KBH: FIXME shouldn't this be as follows? 
*/ state = 0; goto try_next_letter; - } + } #ifdef VERBOSE char *state_str; @@ -892,7 +1051,7 @@ int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size, } } } - + } /* KBH: we need this to make sure we keep looking in a word */ @@ -926,7 +1085,7 @@ int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size, nHyphCount++; j = 0; for (i = 0; i < word_size; i++) { - if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) { + if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) { if (rep && pos && cut) { if (!*rep) *rep = (char **) calloc(word_size, sizeof(char *)); @@ -961,7 +1120,7 @@ int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size, cut2 = (int*) hnj_malloc (word_size * sizeof(int)); hyphens2 = (char*) hnj_malloc (word_size + 3); for (i = 0; i < word_size; i++) rep2[i] = NULL; - for (i = 0; i < word_size; i++) if + for (i = 0; i < word_size; i++) if (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) { if (i - begin > 0) { int hyph = 0; @@ -1009,7 +1168,7 @@ int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size, begin = i + 1; for (j = 0; j < word_size; j++) rep2[j] = NULL; } - + // non-compound if (begin == 0) { hnj_hyphen_hyph_(dict->nextlevel, word, word_size, @@ -1019,7 +1178,7 @@ int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size, if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens, rep, pos, cut, crhmin); } - + free(rep2); free(cut2); free(pos2); @@ -1053,7 +1212,7 @@ int hnj_hyphen_norm(const char *word, int word_size, char * hyphens, } k = i - l + 1; l = k + (*cut)[i]; - (*cut)[j] = 0; + (*cut)[j] = 0; for (; k < l; k++) { if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++; } @@ -1073,7 +1232,7 @@ int hnj_hyphen_norm(const char *word, int word_size, char * hyphens, } /* get the word with all possible hyphenations (output: hyphword) */ -void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens, +void 
hnj_hyphen_hyphword(const char * word, int l, const char * hyphens, char * hyphword, char *** rep, int ** pos, int ** cut) { int hyphenslen = l + 5; diff --git a/hyphen.h b/hyphen.h index 2b4e146..1243da2 100644 --- a/hyphen.h +++ b/hyphen.h @@ -10,7 +10,7 @@ * to use it in OpenOffice.org. * * Non-standard and compound word hyphenation support by László Németh. - * + * * License is the original LibHnj license: * * LibHnj is dual licensed under LGPL and MPL. Boilerplate for both @@ -31,8 +31,8 @@ * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public - * License along with this library; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Boston, MA 02111-1307 USA. */ @@ -97,6 +97,7 @@ struct _HyphenTrans { HyphenDict *hnj_hyphen_load (const char *fn); HyphenDict *hnj_hyphen_load_file (FILE *f); +HyphenDict *hnj_hyphen_load_data (const char *fdata, size_t flen); void hnj_hyphen_free (HyphenDict *dict); /* obsolete, use hnj_hyphen_hyphenate2() or *hyphenate3() functions) */ @@ -110,11 +111,11 @@ int hnj_hyphen_hyphenate (HyphenDict *dict, (It supports Catalan, Dutch, German, Hungarian, Norwegian, Swedish etc. orthography, see documentation.) - + input data: word: input word word_size: byte length of the input word - + hyphens: allocated character buffer (size = word_size + 5) hyphenated_word: allocated character buffer (size ~ word_size * 2) or NULL rep, pos, cut: pointers (point to the allocated and _zeroed_ buffers @@ -147,7 +148,7 @@ int hnj_hyphen_hyphenate (HyphenDict *dict, int * cut = NULL; char hyphens[MAXWORDLEN]; hnj_hyphen_hyphenate2(dict, "example", 7, hyphens, NULL, &rep, &pos, &cut); - + See example in the source distribution. */