NOHYPHEN feature, see README.compound

This commit is contained in:
László Németh 2010-11-27 02:20:33 +00:00
parent 06bfd5b5b3
commit f86ce87baa
21 changed files with 1279 additions and 1575 deletions

View File

@ -1,3 +1,13 @@
2010-11-27 László Németh <nemeth at OOo>:
* hyphen.c: add NOHYPHEN feature
to handle special hyphenation at hyphens and apostrophes,
see README.compound
* tests/{rhmin, hyphen}: new test files
* ooopatch.sed, hyph_en_US.dic: replace and improve old hack
with NOHYPHEN
2010-07-18 Caolán McNamara <cmc at OOo>:
* remove csutil.* as more trouble than it's
worth for just the testsuite

View File

@ -257,6 +257,7 @@ libdir = @libdir@
libexecdir = @libexecdir@
localedir = @localedir@
localstatedir = @localstatedir@
lt_ECHO = @lt_ECHO@
mandir = @mandir@
mkdir_p = @mkdir_p@
oldincludedir = @oldincludedir@

8
NEWS
View File

@ -1,3 +1,11 @@
2010-11-27 Hyphen 2.7 release:
- The new hyphenation problem of OpenOffice.org 3.2, related to its
modified word breaking of words with hyphen characters, can be fixed
with the new NOHYPHEN feature. Also it's possible to solve the similar old
problem with apostrophes. More information: README.compound.
- improved English dictionaries
2010-08-10 Hyphen 2.6 release:
- maintenance release, fix all warnings, tidy up
make check with VALGRIND=memcheck, etc.

View File

@ -1,3 +1,23 @@
New option of Libhyphen 2.7: NOHYPHEN
Hyphen, apostrophe and other characters may be word boundary characters,
but they don't need (extra) hyphenation. With the NOHYPHEN option
it's possible to hyphenate the word parts correctly.
Example:
ISO8859-1
NOHYPHEN -,'
1-1
1'1
NEXTLEVEL
Description:
1-1 and 1'1 declare hyphen and apostrophe as word boundary characters
and NOHYPHEN with the comma-separated character (or character sequence)
list forbids the (extra) hyphens at the hyphen and apostrophe characters.
Compound word hyphenation
Hyphen library supports better compound word hyphenation and special

1059
aclocal.m4 vendored

File diff suppressed because it is too large Load Diff

1414
configure vendored

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
AC_INIT([hyphen],[2.6],[nemeth@openoffice.org])
AM_INIT_AUTOMAKE(hyphen,2.6)
AC_INIT([hyphen],[2.7],[nemeth@openoffice.org])
AM_INIT_AUTOMAKE(hyphen,2.7)
AC_PROG_CC
AC_PROG_INSTALL
AC_PROG_LIBTOOL

View File

@ -3,119 +3,11 @@ LEFTHYPHENMIN 2
RIGHTHYPHENMIN 3
COMPOUNDLEFTHYPHENMIN 2
COMPOUNDRIGHTHYPHENMIN 3
1'.
1's./'=s,1,2
1't./'=t,1,2
1.
1s./=s,1,2
1t./=t,1,2
NOHYPHEN -,',
1-1
1'1
11
NEXTLEVEL
4'4
4a4'4
4b4'4
4c4'4
4d4'4
4e4'4
4f4'4
4g4'4
4h4'4
4i4'4
4j4'4
4k4'4
4l4'4
4m4'4
4n4'4
4o4'4
4p4'4
4q4'4
4r4'4
4s4'4
4t4'4
4u4'4
4v4'4
4w4'4
4x4'4
4y4'4
4z4'4
'a4
'b4
'c4
'd4
'e4
'f4
'g4
'h4
'i4
'j4
'k4
'l4
'm4
'n4
'o4
'p4
'q4
'r4
's4
't4
'u4
'v4
'w4
'x4
'y4
'z4
44
4a44
4b44
4c44
4d44
4e44
4f44
4g44
4h44
4i44
4j44
4k44
4l44
4m44
4n44
4o44
4p44
4q44
4r44
4s44
4t44
4u44
4v44
4w44
4x44
4y44
4z44
a4
b4
c4
d4
e4
f4
g4
h4
i4
j4
k4
l4
m4
n4
o4
p4
q4
r4
s4
t4
u4
v4
w4
x4
y4
z4
.a2ch4
.ad4der
.a2d

View File

@ -285,6 +285,8 @@ for (k = 0; k == 0 || (k == 1 && nextlevel); k++) {
dict[k]->rhmin = 0;
dict[k]->clhmin = 0;
dict[k]->crhmin = 0;
dict[k]->nohyphen = NULL;
dict[k]->nohyphenl = 0;
/* read in character set info */
if (k == 0) {
@ -321,6 +323,21 @@ for (k = 0; k == 0 || (k == 1 && nextlevel); k++) {
} else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
dict[k]->crhmin = atoi(buf + 22);
continue;
} else if (strncmp(buf, "NOHYPHEN", 8) == 0) {
/* NOHYPHEN directive: the rest of the line is a comma separated list of
   characters or character sequences. The list is copied and split in
   place into NUL-separated entries; nohyphenl counts the separators, so
   the list holds nohyphenl + 1 entries. */
char * space = buf + 8;
while (*space == ' ' || *space == '\t') space++;
/* test the argument itself: buf always starts with "NOHYPHEN" in this
   branch, so checking *buf could never detect a missing list */
if (*space != '\0') dict[k]->nohyphen = hnj_strdup(space);
if (dict[k]->nohyphen) {
  char * nhs = dict[k]->nohyphen;
  size_t nhlen = strlen(nhs);
  size_t nhp;
  /* strip only the line terminator, so a final line without a newline
     does not lose its last real character */
  while (nhlen > 0 && (nhs[nhlen - 1] == '\n' || nhs[nhlen - 1] == '\r')) {
    nhs[--nhlen] = '\0';
  }
  if (nhlen == 0) {
    /* nothing left after trimming: behave as if no list was given */
    hnj_free(nhs);
    dict[k]->nohyphen = NULL;
  } else {
    /* split at commas from the end; index 0 is never treated as a
       separator, so the first entry always starts at the list head */
    for (nhp = nhlen - 1; nhp > 0; nhp--) {
      if (nhs[nhp] == ',') {
        dict[k]->nohyphenl++;
        nhs[nhp] = '\0';
      }
    }
  }
}
continue;
}
j = 0;
pattern[j] = '0';
@ -483,6 +500,8 @@ void hnj_hyphen_free (HyphenDict *dict)
}
if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);
if (dict->nohyphen) hnj_free(dict->nohyphen);
hnj_free (dict->states);
hnj_free (dict);
@ -1050,6 +1069,22 @@ int hnj_hyphen_hyphenate2 (HyphenDict *dict,
hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));
hnj_hyphen_rhmin(dict->utf8, word, word_size,
hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));
/* nohyphen: for every occurrence of each NOHYPHEN entry in the word,
   clear the hyphenation mark just before the occurrence and the one at
   its last character. dict->nohyphen holds dict->nohyphenl + 1
   NUL-separated entries. */
if (dict->nohyphen) {
  char * nh = dict->nohyphen;
  int nhi;
  for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
    size_t nhlen = strlen(nh);
    /* skip empty entries (e.g. produced by a trailing comma): strstr()
       matches an empty needle at every position, "nhlen - 1" would index
       before hyphens[0], and the scan would run past the end of word */
    if (nhlen > 0) {
      char * nhy = (char *) strstr(word, nh);
      while (nhy) {
        hyphens[nhy - word + nhlen - 1] = 0;
        /* a match at the very start of the word has no preceding mark */
        if (nhy > word) hyphens[nhy - word - 1] = 0;
        nhy = (char *) strstr(nhy + 1, nh);
      }
    }
    nh = nh + nhlen + 1;
  }
}
if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
return 0;
@ -1070,6 +1105,22 @@ int hnj_hyphen_hyphenate3 (HyphenDict *dict,
hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
rep, pos, cut, (rhmin > 0 ? rhmin : 2));
if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
/* nohyphen: for every occurrence of each NOHYPHEN entry in the word,
   clear the hyphenation mark just before the occurrence and the one at
   its last character. dict->nohyphen holds dict->nohyphenl + 1
   NUL-separated entries.
   NOTE(review): in this function the hyphword string is built before
   this filter runs, whereas hnj_hyphen_hyphenate2 applies the filter
   first - confirm that the difference is intended. */
if (dict->nohyphen) {
  char * nh = dict->nohyphen;
  int nhi;
  for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
    size_t nhlen = strlen(nh);
    /* skip empty entries (e.g. produced by a trailing comma): strstr()
       matches an empty needle at every position, "nhlen - 1" would index
       before hyphens[0], and the scan would run past the end of word */
    if (nhlen > 0) {
      char * nhy = (char *) strstr(word, nh);
      while (nhy) {
        hyphens[nhy - word + nhlen - 1] = 0;
        /* a match at the very start of the word has no preceding mark */
        if (nhy > word) hyphens[nhy - word - 1] = 0;
        nhy = (char *) strstr(nhy + 1, nh);
      }
    }
    nh = nh + nhlen + 1;
  }
}
if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
return 0;
}

View File

@ -67,6 +67,9 @@ struct _HyphenDict {
char rhmin; /* righthyphenmin: min. hyph. distance from the right side */
char clhmin; /* min. hyph. distance from the left compound boundary */
char crhmin; /* min. hyph. distance from the right compound boundary */
char * nohyphen; /* comma separated list of characters or character
sequences with forbidden hyphenation */
int nohyphenl; /* count of elements in nohyphen */
/* system variables */
int num_states;
char cset[MAX_NAME];

View File

@ -2,116 +2,8 @@
s/\(RIGHTHYPHENMIN.*\)/\1\
COMPOUNDLEFTHYPHENMIN 2\
COMPOUNDRIGHTHYPHENMIN 3\
1'.\
1's.\/'=s,1,2\
1't.\/'=t,1,2\
1.\
1s.\/=s,1,2\
1t.\/=t,1,2\
NEXTLEVEL\
4'4\
4a4'4\
4b4'4\
4c4'4\
4d4'4\
4e4'4\
4f4'4\
4g4'4\
4h4'4\
4i4'4\
4j4'4\
4k4'4\
4l4'4\
4m4'4\
4n4'4\
4o4'4\
4p4'4\
4q4'4\
4r4'4\
4s4'4\
4t4'4\
4u4'4\
4v4'4\
4w4'4\
4x4'4\
4y4'4\
4z4'4\
'a4\
'b4\
'c4\
'd4\
'e4\
'f4\
'g4\
'h4\
'i4\
'j4\
'k4\
'l4\
'm4\
'n4\
'o4\
'p4\
'q4\
'r4\
's4\
't4\
'u4\
'v4\
'w4\
'x4\
'y4\
'z4\
44\
4a44\
4b44\
4c44\
4d44\
4e44\
4f44\
4g44\
4h44\
4i44\
4j44\
4k44\
4l44\
4m44\
4n44\
4o44\
4p44\
4q44\
4r44\
4s44\
4t44\
4u44\
4v44\
4w44\
4x44\
4y44\
4z44\
a4\
b4\
c4\
d4\
e4\
f4\
g4\
h4\
i4\
j4\
k4\
l4\
m4\
n4\
o4\
p4\
q4\
r4\
s4\
t4\
u4\
v4\
w4\
x4\
y4\
z4/
NOHYPHEN -,',\
1-1\
1'1\
11\
NEXTLEVEL/

View File

@ -19,7 +19,9 @@ settings2.test \
settings3.test \
settings4.test \
lhmin.test \
lig.test
rhmin.test \
lig.test \
hyphen.test
distclean-local:
-rm -rf testSubDir
@ -107,7 +109,15 @@ lhmin.hyph \
lhmin.pat \
lhmin.word \
lhmin.test \
rhmin.hyph \
rhmin.pat \
rhmin.word \
rhmin.test \
lig.hyph \
lig.pat \
lig.test \
lig.word
lig.word \
hyphen.hyph \
hyphen.pat \
hyphen.test \
hyphen.word

View File

@ -178,7 +178,9 @@ settings2.test \
settings3.test \
settings4.test \
lhmin.test \
lig.test
rhmin.test \
lig.test \
hyphen.test
EXTRA_DIST = \
test.sh \
@ -263,10 +265,18 @@ lhmin.hyph \
lhmin.pat \
lhmin.word \
lhmin.test \
rhmin.hyph \
rhmin.pat \
rhmin.word \
rhmin.test \
lig.hyph \
lig.pat \
lig.test \
lig.word
lig.word \
hyphen.hyph \
hyphen.pat \
hyphen.test \
hyphen.word
all: all-am

1
tests/hyphen.hyph Normal file
View File

@ -0,0 +1 @@
foobar'foobar-foobarfoobar

6
tests/hyphen.pat Normal file
View File

@ -0,0 +1,6 @@
UTF-8
NOHYPHEN -,',
1-1
1'1
11
NEXTLEVEL

4
tests/hyphen.test Executable file
View File

@ -0,0 +1,4 @@
#!/bin/sh
# Run the shared test driver on the pattern/word/expected-output files
# that carry the same base name as this script (hyphen.pat, hyphen.word,
# hyphen.hyph). Expansions are quoted so that a script path containing
# spaces or glob characters is passed through intact.
DIR=$(dirname "$0")
NAME=$(basename "$0" .test)
"$DIR/test.sh" "$NAME.pat" "$NAME.word" "$NAME.hyph"

1
tests/hyphen.word Normal file
View File

@ -0,0 +1 @@
foobar'foobar-foobarfoobar

1
tests/rhmin.hyph Normal file
View File

@ -0,0 +1 @@
övéit

4
tests/rhmin.pat Normal file
View File

@ -0,0 +1,4 @@
UTF-8
RIGHTHYPHENMIN 3
% test patterns for righthyphenmin fix for UTF-8 patterns
övé1it

4
tests/rhmin.test Executable file
View File

@ -0,0 +1,4 @@
#!/bin/sh
# Run the shared test driver on the pattern/word/expected-output files
# that carry the same base name as this script (rhmin.pat, rhmin.word,
# rhmin.hyph). Expansions are quoted so that a script path containing
# spaces or glob characters is passed through intact.
DIR=$(dirname "$0")
NAME=$(basename "$0" .test)
"$DIR/test.sh" "$NAME.pat" "$NAME.word" "$NAME.hyph"

1
tests/rhmin.word Normal file
View File

@ -0,0 +1 @@
övéit