[hb-old] Shovel out the line-breaking / word-segmentation stuff

This commit is contained in:
Behdad Esfahbod 2012-07-24 19:49:48 -04:00
parent 0bcbe88cf3
commit 4a31166b28
12 changed files with 33 additions and 750 deletions

View File

@ -18,10 +18,7 @@ MAINSOURCES = \
harfbuzz-hebrew.c \ harfbuzz-hebrew.c \
harfbuzz-arabic.c \ harfbuzz-arabic.c \
harfbuzz-hangul.c \ harfbuzz-hangul.c \
harfbuzz-myanmar.c \ harfbuzz-myanmar.c
harfbuzz-thai.c
EXTRA_SOURCES = harfbuzz.c
PUBLICHEADERS = \ PUBLICHEADERS = \
harfbuzz.h \ harfbuzz.h \
@ -50,7 +47,4 @@ libhb_old_la_SOURCES = \
$(PUBLICHEADERS) \ $(PUBLICHEADERS) \
$(PRIVATEHEADERS) $(PRIVATEHEADERS)
EXTRA_DIST = \ EXTRA_DIST = README COPYING
README \
COPYING \
$(EXTRA_SOURCES)

View File

@ -52,7 +52,7 @@ am__objects_1 = harfbuzz-buffer.lo harfbuzz-stream.lo harfbuzz-gdef.lo \
harfbuzz-open.lo harfbuzz-shaper.lo harfbuzz-greek.lo \ harfbuzz-open.lo harfbuzz-shaper.lo harfbuzz-greek.lo \
harfbuzz-tibetan.lo harfbuzz-khmer.lo harfbuzz-indic.lo \ harfbuzz-tibetan.lo harfbuzz-khmer.lo harfbuzz-indic.lo \
harfbuzz-hebrew.lo harfbuzz-arabic.lo harfbuzz-hangul.lo \ harfbuzz-hebrew.lo harfbuzz-arabic.lo harfbuzz-hangul.lo \
harfbuzz-myanmar.lo harfbuzz-thai.lo harfbuzz-myanmar.lo
am__objects_2 = am__objects_2 =
am_libhb_old_la_OBJECTS = $(am__objects_1) $(am__objects_2) \ am_libhb_old_la_OBJECTS = $(am__objects_1) $(am__objects_2) \
$(am__objects_2) $(am__objects_2)
@ -268,10 +268,8 @@ MAINSOURCES = \
harfbuzz-hebrew.c \ harfbuzz-hebrew.c \
harfbuzz-arabic.c \ harfbuzz-arabic.c \
harfbuzz-hangul.c \ harfbuzz-hangul.c \
harfbuzz-myanmar.c \ harfbuzz-myanmar.c
harfbuzz-thai.c
EXTRA_SOURCES = harfbuzz.c
PUBLICHEADERS = \ PUBLICHEADERS = \
harfbuzz.h \ harfbuzz.h \
harfbuzz-buffer.h \ harfbuzz-buffer.h \
@ -299,11 +297,7 @@ libhb_old_la_SOURCES = \
$(PUBLICHEADERS) \ $(PUBLICHEADERS) \
$(PRIVATEHEADERS) $(PRIVATEHEADERS)
EXTRA_DIST = \ EXTRA_DIST = README COPYING
README \
COPYING \
$(EXTRA_SOURCES)
all: all-am all: all-am
.SUFFIXES: .SUFFIXES:
@ -371,7 +365,6 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/harfbuzz-open.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/harfbuzz-open.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/harfbuzz-shaper.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/harfbuzz-shaper.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/harfbuzz-stream.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/harfbuzz-stream.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/harfbuzz-thai.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/harfbuzz-tibetan.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/harfbuzz-tibetan.Plo@am__quote@
.c.o: .c.o:

View File

@ -35,21 +35,6 @@ HB_BEGIN_HEADER
*/ */
/*
Line-breaking classes; see http://www.unicode.org/reports/tr14/tr14-19.html
We don't use the XX, AI and CB properties and map them to AL instead.
As we don't support any EBCDIC-based OSes, NL is ignored and mapped to AL as well.

The enumerator order matters: breakTable is dimensioned by HB_LineBreak_JT + 1,
and calcLineBreaks() performs ordered comparisons against HB_LineBreak_SA,
HB_LineBreak_SP, HB_LineBreak_CR and HB_LineBreak_LF.
*/
typedef enum {
HB_LineBreak_OP, HB_LineBreak_CL, HB_LineBreak_QU, HB_LineBreak_GL, HB_LineBreak_NS,
HB_LineBreak_EX, HB_LineBreak_SY, HB_LineBreak_IS, HB_LineBreak_PR, HB_LineBreak_PO,
HB_LineBreak_NU, HB_LineBreak_AL, HB_LineBreak_ID, HB_LineBreak_IN, HB_LineBreak_HY,
HB_LineBreak_BA, HB_LineBreak_BB, HB_LineBreak_B2, HB_LineBreak_ZW, HB_LineBreak_CM,
HB_LineBreak_WJ, HB_LineBreak_H2, HB_LineBreak_H3, HB_LineBreak_JL, HB_LineBreak_JV,
HB_LineBreak_JT, HB_LineBreak_SA, HB_LineBreak_SG,
HB_LineBreak_SP, HB_LineBreak_CR, HB_LineBreak_LF, HB_LineBreak_BK
} HB_LineBreakClass;
typedef enum typedef enum
{ {
HB_Mark_NonSpacing, /* Mn */ HB_Mark_NonSpacing, /* Mn */
@ -90,62 +75,11 @@ typedef enum
HB_Symbol_Other /* So */ HB_Symbol_Other /* So */
} HB_CharCategory; } HB_CharCategory;
/*
Grapheme classes as reported by HB_GetGraphemeClass() and
HB_GetGraphemeAndLineBreakClass().
HB_Grapheme_LVT must remain the last enumerator: graphemeTable is
dimensioned by HB_Grapheme_LVT + 1 and indexed with these values.
*/
typedef enum
{
HB_Grapheme_Other,
HB_Grapheme_CR,
HB_Grapheme_LF,
HB_Grapheme_Control,
HB_Grapheme_Extend,
HB_Grapheme_L,
HB_Grapheme_V,
HB_Grapheme_T,
HB_Grapheme_LV,
HB_Grapheme_LVT
} HB_GraphemeClass;
/*
Word classes as reported by HB_GetWordClass().
HB_Word_ExtendNumLet must remain the last enumerator: wordbreakTable is
dimensioned by HB_Word_ExtendNumLet + 1, and its rows and columns are laid
out in this enumerator order.
*/
typedef enum
{
HB_Word_Other,
HB_Word_Format,
HB_Word_Katakana,
HB_Word_ALetter,
HB_Word_MidLetter,
HB_Word_MidNum,
HB_Word_Numeric,
HB_Word_ExtendNumLet
} HB_WordClass;
/*
Sentence classes as reported by HB_GetSentenceClass().
The values are used as the column index of sentenceBreakTable, which is
dimensioned by HB_Sentence_Close + 1, so HB_Sentence_Close must remain the
last enumerator.
*/
typedef enum
{
HB_Sentence_Other,
HB_Sentence_Sep,
HB_Sentence_Format,
HB_Sentence_Sp,
HB_Sentence_Lower,
HB_Sentence_Upper,
HB_Sentence_OLetter,
HB_Sentence_Numeric,
HB_Sentence_ATerm,
HB_Sentence_STerm,
HB_Sentence_Close
} HB_SentenceClass;
/*
Per-code-point classification lookups used by the segmentation code.
Only the declarations live in this header.
NOTE(review): the definitions are presumably supplied by the embedding
application -- confirm where they are implemented.
*/
HB_GraphemeClass HB_GetGraphemeClass(HB_UChar32 ch);
HB_WordClass HB_GetWordClass(HB_UChar32 ch);
HB_SentenceClass HB_GetSentenceClass(HB_UChar32 ch);
HB_LineBreakClass HB_GetLineBreakClass(HB_UChar32 ch);
/* Combined lookup: writes both classes for ch through the two out-pointers. */
void HB_GetGraphemeAndLineBreakClass(HB_UChar32 ch, HB_GraphemeClass *grapheme, HB_LineBreakClass *lineBreak);
void HB_GetUnicodeCharProperties(HB_UChar32 ch, HB_CharCategory *category, int *combiningClass); void HB_GetUnicodeCharProperties(HB_UChar32 ch, HB_CharCategory *category, int *combiningClass);
HB_CharCategory HB_GetUnicodeCharCategory(HB_UChar32 ch); HB_CharCategory HB_GetUnicodeCharCategory(HB_UChar32 ch);
int HB_GetUnicodeCharCombiningClass(HB_UChar32 ch); int HB_GetUnicodeCharCombiningClass(HB_UChar32 ch);
HB_UChar16 HB_GetMirroredChar(HB_UChar16 ch); HB_UChar16 HB_GetMirroredChar(HB_UChar16 ch);
/*
Resolves `symbol` from the named library. The Thai word breaker calls it as
HB_Library_Resolve("thai", 0, "th_brk") and treats a null result as
"library not available".
NOTE(review): the exact meaning of `version` is not visible here -- confirm.
*/
void *HB_Library_Resolve(const char *library, int version, const char *symbol);
HB_END_HEADER HB_END_HEADER
#endif #endif

View File

@ -1866,29 +1866,3 @@ HB_Bool HB_IndicShape(HB_ShaperItem *item)
item->num_glyphs = first_glyph; item->num_glyphs = first_glyph;
return true; return true;
} }
/*
 * Marks character stops for the Indic run text[from .. from+len).
 *
 * `attributes` is the attribute array of the whole string; the entries of the
 * run start at attributes[from]. The first unit of every syllable reported by
 * indic_nextSyllableBoundary() gets charStop = true, every other unit of that
 * syllable gets charStop = false. `script` is forwarded to the syllable scanner.
 */
void HB_IndicAttributes(HB_Script script, const HB_UChar16 *text, hb_uint32 from, hb_uint32 len, HB_CharAttributes *attributes)
{
    const int end = from + len;
    HB_CharAttributes *run = attributes + from;
    hb_uint32 i = 0;

    while (i < len) {
        bool invalid;
        hb_uint32 boundary = indic_nextSyllableBoundary(script, text, from + i, end, &invalid) - from;

        run[i].charStop = true;

        /* Clamp a boundary that reaches the last unit (or beyond) to the end of the run. */
        if (boundary > len - 1)
            boundary = len;

        /* Everything up to the next boundary belongs to the same syllable. */
        for (++i; i < boundary; ++i)
            run[i].charStop = false;

        assert(i == boundary);
    }
}

View File

@ -640,28 +640,3 @@ HB_Bool HB_KhmerShape(HB_ShaperItem *item)
item->num_glyphs = first_glyph; item->num_glyphs = first_glyph;
return TRUE; return TRUE;
} }
/*
 * Marks character stops for the Khmer run text[from .. from+len).
 *
 * `attributes` is the attribute array of the whole string; the entries of the
 * run start at attributes[from]. The first unit of every syllable reported by
 * khmer_nextSyllableBoundary() gets charStop = TRUE, every other unit of that
 * syllable gets charStop = FALSE. `script` is not used.
 */
void HB_KhmerAttributes(HB_Script script, const HB_UChar16 *text, hb_uint32 from, hb_uint32 len, HB_CharAttributes *attributes)
{
    int end = from + len;
    hb_uint32 i = 0;
    HB_UNUSED(script);

    attributes += from;
    while (i < len) {
        HB_Bool invalid;
        hb_uint32 boundary = khmer_nextSyllableBoundary(text, from + i, end, &invalid) - from;

        attributes[i].charStop = TRUE;

        /* Clamp a boundary that reaches the last unit (or beyond) to the end of the run. */
        if (boundary > len - 1)
            boundary = len;

        /* Everything up to the next boundary belongs to the same syllable. */
        for (++i; i < boundary; ++i)
            attributes[i].charStop = FALSE;

        assert(i == boundary);
    }
}

View File

@ -509,31 +509,3 @@ HB_Bool HB_MyanmarShape(HB_ShaperItem *item)
item->num_glyphs = first_glyph; item->num_glyphs = first_glyph;
return TRUE; return TRUE;
} }
/*
 * Marks character stops and syllable line breaks for the Myanmar run
 * text[from .. from+len).
 *
 * `attributes` is the attribute array of the whole string; the entries of the
 * run start at attributes[from]. The first unit of every syllable reported by
 * myanmar_nextSyllableBoundary() gets charStop = TRUE and the unit in front of
 * it (if any, within the run) gets lineBreakType = HB_Break; the remaining
 * units of the syllable get charStop = FALSE. `script` is not used.
 */
void HB_MyanmarAttributes(HB_Script script, const HB_UChar16 *text, hb_uint32 from, hb_uint32 len, HB_CharAttributes *attributes)
{
    int end = from + len;
    hb_uint32 i = 0;
    HB_UNUSED(script);

    attributes += from;
    while (i < len) {
        HB_Bool invalid;
        hb_uint32 boundary = myanmar_nextSyllableBoundary(text, from + i, end, &invalid) - from;

        attributes[i].charStop = TRUE;
        /* Allow a line break between the previous syllable and this one. */
        if (i)
            attributes[i - 1].lineBreakType = HB_Break;

        /* Clamp a boundary that reaches the last unit (or beyond) to the end of the run. */
        if (boundary > len - 1)
            boundary = len;

        /* Everything up to the next boundary belongs to the same syllable. */
        for (++i; i < boundary; ++i)
            attributes[i].charStop = FALSE;

        assert(i == boundary);
    }
}

View File

@ -93,11 +93,9 @@ typedef enum {
/* return true if ok. */ /* return true if ok. */
typedef HB_Bool (*HB_ShapeFunction)(HB_ShaperItem *shaper_item); typedef HB_Bool (*HB_ShapeFunction)(HB_ShaperItem *shaper_item);
/*
Per-script hook that refines the character attributes of one script run.
`string` and `attributes` refer to the whole text; the run to process is
[from, from + len). The implementations in this library offset `attributes`
by `from` themselves.
*/
typedef void (*HB_AttributeFunction)(HB_Script script, const HB_UChar16 *string, hb_uint32 from, hb_uint32 len, HB_CharAttributes *attributes);
typedef struct { typedef struct {
HB_ShapeFunction shape; HB_ShapeFunction shape;
HB_AttributeFunction charAttributes;
} HB_ScriptEngine; } HB_ScriptEngine;
extern const HB_ScriptEngine hb_scriptEngines[]; extern const HB_ScriptEngine hb_scriptEngines[];
@ -112,16 +110,6 @@ extern HB_Bool HB_MyanmarShape(HB_ShaperItem *shaper_item);
extern HB_Bool HB_KhmerShape(HB_ShaperItem *shaper_item); extern HB_Bool HB_KhmerShape(HB_ShaperItem *shaper_item);
extern HB_Bool HB_IndicShape(HB_ShaperItem *shaper_item); extern HB_Bool HB_IndicShape(HB_ShaperItem *shaper_item);
/*
Script-specific attribute hooks; all share the HB_AttributeFunction signature
(whole-string `string`/`attributes`, run given by `from` and `len`).
*/
extern void HB_TibetanAttributes(HB_Script script, const HB_UChar16 *string, hb_uint32 from, hb_uint32 len, HB_CharAttributes *attributes);
extern void HB_MyanmarAttributes(HB_Script script, const HB_UChar16 *string, hb_uint32 from, hb_uint32 len, HB_CharAttributes *attributes);
extern void HB_KhmerAttributes(HB_Script script, const HB_UChar16 *string, hb_uint32 from, hb_uint32 len, HB_CharAttributes *attributes);
extern void HB_IndicAttributes(HB_Script script, const HB_UChar16 *string, hb_uint32 from, hb_uint32 len, HB_CharAttributes *attributes);
extern void HB_ThaiAttributes(HB_Script script, const HB_UChar16 *string, hb_uint32 from, hb_uint32 len, HB_CharAttributes *attributes);
typedef struct { typedef struct {
hb_uint32 tag; hb_uint32 tag;
hb_uint32 property; hb_uint32 property;

View File

@ -32,205 +32,6 @@
#define HB_MIN(a, b) ((a) < (b) ? (a) : (b)) #define HB_MIN(a, b) ((a) < (b) ? (a) : (b))
#define HB_MAX(a, b) ((a) > (b) ? (a) : (b)) #define HB_MAX(a, b) ((a) > (b) ? (a) : (b))
// -----------------------------------------------------------------------------------------------------
//
// The line break algorithm. See http://www.unicode.org/reports/tr14/tr14-13.html
//
// -----------------------------------------------------------------------------------------------------
/* In our opinion the Unicode algorithm allows line breaks in some
places where they should not be allowed. The following changes were
therefore made relative to the Unicode reference table:
EX->AL from DB to IB
SY->AL from DB to IB
SY->PO from DB to IB
SY->PR from DB to IB
SY->OP from DB to IB
AL->PR from DB to IB
AL->PO from DB to IB
PR->PR from DB to IB
PO->PO from DB to IB
PR->PO from DB to IB
PO->PR from DB to IB
HY->PO from DB to IB
HY->PR from DB to IB
HY->OP from DB to IB
NU->EX from PB to IB
EX->PO from DB to IB
*/
// The following line break classes are not treated by the table:
// AI, BK, CB, CR, LF, NL, SA, SG, SP, XX
// Pair actions stored in breakTable and interpreted by the switch in calcLineBreaks().
enum break_class {
// the first 4 values have to agree with the enum in QCharAttributes
ProhibitedBreak, // PB in table
DirectBreak, // DB in table
IndirectBreak, // IB in table
CombiningIndirectBreak, // CI in table
CombiningProhibitedBreak // CP in table
};
// Two-letter aliases that keep the breakTable initialiser readable; they are
// #undef'ed again right after the table.
#define DB DirectBreak
#define IB IndirectBreak
#define CI CombiningIndirectBreak
#define CP CombiningProhibitedBreak
#define PB ProhibitedBreak
// Line-break pair table: breakTable[before][after] yields the break_class action
// for the boundary between two classes. Rows are the class kept in `cls` by
// calcLineBreaks(), columns the class of the current character. Only classes
// up to HB_LineBreak_JT have entries; calcLineBreaks() remaps HB_LineBreak_SA
// and above to HB_LineBreak_AL before indexing.
static const hb_uint8 breakTable[HB_LineBreak_JT+1][HB_LineBreak_JT+1] =
{
/* OP CL QU GL NS EX SY IS PR PO NU AL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT */
/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB },
/* CL */ { DB, PB, IB, IB, PB, PB, PB, PB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* QU */ { PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
/* GL */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
/* NS */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* EX */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* SY */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* IS */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* PR */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB },
/* PO */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* NU */ { IB, PB, IB, IB, IB, IB, PB, PB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* AL */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* ID */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* IN */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* HY */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* BA */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* BB */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
/* B2 */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB },
/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB },
/* CM */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* WJ */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
/* H2 */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB },
/* H3 */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB },
/* JL */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB },
/* JV */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB },
/* JT */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB }
};
// Drop the table-only aliases so they cannot leak into the rest of the file.
#undef DB
#undef IB
#undef CI
#undef CP
#undef PB
// Character-stop table: graphemeTable[current][previous] is true when a character
// stop is allowed in front of the current unit, given the grapheme class of the
// unit before it. calcLineBreaks() indexes it as graphemeTable[ngrapheme][grapheme]
// and stores the result in charStop. Row labels are on the right, column labels
// in the first comment line.
static const hb_uint8 graphemeTable[HB_Grapheme_LVT + 1][HB_Grapheme_LVT + 1] =
{
// Other, CR, LF, Control,Extend,L, V, T, LV, LVT
{ true , true , true , true , true , true , true , true , true , true }, // Other,
{ true , true , true , true , true , true , true , true , true , true }, // CR,
{ true , false, true , true , true , true , true , true , true , true }, // LF,
{ true , true , true , true , true , true , true , true , true , true }, // Control,
{ false, true , true , true , false, false, false, false, false, false }, // Extend,
{ true , true , true , true , true , false, true , true , true , true }, // L,
{ true , true , true , true , true , false, false, true , false, true }, // V,
{ true , true , true , true , true , true , false, false, false, false }, // T,
{ true , true , true , true , true , false, true , true , true , true }, // LV,
{ true , true , true , true , true , false, true , true , true , true }, // LVT
};
// Computes the basic per-code-unit attributes for the UTF-16 text uc[0 .. len):
//  - whiteSpace and charStop are written for every unit;
//  - at the end of each iteration i >= 1, charAttributes[i-1].lineBreakType receives the
//    break opportunity between unit i-1 and unit i (this store is skipped in the iteration
//    that sees a high surrogate followed by its low surrogate);
//  - the last unit always gets HB_ForcedBreak.
// Returns immediately when len is 0.
static void calcLineBreaks(const HB_UChar16 *uc, hb_uint32 len, HB_CharAttributes *charAttributes)
{
if (!len)
return;
// ##### can this fail if the first char is a surrogate?
HB_LineBreakClass cls;
HB_GraphemeClass grapheme;
HB_GetGraphemeAndLineBreakClass(*uc, &grapheme, &cls);
// handle case where input starts with an LF
if (cls == HB_LineBreak_LF)
cls = HB_LineBreak_BK;
charAttributes[0].whiteSpace = (cls == HB_LineBreak_SP || cls == HB_LineBreak_BK);
charAttributes[0].charStop = true;
// cls  : class used as the breakTable row; it is NOT updated for spaces or while inside a
//        combining sequence (see the next_no_cls_update label).
// lcls : class of the immediately preceding unit, updated on every completed iteration.
int lcls = cls;
for (hb_uint32 i = 1; i < len; ++i) {
charAttributes[i].whiteSpace = false;
charAttributes[i].charStop = true;
HB_UChar32 code = uc[i];
HB_GraphemeClass ngrapheme;
HB_LineBreakClass ncls;
HB_GetGraphemeAndLineBreakClass(code, &ngrapheme, &ncls);
// table is indexed [current][previous]
charAttributes[i].charStop = graphemeTable[ngrapheme][grapheme];
// handle surrogates
if (ncls == HB_LineBreak_SG) {
if (HB_IsHighSurrogate(uc[i]) && i < len - 1 && HB_IsLowSurrogate(uc[i+1])) {
// first half of a valid pair: classification is deferred to the low surrogate.
// cls, lcls and grapheme stay untouched and no lineBreakType is stored this round.
continue;
} else if (HB_IsLowSurrogate(uc[i]) && HB_IsHighSurrogate(uc[i-1])) {
// second half of a valid pair: classify the combined code point and forbid a
// character stop between the two halves.
code = HB_SurrogateToUcs4(uc[i-1], uc[i]);
HB_GetGraphemeAndLineBreakClass(code, &ngrapheme, &ncls);
charAttributes[i].charStop = false;
} else {
// unpaired surrogate: treat it as an ordinary alphabetic character
ncls = HB_LineBreak_AL;
}
}
// set the white space flag: SP, CR, LF and BK are the last four enumerators
if (ncls >= HB_LineBreak_SP)
charAttributes[i].whiteSpace = true;
HB_LineBreakType lineBreakType = HB_NoBreak;
// previous class LF or BK: always a forced break; CR: forced unless an LF follows
if (cls >= HB_LineBreak_LF) {
lineBreakType = HB_ForcedBreak;
} else if(cls == HB_LineBreak_CR) {
lineBreakType = (ncls == HB_LineBreak_LF) ? HB_NoBreak : HB_ForcedBreak;
}
// a space never breaks in front of itself and does not become the new table row
if (ncls == HB_LineBreak_SP)
goto next_no_cls_update;
// CR, LF and BK are not in the table; just remember them as the new row class
if (ncls >= HB_LineBreak_CR)
goto next;
{
int tcls = ncls;
// For South East Asian characters that require complex (dictionary-based) analysis, the
// Unicode standard recommends treating them as AL. thai_attributes and other attribute
// methods that do dictionary analysis can override the result.
if (tcls >= HB_LineBreak_SA)
tcls = HB_LineBreak_AL;
if (cls >= HB_LineBreak_SA)
cls = HB_LineBreak_AL;
int brk = breakTable[cls][tcls];
switch (brk) {
case DirectBreak:
lineBreakType = HB_Break;
if (uc[i-1] == 0xad) // soft hyphen
lineBreakType = HB_SoftHyphen;
break;
case IndirectBreak:
// break only if the two characters were separated by a space
lineBreakType = (lcls == HB_LineBreak_SP) ? HB_Break : HB_NoBreak;
break;
case CombiningIndirectBreak:
lineBreakType = HB_NoBreak;
if (lcls == HB_LineBreak_SP){
// the break goes in front of the space that precedes the combining character
if (i > 1)
charAttributes[i-2].lineBreakType = HB_Break;
} else {
// attached to its base: keep the base's class as the table row
goto next_no_cls_update;
}
break;
case CombiningProhibitedBreak:
lineBreakType = HB_NoBreak;
if (lcls != HB_LineBreak_SP)
goto next_no_cls_update;
// intentional fall through into ProhibitedBreak
case ProhibitedBreak:
default:
break;
}
}
next:
cls = ncls;
next_no_cls_update:
lcls = ncls;
grapheme = ngrapheme;
charAttributes[i-1].lineBreakType = lineBreakType;
}
// the end of the text always counts as a forced break
charAttributes[len-1].lineBreakType = HB_ForcedBreak;
}
// -------------------------------------------------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------------------------------------------------
// //
// Basic processing // Basic processing
@ -582,210 +383,63 @@ HB_Bool HB_BasicShape(HB_ShaperItem *shaper_item)
const HB_ScriptEngine HB_ScriptEngines[] = { const HB_ScriptEngine HB_ScriptEngines[] = {
// Common // Common
{ HB_BasicShape, 0}, { HB_BasicShape},
// Greek // Greek
{ HB_GreekShape, 0}, { HB_GreekShape},
// Cyrillic // Cyrillic
{ HB_BasicShape, 0}, { HB_BasicShape},
// Armenian // Armenian
{ HB_BasicShape, 0}, { HB_BasicShape},
// Hebrew // Hebrew
{ HB_HebrewShape, 0 }, { HB_HebrewShape},
// Arabic // Arabic
{ HB_ArabicShape, 0}, { HB_ArabicShape},
// Syriac // Syriac
{ HB_ArabicShape, 0}, { HB_ArabicShape},
// Thaana // Thaana
{ HB_BasicShape, 0 }, { HB_BasicShape},
// Devanagari // Devanagari
{ HB_IndicShape, HB_IndicAttributes }, { HB_IndicShape},
// Bengali // Bengali
{ HB_IndicShape, HB_IndicAttributes }, { HB_IndicShape},
// Gurmukhi // Gurmukhi
{ HB_IndicShape, HB_IndicAttributes }, { HB_IndicShape},
// Gujarati // Gujarati
{ HB_IndicShape, HB_IndicAttributes }, { HB_IndicShape},
// Oriya // Oriya
{ HB_IndicShape, HB_IndicAttributes }, { HB_IndicShape},
// Tamil // Tamil
{ HB_IndicShape, HB_IndicAttributes }, { HB_IndicShape},
// Telugu // Telugu
{ HB_IndicShape, HB_IndicAttributes }, { HB_IndicShape},
// Kannada // Kannada
{ HB_IndicShape, HB_IndicAttributes }, { HB_IndicShape},
// Malayalam // Malayalam
{ HB_IndicShape, HB_IndicAttributes }, { HB_IndicShape},
// Sinhala // Sinhala
{ HB_IndicShape, HB_IndicAttributes }, { HB_IndicShape},
// Thai // Thai
{ HB_BasicShape, HB_ThaiAttributes }, { HB_BasicShape},
// Lao // Lao
{ HB_BasicShape, 0 }, { HB_BasicShape},
// Tibetan // Tibetan
{ HB_TibetanShape, HB_TibetanAttributes }, { HB_TibetanShape},
// Myanmar // Myanmar
{ HB_MyanmarShape, HB_MyanmarAttributes }, { HB_MyanmarShape},
// Georgian // Georgian
{ HB_BasicShape, 0 }, { HB_BasicShape},
// Hangul // Hangul
{ HB_HangulShape, 0 }, { HB_HangulShape},
// Ogham // Ogham
{ HB_BasicShape, 0 }, { HB_BasicShape},
// Runic // Runic
{ HB_BasicShape, 0 }, { HB_BasicShape},
// Khmer // Khmer
{ HB_KhmerShape, HB_KhmerAttributes }, { HB_KhmerShape},
// N'Ko // N'Ko
{ HB_ArabicShape, 0} { HB_ArabicShape}
}; };
void HB_GetCharAttributes(const HB_UChar16 *string, hb_uint32 stringLength,
const HB_ScriptItem *items, hb_uint32 numItems,
HB_CharAttributes *attributes)
{
calcLineBreaks(string, stringLength, attributes);
for (hb_uint32 i = 0; i < numItems; ++i) {
HB_Script script = items[i].script;
if (script == HB_Script_Inherited)
script = HB_Script_Common;
HB_AttributeFunction attributeFunction = HB_ScriptEngines[script].charAttributes;
if (!attributeFunction)
continue;
attributeFunction(script, string, items[i].pos, items[i].length, attributes);
}
}
// Outcome of a wordbreakTable lookup. Middle means "no break only if the character
// after the current one continues the word"; HB_GetWordBoundaries() resolves it with
// a lookahead.
enum BreakRule { NoBreak = 0, Break = 1, Middle = 2 };
// wordbreakTable[previous][current]: rule for the boundary in front of the current
// character. Row labels are on the right, column labels in the first comment line.
static const hb_uint8 wordbreakTable[HB_Word_ExtendNumLet + 1][HB_Word_ExtendNumLet + 1] = {
// Other Format Katakana ALetter MidLetter MidNum Numeric ExtendNumLet
{ Break, Break, Break, Break, Break, Break, Break, Break }, // Other
{ Break, Break, Break, Break, Break, Break, Break, Break }, // Format
{ Break, Break, NoBreak, Break, Break, Break, Break, NoBreak }, // Katakana
{ Break, Break, Break, NoBreak, Middle, Break, NoBreak, NoBreak }, // ALetter
{ Break, Break, Break, Break, Break, Break, Break, Break }, // MidLetter
{ Break, Break, Break, Break, Break, Break, Break, Break }, // MidNum
{ Break, Break, Break, NoBreak, Break, Middle, NoBreak, NoBreak }, // Numeric
{ Break, Break, NoBreak, NoBreak, Break, Break, NoBreak, NoBreak }, // ExtendNumLet
};
// Sets attributes[i].wordBoundary for every unit of string[0 .. stringLength).
// Reads attributes[i].charStop, so the char-stop flags must already be filled in
// (the header requires HB_GetCharAttributes to be called first). The script items
// are not used. Does nothing for an empty string.
void HB_GetWordBoundaries(const HB_UChar16 *string, hb_uint32 stringLength,
const HB_ScriptItem * /*items*/, hb_uint32 /*numItems*/,
HB_CharAttributes *attributes)
{
if (stringLength == 0)
return;
// brk: word class of the previous significant character (table row)
unsigned int brk = HB_GetWordClass(string[0]);
attributes[0].wordBoundary = true;
for (hb_uint32 i = 1; i < stringLength; ++i) {
// a word can only start where a character stop is allowed
if (!attributes[i].charStop) {
attributes[i].wordBoundary = false;
continue;
}
hb_uint32 nbrk = HB_GetWordClass(string[i]);
if (nbrk == HB_Word_Format) {
// Format characters only start a word right after a sentence separator, and they
// do not become the new row class (brk is left unchanged).
attributes[i].wordBoundary = (HB_GetSentenceClass(string[i-1]) == HB_Sentence_Sep);
continue;
}
BreakRule rule = (BreakRule)wordbreakTable[brk][nbrk];
if (rule == Middle) {
// Break unless the next non-Format character has the same class as the previous one.
rule = Break;
hb_uint32 lookahead = i + 1;
while (lookahead < stringLength) {
hb_uint32 testbrk = HB_GetWordClass(string[lookahead]);
if (testbrk == HB_Word_Format && HB_GetSentenceClass(string[lookahead]) != HB_Sentence_Sep) {
++lookahead;
continue;
}
if (testbrk == brk) {
rule = NoBreak;
// clear every unit that was skipped; this advances the outer loop index to lookahead
while (i < lookahead)
attributes[i++].wordBoundary = false;
nbrk = testbrk;
}
break;
}
}
attributes[i].wordBoundary = (rule == Break);
brk = nbrk;
}
}
// States of the sentence-break automaton driven by HB_GetSentenceBoundaries().
// SB_Break and SB_Look are results only: they have no row in sentenceBreakTable
// and are resolved by the caller before the table is indexed again.
enum SentenceBreakStates {
SB_Initial,
SB_Upper,
SB_UpATerm,
SB_ATerm,
SB_ATermC,
SB_ACS,
SB_STerm,
SB_STermC,
SB_SCS,
SB_BAfter,
SB_Break,
SB_Look
};
// sentenceBreakTable[state][sentence class of the current character] -> next state.
// Row labels are on the right, column labels in the first comment line.
// NOTE(review): the first dimension is declared as HB_Sentence_Close + 1 (a class
// count) although rows are states; ten rows are initialised -- confirm this is intended.
static const hb_uint8 sentenceBreakTable[HB_Sentence_Close + 1][HB_Sentence_Close + 1] = {
// Other Sep Format Sp Lower Upper OLetter Numeric ATerm STerm Close
{ SB_Initial, SB_BAfter , SB_Initial, SB_Initial, SB_Initial, SB_Upper , SB_Initial, SB_Initial, SB_ATerm , SB_STerm , SB_Initial }, // SB_Initial,
{ SB_Initial, SB_BAfter , SB_Upper , SB_Initial, SB_Initial, SB_Upper , SB_Initial, SB_Initial, SB_UpATerm, SB_STerm , SB_Initial }, // SB_Upper
{ SB_Look , SB_BAfter , SB_UpATerm, SB_ACS , SB_Initial, SB_Upper , SB_Break , SB_Initial, SB_ATerm , SB_STerm , SB_ATermC }, // SB_UpATerm
{ SB_Look , SB_BAfter , SB_ATerm , SB_ACS , SB_Initial, SB_Break , SB_Break , SB_Initial, SB_ATerm , SB_STerm , SB_ATermC }, // SB_ATerm
{ SB_Look , SB_BAfter , SB_ATermC , SB_ACS , SB_Initial, SB_Break , SB_Break , SB_Look , SB_ATerm , SB_STerm , SB_ATermC }, // SB_ATermC,
{ SB_Look , SB_BAfter , SB_ACS , SB_ACS , SB_Initial, SB_Break , SB_Break , SB_Look , SB_ATerm , SB_STerm , SB_Look }, // SB_ACS,
{ SB_Break , SB_BAfter , SB_STerm , SB_SCS , SB_Break , SB_Break , SB_Break , SB_Break , SB_ATerm , SB_STerm , SB_STermC }, // SB_STerm,
{ SB_Break , SB_BAfter , SB_STermC , SB_SCS , SB_Break , SB_Break , SB_Break , SB_Break , SB_ATerm , SB_STerm , SB_STermC }, // SB_STermC,
{ SB_Break , SB_BAfter , SB_SCS , SB_SCS , SB_Break , SB_Break , SB_Break , SB_Break , SB_ATerm , SB_STerm , SB_Break }, // SB_SCS,
{ SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break }, // SB_BAfter,
};
void HB_GetSentenceBoundaries(const HB_UChar16 *string, hb_uint32 stringLength,
const HB_ScriptItem * /*items*/, hb_uint32 /*numItems*/,
HB_CharAttributes *attributes)
{
if (stringLength == 0)
return;
hb_uint32 brk = sentenceBreakTable[SB_Initial][HB_GetSentenceClass(string[0])];
attributes[0].sentenceBoundary = true;
for (hb_uint32 i = 1; i < stringLength; ++i) {
if (!attributes[i].charStop) {
attributes[i].sentenceBoundary = false;
continue;
}
brk = sentenceBreakTable[brk][HB_GetSentenceClass(string[i])];
if (brk == SB_Look) {
brk = SB_Break;
hb_uint32 lookahead = i + 1;
while (lookahead < stringLength) {
hb_uint32 sbrk = HB_GetSentenceClass(string[lookahead]);
if (sbrk != HB_Sentence_Other && sbrk != HB_Sentence_Numeric && sbrk != HB_Sentence_Close) {
break;
} else if (sbrk == HB_Sentence_Lower) {
brk = SB_Initial;
break;
}
++lookahead;
}
if (brk == SB_Initial) {
while (i < lookahead)
attributes[i++].sentenceBoundary = false;
}
}
if (brk == SB_Break) {
attributes[i].sentenceBoundary = true;
brk = sentenceBreakTable[SB_Initial][HB_GetSentenceClass(string[i])];
} else {
attributes[i].sentenceBoundary = false;
}
}
}
static inline char *tag_to_string(HB_UInt tag) static inline char *tag_to_string(HB_UInt tag)
{ {
@ -1335,4 +989,3 @@ HB_Bool HB_ShapeItem(HB_ShaperItem *shaper_item)
shaper_item->glyphIndicesPresent = false; shaper_item->glyphIndicesPresent = false;
return result; return result;
} }

View File

@ -130,37 +130,6 @@ typedef struct
hb_uint8 bidiLevel; hb_uint8 bidiLevel;
} HB_ScriptItem; } HB_ScriptItem;
/*
Kind of line-break opportunity stored in HB_CharAttributes.lineBreakType.
The four values fit the two-bit field used there.
*/
typedef enum {
HB_NoBreak,
HB_SoftHyphen,
HB_Break,
HB_ForcedBreak
} HB_LineBreakType;
/*
Per-code-unit text attributes, packed into bit-fields. One entry exists for each
HB_UChar16 of the analysed string; the functions declared after this struct fill
them in.
*/
typedef struct {
/*HB_LineBreakType*/ hb_bitfield lineBreakType :2;
/*HB_Bool*/ hb_bitfield whiteSpace :1; /* A unicode whitespace character, except NBSP, ZWNBSP */
/*HB_Bool*/ hb_bitfield charStop :1; /* Valid cursor position (for left/right arrow) */
/*HB_Bool*/ hb_bitfield wordBoundary :1;
/*HB_Bool*/ hb_bitfield sentenceBoundary :1;
hb_bitfield unused :2;
} HB_CharAttributes;
/*
Fills lineBreakType, whiteSpace and charStop for every unit of `string`, then lets
the script engine of each item in `items` refine the attributes of its run.
`attributes` must have room for stringLength entries.
*/
void HB_GetCharAttributes(const HB_UChar16 *string, hb_uint32 stringLength,
const HB_ScriptItem *items, hb_uint32 numItems,
HB_CharAttributes *attributes);
/* requires HB_GetCharAttributes to be called before (reads charStop); sets wordBoundary */
void HB_GetWordBoundaries(const HB_UChar16 *string, hb_uint32 stringLength,
const HB_ScriptItem *items, hb_uint32 numItems,
HB_CharAttributes *attributes);
/* requires HB_GetCharAttributes to be called before (reads charStop); sets sentenceBoundary */
void HB_GetSentenceBoundaries(const HB_UChar16 *string, hb_uint32 stringLength,
const HB_ScriptItem *items, hb_uint32 numItems,
HB_CharAttributes *attributes);
typedef enum { typedef enum {
HB_LeftToRight = 0, HB_LeftToRight = 0,

View File

@ -1,111 +0,0 @@
/*
* Copyright (C) 2008 Nokia Corporation and/or its subsidiary(-ies)
*
* This is part of HarfBuzz, an OpenType Layout engine library.
*
* Permission is hereby granted, without written agreement and without
* license or royalty fees, to use, copy, modify, and distribute this
* software and its documentation for any purpose, provided that the
* above copyright notice and the following two paragraphs appear in
* all copies of this software.
*
* IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
* DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
* ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
* IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*
* THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
* ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
* PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
*/
#include "harfbuzz-shaper.h"
#include "harfbuzz-shaper-private.h"
#include "harfbuzz-external.h"
#include <assert.h>
#include <stdio.h>
typedef int (*th_brk_def)(const char*, int[], int);
static th_brk_def th_brk = 0;
static int libthai_resolved = 0;
/*
 * Looks up th_brk in the "thai" library, at most once per process.
 *
 * libthai_resolved is set even when the lookup fails, so a missing library is
 * not searched for again; callers must still check th_brk before using it.
 */
static void resolve_libthai(void)
{
    if (!th_brk)
        th_brk = (th_brk_def)HB_Library_Resolve("thai", 0, "th_brk");
    libthai_resolved = 1;
}
/*
 * Converts `len` UTF-16 code units to a NUL-terminated TIS-620 byte string.
 *
 * Code units up to 0xa0 are copied unchanged, U+0E01..U+0E5B are mapped to
 * 0xa1..0xfb, and everything else becomes '?'. The output has exactly one byte
 * per input unit, so byte offsets in the result equal indices into `string`.
 *
 * `cstr` must point to writable storage of at least len + 1 bytes; it is
 * declared const only for historical reasons and is written through.
 */
static void to_tis620(const HB_UChar16 *string, hb_uint32 len, const char *cstr)
{
    unsigned char *result = (unsigned char *)cstr;
    hb_uint32 i;

    for (i = 0; i < len; ++i) {
        const HB_UChar16 ch = string[i];

        if (ch <= 0xa0)
            result[i] = (unsigned char)ch;
        /* must be an else-if: otherwise the final else overwrites every
           pass-through character above with '?' */
        else if (ch >= 0xe01 && ch <= 0xe5b)
            result[i] = (unsigned char)(ch - 0xe00 + 0xa0);
        else
            result[i] = '?';
    }
    result[len] = 0;
}
/*
 * Uses th_brk (if it could be resolved) to mark word boundaries and line-break
 * opportunities in the Thai text string[0 .. len).
 *
 * `attributes` points at the entry for string[0]. When th_brk is unavailable or
 * memory cannot be allocated, the attributes are left untouched. Otherwise
 * lineBreakType / wordBoundary are reset for the whole run and then set on the
 * unit in front of every break position reported by th_brk.
 */
static void thaiWordBreaks(const HB_UChar16 *string, hb_uint32 len, HB_CharAttributes *attributes)
{
    char s[128];
    char *cstr = s;
    int brp[128];
    int *break_positions = brp;
    hb_uint32 numbreaks;
    hb_uint32 i;

    if (!libthai_resolved)
        resolve_libthai();

    if (!th_brk)
        return;

    /* the stack buffer holds up to 127 characters plus the terminator */
    if (len >= 128) {
        cstr = (char *)malloc(len*sizeof(char) + 1);
        if (!cstr)
            return;
    }

    to_tis620(string, len, cstr);

    numbreaks = th_brk(cstr, break_positions, 128);
    if (numbreaks > 128) {
        /* more breaks than the stack buffer can hold: retry with a heap buffer */
        break_positions = (int *)malloc(numbreaks * sizeof(int));
        if (!break_positions) {
            if (cstr != s)
                free(cstr);
            return;
        }
        numbreaks = th_brk(cstr, break_positions, numbreaks);
    }

    for (i = 0; i < len; ++i) {
        attributes[i].lineBreakType = HB_NoBreak;
        attributes[i].wordBoundary = FALSE;
    }

    for (i = 0; i < numbreaks; ++i) {
        /* ignore positions that would index outside the run */
        if (break_positions[i] > 0 && (hb_uint32)break_positions[i] <= len) {
            attributes[break_positions[i]-1].lineBreakType = HB_Break;
            attributes[break_positions[i]-1].wordBoundary = TRUE;
        }
    }

    if (break_positions != brp)
        free(break_positions);

    if (cstr != s)
        free(cstr);
}
/*
 * Attribute hook for Thai runs: hands the run text[from .. from+len) and the
 * matching slice of `attributes` to thaiWordBreaks(). `script` is only checked
 * by the assertion.
 */
void HB_ThaiAttributes(HB_Script script, const HB_UChar16 *text, hb_uint32 from, hb_uint32 len, HB_CharAttributes *attributes)
{
    assert(script == HB_Script_Thai);
    thaiWordBreaks(&text[from], len, &attributes[from]);
}

View File

@ -246,29 +246,3 @@ HB_Bool HB_TibetanShape(HB_ShaperItem *item)
item->num_glyphs = first_glyph; item->num_glyphs = first_glyph;
return TRUE; return TRUE;
} }
/*
 * Marks character stops for the Tibetan run text[from .. from+len).
 *
 * `attributes` is the attribute array of the whole string; the entries of the
 * run start at attributes[from]. The first unit of every syllable reported by
 * tibetan_nextSyllableBoundary() gets charStop = TRUE, every other unit of that
 * syllable gets charStop = FALSE. `script` is not used.
 */
void HB_TibetanAttributes(HB_Script script, const HB_UChar16 *text, hb_uint32 from, hb_uint32 len, HB_CharAttributes *attributes)
{
    int end = from + len;
    hb_uint32 i = 0;
    HB_UNUSED(script);

    attributes += from;
    while (i < len) {
        HB_Bool invalid;
        hb_uint32 boundary = tibetan_nextSyllableBoundary(text, from + i, end, &invalid) - from;

        attributes[i].charStop = TRUE;

        /* Clamp a boundary that reaches the last unit (or beyond) to the end of the run. */
        if (boundary > len - 1)
            boundary = len;

        /* Everything up to the next boundary belongs to the same syllable. */
        for (++i; i < boundary; ++i)
            attributes[i].charStop = FALSE;

        assert(i == boundary);
    }
}
}

View File

@ -1,32 +0,0 @@
/*
* Copyright (C) 2006 Behdad Esfahbod
*
* This is part of HarfBuzz, an OpenType Layout engine library.
*
* Permission is hereby granted, without written agreement and without
* license or royalty fees, to use, copy, modify, and distribute this
* software and its documentation for any purpose, provided that the
* above copyright notice and the following two paragraphs appear in
* all copies of this software.
*
* IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
* DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
* ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
* IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*
* THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
* ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
* PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
*/
#define HB_INTERNAL static
#include "harfbuzz-buffer.c"
#include "harfbuzz-gdef.c"
#include "harfbuzz-gsub.c"
#include "harfbuzz-gpos.c"
#include "harfbuzz-impl.c"
#include "harfbuzz-open.c"
#include "harfbuzz-stream.c"