New API: hb_buffer_[sg]et_replacement_codepoint()
With this change, invalid UTF-8/16/32 sequences are now replaced with
U+FFFD by default. The replacement can be changed by calling the new
API on the buffer. Previously the replacement value was
(hb_codepoint_t)-1.
Note that hb_buffer_clear_contents() does NOT reset the replacement
codepoint; hb_buffer_reset() does.
See discussion here:
6f13b6d62d
New API:
hb_buffer_set_replacement_codepoint()
hb_buffer_get_replacement_codepoint()
This commit is contained in:
parent
bcba8b4502
commit
976c8f4552
|
@ -52,6 +52,7 @@ struct hb_buffer_t {
|
||||||
hb_unicode_funcs_t *unicode; /* Unicode functions */
|
hb_unicode_funcs_t *unicode; /* Unicode functions */
|
||||||
hb_segment_properties_t props; /* Script, language, direction */
|
hb_segment_properties_t props; /* Script, language, direction */
|
||||||
hb_buffer_flags_t flags; /* BOT / EOT / etc. */
|
hb_buffer_flags_t flags; /* BOT / EOT / etc. */
|
||||||
|
hb_codepoint_t replacement; /* U+FFFD or something else. */
|
||||||
|
|
||||||
/* Buffer contents */
|
/* Buffer contents */
|
||||||
|
|
||||||
|
|
|
@ -178,6 +178,7 @@ hb_buffer_t::reset (void)
|
||||||
|
|
||||||
hb_unicode_funcs_destroy (unicode);
|
hb_unicode_funcs_destroy (unicode);
|
||||||
unicode = hb_unicode_funcs_get_default ();
|
unicode = hb_unicode_funcs_get_default ();
|
||||||
|
replacement = HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT;
|
||||||
|
|
||||||
clear ();
|
clear ();
|
||||||
}
|
}
|
||||||
|
@ -703,6 +704,7 @@ hb_buffer_get_empty (void)
|
||||||
const_cast<hb_unicode_funcs_t *> (&_hb_unicode_funcs_nil),
|
const_cast<hb_unicode_funcs_t *> (&_hb_unicode_funcs_nil),
|
||||||
HB_SEGMENT_PROPERTIES_DEFAULT,
|
HB_SEGMENT_PROPERTIES_DEFAULT,
|
||||||
HB_BUFFER_FLAG_DEFAULT,
|
HB_BUFFER_FLAG_DEFAULT,
|
||||||
|
HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT,
|
||||||
|
|
||||||
HB_BUFFER_CONTENT_TYPE_INVALID,
|
HB_BUFFER_CONTENT_TYPE_INVALID,
|
||||||
true, /* in_error */
|
true, /* in_error */
|
||||||
|
@ -1047,6 +1049,42 @@ hb_buffer_get_flags (hb_buffer_t *buffer)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* hb_buffer_set_replacement_codepoint:
|
||||||
|
* @buffer: a buffer.
|
||||||
|
* @replacement: the codepoint to store in @buffer; the default is HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT (0xFFFDu).
|
||||||
|
*
|
||||||
|
* Sets the codepoint used to replace invalid UTF-8/16/32 entries.
* Does nothing if hb_object_is_inert() is true for @buffer.
|
||||||
|
*
|
||||||
|
* Since: 1.0
|
||||||
|
**/
|
||||||
|
void
|
||||||
|
hb_buffer_set_replacement_codepoint (hb_buffer_t *buffer,
|
||||||
|
hb_codepoint_t replacement)
|
||||||
|
{
|
||||||
|
if (unlikely (hb_object_is_inert (buffer)))
|
||||||
|
return;
|
||||||
|
|
||||||
|
buffer->replacement = replacement;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* hb_buffer_get_replacement_codepoint:
|
||||||
|
* @buffer: a buffer.
|
||||||
|
*
|
||||||
|
* Reads the replacement codepoint currently stored in @buffer.
|
||||||
|
*
|
||||||
|
* Return value: the value of @buffer's replacement field, i.e. the codepoint used to replace invalid UTF-8/16/32 entries.
|
||||||
|
*
|
||||||
|
* Since: 1.0
|
||||||
|
**/
|
||||||
|
hb_codepoint_t
|
||||||
|
hb_buffer_get_replacement_codepoint (hb_buffer_t *buffer)
|
||||||
|
{
|
||||||
|
return buffer->replacement;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* hb_buffer_reset:
|
* hb_buffer_reset:
|
||||||
* @buffer: a buffer.
|
* @buffer: a buffer.
|
||||||
|
@ -1299,6 +1337,7 @@ hb_buffer_add_utf (hb_buffer_t *buffer,
|
||||||
int item_length)
|
int item_length)
|
||||||
{
|
{
|
||||||
typedef hb_utf_t<T, true> utf_t;
|
typedef hb_utf_t<T, true> utf_t;
|
||||||
|
const hb_codepoint_t replacement = buffer->replacement;
|
||||||
|
|
||||||
assert (buffer->content_type == HB_BUFFER_CONTENT_TYPE_UNICODE ||
|
assert (buffer->content_type == HB_BUFFER_CONTENT_TYPE_UNICODE ||
|
||||||
(!buffer->len && buffer->content_type == HB_BUFFER_CONTENT_TYPE_INVALID));
|
(!buffer->len && buffer->content_type == HB_BUFFER_CONTENT_TYPE_INVALID));
|
||||||
|
@ -1330,7 +1369,7 @@ hb_buffer_add_utf (hb_buffer_t *buffer,
|
||||||
while (start < prev && buffer->context_len[0] < buffer->CONTEXT_LENGTH)
|
while (start < prev && buffer->context_len[0] < buffer->CONTEXT_LENGTH)
|
||||||
{
|
{
|
||||||
hb_codepoint_t u;
|
hb_codepoint_t u;
|
||||||
prev = utf_t::prev (prev, start, &u);
|
prev = utf_t::prev (prev, start, &u, replacement);
|
||||||
buffer->context[0][buffer->context_len[0]++] = u;
|
buffer->context[0][buffer->context_len[0]++] = u;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1341,7 +1380,7 @@ hb_buffer_add_utf (hb_buffer_t *buffer,
|
||||||
{
|
{
|
||||||
hb_codepoint_t u;
|
hb_codepoint_t u;
|
||||||
const T *old_next = next;
|
const T *old_next = next;
|
||||||
next = utf_t::next (next, end, &u);
|
next = utf_t::next (next, end, &u, replacement);
|
||||||
buffer->add (u, old_next - (const T *) text);
|
buffer->add (u, old_next - (const T *) text);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1351,7 +1390,7 @@ hb_buffer_add_utf (hb_buffer_t *buffer,
|
||||||
while (next < end && buffer->context_len[1] < buffer->CONTEXT_LENGTH)
|
while (next < end && buffer->context_len[1] < buffer->CONTEXT_LENGTH)
|
||||||
{
|
{
|
||||||
hb_codepoint_t u;
|
hb_codepoint_t u;
|
||||||
next = utf_t::next (next, end, &u);
|
next = utf_t::next (next, end, &u, replacement);
|
||||||
buffer->context[1][buffer->context_len[1]++] = u;
|
buffer->context[1][buffer->context_len[1]++] = u;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -186,12 +186,25 @@ hb_buffer_flags_t
|
||||||
hb_buffer_get_flags (hb_buffer_t *buffer);
|
hb_buffer_get_flags (hb_buffer_t *buffer);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#define HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT 0xFFFDu
|
||||||
|
|
||||||
|
/* Sets codepoint used to replace invalid UTF-8/16/32 entries.
|
||||||
|
* Default is 0xFFFDu. */
|
||||||
|
void
|
||||||
|
hb_buffer_set_replacement_codepoint (hb_buffer_t *buffer,
|
||||||
|
hb_codepoint_t replacement);
|
||||||
|
|
||||||
|
hb_codepoint_t
|
||||||
|
hb_buffer_get_replacement_codepoint (hb_buffer_t *buffer);
|
||||||
|
|
||||||
|
|
||||||
/* Resets the buffer. Afterwards it's as if it was just created,
|
/* Resets the buffer. Afterwards it's as if it was just created,
|
||||||
* except that it has a larger buffer allocated perhaps... */
|
* except that it has a larger buffer allocated perhaps... */
|
||||||
void
|
void
|
||||||
hb_buffer_reset (hb_buffer_t *buffer);
|
hb_buffer_reset (hb_buffer_t *buffer);
|
||||||
|
|
||||||
/* Like reset, but does NOT clear unicode_funcs. */
|
/* Like reset, but does NOT clear unicode_funcs or replacement_codepoint. */
|
||||||
void
|
void
|
||||||
hb_buffer_clear_contents (hb_buffer_t *buffer);
|
hb_buffer_clear_contents (hb_buffer_t *buffer);
|
||||||
|
|
||||||
|
|
|
@ -40,7 +40,8 @@ struct hb_utf_t<uint8_t, true>
|
||||||
static inline const uint8_t *
|
static inline const uint8_t *
|
||||||
next (const uint8_t *text,
|
next (const uint8_t *text,
|
||||||
const uint8_t *end,
|
const uint8_t *end,
|
||||||
hb_codepoint_t *unicode)
|
hb_codepoint_t *unicode,
|
||||||
|
hb_codepoint_t replacement)
|
||||||
{
|
{
|
||||||
/* Written to only accept well-formed sequences.
|
/* Written to only accept well-formed sequences.
|
||||||
* Based on ideas from ICU's U8_NEXT.
|
* Based on ideas from ICU's U8_NEXT.
|
||||||
|
@ -101,23 +102,24 @@ struct hb_utf_t<uint8_t, true>
|
||||||
return text;
|
return text;
|
||||||
|
|
||||||
error:
|
error:
|
||||||
*unicode = -1;
|
*unicode = replacement;
|
||||||
return text;
|
return text;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline const uint8_t *
|
static inline const uint8_t *
|
||||||
prev (const uint8_t *text,
|
prev (const uint8_t *text,
|
||||||
const uint8_t *start,
|
const uint8_t *start,
|
||||||
hb_codepoint_t *unicode)
|
hb_codepoint_t *unicode,
|
||||||
|
hb_codepoint_t replacement)
|
||||||
{
|
{
|
||||||
const uint8_t *end = text--;
|
const uint8_t *end = text--;
|
||||||
while (start < text && (*text & 0xc0) == 0x80 && end - text < 4)
|
while (start < text && (*text & 0xc0) == 0x80 && end - text < 4)
|
||||||
text--;
|
text--;
|
||||||
|
|
||||||
if (likely (next (text, end, unicode) == end))
|
if (likely (next (text, end, unicode, replacement) == end))
|
||||||
return text;
|
return text;
|
||||||
|
|
||||||
*unicode = -1;
|
*unicode = replacement;
|
||||||
return end - 1;
|
return end - 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -137,7 +139,8 @@ struct hb_utf_t<uint16_t, true>
|
||||||
static inline const uint16_t *
|
static inline const uint16_t *
|
||||||
next (const uint16_t *text,
|
next (const uint16_t *text,
|
||||||
const uint16_t *end,
|
const uint16_t *end,
|
||||||
hb_codepoint_t *unicode)
|
hb_codepoint_t *unicode,
|
||||||
|
hb_codepoint_t replacement)
|
||||||
{
|
{
|
||||||
hb_codepoint_t c = *text++;
|
hb_codepoint_t c = *text++;
|
||||||
|
|
||||||
|
@ -161,14 +164,15 @@ struct hb_utf_t<uint16_t, true>
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Lonely / out-of-order surrogate. */
|
/* Lonely / out-of-order surrogate. */
|
||||||
*unicode = -1;
|
*unicode = replacement;
|
||||||
return text;
|
return text;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline const uint16_t *
|
static inline const uint16_t *
|
||||||
prev (const uint16_t *text,
|
prev (const uint16_t *text,
|
||||||
const uint16_t *start,
|
const uint16_t *start,
|
||||||
hb_codepoint_t *unicode)
|
hb_codepoint_t *unicode,
|
||||||
|
hb_codepoint_t replacement)
|
||||||
{
|
{
|
||||||
const uint16_t *end = text--;
|
const uint16_t *end = text--;
|
||||||
hb_codepoint_t c = *text;
|
hb_codepoint_t c = *text;
|
||||||
|
@ -182,10 +186,10 @@ struct hb_utf_t<uint16_t, true>
|
||||||
if (likely (start < text && hb_in_range (c, 0xDC00u, 0xDFFFu)))
|
if (likely (start < text && hb_in_range (c, 0xDC00u, 0xDFFFu)))
|
||||||
text--;
|
text--;
|
||||||
|
|
||||||
if (likely (next (text, end, unicode) == end))
|
if (likely (next (text, end, unicode, replacement) == end))
|
||||||
return text;
|
return text;
|
||||||
|
|
||||||
*unicode = -1;
|
*unicode = replacement;
|
||||||
return end - 1;
|
return end - 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -208,7 +212,8 @@ struct hb_utf_t<uint32_t, validate>
|
||||||
static inline const uint32_t *
|
static inline const uint32_t *
|
||||||
next (const uint32_t *text,
|
next (const uint32_t *text,
|
||||||
const uint32_t *end HB_UNUSED,
|
const uint32_t *end HB_UNUSED,
|
||||||
hb_codepoint_t *unicode)
|
hb_codepoint_t *unicode,
|
||||||
|
hb_codepoint_t replacement)
|
||||||
{
|
{
|
||||||
hb_codepoint_t c = *text++;
|
hb_codepoint_t c = *text++;
|
||||||
if (validate && unlikely (c > 0x10FFFFu || hb_in_range (c, 0xD800u, 0xDFFFu)))
|
if (validate && unlikely (c > 0x10FFFFu || hb_in_range (c, 0xD800u, 0xDFFFu)))
|
||||||
|
@ -217,16 +222,17 @@ struct hb_utf_t<uint32_t, validate>
|
||||||
return text;
|
return text;
|
||||||
|
|
||||||
error:
|
error:
|
||||||
*unicode = -1;
|
*unicode = replacement;
|
||||||
return text;
|
return text;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline const uint32_t *
|
static inline const uint32_t *
|
||||||
prev (const uint32_t *text,
|
prev (const uint32_t *text,
|
||||||
const uint32_t *start HB_UNUSED,
|
const uint32_t *start HB_UNUSED,
|
||||||
hb_codepoint_t *unicode)
|
hb_codepoint_t *unicode,
|
||||||
|
hb_codepoint_t replacement)
|
||||||
{
|
{
|
||||||
next (text - 1, text, unicode);
|
next (text - 1, text, unicode, replacement);
|
||||||
return text - 1;
|
return text - 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -374,6 +374,7 @@ test_buffer_utf8_conversion (void)
|
||||||
unsigned int bytes, chars, i, j, len;
|
unsigned int bytes, chars, i, j, len;
|
||||||
|
|
||||||
b = hb_buffer_create ();
|
b = hb_buffer_create ();
|
||||||
|
hb_buffer_set_replacement_codepoint (b, (hb_codepoint_t) -1);
|
||||||
|
|
||||||
for (i = 0; i < G_N_ELEMENTS (utf8_conversion_tests); i++)
|
for (i = 0; i < G_N_ELEMENTS (utf8_conversion_tests); i++)
|
||||||
{
|
{
|
||||||
|
@ -388,7 +389,7 @@ test_buffer_utf8_conversion (void)
|
||||||
for (chars = 0; test->codepoints[chars]; chars++)
|
for (chars = 0; test->codepoints[chars]; chars++)
|
||||||
;
|
;
|
||||||
|
|
||||||
hb_buffer_reset (b);
|
hb_buffer_clear_contents (b);
|
||||||
hb_buffer_add_utf8 (b, test->utf8, bytes, 1, bytes - 2);
|
hb_buffer_add_utf8 (b, test->utf8, bytes, 1, bytes - 2);
|
||||||
|
|
||||||
glyphs = hb_buffer_get_glyph_infos (b, &len);
|
glyphs = hb_buffer_get_glyph_infos (b, &len);
|
||||||
|
@ -660,6 +661,7 @@ test_buffer_utf8_validity (void)
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
|
|
||||||
b = hb_buffer_create ();
|
b = hb_buffer_create ();
|
||||||
|
hb_buffer_set_replacement_codepoint (b, (hb_codepoint_t) -1);
|
||||||
|
|
||||||
for (i = 0; i < G_N_ELEMENTS (utf8_validity_tests); i++)
|
for (i = 0; i < G_N_ELEMENTS (utf8_validity_tests); i++)
|
||||||
{
|
{
|
||||||
|
@ -678,7 +680,7 @@ test_buffer_utf8_validity (void)
|
||||||
else
|
else
|
||||||
segment_bytes = test->max_len;
|
segment_bytes = test->max_len;
|
||||||
|
|
||||||
hb_buffer_reset (b);
|
hb_buffer_clear_contents (b);
|
||||||
hb_buffer_add_utf8 (b, test->utf8, text_bytes, 0, segment_bytes);
|
hb_buffer_add_utf8 (b, test->utf8, text_bytes, 0, segment_bytes);
|
||||||
|
|
||||||
glyphs = hb_buffer_get_glyph_infos (b, &len);
|
glyphs = hb_buffer_get_glyph_infos (b, &len);
|
||||||
|
@ -718,6 +720,7 @@ test_buffer_utf16_conversion (void)
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
|
|
||||||
b = hb_buffer_create ();
|
b = hb_buffer_create ();
|
||||||
|
hb_buffer_set_replacement_codepoint (b, (hb_codepoint_t) -1);
|
||||||
|
|
||||||
for (i = 0; i < G_N_ELEMENTS (utf16_conversion_tests); i++)
|
for (i = 0; i < G_N_ELEMENTS (utf16_conversion_tests); i++)
|
||||||
{
|
{
|
||||||
|
@ -732,7 +735,7 @@ test_buffer_utf16_conversion (void)
|
||||||
for (chars = 0; test->codepoints[chars]; chars++)
|
for (chars = 0; test->codepoints[chars]; chars++)
|
||||||
;
|
;
|
||||||
|
|
||||||
hb_buffer_reset (b);
|
hb_buffer_clear_contents (b);
|
||||||
hb_buffer_add_utf16 (b, test->utf16, u_len, 1, u_len - 2);
|
hb_buffer_add_utf16 (b, test->utf16, u_len, 1, u_len - 2);
|
||||||
|
|
||||||
glyphs = hb_buffer_get_glyph_infos (b, &len);
|
glyphs = hb_buffer_get_glyph_infos (b, &len);
|
||||||
|
@ -752,15 +755,15 @@ typedef struct {
|
||||||
|
|
||||||
/* note: we skip the first and last item from utf32 when adding to buffer */
|
/* note: we skip the first and last item from utf32 when adding to buffer */
|
||||||
static const utf32_conversion_test_t utf32_conversion_tests[] = {
|
static const utf32_conversion_test_t utf32_conversion_tests[] = {
|
||||||
{{0x41, 0x004D, 0x0430, 0x4E8C, 0xD800, 0xDF02, 0x61} , {0x004D, 0x0430, 0x4E8C, -1, -1}},
|
{{0x41, 0x004D, 0x0430, 0x4E8C, 0xD800, 0xDF02, 0x61} , {0x004D, 0x0430, 0x4E8C, -3, -3}},
|
||||||
{{0x41, 0x004D, 0x0430, 0x4E8C, 0x10302, 0x61} , {0x004D, 0x0430, 0x4E8C, 0x10302}},
|
{{0x41, 0x004D, 0x0430, 0x4E8C, 0x10302, 0x61} , {0x004D, 0x0430, 0x4E8C, 0x10302}},
|
||||||
{{0x41, 0xD800, 0xDF02, 0x61}, {-1, -1}},
|
{{0x41, 0xD800, 0xDF02, 0x61}, {-3, -3}},
|
||||||
{{0x41, 0xD800, 0xDF02}, {-1}},
|
{{0x41, 0xD800, 0xDF02}, {-3}},
|
||||||
{{0x41, 0x61, 0xD800, 0xDF02}, {0x61, -1}},
|
{{0x41, 0x61, 0xD800, 0xDF02}, {0x61, -3}},
|
||||||
{{0x41, 0xD800, 0x61, 0xDF02}, {-1, 0x61}},
|
{{0x41, 0xD800, 0x61, 0xDF02}, {-3, 0x61}},
|
||||||
{{0x41, 0xDF00, 0x61}, {-1}},
|
{{0x41, 0xDF00, 0x61}, {-3}},
|
||||||
{{0x41, 0x10FFFF, 0x61}, {0x10FFFF}},
|
{{0x41, 0x10FFFF, 0x61}, {0x10FFFF}},
|
||||||
{{0x41, 0x110000, 0x61}, {-1}},
|
{{0x41, 0x110000, 0x61}, {-3}},
|
||||||
{{0x41, 0x61}, {0}}
|
{{0x41, 0x61}, {0}}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -771,6 +774,7 @@ test_buffer_utf32_conversion (void)
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
|
|
||||||
b = hb_buffer_create ();
|
b = hb_buffer_create ();
|
||||||
|
hb_buffer_set_replacement_codepoint (b, (hb_codepoint_t) -3);
|
||||||
|
|
||||||
for (i = 0; i < G_N_ELEMENTS (utf32_conversion_tests); i++)
|
for (i = 0; i < G_N_ELEMENTS (utf32_conversion_tests); i++)
|
||||||
{
|
{
|
||||||
|
@ -785,7 +789,7 @@ test_buffer_utf32_conversion (void)
|
||||||
for (chars = 0; test->codepoints[chars]; chars++)
|
for (chars = 0; test->codepoints[chars]; chars++)
|
||||||
;
|
;
|
||||||
|
|
||||||
hb_buffer_reset (b);
|
hb_buffer_clear_contents (b);
|
||||||
hb_buffer_add_utf32 (b, test->utf32, u_len, 1, u_len - 2);
|
hb_buffer_add_utf32 (b, test->utf32, u_len, 1, u_len - 2);
|
||||||
|
|
||||||
glyphs = hb_buffer_get_glyph_infos (b, &len);
|
glyphs = hb_buffer_get_glyph_infos (b, &len);
|
||||||
|
|
Loading…
Reference in New Issue