From 4043d092950a74ddfe3e1c8328803ea9e8eec3dc Mon Sep 17 00:00:00 2001 From: "Ryan C. Gordon" Date: Wed, 1 Jun 2011 03:13:09 -0400 Subject: [PATCH] Backport from dev branch: utf8codepoint() should always advance pointer. --- physfs_unicode.c | 63 ++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 29 deletions(-) diff --git a/physfs_unicode.c b/physfs_unicode.c index d9cc73f..15b6c03 100644 --- a/physfs_unicode.c +++ b/physfs_unicode.c @@ -63,12 +63,13 @@ static PHYSFS_uint32 utf8codepoint(const char **_str) else if (octet < 224) /* two octets */ { + (*_str)++; /* advance at least one byte in case of an error */ octet -= (128+64); octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */ return UNICODE_BOGUS_CHAR_VALUE; - *_str += 2; /* skip to next possible start of codepoint. */ + *_str += 1; /* skip to next possible start of codepoint. */ retval = ((octet << 6) | (octet2 - 128)); if ((retval >= 0x80) && (retval <= 0x7FF)) return retval; @@ -76,6 +77,7 @@ static PHYSFS_uint32 utf8codepoint(const char **_str) else if (octet < 240) /* three octets */ { + (*_str)++; // advance at least one byte in case of an error octet -= (128+64+32); octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */ @@ -85,7 +87,7 @@ static PHYSFS_uint32 utf8codepoint(const char **_str) if ((octet3 & (128+64)) != 128) /* Format isn't 10xxxxxx? */ return UNICODE_BOGUS_CHAR_VALUE; - *_str += 3; /* skip to next possible start of codepoint. */ + *_str += 2; /* skip to next possible start of codepoint. */ retval = ( ((octet << 12)) | ((octet2-128) << 6) | ((octet3-128)) ); /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */ @@ -108,6 +110,7 @@ static PHYSFS_uint32 utf8codepoint(const char **_str) else if (octet < 248) /* four octets */ { + (*_str)++; // advance at least one byte in case of an error octet -= (128+64+32+16); octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */ @@ -121,7 +124,7 @@ static PHYSFS_uint32 utf8codepoint(const char **_str) if ((octet4 & (128+64)) != 128) /* Format isn't 10xxxxxx? */ return UNICODE_BOGUS_CHAR_VALUE; - *_str += 4; /* skip to next possible start of codepoint. */ + *_str += 3; /* skip to next possible start of codepoint. */ retval = ( ((octet << 18)) | ((octet2 - 128) << 12) | ((octet3 - 128) << 6) | ((octet4 - 128)) ); if ((retval >= 0x10000) && (retval <= 0x10FFFF)) @@ -136,6 +139,34 @@ static PHYSFS_uint32 utf8codepoint(const char **_str) else if (octet < 252) /* five octets */ { + (*_str)++; // advance at least one byte in case of an error + octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); + if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ + return UNICODE_BOGUS_CHAR_VALUE; + + octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); + if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ + return UNICODE_BOGUS_CHAR_VALUE; + + octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); + if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ + return UNICODE_BOGUS_CHAR_VALUE; + + octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); + if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ + return UNICODE_BOGUS_CHAR_VALUE; + + *_str += 4; /* skip to next possible start of codepoint. */ + return UNICODE_BOGUS_CHAR_VALUE; + } /* else if */ + + else /* six octets */ + { + (*_str)++; // advance at least one byte in case of an error + octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); + if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ + return UNICODE_BOGUS_CHAR_VALUE; + octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ return UNICODE_BOGUS_CHAR_VALUE; @@ -156,32 +187,6 @@ static PHYSFS_uint32 utf8codepoint(const char **_str) return UNICODE_BOGUS_CHAR_VALUE; } /* else if */ - else /* six octets */ - { - octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); - if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ - return UNICODE_BOGUS_CHAR_VALUE; - - octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); - if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ - return UNICODE_BOGUS_CHAR_VALUE; - - octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); - if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ - return UNICODE_BOGUS_CHAR_VALUE; - - octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); - if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ - return UNICODE_BOGUS_CHAR_VALUE; - - octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); - if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ - return UNICODE_BOGUS_CHAR_VALUE; - - *_str += 6; /* skip to next possible start of codepoint. */ - return UNICODE_BOGUS_CHAR_VALUE; - } /* else if */ - return UNICODE_BOGUS_CHAR_VALUE; } /* utf8codepoint */