Backport from dev branch: utf8codepoint() should always advance pointer.

2011-06-01 03:13:09 -04:00 · 2011-06-01 03:13:09 -04:00 · 4043d09295
commit 4043d09295
parent 76ffb5dde1
1 changed file with 34 additions and 29 deletions
--- a/physfs_unicode.c
+++ b/physfs_unicode.c
@@ -63,12 +63,13 @@ static PHYSFS_uint32 utf8codepoint(const char **_str)

    else if (octet < 224)  /* two octets */
    {
+        (*_str)++;  /* advance at least one byte in case of an error */
        octet -= (128+64);
        octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
        if ((octet2 & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
            return UNICODE_BOGUS_CHAR_VALUE;

-        *_str += 2;  /* skip to next possible start of codepoint. */
+        *_str += 1;  /* skip to next possible start of codepoint. */
        retval = ((octet << 6) | (octet2 - 128));
        if ((retval >= 0x80) && (retval <= 0x7FF))
            return retval;
@@ -76,6 +77,7 @@ static PHYSFS_uint32 utf8codepoint(const char **_str)

    else if (octet < 240)  /* three octets */
    {
+        (*_str)++;  // advance at least one byte in case of an error
        octet -= (128+64+32);
        octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
        if ((octet2 & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
@@ -85,7 +87,7 @@ static PHYSFS_uint32 utf8codepoint(const char **_str)
        if ((octet3 & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
            return UNICODE_BOGUS_CHAR_VALUE;

-        *_str += 3;  /* skip to next possible start of codepoint. */
+        *_str += 2;  /* skip to next possible start of codepoint. */
        retval = ( ((octet << 12)) | ((octet2-128) << 6) | ((octet3-128)) );

        /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */
@@ -108,6 +110,7 @@ static PHYSFS_uint32 utf8codepoint(const char **_str)

    else if (octet < 248)  /* four octets */
    {
+        (*_str)++;  // advance at least one byte in case of an error
        octet -= (128+64+32+16);
        octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
        if ((octet2 & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
@@ -121,7 +124,7 @@ static PHYSFS_uint32 utf8codepoint(const char **_str)
        if ((octet4 & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
            return UNICODE_BOGUS_CHAR_VALUE;

-        *_str += 4;  /* skip to next possible start of codepoint. */
+        *_str += 3;  /* skip to next possible start of codepoint. */
        retval = ( ((octet << 18)) | ((octet2 - 128) << 12) |
                   ((octet3 - 128) << 6) | ((octet4 - 128)) );
        if ((retval >= 0x10000) && (retval <= 0x10FFFF))
@@ -136,6 +139,34 @@ static PHYSFS_uint32 utf8codepoint(const char **_str)

    else if (octet < 252)  /* five octets */
    {
+        (*_str)++;  // advance at least one byte in case of an error
+        octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
+        if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
+        if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
+        if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
+        if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        *_str += 4;  /* skip to next possible start of codepoint. */
+        return UNICODE_BOGUS_CHAR_VALUE;
+    } /* else if */
+
+    else  /* six octets */
+    {
+        (*_str)++;  // advance at least one byte in case of an error
+        octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
+        if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
+            return UNICODE_BOGUS_CHAR_VALUE;
+
        octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
        if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
            return UNICODE_BOGUS_CHAR_VALUE;
@@ -156,32 +187,6 @@ static PHYSFS_uint32 utf8codepoint(const char **_str)
        return UNICODE_BOGUS_CHAR_VALUE;
    } /* else if */

-    else  /* six octets */
-    {
-        octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
-        if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
-        if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
-        if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
-        if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
-        if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        *_str += 6;  /* skip to next possible start of codepoint. */
-        return UNICODE_BOGUS_CHAR_VALUE;
-    } /* else if */
-
    return UNICODE_BOGUS_CHAR_VALUE;
 } /* utf8codepoint */