unicode: Added UTF-16 and UCS-4 stricmp functions.
This commit is contained in:
parent
eb75883226
commit
9f8ecb91cb
48
src/physfs.h
48
src/physfs.h
|
@ -2542,6 +2542,54 @@ PHYSFS_DECL void PHYSFS_utf8FromLatin1(const char *src, char *dst,
|
|||
*/
|
||||
PHYSFS_DECL int PHYSFS_utf8stricmp(const char *str1, const char *str2);
|
||||
|
||||
/**
|
||||
* \fn int PHYSFS_utf16stricmp(const PHYSFS_uint16 *str1, const PHYSFS_uint16 *str2)
|
||||
* \brief Case-insensitive compare of two UTF-16 strings.
|
||||
*
|
||||
* This is a strcasecmp/stricmp replacement that expects both strings
|
||||
* to be in UTF-16 encoding. It will do "case folding" to decide if the
|
||||
* Unicode codepoints in the strings match.
|
||||
*
|
||||
* It will report which string is "greater than" the other, but be aware that
|
||||
* this doesn't necessarily mean anything: 'a' may be "less than" 'b', but
|
||||
* a Japanese kuten has no meaningful alphabetical relationship to
|
||||
* a Greek lambda, but being able to assign a reliable "value" makes sorting
|
||||
* algorithms possible, if not entirely sane. Most cases should treat the
|
||||
* return value as "equal" or "not equal".
|
||||
*
|
||||
* Like stricmp, this expects both strings to be NULL-terminated.
|
||||
*
|
||||
* \param str1 First string to compare.
|
||||
* \param str2 Second string to compare.
|
||||
* \return -1 if str1 is "less than" str2, 1 if "greater than", 0 if equal.
|
||||
*/
|
||||
PHYSFS_DECL int PHYSFS_utf16stricmp(const PHYSFS_uint16 *str1,
|
||||
const PHYSFS_uint16 *str2);
|
||||
|
||||
/**
|
||||
* \fn int PHYSFS_ucs4stricmp(const PHYSFS_uint32 *str1, const PHYSFS_uint32 *str2)
|
||||
* \brief Case-insensitive compare of two UCS-4 strings.
|
||||
*
|
||||
* This is a strcasecmp/stricmp replacement that expects both strings
|
||||
* to be in UCS-4 (aka UTF-32) encoding. It will do "case folding" to decide
|
||||
* if the Unicode codepoints in the strings match.
|
||||
*
|
||||
* It will report which string is "greater than" the other, but be aware that
|
||||
* this doesn't necessarily mean anything: 'a' may be "less than" 'b', but
|
||||
* a Japanese kuten has no meaningful alphabetical relationship to
|
||||
* a Greek lambda, but being able to assign a reliable "value" makes sorting
|
||||
* algorithms possible, if not entirely sane. Most cases should treat the
|
||||
* return value as "equal" or "not equal".
|
||||
*
|
||||
* Like stricmp, this expects both strings to be NULL-terminated.
|
||||
*
|
||||
* \param str1 First string to compare.
|
||||
* \param str2 Second string to compare.
|
||||
* \return -1 if str1 is "less than" str2, 1 if "greater than", 0 if equal.
|
||||
*/
|
||||
PHYSFS_DECL int PHYSFS_ucs4stricmp(const PHYSFS_uint32 *str1,
|
||||
const PHYSFS_uint32 *str2);
|
||||
|
||||
|
||||
/**
|
||||
* \typedef PHYSFS_EnumerateCallback
|
||||
|
|
|
@ -190,6 +190,48 @@ static PHYSFS_uint32 utf8codepoint(const char **_str)
|
|||
return UNICODE_BOGUS_CHAR_VALUE;
|
||||
} /* utf8codepoint */
|
||||
|
||||
static PHYSFS_uint32 utf16codepoint(const PHYSFS_uint16 **_str)
|
||||
{
|
||||
const PHYSFS_uint16 *src = *_str;
|
||||
PHYSFS_uint32 cp = (PHYSFS_uint32) *(src++);
|
||||
|
||||
if (cp == 0) /* null terminator, end of string. */
|
||||
return 0;
|
||||
/* Orphaned second half of surrogate pair? */
|
||||
else if ((cp >= 0xDC00) && (cp <= 0xDFFF))
|
||||
cp = UNICODE_BOGUS_CHAR_CODEPOINT;
|
||||
else if ((cp >= 0xD800) && (cp <= 0xDBFF)) /* start surrogate pair! */
|
||||
{
|
||||
const PHYSFS_uint32 pair = (PHYSFS_uint32) *src;
|
||||
if (pair == 0)
|
||||
cp = UNICODE_BOGUS_CHAR_CODEPOINT;
|
||||
else if ((pair < 0xDC00) || (pair > 0xDFFF))
|
||||
cp = UNICODE_BOGUS_CHAR_CODEPOINT;
|
||||
else
|
||||
{
|
||||
src++; /* eat the other surrogate. */
|
||||
cp = (((cp - 0xD800) << 10) | (pair - 0xDC00));
|
||||
} /* else */
|
||||
} /* else if */
|
||||
|
||||
*_str = src;
|
||||
return cp;
|
||||
} /* utf16codepoint */
|
||||
|
||||
static PHYSFS_uint32 utf32codepoint(const PHYSFS_uint32 **_str)
|
||||
{
|
||||
const PHYSFS_uint32 *src = *_str;
|
||||
PHYSFS_uint32 cp = *(src++);
|
||||
|
||||
if (cp == 0) /* null terminator, end of string. */
|
||||
return 0;
|
||||
else if (cp > 0x10FFF)
|
||||
cp = UNICODE_BOGUS_CHAR_CODEPOINT;
|
||||
|
||||
*_str = src;
|
||||
return cp;
|
||||
} /* utf32codepoint */
|
||||
|
||||
|
||||
void PHYSFS_utf8ToUcs4(const char *src, PHYSFS_uint32 *dst, PHYSFS_uint64 len)
|
||||
{
|
||||
|
@ -378,25 +420,9 @@ void PHYSFS_utf8FromUtf16(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len
|
|||
len--;
|
||||
while (len)
|
||||
{
|
||||
PHYSFS_uint32 cp = (PHYSFS_uint32) *(src++);
|
||||
if (cp == 0)
|
||||
const PHYSFS_uint32 cp = utf16codepoint(&src);
|
||||
if (!cp)
|
||||
break;
|
||||
|
||||
/* Orphaned second half of surrogate pair? */
|
||||
if ((cp >= 0xDC00) && (cp <= 0xDFFF))
|
||||
cp = UNICODE_BOGUS_CHAR_CODEPOINT;
|
||||
else if ((cp >= 0xD800) && (cp <= 0xDBFF)) /* start surrogate pair! */
|
||||
{
|
||||
const PHYSFS_uint32 pair = (PHYSFS_uint32) *src;
|
||||
if ((pair < 0xDC00) || (pair > 0xDFFF))
|
||||
cp = UNICODE_BOGUS_CHAR_CODEPOINT;
|
||||
else
|
||||
{
|
||||
src++; /* eat the other surrogate. */
|
||||
cp = (((cp - 0xD800) << 10) | (pair - 0xDC00));
|
||||
} /* else */
|
||||
} /* else if */
|
||||
|
||||
utf8fromcodepoint(cp, &dst, &len);
|
||||
} /* while */
|
||||
|
||||
|
@ -492,46 +518,51 @@ static int locate_casefold_mapping(const PHYSFS_uint32 from, PHYSFS_uint32 *to)
|
|||
} /* locate_casefold_mapping */
|
||||
|
||||
|
||||
/*
 * Shared function body for the PHYSFS_utf*stricmp() family. Expects the
 *  enclosing function to have parameters named str1 and str2, and pastes
 *  `bits` into utf<bits>codepoint() to pick the decoder.
 *
 * Each decoded codepoint is passed through locate_casefold_mapping(), whose
 *  return value is used as the number of entries it wrote to the buffer;
 *  those entries are then compared one at a time against the other string's.
 *  Returns -1 or 1 at the first difference, 0 if both strings end together.
 *
 * Deliberately ends without a semicolon so it is used as `UTFSTRICMP(n);`.
 */
#define UTFSTRICMP(bits) \
    PHYSFS_uint32 fold1[3], fold2[3]; \
    int len1 = 0, pos1 = 0, len2 = 0, pos2 = 0; \
    for (;;) { \
        PHYSFS_uint32 lhs, rhs; \
        /* buffer drained? decode and fold the next codepoint of str1. */ \
        if (pos1 == len1) { \
            len1 = locate_casefold_mapping(utf##bits##codepoint(&str1), fold1); \
            pos1 = 0; \
        } \
        lhs = fold1[pos1++]; \
        /* same for str2. */ \
        if (pos2 == len2) { \
            len2 = locate_casefold_mapping(utf##bits##codepoint(&str2), fold2); \
            pos2 = 0; \
        } \
        rhs = fold2[pos2++]; \
        if (lhs != rhs) { \
            return (lhs < rhs) ? -1 : 1; \
        } \
        if (lhs == 0) { \
            break;  /* complete match. */ \
        } \
    } \
    return 0
|
||||
|
||||
/*
 * Case-insensitive comparison of two null-terminated UTF-8 strings.
 *
 * The whole body comes from UTFSTRICMP, instantiated with utf8codepoint()
 *  as the decoder: codepoints are case-folded and compared in order.
 *
 *   str1: first string to compare.
 *   str2: second string to compare.
 *
 * Returns -1 if str1 is "less than" str2, 1 if "greater than", 0 if equal.
 */
int PHYSFS_utf8stricmp(const char *str1, const char *str2)
{
    UTFSTRICMP(8);
} /* PHYSFS_utf8stricmp */
|
||||
|
||||
/*
 * Case-insensitive comparison of two null-terminated UTF-16 strings.
 *
 * The whole body comes from UTFSTRICMP, instantiated with utf16codepoint()
 *  as the decoder.
 *
 *   str1: first string to compare.
 *   str2: second string to compare.
 *
 * Returns -1 if str1 is "less than" str2, 1 if "greater than", 0 if equal.
 */
int PHYSFS_utf16stricmp(const PHYSFS_uint16 *str1, const PHYSFS_uint16 *str2)
{
    UTFSTRICMP(16);
} /* PHYSFS_utf16stricmp */
|
||||
|
||||
/*
 * Case-insensitive comparison of two null-terminated UCS-4 (UTF-32) strings.
 *
 * The whole body comes from UTFSTRICMP, instantiated with utf32codepoint()
 *  as the decoder.
 *
 *   str1: first string to compare.
 *   str2: second string to compare.
 *
 * Returns -1 if str1 is "less than" str2, 1 if "greater than", 0 if equal.
 */
int PHYSFS_ucs4stricmp(const PHYSFS_uint32 *str1, const PHYSFS_uint32 *str2)
{
    UTFSTRICMP(32);
} /* PHYSFS_ucs4stricmp */
|
||||
|
||||
#undef UTFSTRICMP
|
||||
|
||||
/* end of physfs_unicode.c ... */
|
||||
|
||||
|
|
Loading…
Reference in New Issue