utf8: big improvements to case-insensitive UTF-8 string compare.

- Dramatically reduce RAM usage: uses between 8 and 11 kilobytes less static
memory for its internal case-folding tables.
- Actually works now. It would fail unconditionally if a codepoint folded
into multiple codepoints, even if the compared string contained those exact
codepoints.
- Now a public API!
- Removed __PHYSFS_utf8strnicmp(): nothing was using it, it was incorrect
anyhow, and it is unclear what 'n' should represent when either string might
case-fold to something larger in-flight.
This commit is contained in:
Ryan C. Gordon 2017-08-11 01:39:22 -04:00
parent 587ec88a0d
commit d1f2637ca8
7 changed files with 2773 additions and 2336 deletions

View File

@ -3,6 +3,11 @@
use warnings;
use strict;
my $HASHBUCKETS1_16 = 256;
my $HASHBUCKETS1_32 = 16;
my $HASHBUCKETS2_16 = 16;
my $HASHBUCKETS3_16 = 4;
print <<__EOF__;
/*
* This file is part of PhysicsFS (https://icculus.org/physfs/)
@ -13,17 +18,97 @@ print <<__EOF__;
* Please see the file LICENSE.txt in the source's root directory.
*/
#ifndef _INCLUDE_PHYSFS_CASEFOLDING_H_
#define _INCLUDE_PHYSFS_CASEFOLDING_H_
#ifndef __PHYSICSFS_INTERNAL__
#error Do not include this header from your applications.
#endif
/* We build a few simple hashmaps here that map a Unicode codepoint to
one, two, or three lowercase codepoints. To retrieve this info: look at
case_fold_hashX_16, where X is 1, 2, or 3. Most foldable codepoints fold to
one, a few dozen fold to two, and a handful fold to three. If the codepoint
isn't in any of these hashes, it doesn't fold (no separate upper and
lowercase).
Almost all these codepoints fit into 16 bits, so we hash them as such to save
memory. Codepoints > 0xFFFF get a separate hash (case_fold_hash1_32),
since there are (currently) only about 120 of them and (currently) all of them
map to a single lowercase codepoint. */
typedef struct CaseFoldMapping1_32
{
PHYSFS_uint32 from;
PHYSFS_uint32 to0;
} CaseFoldMapping1_32;
typedef struct CaseFoldMapping1_16
{
PHYSFS_uint16 from;
PHYSFS_uint16 to0;
} CaseFoldMapping1_16;
typedef struct CaseFoldMapping2_16
{
PHYSFS_uint16 from;
PHYSFS_uint16 to0;
PHYSFS_uint16 to1;
} CaseFoldMapping2_16;
typedef struct CaseFoldMapping3_16
{
PHYSFS_uint16 from;
PHYSFS_uint16 to0;
PHYSFS_uint16 to1;
PHYSFS_uint16 to2;
} CaseFoldMapping3_16;
typedef struct CaseFoldHashBucket1_16
{
const CaseFoldMapping1_16 *list;
const PHYSFS_uint8 count;
} CaseFoldHashBucket1_16;
typedef struct CaseFoldHashBucket1_32
{
const CaseFoldMapping1_32 *list;
const PHYSFS_uint8 count;
} CaseFoldHashBucket1_32;
typedef struct CaseFoldHashBucket2_16
{
const CaseFoldMapping2_16 *list;
const PHYSFS_uint8 count;
} CaseFoldHashBucket2_16;
typedef struct CaseFoldHashBucket3_16
{
const CaseFoldMapping3_16 *list;
const PHYSFS_uint8 count;
} CaseFoldHashBucket3_16;
__EOF__
my @foldPairs;
my @foldPairs1_16;
my @foldPairs2_16;
my @foldPairs3_16;
my @foldPairs1_32;
for (my $i = 0; $i < 256; $i++) {
$foldPairs[$i] = '';
for (my $i = 0; $i < $HASHBUCKETS1_16; $i++) {
$foldPairs1_16[$i] = '';
}
for (my $i = 0; $i < $HASHBUCKETS1_32; $i++) {
$foldPairs1_32[$i] = '';
}
for (my $i = 0; $i < $HASHBUCKETS2_16; $i++) {
$foldPairs2_16[$i] = '';
}
for (my $i = 0; $i < $HASHBUCKETS3_16; $i++) {
$foldPairs3_16[$i] = '';
}
open(FH,'<','casefolding.txt') or die("failed to open casefolding.txt: $!\n");
@ -38,47 +123,153 @@ while (<FH>) {
next if not /\A([a-fA-F0-9]+)\;\s*(.)\;\s*(.+)\;/;
my ($code, $status, $mapping) = ($1, $2, $3);
my $hexxed = hex($code);
my $hashed = (($hexxed ^ ($hexxed >> 8)) & 0xFF);
#print("// code '$code' status '$status' mapping '$mapping'\n");
#print("// hexxed '$hexxed' hashed '$hashed'\n");
if (($status eq 'C') or ($status eq 'F')) {
my ($map1, $map2, $map3) = ('0000', '0000', '0000');
my ($map1, $map2, $map3) = (undef, undef, undef);
$map1 = $1 if $mapping =~ s/\A([a-fA-F0-9]+)(\s*|\Z)//;
$map2 = $1 if $mapping =~ s/\A([a-fA-F0-9]+)(\s*|\Z)//;
$map3 = $1 if $mapping =~ s/\A([a-fA-F0-9]+)(\s*|\Z)//;
die("mapping space too small for '$code'\n") if ($mapping ne '');
$foldPairs[$hashed] .= " { 0x$code, 0x$map1, 0x$map2, 0x$map3 },\n";
die("problem parsing mapping for '$code'\n") if (not defined($map1));
if ($hexxed < 128) {
# Just ignore these, we'll handle the low-ASCII ones ourselves.
} elsif ($hexxed > 0xFFFF) {
# We just need to add the 32-bit 2 and/or 3 codepoint maps if this die()'s here.
die("Uhoh, a codepoint > 0xFFFF that folds to multiple codepoints! Fixme.") if defined($map2);
my $hashed = (($hexxed ^ ($hexxed >> 8)) & ($HASHBUCKETS1_32-1));
#print("// hexxed '$hexxed' hashed1 '$hashed'\n");
$foldPairs1_32[$hashed] .= " { 0x$code, 0x$map1 },\n";
} elsif (not defined($map2)) {
my $hashed = (($hexxed ^ ($hexxed >> 8)) & ($HASHBUCKETS1_16-1));
#print("// hexxed '$hexxed' hashed1 '$hashed'\n");
$foldPairs1_16[$hashed] .= " { 0x$code, 0x$map1 },\n";
} elsif (not defined($map3)) {
my $hashed = (($hexxed ^ ($hexxed >> 8)) & ($HASHBUCKETS2_16-1));
#print("// hexxed '$hexxed' hashed2 '$hashed'\n");
$foldPairs2_16[$hashed] .= " { 0x$code, 0x$map1, 0x$map2 },\n";
} else {
my $hashed = (($hexxed ^ ($hexxed >> 8)) & ($HASHBUCKETS3_16-1));
#print("// hexxed '$hexxed' hashed3 '$hashed'\n");
$foldPairs3_16[$hashed] .= " { 0x$code, 0x$map1, 0x$map2, 0x$map3 },\n";
}
}
}
close(FH);
for (my $i = 0; $i < 256; $i++) {
$foldPairs[$i] =~ s/,\n\Z//;
my $str = $foldPairs[$i];
for (my $i = 0; $i < $HASHBUCKETS1_16; $i++) {
$foldPairs1_16[$i] =~ s/,\n\Z//;
my $str = $foldPairs1_16[$i];
next if $str eq '';
my $num = '000' . $i;
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
my $sym = "case_fold_${num}";
print("static const CaseFoldMapping ${sym}[] = {\n$str\n};\n\n");
my $sym = "case_fold1_16_${num}";
print("static const CaseFoldMapping1_16 ${sym}[] = {\n$str\n};\n\n");
}
print("\nstatic const CaseFoldHashBucket case_fold_hash[256] = {\n");
for (my $i = 0; $i < $HASHBUCKETS1_32; $i++) {
$foldPairs1_32[$i] =~ s/,\n\Z//;
my $str = $foldPairs1_32[$i];
next if $str eq '';
my $num = '000' . $i;
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
my $sym = "case_fold1_32_${num}";
print("static const CaseFoldMapping1_32 ${sym}[] = {\n$str\n};\n\n");
}
for (my $i = 0; $i < 256; $i++) {
my $str = $foldPairs[$i];
for (my $i = 0; $i < $HASHBUCKETS2_16; $i++) {
$foldPairs2_16[$i] =~ s/,\n\Z//;
my $str = $foldPairs2_16[$i];
next if $str eq '';
my $num = '000' . $i;
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
my $sym = "case_fold2_16_${num}";
print("static const CaseFoldMapping2_16 ${sym}[] = {\n$str\n};\n\n");
}
for (my $i = 0; $i < $HASHBUCKETS3_16; $i++) {
$foldPairs3_16[$i] =~ s/,\n\Z//;
my $str = $foldPairs3_16[$i];
next if $str eq '';
my $num = '000' . $i;
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
my $sym = "case_fold3_16_${num}";
print("static const CaseFoldMapping3_16 ${sym}[] = {\n$str\n};\n\n");
}
print("static const CaseFoldHashBucket1_16 case_fold_hash1_16[] = {\n");
for (my $i = 0; $i < $HASHBUCKETS1_16; $i++) {
my $str = $foldPairs1_16[$i];
if ($str eq '') {
print(" { 0, NULL },\n");
print(" { NULL, 0 },\n");
} else {
my $num = '000' . $i;
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
my $sym = "case_fold_${num}";
print(" { __PHYSFS_ARRAYLEN($sym), $sym },\n");
my $sym = "case_fold1_16_${num}";
print(" { $sym, __PHYSFS_ARRAYLEN($sym) },\n");
}
}
print("};\n\n");
print("static const CaseFoldHashBucket1_32 case_fold_hash1_32[] = {\n");
for (my $i = 0; $i < $HASHBUCKETS1_32; $i++) {
my $str = $foldPairs1_32[$i];
if ($str eq '') {
print(" { NULL, 0 },\n");
} else {
my $num = '000' . $i;
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
my $sym = "case_fold1_32_${num}";
print(" { $sym, __PHYSFS_ARRAYLEN($sym) },\n");
}
}
print("};\n\n");
print("static const CaseFoldHashBucket2_16 case_fold_hash2_16[] = {\n");
for (my $i = 0; $i < $HASHBUCKETS2_16; $i++) {
my $str = $foldPairs2_16[$i];
if ($str eq '') {
print(" { NULL, 0 },\n");
} else {
my $num = '000' . $i;
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
my $sym = "case_fold2_16_${num}";
print(" { $sym, __PHYSFS_ARRAYLEN($sym) },\n");
}
}
print("};\n\n");
print("static const CaseFoldHashBucket3_16 case_fold_hash3_16[] = {\n");
for (my $i = 0; $i < $HASHBUCKETS3_16; $i++) {
my $str = $foldPairs3_16[$i];
if ($str eq '') {
print(" { NULL, 0 },\n");
} else {
my $num = '000' . $i;
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
my $sym = "case_fold3_16_${num}";
print(" { $sym, __PHYSFS_ARRAYLEN($sym) },\n");
}
}
print("};\n\n");
print <<__EOF__;
#endif /* _INCLUDE_PHYSFS_CASEFOLDING_H_ */
/* end of physfs_casefolding.h ... */
__EOF__
exit 0;
# end of makecasefoldhashtable.pl ...

View File

@ -891,14 +891,14 @@ static DirHandle *openDirectory(PHYSFS_Io *io, const char *d, int forWriting)
/* Look for archivers with matching file extensions first... */
for (i = archivers; (*i != NULL) && (retval == NULL); i++)
{
if (__PHYSFS_utf8stricmp(ext, (*i)->info.extension) == 0)
if (PHYSFS_utf8stricmp(ext, (*i)->info.extension) == 0)
retval = tryOpenDir(io, *i, d, forWriting);
} /* for */
/* failing an exact file extension match, try all the others... */
for (i = archivers; (*i != NULL) && (retval == NULL); i++)
{
if (__PHYSFS_utf8stricmp(ext, (*i)->info.extension) != 0)
if (PHYSFS_utf8stricmp(ext, (*i)->info.extension) != 0)
retval = tryOpenDir(io, *i, d, forWriting);
} /* for */
} /* if */
@ -1442,7 +1442,7 @@ static int doRegisterArchiver(const PHYSFS_Archiver *_archiver)
ext = _archiver->info.extension;
for (i = 0; i < numArchivers; i++)
{
if (__PHYSFS_utf8stricmp(archiveInfo[i]->extension, ext) == 0)
if (PHYSFS_utf8stricmp(archiveInfo[i]->extension, ext) == 0)
BAIL(PHYSFS_ERR_DUPLICATE, 0);
} /* for */
@ -1518,7 +1518,7 @@ int PHYSFS_deregisterArchiver(const char *ext)
__PHYSFS_platformGrabMutex(stateLock);
for (i = 0; i < numArchivers; i++)
{
if (__PHYSFS_utf8stricmp(archiveInfo[i]->extension, ext) == 0)
if (PHYSFS_utf8stricmp(archiveInfo[i]->extension, ext) == 0)
{
const int retval = doDeregisterArchiver(i);
__PHYSFS_platformReleaseMutex(stateLock);
@ -1921,7 +1921,7 @@ int PHYSFS_setSaneConfig(const char *organization, const char *appName,
if ((l > extlen) && ((*i)[l - extlen - 1] == '.'))
{
ext = (*i) + (l - extlen);
if (__PHYSFS_utf8stricmp(ext, archiveExt) == 0)
if (PHYSFS_utf8stricmp(ext, archiveExt) == 0)
setSaneCfgAddPath(*i, l, dirsep, archivesFirst);
} /* if */
} /* for */

View File

@ -2521,6 +2521,30 @@ PHYSFS_DECL void PHYSFS_utf8FromLatin1(const char *src, char *dst,
/* Everything above this line is part of the PhysicsFS 2.0 API. */
/**
* \fn int PHYSFS_utf8stricmp(const char *str1, const char *str2)
* \brief Case-insensitive compare of two UTF-8 strings.
*
* This is a strcasecmp/stricmp replacement that expects both strings
* to be in UTF-8 encoding. It will do "case folding" to decide if the
* Unicode codepoints in the strings match.
*
* It will report which string is "greater than" the other, but be aware that
* this doesn't necessarily mean anything: 'a' may be "less than" 'b', but
* a Japanese kuten has no meaningful alphabetical relationship to
* a Greek lambda. Being able to assign a reliable "value" still makes sorting
* algorithms possible, if not entirely sane. Most cases should treat the
* return value as "equal" or "not equal".
*
* Like stricmp, this expects both strings to be null-terminated.
*
* \param str1 First string to compare.
* \param str2 Second string to compare.
* \return -1 if str1 is "less than" str2, 1 if "greater than", 0 if equal.
*/
PHYSFS_DECL int PHYSFS_utf8stricmp(const char *str1, const char *str2);
/**
* \fn int PHYSFS_unmount(const char *oldDir)
* \brief Remove a directory or archive from the search path.

File diff suppressed because it is too large Load Diff

View File

@ -290,27 +290,6 @@ void __PHYSFS_sort(void *entries, size_t max,
((s) < (__PHYSFS_UI64(0xFFFFFFFFFFFFFFFF) >> (64-(sizeof(size_t)*8)))) \
)
/*
* This is a strcasecmp() or stricmp() replacement that expects both strings
* to be in UTF-8 encoding. It will do "case folding" to decide if the
* Unicode codepoints in the strings match.
*
* It will report which string is "greater than" the other, but be aware that
* this doesn't necessarily mean anything: 'a' may be "less than" 'b', but
* a random Kanji codepoint has no meaningful alphabetical relationship to
* a Greek Lambda. Being able to assign a reliable "value" still makes sorting
* algorithms possible, if not entirely sane. Most cases should treat the
* return value as "equal" or "not equal".
*/
int __PHYSFS_utf8stricmp(const char *s1, const char *s2);
/*
* This works like __PHYSFS_utf8stricmp(), but takes a character count (NOT
* BYTE COUNT) argument, like strncasecmp().
*/
int __PHYSFS_utf8strnicmp(const char *s1, const char *s2, PHYSFS_uint32 l);
/*
* stricmp() that guarantees to only work with low ASCII. The C runtime
* stricmp() might try to apply a locale/codepage/etc, which we don't want.

View File

@ -206,7 +206,7 @@ static char *cvtPathToCorrectCase(char *buf)
cmp = __PHYSFS_stricmpASCII(utf8, fname);
else
{
cmp = __PHYSFS_utf8stricmp(utf8, fname);
cmp = PHYSFS_utf8stricmp(utf8, fname);
allocator.Free(utf8);
} /* else */

View File

@ -1,6 +1,8 @@
#define __PHYSICSFS_INTERNAL__
#include "physfs_internal.h"
#include "physfs_casefolding.h"
/*
* From rfc3629, the UTF-8 spec:
@ -402,112 +404,134 @@ void PHYSFS_utf8FromUtf16(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len
} /* PHYSFS_utf8FromUtf16 */
typedef struct CaseFoldMapping
/* (to) should point to at least 3 PHYSFS_uint32 slots. */
static int locate_casefold_mapping(const PHYSFS_uint32 from, PHYSFS_uint32 *to)
{
PHYSFS_uint32 from;
PHYSFS_uint32 to0;
PHYSFS_uint32 to1;
PHYSFS_uint32 to2;
} CaseFoldMapping;
int i;
typedef struct CaseFoldHashBucket
{
const PHYSFS_uint8 count;
const CaseFoldMapping *list;
} CaseFoldHashBucket;
#include "physfs_casefolding.h"
static void locate_case_fold_mapping(const PHYSFS_uint32 from,
PHYSFS_uint32 *to)
{
PHYSFS_uint32 i;
const PHYSFS_uint8 hashed = ((from ^ (from >> 8)) & 0xFF);
const CaseFoldHashBucket *bucket = &case_fold_hash[hashed];
const CaseFoldMapping *mapping = bucket->list;
for (i = 0; i < bucket->count; i++, mapping++)
if (from < 128) /* low-ASCII, easy! */
{
if (mapping->from == from)
if ((from >= 'A') && (from <= 'Z'))
*to = from - ('A' - 'a');
else
*to = from;
return 1;
} /* if */
else if (from <= 0xFFFF)
{
const PHYSFS_uint8 hash = ((from ^ (from >> 8)) & 0xFF);
const PHYSFS_uint16 from16 = (PHYSFS_uint16) from;
{
to[0] = mapping->to0;
to[1] = mapping->to1;
to[2] = mapping->to2;
return;
} /* if */
} /* for */
const CaseFoldHashBucket1_16 *bucket = &case_fold_hash1_16[hash];
const int count = (int) bucket->count;
for (i = 0; i < count; i++)
{
const CaseFoldMapping1_16 *mapping = &bucket->list[i];
if (mapping->from == from16)
{
*to = mapping->to0;
return 1;
} /* if */
} /* for */
}
{
const CaseFoldHashBucket2_16 *bucket = &case_fold_hash2_16[hash & 15];
const int count = (int) bucket->count;
for (i = 0; i < count; i++)
{
const CaseFoldMapping2_16 *mapping = &bucket->list[i];
if (mapping->from == from16)
{
to[0] = mapping->to0;
to[1] = mapping->to1;
return 2;
} /* if */
} /* for */
}
{
const CaseFoldHashBucket3_16 *bucket = &case_fold_hash3_16[hash & 3];
const int count = (int) bucket->count;
for (i = 0; i < count; i++)
{
const CaseFoldMapping3_16 *mapping = &bucket->list[i];
if (mapping->from == from16)
{
to[0] = mapping->to0;
to[1] = mapping->to1;
to[2] = mapping->to2;
return 3;
} /* if */
} /* for */
}
} /* else if */
else /* codepoint that doesn't fit in 16 bits. */
{
const PHYSFS_uint8 hash = ((from ^ (from >> 8)) & 0xFF);
const CaseFoldHashBucket1_32 *bucket = &case_fold_hash1_32[hash & 15];
const int count = (int) bucket->count;
for (i = 0; i < count; i++)
{
const CaseFoldMapping1_32 *mapping = &bucket->list[i];
if (mapping->from == from)
{
*to = mapping->to0;
return 1;
} /* if */
} /* for */
} /* else */
/* Not found...there's no remapping for this codepoint. */
to[0] = from;
to[1] = 0;
to[2] = 0;
} /* locate_case_fold_mapping */
*to = from;
return 1;
} /* locate_casefold_mapping */
/* !!! FIXME-3.0: this doesn't actually work (for example, it folds the German Eszett
into 's' 's', but if you have two 'S' chars in a row, it'll fail on the first one,
since it'll fold into a single 's'). This needs to be able to lurch along with a
variable number of codepoints at a time. */
static int utf8codepointcmp(const PHYSFS_uint32 cp1, const PHYSFS_uint32 cp2)
int PHYSFS_utf8stricmp(const char *str1, const char *str2)
{
PHYSFS_uint32 folded1[3], folded2[3];
int head1 = 0;
int tail1 = 0;
int head2 = 0;
int tail2 = 0;
if (cp1 == cp2)
return 0; /* obviously matches. */
locate_case_fold_mapping(cp1, folded1);
locate_case_fold_mapping(cp2, folded2);
if (folded1[0] < folded2[0])
return -1;
else if (folded1[0] > folded2[0])
return 1;
else if (folded1[1] < folded2[1])
return -1;
else if (folded1[1] > folded2[1])
return 1;
else if (folded1[2] < folded2[2])
return -1;
else if (folded1[2] > folded2[2])
return 1;
return 0; /* complete match. */
} /* utf8codepointcmp */
int __PHYSFS_utf8stricmp(const char *str1, const char *str2)
{
while (1)
{
const PHYSFS_uint32 cp1 = utf8codepoint(&str1);
const PHYSFS_uint32 cp2 = utf8codepoint(&str2);
const int rc = utf8codepointcmp(cp1, cp2);
if (rc != 0)
return rc;
PHYSFS_uint32 cp1, cp2;
if (head1 != tail1)
cp1 = folded1[tail1++];
else
{
head1 = locate_casefold_mapping(utf8codepoint(&str1), folded1);
cp1 = folded1[0];
tail1 = 1;
} /* else */
if (head2 != tail2)
cp2 = folded2[tail2++];
else
{
head2 = locate_casefold_mapping(utf8codepoint(&str2), folded2);
cp2 = folded2[0];
tail2 = 1;
} /* else */
if (cp1 < cp2)
return -1;
else if (cp1 > cp2)
return 1;
else if (cp1 == 0)
break; /* complete match. */
break; /* complete match. */
} /* while */
return 0;
} /* __PHYSFS_utf8stricmp */
int __PHYSFS_utf8strnicmp(const char *str1, const char *str2, PHYSFS_uint32 n)
{
while (n > 0)
{
const PHYSFS_uint32 cp1 = utf8codepoint(&str1);
const PHYSFS_uint32 cp2 = utf8codepoint(&str2);
const int rc = utf8codepointcmp(cp1, cp2);
if (rc != 0)
return rc;
else if (cp1 == 0)
return 0;
n--;
} /* while */
return 0; /* matched to n chars. */
} /* __PHYSFS_utf8strnicmp */
} /* PHYSFS_utf8stricmp */
int __PHYSFS_stricmpASCII(const char *str1, const char *str2)