utf8: big improvements to case-insensitive UTF-8 string compare.
- Dramatically reduce RAM usage: uses between 8 and 11 kilobytes less static memory for its internal case-folding tables.
- Actually works now. It would fail unconditionally if a codepoint folded into multiple codepoints, even if the compared string contained those exact codepoints.
- Now a public API!
- Removed __PHYSFS_utf8strnicmp(): nothing was using it, it was incorrect anyhow, and what does 'n' represent when either string might case-fold to something larger in-flight, anyhow?
This commit is contained in:
parent
587ec88a0d
commit
d1f2637ca8
|
@ -3,6 +3,11 @@
|
||||||
use warnings;
|
use warnings;
|
||||||
use strict;
|
use strict;
|
||||||
|
|
||||||
|
my $HASHBUCKETS1_16 = 256;
|
||||||
|
my $HASHBUCKETS1_32 = 16;
|
||||||
|
my $HASHBUCKETS2_16 = 16;
|
||||||
|
my $HASHBUCKETS3_16 = 4;
|
||||||
|
|
||||||
print <<__EOF__;
|
print <<__EOF__;
|
||||||
/*
|
/*
|
||||||
* This file is part of PhysicsFS (https://icculus.org/physfs/)
|
* This file is part of PhysicsFS (https://icculus.org/physfs/)
|
||||||
|
@ -13,17 +18,97 @@ print <<__EOF__;
|
||||||
* Please see the file LICENSE.txt in the source's root directory.
|
* Please see the file LICENSE.txt in the source's root directory.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#ifndef _INCLUDE_PHYSFS_CASEFOLDING_H_
|
||||||
|
#define _INCLUDE_PHYSFS_CASEFOLDING_H_
|
||||||
|
|
||||||
#ifndef __PHYSICSFS_INTERNAL__
|
#ifndef __PHYSICSFS_INTERNAL__
|
||||||
#error Do not include this header from your applications.
|
#error Do not include this header from your applications.
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* We build three simple hashmaps here: one that maps Unicode codepoints to
|
||||||
|
one, two, or three lowercase codepoints. To retrieve this info: look at
|
||||||
|
case_fold_hashX_16, where X is 1, 2, or 3. Most foldable codepoints fold to one,
|
||||||
|
a few dozen fold to two, and a handful fold to three. If the codepoint isn't
|
||||||
|
in any of these hashes, it doesn't fold (no separate upper and lowercase).
|
||||||
|
|
||||||
|
Almost all these codepoints fit into 16 bits, so we hash them as such to save
|
||||||
|
memory. If a codepoint is > 0xFFFF, we have separate hashes for them,
|
||||||
|
since there are (currently) only about 120 of them and (currently) all of them
|
||||||
|
map to a single lowercase codepoint. */
|
||||||
|
|
||||||
|
typedef struct CaseFoldMapping1_32
|
||||||
|
{
|
||||||
|
PHYSFS_uint32 from;
|
||||||
|
PHYSFS_uint32 to0;
|
||||||
|
} CaseFoldMapping1_32;
|
||||||
|
|
||||||
|
typedef struct CaseFoldMapping1_16
|
||||||
|
{
|
||||||
|
PHYSFS_uint16 from;
|
||||||
|
PHYSFS_uint16 to0;
|
||||||
|
} CaseFoldMapping1_16;
|
||||||
|
|
||||||
|
typedef struct CaseFoldMapping2_16
|
||||||
|
{
|
||||||
|
PHYSFS_uint16 from;
|
||||||
|
PHYSFS_uint16 to0;
|
||||||
|
PHYSFS_uint16 to1;
|
||||||
|
} CaseFoldMapping2_16;
|
||||||
|
|
||||||
|
typedef struct CaseFoldMapping3_16
|
||||||
|
{
|
||||||
|
PHYSFS_uint16 from;
|
||||||
|
PHYSFS_uint16 to0;
|
||||||
|
PHYSFS_uint16 to1;
|
||||||
|
PHYSFS_uint16 to2;
|
||||||
|
} CaseFoldMapping3_16;
|
||||||
|
|
||||||
|
typedef struct CaseFoldHashBucket1_16
|
||||||
|
{
|
||||||
|
const CaseFoldMapping1_16 *list;
|
||||||
|
const PHYSFS_uint8 count;
|
||||||
|
} CaseFoldHashBucket1_16;
|
||||||
|
|
||||||
|
typedef struct CaseFoldHashBucket1_32
|
||||||
|
{
|
||||||
|
const CaseFoldMapping1_32 *list;
|
||||||
|
const PHYSFS_uint8 count;
|
||||||
|
} CaseFoldHashBucket1_32;
|
||||||
|
|
||||||
|
typedef struct CaseFoldHashBucket2_16
|
||||||
|
{
|
||||||
|
const CaseFoldMapping2_16 *list;
|
||||||
|
const PHYSFS_uint8 count;
|
||||||
|
} CaseFoldHashBucket2_16;
|
||||||
|
|
||||||
|
typedef struct CaseFoldHashBucket3_16
|
||||||
|
{
|
||||||
|
const CaseFoldMapping3_16 *list;
|
||||||
|
const PHYSFS_uint8 count;
|
||||||
|
} CaseFoldHashBucket3_16;
|
||||||
|
|
||||||
__EOF__
|
__EOF__
|
||||||
|
|
||||||
|
|
||||||
my @foldPairs;
|
my @foldPairs1_16;
|
||||||
|
my @foldPairs2_16;
|
||||||
|
my @foldPairs3_16;
|
||||||
|
my @foldPairs1_32;
|
||||||
|
|
||||||
for (my $i = 0; $i < 256; $i++) {
|
for (my $i = 0; $i < $HASHBUCKETS1_16; $i++) {
|
||||||
$foldPairs[$i] = '';
|
$foldPairs1_16[$i] = '';
|
||||||
|
}
|
||||||
|
|
||||||
|
for (my $i = 0; $i < $HASHBUCKETS1_32; $i++) {
|
||||||
|
$foldPairs1_32[$i] = '';
|
||||||
|
}
|
||||||
|
|
||||||
|
for (my $i = 0; $i < $HASHBUCKETS2_16; $i++) {
|
||||||
|
$foldPairs2_16[$i] = '';
|
||||||
|
}
|
||||||
|
|
||||||
|
for (my $i = 0; $i < $HASHBUCKETS3_16; $i++) {
|
||||||
|
$foldPairs3_16[$i] = '';
|
||||||
}
|
}
|
||||||
|
|
||||||
open(FH,'<','casefolding.txt') or die("failed to open casefolding.txt: $!\n");
|
open(FH,'<','casefolding.txt') or die("failed to open casefolding.txt: $!\n");
|
||||||
|
@ -38,47 +123,153 @@ while (<FH>) {
|
||||||
|
|
||||||
next if not /\A([a-fA-F0-9]+)\;\s*(.)\;\s*(.+)\;/;
|
next if not /\A([a-fA-F0-9]+)\;\s*(.)\;\s*(.+)\;/;
|
||||||
my ($code, $status, $mapping) = ($1, $2, $3);
|
my ($code, $status, $mapping) = ($1, $2, $3);
|
||||||
|
|
||||||
my $hexxed = hex($code);
|
my $hexxed = hex($code);
|
||||||
my $hashed = (($hexxed ^ ($hexxed >> 8)) & 0xFF);
|
|
||||||
#print("// code '$code' status '$status' mapping '$mapping'\n");
|
#print("// code '$code' status '$status' mapping '$mapping'\n");
|
||||||
#print("// hexxed '$hexxed' hashed '$hashed'\n");
|
|
||||||
|
|
||||||
if (($status eq 'C') or ($status eq 'F')) {
|
if (($status eq 'C') or ($status eq 'F')) {
|
||||||
my ($map1, $map2, $map3) = ('0000', '0000', '0000');
|
my ($map1, $map2, $map3) = (undef, undef, undef);
|
||||||
$map1 = $1 if $mapping =~ s/\A([a-fA-F0-9]+)(\s*|\Z)//;
|
$map1 = $1 if $mapping =~ s/\A([a-fA-F0-9]+)(\s*|\Z)//;
|
||||||
$map2 = $1 if $mapping =~ s/\A([a-fA-F0-9]+)(\s*|\Z)//;
|
$map2 = $1 if $mapping =~ s/\A([a-fA-F0-9]+)(\s*|\Z)//;
|
||||||
$map3 = $1 if $mapping =~ s/\A([a-fA-F0-9]+)(\s*|\Z)//;
|
$map3 = $1 if $mapping =~ s/\A([a-fA-F0-9]+)(\s*|\Z)//;
|
||||||
die("mapping space too small for '$code'\n") if ($mapping ne '');
|
die("mapping space too small for '$code'\n") if ($mapping ne '');
|
||||||
$foldPairs[$hashed] .= " { 0x$code, 0x$map1, 0x$map2, 0x$map3 },\n";
|
die("problem parsing mapping for '$code'\n") if (not defined($map1));
|
||||||
|
|
||||||
|
if ($hexxed < 128) {
|
||||||
|
# Just ignore these, we'll handle the low-ASCII ones ourselves.
|
||||||
|
} elsif ($hexxed > 0xFFFF) {
|
||||||
|
# We just need to add the 32-bit 2 and/or 3 codepoint maps if this die()'s here.
|
||||||
|
die("Uhoh, a codepoint > 0xFFFF that folds to multiple codepoints! Fixme.") if defined($map2);
|
||||||
|
my $hashed = (($hexxed ^ ($hexxed >> 8)) & ($HASHBUCKETS1_32-1));
|
||||||
|
#print("// hexxed '$hexxed' hashed1 '$hashed'\n");
|
||||||
|
$foldPairs1_32[$hashed] .= " { 0x$code, 0x$map1 },\n";
|
||||||
|
} elsif (not defined($map2)) {
|
||||||
|
my $hashed = (($hexxed ^ ($hexxed >> 8)) & ($HASHBUCKETS1_16-1));
|
||||||
|
#print("// hexxed '$hexxed' hashed1 '$hashed'\n");
|
||||||
|
$foldPairs1_16[$hashed] .= " { 0x$code, 0x$map1 },\n";
|
||||||
|
} elsif (not defined($map3)) {
|
||||||
|
my $hashed = (($hexxed ^ ($hexxed >> 8)) & ($HASHBUCKETS2_16-1));
|
||||||
|
#print("// hexxed '$hexxed' hashed2 '$hashed'\n");
|
||||||
|
$foldPairs2_16[$hashed] .= " { 0x$code, 0x$map1, 0x$map2 },\n";
|
||||||
|
} else {
|
||||||
|
my $hashed = (($hexxed ^ ($hexxed >> 8)) & ($HASHBUCKETS3_16-1));
|
||||||
|
#print("// hexxed '$hexxed' hashed3 '$hashed'\n");
|
||||||
|
$foldPairs3_16[$hashed] .= " { 0x$code, 0x$map1, 0x$map2, 0x$map3 },\n";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
close(FH);
|
close(FH);
|
||||||
|
|
||||||
for (my $i = 0; $i < 256; $i++) {
|
for (my $i = 0; $i < $HASHBUCKETS1_16; $i++) {
|
||||||
$foldPairs[$i] =~ s/,\n\Z//;
|
$foldPairs1_16[$i] =~ s/,\n\Z//;
|
||||||
my $str = $foldPairs[$i];
|
my $str = $foldPairs1_16[$i];
|
||||||
next if $str eq '';
|
next if $str eq '';
|
||||||
my $num = '000' . $i;
|
my $num = '000' . $i;
|
||||||
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
|
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
|
||||||
my $sym = "case_fold_${num}";
|
my $sym = "case_fold1_16_${num}";
|
||||||
print("static const CaseFoldMapping ${sym}[] = {\n$str\n};\n\n");
|
print("static const CaseFoldMapping1_16 ${sym}[] = {\n$str\n};\n\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
print("\nstatic const CaseFoldHashBucket case_fold_hash[256] = {\n");
|
for (my $i = 0; $i < $HASHBUCKETS1_32; $i++) {
|
||||||
|
$foldPairs1_32[$i] =~ s/,\n\Z//;
|
||||||
|
my $str = $foldPairs1_32[$i];
|
||||||
|
next if $str eq '';
|
||||||
|
my $num = '000' . $i;
|
||||||
|
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
|
||||||
|
my $sym = "case_fold1_32_${num}";
|
||||||
|
print("static const CaseFoldMapping1_32 ${sym}[] = {\n$str\n};\n\n");
|
||||||
|
}
|
||||||
|
|
||||||
for (my $i = 0; $i < 256; $i++) {
|
for (my $i = 0; $i < $HASHBUCKETS2_16; $i++) {
|
||||||
my $str = $foldPairs[$i];
|
$foldPairs2_16[$i] =~ s/,\n\Z//;
|
||||||
|
my $str = $foldPairs2_16[$i];
|
||||||
|
next if $str eq '';
|
||||||
|
my $num = '000' . $i;
|
||||||
|
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
|
||||||
|
my $sym = "case_fold2_16_${num}";
|
||||||
|
print("static const CaseFoldMapping2_16 ${sym}[] = {\n$str\n};\n\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
for (my $i = 0; $i < $HASHBUCKETS3_16; $i++) {
|
||||||
|
$foldPairs3_16[$i] =~ s/,\n\Z//;
|
||||||
|
my $str = $foldPairs3_16[$i];
|
||||||
|
next if $str eq '';
|
||||||
|
my $num = '000' . $i;
|
||||||
|
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
|
||||||
|
my $sym = "case_fold3_16_${num}";
|
||||||
|
print("static const CaseFoldMapping3_16 ${sym}[] = {\n$str\n};\n\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
print("static const CaseFoldHashBucket1_16 case_fold_hash1_16[] = {\n");
|
||||||
|
|
||||||
|
for (my $i = 0; $i < $HASHBUCKETS1_16; $i++) {
|
||||||
|
my $str = $foldPairs1_16[$i];
|
||||||
if ($str eq '') {
|
if ($str eq '') {
|
||||||
print(" { 0, NULL },\n");
|
print(" { NULL, 0 },\n");
|
||||||
} else {
|
} else {
|
||||||
my $num = '000' . $i;
|
my $num = '000' . $i;
|
||||||
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
|
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
|
||||||
my $sym = "case_fold_${num}";
|
my $sym = "case_fold1_16_${num}";
|
||||||
print(" { __PHYSFS_ARRAYLEN($sym), $sym },\n");
|
print(" { $sym, __PHYSFS_ARRAYLEN($sym) },\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
print("};\n\n");
|
print("};\n\n");
|
||||||
|
|
||||||
|
|
||||||
|
print("static const CaseFoldHashBucket1_32 case_fold_hash1_32[] = {\n");
|
||||||
|
|
||||||
|
for (my $i = 0; $i < $HASHBUCKETS1_32; $i++) {
|
||||||
|
my $str = $foldPairs1_32[$i];
|
||||||
|
if ($str eq '') {
|
||||||
|
print(" { NULL, 0 },\n");
|
||||||
|
} else {
|
||||||
|
my $num = '000' . $i;
|
||||||
|
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
|
||||||
|
my $sym = "case_fold1_32_${num}";
|
||||||
|
print(" { $sym, __PHYSFS_ARRAYLEN($sym) },\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
print("};\n\n");
|
||||||
|
|
||||||
|
|
||||||
|
print("static const CaseFoldHashBucket2_16 case_fold_hash2_16[] = {\n");
|
||||||
|
|
||||||
|
for (my $i = 0; $i < $HASHBUCKETS2_16; $i++) {
|
||||||
|
my $str = $foldPairs2_16[$i];
|
||||||
|
if ($str eq '') {
|
||||||
|
print(" { NULL, 0 },\n");
|
||||||
|
} else {
|
||||||
|
my $num = '000' . $i;
|
||||||
|
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
|
||||||
|
my $sym = "case_fold2_16_${num}";
|
||||||
|
print(" { $sym, __PHYSFS_ARRAYLEN($sym) },\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
print("};\n\n");
|
||||||
|
|
||||||
|
print("static const CaseFoldHashBucket3_16 case_fold_hash3_16[] = {\n");
|
||||||
|
|
||||||
|
for (my $i = 0; $i < $HASHBUCKETS3_16; $i++) {
|
||||||
|
my $str = $foldPairs3_16[$i];
|
||||||
|
if ($str eq '') {
|
||||||
|
print(" { NULL, 0 },\n");
|
||||||
|
} else {
|
||||||
|
my $num = '000' . $i;
|
||||||
|
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
|
||||||
|
my $sym = "case_fold3_16_${num}";
|
||||||
|
print(" { $sym, __PHYSFS_ARRAYLEN($sym) },\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
print("};\n\n");
|
||||||
|
|
||||||
|
print <<__EOF__;
|
||||||
|
|
||||||
|
#endif /* _INCLUDE_PHYSFS_CASEFOLDING_H_ */
|
||||||
|
|
||||||
|
/* end of physfs_casefolding.h ... */
|
||||||
|
|
||||||
|
__EOF__
|
||||||
|
|
||||||
exit 0;
|
exit 0;
|
||||||
|
|
||||||
# end of makecasefoldhashtable.pl ...
|
# end of makecasefoldhashtable.pl ...
|
||||||
|
|
10
src/physfs.c
10
src/physfs.c
|
@ -891,14 +891,14 @@ static DirHandle *openDirectory(PHYSFS_Io *io, const char *d, int forWriting)
|
||||||
/* Look for archivers with matching file extensions first... */
|
/* Look for archivers with matching file extensions first... */
|
||||||
for (i = archivers; (*i != NULL) && (retval == NULL); i++)
|
for (i = archivers; (*i != NULL) && (retval == NULL); i++)
|
||||||
{
|
{
|
||||||
if (__PHYSFS_utf8stricmp(ext, (*i)->info.extension) == 0)
|
if (PHYSFS_utf8stricmp(ext, (*i)->info.extension) == 0)
|
||||||
retval = tryOpenDir(io, *i, d, forWriting);
|
retval = tryOpenDir(io, *i, d, forWriting);
|
||||||
} /* for */
|
} /* for */
|
||||||
|
|
||||||
/* failing an exact file extension match, try all the others... */
|
/* failing an exact file extension match, try all the others... */
|
||||||
for (i = archivers; (*i != NULL) && (retval == NULL); i++)
|
for (i = archivers; (*i != NULL) && (retval == NULL); i++)
|
||||||
{
|
{
|
||||||
if (__PHYSFS_utf8stricmp(ext, (*i)->info.extension) != 0)
|
if (PHYSFS_utf8stricmp(ext, (*i)->info.extension) != 0)
|
||||||
retval = tryOpenDir(io, *i, d, forWriting);
|
retval = tryOpenDir(io, *i, d, forWriting);
|
||||||
} /* for */
|
} /* for */
|
||||||
} /* if */
|
} /* if */
|
||||||
|
@ -1442,7 +1442,7 @@ static int doRegisterArchiver(const PHYSFS_Archiver *_archiver)
|
||||||
ext = _archiver->info.extension;
|
ext = _archiver->info.extension;
|
||||||
for (i = 0; i < numArchivers; i++)
|
for (i = 0; i < numArchivers; i++)
|
||||||
{
|
{
|
||||||
if (__PHYSFS_utf8stricmp(archiveInfo[i]->extension, ext) == 0)
|
if (PHYSFS_utf8stricmp(archiveInfo[i]->extension, ext) == 0)
|
||||||
BAIL(PHYSFS_ERR_DUPLICATE, 0);
|
BAIL(PHYSFS_ERR_DUPLICATE, 0);
|
||||||
} /* for */
|
} /* for */
|
||||||
|
|
||||||
|
@ -1518,7 +1518,7 @@ int PHYSFS_deregisterArchiver(const char *ext)
|
||||||
__PHYSFS_platformGrabMutex(stateLock);
|
__PHYSFS_platformGrabMutex(stateLock);
|
||||||
for (i = 0; i < numArchivers; i++)
|
for (i = 0; i < numArchivers; i++)
|
||||||
{
|
{
|
||||||
if (__PHYSFS_utf8stricmp(archiveInfo[i]->extension, ext) == 0)
|
if (PHYSFS_utf8stricmp(archiveInfo[i]->extension, ext) == 0)
|
||||||
{
|
{
|
||||||
const int retval = doDeregisterArchiver(i);
|
const int retval = doDeregisterArchiver(i);
|
||||||
__PHYSFS_platformReleaseMutex(stateLock);
|
__PHYSFS_platformReleaseMutex(stateLock);
|
||||||
|
@ -1921,7 +1921,7 @@ int PHYSFS_setSaneConfig(const char *organization, const char *appName,
|
||||||
if ((l > extlen) && ((*i)[l - extlen - 1] == '.'))
|
if ((l > extlen) && ((*i)[l - extlen - 1] == '.'))
|
||||||
{
|
{
|
||||||
ext = (*i) + (l - extlen);
|
ext = (*i) + (l - extlen);
|
||||||
if (__PHYSFS_utf8stricmp(ext, archiveExt) == 0)
|
if (PHYSFS_utf8stricmp(ext, archiveExt) == 0)
|
||||||
setSaneCfgAddPath(*i, l, dirsep, archivesFirst);
|
setSaneCfgAddPath(*i, l, dirsep, archivesFirst);
|
||||||
} /* if */
|
} /* if */
|
||||||
} /* for */
|
} /* for */
|
||||||
|
|
24
src/physfs.h
24
src/physfs.h
|
@ -2521,6 +2521,30 @@ PHYSFS_DECL void PHYSFS_utf8FromLatin1(const char *src, char *dst,
|
||||||
|
|
||||||
/* Everything above this line is part of the PhysicsFS 2.0 API. */
|
/* Everything above this line is part of the PhysicsFS 2.0 API. */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \fn int PHYSFS_utf8stricmp(const char *str1, const char *str2)
|
||||||
|
* \brief Case-insensitive compare of two UTF-8 strings.
|
||||||
|
*
|
||||||
|
* This is a strcasecmp/stricmp replacement that expects both strings
|
||||||
|
* to be in UTF-8 encoding. It will do "case folding" to decide if the
|
||||||
|
* Unicode codepoints in the strings match.
|
||||||
|
*
|
||||||
|
* It will report which string is "greater than" the other, but be aware that
|
||||||
|
* this doesn't necessarily mean anything: 'a' may be "less than" 'b', but
|
||||||
|
* a Japanese kuten has no meaningful alphabetical relationship to
|
||||||
|
* a Greek lambda, but being able to assign a reliable "value" makes sorting
|
||||||
|
* algorithms possible, if not entirely sane. Most cases should treat the
|
||||||
|
* return value as "equal" or "not equal".
|
||||||
|
*
|
||||||
|
* Like stricmp, this expects both strings to be NUL-terminated.
|
||||||
|
*
|
||||||
|
* \param str1 First string to compare.
|
||||||
|
* \param str2 Second string to compare.
|
||||||
|
* \return -1 if str1 is "less than" str2, 1 if "greater than", 0 if equal.
|
||||||
|
*/
|
||||||
|
PHYSFS_DECL int PHYSFS_utf8stricmp(const char *str1, const char *str2);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* \fn int PHYSFS_unmount(const char *oldDir)
|
* \fn int PHYSFS_unmount(const char *oldDir)
|
||||||
* \brief Remove a directory or archive from the search path.
|
* \brief Remove a directory or archive from the search path.
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -290,27 +290,6 @@ void __PHYSFS_sort(void *entries, size_t max,
|
||||||
((s) < (__PHYSFS_UI64(0xFFFFFFFFFFFFFFFF) >> (64-(sizeof(size_t)*8)))) \
|
((s) < (__PHYSFS_UI64(0xFFFFFFFFFFFFFFFF) >> (64-(sizeof(size_t)*8)))) \
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
* This is a strcasecmp() or stricmp() replacement that expects both strings
|
|
||||||
* to be in UTF-8 encoding. It will do "case folding" to decide if the
|
|
||||||
* Unicode codepoints in the strings match.
|
|
||||||
*
|
|
||||||
* It will report which string is "greater than" the other, but be aware that
|
|
||||||
* this doesn't necessarily mean anything: 'a' may be "less than" 'b', but
|
|
||||||
* a random Kanji codepoint has no meaningful alphabetically relationship to
|
|
||||||
* a Greek Lambda, but being able to assign a reliable "value" makes sorting
|
|
||||||
* algorithms possible, if not entirely sane. Most cases should treat the
|
|
||||||
* return value as "equal" or "not equal".
|
|
||||||
*/
|
|
||||||
int __PHYSFS_utf8stricmp(const char *s1, const char *s2);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* This works like __PHYSFS_utf8stricmp(), but takes a character (NOT BYTE
|
|
||||||
* COUNT) argument, like strcasencmp().
|
|
||||||
*/
|
|
||||||
int __PHYSFS_utf8strnicmp(const char *s1, const char *s2, PHYSFS_uint32 l);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* stricmp() that guarantees to only work with low ASCII. The C runtime
|
* stricmp() that guarantees to only work with low ASCII. The C runtime
|
||||||
* stricmp() might try to apply a locale/codepage/etc, which we don't want.
|
* stricmp() might try to apply a locale/codepage/etc, which we don't want.
|
||||||
|
|
|
@ -206,7 +206,7 @@ static char *cvtPathToCorrectCase(char *buf)
|
||||||
cmp = __PHYSFS_stricmpASCII(utf8, fname);
|
cmp = __PHYSFS_stricmpASCII(utf8, fname);
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
cmp = __PHYSFS_utf8stricmp(utf8, fname);
|
cmp = PHYSFS_utf8stricmp(utf8, fname);
|
||||||
allocator.Free(utf8);
|
allocator.Free(utf8);
|
||||||
} /* else */
|
} /* else */
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
#define __PHYSICSFS_INTERNAL__
|
#define __PHYSICSFS_INTERNAL__
|
||||||
#include "physfs_internal.h"
|
#include "physfs_internal.h"
|
||||||
|
|
||||||
|
#include "physfs_casefolding.h"
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* From rfc3629, the UTF-8 spec:
|
* From rfc3629, the UTF-8 spec:
|
||||||
|
@ -402,112 +404,134 @@ void PHYSFS_utf8FromUtf16(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len
|
||||||
} /* PHYSFS_utf8FromUtf16 */
|
} /* PHYSFS_utf8FromUtf16 */
|
||||||
|
|
||||||
|
|
||||||
typedef struct CaseFoldMapping
|
/* (to) should point to at least 3 PHYSFS_uint32 slots. */
|
||||||
|
static int locate_casefold_mapping(const PHYSFS_uint32 from, PHYSFS_uint32 *to)
|
||||||
{
|
{
|
||||||
PHYSFS_uint32 from;
|
int i;
|
||||||
PHYSFS_uint32 to0;
|
|
||||||
PHYSFS_uint32 to1;
|
|
||||||
PHYSFS_uint32 to2;
|
|
||||||
} CaseFoldMapping;
|
|
||||||
|
|
||||||
typedef struct CaseFoldHashBucket
|
if (from < 128) /* low-ASCII, easy! */
|
||||||
{
|
|
||||||
const PHYSFS_uint8 count;
|
|
||||||
const CaseFoldMapping *list;
|
|
||||||
} CaseFoldHashBucket;
|
|
||||||
|
|
||||||
#include "physfs_casefolding.h"
|
|
||||||
|
|
||||||
static void locate_case_fold_mapping(const PHYSFS_uint32 from,
|
|
||||||
PHYSFS_uint32 *to)
|
|
||||||
{
|
|
||||||
PHYSFS_uint32 i;
|
|
||||||
const PHYSFS_uint8 hashed = ((from ^ (from >> 8)) & 0xFF);
|
|
||||||
const CaseFoldHashBucket *bucket = &case_fold_hash[hashed];
|
|
||||||
const CaseFoldMapping *mapping = bucket->list;
|
|
||||||
|
|
||||||
for (i = 0; i < bucket->count; i++, mapping++)
|
|
||||||
{
|
{
|
||||||
if (mapping->from == from)
|
if ((from >= 'A') && (from <= 'Z'))
|
||||||
|
*to = from - ('A' - 'a');
|
||||||
|
else
|
||||||
|
*to = from;
|
||||||
|
return 1;
|
||||||
|
} /* if */
|
||||||
|
|
||||||
|
else if (from <= 0xFFFF)
|
||||||
|
{
|
||||||
|
const PHYSFS_uint8 hash = ((from ^ (from >> 8)) & 0xFF);
|
||||||
|
const PHYSFS_uint16 from16 = (PHYSFS_uint16) from;
|
||||||
|
|
||||||
{
|
{
|
||||||
to[0] = mapping->to0;
|
const CaseFoldHashBucket1_16 *bucket = &case_fold_hash1_16[hash];
|
||||||
to[1] = mapping->to1;
|
const int count = (int) bucket->count;
|
||||||
to[2] = mapping->to2;
|
for (i = 0; i < count; i++)
|
||||||
return;
|
{
|
||||||
} /* if */
|
const CaseFoldMapping1_16 *mapping = &bucket->list[i];
|
||||||
} /* for */
|
if (mapping->from == from16)
|
||||||
|
{
|
||||||
|
*to = mapping->to0;
|
||||||
|
return 1;
|
||||||
|
} /* if */
|
||||||
|
} /* for */
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
const CaseFoldHashBucket2_16 *bucket = &case_fold_hash2_16[hash & 15];
|
||||||
|
const int count = (int) bucket->count;
|
||||||
|
for (i = 0; i < count; i++)
|
||||||
|
{
|
||||||
|
const CaseFoldMapping2_16 *mapping = &bucket->list[i];
|
||||||
|
if (mapping->from == from16)
|
||||||
|
{
|
||||||
|
to[0] = mapping->to0;
|
||||||
|
to[1] = mapping->to1;
|
||||||
|
return 2;
|
||||||
|
} /* if */
|
||||||
|
} /* for */
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
const CaseFoldHashBucket3_16 *bucket = &case_fold_hash3_16[hash & 3];
|
||||||
|
const int count = (int) bucket->count;
|
||||||
|
for (i = 0; i < count; i++)
|
||||||
|
{
|
||||||
|
const CaseFoldMapping3_16 *mapping = &bucket->list[i];
|
||||||
|
if (mapping->from == from16)
|
||||||
|
{
|
||||||
|
to[0] = mapping->to0;
|
||||||
|
to[1] = mapping->to1;
|
||||||
|
to[2] = mapping->to2;
|
||||||
|
return 3;
|
||||||
|
} /* if */
|
||||||
|
} /* for */
|
||||||
|
}
|
||||||
|
} /* else if */
|
||||||
|
|
||||||
|
else /* codepoint that doesn't fit in 16 bits. */
|
||||||
|
{
|
||||||
|
const PHYSFS_uint8 hash = ((from ^ (from >> 8)) & 0xFF);
|
||||||
|
const CaseFoldHashBucket1_32 *bucket = &case_fold_hash1_32[hash & 15];
|
||||||
|
const int count = (int) bucket->count;
|
||||||
|
for (i = 0; i < count; i++)
|
||||||
|
{
|
||||||
|
const CaseFoldMapping1_32 *mapping = &bucket->list[i];
|
||||||
|
if (mapping->from == from)
|
||||||
|
{
|
||||||
|
*to = mapping->to0;
|
||||||
|
return 1;
|
||||||
|
} /* if */
|
||||||
|
} /* for */
|
||||||
|
} /* else */
|
||||||
|
|
||||||
|
|
||||||
/* Not found...there's no remapping for this codepoint. */
|
/* Not found...there's no remapping for this codepoint. */
|
||||||
to[0] = from;
|
*to = from;
|
||||||
to[1] = 0;
|
return 1;
|
||||||
to[2] = 0;
|
} /* locate_casefold_mapping */
|
||||||
} /* locate_case_fold_mapping */
|
|
||||||
|
|
||||||
|
|
||||||
/* !!! FIXME-3.0: this doesn't actually work (for example, it folds the German Eszett
|
int PHYSFS_utf8stricmp(const char *str1, const char *str2)
|
||||||
into 's' 's', but if you have two 'S' chars in a row, it'll fail on the first one,
|
|
||||||
since it'll fold into a single 's'. This needs to be able to lurch along with a
|
|
||||||
variable number of codepoints at a time. */
|
|
||||||
static int utf8codepointcmp(const PHYSFS_uint32 cp1, const PHYSFS_uint32 cp2)
|
|
||||||
{
|
{
|
||||||
PHYSFS_uint32 folded1[3], folded2[3];
|
PHYSFS_uint32 folded1[3], folded2[3];
|
||||||
|
int head1 = 0;
|
||||||
|
int tail1 = 0;
|
||||||
|
int head2 = 0;
|
||||||
|
int tail2 = 0;
|
||||||
|
|
||||||
if (cp1 == cp2)
|
|
||||||
return 0; /* obviously matches. */
|
|
||||||
|
|
||||||
locate_case_fold_mapping(cp1, folded1);
|
|
||||||
locate_case_fold_mapping(cp2, folded2);
|
|
||||||
|
|
||||||
if (folded1[0] < folded2[0])
|
|
||||||
return -1;
|
|
||||||
else if (folded1[0] > folded2[0])
|
|
||||||
return 1;
|
|
||||||
else if (folded1[1] < folded2[1])
|
|
||||||
return -1;
|
|
||||||
else if (folded1[1] > folded2[1])
|
|
||||||
return 1;
|
|
||||||
else if (folded1[2] < folded2[2])
|
|
||||||
return -1;
|
|
||||||
else if (folded1[2] > folded2[2])
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
return 0; /* complete match. */
|
|
||||||
} /* utf8codepointcmp */
|
|
||||||
|
|
||||||
|
|
||||||
int __PHYSFS_utf8stricmp(const char *str1, const char *str2)
|
|
||||||
{
|
|
||||||
while (1)
|
while (1)
|
||||||
{
|
{
|
||||||
const PHYSFS_uint32 cp1 = utf8codepoint(&str1);
|
PHYSFS_uint32 cp1, cp2;
|
||||||
const PHYSFS_uint32 cp2 = utf8codepoint(&str2);
|
|
||||||
const int rc = utf8codepointcmp(cp1, cp2);
|
if (head1 != tail1)
|
||||||
if (rc != 0)
|
cp1 = folded1[tail1++];
|
||||||
return rc;
|
else
|
||||||
|
{
|
||||||
|
head1 = locate_casefold_mapping(utf8codepoint(&str1), folded1);
|
||||||
|
cp1 = folded1[0];
|
||||||
|
tail1 = 1;
|
||||||
|
} /* else */
|
||||||
|
|
||||||
|
if (head2 != tail2)
|
||||||
|
cp2 = folded2[tail2++];
|
||||||
|
else
|
||||||
|
{
|
||||||
|
head2 = locate_casefold_mapping(utf8codepoint(&str2), folded2);
|
||||||
|
cp2 = folded2[0];
|
||||||
|
tail2 = 1;
|
||||||
|
} /* else */
|
||||||
|
|
||||||
|
if (cp1 < cp2)
|
||||||
|
return -1;
|
||||||
|
else if (cp1 > cp2)
|
||||||
|
return 1;
|
||||||
else if (cp1 == 0)
|
else if (cp1 == 0)
|
||||||
break; /* complete match. */
|
break; /* complete match. */
|
||||||
} /* while */
|
} /* while */
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
} /* __PHYSFS_utf8stricmp */
|
} /* PHYSFS_utf8stricmp */
|
||||||
|
|
||||||
|
|
||||||
int __PHYSFS_utf8strnicmp(const char *str1, const char *str2, PHYSFS_uint32 n)
|
|
||||||
{
|
|
||||||
while (n > 0)
|
|
||||||
{
|
|
||||||
const PHYSFS_uint32 cp1 = utf8codepoint(&str1);
|
|
||||||
const PHYSFS_uint32 cp2 = utf8codepoint(&str2);
|
|
||||||
const int rc = utf8codepointcmp(cp1, cp2);
|
|
||||||
if (rc != 0)
|
|
||||||
return rc;
|
|
||||||
else if (cp1 == 0)
|
|
||||||
return 0;
|
|
||||||
n--;
|
|
||||||
} /* while */
|
|
||||||
|
|
||||||
return 0; /* matched to n chars. */
|
|
||||||
} /* __PHYSFS_utf8strnicmp */
|
|
||||||
|
|
||||||
|
|
||||||
int __PHYSFS_stricmpASCII(const char *str1, const char *str2)
|
int __PHYSFS_stricmpASCII(const char *str1, const char *str2)
|
||||||
|
|
Loading…
Reference in New Issue