Add psl2c --binary to create DAFSA binary file from PSL
This commit is contained in:
parent
e63ff8abfc
commit
82e9445493
|
@ -430,12 +430,22 @@ def to_cxx(data):
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
def words_to_cxx(words):
|
def words_to_whatever(words, converter):
|
||||||
"""Generates C++ code from a word list"""
|
"""Builds a DAFSA from a word list and returns converter(encoded DAFSA)"""
|
||||||
dafsa = to_dafsa(words)
|
dafsa = to_dafsa(words)
|
||||||
for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
|
for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
|
||||||
dafsa = fun(dafsa)
|
dafsa = fun(dafsa)
|
||||||
return to_cxx(encode(dafsa))
|
return converter(encode(dafsa))
|
||||||
|
|
||||||
|
|
||||||
|
def words_to_cxx(words):
|
||||||
|
"""Generates C++ code from a word list"""
|
||||||
|
return words_to_whatever(words, to_cxx)
|
||||||
|
|
||||||
|
|
||||||
|
def words_to_binary(words):
|
||||||
|
"""Generates a binary DAFSA (as a bytearray) from a word list"""
|
||||||
|
return words_to_whatever(words, bytearray)
|
||||||
|
|
||||||
|
|
||||||
def parse_gperf(infile):
|
def parse_gperf(infile):
|
||||||
|
@ -457,15 +467,24 @@ def parse_gperf(infile):
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
if len(sys.argv) != 3:
|
if len(sys.argv) < 3:
|
||||||
print('usage: %s infile outfile' % sys.argv[0])
|
print('usage: %s [--binary] infile outfile' % sys.argv[0])
|
||||||
return 1
|
return 1
|
||||||
if sys.argv[1] == '-':
|
|
||||||
with open(sys.argv[2], 'w') as outfile:
|
argpos = 1
|
||||||
outfile.write(words_to_cxx(parse_gperf(sys.stdin)))
|
converter = words_to_cxx
|
||||||
|
|
||||||
|
if sys.argv[argpos] == '--binary':
|
||||||
|
converter = words_to_binary
|
||||||
|
argpos += 1
|
||||||
|
|
||||||
|
if sys.argv[argpos] == '-':
|
||||||
|
with open(sys.argv[argpos + 1], 'w') as outfile:
|
||||||
|
outfile.write(converter(parse_gperf(sys.stdin)))
|
||||||
else:
|
else:
|
||||||
with open(sys.argv[1], 'r') as infile, open(sys.argv[2], 'w') as outfile:
|
with open(sys.argv[argpos], 'r') as infile, open(sys.argv[argpos + 1], 'w') as outfile:
|
||||||
outfile.write(words_to_cxx(parse_gperf(infile)))
|
outfile.write(converter(parse_gperf(infile)))
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
|
105
src/psl2c.c
105
src/psl2c.c
|
@ -180,52 +180,50 @@ static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v)
|
||||||
unlink("in.tmp");
|
unlink("in.tmp");
|
||||||
unlink("out.tmp");
|
unlink("out.tmp");
|
||||||
}
|
}
|
||||||
|
#endif /* _GENERATE_BUILTIN_DATA */
|
||||||
|
|
||||||
#if 0
|
static int _print_psl_entries_dafsa_binary(const char *fname, const _psl_vector_t *v)
|
||||||
#if !defined(WITH_LIBICU) && !defined(WITH_IDN2)
|
|
||||||
static int _str_needs_encoding(const char *s)
|
|
||||||
{
|
{
|
||||||
while (*s && *((unsigned char *)s) < 128) s++;
|
FILE *fp;
|
||||||
|
int ret = 0, it, rc;
|
||||||
|
char cmd[256];
|
||||||
|
|
||||||
return !!*s;
|
if ((fp = fopen("in.tmp", "w"))) {
|
||||||
}
|
for (it = 0; it < v->cur; it++) {
|
||||||
|
_psl_entry_t *e = _vector_get(v, it);
|
||||||
|
unsigned char *s = (unsigned char *)e->label_buf;
|
||||||
|
|
||||||
static void _add_punycode_if_needed(_psl_vector_t *v)
|
/* search for non-ASCII label and skip it */
|
||||||
{
|
while (*s && *s < 128) s++;
|
||||||
int it, n;
|
if (*s) continue;
|
||||||
|
|
||||||
/* do not use 'it < v->cur' since v->cur is changed by _vector_add() ! */
|
fprintf(fp, "%s, %X\n", e->label_buf, (int) (e->flags & 0x0F));
|
||||||
for (it = 0, n = v->cur; it < n; it++) {
|
|
||||||
_psl_entry_t *e = _vector_get(v, it);
|
|
||||||
|
|
||||||
if (_str_needs_encoding(e->label_buf)) {
|
|
||||||
_psl_entry_t suffix, *suffixp;
|
|
||||||
char lookupname[64] = "";
|
|
||||||
|
|
||||||
/* this is much slower than the libidn2 API but should have no license issues */
|
|
||||||
FILE *pp;
|
|
||||||
char cmd[16 + sizeof(e->label_buf)];
|
|
||||||
snprintf(cmd, sizeof(cmd), "idn2 '%s'", e->label_buf);
|
|
||||||
if ((pp = popen(cmd, "r"))) {
|
|
||||||
if (fscanf(pp, "%63s", lookupname) >= 1 && strcmp(e->label_buf, lookupname)) {
|
|
||||||
/* fprintf(stderr, "idn2 '%s' -> '%s'\n", e->label_buf, lookupname); */
|
|
||||||
_suffix_init(&suffix, lookupname, strlen(lookupname));
|
|
||||||
suffix.wildcard = e->wildcard;
|
|
||||||
suffixp = _vector_get(v, _vector_add(v, &suffix));
|
|
||||||
suffixp->label = suffixp->label_buf; /* set label to changed address */
|
|
||||||
}
|
|
||||||
pclose(pp);
|
|
||||||
} else
|
|
||||||
fprintf(stderr, "Failed to call popen(%s, \"r\")\n", cmd);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fclose(fp);
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "Failed to write open 'in.tmp'\n");
|
||||||
|
return 3;
|
||||||
}
|
}
|
||||||
|
|
||||||
_vector_sort(v);
|
snprintf(cmd, sizeof(cmd), MAKE_DAFSA " --binary in.tmp %s", fname);
|
||||||
}
|
if ((rc = system(cmd))) {
|
||||||
#endif /* !defined(WITH_LIBICU) && !defined(WITH_IDN2) */
|
fprintf(stderr, "Failed to execute '%s' (%d)\n", cmd, rc);
|
||||||
#endif
|
ret = 2;
|
||||||
|
}
|
||||||
|
|
||||||
#endif /* _GENERATE_BUILTIN_DATA */
|
unlink("in.tmp");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void usage(void)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Usage: psl2c [--binary] <infile> <outfile>\n");
|
||||||
|
fprintf(stderr, " <infile> is the 'public_suffix_list.dat', lowercase UTF-8 encoded\n");
|
||||||
|
fprintf(stderr, " <outfile> is the filename to be generated from <infile>\n");
|
||||||
|
fprintf(stderr, " --binary Generate binary DAFSA output (default: C code for psl.c)\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
int main(int argc, const char **argv)
|
int main(int argc, const char **argv)
|
||||||
{
|
{
|
||||||
|
@ -233,20 +231,34 @@ int main(int argc, const char **argv)
|
||||||
#ifdef _GENERATE_BUILTIN_DATA
|
#ifdef _GENERATE_BUILTIN_DATA
|
||||||
psl_ctx_t *psl;
|
psl_ctx_t *psl;
|
||||||
#endif
|
#endif
|
||||||
int ret = 0, argpos = 1;
|
int ret = 0, argpos = 1, binary = 0;
|
||||||
|
|
||||||
if (argc - argpos != 2) {
|
if (argc < 3)
|
||||||
fprintf(stderr, "Usage: psl2c <infile> <outfile>\n");
|
usage();
|
||||||
fprintf(stderr, " <infile> is the 'public_suffix_list.dat', lowercase UTF-8 encoded\n");
|
|
||||||
fprintf(stderr, " <outfile> is the the C filename to be generated from <infile>\n");
|
if (strcmp(argv[argpos], "--binary") == 0) {
|
||||||
return 1;
|
argpos++;
|
||||||
|
binary = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argc - argpos != 2)
|
||||||
|
usage();
|
||||||
|
|
||||||
|
if (binary) {
|
||||||
|
if (!(psl = psl_load_file(argv[argpos])))
|
||||||
|
return 2;
|
||||||
|
|
||||||
|
ret = _print_psl_entries_dafsa_binary(argv[argpos + 1], psl->suffixes);
|
||||||
|
|
||||||
|
psl_free(psl);
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef _GENERATE_BUILTIN_DATA
|
#ifdef _GENERATE_BUILTIN_DATA
|
||||||
if (!(psl = psl_load_file(argv[argpos])))
|
if (!(psl = psl_load_file(argv[argpos])))
|
||||||
return 2;
|
return 2;
|
||||||
|
|
||||||
/* look for ambigious or double entries */
|
/* look for ambiguous or double entries */
|
||||||
/* if (_check_psl(psl)) {
|
/* if (_check_psl(psl)) {
|
||||||
psl_free(psl);
|
psl_free(psl);
|
||||||
return 5;
|
return 5;
|
||||||
|
@ -260,11 +272,6 @@ int main(int argc, const char **argv)
|
||||||
char *abs_srcfile;
|
char *abs_srcfile;
|
||||||
const char *source_date_epoch = NULL;
|
const char *source_date_epoch = NULL;
|
||||||
|
|
||||||
#if 0
|
|
||||||
/* include library code did not generate punycode, so let's do it for the builtin data */
|
|
||||||
_add_punycode_if_needed(psl->suffixes);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
_print_psl_entries_dafsa(fpout, psl->suffixes);
|
_print_psl_entries_dafsa(fpout, psl->suffixes);
|
||||||
|
|
||||||
snprintf(cmd, cmdsize, "sha1sum %s", argv[argpos]);
|
snprintf(cmd, cmdsize, "sha1sum %s", argv[argpos]);
|
||||||
|
|
Loading…
Reference in New Issue