Add psl2c --binary to create DAFSA binary file from PSL

This commit is contained in:
Tim Rühsen 2015-12-29 16:53:47 +01:00
parent e63ff8abfc
commit 82e9445493
2 changed files with 84 additions and 58 deletions

View File

@ -430,12 +430,22 @@ def to_cxx(data):
return text return text
def words_to_cxx(words): def words_to_whatever(words, converter):
"""Generates C++ code from a word list""" """Generates C++ code from a word list"""
dafsa = to_dafsa(words) dafsa = to_dafsa(words)
for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels): for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
dafsa = fun(dafsa) dafsa = fun(dafsa)
return to_cxx(encode(dafsa)) return converter(encode(dafsa))
def words_to_cxx(words):
  """Builds the DAFSA for `words` and renders it through `to_cxx`.

  Thin wrapper around `words_to_whatever` that selects the C++ source
  converter for the encoded DAFSA.
  """
  return words_to_whatever(words, converter=to_cxx)
def words_to_binary(words):
  """Builds the DAFSA for `words` and returns its encoding as a bytearray.

  Thin wrapper around `words_to_whatever` that uses `bytearray` as the
  converter, so the encoded DAFSA is returned as raw bytes rather than
  as source code text.
  """
  return words_to_whatever(words, converter=bytearray)
def parse_gperf(infile): def parse_gperf(infile):
@ -457,15 +467,24 @@ def parse_gperf(infile):
def main(): def main():
if len(sys.argv) != 3: if len(sys.argv) < 3:
print('usage: %s infile outfile' % sys.argv[0]) print('usage: %s [--binary] infile outfile' % sys.argv[0])
return 1 return 1
if sys.argv[1] == '-':
with open(sys.argv[2], 'w') as outfile: argpos = 1
outfile.write(words_to_cxx(parse_gperf(sys.stdin))) converter = words_to_cxx
if sys.argv[argpos] == '--binary':
converter = words_to_binary
argpos += 1
if sys.argv[argpos] == '-':
with open(sys.argv[argpos + 1], 'w') as outfile:
outfile.write(converter(parse_gperf(sys.stdin)))
else: else:
with open(sys.argv[1], 'r') as infile, open(sys.argv[2], 'w') as outfile: with open(sys.argv[argpos], 'r') as infile, open(sys.argv[argpos + 1], 'w') as outfile:
outfile.write(words_to_cxx(parse_gperf(infile))) outfile.write(converter(parse_gperf(infile)))
return 0 return 0

View File

@ -180,73 +180,85 @@ static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v)
unlink("in.tmp"); unlink("in.tmp");
unlink("out.tmp"); unlink("out.tmp");
} }
#if 0
#if !defined(WITH_LIBICU) && !defined(WITH_IDN2)
/*
 * Report whether the NUL-terminated string 's' contains at least one byte
 * with a value of 128 or above, i.e. a byte outside the 7-bit ASCII range.
 *
 * Returns 1 if such a byte is found, 0 otherwise (also for the empty string).
 */
static int _str_needs_encoding(const char *s)
{
	const unsigned char *p;

	/* inspect the bytes as unsigned so that values >= 128 are not seen as negative */
	for (p = (const unsigned char *)s; *p != '\0'; p++) {
		if (*p >= 128) {
			return 1;
		}
	}

	return 0;
}
/*
 * For every entry of 'v' whose label contains non-ASCII bytes, run the
 * external 'idn2' command on that label. If the command prints a string that
 * differs from the label, a new entry built from that string (with the same
 * wildcard setting) is appended to 'v'. The vector is sorted once at the end.
 *
 * Labels containing a single quote are skipped: the label is pasted between
 * single quotes into a shell command line, so such a label could terminate
 * the quoting and inject arbitrary shell syntax.
 */
static void _add_punycode_if_needed(_psl_vector_t *v)
{
	int it, n;

	/* do not use 'it < v->cur' since v->cur is changed by _vector_add() ! */
	for (it = 0, n = v->cur; it < n; it++) {
		_psl_entry_t *e = _vector_get(v, it);
		_psl_entry_t suffix, *suffixp;
		char lookupname[64] = "";
		/* "idn2 ''" plus the label always fits: 16 bytes of slack + label buffer size */
		char cmd[16 + sizeof(e->label_buf)];
		FILE *pp;

		if (!_str_needs_encoding(e->label_buf))
			continue;

		/* refuse labels that would break out of the '...' shell quoting below */
		if (strchr(e->label_buf, '\'')) {
			fprintf(stderr, "Skipping label with embedded single quote\n");
			continue;
		}

		/* this is much slower than the libidn2 API but should have no license issues */
		snprintf(cmd, sizeof(cmd), "idn2 '%s'", e->label_buf);

		if (!(pp = popen(cmd, "r"))) {
			fprintf(stderr, "Failed to call popen(%s, \"r\")\n", cmd);
			continue;
		}

		/* %63s matches the size of lookupname (63 characters + terminating NUL) */
		if (fscanf(pp, "%63s", lookupname) >= 1 && strcmp(e->label_buf, lookupname)) {
			/* fprintf(stderr, "idn2 '%s' -> '%s'\n", e->label_buf, lookupname); */
			_suffix_init(&suffix, lookupname, strlen(lookupname));
			suffix.wildcard = e->wildcard;
			/* NOTE(review): the result of _vector_add() is used unchecked - confirm it cannot fail */
			suffixp = _vector_get(v, _vector_add(v, &suffix));
			suffixp->label = suffixp->label_buf; /* set label to changed address */
		}

		pclose(pp);
	}

	_vector_sort(v);
}
#endif /* !defined(WITH_LIBICU) && !defined(WITH_IDN2) */
#endif
#endif /* _GENERATE_BUILTIN_DATA */ #endif /* _GENERATE_BUILTIN_DATA */
/*
 * Write all pure-ASCII labels of 'v' together with the low four bits of
 * their flags (as hex) to the temporary file 'in.tmp', then run
 * MAKE_DAFSA with '--binary' to convert that file into 'fname'.
 * 'in.tmp' is removed again before returning.
 *
 * Returns 0 on success,
 *         2 if the command line does not fit or the command fails,
 *         3 if 'in.tmp' cannot be opened or written.
 *
 * NOTE(review): 'fname' is inserted unquoted into a shell command line;
 * callers must not pass untrusted file names.
 */
static int _print_psl_entries_dafsa_binary(const char *fname, const _psl_vector_t *v)
{
	FILE *fp;
	int ret = 0, it, rc, len, write_failed;
	char cmd[256];

	if (!(fp = fopen("in.tmp", "w"))) {
		fprintf(stderr, "Failed to write open 'in.tmp'\n");
		return 3;
	}

	for (it = 0; it < v->cur; it++) {
		_psl_entry_t *e = _vector_get(v, it);
		const unsigned char *s = (const unsigned char *)e->label_buf;

		/* search for non-ASCII label and skip it */
		while (*s && *s < 128)
			s++;
		if (*s)
			continue;

		/* %X expects an unsigned value */
		fprintf(fp, "%s, %X\n", e->label_buf, (unsigned) (e->flags & 0x0F));
	}

	/* fclose() flushes buffered data, so a write error may only show up here */
	write_failed = ferror(fp);
	if (fclose(fp) != 0)
		write_failed = 1;

	if (write_failed) {
		fprintf(stderr, "Failed to write 'in.tmp'\n");
		unlink("in.tmp");
		return 3;
	}

	/* never execute a silently truncated command line */
	len = snprintf(cmd, sizeof(cmd), MAKE_DAFSA " --binary in.tmp %s", fname);
	if (len < 0 || (size_t) len >= sizeof(cmd)) {
		fprintf(stderr, "Command line too long for output file '%s'\n", fname);
		unlink("in.tmp");
		return 2;
	}

	if ((rc = system(cmd))) {
		fprintf(stderr, "Failed to execute '%s' (%d)\n", cmd, rc);
		ret = 2;
	}

	unlink("in.tmp");

	return ret;
}
/*
 * Print the command line synopsis of psl2c to stderr and terminate the
 * process with exit status 1. This function does not return.
 */
static void usage(void)
{
	fputs(
		"Usage: psl2c [--binary] <infile> <outfile>\n"
		" <infile> is the 'public_suffix_list.dat', lowercase UTF-8 encoded\n"
		" <outfile> is the filename to be generated from <infile>\n"
		" --binary Generate binary DAFSA output (default: C code for psl.c)\n",
		stderr);

	exit(1);
}
int main(int argc, const char **argv) int main(int argc, const char **argv)
{ {
FILE *fpout; FILE *fpout;
#ifdef _GENERATE_BUILTIN_DATA #ifdef _GENERATE_BUILTIN_DATA
psl_ctx_t *psl; psl_ctx_t *psl;
#endif #endif
int ret = 0, argpos = 1; int ret = 0, argpos = 1, binary = 0;
if (argc - argpos != 2) { if (argc < 3)
fprintf(stderr, "Usage: psl2c <infile> <outfile>\n"); usage();
fprintf(stderr, " <infile> is the 'public_suffix_list.dat', lowercase UTF-8 encoded\n");
fprintf(stderr, " <outfile> is the the C filename to be generated from <infile>\n"); if (strcmp(argv[argpos], "--binary") == 0) {
return 1; argpos++;
binary = 1;
}
if (argc - argpos != 2)
usage();
if (binary) {
if (!(psl = psl_load_file(argv[argpos])))
return 2;
ret = _print_psl_entries_dafsa_binary(argv[argpos + 1], psl->suffixes);
psl_free(psl);
return ret;
} }
#ifdef _GENERATE_BUILTIN_DATA #ifdef _GENERATE_BUILTIN_DATA
if (!(psl = psl_load_file(argv[argpos]))) if (!(psl = psl_load_file(argv[argpos])))
return 2; return 2;
/* look for ambigious or double entries */ /* look for ambiguous or double entries */
/* if (_check_psl(psl)) { /* if (_check_psl(psl)) {
psl_free(psl); psl_free(psl);
return 5; return 5;
@ -260,11 +272,6 @@ int main(int argc, const char **argv)
char *abs_srcfile; char *abs_srcfile;
const char *source_date_epoch = NULL; const char *source_date_epoch = NULL;
#if 0
/* include library code did not generate punycode, so let's do it for the builtin data */
_add_punycode_if_needed(psl->suffixes);
#endif
_print_psl_entries_dafsa(fpout, psl->suffixes); _print_psl_entries_dafsa(fpout, psl->suffixes);
snprintf(cmd, cmdsize, "sha1sum %s", argv[argpos]); snprintf(cmd, cmdsize, "sha1sum %s", argv[argpos]);