Compare commits

...

2 Commits

Author SHA1 Message Date
takase1121 1de4d0a4a3 refactor regex API again
This time making it easier to use from Lua side.
- regex is now a proper userdata instead of a table wrapped in userdata
- add regex:get_metadata()
- remove regex:nametable() option in favor of regex:get_metadata()
2021-10-09 12:32:36 +08:00
takase1121 09d6087df8 regex code refactor
- declare API_TYPE_REGEX
- move error report code to a function
- move regex table check to a function
- add regex.source and regex.flags
- add regex:nametable()
- ensure stack size in regex:cmatch()
- fix wrong match indices
2021-08-02 09:49:52 +08:00
2 changed files with 159 additions and 65 deletions

View File

@ -8,6 +8,7 @@
#define API_TYPE_FONT "Font" #define API_TYPE_FONT "Font"
#define API_TYPE_REPLACE "Replace" #define API_TYPE_REPLACE "Replace"
#define API_TYPE_PROCESS "Process" #define API_TYPE_PROCESS "Process"
#define API_TYPE_REGEX "Regex"
void api_load_libs(lua_State *L); void api_load_libs(lua_State *L);

View File

@ -2,116 +2,209 @@
#define PCRE2_CODE_UNIT_WIDTH 8 #define PCRE2_CODE_UNIT_WIDTH 8
#include <stdarg.h>
#include <string.h> #include <string.h>
#include <pcre2.h> #include <pcre2.h>
/* Payload stored inside the userdata that represents a compiled regex. */
typedef struct regex_t {
/* Compiled pattern as returned by pcre2_compile(); released in f_pcre_gc(). */
pcre2_code *re;
/* Registry reference (from luaL_ref) to the table holding "source" and
   "flags", plus the "nametable" entry added by f_pcre_get_metadata(). */
int metadata;
} regex_t;
/*
 * Raises a Lua error whose message is the formatted prefix followed by
 * PCRE2's textual description of the error code `rc`.
 *
 * `fmt` and the variadic arguments follow lua_pushvfstring() conventions
 * (so `%d` expects an int).
 *
 * Does not return: lua_error() performs a non-local jump. The int return
 * type only exists so callers can write `return pcre2_error(...)`.
 */
static int pcre2_error(lua_State *L, int rc, const char *fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  lua_pushvfstring(L, fmt, ap);
  va_end(ap);

  PCRE2_UCHAR err[256];
  /* The buffer length is expressed in code units, not bytes. */
  int len = pcre2_get_error_message(rc, err, sizeof(err) / sizeof(err[0]));
  if (len >= 0 || len == PCRE2_ERROR_NOMEMORY) {
    /* PCRE2_ERROR_NOMEMORY means the text was truncated but is still
       zero-terminated, so it remains safe to use. */
    lua_pushstring(L, (const char *) err);
  } else {
    /* Unknown error code: PCRE2 gives no guarantee about the buffer
       contents, so avoid reading it. */
    lua_pushfstring(L, "unknown error code %d", rc);
  }

  lua_concat(L, 2);
  return lua_error(L);
}
/*
 * __gc metamethod for Regex userdata: drops the registry reference to the
 * metadata table and frees the compiled pattern.
 *
 * The metatable is its own __index, so a script can reach `__gc` directly
 * and the collector will still call it again later. Both fields are reset
 * after release so that a repeated call is harmless: luaL_unref() ignores
 * LUA_NOREF and pcre2_code_free() ignores NULL.
 */
static int f_pcre_gc(lua_State *L) {
  regex_t *re = luaL_checkudata(L, 1, API_TYPE_REGEX);

  luaL_unref(L, LUA_REGISTRYINDEX, re->metadata);
  re->metadata = LUA_NOREF;

  pcre2_code_free(re->re);
  re->re = NULL;
  return 0;
}
/*
 * regex.compile(pattern [, flags]) -> Regex
 *
 * Compiles `pattern` (always with PCRE2_UTF) and returns a Regex userdata.
 * `flags` is an optional string; each of the letters below enables an option:
 *   "i" -> PCRE2_CASELESS, "m" -> PCRE2_MULTILINE, "s" -> PCRE2_DOTALL
 *
 * Raises a Lua error if the pattern does not compile.
 */
static int f_pcre_compile(lua_State *L) {
  size_t len;
  const char *pattern = luaL_checklstring(L, 1, &len);
  const char *optstr = luaL_optstring(L, 2, "");

  uint32_t options = PCRE2_UTF;
  if (strchr(optstr, 'i'))
    options |= PCRE2_CASELESS;
  if (strchr(optstr, 'm'))
    options |= PCRE2_MULTILINE;
  if (strchr(optstr, 's'))
    options |= PCRE2_DOTALL;

  /* Allocate the userdata before compiling and give it inert field values
     before attaching the metatable. Any Lua error raised further down then
     leaves an object the __gc handler can process safely (NULL pattern,
     LUA_NOREF reference), and a failed allocation cannot leak a pattern. */
  regex_t *regex = lua_newuserdata(L, sizeof *regex);
  regex->re = NULL;
  regex->metadata = LUA_NOREF;
  luaL_setmetatable(L, API_TYPE_REGEX);

  int error;
  PCRE2_SIZE error_offset;
  regex->re = pcre2_compile(
    (PCRE2_SPTR) pattern,
    len,
    options,
    &error,
    &error_offset,
    NULL
  );
  if (regex->re == NULL) {
    /* %d consumes an int; PCRE2_SIZE is wider on 64-bit targets. */
    return pcre2_error(L, error, "regex compilation failed at %d: ", (int) error_offset);
  }

  /* Metadata table kept in the registry: { source = pattern, flags = optstr } */
  lua_newtable(L);
  lua_pushvalue(L, 1);
  lua_setfield(L, -2, "source");
  lua_pushstring(L, optstr);
  lua_setfield(L, -2, "flags");
  regex->metadata = luaL_ref(L, LUA_REGISTRYINDEX);

  /* luaL_ref popped the table, leaving the userdata on top. */
  return 1;
}
/*
 * Pushes an array describing the named capture groups of `re`:
 *   { { index = <group number>, name = <group name> }, ... }
 *
 * Each PCRE2 name-table entry is `entrysize` code units long: two code
 * units holding the group number (most significant first), followed by the
 * zero-terminated group name.
 *
 * Returns 1 (one value pushed). Raises a Lua error if PCRE2 refuses any of
 * the pattern-info queries.
 */
static int get_nametable(lua_State *L, pcre2_code *re) {
  uint32_t namecount = 0, entrysize = 0;
  PCRE2_SPTR nametable = NULL;
  int ret;

  ret = pcre2_pattern_info(re, PCRE2_INFO_NAMETABLE, &nametable);
  if (ret)
    return pcre2_error(L, ret, "cannot get PCRE2_INFO_NAMETABLE: ");

  ret = pcre2_pattern_info(re, PCRE2_INFO_NAMECOUNT, &namecount);
  if (ret)
    return pcre2_error(L, ret, "cannot get PCRE2_INFO_NAMECOUNT: ");

  ret = pcre2_pattern_info(re, PCRE2_INFO_NAMEENTRYSIZE, &entrysize);
  if (ret)
    return pcre2_error(L, ret, "cannot get PCRE2_INFO_NAMEENTRYSIZE: ");

  lua_createtable(L, (int) namecount, 0);
  for (uint32_t i = 0; i < namecount; i++) {
    PCRE2_SPTR entry = nametable + (size_t) i * entrysize;
    uint16_t index = (uint16_t) ((entry[0] << 8) | entry[1]);

    lua_createtable(L, 0, 2);
    lua_pushnumber(L, index);
    lua_setfield(L, -2, "index");
    lua_pushstring(L, (const char *) (entry + 2));
    lua_setfield(L, -2, "name");
    lua_rawseti(L, -2, i + 1); /* Lua arrays are 1-based */
  }

  return 1;
}
char message[1024];
/*
 * regex:get_metadata() -> table
 *
 * Returns the metadata table stored in the registry for this regex. The
 * "nametable" field is built on the first call and reused afterwards.
 */
static int f_pcre_get_metadata(lua_State *L) {
  regex_t *self = luaL_checkudata(L, 1, API_TYPE_REGEX);

  lua_rawgeti(L, LUA_REGISTRYINDEX, self->metadata);

  /* Peek at the cached field, then drop it again either way. */
  lua_getfield(L, -1, "nametable");
  int cached = (lua_type(L, -1) == LUA_TTABLE);
  lua_pop(L, 1);

  if (!cached) {
    get_nametable(L, self->re);
    lua_setfield(L, -2, "nametable");
  }

  /* The metadata table is on top of the stack. */
  return 1;
}
/*
 * Reads the array part of the table at stack index `t` and returns the
 * bitwise OR of all its integer elements.
 *
 * Raises a Lua error if `t` is not a table or an element is not an integer.
 */
static size_t check_bitfield(lua_State *L, int t) {
  luaL_checktype(L, t, LUA_TTABLE);

  size_t flags = 0;
  const size_t count = lua_rawlen(L, t);
  for (size_t k = 0; k < count; k++) {
    lua_rawgeti(L, t, k + 1); /* Lua arrays are 1-based */
    flags |= luaL_checkinteger(L, -1);
    lua_pop(L, 1);
  }
  return flags;
}
// Takes a compiled regex, a string, an optional 1-based start offset and an
// optional array of match option flags. If a match was found, returns the
// indices of the matched groups (including the whole match) as a flat list
// of start/end pairs; returns nothing when there is no match.
static int f_pcre_match(lua_State *L) {
  size_t len, offset, options;
  regex_t *re = luaL_checkudata(L, 1, API_TYPE_REGEX);
  const char *str = luaL_checklstring(L, 2, &len);

  offset = luaL_optnumber(L, 3, 1);
  options = lua_gettop(L) == 4 ? check_bitfield(L, 4) : 0;

  pcre2_match_data *md = pcre2_match_data_create_from_pattern(re->re, NULL);
  int rc = pcre2_match(re->re, (PCRE2_SPTR) str, len, offset - 1, options, md, NULL);
  if (rc < 0) {
    pcre2_match_data_free(md);
    if (rc != PCRE2_ERROR_NOMATCH)
      return pcre2_error(L, rc, "regex matching error: ");
    return 0;
  }

  PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(md);
  if (ovector[0] > ovector[1]) {
    pcre2_match_data_free(md);
    /* We must guard against patterns such as /(?=.\K)/ that use \K in an
       assertion to set the start of a match later than its end. In the editor,
       we just detect this case and give up. */
    return luaL_error(L, "regex matching error: \\K was used in an assertion to "
                         " set the match start after its end");
  }

  int count = rc * 2;
  /* Use the non-raising lua_checkstack() so the match data can be released
     before the error is thrown; luaL_checkstack() would jump out and leak it. */
  if (!lua_checkstack(L, count)) {
    pcre2_match_data_free(md);
    return luaL_error(L, "stack overflow");
  }

  for (int i = 0; i < count; i++) {
    /* Start offsets (even slots) are 0-based and become 1-based: +1.
       End offsets (odd slots) are exclusive, which already equals the
       1-based inclusive end: +0. */
    lua_pushnumber(L, ovector[i] + (i + 1) % 2);
  }
  pcre2_match_data_free(md);
  return count;
}
static const luaL_Reg lib[] = { static const luaL_Reg lib[] = {
{ "compile", f_pcre_compile }, { "get_metadata", f_pcre_get_metadata },
{ "cmatch", f_pcre_match }, { "compile", f_pcre_compile },
{ "__gc", f_pcre_gc }, { "cmatch", f_pcre_match },
{ NULL, NULL } { "__gc", f_pcre_gc },
{ NULL, NULL }
}; };
/*
 * Module opener: creates the Regex metatable, installs the functions from
 * `lib` on it, makes it its own __index, and exports the PCRE2 match option
 * constants as fields. Returns the metatable as the module value.
 */
int luaopen_regex(lua_State *L) {
  static const struct {
    const char *name;
    uint32_t value;
  } match_flags[] = {
    { "ANCHORED",         PCRE2_ANCHORED         },
    /* Was previously exported with the value of PCRE2_ANCHORED. */
    { "ENDANCHORED",      PCRE2_ENDANCHORED      },
    { "NOTBOL",           PCRE2_NOTBOL           },
    { "NOTEOL",           PCRE2_NOTEOL           },
    { "NOTEMPTY",         PCRE2_NOTEMPTY         },
    { "NOTEMPTY_ATSTART", PCRE2_NOTEMPTY_ATSTART },
  };

  luaL_newmetatable(L, API_TYPE_REGEX);
  luaL_setfuncs(L, lib, 0);
  lua_pushvalue(L, -1);
  lua_setfield(L, -2, "__index");

  for (size_t i = 0; i < sizeof(match_flags) / sizeof(match_flags[0]); i++) {
    lua_pushnumber(L, match_flags[i].value);
    lua_setfield(L, -2, match_flags[i].name);
  }
  return 1;
}