/* * Integration of https://github.com/starwing/luautf8 * * Copyright (c) 2018 Xavier Wang * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include #include #include #include #include #include "../unidata.h" /* UTF-8 string operations */ #define UTF8_BUFFSZ 8 #define UTF8_MAX 0x7FFFFFFFu #define UTF8_MAXCP 0x10FFFFu #define iscont(p) ((*(p) & 0xC0) == 0x80) #define CAST(tp,expr) ((tp)(expr)) #ifndef LUA_QL # define LUA_QL(x) "'" x "'" #endif static int utf8_invalid (utfint ch) { return (ch > UTF8_MAXCP || (0xD800u <= ch && ch <= 0xDFFFu)); } static size_t utf8_encode (char *buff, utfint x) { int n = 1; /* number of bytes put in buffer (backwards) */ lua_assert(x <= UTF8_MAX); if (x < 0x80) /* ascii? */ buff[UTF8_BUFFSZ - 1] = x & 0x7F; else { /* need continuation bytes */ utfint mfb = 0x3f; /* maximum that fits in first byte */ do { /* add continuation bytes */ buff[UTF8_BUFFSZ - (n++)] = 0x80 | (x & 0x3f); x >>= 6; /* remove added bits */ mfb >>= 1; /* now there is one less bit available in first byte */ } while (x > mfb); /* still needs continuation byte? */ buff[UTF8_BUFFSZ - n] = ((~mfb << 1) | x) & 0xFF; /* add first byte */ } return n; } static const char *utf8_decode (const char *s, utfint *val, int strict) { static const utfint limits[] = {~0u, 0x80u, 0x800u, 0x10000u, 0x200000u, 0x4000000u}; unsigned int c = (unsigned char)s[0]; utfint res = 0; /* final result */ if (c < 0x80) /* ascii? */ res = c; else { int count = 0; /* to count number of continuation bytes */ for (; c & 0x40; c <<= 1) { /* while it needs continuation bytes... */ unsigned int cc = (unsigned char)s[++count]; /* read next byte */ if ((cc & 0xC0) != 0x80) /* not a continuation byte? */ return NULL; /* invalid byte sequence */ res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ } res |= ((utfint)(c & 0x7F) << (count * 5)); /* add first byte */ if (count > 5 || res > UTF8_MAX || res < limits[count]) return NULL; /* invalid byte sequence */ s += count; /* skip continuation bytes read */ } if (strict) { /* check for invalid code points; too large or surrogates */ if (res > UTF8_MAXCP || (0xD800u <= res && res <= 0xDFFFu)) return NULL; } if (val) *val = res; return s + 1; /* +1 to include first byte */ } static const char *utf8_prev (const char *s, const char *e) { while (s < e && iscont(e - 1)) --e; return s < e ? e - 1 : s; } static const char *utf8_next (const char *s, const char *e) { while (s < e && iscont(s + 1)) ++s; return s < e ? s + 1 : e; } static size_t utf8_length (const char *s, const char *e) { size_t i; for (i = 0; s < e; ++i) s = utf8_next(s, e); return i; } static const char *utf8_offset (const char *s, const char *e, lua_Integer offset, lua_Integer idx) { const char *p = s + offset - 1; if (idx >= 0) { while (p < e && idx > 0) p = utf8_next(p, e), --idx; return idx == 0 ? p : NULL; } else { while (s < p && idx < 0) p = utf8_prev(s, p), ++idx; return idx == 0 ? p : NULL; } } static const char *utf8_relat (const char *s, const char *e, int idx) { return idx >= 0 ? utf8_offset(s, e, 1, idx - 1) : utf8_offset(s, e, e-s+1, idx); } static int utf8_range(const char *s, const char *e, lua_Integer *i, lua_Integer *j) { const char *ps = utf8_relat(s, e, CAST(int, *i)); const char *pe = utf8_relat(s, e, CAST(int, *j)); *i = (ps ? ps : (*i > 0 ? e : s)) - s; *j = (pe ? utf8_next(pe, e) : (*j > 0 ? e : s)) - s; return *i < *j; } /* Unicode character categories */ #define table_size(t) (sizeof(t)/sizeof((t)[0])) #define utf8_categories(X) \ X('a', alpha) \ X('c', cntrl) \ X('d', digit) \ X('l', lower) \ X('p', punct) \ X('s', space) \ X('t', compose) \ X('u', upper) \ X('x', xdigit) #define utf8_converters(X) \ X(lower) \ X(upper) \ X(title) \ X(fold) static int find_in_range (range_table *t, size_t size, utfint ch) { size_t begin, end; begin = 0; end = size; while (begin < end) { size_t mid = (begin + end) / 2; if (t[mid].last < ch) begin = mid + 1; else if (t[mid].first > ch) end = mid; else return (ch - t[mid].first) % t[mid].step == 0; } return 0; } static int convert_char (conv_table *t, size_t size, utfint ch) { size_t begin, end; begin = 0; end = size; while (begin < end) { size_t mid = (begin + end) / 2; if (t[mid].last < ch) begin = mid + 1; else if (t[mid].first > ch) end = mid; else if ((ch - t[mid].first) % t[mid].step == 0) return ch + t[mid].offset; else return ch; } return ch; } #define define_category(cls, name) static int utf8_is##name (utfint ch)\ { return find_in_range(name##_table, table_size(name##_table), ch); } #define define_converter(name) static utfint utf8_to##name (utfint ch) \ { return convert_char(to##name##_table, table_size(to##name##_table), ch); } utf8_categories(define_category) utf8_converters(define_converter) #undef define_category #undef define_converter static int utf8_isgraph (utfint ch) { if (find_in_range(space_table, table_size(space_table), ch)) return 0; if (find_in_range(graph_table, table_size(graph_table), ch)) return 1; if (find_in_range(compose_table, table_size(compose_table), ch)) return 1; return 0; } static int utf8_isalnum (utfint ch) { if (find_in_range(alpha_table, table_size(alpha_table), ch)) return 1; if (find_in_range(alnum_extend_table, table_size(alnum_extend_table), ch)) return 1; return 0; } static int utf8_width (utfint ch, int ambi_is_single) { if (find_in_range(doublewidth_table, table_size(doublewidth_table), ch)) return 2; if (find_in_range(ambiwidth_table, table_size(ambiwidth_table), ch)) return ambi_is_single ? 1 : 2; if (find_in_range(compose_table, table_size(compose_table), ch)) return 0; if (find_in_range(unprintable_table, table_size(unprintable_table), ch)) return 0; return 1; } /* string module compatible interface */ static int typeerror (lua_State *L, int idx, const char *tname) { return luaL_error(L, "%s expected, got %s", tname, luaL_typename(L, idx)); } static const char *check_utf8 (lua_State *L, int idx, const char **end) { size_t len; const char *s = luaL_checklstring(L, idx, &len); if (end) *end = s+len; return s; } static const char *to_utf8 (lua_State *L, int idx, const char **end) { size_t len; const char *s = lua_tolstring(L, idx, &len); if (end) *end = s+len; return s; } static const char *utf8_safe_decode (lua_State *L, const char *p, utfint *pval) { p = utf8_decode(p, pval, 0); if (p == NULL) luaL_error(L, "invalid UTF-8 code"); return p; } static void add_utf8char (luaL_Buffer *b, utfint ch) { char buff[UTF8_BUFFSZ]; size_t n = utf8_encode(buff, ch); luaL_addlstring(b, buff+UTF8_BUFFSZ-n, n); } static lua_Integer byte_relat (lua_Integer pos, size_t len) { if (pos >= 0) return pos; else if (0u - (size_t)pos > len) return 0; else return (lua_Integer)len + pos + 1; } static int Lutf8_len (lua_State *L) { size_t len, n; const char *s = luaL_checklstring(L, 1, &len), *p, *e; lua_Integer posi = byte_relat(luaL_optinteger(L, 2, 1), len); lua_Integer pose = byte_relat(luaL_optinteger(L, 3, -1), len); int lax = lua_toboolean(L, 4); luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2, "initial position out of string"); luaL_argcheck(L, --pose < (lua_Integer)len, 3, "final position out of string"); for (n = 0, p=s+posi, e=s+pose+1; p < e; ++n) { if (lax) p = utf8_next(p, e); else { utfint ch; const char *np = utf8_decode(p, &ch, !lax); if (np == NULL || utf8_invalid(ch)) { lua_pushnil(L); lua_pushinteger(L, p - s + 1); return 2; } p = np; } } lua_pushinteger(L, n); return 1; } static int Lutf8_sub (lua_State *L) { const char *e, *s = check_utf8(L, 1, &e); lua_Integer posi = luaL_checkinteger(L, 2); lua_Integer pose = luaL_optinteger(L, 3, -1); if (utf8_range(s, e, &posi, &pose)) lua_pushlstring(L, s+posi, pose-posi); else lua_pushliteral(L, ""); return 1; } static int Lutf8_reverse (lua_State *L) { luaL_Buffer b; const char *prev, *pprev, *ends, *e, *s = check_utf8(L, 1, &e); (void) ends; int lax = lua_toboolean(L, 2); luaL_buffinit(L, &b); if (lax) { for (prev = e; s < prev; e = prev) { prev = utf8_prev(s, prev); luaL_addlstring(&b, prev, e-prev); } } else { for (prev = e; s < prev; prev = pprev) { utfint code = 0; ends = utf8_safe_decode(L, pprev = utf8_prev(s, prev), &code); assert(ends == prev); if (utf8_invalid(code)) return luaL_error(L, "invalid UTF-8 code"); if (!utf8_iscompose(code)) { luaL_addlstring(&b, pprev, e-pprev); e = pprev; } } } luaL_pushresult(&b); return 1; } static int Lutf8_byte (lua_State *L) { size_t n = 0; const char *e, *s = check_utf8(L, 1, &e); lua_Integer posi = luaL_optinteger(L, 2, 1); lua_Integer pose = luaL_optinteger(L, 3, posi); if (utf8_range(s, e, &posi, &pose)) { for (e = s + pose, s = s + posi; s < e; ++n) { utfint ch = 0; s = utf8_safe_decode(L, s, &ch); lua_pushinteger(L, ch); } } return CAST(int, n); } static int Lutf8_codepoint (lua_State *L) { const char *e, *s = check_utf8(L, 1, &e); size_t len = e-s; lua_Integer posi = byte_relat(luaL_optinteger(L, 2, 1), len); lua_Integer pose = byte_relat(luaL_optinteger(L, 3, posi), len); int lax = lua_toboolean(L, 4); int n; const char *se; luaL_argcheck(L, posi >= 1, 2, "out of range"); luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of range"); if (posi > pose) return 0; /* empty interval; return no values */ if (pose - posi >= INT_MAX) /* (lua_Integer -> int) overflow? */ return luaL_error(L, "string slice too long"); n = (int)(pose - posi + 1); luaL_checkstack(L, n, "string slice too long"); n = 0; /* count the number of returns */ se = s + pose; /* string end */ for (n = 0, s += posi - 1; s < se;) { utfint code = 0; s = utf8_safe_decode(L, s, &code); if (!lax && utf8_invalid(code)) return luaL_error(L, "invalid UTF-8 code"); lua_pushinteger(L, code); n++; } return n; } static int Lutf8_char (lua_State *L) { int i, n = lua_gettop(L); /* number of arguments */ luaL_Buffer b; luaL_buffinit(L, &b); for (i = 1; i <= n; ++i) { lua_Integer code = luaL_checkinteger(L, i); luaL_argcheck(L, code <= UTF8_MAXCP, i, "value out of range"); add_utf8char(&b, CAST(utfint, code)); } luaL_pushresult(&b); return 1; } #define bind_converter(name) \ static int Lutf8_##name (lua_State *L) { \ int t = lua_type(L, 1); \ if (t == LUA_TNUMBER) \ lua_pushinteger(L, utf8_to##name(CAST(utfint, lua_tointeger(L, 1)))); \ else if (t == LUA_TSTRING) { \ luaL_Buffer b; \ const char *e, *s = to_utf8(L, 1, &e); \ luaL_buffinit(L, &b); \ while (s < e) { \ utfint ch = 0; \ s = utf8_safe_decode(L, s, &ch); \ add_utf8char(&b, utf8_to##name(ch)); \ } \ luaL_pushresult(&b); \ } \ else return typeerror(L, 1, "number/string"); \ return 1; \ } utf8_converters(bind_converter) #undef bind_converter /* unicode extra interface */ static const char *parse_escape (lua_State *L, const char *s, const char *e, int hex, utfint *pch) { utfint code = 0; int in_bracket = 0; if (*s == '{') ++s, in_bracket = 1; for (; s < e; ++s) { utfint ch = (unsigned char)*s; if (ch >= '0' && ch <= '9') ch = ch - '0'; else if (hex && ch >= 'A' && ch <= 'F') ch = 10 + (ch - 'A'); else if (hex && ch >= 'a' && ch <= 'f') ch = 10 + (ch - 'a'); else if (!in_bracket) break; else if (ch == '}') { ++s; break; } else luaL_error(L, "invalid escape '%c'", ch); code *= hex ? 16 : 10; code += ch; } *pch = code; return s; } static int Lutf8_escape (lua_State *L) { const char *e, *s = check_utf8(L, 1, &e); luaL_Buffer b; luaL_buffinit(L, &b); while (s < e) { utfint ch = 0; s = utf8_safe_decode(L, s, &ch); if (ch == '%') { int hex = 0; switch (*s) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '{': break; case 'x': case 'X': hex = 1; /* fall through */ case 'u': case 'U': if (s+1 < e) { ++s; break; } /* fall through */ default: s = utf8_safe_decode(L, s, &ch); goto next; } s = parse_escape(L, s, e, hex, &ch); } next: add_utf8char(&b, ch); } luaL_pushresult(&b); return 1; } static int Lutf8_insert (lua_State *L) { const char *e, *s = check_utf8(L, 1, &e); size_t sublen; const char *subs; luaL_Buffer b; int nargs = 2; const char *first = e; if (lua_type(L, 2) == LUA_TNUMBER) { int idx = (int)lua_tointeger(L, 2); if (idx != 0) first = utf8_relat(s, e, idx); luaL_argcheck(L, first, 2, "invalid index"); ++nargs; } subs = luaL_checklstring(L, nargs, &sublen); luaL_buffinit(L, &b); luaL_addlstring(&b, s, first-s); luaL_addlstring(&b, subs, sublen); luaL_addlstring(&b, first, e-first); luaL_pushresult(&b); return 1; } static int Lutf8_remove (lua_State *L) { const char *e, *s = check_utf8(L, 1, &e); lua_Integer posi = luaL_optinteger(L, 2, -1); lua_Integer pose = luaL_optinteger(L, 3, -1); if (!utf8_range(s, e, &posi, &pose)) lua_settop(L, 1); else { luaL_Buffer b; luaL_buffinit(L, &b); luaL_addlstring(&b, s, posi); luaL_addlstring(&b, s+pose, e-s-pose); luaL_pushresult(&b); } return 1; } static int push_offset (lua_State *L, const char *s, const char *e, lua_Integer offset, lua_Integer idx) { utfint ch = 0; const char *p; if (idx != 0) p = utf8_offset(s, e, offset, idx); else if (p = s+offset-1, iscont(p)) p = utf8_prev(s, p); if (p == NULL || p == e) return 0; utf8_decode(p, &ch, 0); lua_pushinteger(L, p-s+1); lua_pushinteger(L, ch); return 2; } static int Lutf8_charpos (lua_State *L) { const char *e, *s = check_utf8(L, 1, &e); lua_Integer offset = 1; if (lua_isnoneornil(L, 3)) { lua_Integer idx = luaL_optinteger(L, 2, 0); if (idx > 0) --idx; else if (idx < 0) offset = e-s+1; return push_offset(L, s, e, offset, idx); } offset = byte_relat(luaL_optinteger(L, 2, 1), e-s); if (offset < 1) offset = 1; return push_offset(L, s, e, offset, luaL_checkinteger(L, 3)); } static int Lutf8_offset (lua_State *L) { size_t len; const char *s = luaL_checklstring(L, 1, &len); lua_Integer n = luaL_checkinteger(L, 2); lua_Integer posi = (n >= 0) ? 1 : len + 1; posi = byte_relat(luaL_optinteger(L, 3, posi), len); luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3, "position out of range"); if (n == 0) { /* find beginning of current byte sequence */ while (posi > 0 && iscont(s + posi)) posi--; } else { if (iscont(s + posi)) return luaL_error(L, "initial position is a continuation byte"); if (n < 0) { while (n < 0 && posi > 0) { /* move back */ do { /* find beginning of previous character */ posi--; } while (posi > 0 && iscont(s + posi)); n++; } } else { n--; /* do not move for 1st character */ while (n > 0 && posi < (lua_Integer)len) { do { /* find beginning of next character */ posi++; } while (iscont(s + posi)); /* (cannot pass final '\0') */ n--; } } } if (n == 0) /* did it find given character? */ lua_pushinteger(L, posi + 1); else /* no such character */ lua_pushnil(L); return 1; } static int Lutf8_next (lua_State *L) { const char *e, *s = check_utf8(L, 1, &e); lua_Integer offset = byte_relat(luaL_optinteger(L, 2, 1), e-s); lua_Integer idx = luaL_optinteger(L, 3, !lua_isnoneornil(L, 2)); return push_offset(L, s, e, offset, idx); } static int iter_aux (lua_State *L, int strict) { const char *e, *s = check_utf8(L, 1, &e); int n = CAST(int, lua_tointeger(L, 2)); const char *p = n <= 0 ? s : utf8_next(s+n-1, e); if (p < e) { utfint code = 0; utf8_safe_decode(L, p, &code); if (strict && utf8_invalid(code)) return luaL_error(L, "invalid UTF-8 code"); lua_pushinteger(L, p-s+1); lua_pushinteger(L, code); return 2; } return 0; /* no more codepoints */ } static int iter_auxstrict (lua_State *L) { return iter_aux(L, 1); } static int iter_auxlax (lua_State *L) { return iter_aux(L, 0); } static int Lutf8_codes (lua_State *L) { int lax = lua_toboolean(L, 2); luaL_checkstring(L, 1); lua_pushcfunction(L, lax ? iter_auxlax : iter_auxstrict); lua_pushvalue(L, 1); lua_pushinteger(L, 0); return 3; } static int Lutf8_width (lua_State *L) { int t = lua_type(L, 1); int ambi_is_single = !lua_toboolean(L, 2); int default_width = CAST(int, luaL_optinteger(L, 3, 0)); if (t == LUA_TNUMBER) { size_t chwidth = utf8_width(CAST(utfint, lua_tointeger(L, 1)), ambi_is_single); if (chwidth == 0) chwidth = default_width; lua_pushinteger(L, (lua_Integer)chwidth); } else if (t != LUA_TSTRING) return typeerror(L, 1, "number/string"); else { const char *e, *s = to_utf8(L, 1, &e); int width = 0; while (s < e) { utfint ch = 0; int chwidth; s = utf8_safe_decode(L, s, &ch); chwidth = utf8_width(ch, ambi_is_single); width += chwidth == 0 ? default_width : chwidth; } lua_pushinteger(L, (lua_Integer)width); } return 1; } static int Lutf8_widthindex (lua_State *L) { const char *e, *s = check_utf8(L, 1, &e); int width = CAST(int, luaL_checkinteger(L, 2)); int ambi_is_single = !lua_toboolean(L, 3); int default_width = CAST(int, luaL_optinteger(L, 4, 0)); size_t idx = 1; while (s < e) { utfint ch = 0; size_t chwidth; s = utf8_safe_decode(L, s, &ch); chwidth = utf8_width(ch, ambi_is_single); if (chwidth == 0) chwidth = default_width; width -= CAST(int, chwidth); if (width <= 0) { lua_pushinteger(L, idx); lua_pushinteger(L, width + chwidth); lua_pushinteger(L, chwidth); return 3; } ++idx; } lua_pushinteger(L, (lua_Integer)idx); return 1; } static int Lutf8_ncasecmp (lua_State *L) { const char *e1, *s1 = check_utf8(L, 1, &e1); const char *e2, *s2 = check_utf8(L, 2, &e2); while (s1 < e1 || s2 < e2) { utfint ch1 = 0, ch2 = 0; if (s1 == e1) ch2 = 1; else if (s2 == e2) ch1 = 1; else { s1 = utf8_safe_decode(L, s1, &ch1); s2 = utf8_safe_decode(L, s2, &ch2); ch1 = utf8_tofold(ch1); ch2 = utf8_tofold(ch2); } if (ch1 != ch2) { lua_pushinteger(L, ch1 > ch2 ? 1 : -1); return 1; } } lua_pushinteger(L, 0); return 1; } /* utf8 pattern matching implement */ #ifndef LUA_MAXCAPTURES # define LUA_MAXCAPTURES 32 #endif /* LUA_MAXCAPTURES */ #define CAP_UNFINISHED (-1) #define CAP_POSITION (-2) typedef struct MatchState { int matchdepth; /* control for recursive depth (to avoid C stack overflow) */ const char *src_init; /* init of source string */ const char *src_end; /* end ('\0') of source string */ const char *p_end; /* end ('\0') of pattern */ lua_State *L; int level; /* total number of captures (finished or unfinished) */ struct { const char *init; ptrdiff_t len; } capture[LUA_MAXCAPTURES]; } MatchState; /* recursive function */ static const char *match (MatchState *ms, const char *s, const char *p); /* maximum recursion depth for 'match' */ #if !defined(MAXCCALLS) #define MAXCCALLS 200 #endif #define L_ESC '%' #define SPECIALS "^$*+?.([%-" static int check_capture (MatchState *ms, int l) { l -= '1'; if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED) return luaL_error(ms->L, "invalid capture index %%%d", l + 1); return l; } static int capture_to_close (MatchState *ms) { int level = ms->level; while (--level >= 0) if (ms->capture[level].len == CAP_UNFINISHED) return level; return luaL_error(ms->L, "invalid pattern capture"); } static const char *classend (MatchState *ms, const char *p) { utfint ch = 0; p = utf8_safe_decode(ms->L, p, &ch); switch (ch) { case L_ESC: { if (p == ms->p_end) luaL_error(ms->L, "malformed pattern (ends with " LUA_QL("%%") ")"); return utf8_next(p, ms->p_end); } case '[': { if (*p == '^') p++; do { /* look for a `]' */ if (p == ms->p_end) luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]") ")"); if (*(p++) == L_ESC && p < ms->p_end) p++; /* skip escapes (e.g. `%]') */ } while (*p != ']'); return p+1; } default: { return p; } } } static int match_class (utfint c, utfint cl) { int res; switch (utf8_tolower(cl)) { #define X(cls, name) case cls: res = utf8_is##name(c); break; utf8_categories(X) #undef X case 'g' : res = utf8_isgraph(c); break; case 'w' : res = utf8_isalnum(c); break; case 'z' : res = (c == 0); break; /* deprecated option */ default: return (cl == c); } return (utf8_islower(cl) ? res : !res); } static int matchbracketclass (MatchState *ms, utfint c, const char *p, const char *ec) { int sig = 1; assert(*p == '['); if (*++p == '^') { sig = 0; p++; /* skip the `^' */ } while (p < ec) { utfint ch = 0; p = utf8_safe_decode(ms->L, p, &ch); if (ch == L_ESC) { p = utf8_safe_decode(ms->L, p, &ch); if (match_class(c, ch)) return sig; } else { utfint next = 0; const char *np = utf8_safe_decode(ms->L, p, &next); if (next == '-' && np < ec) { p = utf8_safe_decode(ms->L, np, &next); if (ch <= c && c <= next) return sig; } else if (ch == c) return sig; } } return !sig; } static int singlematch (MatchState *ms, const char *s, const char *p, const char *ep) { if (s >= ms->src_end) return 0; else { utfint ch=0, pch=0; utf8_safe_decode(ms->L, s, &ch); p = utf8_safe_decode(ms->L, p, &pch); switch (pch) { case '.': return 1; /* matches any char */ case L_ESC: utf8_safe_decode(ms->L, p, &pch); return match_class(ch, pch); case '[': return matchbracketclass(ms, ch, p-1, ep-1); default: return pch == ch; } } } static const char *matchbalance (MatchState *ms, const char *s, const char **p) { utfint ch=0, begin=0, end=0; *p = utf8_safe_decode(ms->L, *p, &begin); if (*p >= ms->p_end) luaL_error(ms->L, "malformed pattern " "(missing arguments to " LUA_QL("%%b") ")"); *p = utf8_safe_decode(ms->L, *p, &end); s = utf8_safe_decode(ms->L, s, &ch); if (ch != begin) return NULL; else { int cont = 1; while (s < ms->src_end) { s = utf8_safe_decode(ms->L, s, &ch); if (ch == end) { if (--cont == 0) return s; } else if (ch == begin) cont++; } } return NULL; /* string ends out of balance */ } static const char *max_expand (MatchState *ms, const char *s, const char *p, const char *ep) { const char *m = s; /* matched end of single match p */ while (singlematch(ms, m, p, ep)) m = utf8_next(m, ms->src_end); /* keeps trying to match with the maximum repetitions */ while (s <= m) { const char *res = match(ms, m, ep+1); if (res) return res; /* else didn't match; reduce 1 repetition to try again */ if (s == m) break; m = utf8_prev(s, m); } return NULL; } static const char *min_expand (MatchState *ms, const char *s, const char *p, const char *ep) { for (;;) { const char *res = match(ms, s, ep+1); if (res != NULL) return res; else if (singlematch(ms, s, p, ep)) s = utf8_next(s, ms->src_end); /* try with one more repetition */ else return NULL; } } static const char *start_capture (MatchState *ms, const char *s, const char *p, int what) { const char *res; int level = ms->level; if (level >= LUA_MAXCAPTURES) luaL_error(ms->L, "too many captures"); ms->capture[level].init = s; ms->capture[level].len = what; ms->level = level+1; if ((res=match(ms, s, p)) == NULL) /* match failed? */ ms->level--; /* undo capture */ return res; } static const char *end_capture (MatchState *ms, const char *s, const char *p) { int l = capture_to_close(ms); const char *res; ms->capture[l].len = s - ms->capture[l].init; /* close capture */ if ((res = match(ms, s, p)) == NULL) /* match failed? */ ms->capture[l].len = CAP_UNFINISHED; /* undo capture */ return res; } static const char *match_capture (MatchState *ms, const char *s, int l) { size_t len; l = check_capture(ms, l); len = ms->capture[l].len; if ((size_t)(ms->src_end-s) >= len && memcmp(ms->capture[l].init, s, len) == 0) return s+len; else return NULL; } static const char *match (MatchState *ms, const char *s, const char *p) { if (ms->matchdepth-- == 0) luaL_error(ms->L, "pattern too complex"); init: /* using goto's to optimize tail recursion */ if (p != ms->p_end) { /* end of pattern? */ utfint ch = 0; utf8_safe_decode(ms->L, p, &ch); switch (ch) { case '(': { /* start capture */ if (*(p + 1) == ')') /* position capture? */ s = start_capture(ms, s, p + 2, CAP_POSITION); else s = start_capture(ms, s, p + 1, CAP_UNFINISHED); break; } case ')': { /* end capture */ s = end_capture(ms, s, p + 1); break; } case '$': { if ((p + 1) != ms->p_end) /* is the `$' the last char in pattern? */ goto dflt; /* no; go to default */ s = (s == ms->src_end) ? s : NULL; /* check end of string */ break; } case L_ESC: { /* escaped sequence not in the format class[*+?-]? */ const char *prev_p = p; p = utf8_safe_decode(ms->L, p+1, &ch); switch (ch) { case 'b': { /* balanced string? */ s = matchbalance(ms, s, &p); if (s != NULL) goto init; /* return match(ms, s, p + 4); */ /* else fail (s == NULL) */ break; } case 'f': { /* frontier? */ const char *ep; utfint previous = 0, current = 0; if (*p != '[') luaL_error(ms->L, "missing " LUA_QL("[") " after " LUA_QL("%%f") " in pattern"); ep = classend(ms, p); /* points to what is next */ if (s != ms->src_init) utf8_decode(utf8_prev(ms->src_init, s), &previous, 0); if (s != ms->src_end) utf8_decode(s, ¤t, 0); if (!matchbracketclass(ms, previous, p, ep - 1) && matchbracketclass(ms, current, p, ep - 1)) { p = ep; goto init; /* return match(ms, s, ep); */ } s = NULL; /* match failed */ break; } case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { /* capture results (%0-%9)? */ s = match_capture(ms, s, ch); if (s != NULL) goto init; /* return match(ms, s, p + 2) */ break; } default: p = prev_p; goto dflt; } break; } default: dflt: { /* pattern class plus optional suffix */ const char *ep = classend(ms, p); /* points to optional suffix */ /* does not match at least once? */ if (!singlematch(ms, s, p, ep)) { if (*ep == '*' || *ep == '?' || *ep == '-') { /* accept empty? */ p = ep + 1; goto init; /* return match(ms, s, ep + 1); */ } else /* '+' or no suffix */ s = NULL; /* fail */ } else { /* matched once */ const char *next_s = utf8_next(s, ms->src_end); switch (*ep) { /* handle optional suffix */ case '?': { /* optional */ const char *res; const char *next_ep = utf8_next(ep, ms->p_end); if ((res = match(ms, next_s, next_ep)) != NULL) s = res; else { p = next_ep; goto init; /* else return match(ms, s, ep + 1); */ } break; } case '+': /* 1 or more repetitions */ s = next_s; /* 1 match already done */ /* fall through */ case '*': /* 0 or more repetitions */ s = max_expand(ms, s, p, ep); break; case '-': /* 0 or more repetitions (minimum) */ s = min_expand(ms, s, p, ep); break; default: /* no suffix */ s = next_s; p = ep; goto init; /* return match(ms, s + 1, ep); */ } } break; } } } ms->matchdepth++; return s; } static const char *lmemfind (const char *s1, size_t l1, const char *s2, size_t l2) { if (l2 == 0) return s1; /* empty strings are everywhere */ else if (l2 > l1) return NULL; /* avoids a negative `l1' */ else { const char *init; /* to search for a `*s2' inside `s1' */ l2--; /* 1st char will be checked by `memchr' */ l1 = l1-l2; /* `s2' cannot be found after that */ while (l1 > 0 && (init = (const char *)memchr(s1, *s2, l1)) != NULL) { init++; /* 1st char is already checked */ if (memcmp(init, s2+1, l2) == 0) return init-1; else { /* correct `l1' and `s1' to try again */ l1 -= init-s1; s1 = init; } } return NULL; /* not found */ } } static int get_index (const char *p, const char *s, const char *e) { int idx; for (idx = 0; s < e && s < p; ++idx) s = utf8_next(s, e); return s == p ? idx : idx - 1; } static void push_onecapture (MatchState *ms, int i, const char *s, const char *e) { if (i >= ms->level) { if (i == 0) /* ms->level == 0, too */ lua_pushlstring(ms->L, s, e - s); /* add whole match */ else luaL_error(ms->L, "invalid capture index"); } else { ptrdiff_t l = ms->capture[i].len; if (l == CAP_UNFINISHED) luaL_error(ms->L, "unfinished capture"); if (l == CAP_POSITION) { int idx = get_index(ms->capture[i].init, ms->src_init, ms->src_end); lua_pushinteger(ms->L, idx+1); } else lua_pushlstring(ms->L, ms->capture[i].init, l); } } static int push_captures (MatchState *ms, const char *s, const char *e) { int i; int nlevels = (ms->level == 0 && s) ? 1 : ms->level; luaL_checkstack(ms->L, nlevels, "too many captures"); for (i = 0; i < nlevels; i++) push_onecapture(ms, i, s, e); return nlevels; /* number of strings pushed */ } /* check whether pattern has no special characters */ static int nospecials (const char *p, const char * ep) { while (p < ep) { if (strpbrk(p, SPECIALS)) return 0; /* pattern has a special character */ p += strlen(p) + 1; /* may have more after \0 */ } return 1; /* no special chars found */ } /* utf8 pattern matching interface */ static int find_aux (lua_State *L, int find) { const char *es, *s = check_utf8(L, 1, &es); const char *ep, *p = check_utf8(L, 2, &ep); lua_Integer idx = luaL_optinteger(L, 3, 1); const char *init; if (!idx) idx = 1; init = utf8_relat(s, es, CAST(int, idx)); if (init == NULL) { if (idx > 0) { lua_pushnil(L); /* cannot find anything */ return 1; } init = s; } /* explicit request or no special characters? */ if (find && (lua_toboolean(L, 4) || nospecials(p, ep))) { /* do a plain search */ const char *s2 = lmemfind(init, es-init, p, ep-p); if (s2) { const char *e2 = s2 + (ep - p); if (iscont(e2)) e2 = utf8_next(e2, es); lua_pushinteger(L, idx = get_index(s2, s, es) + 1); lua_pushinteger(L, idx + get_index(e2, s2, es) - 1); return 2; } } else { MatchState ms; int anchor = (*p == '^'); if (anchor) p++; /* skip anchor character */ if (idx < 0) idx += utf8_length(s, es)+1; /* TODO not very good */ ms.L = L; ms.matchdepth = MAXCCALLS; ms.src_init = s; ms.src_end = es; ms.p_end = ep; do { const char *res; ms.level = 0; assert(ms.matchdepth == MAXCCALLS); if ((res=match(&ms, init, p)) != NULL) { if (find) { lua_pushinteger(L, idx); /* start */ lua_pushinteger(L, idx + utf8_length(init, res) - 1); /* end */ return push_captures(&ms, NULL, 0) + 2; } else return push_captures(&ms, init, res); } if (init == es) break; idx += 1; init = utf8_next(init, es); } while (init <= es && !anchor); } lua_pushnil(L); /* not found */ return 1; } static int Lutf8_find (lua_State *L) { return find_aux(L, 1); } static int Lutf8_match (lua_State *L) { return find_aux(L, 0); } static int gmatch_aux (lua_State *L) { MatchState ms; const char *es, *s = check_utf8(L, lua_upvalueindex(1), &es); const char *ep, *p = check_utf8(L, lua_upvalueindex(2), &ep); const char *src; ms.L = L; ms.matchdepth = MAXCCALLS; ms.src_init = s; ms.src_end = es; ms.p_end = ep; for (src = s + (size_t)lua_tointeger(L, lua_upvalueindex(3)); src <= ms.src_end; src = utf8_next(src, ms.src_end)) { const char *e; ms.level = 0; assert(ms.matchdepth == MAXCCALLS); if ((e = match(&ms, src, p)) != NULL) { lua_Integer newstart = e-s; if (e == src) newstart++; /* empty match? go at least one position */ lua_pushinteger(L, newstart); lua_replace(L, lua_upvalueindex(3)); return push_captures(&ms, src, e); } if (src == ms.src_end) break; } return 0; /* not found */ } static int Lutf8_gmatch (lua_State *L) { luaL_checkstring(L, 1); luaL_checkstring(L, 2); lua_settop(L, 2); lua_pushinteger(L, 0); lua_pushcclosure(L, gmatch_aux, 3); return 1; } static void add_s (MatchState *ms, luaL_Buffer *b, const char *s, const char *e) { const char *new_end, *news = to_utf8(ms->L, 3, &new_end); while (news < new_end) { utfint ch = 0; news = utf8_safe_decode(ms->L, news, &ch); if (ch != L_ESC) add_utf8char(b, ch); else { news = utf8_safe_decode(ms->L, news, &ch); /* skip ESC */ if (!utf8_isdigit(ch)) { if (ch != L_ESC) luaL_error(ms->L, "invalid use of " LUA_QL("%c") " in replacement string", L_ESC); add_utf8char(b, ch); } else if (ch == '0') luaL_addlstring(b, s, e-s); else { push_onecapture(ms, ch-'1', s, e); luaL_addvalue(b); /* add capture to accumulated result */ } } } } static void add_value (MatchState *ms, luaL_Buffer *b, const char *s, const char *e, int tr) { lua_State *L = ms->L; switch (tr) { case LUA_TFUNCTION: { int n; lua_pushvalue(L, 3); n = push_captures(ms, s, e); lua_call(L, n, 1); break; } case LUA_TTABLE: { push_onecapture(ms, 0, s, e); lua_gettable(L, 3); break; } default: { /* LUA_TNUMBER or LUA_TSTRING */ add_s(ms, b, s, e); return; } } if (!lua_toboolean(L, -1)) { /* nil or false? */ lua_pop(L, 1); lua_pushlstring(L, s, e - s); /* keep original text */ } else if (!lua_isstring(L, -1)) luaL_error(L, "invalid replacement value (a %s)", luaL_typename(L, -1)); luaL_addvalue(b); /* add result to accumulator */ } static int Lutf8_gsub (lua_State *L) { const char *es, *s = check_utf8(L, 1, &es); const char *ep, *p = check_utf8(L, 2, &ep); int tr = lua_type(L, 3); lua_Integer max_s = luaL_optinteger(L, 4, (es-s)+1); int anchor = (*p == '^'); lua_Integer n = 0; MatchState ms; luaL_Buffer b; luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING || tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3, "string/function/table expected"); luaL_buffinit(L, &b); if (anchor) p++; /* skip anchor character */ ms.L = L; ms.matchdepth = MAXCCALLS; ms.src_init = s; ms.src_end = es; ms.p_end = ep; while (n < max_s) { const char *e; ms.level = 0; assert(ms.matchdepth == MAXCCALLS); e = match(&ms, s, p); if (e) { n++; add_value(&ms, &b, s, e, tr); } if (e && e > s) /* non empty match? */ s = e; /* skip it */ else if (s < es) { utfint ch = 0; s = utf8_safe_decode(L, s, &ch); add_utf8char(&b, ch); } else break; if (anchor) break; } luaL_addlstring(&b, s, es-s); luaL_pushresult(&b); lua_pushinteger(L, n); /* number of substitutions */ return 2; } /* lua module import interface */ #if LUA_VERSION_NUM >= 502 static const char UTF8PATT[] = "[\0-\x7F\xC2-\xF4][\x80-\xBF]*"; #else static const char UTF8PATT[] = "[%z\1-\x7F\xC2-\xF4][\x80-\xBF]*"; #endif int luaopen_utf8extra (lua_State *L) { luaL_Reg libs[] = { #define ENTRY(name) { #name, Lutf8_##name } ENTRY(offset), ENTRY(codes), ENTRY(codepoint), ENTRY(len), ENTRY(sub), ENTRY(reverse), ENTRY(lower), ENTRY(upper), ENTRY(title), ENTRY(fold), ENTRY(byte), ENTRY(char), ENTRY(escape), ENTRY(insert), ENTRY(remove), ENTRY(charpos), ENTRY(next), ENTRY(width), ENTRY(widthindex), ENTRY(ncasecmp), ENTRY(find), ENTRY(gmatch), ENTRY(gsub), ENTRY(match), #undef ENTRY { NULL, NULL } }; luaL_newlib(L, libs); lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT)-1); lua_setfield(L, -2, "charpattern"); return 1; }