Added codesets support for encoding switch

This commit is contained in:
George Sokianos 2023-12-18 17:19:13 +00:00
parent e0b5f56faa
commit 1b00045146
9 changed files with 468 additions and 22 deletions

View File

@ -9,7 +9,7 @@ LiteXL_OBJ := \
src/api/api.o src/api/dirmonitor.o \
src/api/regex.o src/api/renderer.o src/api/system.o \
src/api/utf8.o src/platform/amigaos4.o \
src/api/dirmonitor/os4.o
src/api/dirmonitor/os4.o src/platform/codesets.o
outfile := lite-xl
compiler := gcc-11
@ -22,7 +22,7 @@ DFLAGS += -D__USE_INLINE__ -DLITE_XL_DATA_USE_EXEDIR
CFLAGS += -Werror -Wwrite-strings -O3 -std=gnu11 -fno-strict-aliasing
LFLAGS += -mcrt=newlib -lauto \
LFLAGS += -mcrt=newlib \
-lpcre2 -lSDL2 -llua54 -lfreetype -lz -lm -lpthread -athread=native
ifeq ($(DEBUG),1)
@ -56,7 +56,7 @@ LiteXL: $(LiteXL_OBJ)
src/main.o: src/main.c src/api/api.h src/rencache.h \
src/renderer.h src/platform/amigaos4.h
src/renderer.h src/platform/amigaos4.h src/platform/codesets.h
src/rencache.o: src/rencache.c
@ -70,13 +70,15 @@ src/api/regex.o: src/api/regex.c
src/api/renderer.o: src/api/renderer.c
src/api/system.o: src/api/system.c
src/api/system.o: src/api/system.c src/platform/amigaos4.h
src/platform/amigaos4.o: src/platform/amigaos4.c
src/platform/codesets.o: src/platform/codesets.c
src/api/dirmonitor.o: src/api/dirmonitor.c src/api/dirmonitor/os4.c
src/api/utf8.o: src/api/utf8.c
src/api/utf8.o: src/api/utf8.c src/platform/amigaos4.h
src/api/dirmonitor/os4.o: src/api/dirmonitor/os4.c

View File

@ -446,7 +446,7 @@ function DocView:draw_line_text(line, x, y)
local last_token = nil
local tokens = self.doc.highlighter:get_line(line).tokens
local tokens_count = #tokens
if string.sub(tokens[tokens_count], -1) == "\n" then
if tokens[tokens_count] ~= nil and string.sub(tokens[tokens_count], -1) == "\n" then
last_token = tokens_count - 1
end
for tidx, type, text in self.doc.highlighter:each_token(line) do

View File

@ -1,5 +1,5 @@
-- this file is used by lite-xl to setup the Lua environment when starting
VERSION = "@PROJECT_VERSION@"
VERSION = "2.1.1r3"
MOD_VERSION_MAJOR = 3
MOD_VERSION_MINOR = 0
MOD_VERSION_PATCH = 0

View File

@ -263,7 +263,8 @@ function tokenizer.tokenize(incoming_syntax, text, state, resume)
local text_len = text:ulen()
local start_time = system.get_time()
local starting_i = i
while i <= text_len do
while text_len ~= nil and i <= text_len do
-- Every 200 chars, check if we're out of time
if i - starting_i > 200 then
starting_i = i
@ -301,11 +302,9 @@ function tokenizer.tokenize(incoming_syntax, text, state, resume)
cont = false
end
end
-- General end of syntax check. Applies in the case where
-- we're ending early in the middle of a delimiter, or
-- just normally, upon finding a token.
if subsyntax_info then
local s, e = find_text(text, subsyntax_info, i, true, true)
-- If we don't have any concerns about syntax delimiters,
-- continue on as normal.
if cont then
if s then
push_token(res, token_type, text:usub(i, e))
set_subsyntax_pattern_idx(0)
@ -332,12 +331,47 @@ function tokenizer.tokenize(incoming_syntax, text, state, resume)
end
end
-- consume character if we didn't match
if not matched then
push_token(res, "normal", text:usub(i, i))
i = i + 1
-- find matching pattern
local matched = false
for n, p in ipairs(current_syntax.patterns) do
local find_results = { find_text(text, p, i, true, false) }
if find_results[1] then
local type_is_table = type(p.type) == "table"
local n_types = type_is_table and #p.type or 1
if #find_results == 2 and type_is_table then
report_bad_pattern(core.warn, current_syntax, n,
"Token type is a table, but a string was expected.")
p.type = p.type[1]
elseif #find_results - 1 > n_types then
report_bad_pattern(core.error, current_syntax, n,
"Not enough token types: got %d needed %d.", n_types, #find_results - 1)
elseif #find_results - 1 < n_types then
report_bad_pattern(core.warn, current_syntax, n,
"Too many token types: got %d needed %d.", n_types, #find_results - 1)
end
-- matched pattern; make and add tokens
push_tokens(res, current_syntax, p, text, find_results)
-- update state if this was a start|end pattern pair
if type(p.pattern or p.regex) == "table" then
-- If we have a subsyntax, push that onto the subsyntax stack.
if p.syntax then
push_subsyntax(p, n)
else
set_subsyntax_pattern_idx(n)
end
end
-- move cursor past this token
i = find_results[2] + 1
matched = true
break
end
end
-- consume character if we didn't match
if not matched then
push_token(res, "normal", text:usub(i, i))
i = i + 1
end
end
return res, state

View File

@ -7,6 +7,10 @@ int luaopen_regex(lua_State *L);
int luaopen_dirmonitor(lua_State* L);
int luaopen_utf8extra(lua_State* L);
#if defined(__amigaos4__)
int luaopen_codesets(lua_State* L);
#endif
static const luaL_Reg libs[] = {
{ "system", luaopen_system },
{ "renderer", luaopen_renderer },
@ -14,6 +18,9 @@ static const luaL_Reg libs[] = {
// { "process", luaopen_process },
{ "dirmonitor", luaopen_dirmonitor },
{ "utf8extra", luaopen_utf8extra },
#if defined(__amigaos4__)
{ "codesetsextra", luaopen_codesets },
#endif
{ NULL, NULL }
};
@ -22,4 +29,3 @@ void api_load_libs(lua_State *L) {
for (int i = 0; libs[i].name; i++)
luaL_requiref(L, libs[i].name, libs[i].func, 1);
}

View File

@ -8,7 +8,7 @@
#include <signal.h>
#if defined(__amigaos4__) || defined(__morphos__)
#define VSTRING "Lite XL 2.1.1r1 (29.01.2023)"
#define VSTRING "Lite XL 2.1.1r3 (07.08.2023)"
#define VERSTAG "\0$VER: " VSTRING
#endif
@ -20,6 +20,7 @@
#include <mach-o/dyld.h>
#elif defined(__amigaos4__)
#include <locale.h>
#include "platform/codesets.h"
#include "platform/amigaos4.h"
static CONST_STRPTR stack USED = "$STACK:102400";
static CONST_STRPTR version USED = VERSTAG;
@ -154,10 +155,19 @@ void set_macos_bundle_resources(lua_State *L);
#endif
int main(int argc, char **argv) {
#if defined(__amigaos4__) || defined(__morphos__)
setlocale(LC_ALL, "C");
#endif
#if defined(__amigaos4__)
OpenLibs();
#endif
#ifndef _WIN32
signal(SIGPIPE, SIG_IGN);
#endif
if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_EVENTS) != 0) {
fprintf(stderr, "Error initializing sdl: %s", SDL_GetError());
exit(1);
@ -262,6 +272,9 @@ init_lua:
" HOME = os.getenv('" LITE_OS_HOME "')\n"
" local exedir = match(EXEFILE, '^(.*)" LITE_PATHSEP_PATTERN LITE_NONPATHSEP_PATTERN "$')\n"
" local prefix = os.getenv('LITE_PREFIX') or match(exedir, '^(.*)" LITE_PATHSEP_PATTERN "bin$')\n"
" if not HOME then\n"
" HOME = exedir\n"
" end\n"
" dofile((MACOS_RESOURCES or (prefix and prefix .. '/share/lite-xl' or exedir .. '/data')) .. '/core/start.lua')\n"
" core = require(os.getenv('LITE_XL_RUNTIME') or 'core')\n"
" core.init()\n"
@ -304,5 +317,8 @@ init_lua:
ren_free_window_resources(&window_renderer);
lua_close(L);
#if defined(__amigaos4__)
CleanExit("JustExit");
#endif
return EXIT_SUCCESS;
}

377
src/platform/codesets.c Normal file
View File

@ -0,0 +1,377 @@
/* This code is responsible for changing text encodings
* using codesets.library. It requires the codesets
* plugin to work.
*
* Heavily inspired by the encoding plugin
* https://github.com/jgmdev/lite-xl-encoding
*/
#include <SDL2/SDL_stdinc.h>
#include <stdbool.h>
#include <lua.h>
#include <lauxlib.h>
#include <lualib.h>
#include "codesets.h"
struct Library *CodesetsBase = NULL;
struct CodesetsIFace *ICodesets = NULL;
/* Associates a charset name with its byte order mark (BOM) byte sequence. */
typedef struct {
const char* charset; /* charset name; NULL in the terminating entry of bom_list */
unsigned char bom[4]; /* BOM bytes; only the first `len` bytes are meaningful */
int len; /* number of valid bytes in `bom` */
} bom_t;
/*
* List of encodings that can have byte order marks, terminated by an
* entry whose charset is NULL.
* Note: the order matters because lookups return the first matching entry.
* The UTF-32LE mark (ff fe 00 00) begins with the UTF-16LE mark (ff fe),
* so UTF-32 should be tested before UTF-16.
* UTF-7 is listed four times; the entries differ only in the fourth byte.
*/
static bom_t bom_list[] = {
{ "UTF-8", {0xef, 0xbb, 0xbf}, 3 },
{ "UTF-32LE", {0xff, 0xfe, 0x00, 0x00}, 4 },
{ "UTF-32BE", {0x00, 0x00, 0xfe, 0xff}, 4 },
{ "UTF-16LE", {0xff, 0xfe}, 2 },
{ "UTF-16BE", {0xfe, 0xff}, 2 },
{ "GB18030", {0x84, 0x31, 0x95, 0x33}, 4 },
{ "UTF-7", {0x2b, 0x2f, 0x76, 0x38}, 4 },
{ "UTF-7", {0x2b, 0x2f, 0x76, 0x39}, 4 },
{ "UTF-7", {0x2b, 0x2f, 0x76, 0x2b}, 4 },
{ "UTF-7", {0x2b, 0x2f, 0x76, 0x2f}, 4 },
{ NULL }
};
/* Get the applicable byte order marks for the given charset */
static const unsigned char* encoding_bom_from_charset(const char* charset, size_t* len) {
for (size_t i=0; bom_list[i].charset != NULL; i++){
if (strcmp(bom_list[i].charset, charset) == 0) {
if (len) *len = bom_list[i].len;
return bom_list[i].bom;
}
}
if (len) *len = 0;
return NULL;
}
/* Detect the encoding of the given string if a valid bom sequence is found */
static const char* encoding_charset_from_bom(
const char* string, size_t len, size_t* bom_len
) {
const unsigned char* bytes = (unsigned char*) string;
for (size_t i=0; bom_list[i].charset != NULL; i++) {
if (len >= bom_list[i].len) {
bool all_match = true;
for (size_t b = 0; b<bom_list[i].len; b++) {
if (bytes[b] != bom_list[i].bom[b]) {
all_match = false;
break;
}
}
if (all_match) {
if (bom_len) *bom_len = bom_list[i].len;
return bom_list[i].charset;
}
}
}
if (bom_len)
*bom_len = 0;
return NULL;
}
/////////////////////////////////////////////////////////////////////
// Lua methods for codesets
/*
 * detect(filename)
 *
 * Read the whole file and ask codesets.library for the best matching codeset.
 *
 * Arguments:
 *  filename, path of the file to inspect
 *
 * Returns:
 *  The codeset name on success, or nil plus an error message when the file
 *  cannot be opened, sized, read or loaded into memory, or when no codeset
 *  could be determined.
 */
int Lcodesets_detect(lua_State *L) {
  const char* filename = luaL_checkstring(L, 1);

  BPTR fileHandle = FOpen(filename, MODE_OLDFILE, 0);
  if (!fileHandle) {
    lua_pushnil(L);
    lua_pushfstring(L, "unable to open file '%s', code=%d", filename, (int)IoErr());
    return 2;
  }

  int64 fileSize = GetFileSize(fileHandle);
  if (fileSize < 0) {
    /* fetch the error code before FClose() can overwrite it */
    int sizeErr = (int)IoErr();
    FClose(fileHandle);
    lua_pushnil(L);
    lua_pushfstring(L, "unable to get the size of file '%s', code=%d", filename, sizeErr);
    return 2;
  }

  /*
   * One extra byte keeps the buffer NUL-terminated for the library and also
   * makes sure an empty file does not turn into malloc(0). Sizes that do not
   * fit the 32-bit length taken by Read() are reported as out of memory.
   */
  STRPTR fileText = NULL;
  if (fileSize < 0x7fffffff)
    fileText = malloc((size_t)fileSize + 1);
  if (!fileText) {
    FClose(fileHandle);
    lua_pushnil(L);
    lua_pushfstring(L, "out of ram while detecting charset of '%s'", filename);
    return 2;
  }

  LONG bytesRead = Read(fileHandle, fileText, (LONG)fileSize);
  int readErr = (bytesRead < 0) ? (int)IoErr() : 0;
  FClose(fileHandle);

  if (bytesRead < 0) {
    free(fileText);
    lua_pushnil(L);
    lua_pushfstring(L, "unable to read file '%s', code=%d", filename, readErr);
    return 2;
  }
  fileText[bytesRead] = '\0';

  /*
   * Pass the explicit length: without CSA_SourceLen the library has to rely
   * on a terminating NUL, which raw file contents do not guarantee.
   */
  ULONG errNum = 0;
  struct codeset *cs = CodesetsFindBest(CSA_Source, fileText,
                                        CSA_SourceLen, (ULONG)bytesRead,
                                        CSA_ErrPtr, &errNum,
                                        TAG_DONE);
  free(fileText);

  if (!cs) {
    lua_pushnil(L);
    lua_pushstring(L, "could not detect the file encoding");
    return 2;
  }

  lua_pushstring(L, cs->name);
  return 1;
}
/*
 * systemCodeset()
 *
 * Ask codesets.library for a codeset without naming one or passing any tags
 * and return its name.
 *
 * Returns:
 *  The codeset name, or nil plus an error message when the library returns
 *  no codeset.
 */
int Lcodesets_systemCodeset(lua_State *L) {
  struct codeset *systemCodeset = CodesetsFindA(NULL, NULL);

  /* guard against a NULL result instead of dereferencing it */
  if (!systemCodeset) {
    lua_pushnil(L);
    lua_pushstring(L, "could not determine the system codeset");
    return 2;
  }

  lua_pushstring(L, systemCodeset->name);
  return 1;
}
/*
 * convert(to, from, text, options)
 *
 * Convert `text` from the `from` codeset to the `to` codeset using
 * codesets.library.
 *
 * Arguments:
 *  to, name of the destination codeset
 *  from, name of the source codeset
 *  text, the string to convert (may contain embedded NUL bytes)
 *  options, optional table with the boolean fields:
 *    handle_from_bom, skip a byte order mark found at the start of `text`
 *    handle_to_bom, prepend the byte order mark of `to` to the result
 *    strict, accepted for compatibility; it has no effect here
 *
 * Returns:
 *  The converted string, or nil plus an error message when a codeset cannot
 *  be found or the conversion fails.
 */
int Lcodesets_convert(lua_State *L) {
  const char* to = luaL_checkstring(L, 1);
  const char* from = luaL_checkstring(L, 2);
  size_t text_len = 0;
  const char* text = luaL_checklstring(L, 3, &text_len);

  /* conversion options */
  bool handle_to_bom = false;
  bool handle_from_bom = false;

  if (lua_gettop(L) > 3 && lua_istable(L, 4)) {
    lua_getfield(L, 4, "handle_to_bom");
    if (lua_isboolean(L, -1))
      handle_to_bom = lua_toboolean(L, -1);

    lua_getfield(L, 4, "handle_from_bom");
    if (lua_isboolean(L, -1))
      handle_from_bom = lua_toboolean(L, -1);

    lua_pop(L, 2);
  }

  /* skip the bom of the input text if any, so it is not converted as data */
  size_t from_bom_len = 0;
  if (handle_from_bom)
    encoding_charset_from_bom(text, text_len, &from_bom_len);

  const char* source = text + from_bom_len;
  size_t source_len = text_len - from_bom_len;

  struct codeset *srcCodeset =
    CodesetsFind(from, CSA_FallbackToDefault, FALSE, TAG_DONE);
  if (!srcCodeset) {
    lua_pushnil(L);
    lua_pushfstring(L, "failed creating source codeset for '%s'", from);
    return 2;
  }

  struct codeset *destCodeset =
    CodesetsFind(to, CSA_FallbackToDefault, FALSE, TAG_DONE);
  if (!destCodeset) {
    lua_pushnil(L);
    lua_pushfstring(L, "failed creating destination codeset for '%s'", to);
    return 2;
  }

  /*
   * The source length is passed explicitly: Lua strings may contain NUL
   * bytes (UTF-16/UTF-32 text always does), so the library must not have to
   * find the end of the input by looking for a terminator.
   */
  ULONG output_len = 0;
  char *output = CodesetsConvertStr(CSA_SourceCodeset, srcCodeset,
                                    CSA_DestCodeset, destCodeset,
                                    CSA_Source, source,
                                    CSA_SourceLen, (ULONG)source_len,
                                    CSA_DestLenPtr, &output_len,
                                    TAG_DONE);
  if (!output) {
    lua_pushnil(L);
    lua_pushfstring(L, "failed converting from '%s' to '%s'", from, to);
    return 2;
  }

  /*
   * `output` belongs to codesets.library and is released with
   * CodesetsFreeA(), so it is never resized with realloc(); the bom handling
   * below works on a view into the buffer instead.
   */
  const char* result = output;
  size_t result_len = output_len;

  /* strip bom sometimes added when converting to utf-8, we don't need it */
  if (strcmp(to, "UTF-8") == 0) {
    size_t out_bom_len = 0;
    const char* detected =
      encoding_charset_from_bom(result, result_len, &out_bom_len);
    /* only the UTF-8 mark counts; "+/v8" and friends are ordinary text here */
    if (detected && strcmp(detected, "UTF-8") == 0) {
      result += out_bom_len;
      result_len -= out_bom_len;
    }
  }

  size_t to_bom_len = 0;
  const unsigned char* bom =
    handle_to_bom ? encoding_bom_from_charset(to, &to_bom_len) : NULL;

  if (bom != NULL) {
    /* build "bom .. result" on the Lua side instead of growing `output` */
    lua_pushlstring(L, (const char*)bom, to_bom_len);
    lua_pushlstring(L, result, result_len);
    lua_concat(L, 2);
  } else {
    lua_pushlstring(L, result, result_len);
  }

  CodesetsFreeA(output, NULL);
  return 1;
}
/*
 * get_charset_bom(charset)
 *
 * Look up the byte order mark sequence that belongs to a charset.
 *
 * Arguments:
 *  charset, the charset name; it must match a bom_list entry exactly
 *
 * Returns:
 *  The bom sequence as a string, or an empty string when the charset has
 *  no registered byte order mark.
 */
int Lcodesets_get_charset_bom(lua_State *L) {
  size_t mark_len = 0;
  const unsigned char* mark =
    encoding_bom_from_charset(luaL_checkstring(L, 1), &mark_len);

  if (mark == NULL) {
    lua_pushstring(L, "");
    return 1;
  }

  lua_pushlstring(L, (char*)mark, mark_len);
  return 1;
}
/*
 * strip_bom(text, charset)
 *
 * Remove the byte order mark from the start of the given string.
 *
 * Arguments:
 *  text, a string that may start with a byte order mark to be removed.
 *  charset, optional charset to scan for; when omitted every charset in
 *  bom_list is tried and the first match wins.
 *
 * Returns:
 *  The input text with the leading byte order mark removed if one was
 *  found. A text that consists of nothing but the mark yields an empty
 *  string.
 */
int Lcodesets_strip_bom(lua_State* L) {
  size_t text_len = 0;
  const char* text = luaL_checklstring(L, 1, &text_len);
  const char* charset = luaL_optstring(L, 2, NULL);
  size_t bom_len = 0;

  if (charset == NULL) {
    encoding_charset_from_bom(text, text_len, &bom_len);
  } else {
    /* a charset can own several marks (UTF-7), so test every entry for it */
    for (size_t i = 0; bom_list[i].charset != NULL; i++) {
      size_t mark_len = bom_list[i].len;

      if (strcmp(bom_list[i].charset, charset) != 0 || text_len < mark_len)
        continue;

      if (memcmp(text, bom_list[i].bom, mark_len) == 0) {
        bom_len = mark_len;
        break;
      }
    }
  }

  /*
   * bom_len never exceeds text_len, so this also covers the empty input
   * (pushes "") and the text that is only a mark (pushes "" as well).
   */
  lua_pushlstring(L, text + bom_len, text_len - bom_len);
  return 1;
}
/*
 * Module opener: creates a new table holding the codesets functions and
 * leaves it on the Lua stack as the single return value.
 */
int luaopen_codesets (lua_State *L) {
  static const luaL_Reg libs[] = {
    { "detect",          Lcodesets_detect },
    { "systemCodeset",   Lcodesets_systemCodeset },
    { "convert",         Lcodesets_convert },
    { "get_charset_bom", Lcodesets_get_charset_bom },
    { "strip_bom",       Lcodesets_strip_bom },
    { NULL, NULL }
  };

  luaL_newlib(L, libs);
  return 1;
}
/*
 * Open codesets.library (version 6) and obtain its "main" interface,
 * storing both in the file-level CodesetsBase / ICodesets globals.
 *
 * Returns RETURN_OK on success. On failure the result of CleanExit() with
 * the matching error message is returned.
 */
int OpenLibs(void)
{
  CodesetsBase = OpenLibrary( "codesets.library", 6 );
  if (!CodesetsBase)
    return CleanExit("Can't open codesets.library version 6");

  ICodesets = (struct CodesetsIFace *)GetInterface( CodesetsBase, "main", 1L, NULL );
  if (!ICodesets)
    return CleanExit("Can't open codesets.library Interface");

  return RETURN_OK;
}
/*
 * Release the codesets.library interface and library base if they are held.
 *
 * Both globals are reset to NULL after being released, so calling this
 * function again (for example once after a failed OpenLibs() and once more
 * at program exit) does not release the same resources twice.
 *
 * Arguments:
 *  str, the literal "JustExit" for a normal shutdown, or an error message
 *  that is printed to stdout.
 *
 * Returns RETURN_OK for "JustExit", RETURN_ERROR otherwise.
 */
int CleanExit(const char *str)
{
  if (ICodesets) {
    DropInterface((struct Interface *) ICodesets);
    ICodesets = NULL;
  }

  if (CodesetsBase) {
    CloseLibrary(CodesetsBase);
    CodesetsBase = NULL;
  }

  if (strcmp(str, "JustExit") != 0) {
    printf("Error::%s\n", str);
    return RETURN_ERROR;
  }

  return RETURN_OK;
}

12
src/platform/codesets.h Normal file
View File

@ -0,0 +1,12 @@
/* Declarations for the codesets.library setup and teardown helpers. */
#ifndef _CODESETS_H
#define _CODESETS_H
#include <proto/dos.h>
#include <proto/exec.h>
#include <proto/codesets.h>
/* Opens codesets.library and its interface; returns RETURN_OK on success. */
int OpenLibs(void);
/*
 * Releases the codesets.library interface and base. Pass "JustExit" for a
 * normal shutdown (returns RETURN_OK); any other string is printed as an
 * error message and RETURN_ERROR is returned.
 */
int CleanExit(const char *);
#endif

View File

@ -236,7 +236,7 @@ static void font_file_close(FT_Stream stream) {
RenFont* ren_font_load(RenWindow *window_renderer, const char* path, float size, ERenFontAntialiasing antialiasing, ERenFontHinting hinting, unsigned char style) {
RenFont *font = NULL;
FT_Face face = NULL;
SDL_RWops *file = SDL_RWFromFile(path, "rb");
if (!file)
goto rwops_failure;
@ -478,8 +478,7 @@ void ren_draw_rect(RenSurface *rs, RenRect rect, RenColor color) {
if (color.a == 0xff) {
uint32_t translated = SDL_MapRGB(surface->format, color.r, color.g, color.b);
SDL_Rect rect = { x1, y1, x2 - x1, y2 - y1 };
SDL_FillRect(surface, &rect, translated);
SDL_FillRect(surface, &dest_rect, translated);
} else {
// Seems like SDL doesn't handle clipping as we expect when using
// scaled blitting, so we "clip" manually.