Added codesets support for encoding switch

2023-12-18 17:19:13 +00:00 · 2023-12-18 17:19:13 +00:00 · 1b00045146
parent e0b5f56faa
commit 1b00045146
9 changed files with 468 additions and 22 deletions
--- a/Makefile.os4
+++ b/Makefile.os4
@ -9,7 +9,7 @@ LiteXL_OBJ := \
 	 src/api/api.o src/api/dirmonitor.o \
 	 src/api/regex.o src/api/renderer.o src/api/system.o \
 	 src/api/utf8.o src/platform/amigaos4.o \
-	 src/api/dirmonitor/os4.o
+	 src/api/dirmonitor/os4.o src/platform/codesets.o

 outfile := lite-xl
 compiler := gcc-11
@ -22,7 +22,7 @@ DFLAGS += -D__USE_INLINE__ -DLITE_XL_DATA_USE_EXEDIR

 CFLAGS += -Werror -Wwrite-strings -O3 -std=gnu11 -fno-strict-aliasing

-LFLAGS += -mcrt=newlib -lauto \
+LFLAGS += -mcrt=newlib \
 	-lpcre2 -lSDL2 -llua54 -lfreetype -lz -lm -lpthread -athread=native

 ifeq ($(DEBUG),1)
@ -56,7 +56,7 @@ LiteXL: $(LiteXL_OBJ)


 src/main.o: src/main.c src/api/api.h src/rencache.h \
-	src/renderer.h src/platform/amigaos4.h
+	src/renderer.h src/platform/amigaos4.h src/platform/codesets.h

 src/rencache.o: src/rencache.c

@ -70,13 +70,15 @@ src/api/regex.o: src/api/regex.c

 src/api/renderer.o: src/api/renderer.c

-src/api/system.o: src/api/system.c
+src/api/system.o: src/api/system.c src/platform/amigaos4.h

 src/platform/amigaos4.o: src/platform/amigaos4.c

+src/platform/codesets.o: src/platform/codesets.c
+
 src/api/dirmonitor.o: src/api/dirmonitor.c src/api/dirmonitor/os4.c

-src/api/utf8.o: src/api/utf8.c
+src/api/utf8.o: src/api/utf8.c src/platform/amigaos4.h

 src/api/dirmonitor/os4.o: src/api/dirmonitor/os4.c

--- a/data/core/docview.lua
+++ b/data/core/docview.lua
@ -446,7 +446,7 @@ function DocView:draw_line_text(line, x, y)
  local last_token = nil
  local tokens = self.doc.highlighter:get_line(line).tokens
  local tokens_count = #tokens
-  if string.sub(tokens[tokens_count], -1) == "\n" then
+  if tokens[tokens_count] ~= nil and string.sub(tokens[tokens_count], -1) == "\n" then
    last_token = tokens_count - 1
  end
  for tidx, type, text in self.doc.highlighter:each_token(line) do
--- a/data/core/start.lua
+++ b/data/core/start.lua
@ -1,5 +1,5 @@
 -- this file is used by lite-xl to setup the Lua environment when starting
-VERSION = "@PROJECT_VERSION@"
+VERSION = "2.1.1r3"
 MOD_VERSION_MAJOR = 3
 MOD_VERSION_MINOR = 0
 MOD_VERSION_PATCH = 0
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@ -263,7 +263,8 @@ function tokenizer.tokenize(incoming_syntax, text, state, resume)
  local text_len = text:ulen()
  local start_time = system.get_time()
  local starting_i = i
-  while i <= text_len do
+
+  while text_len ~= nil and i <= text_len do
    -- Every 200 chars, check if we're out of time
    if i - starting_i > 200 then
      starting_i = i
@ -301,11 +302,9 @@ function tokenizer.tokenize(incoming_syntax, text, state, resume)
          cont = false
        end
      end
-      -- General end of syntax check. Applies in the case where
-      -- we're ending early in the middle of a delimiter, or
-      -- just normally, upon finding a token.
-      if subsyntax_info then
-        local s, e = find_text(text, subsyntax_info, i, true, true)
+      -- If we don't have any concerns about syntax delimiters,
+      -- continue on as normal.
+      if cont then
        if s then
          push_token(res, token_type, text:usub(i, e))
          set_subsyntax_pattern_idx(0)
@ -332,12 +331,47 @@ function tokenizer.tokenize(incoming_syntax, text, state, resume)
      end
    end

-      -- consume character if we didn't match
-      if not matched then
-        push_token(res, "normal", text:usub(i, i))
-        i = i + 1
+    -- find matching pattern
+    local matched = false
+    for n, p in ipairs(current_syntax.patterns) do
+      local find_results = { find_text(text, p, i, true, false) }
+      if find_results[1] then
+        local type_is_table = type(p.type) == "table"
+        local n_types = type_is_table and #p.type or 1
+        if #find_results == 2 and type_is_table then
+          report_bad_pattern(core.warn, current_syntax, n,
+            "Token type is a table, but a string was expected.")
+          p.type = p.type[1]
+        elseif #find_results - 1 > n_types then
+          report_bad_pattern(core.error, current_syntax, n,
+            "Not enough token types: got %d needed %d.", n_types, #find_results - 1)
+        elseif #find_results - 1 < n_types then
+          report_bad_pattern(core.warn, current_syntax, n,
+            "Too many token types: got %d needed %d.", n_types, #find_results - 1)
+        end
+        -- matched pattern; make and add tokens
+        push_tokens(res, current_syntax, p, text, find_results)
+        -- update state if this was a start|end pattern pair
+        if type(p.pattern or p.regex) == "table" then
+          -- If we have a subsyntax, push that onto the subsyntax stack.
+          if p.syntax then
+            push_subsyntax(p, n)
+          else
+            set_subsyntax_pattern_idx(n)
+          end
+        end
+        -- move cursor past this token
+        i = find_results[2] + 1
+        matched = true
+        break
      end
    end
+
+    -- consume character if we didn't match
+    if not matched then
+      push_token(res, "normal", text:usub(i, i))
+      i = i + 1
+    end
  end

  return res, state
--- a/src/api/api.c
+++ b/src/api/api.c
@ -7,6 +7,10 @@ int luaopen_regex(lua_State *L);
 int luaopen_dirmonitor(lua_State* L);
 int luaopen_utf8extra(lua_State* L);

+#if defined(__amigaos4__)
+int luaopen_codesets(lua_State* L);
+#endif
+
 static const luaL_Reg libs[] = {
  { "system",     luaopen_system     },
  { "renderer",   luaopen_renderer   },
@ -14,6 +18,9 @@ static const luaL_Reg libs[] = {
  // { "process",    luaopen_process    },
  { "dirmonitor", luaopen_dirmonitor },
  { "utf8extra",  luaopen_utf8extra  },
+#if defined(__amigaos4__)
+  { "codesetsextra",   luaopen_codesets  },
+#endif
  { NULL, NULL }
 };

@ -22,4 +29,3 @@ void api_load_libs(lua_State *L) {
  for (int i = 0; libs[i].name; i++)
    luaL_requiref(L, libs[i].name, libs[i].func, 1);
 }
-
--- a/src/main.c
+++ b/src/main.c
@ -8,7 +8,7 @@
 #include <signal.h>

 #if defined(__amigaos4__) || defined(__morphos__)
-#define VSTRING     "Lite XL 2.1.1r1 (29.01.2023)"
+#define VSTRING     "Lite XL 2.1.1r3 (07.08.2023)"
 #define VERSTAG     "\0$VER: " VSTRING
 #endif

@ -20,6 +20,7 @@
  #include <mach-o/dyld.h>
 #elif defined(__amigaos4__)
  #include <locale.h>
+  #include "platform/codesets.h"
  #include "platform/amigaos4.h"
  static CONST_STRPTR stack USED = "$STACK:102400";
  static CONST_STRPTR version USED = VERSTAG;
@ -154,10 +155,19 @@ void set_macos_bundle_resources(lua_State *L);
 #endif

 int main(int argc, char **argv) {
+
+#if defined(__amigaos4__) || defined(__morphos__)
+  setlocale(LC_ALL, "C");
+#endif
+#if defined(__amigaos4__)
+  OpenLibs();
+#endif
+
 #ifndef _WIN32
  signal(SIGPIPE, SIG_IGN);
 #endif

+
  if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_EVENTS) != 0) {
    fprintf(stderr, "Error initializing sdl: %s", SDL_GetError());
    exit(1);
@ -262,6 +272,9 @@ init_lua:
    "  HOME = os.getenv('" LITE_OS_HOME "')\n"
    "  local exedir = match(EXEFILE, '^(.*)" LITE_PATHSEP_PATTERN LITE_NONPATHSEP_PATTERN "$')\n"
    "  local prefix = os.getenv('LITE_PREFIX') or match(exedir, '^(.*)" LITE_PATHSEP_PATTERN "bin$')\n"
+    "  if not HOME then\n"
+    "    HOME = exedir\n"
+    "  end\n"
    "  dofile((MACOS_RESOURCES or (prefix and prefix .. '/share/lite-xl' or exedir .. '/data')) .. '/core/start.lua')\n"
    "  core = require(os.getenv('LITE_XL_RUNTIME') or 'core')\n"
    "  core.init()\n"
@ -304,5 +317,8 @@ init_lua:
  ren_free_window_resources(&window_renderer);
  lua_close(L);

+#if defined(__amigaos4__)
+  CleanExit("JustExit");
+#endif
  return EXIT_SUCCESS;
 }
--- a/src/platform/codesets.c
+++ b/src/platform/codesets.c
@ -0,0 +1,377 @@
+/* This code is responsible for the encoding change
+ * using codesets library. It requires the codesets
+ * plugin to work.
+ *
+ * Heavily inspired by the encoding plugin
+ * https://github.com/jgmdev/lite-xl-encoding
+*/
+#include <SDL2/SDL_stdinc.h>
+#include <stdbool.h>
+
+#include <lua.h>
+#include <lauxlib.h>
+#include <lualib.h>
+
+#include "codesets.h"
+
+struct Library *CodesetsBase = NULL;
+struct CodesetsIFace *ICodesets = NULL;
+
+/* One entry of the byte order mark (BOM) table. */
+typedef struct {
+  const char* charset;    /* name of the charset the mark belongs to */
+  unsigned char bom[4];   /* mark bytes; only the first `len` are meaningful */
+  int len;                /* number of bytes used in `bom` */
+} bom_t;
+
+/*
+ * List of encodings that can have byte order marks.
+ * Note: UTF-32 should be tested before UTF-16, the order matters: the
+ * UTF-32LE mark (ff fe 00 00) starts with the UTF-16LE mark (ff fe), so a
+ * first-match scan would otherwise report UTF-32LE text as UTF-16LE.
+ * UTF-7 is listed four times, once per variant of its mark.
+ * The table is terminated by a sentinel entry whose charset is NULL.
+*/
+static bom_t bom_list[] = {
+  { "UTF-8",    {0xef, 0xbb, 0xbf},       3 },
+  { "UTF-32LE", {0xff, 0xfe, 0x00, 0x00}, 4 },
+  { "UTF-32BE", {0x00, 0x00, 0xfe, 0xff}, 4 },
+  { "UTF-16LE", {0xff, 0xfe},             2 },
+  { "UTF-16BE", {0xfe, 0xff},             2 },
+  { "GB18030",  {0x84, 0x31, 0x95, 0x33}, 4 },
+  { "UTF-7",    {0x2b, 0x2f, 0x76, 0x38}, 4 },
+  { "UTF-7",    {0x2b, 0x2f, 0x76, 0x39}, 4 },
+  { "UTF-7",    {0x2b, 0x2f, 0x76, 0x2b}, 4 },
+  { "UTF-7",    {0x2b, 0x2f, 0x76, 0x2f}, 4 },
+  { NULL }  /* sentinel: marks the end of the table */
+};
+
+/*
+ * Look up the byte order mark registered for `charset`.
+ *
+ * Returns a pointer to the mark bytes of the first table entry whose name
+ * equals `charset` (exact, case-sensitive comparison), or NULL when the
+ * charset has no entry. When `len` is not NULL it receives the number of
+ * mark bytes, or 0 when nothing was found.
+ */
+static const unsigned char* encoding_bom_from_charset(const char* charset, size_t* len) {
+  const unsigned char* mark = NULL;
+  size_t mark_len = 0;
+
+  for (const bom_t* entry = bom_list; entry->charset != NULL; entry++) {
+    if (strcmp(entry->charset, charset) != 0)
+      continue;
+    mark = entry->bom;
+    mark_len = entry->len;
+    break;
+  }
+
+  if (len)
+    *len = mark_len;
+
+  return mark;
+}
+
+/*
+ * Identify a charset from the byte order mark at the start of `string`.
+ *
+ * The table is scanned in order and the first entry whose mark fits in
+ * `len` bytes and matches the leading bytes wins. Returns the charset name
+ * of that entry, or NULL when no mark matches. When `bom_len` is not NULL
+ * it receives the length of the matched mark, or 0 when none matched.
+ */
+static const char* encoding_charset_from_bom(
+  const char* string, size_t len, size_t* bom_len
+) {
+  const char* detected = NULL;
+  size_t detected_len = 0;
+
+  for (const bom_t* entry = bom_list; entry->charset != NULL; entry++) {
+    size_t mark_len = entry->len;
+
+    /* the text is too short to start with this mark */
+    if (len < mark_len)
+      continue;
+
+    if (memcmp(string, entry->bom, mark_len) == 0) {
+      detected = entry->charset;
+      detected_len = mark_len;
+      break;
+    }
+  }
+
+  if (bom_len)
+    *bom_len = detected_len;
+
+  return detected;
+}
+
+/////////////////////////////////////////////////////////////////////
+// Lua methods for codesets
+
+/*
+ * codesetsextra.detect(filename)
+ *
+ * Reads the whole file and asks codesets.library for the best matching
+ * codeset.
+ *
+ * Arguments:
+ *  filename, path of the file to inspect.
+ *
+ * Returns:
+ *  The name of the detected codeset, or nil plus an error message when the
+ *  file cannot be opened, sized, read, or identified.
+ */
+int Lcodesets_detect(lua_State *L) {
+  const char* filename = luaL_checkstring(L, 1);
+
+  BPTR fileHandle = FOpen(filename, MODE_OLDFILE, 0);
+  if (!fileHandle)
+  {
+    lua_pushnil(L);
+    lua_pushfstring(L, "unable to open file '%s', code=%d", filename, (int)IoErr());
+    return 2;
+  }
+
+  /* Reject a failed size query and sizes that do not fit the 32-bit length
+   * taken by Read() (one byte is reserved for the terminator). */
+  int64 fileSize = GetFileSize(fileHandle);
+  if (fileSize < 0 || fileSize >= 0x7fffffff)
+  {
+    FClose(fileHandle);
+    lua_pushnil(L);
+    lua_pushfstring(L, "unable to get the size of file '%s'", filename);
+    return 2;
+  }
+
+  /* One extra byte so the buffer is always NUL terminated; this also keeps
+   * an empty file from being reported as an allocation failure. */
+  STRPTR fileText = malloc((size_t)fileSize + 1);
+  if (!fileText)
+  {
+    FClose(fileHandle);
+    lua_pushnil(L);
+    lua_pushfstring(L, "out of ram while detecting charset of '%s'", filename);
+    return 2;
+  }
+
+  /* The handle was just opened, so reading starts at the beginning. */
+  LONG bytesRead = Read(fileHandle, fileText, (LONG)fileSize);
+  FClose(fileHandle);
+
+  if (bytesRead < 0)
+  {
+    free(fileText);
+    lua_pushnil(L);
+    lua_pushfstring(L, "unable to read file '%s'", filename);
+    return 2;
+  }
+  fileText[bytesRead] = '\0';
+
+  /* Pass the real length so the library never scans past the data that was
+   * actually read, even when the text contains NUL bytes. */
+  ULONG errNum = 0;
+  struct codeset *cs = CodesetsFindBest(CSA_Source, fileText,
+                                        CSA_SourceLen, (ULONG)bytesRead,
+                                        CSA_ErrPtr, &errNum,
+                                        TAG_DONE);
+  free(fileText);
+
+  if (!cs)
+  {
+    lua_pushnil(L);
+    lua_pushstring(L, "could not detect the file encoding");
+    return 2;
+  }
+
+  lua_pushstring(L, cs->name);
+  return 1;
+}
+
+/*
+ * codesetsextra.systemCodeset()
+ *
+ * Returns:
+ *  The name of the codeset reported by CodesetsFindA(NULL, NULL), or nil
+ *  plus an error message when the library returns no codeset.
+ */
+int Lcodesets_systemCodeset(lua_State *L) {
+  struct codeset *systemCodeset = CodesetsFindA(NULL, NULL);
+
+  /* Guard against a NULL result instead of dereferencing it blindly. */
+  if (!systemCodeset) {
+    lua_pushnil(L);
+    lua_pushstring(L, "could not find the system codeset");
+    return 2;
+  }
+
+  lua_pushstring(L, systemCodeset->name);
+  return 1;
+}
+
+/*
+ * Read the boolean field `name` from the options table at stack index
+ * `idx`. Returns `fallback` when the field is missing or not a boolean.
+ * The Lua stack is left unchanged.
+ */
+static bool codesets_option_flag(lua_State *L, int idx, const char *name, bool fallback) {
+  bool value = fallback;
+
+  lua_getfield(L, idx, name);
+  if (lua_isboolean(L, -1))
+    value = lua_toboolean(L, -1);
+  lua_pop(L, 1);
+
+  return value;
+}
+
+/*
+ * codesetsextra.convert(to, from, text, options)
+ *
+ * Convert `text` from the `from` charset to the `to` charset.
+ *
+ * Arguments:
+ *  to, name of the destination charset.
+ *  from, name of the source charset.
+ *  text, the string to convert (may contain NUL bytes).
+ *  options, optional table with the boolean fields:
+ *    handle_from_bom, strip a byte order mark found at the start of text.
+ *    handle_to_bom, prepend the byte order mark of `to`, if it has one.
+ *    (a "strict" field is accepted for compatibility but has no effect)
+ *
+ * Returns:
+ *  The converted string, or nil plus an error message on failure.
+ */
+int Lcodesets_convert(lua_State *L) {
+  const char* to = luaL_checkstring(L, 1);
+  const char* from = luaL_checkstring(L, 2);
+  size_t text_len = 0;
+  const char* text = luaL_checklstring(L, 3, &text_len);
+  /* conversion options */
+  bool handle_to_bom = false;
+  bool handle_from_bom = false;
+
+  if (lua_gettop(L) > 3 && lua_istable(L, 4)) {
+    handle_to_bom = codesets_option_flag(L, 4, "handle_to_bom", false);
+    handle_from_bom = codesets_option_flag(L, 4, "handle_from_bom", false);
+  }
+
+  /* strip the bom from the input text if any */
+  if (handle_from_bom) {
+    size_t input_bom_len = 0;
+    encoding_charset_from_bom(text, text_len, &input_bom_len);
+    text += input_bom_len;
+    text_len -= input_bom_len;
+  }
+
+  struct codeset *srcCodeset =
+    CodesetsFind(from, CSA_FallbackToDefault, FALSE, TAG_DONE);
+  if (!srcCodeset)
+  {
+    lua_pushnil(L);
+    lua_pushfstring(L, "failed creating source codeset for '%s'", from);
+    return 2;
+  }
+
+  struct codeset *destCodeset =
+    CodesetsFind(to, CSA_FallbackToDefault, FALSE, TAG_DONE);
+  if (!destCodeset)
+  {
+    lua_pushnil(L);
+    lua_pushfstring(L, "failed creating destination codeset for '%s'", to);
+    return 2;
+  }
+
+  /* Empty input needs no conversion; skipping the library call keeps an
+   * empty text from being reported as a conversion failure. */
+  char *output = NULL;
+  ULONG output_len = 0;
+  if (text_len > 0) {
+    /* The explicit source length keeps input with embedded NUL bytes
+     * (e.g. UTF-16/UTF-32) from being cut at the first NUL. */
+    output = CodesetsConvertStr(CSA_SourceCodeset, srcCodeset,
+                                CSA_DestCodeset, destCodeset,
+                                CSA_Source, text,
+                                CSA_SourceLen, (ULONG)text_len,
+                                CSA_DestLenPtr, &output_len,
+                                TAG_DONE);
+    if (!output) {
+      lua_pushnil(L);
+      lua_pushfstring(L, "failed converting from '%s' to '%s'", from, to);
+      return 2;
+    }
+  }
+
+  /* `output` belongs to codesets.library and must only be released with
+   * CodesetsFreeA(), so it is never realloc()ed; a moving pointer/length
+   * pair is used to skip or add a bom instead. */
+  const char *body = output ? output : "";
+  size_t body_len = output_len;
+
+  /* strip bom sometimes added when converting to utf-8, we don't need it;
+   * only the UTF-8 mark is checked so real text that merely looks like
+   * another charset's mark is left untouched */
+  static const unsigned char utf8_bom[] = { 0xef, 0xbb, 0xbf };
+  if (strcmp(to, "UTF-8") == 0
+      && body_len >= sizeof(utf8_bom)
+      && memcmp(body, utf8_bom, sizeof(utf8_bom)) == 0) {
+    body += sizeof(utf8_bom);
+    body_len -= sizeof(utf8_bom);
+  }
+
+  size_t bom_len = 0;
+  const unsigned char* bom =
+    handle_to_bom ? encoding_bom_from_charset(to, &bom_len) : NULL;
+
+  if (bom != NULL) {
+    lua_pushlstring(L, (const char*)bom, bom_len);
+    lua_pushlstring(L, body, body_len);
+    lua_concat(L, 2);
+  } else {
+    lua_pushlstring(L, body, body_len);
+  }
+
+  if (output)
+    CodesetsFreeA(output, NULL);
+
+  return 1;
+}
+
+/*
+ * codesetsextra.get_charset_bom(charset)
+ *
+ * Retrieve the byte order mark sequence for the given charset, if it has one.
+ *
+ * Arguments:
+ *  charset, a charset name as listed in the byte order mark table
+ *
+ * Returns:
+ *  The bom sequence string, or an empty string if not applicable.
+ */
+int Lcodesets_get_charset_bom(lua_State *L) {
+  const char* charset = luaL_checkstring(L, 1);
+  size_t mark_len = 0;
+  const unsigned char* mark = encoding_bom_from_charset(charset, &mark_len);
+
+  /* mark_len is 0 whenever no mark was found, so one push covers both cases */
+  lua_pushlstring(L, mark ? (const char*)mark : "", mark_len);
+
+  return 1;
+}
+
+/*
+ * codesetsextra.strip_bom(text, charset)
+ *
+ * Remove the byte order mark from the start of the given string.
+ *
+ * Arguments:
+ *  text, a string that may start with a byte order mark to be removed.
+ *  charset, optional charset to scan for; when omitted, every charset
+ *           with a known bom is tried.
+ *
+ * Returns:
+ *  The input text with the byte order mark removed if one was found. A
+ *  text that consists only of the mark yields an empty string.
+ */
+int Lcodesets_strip_bom(lua_State* L) {
+  size_t text_len = 0;
+  const char* text = luaL_checklstring(L, 1, &text_len);
+  const char* charset = luaL_optstring(L, 2, NULL);
+  size_t bom_len = 0;
+
+  if (charset) {
+    /* a charset may be listed more than once (UTF-7), so try every entry */
+    for (size_t i = 0; bom_list[i].charset != NULL; i++) {
+      size_t mark_len = bom_list[i].len;
+      if (
+        strcmp(bom_list[i].charset, charset) == 0
+        &&
+        text_len >= mark_len
+        &&
+        memcmp(text, bom_list[i].bom, mark_len) == 0
+      ) {
+        bom_len = mark_len;
+        break;
+      }
+    }
+  } else {
+    encoding_charset_from_bom(text, text_len, &bom_len);
+  }
+
+  /* bom_len never exceeds text_len and stays 0 for empty text, so this one
+   * push handles "no bom", "bom plus text" and "bom only" alike. */
+  lua_pushlstring(L, text + bom_len, text_len - bom_len);
+
+  return 1;
+}
+
+/*
+ * Module opener: creates a new table holding the codesets functions and
+ * leaves it on the Lua stack as the single return value.
+ */
+int luaopen_codesets (lua_State *L) {
+  static const luaL_Reg codesets_functions[] = {
+    { "detect",          Lcodesets_detect          },
+    { "systemCodeset",   Lcodesets_systemCodeset   },
+    { "convert",         Lcodesets_convert         },
+    { "get_charset_bom", Lcodesets_get_charset_bom },
+    { "strip_bom",       Lcodesets_strip_bom       },
+    { NULL, NULL }
+  };
+
+  luaL_newlib(L, codesets_functions);
+
+  return 1;
+}
+
+
+/*
+ * Open codesets.library (requesting version 6) and obtain its "main"
+ * interface, storing both in the CodesetsBase / ICodesets globals.
+ *
+ * Returns RETURN_OK on success; on failure returns whatever CleanExit()
+ * returns for the corresponding error message.
+ */
+int OpenLibs(void)
+{
+  CodesetsBase = OpenLibrary( "codesets.library", 6 );
+  if (!CodesetsBase)
+    return CleanExit("Can't open codesets.library version 6");
+
+  ICodesets = (struct CodesetsIFace *)GetInterface( CodesetsBase, "main", 1L, NULL );
+  if (!ICodesets)
+    return CleanExit("Can't open codesets.library Interface");
+
+  return RETURN_OK;
+}
+
+/*
+ * Release the codesets interface and library, if they are open.
+ *
+ * Arguments:
+ *  str, an error message to print, or "JustExit" (or NULL) for a plain
+ *       cleanup without any message.
+ *
+ * Returns:
+ *  RETURN_ERROR when an error message was printed, RETURN_OK otherwise.
+ */
+int CleanExit(const char *str)
+{
+  /* Clear each global after releasing it so a second call (for example
+   * after OpenLibs() already failed and cleaned up) does not release the
+   * same interface or library twice. */
+  if (ICodesets)
+  {
+    DropInterface((struct Interface *) ICodesets);
+    ICodesets = NULL;
+  }
+  if (CodesetsBase)
+  {
+    CloseLibrary(CodesetsBase);
+    CodesetsBase = NULL;
+  }
+
+  if (str != NULL && strcmp(str, "JustExit") != 0)
+  {
+    printf("Error::%s\n", str);
+    return RETURN_ERROR;
+  }
+  return RETURN_OK;
+}
--- a/src/platform/codesets.h
+++ b/src/platform/codesets.h
@ -0,0 +1,12 @@
+#ifndef _CODESETS_H
+#define _CODESETS_H
+
+#include <proto/dos.h>
+#include <proto/exec.h>
+#include <proto/codesets.h>
+
+/* Opens codesets.library and its "main" interface; returns RETURN_OK on
+ * success, otherwise the value returned by CleanExit(). */
+int OpenLibs(void);
+/* Releases the codesets interface and library when they are open. Prints
+ * the given message and returns RETURN_ERROR, unless the message is
+ * "JustExit", in which case it returns RETURN_OK without printing. */
+int CleanExit(const char *);
+
+#endif
+
--- a/src/renderer.c
+++ b/src/renderer.c
@ -236,7 +236,7 @@ static void font_file_close(FT_Stream stream) {
 RenFont* ren_font_load(RenWindow *window_renderer, const char* path, float size, ERenFontAntialiasing antialiasing, ERenFontHinting hinting, unsigned char style) {
  RenFont *font = NULL;
  FT_Face face = NULL;
-  
+
  SDL_RWops *file = SDL_RWFromFile(path, "rb");
  if (!file)
    goto rwops_failure;
@ -478,8 +478,7 @@ void ren_draw_rect(RenSurface *rs, RenRect rect, RenColor color) {

  if (color.a == 0xff) {
    uint32_t translated = SDL_MapRGB(surface->format, color.r, color.g, color.b);
-    SDL_Rect rect = { x1, y1, x2 - x1, y2 - y1 };
-    SDL_FillRect(surface, &rect, translated);
+    SDL_FillRect(surface, &dest_rect, translated);
  } else {
    // Seems like SDL doesn't handle clipping as we expect when using
    // scaled blitting, so we "clip" manually.