/* lite-xl/src/platform/codesets.c */

/* This code is responsible for converting text between encodings
 * using codesets.library. It requires the codesets
 * plugin to work.
 *
 * Heavily inspired by the encoding plugin
 * https://github.com/jgmdev/lite-xl-encoding
*/
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <SDL2/SDL_stdinc.h>

#include <lua.h>
#include <lauxlib.h>
#include <lualib.h>

#include "codesets.h"

/* Library base of codesets.library; assigned in OpenLibs(), closed in CleanExit(). */
struct Library *CodesetsBase = NULL;
/* "main" interface of codesets.library; obtained in OpenLibs(), dropped in CleanExit(). */
struct CodesetsIFace *ICodesets = NULL;

/* A byte order mark (BOM) signature and the charset it identifies. */
typedef struct {
  const char* charset;   /* charset name, NULL in the terminating entry */
  unsigned char bom[4];  /* signature bytes; only the first `len` are used */
  size_t len;            /* number of signature bytes stored in `bom` */
} bom_t;

/*
 * List of encodings that can have byte order marks. The list is read-only
 * and ends with an entry whose charset is NULL.
 * Note: UTF-32 should be tested before UTF-16, the order matters
 * (the UTF-32LE signature begins with the UTF-16LE signature).
*/
static const bom_t bom_list[] = {
  { "UTF-8",    {0xef, 0xbb, 0xbf},       3 },
  { "UTF-32LE", {0xff, 0xfe, 0x00, 0x00}, 4 },
  { "UTF-32BE", {0x00, 0x00, 0xfe, 0xff}, 4 },
  { "UTF-16LE", {0xff, 0xfe},             2 },
  { "UTF-16BE", {0xfe, 0xff},             2 },
  { "GB18030",  {0x84, 0x31, 0x95, 0x33}, 4 },
  { "UTF-7",    {0x2b, 0x2f, 0x76, 0x38}, 4 },
  { "UTF-7",    {0x2b, 0x2f, 0x76, 0x39}, 4 },
  { "UTF-7",    {0x2b, 0x2f, 0x76, 0x2b}, 4 },
  { "UTF-7",    {0x2b, 0x2f, 0x76, 0x2f}, 4 },
  { NULL,       {0},                      0 }
};

/* Get the applicable byte order marks for the given charset */
static const unsigned char* encoding_bom_from_charset(const char* charset, size_t* len) {
  for (size_t i=0; bom_list[i].charset != NULL; i++){
    if (strcmp(bom_list[i].charset, charset) == 0) {
      if (len) *len = bom_list[i].len;
      return bom_list[i].bom;
    }
  }

  if (len) *len = 0;

  return NULL;
}

/* Detect the encoding of the given string if a valid bom sequence is found */
static const char* encoding_charset_from_bom(
  const char* string, size_t len, size_t* bom_len
) {
  const unsigned char* bytes = (unsigned char*) string;

  for (size_t i=0; bom_list[i].charset != NULL; i++) {
    if (len >= bom_list[i].len) {
      bool all_match = true;
      for (size_t b = 0; b<bom_list[i].len; b++) {
        if (bytes[b] != bom_list[i].bom[b]) {
          all_match = false;
          break;
        }
      }
      if (all_match) {
        if (bom_len) *bom_len = bom_list[i].len;
        return bom_list[i].charset;
      }
    }
  }

  if (bom_len)
      *bom_len = 0;

  return NULL;
}

/////////////////////////////////////////////////////////////////////
// Lua methods for codesets

/*
 * detect(filename)
 *
 * Read the whole file into memory and ask codesets.library for the codeset
 * that best matches its content.
 *
 * Arguments:
 *  filename, path of the file to examine
 *
 * Returns:
 *  The name of the detected codeset, or nil plus an error message when the
 *  file cannot be opened or read, memory runs out, or no codeset is found.
 */
int Lcodesets_detect(lua_State *L) {
  const char* filename = luaL_checkstring(L, 1);

  BPTR fileHandle = FOpen(filename, MODE_OLDFILE, 0);
  if (!fileHandle)
  {
    lua_pushnil(L);
    lua_pushfstring(L, "unable to open file '%s', code=%d", filename, (int)IoErr());
    return 2;
  }

  ChangeFilePosition(fileHandle, 0, OFFSET_END);
  int64 fileSize = GetFileSize( fileHandle );
  ChangeFilePosition(fileHandle, 0, OFFSET_BEGINNING);

  /* A negative size signals failure; the upper bound keeps the size within
   * what the 32-bit length arguments below can represent. */
  if (fileSize < 0 || fileSize >= 0x7fffffffLL) {
    FClose(fileHandle);
    lua_pushnil(L);
    lua_pushfstring(L, "unable to determine a usable size for file '%s'", filename);
    return 2;
  }

  /* One extra byte for the terminating NUL: the detection call may measure
   * the buffer as a C string, and an empty file must not become malloc(0). */
  STRPTR fileText = malloc((size_t)fileSize + 1);
  if (!fileText) {
    FClose(fileHandle);
    lua_pushnil(L);
    lua_pushfstring(L, "out of ram while detecting charset of '%s'", filename);
    return 2;
  }

  long bytesRead = Read(fileHandle, fileText, fileSize);
  /* fetch the error code before FClose() has a chance to replace it */
  long readError = (bytesRead < 0) ? (long)IoErr() : 0;
  FClose(fileHandle);

  if (bytesRead < 0) {
    free(fileText);
    lua_pushnil(L);
    lua_pushfstring(L, "unable to read file '%s', code=%d", filename, (int)readError);
    return 2;
  }

  /* terminate at what was actually read, which may be less than fileSize */
  fileText[bytesRead] = '\0';

  ULONG errNum = 0;
  struct codeset *cs = CodesetsFindBest(CSA_Source, fileText,
                                        CSA_SourceLen, (ULONG)bytesRead,
                                        CSA_ErrPtr, &errNum,
                                        TAG_DONE);
  free(fileText);

  if (!cs) {
    lua_pushnil(L);
    lua_pushstring(L, "could not detect the file encoding");
    return 2;
  }

  lua_pushstring(L, cs->name);
  return 1;
}

/*
 * systemCodeset()
 *
 * Query codesets.library with no name and no attributes and report the name
 * of the codeset it answers with (NOTE(review): per the library docs this is
 * the system default codeset - confirm).
 *
 * Returns:
 *  The codeset name, or nil plus an error message when no codeset is returned.
 */
int Lcodesets_systemCodeset(lua_State *L) {
  struct codeset *systemCodeset = CodesetsFindA(NULL, NULL);

  /* guard the dereference below instead of crashing on a failed lookup */
  if (!systemCodeset || !systemCodeset->name) {
    lua_pushnil(L);
    lua_pushstring(L, "could not determine the system codeset");
    return 2;
  }

  lua_pushstring(L, systemCodeset->name);
  return 1;
}

/* Read an optional boolean field of the table at `table_index`; any other type leaves `fallback`. */
static bool codesets_opt_boolean(lua_State *L, int table_index, const char *field, bool fallback) {
  bool value = fallback;

  lua_getfield(L, table_index, field);
  if (lua_isboolean(L, -1)) {
    value = lua_toboolean(L, -1);
  }
  lua_pop(L, 1); /* keep the Lua stack balanced */

  return value;
}

/*
 * convert(to, from, text, options)
 *
 * Convert text from one codeset to another using codesets.library.
 *
 * Arguments:
 *  to, name of the destination codeset
 *  from, name of the source codeset
 *  text, the string to convert (may contain embedded NUL bytes)
 *  options, optional table with the boolean fields:
 *    handle_from_bom, skip a byte order mark found at the start of text
 *    handle_to_bom, prepend the byte order mark of `to` to the result
 *    strict, accepted for compatibility; it has no effect
 *
 * Returns:
 *  The converted string, or nil plus an error message on failure.
 */
int Lcodesets_convert(lua_State *L) {
  const char* to = luaL_checkstring(L, 1);
  const char* from = luaL_checkstring(L, 2);
  size_t text_len = 0;
  const char* text = luaL_checklstring(L, 3, &text_len);

  /* conversion options */
  bool handle_to_bom = false;
  bool handle_from_bom = false;

  if (lua_istable(L, 4)) {
    handle_to_bom = codesets_opt_boolean(L, 4, "handle_to_bom", false);
    handle_from_bom = codesets_opt_boolean(L, 4, "handle_from_bom", false);
  }

  /* number of leading input bytes to skip so the bom is not converted */
  size_t src_skip = 0;
  if (handle_from_bom) {
    encoding_charset_from_bom(text, text_len, &src_skip);
  }

  struct codeset *srcCodeset = CodesetsFind(from, CSA_FallbackToDefault, FALSE, TAG_DONE);
  if (!srcCodeset)
  {
    lua_pushnil(L);
    lua_pushfstring(L, "failed creating source codeset for '%s'", from);
    return 2;
  }

  struct codeset *destCodeset = CodesetsFind(to, CSA_FallbackToDefault, FALSE, TAG_DONE);
  if (!destCodeset)
  {
    lua_pushnil(L);
    lua_pushfstring(L, "failed creating destination codeset for '%s'", to);
    return 2;
  }

  /* The length is passed explicitly so that input containing NUL bytes
   * (e.g. UTF-16/UTF-32 text) is not cut at the first NUL. */
  ULONG output_len = 0;
  char *output = CodesetsConvertStr(CSA_SourceCodeset, srcCodeset,
                                    CSA_DestCodeset, destCodeset,
                                    CSA_Source, text + src_skip,
                                    CSA_SourceLen, (ULONG)(text_len - src_skip),
                                    CSA_DestLenPtr, &output_len,
                                    TAG_DONE);
  if (!output) {
    lua_pushnil(L);
    lua_pushfstring(L, "failed converting from '%s' to '%s'", from, to);
    return 2;
  }

  /* Strip the bom sometimes added when converting to utf-8, we don't need it.
   * Only the UTF-8 signature is removed: other signatures in the table
   * (such as the ASCII "+/v8" of UTF-7) can be legitimate text here. */
  size_t out_skip = 0;
  if (strcmp(to, "UTF-8") == 0) {
    size_t found_len = 0;
    const char* found = encoding_charset_from_bom(output, output_len, &found_len);
    if (found && strcmp(found, "UTF-8") == 0) {
      out_skip = found_len;
    }
  }

  const unsigned char* bom = NULL;
  size_t bom_len = 0;
  if (handle_to_bom) {
    bom = encoding_bom_from_charset(to, &bom_len);
  }

  /* `output` is released with CodesetsFreeA(), so it is never resized with
   * realloc(); the result is assembled on the Lua side instead. */
  if (bom) {
    lua_pushlstring(L, (const char*)bom, bom_len);
    lua_pushlstring(L, output + out_skip, output_len - out_skip);
    lua_concat(L, 2);
  } else {
    lua_pushlstring(L, output + out_skip, output_len - out_skip);
  }

  CodesetsFreeA(output, NULL);
  return 1;
}

/*
 * get_charset_bom(charset)
 *
 * Retrieve the byte order mark sequence registered for the given charset.
 *
 * Arguments:
 *  charset, a string naming the charset to look up
 *
 * Returns:
 *  The bom sequence string or empty string if not applicable.
 */
int Lcodesets_get_charset_bom(lua_State *L) {
  size_t mark_len = 0;
  const unsigned char* mark =
    encoding_bom_from_charset(luaL_checkstring(L, 1), &mark_len);

  if (mark == NULL) {
    lua_pushstring(L, "");
    return 1;
  }

  lua_pushlstring(L, (char*)mark, mark_len);
  return 1;
}

/*
 * strip_bom(text, charset)
 *
 * Remove the byte order mark from the start of the given string.
 *
 * Arguments:
 *  text, a string that may start with a byte order mark to be removed.
 *  charset, optional charset to scan for; when omitted every charset in the
 *           bom table is tried.
 *
 * Returns:
 *  The input text with the byte order mark removed if one was found. Text
 *  that consists of nothing but the mark yields an empty string.
 */
int Lcodesets_strip_bom(lua_State* L) {
  size_t text_len = 0;
  const char* text = luaL_checklstring(L, 1, &text_len);
  const char* charset = luaL_optstring(L, 2, NULL);
  size_t bom_len = 0;

  if (text_len > 0) {
    if (charset) {
      /* a charset can own several marks (UTF-7 has four): try each of them */
      for (const bom_t* entry = bom_list; entry->charset != NULL; entry++) {
        const size_t mark_len = entry->len;

        if (strcmp(entry->charset, charset) != 0 || text_len < mark_len)
          continue;

        if (memcmp(text, entry->bom, mark_len) == 0) {
          bom_len = mark_len;
          break;
        }
      }
    } else {
      encoding_charset_from_bom(text, text_len, &bom_len);
    }
  }

  /* bom_len never exceeds text_len, so this is the whole text when no mark
   * was found and possibly an empty string when the text was only a mark */
  lua_pushlstring(L, text + bom_len, text_len - bom_len);

  return 1;
}

/*
 * Lua module entry point: creates a new table holding the codesets
 * functions and leaves it on the stack as the single return value.
 */
int luaopen_codesets (lua_State *L) {
  static const luaL_Reg codesets_functions[] = {
    { "detect",          Lcodesets_detect          },
    { "systemCodeset",   Lcodesets_systemCodeset   },
    { "convert",         Lcodesets_convert         },
    { "get_charset_bom", Lcodesets_get_charset_bom },
    { "strip_bom",       Lcodesets_strip_bom       },
    { NULL, NULL }
  };

  luaL_newlib(L, codesets_functions);

  return 1;
}


/*
 * Open codesets.library (version 6) and obtain its "main" interface,
 * storing both in the file-level globals.
 *
 * Returns RETURN_OK on success; on failure returns whatever CleanExit()
 * yields for the corresponding error message.
 */
int OpenLibs(void)
{
  CodesetsBase = OpenLibrary( "codesets.library", 6 );
  if (!CodesetsBase)
    return CleanExit("Can't open codesets.library version 6");

  ICodesets = (struct CodesetsIFace *)GetInterface( CodesetsBase, "main", 1L, NULL );
  if (!ICodesets)
    return CleanExit("Can't open codesets.library Interface");

  return RETURN_OK;
}

/*
 * Release the codesets.library interface and library base if they are held.
 *
 * str: error text to report, or "JustExit" (or NULL) for a silent shutdown.
 *
 * Returns RETURN_OK for a silent shutdown; otherwise prints the message and
 * returns RETURN_ERROR.
 */
int CleanExit(const char *str)
{
  /* Reset the globals after releasing them so that a repeated call, or a
   * later check of these pointers, never touches a stale handle. */
  if (ICodesets) {
    DropInterface((struct Interface *) ICodesets);
    ICodesets = NULL;
  }
  if (CodesetsBase) {
    CloseLibrary(CodesetsBase);
    CodesetsBase = NULL;
  }

  if (str == NULL || strcmp(str, "JustExit") == 0)
    return RETURN_OK;

  printf("Error::%s\n", str);
  return RETURN_ERROR;
}