lite-xl/src/platform/codesets.c

465 lines
12 KiB
C

/* This code is responsible for changing text encodings
* using the codesets library. It requires the codesets
* plugin to work.
*
* Heavily inspired by the encoding plugin
* https://github.com/jgmdev/lite-xl-encoding
*/
#include <SDL_stdinc.h>
#include <stdbool.h>
#include <lua.h>
#include <lauxlib.h>
#include <lualib.h>
#include "codesets.h"
#if defined(__amigaos4__)
struct Library *CodesetsBase = NULL;
struct CodesetsIFace *ICodesets = NULL;
#endif
#if defined(__morphos__)
struct Library *CharsetsBase = NULL;
#endif
/*
 * One byte order mark (BOM) signature.
 */
typedef struct {
  const char *charset;  /* charset the signature identifies; NULL ends the table */
  unsigned char bom[4]; /* signature bytes, only the first `len` are significant */
  size_t len;           /* number of significant bytes in `bom` */
} bom_t;

/*
 * List of encodings that can have byte order marks.
 *
 * Note: UTF-32 should be tested before UTF-16, the order matters: the
 * UTF-32LE mark begins with the same two bytes as the UTF-16LE mark, so the
 * longer signature has to be tried first.
 *
 * The table is only ever read, hence const. It is terminated by an entry
 * whose charset is NULL.
 */
static const bom_t bom_list[] = {
  { "UTF-8",    {0xef, 0xbb, 0xbf},       3 },
  { "UTF-32LE", {0xff, 0xfe, 0x00, 0x00}, 4 },
  { "UTF-32BE", {0x00, 0x00, 0xfe, 0xff}, 4 },
  { "UTF-16LE", {0xff, 0xfe},             2 },
  { "UTF-16BE", {0xfe, 0xff},             2 },
  { "GB18030",  {0x84, 0x31, 0x95, 0x33}, 4 },
  { "UTF-7",    {0x2b, 0x2f, 0x76, 0x38}, 4 },
  { "UTF-7",    {0x2b, 0x2f, 0x76, 0x39}, 4 },
  { "UTF-7",    {0x2b, 0x2f, 0x76, 0x2b}, 4 },
  { "UTF-7",    {0x2b, 0x2f, 0x76, 0x2f}, 4 },
  { NULL,       {0},                      0 }
};
/*
 * Look up the byte order mark registered for a charset.
 *
 * bom_list is walked up to its NULL-charset terminator and the search stops
 * at the first entry whose name compares equal to `charset`.
 *
 * Returns a pointer to that entry's BOM bytes and stores their count in
 * `*len`. When no entry matches, NULL is returned and 0 is stored.
 * `len` may be NULL if the caller does not need the length.
 */
static const unsigned char* encoding_bom_from_charset(const char* charset, size_t* len) {
  const unsigned char *found = NULL;
  size_t found_len = 0;
  size_t idx = 0;

  while (bom_list[idx].charset != NULL) {
    if (strcmp(bom_list[idx].charset, charset) == 0) {
      found = bom_list[idx].bom;
      found_len = bom_list[idx].len;
      break;
    }
    idx++;
  }

  if (len != NULL) {
    *len = found_len;
  }
  return found;
}
/*
 * Identify a charset from the byte order mark at the start of a buffer.
 *
 * Every bom_list entry that fits into `len` bytes is compared byte by byte
 * against the beginning of `string`; the first entry that matches completely
 * wins, which is why the table order matters.
 *
 * Returns the charset name of the matching entry and stores the length of its
 * BOM in `*bom_len`. Without a match, NULL is returned and 0 is stored.
 * `bom_len` may be NULL.
 */
static const char* encoding_charset_from_bom(
  const char* string, size_t len, size_t* bom_len
) {
  const unsigned char *data = (unsigned char *) string;
  const char *detected = NULL;
  size_t detected_len = 0;

  for (size_t entry = 0; bom_list[entry].charset != NULL; entry++) {
    size_t sig_len = bom_list[entry].len;

    /* the buffer is too short to hold this signature */
    if (len < sig_len) {
      continue;
    }

    size_t matched = 0;
    while (matched < sig_len && data[matched] == bom_list[entry].bom[matched]) {
      matched++;
    }

    if (matched == sig_len) {
      detected = bom_list[entry].charset;
      detected_len = sig_len;
      break;
    }
  }

  if (bom_len != NULL) {
    *bom_len = detected_len;
  }
  return detected;
}
/////////////////////////////////////////////////////////////////////
// Lua methods for codesets
/*
 * detect(filename)
 *
 * Guess the character encoding of a file.
 *
 * Arguments:
 *  filename, path of the file to inspect
 *
 * Returns:
 *  The name of the detected codeset, or nil followed by an error message.
 *
 * On AmigaOS 4 the whole file is loaded into memory and handed to
 * CodesetsFindBest. On every other target (including MorphOS) detection is
 * not implemented and the call always fails with nil plus a message.
 */
int Lcodesets_detect(lua_State *L) {
#if defined(__amigaos4__)
  const char* filename = luaL_checkstring(L, 1);

  BPTR fileHandle = FOpen(filename, MODE_OLDFILE, 0);
  if (!fileHandle) {
    lua_pushnil(L);
    lua_pushfstring(L, "unable to open file '%s', code=%d", filename, (int)IoErr());
    return 2;
  }

  ChangeFilePosition(fileHandle, 0, OFFSET_END);
  int64 fileSize = GetFileSize(fileHandle);
  if (fileSize < 0) {
    lua_pushnil(L);
    lua_pushfstring(L, "unable to read file '%s', code=%d", filename, (int)IoErr());
    FClose(fileHandle);
    return 2;
  }

  /* One extra byte for a terminator, which also keeps an empty file from
   * turning into a zero-sized allocation that may legitimately be NULL. */
  STRPTR fileText = malloc((size_t)fileSize + 1);
  if (!fileText) {
    lua_pushnil(L);
    lua_pushfstring(L, "out of ram while detecting charset of '%s'", filename);
    FClose(fileHandle);
    return 2;
  }

  ChangeFilePosition(fileHandle, 0, OFFSET_BEGINNING);
  LONG bytesRead = Read(fileHandle, fileText, fileSize);
  if (bytesRead < 0) {
    lua_pushnil(L);
    lua_pushfstring(L, "unable to read file '%s', code=%d", filename, (int)IoErr());
    FClose(fileHandle);
    free(fileText);
    return 2;
  }
  FClose(fileHandle);
  fileText[bytesRead] = '\0';

  /* The length is passed explicitly so detection never has to search the
   * buffer for a terminator the file contents do not provide. */
  ULONG errNum = 0;
  struct codeset *cs = CodesetsFindBest(CSA_Source, fileText,
                                        CSA_SourceLen, (ULONG)bytesRead,
                                        CSA_ErrPtr, &errNum,
                                        TAG_DONE);
  free(fileText);

  if (!cs) {
    lua_pushnil(L);
    lua_pushstring(L, "could not detect the file encoding");
    return 2;
  }

  lua_pushstring(L, cs->name);
  return 1;
#else
  /* No detection backend here: report failure as nil + message, so the
   * number of returned values matches what is actually on the stack. */
  lua_pushnil(L);
  lua_pushstring(L, "could not detect the file encoding");
  return 2;
#endif
}
/*
 * systemCodeset()
 *
 * Returns:
 *  The name of the system's default charset as a string, or nil followed by
 *  an error message when it cannot be determined.
 */
int Lcodesets_systemCodeset(lua_State *L) {
#if defined(__amigaos4__)
  /* A NULL name asks codesets.library for its default codeset. */
  struct codeset *systemCodeset = CodesetsFindA(NULL, NULL);
  if (systemCodeset == NULL) {
    lua_pushnil(L);
    lua_pushstring(L, "could not determine the system codeset");
    return 2;
  }
  lua_pushstring(L, systemCodeset->name);
  return 1;
#elif defined(__morphos__)
  /* Zero-initialised so an empty string is returned, rather than stack
   * garbage, should the library leave the buffer untouched. */
  char buf[16] = "";
  GetSystemCharset(buf, sizeof buf);
  lua_pushstring(L, buf);
  return 1;
#else
  lua_pushnil(L);
  lua_pushstring(L, "could not determine the system codeset");
  return 2;
#endif
}
/* Read a boolean field from the options table at `table_index`.
 * Returns false when the field is missing or not a boolean. The Lua stack is
 * left exactly as it was found. */
static bool codesets_option_enabled(lua_State *L, int table_index, const char *key) {
  bool enabled = false;
  lua_getfield(L, table_index, key);
  if (lua_isboolean(L, -1)) {
    enabled = lua_toboolean(L, -1) != 0;
  }
  lua_pop(L, 1);
  return enabled;
}

/* Length of the UTF-8 byte order mark at the start of `data`, 0 if absent. */
static size_t codesets_utf8_bom_length(const char *data, size_t len) {
  const unsigned char *bytes = (const unsigned char *) data;
  if (len >= 3 && bytes[0] == 0xef && bytes[1] == 0xbb && bytes[2] == 0xbf) {
    return 3;
  }
  return 0;
}

/* Push `data` as a Lua string. When `add_bom` is set and charset `to` has a
 * byte order mark, the mark is placed in front of the data. */
static void codesets_push_converted(
  lua_State *L, const char *to, bool add_bom, const char *data, size_t len
) {
  size_t bom_len = 0;
  const unsigned char *bom = add_bom ? encoding_bom_from_charset(to, &bom_len) : NULL;

  if (bom != NULL && bom_len > 0) {
    lua_pushlstring(L, (const char *) bom, bom_len);
    lua_pushlstring(L, data, len);
    lua_concat(L, 2);
  } else {
    lua_pushlstring(L, data, len);
  }
}

/*
 * convert(to, from, text, options)
 *
 * Convert a string from one charset to another.
 *
 * Arguments:
 *  to, name of the charset to convert to
 *  from, name of the charset the text is currently encoded in
 *  text, the string to convert
 *  options, optional table with the boolean fields:
 *    handle_from_bom, skip a byte order mark found at the start of text
 *    handle_to_bom, prepend the byte order mark of `to` to the result
 *  (a `strict` field is accepted but has no effect in this backend)
 *
 * Returns:
 *  The converted string, or nil followed by an error message.
 *  An empty text is returned unchanged.
 */
int Lcodesets_convert(lua_State *L) {
  const char *to = luaL_checkstring(L, 1);
  const char *from = luaL_checkstring(L, 2);
  size_t text_len = 0;
  const char *text = luaL_checklstring(L, 3, &text_len);

  /* conversion options */
  bool handle_to_bom = false;
  bool handle_from_bom = false;

  if (text_len == 0) {
    lua_pushlstring(L, text, text_len);
    return 1;
  }

  if (lua_gettop(L) > 3 && lua_istable(L, 4)) {
    handle_to_bom = codesets_option_enabled(L, 4, "handle_to_bom");
    handle_from_bom = codesets_option_enabled(L, 4, "handle_from_bom");
  }

  /* strip the bom from the input text if any, by converting only what
   * follows it */
  size_t skip_in = 0;
  if (handle_from_bom) {
    encoding_charset_from_bom(text, text_len, &skip_in);
  }
  const char *src = text + skip_in;
  size_t src_len = text_len - skip_in;

  if (src_len == 0) {
    /* the input was nothing but a byte order mark */
    codesets_push_converted(L, to, handle_to_bom, "", 0);
    return 1;
  }

  char *output = NULL;
  size_t output_len = 0;

#if defined(__amigaos4__)
  struct codeset *srcCodeset = CodesetsFind(from, CSA_FallbackToDefault, FALSE, TAG_DONE);
  if (!srcCodeset) {
    lua_pushnil(L);
    lua_pushfstring(L, "failed creating source codeset for '%s'", from);
    return 2;
  }

  struct codeset *destCodeset = CodesetsFind(to, CSA_FallbackToDefault, FALSE, TAG_DONE);
  if (!destCodeset) {
    lua_pushnil(L);
    lua_pushfstring(L, "failed creating destination codeset for '%s'", to);
    return 2;
  }

  /* The source length is given explicitly: Lua strings may contain NUL
   * bytes (UTF-16/UTF-32 text always does), so the input must not be
   * measured up to its first terminator. */
  ULONG converted_len = 0;
  output = CodesetsConvertStr(CSA_SourceCodeset, srcCodeset,
                              CSA_DestCodeset, destCodeset,
                              CSA_Source, src,
                              CSA_SourceLen, (ULONG)src_len,
                              CSA_DestLenPtr, &converted_len,
                              TAG_DONE);
  output_len = converted_len;
#elif defined(__morphos__)
  ULONG fromMib = GetCharsetNumber(from, CSF_IANA_MIMENAME);
  if (fromMib == 0)
    fromMib = GetCharsetNumber(from, CSF_IANA_NAME);
  if (fromMib == 0)
    fromMib = GetCharsetNumber(from, CSF_IANA_ALIAS);

  ULONG toMib = GetCharsetNumber(to, CSF_IANA_MIMENAME);
  if (toMib == 0)
    toMib = GetCharsetNumber(to, CSF_IANA_NAME);
  if (toMib == 0)
    toMib = GetCharsetNumber(to, CSF_IANA_ALIAS);

  LONG needed = GetByteSize((APTR)src, src_len, fromMib, toMib);
  if (needed > 0) {
    /* NOTE(review): twice the reported size is allocated, as before; the
     * terminator requirements of ConvertTagList are not visible from here,
     * so the slack is kept - confirm against the charsets.library docs. */
    output = calloc((size_t)needed, sizeof(char) + 1);
  }
  if (output != NULL) {
    LONG dstEnc = 0;
    struct TagItem tags[] = { { CST_DoNotTerminate, FALSE }, { CST_GetDestEncoding, &dstEnc }, { TAG_DONE, 0 } };
    LONG result = ConvertTagList((APTR)src, src_len, (APTR)output, needed, fromMib, toMib, tags);
    if (result <= 0) {
      free(output);
      output = NULL;
    } else {
      output_len = (size_t)needed;
    }
  }
#endif

  if (!output) {
    lua_pushnil(L);
    lua_pushfstring(L, "failed converting from '%s' to '%s'", from, to);
    return 2;
  }

  /* strip bom sometimes added when converting to utf-8, we don't need it.
   * Only the UTF-8 mark is looked for, so regular text that happens to start
   * like another charset's mark is left alone. The converted buffer is never
   * resized: it may belong to the conversion library's own allocator, so the
   * mark is skipped with an offset instead. */
  size_t skip_out = 0;
  if (strcmp(to, "UTF-8") == 0) {
    skip_out = codesets_utf8_bom_length(output, output_len);
  }

  codesets_push_converted(L, to, handle_to_bom, output + skip_out, output_len - skip_out);

#if defined(__amigaos4__)
  CodesetsFreeA(output, NULL);
#else
  free(output);
#endif
  return 1;
}
/*
 * get_charset_bom(charset)
 *
 * Look up the byte order mark that belongs to a charset.
 *
 * Arguments:
 *  charset, a string naming the charset to look up
 *
 * Returns:
 *  A string holding the bom bytes, or an empty string when the charset
 *  has no byte order mark.
 */
int Lcodesets_get_charset_bom(lua_State *L) {
  size_t length = 0;
  const char* name = luaL_checkstring(L, 1);
  const unsigned char* mark = encoding_bom_from_charset(name, &length);

  if (mark == NULL) {
    lua_pushstring(L, "");
    return 1;
  }

  lua_pushlstring(L, (char*)mark, length);
  return 1;
}
/*
 * strip_bom(text, charset)
 *
 * Remove the byte order marks from the given string.
 *
 * Arguments:
 *  text, a string that may contain a byte order marks to be removed.
 *  charset, optional charset to scan for, if omitted scan all charsets
 *           with bom.
 *
 * Returns:
 *  The input text string with the byte order marks removed if found.
 *  A text that consists of nothing but a byte order mark yields an
 *  empty string.
 */
int Lcodesets_strip_bom(lua_State* L) {
  size_t text_len = 0;
  const char* text = luaL_checklstring(L, 1, &text_len);
  const char* charset = luaL_optstring(L, 2, NULL);
  size_t bom_len = 0;

  if (text_len > 0) {
    if (charset) {
      /* A charset can own several marks (UTF-7 does), so every entry
       * carrying that name is tried until one matches. */
      for (size_t i = 0; bom_list[i].charset != NULL; i++) {
        size_t sig_len = bom_list[i].len;

        if (strcmp(bom_list[i].charset, charset) != 0 || text_len < sig_len) {
          continue;
        }

        size_t matched = 0;
        while (
          matched < sig_len
          &&
          bom_list[i].bom[matched] == (unsigned char)text[matched]
        ) {
          matched++;
        }

        if (matched == sig_len) {
          bom_len = sig_len;
          break;
        }
      }
    } else {
      encoding_charset_from_bom(text, text_len, &bom_len);
    }
  }

  /* bom_len never exceeds text_len, and is 0 when nothing was found, so a
   * single push covers the stripped, unchanged and empty cases alike. */
  lua_pushlstring(L, text + bom_len, text_len - bom_len);
  return 1;
}
/*
 * Module entry point: builds a new table holding the codesets functions
 * (detect, systemCodeset, convert, get_charset_bom, strip_bom) and leaves
 * it on the Lua stack as the single return value.
 */
int luaopen_codesets (lua_State *L) {
  luaL_Reg libs[] = {
    { "detect",          Lcodesets_detect },
    { "systemCodeset",   Lcodesets_systemCodeset },
    { "convert",         Lcodesets_convert },
    { "get_charset_bom", Lcodesets_get_charset_bom },
    { "strip_bom",       Lcodesets_strip_bom },
    { NULL, NULL }
  };

  luaL_newlib(L, libs);
  return 1;
}
/*
 * Open the platform's charset conversion library.
 *
 * AmigaOS 4: opens codesets.library (version 6) and obtains its "main"
 * interface. MorphOS: opens charsets.library (version 53).
 *
 * Returns RETURN_OK on success. On failure the result of CleanExit() is
 * returned, which has already released whatever was opened and printed the
 * reason.
 */
int OpenLibs(void)
{
#if defined(__amigaos4__)
  CodesetsBase = OpenLibrary( "codesets.library", 6 );
  if (!CodesetsBase) {
    return CleanExit("Can't open codesets.library version 6");
  }

  ICodesets = (struct CodesetsIFace *)GetInterface( CodesetsBase, "main", 1L, NULL );
  if (!ICodesets) {
    return CleanExit("Can't open codesets.library Interface");
  }
#endif
#if defined(__morphos__)
  CharsetsBase = OpenLibrary( "charsets.library", 53 );
  if (CharsetsBase == NULL) {
    return CleanExit("Can't open charsets.library version 53");
  }
#endif
  return RETURN_OK;
}
/*
 * Release the charset conversion library opened by OpenLibs().
 *
 * Arguments:
 *  str, the reason for shutting down. The exact string "FineExit" marks a
 *       regular shutdown; any other text is treated as an error message.
 *
 * Returns:
 *  RETURN_OK for "FineExit", otherwise the message is printed and
 *  RETURN_ERROR is returned.
 *
 * Every pointer is reset once released, so calling this again (for example
 * a regular shutdown after OpenLibs() already failed and cleaned up) does
 * not close the same library twice.
 */
int CleanExit(const char *str)
{
#if defined(__amigaos4__)
  if (ICodesets) {
    DropInterface((struct Interface *) ICodesets);
    ICodesets = NULL;
  }
  if (CodesetsBase) {
    CloseLibrary(CodesetsBase);
    CodesetsBase = NULL;
  }
#endif
#if defined(__morphos__)
  if (CharsetsBase) {
    CloseLibrary(CharsetsBase);
    CharsetsBase = NULL;
  }
#endif
  if (strcmp(str, "FineExit") != 0) {
    printf("Error::%s\n", str);
    return RETURN_ERROR;
  }
  return RETURN_OK;
}