378 lines
9.6 KiB
C
378 lines
9.6 KiB
C
/* This code is responsible for changing text encodings
 * using the codesets library. It requires the codesets
 * plugin to work.
 *
 * Heavily inspired by the encoding plugin
 * https://github.com/jgmdev/lite-xl-encoding
 */
|
|
#include <SDL2/SDL_stdinc.h>
|
|
#include <stdbool.h>
|
|
|
|
#include <lua.h>
|
|
#include <lauxlib.h>
|
|
#include <lualib.h>
|
|
|
|
#include "codesets.h"
|
|
|
|
/* Library base for codesets.library; assigned by OpenLibs() via OpenLibrary()
 * and closed by CleanExit(). NULL until OpenLibs() succeeds. */
struct Library *CodesetsBase = NULL;
|
|
/* "main" interface of codesets.library; obtained by OpenLibs() via
 * GetInterface() and dropped by CleanExit(). NULL until OpenLibs() succeeds. */
struct CodesetsIFace *ICodesets = NULL;
|
|
|
|
/*
 * One byte order mark entry: the charset name and the byte sequence
 * that identifies it.
 */
typedef struct {
  const char *charset;   /* charset name; NULL marks the end of a table */
  unsigned char bom[4];  /* mark bytes, only the first `len` are significant */
  int len;               /* number of significant bytes in `bom` */
} bom_t;
|
|
|
|
/*
|
|
* List of encodings that can have byte order marks.
|
|
* Note: UTF-32 should be tested before UTF-16, the order matters.
|
|
*/
|
|
static bom_t bom_list[] = {
|
|
{ "UTF-8", {0xef, 0xbb, 0xbf}, 3 },
|
|
{ "UTF-32LE", {0xff, 0xfe, 0x00, 0x00}, 4 },
|
|
{ "UTF-32BE", {0x00, 0x00, 0xfe, 0xff}, 4 },
|
|
{ "UTF-16LE", {0xff, 0xfe}, 2 },
|
|
{ "UTF-16BE", {0xfe, 0xff}, 2 },
|
|
{ "GB18030", {0x84, 0x31, 0x95, 0x33}, 4 },
|
|
{ "UTF-7", {0x2b, 0x2f, 0x76, 0x38}, 4 },
|
|
{ "UTF-7", {0x2b, 0x2f, 0x76, 0x39}, 4 },
|
|
{ "UTF-7", {0x2b, 0x2f, 0x76, 0x2b}, 4 },
|
|
{ "UTF-7", {0x2b, 0x2f, 0x76, 0x2f}, 4 },
|
|
{ NULL }
|
|
};
|
|
|
|
/* Get the applicable byte order marks for the given charset */
|
|
static const unsigned char* encoding_bom_from_charset(const char* charset, size_t* len) {
|
|
for (size_t i=0; bom_list[i].charset != NULL; i++){
|
|
if (strcmp(bom_list[i].charset, charset) == 0) {
|
|
if (len) *len = bom_list[i].len;
|
|
return bom_list[i].bom;
|
|
}
|
|
}
|
|
|
|
if (len) *len = 0;
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/* Detect the encoding of the given string if a valid bom sequence is found */
|
|
static const char* encoding_charset_from_bom(
|
|
const char* string, size_t len, size_t* bom_len
|
|
) {
|
|
const unsigned char* bytes = (unsigned char*) string;
|
|
|
|
for (size_t i=0; bom_list[i].charset != NULL; i++) {
|
|
if (len >= bom_list[i].len) {
|
|
bool all_match = true;
|
|
for (size_t b = 0; b<bom_list[i].len; b++) {
|
|
if (bytes[b] != bom_list[i].bom[b]) {
|
|
all_match = false;
|
|
break;
|
|
}
|
|
}
|
|
if (all_match) {
|
|
if (bom_len) *bom_len = bom_list[i].len;
|
|
return bom_list[i].charset;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (bom_len)
|
|
*bom_len = 0;
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////////////////
|
|
// Lua methods for codesets
|
|
|
|
/*
 * detect(filename)
 *
 * Load the whole file into memory and ask codesets.library
 * (CodesetsFindBest) which codeset fits its content best.
 *
 * Arguments:
 *  filename, path of the file to inspect.
 *
 * Returns:
 *  The name of the detected codeset, or nil plus an error message when the
 *  file cannot be opened, sized or read, when memory runs out, or when no
 *  codeset could be determined.
 */
int Lcodesets_detect(lua_State *L) {
  const char* filename = luaL_checkstring(L, 1);

  BPTR fileHandle = FOpen(filename, MODE_OLDFILE, 0);
  if (!fileHandle)
  {
    lua_pushnil(L);
    lua_pushfstring(L, "unable to open file '%s', code=%d", filename, (int)IoErr());
    return 2;
  }

  ChangeFilePosition(fileHandle, 0, OFFSET_END);

  int64 fileSize = GetFileSize( fileHandle );
  if (fileSize < 0)
  {
    int code = (int)IoErr();
    FClose(fileHandle);
    lua_pushnil(L);
    lua_pushfstring(L, "unable to get the size of file '%s', code=%d", filename, code);
    return 2;
  }

  /* the buffer needs one extra byte for the terminator, so the size plus
   * one has to be representable as a size_t */
  if ((unsigned long long)fileSize >= (unsigned long long)(size_t)-1)
  {
    FClose(fileHandle);
    lua_pushnil(L);
    lua_pushfstring(L, "file '%s' is too large to detect its charset", filename);
    return 2;
  }

  /* +1: the library treats the source as a C string, so it must be
   * NUL-terminated; this also keeps the request non-zero for empty files */
  STRPTR fileText = malloc((size_t)fileSize + 1);

  if (!fileText) {
    lua_pushnil(L);
    lua_pushfstring(L, "out of ram while detecting charset of '%s'", filename);
    FClose(fileHandle);
    return 2;
  }

  ChangeFilePosition(fileHandle, 0, OFFSET_BEGINNING);
  long bytesRead = Read(fileHandle, fileText, fileSize);
  if (bytesRead < 0)
  {
    int code = (int)IoErr();
    FClose(fileHandle);
    free(fileText);
    lua_pushnil(L);
    lua_pushfstring(L, "unable to read file '%s', code=%d", filename, code);
    return 2;
  }
  FClose(fileHandle);

  /* terminate after the bytes that were actually read */
  fileText[bytesRead] = '\0';

  ULONG errNum = 0;
  struct codeset *cs = CodesetsFindBest(CSA_Source, fileText,
                                        CSA_SourceLen, (ULONG)bytesRead,
                                        CSA_ErrPtr, &errNum,
                                        TAG_DONE);

  /* the content is no longer needed once the library has inspected it */
  free(fileText);

  if (!cs)
  {
    lua_pushnil(L);
    lua_pushstring(L, "could not detect the file encoding");
    return 2;
  }

  lua_pushstring(L, cs->name);
  return 1;
}
|
|
|
|
/*
 * systemCodeset()
 *
 * Ask codesets.library for a codeset without naming one
 * (CodesetsFindA(NULL, NULL)) and report its name.
 *
 * Returns:
 *  The codeset name, or nil plus an error message when the library does
 *  not return a codeset.
 */
int Lcodesets_systemCodeset(lua_State *L) {
  struct codeset *systemCodeset = CodesetsFindA(NULL, NULL);

  /* guard against dereferencing a missing codeset */
  if (!systemCodeset) {
    lua_pushnil(L);
    lua_pushstring(L, "could not determine the system codeset");
    return 2;
  }

  lua_pushstring(L, systemCodeset->name);

  return 1;
}
|
|
|
|
/*
 * convert(to, from, text, options)
 *
 * Convert `text` from the `from` codeset to the `to` codeset using
 * codesets.library.
 *
 * Arguments:
 *  to, name of the destination codeset.
 *  from, name of the source codeset.
 *  text, the string to convert (may contain embedded zero bytes).
 *  options, optional table with boolean fields:
 *    handle_from_bom, skip a byte order mark found at the start of `text`.
 *    handle_to_bom, prepend the byte order mark of `to` to the result,
 *                   when bom_list has one for it.
 *    (a `strict` field may be present but is not used by this
 *    implementation)
 *
 * Returns:
 *  The converted string, or nil plus an error message when a codeset is
 *  unknown or the conversion fails.
 */
int Lcodesets_convert(lua_State *L) {
  const char* to = luaL_checkstring(L, 1);
  const char* from = luaL_checkstring(L, 2);
  size_t text_len = 0;
  const char* text = luaL_checklstring(L, 3, &text_len);

  /* conversion options */
  bool handle_to_bom = false;
  bool handle_from_bom = false;

  if (lua_gettop(L) > 3 && lua_istable(L, 4)) {
    lua_getfield(L, 4, "handle_to_bom");
    if (lua_isboolean(L, -1)) {
      handle_to_bom = lua_toboolean(L, -1);
    }
    lua_pop(L, 1);

    lua_getfield(L, 4, "handle_from_bom");
    if (lua_isboolean(L, -1)) {
      handle_from_bom = lua_toboolean(L, -1);
    }
    lua_pop(L, 1);
  }

  /* number of leading input bytes to skip because they are a bom */
  size_t src_skip = 0;
  if (handle_from_bom) {
    encoding_charset_from_bom(text, text_len, &src_skip);
  }

  struct codeset *srcCodeset =
    CodesetsFind(from, CSA_FallbackToDefault, FALSE, TAG_DONE);
  if (!srcCodeset)
  {
    lua_pushnil(L);
    lua_pushfstring(L, "failed creating source codeset for '%s'", from);
    return 2;
  }

  struct codeset *destCodeset =
    CodesetsFind(to, CSA_FallbackToDefault, FALSE, TAG_DONE);
  if (!destCodeset)
  {
    lua_pushnil(L);
    lua_pushfstring(L, "failed creating destination codeset for '%s'", to);
    return 2;
  }

  /* pass the length explicitly so the input is not cut at the first
   * zero byte */
  ULONG output_len = 0;
  char *output = CodesetsConvertStr(CSA_SourceCodeset, srcCodeset,
                                    CSA_DestCodeset, destCodeset,
                                    CSA_Source, text + src_skip,
                                    CSA_SourceLen, (ULONG)(text_len - src_skip),
                                    CSA_DestLenPtr, &output_len,
                                    TAG_DONE);
  if (!output) {
    lua_pushnil(L);
    lua_pushfstring(L, "failed converting from '%s' to '%s'", from, to);
    return 2;
  }

  /*
   * `output` belongs to codesets.library and must only be released with
   * CodesetsFreeA(), so it is never resized here; a window into it is
   * tracked instead.
   */
  const char *result = output;
  size_t result_len = output_len;

  /* strip bom sometimes added when converting to utf-8, we don't need it */
  if (strcmp(to, "UTF-8") == 0) {
    size_t out_bom_len = 0;
    const char *out_charset =
      encoding_charset_from_bom(result, result_len, &out_bom_len);
    /* only a real UTF-8 mark is removed, not text that merely looks like
     * the mark of another encoding (e.g. "+/v8") */
    if (out_charset != NULL && strcmp(out_charset, "UTF-8") == 0) {
      result += out_bom_len;
      result_len -= out_bom_len;
    }
  }

  size_t bom_len = 0;
  const unsigned char *bom =
    handle_to_bom ? encoding_bom_from_charset(to, &bom_len) : NULL;

  if (bom != NULL) {
    /* let Lua join mark and text instead of growing the library buffer */
    lua_pushlstring(L, (const char*)bom, bom_len);
    lua_pushlstring(L, result, result_len);
    lua_concat(L, 2);
  } else {
    lua_pushlstring(L, result, result_len);
  }

  CodesetsFreeA(output, NULL);
  return 1;
}
|
|
|
|
/*
|
|
* encoding.get_charset_bom(charset)
|
|
*
|
|
* Retrieve the byte order marks sequence for the given charset if applicable.
|
|
*
|
|
* Arguments:
|
|
* charset, a string representing a valid iconv charset
|
|
*
|
|
* Returns:
|
|
* The bom sequence string or empty string if not applicable.
|
|
*/
|
|
int Lcodesets_get_charset_bom(lua_State *L) {
|
|
const char* charset = luaL_checkstring(L, 1);
|
|
|
|
size_t bom_len = 0;
|
|
const unsigned char* bom = encoding_bom_from_charset(charset, &bom_len);
|
|
|
|
if (bom)
|
|
lua_pushlstring(L, (char*)bom, bom_len);
|
|
else
|
|
lua_pushstring(L, "");
|
|
|
|
return 1;
|
|
}
|
|
|
|
/*
|
|
* encoding.strip_bom(text, charset)
|
|
*
|
|
* Remove the byte order marks from the given string.
|
|
*
|
|
* Arguments:
|
|
* text, a string that may contain a byte order marks to be removed.
|
|
* charset, optional charset to scan for, if empty scan all charsets with bom.
|
|
*
|
|
* Returns:
|
|
* The input text string with the byte order marks removed if found.
|
|
*/
|
|
int Lcodesets_strip_bom(lua_State* L) {
|
|
size_t text_len = 0;
|
|
const char* text = luaL_checklstring(L, 1, &text_len);
|
|
const char* charset = luaL_optstring(L, 2, NULL);
|
|
size_t bom_len = 0;
|
|
|
|
if (text_len <= 0) {
|
|
lua_pushstring(L, "");
|
|
} else {
|
|
if (charset) {
|
|
for (size_t i=0; bom_list[i].charset != NULL; i++) {
|
|
if (
|
|
strcmp(bom_list[i].charset, charset) == 0
|
|
&&
|
|
text_len >= bom_list[i].len
|
|
) {
|
|
bool bom_found = true;
|
|
for (size_t b=0; b<bom_list[i].len; b++) {
|
|
if (bom_list[i].bom[b] != (unsigned char)text[b]) {
|
|
bom_found = false;
|
|
break;
|
|
}
|
|
}
|
|
if (bom_found) {
|
|
bom_len = bom_list[i].len;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
encoding_charset_from_bom(text, text_len, &bom_len);
|
|
}
|
|
}
|
|
|
|
if (bom_len > 0 && text_len-bom_len > 0) {
|
|
lua_pushlstring(L, text+bom_len, text_len-bom_len);
|
|
} else {
|
|
lua_pushlstring(L, text, text_len);
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
|
|
/*
 * Module entry point: creates a new table holding the functions of this
 * file and leaves it on the Lua stack as the single return value.
 */
int luaopen_codesets (lua_State *L) {
  luaL_Reg exported[] = {
    { "detect",          Lcodesets_detect },
    { "systemCodeset",   Lcodesets_systemCodeset },
    { "convert",         Lcodesets_convert },
    { "get_charset_bom", Lcodesets_get_charset_bom },
    { "strip_bom",       Lcodesets_strip_bom },
    { NULL, NULL }
  };

  luaL_newlib(L, exported);

  return 1;
}
|
|
|
|
|
|
int OpenLibs(void)
|
|
{
|
|
if ((CodesetsBase = OpenLibrary( "codesets.library", 6 )))
|
|
{
|
|
ICodesets = (struct CodesetsIFace *)GetInterface( CodesetsBase, "main", 1L, NULL );
|
|
if(!ICodesets) return CleanExit("Can't open codesets.library Interface");
|
|
}
|
|
else return CleanExit("Can't open codesets.library version 6");
|
|
return RETURN_OK;
|
|
}
|
|
|
|
int CleanExit(const char *str)
|
|
{
|
|
if(ICodesets) DropInterface((struct Interface *) ICodesets);
|
|
if(CodesetsBase) CloseLibrary(CodesetsBase);
|
|
|
|
if(strcmp(str, "JustExit"))
|
|
{
|
|
printf("Error::%s\n", str);
|
|
return RETURN_ERROR;
|
|
}
|
|
return RETURN_OK;
|
|
}
|