/* This code is responsible for the encoding change * using codesets library. It requires the codesets * plugin to work. * * Heavily inspired from the encoding plugin * https://github.com/jgmdev/lite-xl-encoding */ #include #include #include #include #include #include "codesets.h" #if defined(__amigaos4__) struct Library *CodesetsBase = NULL; struct CodesetsIFace *ICodesets = NULL; #endif #if defined(__morphos__) struct Library *CharsetsBase = NULL; #endif typedef struct { const char* charset; unsigned char bom[4]; int len; } bom_t; /* * List of encodings that can have byte order marks. * Note: UTF-32 should be tested before UTF-16, the order matters. */ static bom_t bom_list[] = { { "UTF-8", {0xef, 0xbb, 0xbf}, 3 }, { "UTF-32LE", {0xff, 0xfe, 0x00, 0x00}, 4 }, { "UTF-32BE", {0x00, 0x00, 0xfe, 0xff}, 4 }, { "UTF-16LE", {0xff, 0xfe}, 2 }, { "UTF-16BE", {0xfe, 0xff}, 2 }, { "GB18030", {0x84, 0x31, 0x95, 0x33}, 4 }, { "UTF-7", {0x2b, 0x2f, 0x76, 0x38}, 4 }, { "UTF-7", {0x2b, 0x2f, 0x76, 0x39}, 4 }, { "UTF-7", {0x2b, 0x2f, 0x76, 0x2b}, 4 }, { "UTF-7", {0x2b, 0x2f, 0x76, 0x2f}, 4 }, { NULL } }; /* Get the applicable byte order marks for the given charset */ static const unsigned char* encoding_bom_from_charset(const char* charset, size_t* len) { for (size_t i=0; bom_list[i].charset != NULL; i++){ if (strcmp(bom_list[i].charset, charset) == 0) { if (len) *len = bom_list[i].len; return bom_list[i].bom; } } if (len) *len = 0; return NULL; } /* Detect the encoding of the given string if a valid bom sequence is found */ static const char* encoding_charset_from_bom( const char* string, size_t len, size_t* bom_len ) { const unsigned char* bytes = (unsigned char*) string; for (size_t i=0; bom_list[i].charset != NULL; i++) { if (len >= bom_list[i].len) { bool all_match = true; for (size_t b = 0; bname); // printf("Identified file as %s with %d of %d errors\n", cs->name, (int)errNum, (int)strlen(fileText)); } else { FClose(fileHandle); free(fileText); lua_pushnil(L); lua_pushstring(L, "could not detect the file encoding"); return 2; } return 1; #endif #if defined(__morphos__) lua_pushstring(L, "could not detect the file encoding"); return 2; #endif } int Lcodesets_systemCodeset(lua_State *L) { #if defined(__amigaos4__) struct codeset *systemCodeset; systemCodeset = CodesetsFindA(NULL, NULL); lua_pushstring(L, systemCodeset->name); #endif #if defined(__morphos__) char buf[16]; GetSystemCharset(buf, 16); lua_pushstring(L, buf); #endif return 1; } int Lcodesets_convert(lua_State *L) { const char* to = luaL_checkstring(L, 1); const char* from = luaL_checkstring(L, 2); size_t text_len = 0; #if defined(__amigaos4__) const char* text = luaL_checklstring(L, 3, &text_len); #endif #if defined(__morphos__) APTR text = luaL_checklstring(L, 3, &text_len); #endif /* conversion options */ bool strict = false; bool handle_to_bom = false; bool handle_from_bom = false; const unsigned char* bom; size_t bom_len = 0; if (text_len == 0) { lua_pushlstring(L, text, text_len); return 1; } if (lua_gettop(L) > 3 && lua_istable(L, 4)) { lua_getfield(L, 4, "handle_to_bom"); if (lua_isboolean(L, -1)) { handle_to_bom = lua_toboolean(L, -1); } lua_getfield(L, 4, "handle_from_bom"); if (lua_isboolean(L, -1)) { handle_from_bom = lua_toboolean(L, -1); } lua_getfield(L, 4, "strict"); if (lua_isboolean(L, -1)) { strict = lua_toboolean(L, -1); } } /* to strip the bom from the input text if any */ if (handle_from_bom) { encoding_charset_from_bom(text, text_len, &bom_len); } #if defined(__amigaos4__) char *output; ULONG output_len; ULONG errNum = 0; struct codeset *srcCodeset; struct codeset *destCodeset; srcCodeset = CodesetsFind(from, CSA_FallbackToDefault, FALSE, TAG_DONE); // srcCodeset = CodesetsFindBest(CSA_Source, text, // CSA_ErrPtr, &errNum, // TAG_DONE); if (!srcCodeset) { lua_pushnil(L); lua_pushfstring(L, "failed creating source codeset for '%s'", from); return 2; } destCodeset = CodesetsFind(to, CSA_FallbackToDefault, FALSE, TAG_DONE); if (!destCodeset) { lua_pushnil(L); lua_pushfstring(L, "failed creating destination codeset for '%s'", to); return 2; } output = CodesetsConvertStr(CSA_SourceCodeset, srcCodeset, CSA_DestCodeset, destCodeset, CSA_Source, text, CSA_DestLenPtr, &output_len, TAG_DONE); #endif #if defined(__morphos__) // LONG output_len = 0; ULONG fromMib = GetCharsetNumber(from, CSF_IANA_MIMENAME); if (fromMib == 0) fromMib = GetCharsetNumber(from, CSF_IANA_NAME); if (fromMib == 0) fromMib = GetCharsetNumber(from, CSF_IANA_ALIAS); ULONG toMib = GetCharsetNumber(to, CSF_IANA_MIMENAME); if (toMib == 0) toMib = GetCharsetNumber(to, CSF_IANA_NAME); if (toMib == 0) toMib = GetCharsetNumber(to, CSF_IANA_ALIAS); LONG output_len = GetByteSize((APTR)text, text_len, fromMib, toMib); char *output = calloc(output_len, sizeof(char) + 1); LONG dstEnc = 0; struct TagItem tags[] = { { CST_DoNotTerminate, FALSE }, { CST_GetDestEncoding, &dstEnc }, { TAG_DONE, 0 } }; LONG result = ConvertTagList((APTR)text, text_len, (APTR)output, output_len, fromMib, toMib, tags); if (result <= 0) { lua_pushfstring(L, "failed converting from '%s' to '%s'", from, to); free(output); return 2; } #endif // if (!output) // { // lua_pushnil(L); // lua_pushfstring(L, "failed converting from '%s' to '%s'", from, to); // CodesetsFreeA(output, NULL); // return 2; // } /* strip bom sometimes added when converting to utf-8, we don't need it */ if (output && strcmp(to, "UTF-8") == 0) { encoding_charset_from_bom(output, output_len, &bom_len); if (bom_len > 0) { memmove(output, output+bom_len, output_len-bom_len); output = realloc(output, output_len-bom_len); output_len -= bom_len; } } // if (!CodesetsIsValidUTF8(output)) // { // lua_pushnil(L); // lua_pushfstring(L, "not valid conversion from '%s' to '%s'", from, to); // CodesetsFreeA(output, NULL); // return 2; // } if (output != NULL && handle_to_bom) { if (handle_to_bom) { bom = encoding_bom_from_charset(to, &bom_len); if (bom != NULL) { output = realloc(output, output_len + bom_len); memmove(output+bom_len, output, output_len); memcpy(output, bom, bom_len); output_len += bom_len; } } } else if (!output) { lua_pushnil(L); lua_pushfstring(L, "failed converting from '%s' to '%s'", from, to); #if defined(__amigaos4__) CodesetsFreeA(output, NULL); #endif #if defined(__morphos__) free(output); #endif return 2; } lua_pushlstring(L, output, output_len); #if defined(__amigaos4__) CodesetsFreeA(output, NULL); #endif #if defined(__morphos__) free(output); #endif return 1; } /* * encoding.get_charset_bom(charset) * * Retrieve the byte order marks sequence for the given charset if applicable. * * Arguments: * charset, a string representing a valid iconv charset * * Returns: * The bom sequence string or empty string if not applicable. */ int Lcodesets_get_charset_bom(lua_State *L) { const char* charset = luaL_checkstring(L, 1); size_t bom_len = 0; const unsigned char* bom = encoding_bom_from_charset(charset, &bom_len); if (bom) lua_pushlstring(L, (char*)bom, bom_len); else lua_pushstring(L, ""); return 1; } /* * encoding.strip_bom(text, charset) * * Remove the byte order marks from the given string. * * Arguments: * text, a string that may contain a byte order marks to be removed. * charset, optional charset to scan for, if empty scan all charsets with bom. * * Returns: * The input text string with the byte order marks removed if found. */ int Lcodesets_strip_bom(lua_State* L) { size_t text_len = 0; const char* text = luaL_checklstring(L, 1, &text_len); const char* charset = luaL_optstring(L, 2, NULL); size_t bom_len = 0; if (text_len <= 0) { lua_pushstring(L, ""); } else { if (charset) { for (size_t i=0; bom_list[i].charset != NULL; i++) { if ( strcmp(bom_list[i].charset, charset) == 0 && text_len >= bom_list[i].len ) { bool bom_found = true; for (size_t b=0; b 0 && text_len-bom_len > 0) { lua_pushlstring(L, text+bom_len, text_len-bom_len); } else { lua_pushlstring(L, text, text_len); } return 1; } int luaopen_codesets (lua_State *L) { luaL_Reg libs[] = { #define ENTRY(name) { #name, Lcodesets_##name } ENTRY(detect), ENTRY(systemCodeset), ENTRY(convert), ENTRY(get_charset_bom), ENTRY(strip_bom), #undef ENTRY { NULL, NULL } }; luaL_newlib(L, libs); return 1; } int OpenLibs(void) { #if defined(__amigaos4__) if ((CodesetsBase = OpenLibrary( "codesets.library", 6 ))) { ICodesets = (struct CodesetsIFace *)GetInterface( CodesetsBase, "main", 1L, NULL ); if(!ICodesets) return CleanExit("Can't open codesets.library Interface"); } else return CleanExit("Can't open codesets.library version 6"); #endif #if defined(__morphos__) if ((CharsetsBase = OpenLibrary( "charsets.library", 53 )) == NULL) return CleanExit("Can't open charsets.library version 53"); #endif return RETURN_OK; } int CleanExit(const char *str) { #if defined(__amigaos4__) if(ICodesets) DropInterface((struct Interface *) ICodesets); if(CodesetsBase) CloseLibrary(CodesetsBase); #endif #if defined(__morphos__) if(CharsetsBase) CloseLibrary(CharsetsBase); #endif if(strcmp(str, "FineExit")) { printf("Error::%s\n", str); return RETURN_ERROR; } return RETURN_OK; }