lite-xl/resources/amiga/addons/plugins/codesets.lua

--mod-version:3 --priority:5

--[[
  This code is responsible for the encoding change
  using codesets library. It requires LiteXL 2.1.1r3
  and above to work.

  Heavily inspired from the encoding plugin
  https://github.com/jgmdev/lite-xl-encoding

  Configuration:
    useSystemEncoding
      By default the system encoding is used to open
      a file. If you want to disable that you may add
      the following line in you config file
      config.plugins.codesets.useSystemEncoding = false
]]

local core = require "core"
local common = require "core.common"
local command = require "core.command"
local config = require "core.config"
local style = require "core.style"
local Doc = require "core.doc"
local DocView = require "core.docview"
local CommandView = require "core.commandview"
local StatusView = require "core.statusview"

---@type encoding
local encoding = require "codesetsextra"
config.plugins.codesets = common.merge({
  useSystemEncoding = true
}, config.plugins.codesets)

-- Reference to plugin config
local conf = config.plugins.codesets

local encodings = {}

---@class encodings.encoding
---@field charset string
---@field name string

---List of encoding regions.
---@type table<integer,string>
encodings.groups = {
  "West European",
  "East European",
  "East Asian",
  "SE & SW Asian",
  "Middle Eastern",
  "Unicode"
}

---Supported iconv encodings grouped by region.
---@type table<integer,encodings.encoding[]>
encodings.list = {
  -- West European
  {
    { charset = "ISO-8859-14",  name = "Celtic"         },
    { charset = "ISO-8859-7",   name = "Greek"          },
    { charset = "WINDOWS-1253", name = "Greek"          },
    { charset = "ISO-8859-10",  name = "Nordic"         },
    { charset = "ISO-8859-3",   name = "South European" },
    { charset = "IBM850",       name = "Western"        },
    { charset = "ISO-8859-1",   name = "Western"        },
    { charset = "ISO-8859-15",  name = "Western"        },
    { charset = "WINDOWS-1252", name = "Western"        }
  },
  -- East European
  {
    { charset = "ISO-8859-4",   name = "Baltic"             },
    { charset = "ISO-8859-13",  name = "Baltic"             },
    { charset = "WINDOWS-1257", name = "Baltic"             },
    { charset = "IBM852",       name = "Central European"   },
    { charset = "ISO-8859-2",   name = "Central European"   },
    { charset = "WINDOWS-1250", name = "Central European"   },
    { charset = "IBM855",       name = "Cyrillic"           },
    { charset = "ISO-8859-5",   name = "Cyrillic"           },
    { charset = "ISO-IR-111",   name = "Cyrillic"           },
    { charset = "KOI8-R",       name = "Cyrillic"           },
    { charset = "WINDOWS-1251", name = "Cyrillic"           },
    { charset = "CP866",        name = "Cyrillic/Russian"   },
    { charset = "KOI8-U",       name = "Cyrillic/Ukrainian" },
    { charset = "ISO-8859-16",  name = "Romanian"           }
  },
  -- East Asian
  {
    { charset = "GB18030",     name = "Chinese Simplified" },
    { charset = "GB2312",      name = "Chinese Simplified" },
    { charset = "GBK",         name = "Chinese Simplified" },
    { charset = "HZ",          name = "Chinese Simplified" },
    { charset = "BIG5",        name = "Chinese Traditional" },
    { charset = "BIG5-HKSCS",  name = "Chinese Traditional" },
    { charset = "EUC-TW",      name = "Chinese Traditional" },
    { charset = "EUC-JP",      name = "Japanese" },
    { charset = "ISO-2022-JP", name = "Japanese" },
    { charset = "SHIFT_JIS",   name = "Japanese" },
    { charset = "CP932",       name = "Japanese" },
    { charset = "EUC-KR",      name = "Korean" },
    { charset = "ISO-2022-KR", name = "Korean" },
    { charset = "JOHAB",       name = "Korean" },
    { charset = "UHC",         name = "Korean" }
  },
  -- SE & SW Asian
  {
    { charset = "ARMSCII-8",        name = "Armenian"   },
    { charset = "GEORGIAN-ACADEMY", name = "Georgian"   },
    { charset = "TIS-620",          name = "Thai"       },
    { charset = "IBM857",           name = "Turkish"    },
    { charset = "WINDOWS-1254",     name = "Turkish"    },
    { charset = "ISO-8859-9",       name = "Turkish"    },
    { charset = "TCVN",             name = "Vietnamese" },
    { charset = "VISCII",           name = "Vietnamese" },
    { charset = "WINDOWS-1258",     name = "Vietnamese" }
  },
  -- Middle Eastern
  {
    { charset = "IBM864",       name = "Arabic"        },
    { charset = "ISO-8859-6",   name = "Arabic"        },
    { charset = "WINDOWS-1256", name = "Arabic"        },
    { charset = "IBM862",       name = "Hebrew"        },
    { charset = "ISO-8859-8-I", name = "Hebrew"        },
    { charset = "WINDOWS-1255", name = "Hebrew"        },
    { charset = "ISO-8859-8",   name = "Hebrew Visual" }
  },
  -- Unicode
  {
    { charset = "UTF-7",    name = "Unicode" },
    { charset = "UTF-8",    name = "Unicode" },
    { charset = "UTF-16LE", name = "Unicode" },
    { charset = "UTF-16BE", name = "Unicode" },
    { charset = "UCS-2LE",  name = "Unicode" },
    { charset = "UCS-2BE",  name = "Unicode" },
    { charset = "UTF-32LE", name = "Unicode" },
    { charset = "UTF-32BE", name = "Unicode" }
  }
};

---Get the list of encodings associated to a region.
---@param label string
---@return encodings.encoding[] | nil
function encodings.get_group(label)
  for idx, name in ipairs(encodings.groups) do
    if name == label then
      return encodings.list[idx]
    end
  end
end

---Get the list of encodings associated to a region.
---@return encodings.encoding[] | nil
function encodings.get_all()
  local all = {}
  for idx, _ in ipairs(encodings.groups) do
      for _, item in ipairs(encodings.list[idx]) do
        table.insert(all, item)
      end
  end
  return all
end

---Open a commandview to select a charset and executes the given callback,
---@param title_label string Title displayed on the commandview
---@param callback fun(charset: string)
function encodings.select_encoding(title_label, callback)
  core.command_view:enter(title_label, {
    submit = function(_, item)
      callback(item.charset)
    end,
    suggest = function(text)
      local charsets = encodings.get_all()
      local list_labels = {}
      local list_charset = {}
      for _, element in ipairs(charsets) do
        local label = element.name .. " (" .. element.charset .. ")"
        table.insert(list_labels, label)
        list_charset[label] = element.charset
      end
      local res = common.fuzzy_match(list_labels, text)
      for i, name in ipairs(res) do
        res[i] = {
          text = name,
          charset = list_charset[name]
        }
      end
      return res
    end
  })
end

--------------------------------------------------------------------------------
-- Overwrite Doc methods to properly add encoding detection and conversion.
--------------------------------------------------------------------------------
function Doc:new(filename, abs_filename, new_file)
  self.new_file = new_file
  self.encoding = nil
  self.convert = false
  self:reset()
  if filename then
    self:set_filename(filename, abs_filename)
    if not new_file then
      self:load(filename)
    end
  end
end

function Doc:load(filename)
  if not self.encoding then
    local errmsg
    if conf.useSystemEncoding then
      self.encoding, errmsg = encoding.systemCodeset();
    else
      self.encoding, errmsg = encoding.detect(filename);
    end
    if not self.encoding then core.error("%s", errmsg) error(errmsg) end
  end
  self.convert = false
  if self.encoding ~= "UTF-8" and self.encoding ~= "ASCII"
    and self.encoding ~= "US-ASCII" and self.encoding ~= "ISO-8859-1"
  then
    self.convert = true
  end
  local fp = assert( io.open(filename, "rb") )
  self:reset()
  self.lines = {}
  local i = 1
  if self.convert then
    local content = fp:read("*a");
    content = assert(encoding.convert("UTF-8", self.encoding, content, {
      strict = false,
      handle_from_bom = true
    }))
    for line in content:gmatch("([^\n]*)\n?") do
      if line:byte(-1) == 13 then
        line = line:sub(1, -2)
        self.crlf = true
      end
      table.insert(self.lines, line .. "\n")
      self.highlighter.lines[i] = false
      i = i + 1
    end
    content = nil
  else
    for line in fp:lines() do
      if (i == 1) then line = encoding.strip_bom(line, "UTF-8") end
      if line:byte(-1) == 13 then
        line = line:sub(1, -2)
        self.crlf = true
      end
      table.insert(self.lines, line .. "\n")
      self.highlighter.lines[i] = false
      i = i + 1
    end
  end
  if #self.lines == 0 then
    table.insert(self.lines, "\n")
  end
  fp:close()
  self:reset_syntax()
end

function Doc:save(filename, abs_filename)
  if not filename then
    assert(self.filename, "no filename set to default to")
    filename = self.filename
    abs_filename = self.abs_filename
  else
    assert(self.filename or abs_filename, "calling save on unnamed doc without absolute path")
  end
  local fp
  local output = ""
  if not self.convert then
    fp = assert( io.open(filename, "wb") )
    for _, line in ipairs(self.lines) do
      if self.crlf then line = line:gsub("\n", "\r\n") end
      fp:write(line)
    end
  else
      output = table.concat(self.lines);
      if self.crlf then output = output:gsub("\n", "\r\n") end
  end
  local conversion_error = false
  if self.convert then
    local errmsg
    output, errmsg = encoding.convert(self.encoding, "UTF-8", output, {
      strict = true,
      handle_to_bom = true
    })
    if output then
      fp = assert( io.open(filename, "wb") )
      fp:write(encoding.get_charset_bom(self.encoding) .. output)
      fp:close()
    else
      conversion_error = true
      core.error("%s", errmsg)
    end
  else
    fp:close()
  end
  self:set_filename(filename, abs_filename)
  if not conversion_error then
    self.new_file = false
  else
    self.new_file = true
  end
  self:clean()
end

--------------------------------------------------------------------------------
-- Register command to change current document encoding.
--------------------------------------------------------------------------------
command.add("core.docview", {
  ["doc:change-encoding"] = function(dv)
    encodings.select_encoding("Select Output Encoding", function(charset)
      dv.doc.encoding = charset
      if charset ~= "UTF-8" and charset ~= "ASCII"
        and charset ~= "US-ASCII" and charset ~= "ISO-8859-1"
      then
        dv.doc.convert = true
      else
        dv.doc.convert = false
      end
      dv.doc:save()
    end)
  end,

  ["doc:reload-with-encoding"] = function(dv)
    encodings.select_encoding("Reload With Encoding", function(charset)
      dv.doc.encoding = charset
      if charset ~= "UTF-8" and charset ~= "ASCII"
        and charset ~= "US-ASCII" and charset ~= "ISO-8859-1"
      then
        dv.doc.convert = true
      else
        dv.doc.convert = false
      end
      dv.doc:reload()
    end)
  end
})

--------------------------------------------------------------------------------
-- Register a statusbar item to view change current doc encoding.
--------------------------------------------------------------------------------
core.status_view:add_item({
  predicate = function()
    return  core.active_view:is(DocView)
      and not core.active_view:is(CommandView)
  end,
  name = "doc:encoding",
  alignment = StatusView.Item.RIGHT,
  get_item = function()
    local dv = core.active_view
    return {
      style.text, dv.doc.encoding or "none"
    }
  end,
  command = function(button)
    if button == "left" then
      command.perform "doc:change-encoding"
    elseif button == "right" then
      command.perform "doc:reload-with-encoding"
    end
  end,
  tooltip = "encoding"
})


return encodings;