Mirror of https://github.com/nmap/nmap.git, synced 2025-12-07 13:11:28 +00:00.
Add basic unicode character encoding detection
This commit is contained in:
@@ -73,6 +73,86 @@ function transcode(buf, decoder, encoder, bigendian_dec, bigendian_enc)
|
||||
return table.concat(out)
|
||||
end
|
||||
|
||||
--- Determine (poorly) the character encoding of a string
--
-- First, the string is checked for a Byte-order Mark (BOM). This can be
-- examined to determine UTF-16 with endianness or UTF-8. If no BOM is found,
-- the string is examined.
--
-- If null bytes are encountered, UTF-16 is assumed. Endianness is determined
-- by byte position, assuming the null is the high-order byte. Otherwise, if
-- byte values over 127 are found, UTF-8 decoding is attempted. If this fails,
-- the result is 'other', otherwise it is 'utf-8'. If no high bytes are found,
-- the result is 'ascii'.
--
--@param buf The string/buffer to be identified
--@param len The number of bytes to inspect in order to identify the string.
--           Default: 100
--@return A string describing the encoding: 'ascii', 'utf-8', 'utf-16be',
--        'utf-16le', or 'other' meaning some unidentified 8-bit encoding
function chardet(buf, len)
  -- Clamp the inspection window to the actual buffer length.
  local limit = len or 100
  if limit > #buf then
    limit = #buf
  end
  -- Check BOM (U+FEFF encoded at the start of the buffer).
  if limit >= 2 then
    local bom1, bom2 = byte(buf, 1, 2)
    if bom1 == 0xff and bom2 == 0xfe then
      return 'utf-16le'
    elseif bom1 == 0xfe and bom2 == 0xff then
      return 'utf-16be'
    elseif limit >= 3 then
      local bom3 = byte(buf, 3)
      if bom1 == 0xef and bom2 == 0xbb and bom3 == 0xbf then
        return 'utf-8'
      end
    end
  end
  -- Try bytes
  local pos = 1
  local high = false -- saw any byte > 127?
  local utf8 = true  -- all high bytes so far decoded as valid UTF-8?
  -- NOTE(review): was `pos < limit`, which skipped the final byte of the
  -- window; e.g. a lone high byte at the end was misreported as 'ascii'.
  while pos <= limit do
    local c = byte(buf, pos)
    if c == 0 then
      -- A null is assumed to be the high-order byte of a 16-bit code unit:
      -- even position implies little-endian, odd implies big-endian.
      -- (Unreachable `utf8 = false; pos = pos + 1` statements that followed
      -- these returns have been removed.)
      if pos % 2 == 0 then
        return 'utf-16le'
      else
        return 'utf-16be'
      end
    elseif c > 127 then
      high = true
      if utf8 then
        -- utf8_dec returns the position after the decoded sequence, or nil
        -- if the bytes at pos are not valid UTF-8.
        local p = utf8_dec(buf, pos)
        if not p then
          utf8 = false
        else
          pos = p
        end
      end
      if not utf8 then
        pos = pos + 1
      end
    else
      pos = pos + 1
    end
  end
  if high then
    if utf8 then
      return 'utf-8'
    else
      return 'other'
    end
  else
    return 'ascii'
  end
end
|
||||
|
||||
---Encode a Unicode code point to UTF-16. See RFC 2781.
|
||||
--
|
||||
-- Windows OS prior to Windows 2000 only supports UCS-2, so beware using this
|
||||
@@ -424,5 +504,10 @@ test_suite:add_test(unittest.equal(utf16to8("\x08\xD8\x45\xDF=\0R\0a\0"), "\xF0\
|
||||
-- Unit tests for the transcoding helpers and the chardet() heuristic.
test_suite:add_test(unittest.equal(utf8to16("\xF0\x92\x8D\x85=Ra"), "\x08\xD8\x45\xDF=\0R\0a\0"),"utf8to16")
test_suite:add_test(unittest.equal(encode({0x221e, 0x2248, 0x30}, cp437_enc), "\xec\xf70"), "encode cp437")
test_suite:add_test(unittest.table_equal(decode("\x81ber", cp437_dec), {0xfc, 0x62, 0x65, 0x72}), "decode cp437")
test_suite:add_test(unittest.equal(chardet("\x08\xD8\x45\xDF=\0R\0a\0"), 'utf-16le'), "detect utf-16le")
test_suite:add_test(unittest.equal(chardet("\xD8\x08\xDF\x45\0=\0R\0a"), 'utf-16be'), "detect utf-16be")
test_suite:add_test(unittest.equal(chardet("...\xF0\x92\x8D\x85=Ra"), 'utf-8'), "detect utf-8")
test_suite:add_test(unittest.equal(chardet("This sentence is completely normal."), 'ascii'), "detect ascii")
test_suite:add_test(unittest.equal(chardet('Comme ci, comme \xe7a'), 'other'), "detect other")

return _ENV
|
||||
|
||||
Reference in New Issue
Block a user