
Add basic unicode character encoding detection

dmiller
2017-07-05 22:43:57 +00:00
parent 2ca43ae9dc
commit baac7e705e


@@ -73,6 +73,86 @@ function transcode(buf, decoder, encoder, bigendian_dec, bigendian_enc)
  return table.concat(out)
end
--- Determine (poorly) the character encoding of a string
--
-- First, the string is checked for a Byte-order Mark (BOM), which identifies
-- UTF-8 or UTF-16 along with its endianness. If no BOM is found, the bytes
-- themselves are examined.
--
-- If null bytes are encountered, UTF-16 is assumed. Endianness is determined
-- by byte position, assuming the null is the high-order byte. Otherwise, if
-- byte values over 127 are found, UTF-8 decoding is attempted. If this fails,
-- the result is 'other', otherwise it is 'utf-8'. If no high bytes are found,
-- the result is 'ascii'.
--
--@param buf The string/buffer to be identified
--@param len The number of bytes to inspect in order to identify the string.
-- Default: 100
--@return A string describing the encoding: 'ascii', 'utf-8', 'utf-16be',
-- 'utf-16le', or 'other' meaning some unidentified 8-bit encoding
function chardet(buf, len)
  local limit = len or 100
  if limit > #buf then
    limit = #buf
  end
  -- Check BOM: FF FE is UTF-16LE, FE FF is UTF-16BE, EF BB BF is UTF-8
  if limit >= 2 then
    local bom1, bom2 = byte(buf, 1, 2)
    if bom1 == 0xff and bom2 == 0xfe then
      return 'utf-16le'
    elseif bom1 == 0xfe and bom2 == 0xff then
      return 'utf-16be'
    elseif limit >= 3 then
      local bom3 = byte(buf, 3)
      if bom1 == 0xef and bom2 == 0xbb and bom3 == 0xbf then
        return 'utf-8'
      end
    end
  end
  -- Try bytes
  local pos = 1
  local high = false
  local utf8 = true
  while pos < limit do
    local c = byte(buf, pos)
    if c == 0 then
      -- A null byte suggests UTF-16 text in the ASCII range; assuming the
      -- null is the high-order byte, its position gives the byte order.
      if pos % 2 == 0 then
        return 'utf-16le'
      else
        return 'utf-16be'
      end
      utf8 = false
      pos = pos + 1
    elseif c > 127 then
      -- High byte: try to consume a UTF-8 sequence starting here.
      if not high then
        high = true
      end
      if utf8 then
        local p, cp = utf8_dec(buf, pos)
        if not p then
          utf8 = false
        else
          pos = p
        end
      end
      if not utf8 then
        pos = pos + 1
      end
    else
      pos = pos + 1
    end
  end
  -- No nulls: high bytes that all decoded as UTF-8 mean 'utf-8', high bytes
  -- that did not mean 'other', and no high bytes at all mean 'ascii'.
  if high then
    if utf8 then
      return 'utf-8'
    else
      return 'other'
    end
  else
    return 'ascii'
  end
end
---Encode a Unicode code point to UTF-16. See RFC 2781.
--
-- Windows OS prior to Windows 2000 only supports UCS-2, so beware using this
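
For illustration, a minimal sketch of how an NSE script might call the new chardet function, assuming the library is loaded with require "unicode" as usual for nselib modules; the sample buffers here are made up for the example, and the expected labels follow the documented return values and the unit tests added below.

local unicode = require "unicode"

-- Sample buffers and the labels chardet is documented to return for them.
local samples = {
  "This sentence is completely normal.", -- no high bytes: 'ascii'
  "\xef\xbb\xbfBOM-prefixed text",       -- UTF-8 BOM: 'utf-8'
  "\xff\xfeh\0i\0",                      -- UTF-16LE BOM: 'utf-16le'
  "Comme ci, comme \xe7a",               -- high byte, not valid UTF-8: 'other'
}

for _, s in ipairs(samples) do
  print(unicode.chardet(s))
end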
@@ -424,5 +504,10 @@ test_suite:add_test(unittest.equal(utf16to8("\x08\xD8\x45\xDF=\0R\0a\0"), "\xF0\
test_suite:add_test(unittest.equal(utf8to16("\xF0\x92\x8D\x85=Ra"), "\x08\xD8\x45\xDF=\0R\0a\0"),"utf8to16")
test_suite:add_test(unittest.equal(encode({0x221e, 0x2248, 0x30}, cp437_enc), "\xec\xf70"), "encode cp437")
test_suite:add_test(unittest.table_equal(decode("\x81ber", cp437_dec), {0xfc, 0x62, 0x65, 0x72}), "decode cp437")
test_suite:add_test(unittest.equal(chardet("\x08\xD8\x45\xDF=\0R\0a\0"), 'utf-16le'), "detect utf-16le")
test_suite:add_test(unittest.equal(chardet("\xD8\x08\xDF\x45\0=\0R\0a"), 'utf-16be'), "detect utf-16be")
test_suite:add_test(unittest.equal(chardet("...\xF0\x92\x8D\x85=Ra"), 'utf-8'), "detect utf-8")
test_suite:add_test(unittest.equal(chardet("This sentence is completely normal."), 'ascii'), "detect ascii")
test_suite:add_test(unittest.equal(chardet('Comme ci, comme \xe7a'), 'other'), "detect other")
return _ENV
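
For reference, the surrogate-pair arithmetic behind the "\x08\xD8\x45\xDF" test vector can be checked with a few lines of Lua (a quick sketch assuming Lua 5.3+ integer bitwise operators, following the RFC 2781 scheme referenced above):

-- RFC 2781: for a code point above U+FFFF, subtract 0x10000, split the
-- remaining 20 bits into two 10-bit halves, and add them to 0xD800 (high
-- surrogate) and 0xDC00 (low surrogate).
local cp = 0x12345 - 0x10000               -- 0x02345
local hi = 0xD800 + (cp >> 10)             -- 0xD808
local lo = 0xDC00 + (cp & 0x3FF)           -- 0xDF45
print(string.format("%04X %04X", hi, lo))  --> D808 DF45
-- Serialized little-endian, these 16-bit units become "\x08\xD8\x45\xDF",
-- the prefix of the buffer used in the utf16to8 and chardet tests above.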