diff --git a/nselib/unicode.lua b/nselib/unicode.lua
index a5200b147..f20894fe4 100644
--- a/nselib/unicode.lua
+++ b/nselib/unicode.lua
@@ -73,6 +73,86 @@ function transcode(buf, decoder, encoder, bigendian_dec, bigendian_enc)
   return table.concat(out)
 end
 
+--- Determine (poorly) the character encoding of a string
+--
+-- First, the string is checked for a Byte-order Mark (BOM). This can be
+-- examined to determine UTF-16 with endianness or UTF-8. If no BOM is found,
+-- the string is examined.
+--
+-- If null bytes are encountered, UTF-16 is assumed. Endianness is determined
+-- by byte position, assuming the null is the high-order byte. Otherwise, if
+-- byte values over 127 are found, UTF-8 decoding is attempted. If this fails,
+-- the result is 'other', otherwise it is 'utf-8'. If no high bytes are found,
+-- the result is 'ascii'.
+--
+--@param buf The string/buffer to be identified
+--@param len The number of bytes to inspect in order to identify the string.
+--           Default: 100
+--@return A string describing the encoding: 'ascii', 'utf-8', 'utf-16be',
+--        'utf-16le', or 'other' meaning some unidentified 8-bit encoding
+function chardet(buf, len)
+  local limit = len or 100
+  if limit > #buf then
+    limit = #buf
+  end
+  -- Check BOM
+  if limit >= 2 then
+    local bom1, bom2 = byte(buf, 1, 2)
+    if bom1 == 0xff and bom2 == 0xfe then
+      return 'utf-16le'
+    elseif bom1 == 0xfe and bom2 == 0xff then
+      return 'utf-16be'
+    elseif limit >= 3 then
+      local bom3 = byte(buf, 3)
+      if bom1 == 0xef and bom2 == 0xbb and bom3 == 0xbf then
+        return 'utf-8'
+      end
+    end
+  end
+  -- Try bytes
+  local pos = 1
+  local high = false
+  local utf8 = true
+  -- Inspect every byte up to and including the limit (<=, not <, so the
+  -- final byte in range is not silently skipped).
+  while pos <= limit do
+    local c = byte(buf, pos)
+    if c == 0 then
+      -- A null byte strongly suggests UTF-16; the null is assumed to be
+      -- the high-order byte, so its position reveals the endianness.
+      if pos % 2 == 0 then
+        return 'utf-16le'
+      else
+        return 'utf-16be'
+      end
+    elseif c > 127 then
+      high = true
+      if utf8 then
+        local p = utf8_dec(buf, pos)
+        if not p then
+          utf8 = false
+        else
+          pos = p
+        end
+      end
+      if not utf8 then
+        pos = pos + 1
+      end
+    else
+      pos = pos + 1
+    end
+  end
+  if high then
+    if utf8 then
+      return 'utf-8'
+    else
+      return 'other'
+    end
+  else
+    return 'ascii'
+  end
+end
+
 ---Encode a Unicode code point to UTF-16. See RFC 2781.
 --
 -- Windows OS prior to Windows 2000 only supports UCS-2, so beware using this
@@ -424,5 +504,11 @@
 test_suite:add_test(unittest.equal(utf16to8("\x08\xD8\x45\xDF=\0R\0a\0"), "\xF0\x92\x8D\x85=Ra"), "utf16to8")
 test_suite:add_test(unittest.equal(utf8to16("\xF0\x92\x8D\x85=Ra"), "\x08\xD8\x45\xDF=\0R\0a\0"),"utf8to16")
 test_suite:add_test(unittest.equal(encode({0x221e, 0x2248, 0x30}, cp437_enc), "\xec\xf70"), "encode cp437")
 test_suite:add_test(unittest.table_equal(decode("\x81ber", cp437_dec), {0xfc, 0x62, 0x65, 0x72}), "decode cp437")
+test_suite:add_test(unittest.equal(chardet("\x08\xD8\x45\xDF=\0R\0a\0"), 'utf-16le'), "detect utf-16le")
+test_suite:add_test(unittest.equal(chardet("\xD8\x08\xDF\x45\0=\0R\0a"), 'utf-16be'), "detect utf-16be")
+test_suite:add_test(unittest.equal(chardet("...\xF0\x92\x8D\x85=Ra"), 'utf-8'), "detect utf-8")
+test_suite:add_test(unittest.equal(chardet("This sentence is completely normal."), 'ascii'), "detect ascii")
+test_suite:add_test(unittest.equal(chardet('Comme ci, comme \xe7a'), 'other'), "detect other")
+test_suite:add_test(unittest.equal(chardet("high byte at end: \xe9"), 'other'), "detect trailing high byte")
 return _ENV