mirror of
https://github.com/nmap/nmap.git
synced 2025-12-06 20:51:30 +00:00
New utility functions for transcoding to and from Windows Unicode
Windows uses UTF-16 little-endian. Since this is a common use case,
utility functions are provided such that this:
x = unicode.utf16to8(v)
is equivalent to this:
x = unicode.encode(unicode.decode(v, unicode.utf16_dec),
unicode.utf8_enc)
but is faster, because it builds fewer intermediate tables.
This commit is contained in:
@@ -183,6 +183,38 @@ function utf8_dec(buf, pos)
|
||||
return pos + 1 + n, cp
|
||||
end
|
||||
|
||||
---Transcode a UTF-16 little-endian string directly to UTF-8.
--
-- Convenience wrapper for the common case of converting a Windows/SMB unicode
-- string into UTF-8 (printable ASCII being a subset of UTF-8). Each code point
-- is read with <code>utf16_dec</code> and re-encoded with
-- <code>utf8_enc</code> right away, so no intermediate table of code points is
-- built.
--@param from A string in UTF-16, little-endian
--@return The string in UTF-8
function utf16to8(from)
  local chunks = {}
  local offset = 1
  -- Strings are immutable, so the byte length only needs to be read once.
  local total = #from
  while offset <= total do
    local codepoint
    -- The decoder returns the position of the next unit and the code point.
    offset, codepoint = utf16_dec(from, offset)
    chunks[#chunks + 1] = utf8_enc(codepoint)
  end
  return table.concat(chunks)
end
|
||||
|
||||
---Transcode a UTF-8 string directly to UTF-16 little-endian.
--
-- Convenience wrapper for the common case of converting a printable ASCII
-- (subset of UTF-8) string into a Windows/SMB unicode string. Each code point
-- is read with <code>utf8_dec</code> and re-encoded with
-- <code>utf16_enc</code> right away, so no intermediate table of code points
-- is built.
--@param from A string in UTF-8
--@return The string in UTF-16, little-endian
function utf8to16(from)
  local chunks = {}
  local offset = 1
  -- Strings are immutable, so the byte length only needs to be read once.
  local total = #from
  while offset <= total do
    local codepoint
    -- The decoder returns the position of the next sequence and the code point.
    offset, codepoint = utf8_dec(from, offset)
    chunks[#chunks + 1] = utf16_enc(codepoint)
  end
  return table.concat(chunks)
end
|
||||
|
||||
test_suite = unittest.TestSuite:new()
|
||||
test_suite:add_test(function()
|
||||
local pos, cp = utf8_dec("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E")
|
||||
@@ -195,5 +227,7 @@ test_suite:add_test(unittest.equal(encode({0x12345,61,82,97}, utf16_enc, true),
|
||||
-- Decoder tests: each encoded input must decode to the listed code points.
test_suite:add_test(unittest.table_equal(decode("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E", utf8_dec), {0x65E5,0x672C,0x8A9E}),"decode utf-8")
|
||||
test_suite:add_test(unittest.table_equal(decode("\x08\xD8\x45\xDF=\0R\0a\0", utf16_dec), {0x12345,61,82,97}),"decode utf-16")
|
||||
test_suite:add_test(unittest.table_equal(decode("\xD8\x08\xDF\x45\0=\0R\0a", utf16_dec, true), {0x12345,61,82,97}),"decode utf-16, big-endian")
|
||||
-- Transcoding shortcuts: the same sample (U+12345 followed by "=Ra") is
-- converted in both directions between its UTF-16 little-endian and UTF-8 forms.
test_suite:add_test(unittest.equal(utf16to8("\x08\xD8\x45\xDF=\0R\0a\0"), "\xF0\x92\x8D\x85=Ra"),"utf16to8")
|
||||
test_suite:add_test(unittest.equal(utf8to16("\xF0\x92\x8D\x85=Ra"), "\x08\xD8\x45\xDF=\0R\0a\0"),"utf8to16")
|
||||
|
||||
return _ENV
|
||||
|
||||
Reference in New Issue
Block a user