From f17cc56bf9c2818a9218a81aaf74c1259e52911a Mon Sep 17 00:00:00 2001 From: dmiller Date: Wed, 19 Feb 2014 18:32:23 +0000 Subject: [PATCH] New utility functions for transcoding to and from Windows Unicode Windows uses UTF-16 little-endian. Since this is a common use case, utility functions are provided such that this: x = unicode.utf16to8(v) is equivalent to this: x = unicode.encode(unicode.decode(v, unicode.utf16_dec), unicode.utf8_enc) but faster (fewer intermediate tables) --- nselib/unicode.lua | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/nselib/unicode.lua b/nselib/unicode.lua index 241033be0..27a4de2a8 100644 --- a/nselib/unicode.lua +++ b/nselib/unicode.lua @@ -183,6 +183,38 @@ function utf8_dec(buf, pos) return pos + 1 + n, cp end +---Transcode a UTF-16 little-endian string (such as a Windows/SMB unicode string) +--to UTF-8, of which printable ASCII is a subset. Each code point is read with +--utf16_dec and re-encoded with utf8_enc; the encoded pieces are then concatenated. +--@param from A string in UTF-16, little-endian +--@return The string in UTF-8 (the empty string when from is empty) +function utf16to8(from) + local buf = {} + local cp + local pos = 1 + while pos <= #from do + pos, cp = utf16_dec(from, pos) + buf[#buf+1] = utf8_enc(cp) + end + return table.concat(buf) +end + +---Transcode a UTF-8 string (such as printable ASCII, a subset of UTF-8) to a +--UTF-16 little-endian string, as used for Windows/SMB unicode strings. Each code point +--is read with utf8_dec and re-encoded with utf16_enc; the pieces are then concatenated. 
+--@param from A string in UTF-8, read one code point at a time starting at byte 1 +--@return The string in UTF-16, little-endian (the empty string when from is empty) +function utf8to16(from) + local buf = {} + local cp + local pos = 1 + while pos <= #from do + pos, cp = utf8_dec(from, pos) + buf[#buf+1] = utf16_enc(cp) + end + return table.concat(buf) +end + test_suite = unittest.TestSuite:new() test_suite:add_test(function() local pos, cp = utf8_dec("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E") @@ -195,5 +227,7 @@ test_suite:add_test(unittest.equal(encode({0x12345,61,82,97}, utf16_enc, true), test_suite:add_test(unittest.table_equal(decode("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E", utf8_dec), {0x65E5,0x672C,0x8A9E}),"decode utf-8") test_suite:add_test(unittest.table_equal(decode("\x08\xD8\x45\xDF=\0R\0a\0", utf16_dec), {0x12345,61,82,97}),"decode utf-16") test_suite:add_test(unittest.table_equal(decode("\xD8\x08\xDF\x45\0=\0R\0a", utf16_dec, true), {0x12345,61,82,97}),"decode utf-16, big-endian") +test_suite:add_test(unittest.equal(utf16to8("\x08\xD8\x45\xDF=\0R\0a\0"), "\xF0\x92\x8D\x85=Ra"),"utf16to8") +test_suite:add_test(unittest.equal(utf8to16("\xF0\x92\x8D\x85=Ra"), "\x08\xD8\x45\xDF=\0R\0a\0"),"utf8to16") return _ENV