---Transcode a string from one format to another
--
-- Each character is decoded and immediately re-encoded, so the string is
-- processed in a single pass. This saves some overhead vs simply passing the
-- output of unicode.decode to unicode.encode.
--
-- The decoder must hand back a position strictly greater than the one it was
-- given. If it does not (for example because it returned nil instead of a
-- position), an error is raised; without this check the loop would either
-- fail on a nil comparison or never terminate.
--@param buf The string/buffer to be transcoded
--@param decoder A Unicode decoder function (such as utf16_dec). It is called
--               as decoder(buf, pos, bigendian_dec) and must return the
--               position of the next character followed by the code point.
--@param encoder A Unicode encoder function (such as utf8_enc). It is called
--               as encoder(cp, bigendian_enc); its result is appended to the
--               output.
--@param bigendian_dec Set this to true to force big-endian decoding.
--@param bigendian_enc Set this to true to force big-endian encoding.
--@return An encoded string (the empty string when buf is empty)
function transcode(buf, decoder, encoder, bigendian_dec, bigendian_enc)
  local out = {}
  local pos = 1
  while pos <= #buf do
    local nextpos, cp = decoder(buf, pos, bigendian_dec)
    -- Refuse to continue unless the decoder made progress.
    if type(nextpos) ~= "number" or nextpos <= pos then
      error(string.format(
        "transcode: decoder did not advance past position %s (returned %s, %s)",
        tostring(pos), tostring(nextpos), tostring(cp)), 2)
    end
    out[#out+1] = encoder(cp, bigendian_enc)
    pos = nextpos
  end
  return table.concat(out)
end
---Helper function for the common case of UTF-16 to UTF-8 transcoding.
--
-- Delegates to transcode with utf16_dec as the decoder and utf8_enc as the
-- encoder; the decoder's big-endian flag is passed as false.
--@param from A string in UTF-16, little-endian
--@return The string in UTF-8
function utf16to8(from)
  local dec_bigendian, enc_bigendian = false, nil
  return transcode(from, utf16_dec, utf8_enc, dec_bigendian, enc_bigendian)
end

---Helper function for the common case of UTF-8 to UTF-16 transcoding.
--
-- Delegates to transcode with utf8_dec as the decoder and utf16_enc as the
-- encoder; the encoder's big-endian flag is passed as false.
--@param from A string in UTF-8
--@return The string in UTF-16, little-endian
function utf8to16(from)
  local dec_bigendian, enc_bigendian = nil, false
  return transcode(from, utf8_dec, utf16_enc, dec_bigendian, enc_bigendian)
end