1
0
mirror of https://github.com/nmap/nmap.git synced 2025-12-06 04:31:29 +00:00

Use Lua's built-in utf8 lib for unicode.lua ops

This commit is contained in:
dmiller
2022-09-23 01:35:09 +00:00
parent a3c725acd4
commit 5e39a04749

View File

@@ -10,6 +10,7 @@ local table = require "table"
local stdnse = require "stdnse" local stdnse = require "stdnse"
local unittest = require "unittest" local unittest = require "unittest"
local tableaux = require "tableaux" local tableaux = require "tableaux"
local utf8 = require "utf8"
_ENV = stdnse.module("unicode", stdnse.seeall) _ENV = stdnse.module("unicode", stdnse.seeall)
-- Localize a few functions for a tiny speed boost, since these will be looped -- Localize a few functions for a tiny speed boost, since these will be looped
@@ -19,6 +20,7 @@ local char = string.char
local pack = string.pack local pack = string.pack
local unpack = string.unpack local unpack = string.unpack
local concat = table.concat local concat = table.concat
local pcall = pcall
---Decode a buffer containing Unicode data. ---Decode a buffer containing Unicode data.
@@ -29,6 +31,9 @@ local concat = table.concat
-- false (little-endian) -- false (little-endian)
--@return A list-table containing the code points as numbers --@return A list-table containing the code points as numbers
function decode(buf, decoder, bigendian) function decode(buf, decoder, bigendian)
if decoder == utf8_dec then
return {utf8.codepoint(buf, 1, -1)}
end
local cp = {} local cp = {}
local pos = 1 local pos = 1
while pos <= #buf do while pos <= #buf do
@@ -45,6 +50,9 @@ end
-- false (little-endian) -- false (little-endian)
--@return An encoded string --@return An encoded string
function encode(list, encoder, bigendian) function encode(list, encoder, bigendian)
if encoder == utf8_enc then
return utf8.char(table.unpack(list))
end
local buf = {} local buf = {}
for i, cp in ipairs(list) do for i, cp in ipairs(list) do
buf[i] = encoder(cp, bigendian) buf[i] = encoder(cp, bigendian)
@@ -67,9 +75,21 @@ function transcode(buf, decoder, encoder, bigendian_dec, bigendian_enc)
local out = {} local out = {}
local cp local cp
local pos = 1 local pos = 1
while pos <= #buf do -- Take advantage of Lua's built-in utf8 functions
pos, cp = decoder(buf, pos, bigendian_dec) if decoder == utf8_dec then
out[#out+1] = encoder(cp, bigendian_enc) for _, cp in utf8.codes(buf) do
out[#out+1] = encoder(cp, bigendian_enc)
end
elseif encoder == utf8_enc then
while pos <= #buf do
pos, cp = decoder(buf, pos, bigendian_dec)
out[#out+1] = utf8.char(cp)
end
else
while pos <= #buf do
pos, cp = decoder(buf, pos, bigendian_dec)
out[#out+1] = encoder(cp, bigendian_enc)
end
end end
return table.concat(out) return table.concat(out)
end end
@@ -113,7 +133,7 @@ function chardet(buf, len)
-- Try bytes -- Try bytes
local pos = 1 local pos = 1
local high = false local high = false
local utf8 = true local is_utf8 = true
while pos < limit do while pos < limit do
local c = byte(buf, pos) local c = byte(buf, pos)
if c == 0 then if c == 0 then
@@ -122,21 +142,21 @@ function chardet(buf, len)
else else
return 'utf-16be' return 'utf-16be'
end end
utf8 = false is_utf8 = false
pos = pos + 1 pos = pos + 1
elseif c > 127 then elseif c > 127 then
if not high then if not high then
high = true high = true
end end
if utf8 then if is_utf8 then
local p, cp = utf8_dec(buf, pos) local p, cp = utf8_dec(buf, pos)
if not p then if not p then
utf8 = false is_utf8 = false
else else
pos = p pos = p
end end
end end
if not utf8 then if not is_utf8 then
pos = pos + 1 pos = pos + 1
end end
else else
@@ -144,7 +164,7 @@ function chardet(buf, len)
end end
end end
if high then if high then
if utf8 then if is_utf8 then
return 'utf-8' return 'utf-8'
else else
return 'other' return 'other'
@@ -212,40 +232,10 @@ end
-- --
-- Does not check that cp is a real character; that is, doesn't exclude the -- Does not check that cp is a real character; that is, doesn't exclude the
-- surrogate range U+D800 - U+DFFF and a handful of others. -- surrogate range U+D800 - U+DFFF and a handful of others.
-- @class function
--@param cp The Unicode code point as a number --@param cp The Unicode code point as a number
--@return A string containing the code point in UTF-8 encoding. --@return A string containing the code point in UTF-8 encoding.
function utf8_enc(cp) utf8_enc = utf8.char
local bytes = {}
local n, mask
if cp % 1.0 ~= 0.0 or cp < 0 then
-- Only defined for nonnegative integers.
return nil
elseif cp <= 0x7F then
-- Special case of one-byte encoding.
return char(cp)
elseif cp <= 0x7FF then
n = 2
mask = 0xC0
elseif cp <= 0xFFFF then
n = 3
mask = 0xE0
elseif cp <= 0x10FFFF then
n = 4
mask = 0xF0
else
return nil
end
while n > 1 do
bytes[n] = char(0x80 + (cp & 0x3F))
cp = cp >> 6
n = n - 1
end
bytes[1] = char(mask + cp)
return table.concat(bytes)
end
---Decodes a UTF-8 character. ---Decodes a UTF-8 character.
-- --
@@ -256,40 +246,12 @@ end
--@return cp The code point of the character as a number, or an error string --@return cp The code point of the character as a number, or an error string
function utf8_dec(buf, pos) function utf8_dec(buf, pos)
pos = pos or 1 pos = pos or 1
local n, mask local status, cp = pcall(utf8.codepoint, buf, pos)
local bv = byte(buf, pos) if status then
if bv <= 0x7F then return utf8.offset(buf, 2, pos), cp
return pos+1, bv
elseif bv <= 0xDF then
--110xxxxx 10xxxxxx
n = 1
mask = 0xC0
elseif bv <= 0xEF then
--1110xxxx 10xxxxxx 10xxxxxx
n = 2
mask = 0xE0
elseif bv <= 0xF7 then
--11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
n = 3
mask = 0xF0
else else
return nil, string.format("Invalid UTF-8 byte at %d", pos) return nil, cp
end end
local cp = bv - mask
if pos + n > #buf then
return nil, string.format("Incomplete UTF-8 sequence at %d", pos)
end
for i = 1, n do
bv = byte(buf, pos + i)
if bv < 0x80 or bv > 0xBF then
return nil, string.format("Invalid UTF-8 sequence at %d", pos + i)
end
cp = (cp << 6) + (bv & 0x3F)
end
return pos + 1 + n, cp
end end
-- Code Page 437, native US-English Windows OEM code page -- Code Page 437, native US-English Windows OEM code page