1
0
mirror of https://github.com/nmap/nmap.git synced 2025-12-06 04:31:29 +00:00

Move domain splitting and unicode decoding from punycode to idna.

This commit is contained in:
dmiller
2017-09-28 00:12:54 +00:00
parent 2c03a651f3
commit 9f77cef4b1
2 changed files with 73 additions and 129 deletions

View File

@@ -236,6 +236,33 @@ function validate(tableOfTables, checkHyphens)
end
--- Splits a list of code points into labels at every delimiter.
--
-- The delimiter itself is dropped. Leading, trailing or consecutive
-- delimiters yield empty label tables, so the result always holds exactly
-- one more label than the number of delimiters found.
--
-- @param codepoints List-table of code points to split.
-- @param delimiter Code point to split on. Default: 0x002E.
-- @return List-table of labels, each one a list-table of code points.
local function breakInput(codepoints, delimiter)
  local separator = delimiter or 0x002E
  local current = {}
  -- The first label exists even when the input is empty.
  local labels = { current }
  for _, cp in ipairs(codepoints) do
    if cp ~= separator then
      current[#current + 1] = cp
    else
      -- Delimiter reached: start collecting the next label.
      current = {}
      labels[#labels + 1] = current
    end
  end
  return labels
end
--- Converts the input codepoints into ASCII text based on IDNA rules.
--
-- @param codepoints Table of codepoints of decoded input.
@@ -249,13 +276,11 @@ end
-- Default: false.
-- @param useSTD3ASCIIRules Boolean value to represent ASCII rules. Default: true.
-- @param delimiter codepoint of the character to be used as delimiter.
-- @param encoder Encoder function to convert a Unicode codepoint into a
-- string of bytes.
-- @param decoder A decoder function to decode the input string
-- into an array of code points.
-- @return Returns the IDNA ASCII format of the input.
-- @return Returns nil if there is any error in conversion.
function toASCII(codepoints, transitionalProcessing, checkHyphens, checkBidi, checkJoiners, useSTD3ASCIIRules, delimiter, encoder, decoder)
function toASCII(codepoints, transitionalProcessing, checkHyphens, checkBidi, checkJoiners, useSTD3ASCIIRules, delimiter, decoder)
-- Assigns default values if not specified.
if transitionalProcessing == nil then
@@ -282,7 +307,6 @@ function toASCII(codepoints, transitionalProcessing, checkHyphens, checkBidi, ch
end
delimiter = delimiter or 0x002E
encoder = encoder or unicode.utf8_enc
decoder = decoder or unicode.utf8_dec
local decoded_tbl, disallowedCodePoints = map(codepoints, useSTD3ASCIIRules, transitionalProcessing)
@@ -297,7 +321,7 @@ function toASCII(codepoints, transitionalProcessing, checkHyphens, checkBidi, ch
end
-- Breaks the codepoints into multiple tables using delimiter.
decoded_tbl = punycode.breakInput(decoded_tbl, delimiter)
decoded_tbl = breakInput(decoded_tbl, delimiter)
if decoded_tbl == nil then
return nil
@@ -308,20 +332,16 @@ function toASCII(codepoints, transitionalProcessing, checkHyphens, checkBidi, ch
return nil
end
local stringLabels = {}
-- Convert the codepoints into Unicode strings before passing them to mapLabels function.
for _, label in ipairs(decoded_tbl) do
table.insert(stringLabels, unicode.encode(label, encoder))
for i, label in ipairs(decoded_tbl) do
decoded_tbl[i] = punycode.encode_label(label)
end
return punycode.mapLabels(stringLabels, punycode.encode_label, decoder, unicode.encode({0x002E}, encoder))
return table.concat(decoded_tbl, ".")
end
--- Converts the input into Unicode codepoitns based on IDNA rules.
--- Converts the input into Unicode codepoints based on IDNA rules.
--
-- @param codepoints Table of codepoints of decoded input.
-- @param name A domain name in string format
-- @param transitionalProcessing Boolean value. Default: true.
-- @param checkHyphens Boolean flag for checking hyphens presence in input.
-- Default: true.
@@ -333,11 +353,9 @@ end
-- @param delimiter codepoint of the character to be used as delimiter.
-- @param encoder Encoder function to convert a Unicode codepoint into a
-- string of bytes.
-- @param An decoder function to decode the input string
-- into an array of code points.
-- @return Returns the Unicode format of the input based on IDNA rules.
-- @return Returns nil if there is any error in conversion.
function toUnicode(decoded_tbl, transitionalProcessing, checkHyphens, checkBidi, checkJoiners, useSTD3ASCIIRules, delimiter, encoder, decoder)
function toUnicode(decoded_tbl, transitionalProcessing, checkHyphens, checkBidi, checkJoiners, useSTD3ASCIIRules, delimiter, encoder)
-- Assigns default values if not specified.
if transitionalProcessing == nil then
@@ -358,22 +376,25 @@ function toUnicode(decoded_tbl, transitionalProcessing, checkHyphens, checkBidi,
delimiter = delimiter or 0x002E
encoder = encoder or unicode.utf8_enc
decoder = decoder or unicode.utf8_dec
-- Breaks the codepoints into multiple tables using delimiter.
decoded_tbl = punycode.breakInput(decoded_tbl, delimiter)
decoded_tbl = stdnse.strsplit('%'.. string.char(delimiter), decoded_tbl)
if decoded_tbl == nil then
return nil
end
local stringLabels = {}
-- Format the codepoints into strings before passing to punycode.mapLabels
for _, label in ipairs(decoded_tbl) do
table.insert(stringLabels, unicode.encode(label, encoder))
local output = {}
for i, label in ipairs(decoded_tbl) do
local decoded = punycode.decode_label(label)
for j = 1, #decoded do
output[#output+1] = decoded[j]
end
if i < #decoded_tbl then
output[#output+1] = delimiter
end
end
return punycode.mapLabels(stringLabels, punycode.decode_label, encoder, unicode.encode({0x002E}, encoder))
return unicode.encode(output, encoder)
end
@@ -528,7 +549,7 @@ end
for _, v in ipairs(encodingAndDecodingTestCases) do
test_suite:add_test(unittest.equal(toASCII(unicode.decode(v[1], unicode.utf8_dec)), v[2]))
test_suite:add_test(unittest.equal(toUnicode(unicode.decode(v[2], unicode.utf8_dec)), v[1]))
test_suite:add_test(unittest.equal(toUnicode(v[2],nil,nil,nil,nil,nil,nil,unicode.utf8_enc), v[1]))
end
for _, v in ipairs(multipleProcessingTestCases) do

View File

@@ -61,25 +61,6 @@ local delimiter = char("0x2D")
-- Convenience shortcuts
local baseMinusTMin = base - tMin
-- Replaces every occurrence of a value in a table, in place.
--
-- All keys are visited, not only the array part. The table passed in is
-- modified directly and handed back; no copy is made.
--
-- @param tbl Table to modify.
-- @param val Value to look for.
-- @param new_val Value stored in place of each match.
-- @return The same table that was passed in, after replacement.
local function find_and_replace(tbl, val, new_val)
  for key, item in pairs(tbl) do
    local matches = (item == val)
    if matches then
      -- Assigning to an existing key is safe while traversing with pairs.
      tbl[key] = new_val
    end
  end
  return tbl
end
-- Bias adaptation function as per section 3.4 of RFC 3492.
-- https://tools.ietf.org/html/rfc3492#section-3.4
-- The following function is adapted from punycode.js by Mathias Bynens
@@ -162,18 +143,15 @@ end
-- Creates a string based on an array of numeric code points.
--
-- @param input String of input to be encoded.
-- @param input list-table of Unicode code points
-- @param decoder Sets the decoding format to be used.
-- @return The new encoded string
-- The following function is adapted from punycode.js by Mathias Bynens
-- under the MIT License.
function encode_input(input, decoder)
function encode_input(input)
local output = {}
-- Convert the input into an array of Unicode code points.
input = unicode.decode(input, decoder)
-- Cache the length.
local inputLength = #input
@@ -283,14 +261,13 @@ function encode_input(input, decoder)
end
-- Converts a Punycode string of ASCII-only symbols to a
-- string of Unicode symbols.
-- list-table of Unicode code points.
--
-- @param input The Punycode string of ASCII-only symbols.
-- @param encoder Defines the type of encoding format to be used.
-- @return The resulting string of Unicode symbols.
-- @return The resulting list-table of Unicode code points.
-- The following function is adapted from punycode.js by Mathias Bynens
-- under the MIT License.
function decode_input(input, encoder)
function decode_input(input)
local output = {}
local inputLength = #input
@@ -397,23 +374,23 @@ function decode_input(input, encoder)
i = i + 1
end
return unicode.encode(output, encoder)
return output
end
-- Performs punycode encoding on a label
--
-- @param s String of input to be encoded.
-- @param decoder A decoder function to convert the domain into a
-- table of Unicode code points.
-- @return Returns encoded string.
function encode_label(s, decoder)
-- If the label is already ASCII, it is returned as a string. If any encoding
-- was required, the "xn--" prefix is added.
--
-- @param u A list-table of Unicode code points representing a domain label
-- @return A punycode-encoded ASCII string
function encode_label(u)
local flag = false
local decoded_tbl = unicode.decode(s, decoder)
-- Looks for non-ASCII character
for _, val in pairs(decoded_tbl) do
for _, val in pairs(u) do
if not (val >=0 and val <= 127) then
flag = true
@@ -424,7 +401,7 @@ function encode_label(s, decoder)
if flag then
local res, err = encode_input(s, decoder)
local res, err = encode_input(u)
if err then
return nil, err
end
@@ -432,22 +409,24 @@ function encode_label(s, decoder)
return 'xn--' .. res
else
return s
return unicode.encode(u, unicode.utf8_enc)
end
end
--- Decodes a punycode-encoded label to Unicode.
--
-- @param s String of input
-- @param encoder An encoder function to convert a Unicode code point
-- into a string of bytes. Default: unicode.utf8_enc
-- @return Returns decoded string.
function decode_label(s, encoder)
-- If the label starts with "xn--", it will be punycode-decoded. Otherwise, it
-- will be decoded as UTF-8 (ASCII). The return value is always a table of
-- Unicode code points.
--
-- @param s String of input.
-- @return A table of Unicode code points.
function decode_label(s)
if match(s, "^xn%-%-") then
local res, err = decode_input(sub(s, 5):lower(), encoder)
local res, err = decode_input(sub(s, 5))
if err then
return nil, err
end
@@ -455,67 +434,11 @@ function decode_label(s, encoder)
return res
else
return s
return unicode.decode(s, unicode.utf8_dec)
end
end
--- Applies a function to every label and joins the results.
--
-- Each label is handed to <code>fn</code> together with
-- <code>formatter</code>. If a call returns a truthy second value, that
-- value is passed to <code>stdnse.debug2</code> and processing stops.
--
-- @param labels List-table of labels to process.
-- @param fn Function called as <code>fn(label, formatter)</code>; its first
--           return value is the result, its second an optional error.
-- @param formatter Value forwarded to <code>fn</code> as its second argument.
-- @param delimiter String placed between the results in the output.
-- @return The results joined by the delimiter, or nil if fn reported an error.
-- The following function is adapted from punycode.js by Mathias Bynens
-- under the MIT License.
function mapLabels(labels, fn, formatter, delimiter)
  local results = {}
  for position, label in ipairs(labels) do
    local mapped, failure = fn(label, formatter)
    if failure then
      stdnse.debug2(failure)
      return nil
    end
    results[position] = mapped
  end
  return table.concat(results, delimiter)
end
--- Splits a list of code points into labels at every delimiter.
--
-- The delimiter itself is not kept. An empty label table is produced for
-- leading, trailing or back-to-back delimiters, so the output always has
-- one more label than the number of delimiters seen.
--
-- @param codepoints List-table of code points to split.
-- @param delimiter Code point to split on. Default: 0x002E.
-- @return List-table of labels, each one a list-table of code points.
function breakInput(codepoints, delimiter)
  local separator = delimiter or 0x002E
  local labels = {}
  local current = {}
  for _, cp in ipairs(codepoints) do
    if cp ~= separator then
      current[#current + 1] = cp
    else
      -- Close the label collected so far and open a new one.
      labels[#labels + 1] = current
      current = {}
    end
  end
  -- The label after the last delimiter (possibly empty) is always emitted.
  labels[#labels + 1] = current
  return labels
end
--Ignore the rest if we are not testing.
if not unittest.testing() then
return _ENV
@@ -549,8 +472,8 @@ test_suite = unittest.TestSuite:new()
-- Running test cases against Encoding function.
for i, v in ipairs(testCases) do
test_suite:add_test(unittest.equal(decode_label(v[1], unicode.utf8_enc), v[2]))
test_suite:add_test(unittest.equal(encode_label(v[2], unicode.utf8_dec), v[1]))
test_suite:add_test(unittest.equal(unicode.encode(decode_label(v[1]), unicode.utf8_enc), v[2]))
test_suite:add_test(unittest.equal(encode_label(unicode.decode(v[2], unicode.utf8_dec)), v[1]))
end
return _ENV