From 23a8bfc0bdcc908b4fed57f4a2aebfdbfc8f7577 Mon Sep 17 00:00:00 2001 From: dmiller Date: Wed, 27 Sep 2017 21:29:32 +0000 Subject: [PATCH] Fix line endings and add NSEdoc markers --- nselib/idna.lua | 1087 +++++++++++++++++++------------------ nselib/punycode.lua | 1261 +++++++++++++++++++++---------------------- 2 files changed, 1173 insertions(+), 1175 deletions(-) mode change 100755 => 100644 nselib/idna.lua mode change 100755 => 100644 nselib/punycode.lua diff --git a/nselib/idna.lua b/nselib/idna.lua old mode 100755 new mode 100644 index ad360bf42..a9ae69e8b --- a/nselib/idna.lua +++ b/nselib/idna.lua @@ -1,544 +1,543 @@ ---- --- Library methods for handling IDNA domains. --- --- Internationalized Domain Names (IDNs) follow a mechanism to process --- Internationalizing Domain Names in Applications (IDNA) for handling --- characters outside the ASCII repertoire in a standard fashion. IDNs use --- characters drawn from a large repertoire (Unicode), but IDNA allows the --- non-ASCII characters to be represented using only the ASCII characters --- already allowed in so-called host names today. This backward-compatible --- representation is required in existing protocols like DNS, so that IDNs can be --- introduced with no changes to the existing infrastructure. IDNA is --- only meant for processing domain names, not free text. --- --- Client software, such as browsers and emailers, faces a difficult transition --- from the version of international domain names approved in 2003 (IDNA2003), --- to the revision approved in 2010 (IDNA2008). The following functions allows --- the developer and end user to access domains that are valid under either --- system but the default conversion is set to IDNA2008. --- --- IDNA specification solves the problem of extending the repertoire --- of characters that can be used in domain names to include the Unicode --- repertoire (with some restrictions). --- --- Applications can use IDNA to support internationalized domain names --- anywhere that ASCII domain names are already supported, including DNS --- master files and resolver interfaces. The IDNA protocol is contained --- completely within applications. It is not a client-server or peer-to-peer --- protocol: everything is done inside the application itself. When used with --- a DNS resolver library, IDNA is inserted as a "shim" between the application --- and the resolver library. When used for writing names into a DNS zone, IDNA --- is used just before the name is committed to the zone. --- --- References: --- * http://ietf.org/rfc/rfc3490.txt --- * http://tools.ietf.org/html/rfc5890 --- * https://tools.ietf.org/html/rfc5891 --- * http://tools.ietf.org/html/rfc5892 --- * http://www.unicode.org/reports/tr46/ --- --- TODO: --- Add support for mapping right to left scripts for IDNA library. --- References: --- * http://tools.ietf.org/html/rfc5893 --- * http://www.unicode.org/reports/tr9/ --- * http://www.unicode.org/reports/tr46/#Right_to_Left_Scripts --- --- @author Rewanth Cool --- @copyright Same as Nmap--See https://nmap.org/book/man-legal.html - -local stdnse = require "stdnse" -local string = require "string" -local math = require "math" -local table = require "table" -local unicode = require "unicode" -local unittest = require "unittest" -local punycode = require "punycode" -local idnaMappings = require "data.idnaMappings".tbl - -_ENV = stdnse.module("idna", stdnse.seeall) - --- Localize few functions for a tiny speed boost, since these will be --- used frequently. -local floor = math.floor -local byte = string.byte -local char = string.char -local find = string.find -local match = string.match -local reverse = string.reverse -local sub = string.sub - --- This function concatenates the strings and tables (depth = 1) in --- a given table. --- --- @param tbl A table is given as an input which contains values as string --- or table (depth = 1). --- @return Returns table after concatinating all the values. -local function concat_table_in_tables(tbl) - - local t = {} - for _, v in ipairs(tbl) do - if type(v) == "table" then - for _, q in ipairs(v) do - table.insert(t, q) - end - else - table.insert(t, v) - end - end - - return t - -end - - --- This function maps the codepoints of the input to their respective --- codepoints based on the latest IDNA version mapping. --- --- @param decoded_tbl Table of Unicode decoded codepoints. --- @param useSTD3ASCIIRules Boolean value to set the mapping according to IDNA2003 rules. --- useSTD3ASCIIRules=true refers to IDNA2008. --- useSTD3ASCIIRules=false refers to IDNA2003. --- @param transitionalProcessing Processing option to handle deviation codepoints. --- transitionalProcessing=true maps deviation codepoints to the input. --- transitionalProcessing=false maintains original input. --- @param viewDisallowedCodePoints Boolean value to see the list of disallowed codepoints. --- @return Returns table with the list of mapped codepoints. -function map(decoded_tbl, useSTD3ASCIIRules, transitionalProcessing, viewDisallowedCodePoints) - - -- Assigns default values if not specified. - - -- According to IDNA2008, transitionalProcessing=true (default). - if transitionalProcessing == nil then - transitionalProcessing = true - end - - if useSTD3ASCIIRules == nil then - useSTD3ASCIIRules = true - end - if viewDisallowedCodePoints == nil then - viewDisallowedCodePoints = false - end - - local disallowedCodePoints = {} - - -- Mapping codepoints based on latest IDNA mapping list. - for index, cp in ipairs(decoded_tbl) do - local lookup = idnaMappings[cp] - if type(lookup) == "number" then - decoded_tbl[index] = lookup - -- Handles the IDNA deviated set of codepoints. - elseif transitionalProcessing and lookup.status == "deviation" then - decoded_tbl[index] = lookup[1] - -- Removes the IDNA ignored set of codepoints. - elseif lookup.status == "ignored" then - decoded_tbl[index] = {} - end - end - - decoded_tbl = concat_table_in_tables(decoded_tbl) - - --TODO: - -- Map bidi characters. - -- Right-to-left domain names. - -- References: - -- http://unicode.org/reports/tr9/ - -- http://www.unicode.org/reports/tr46/#Right_to_Left_Scripts - -- http://tools.ietf.org/html/rfc5893 - - -- Saves the list of disallowed codepoints. - if viewDisallowedCodePoints then - for index, cp in ipairs(decoded_tbl) do - local lookup = idnaMappings[cp] - if type(lookup) == "table" then - if lookup.status == "disallowed" then - table.insert(disallowedCodePoints, cp) - end - end - - -- If useSTD3ASCIIRules=true, both the disallowed_STD3_valid and - -- disallowed_STD3_mapped are considered as disallowed codepoints. - -- To use this part of code, add disallowed_STD3_mapped and disallowed_STD3_valid - -- codepoints to idnaMappings.lua. For now, we ignore these because idnaMappings.lua - -- is set to support only for the latest version of IDNA. - if useSTD3ASCIIRules then - if type(lookup) == "table" then - if lookup.status == "disallowed_STD3_valid" or lookup.status == "disallowed_STD3_mapped" then - table.insert(disallowedCodePoints, cp) - end - end - end - end - end - - decoded_tbl = concat_table_in_tables(decoded_tbl) - - -- If useSTD3ASCIIRules=false, then disallowed_STD3_mapped values are considered - -- as mapped codepoints and are mapped with the input. - -- To use this part of code, add disallowed_STD3_mapped and disallowed_STD3_valid - -- codepoints to idnaMappings.lua. For now, we ignore these because idnaMappings.lua - -- is set to support only for the latest version of IDNA. - if not useSTD3ASCIIRules then - for index, cp in ipairs(decoded_tbl) do - local lookup = idnaMappings[cp] - if type(lookup) == "table" then - if lookup.status == "disallowed_STD3_mapped" then - decoded_tbl[index] = lookup[1] - end - end - end - end - - decoded_tbl = concat_table_in_tables(decoded_tbl) - - return decoded_tbl, disallowedCodePoints -end - - --- Validate the input based on IDNA codepoints validation rules. --- --- @param tableOfTables Table of codepoints of the splitted input. --- @param checkHyphens Boolean flag checks for 0x002D in unusual places. -function validate(tableOfTables, checkHyphens) - - if checkHyphens == nil then - checkHyphens = true - end - - -- Validates the list of input codepoints. - for _, tbl in ipairs(tableOfTables) do - - if checkHyphens then - - -- Checks the 3rd and 4th position of input. - if (tbl[3] and tbl[3] == 0x002D) or (tbl[4] and tbl[4] == 0x002D) then - return false - end - - -- Checks for starting and ending of input. - if tbl[1] == 0x002D or tbl[#tbl] == 0x002D then - return false - end - - end - - for _, v in ipairs(tbl) do - if v == 0x002E then - return false - end - end - - -- TODO: - -- 1. Add validation for checkBidi, checkJoiners (if required). - -- 2. The label must not begin with a combining mark, that is: General_Category=Mark. - end - - return true - -end - --- This function converts the input codepoints into ASCII text based on IDNA rules. --- --- @param codepoints Table of codepoints of decoded input. --- @param tbl Table of optional params. --- @param transitionalProcessing Boolean value. Default: true. --- @param checkHyphens Boolean flag for checking hyphens presence in input. --- Default: true. --- @param checkBidi Boolean flag to represent if the input is of Bidi type. --- Default: false. --- @param checkJoiners Boolean flag to check for ContextJ rules in input. --- Default: false. --- @param useSTD3ASCIIRules Boolean value to represent ASCII rules. Default: true. --- @param delimiter codepoint of the character to be used as delimiter. --- @param encoder Encoder function to convert a Unicode codepoint into a --- string of bytes. --- @param An decoder function to decode the input string --- into an array of code points. --- @return Returns the IDNA ASCII format of the input. --- @return Throws nil, if there is any error in conversion. -function toASCII(codepoints, transitionalProcessing, checkHyphens, checkBidi, checkJoiners, useSTD3ASCIIRules, delimiter, encoder, decoder) - - -- Assigns default values if not specified. - if transitionalProcessing == nil then - transitionalProcessing = true - end - if checkHyphens == nil then - checkHyphens = true - end - - -- Bidi refers to right-to-left scripts. - -- Labels must satisfy all six of the numbered conditions in RFC 5893, Section 2. - -- to use checkBidi functionality. - if checkBidi == nil then - checkBidi = false - end - - -- Labels must satisify the ContextJ rules to use checkJoiners functionality. - if checkJoiners == nil then - checkJoiners = false - end - - if useSTD3ASCIIRules == nil then - useSTD3ASCIIRules = true - end - - delimiter = delimiter or 0x002E - encoder = encoder or unicode.utf8_enc - decoder = decoder or unicode.utf8_dec - - local inputString = unicode.encode(codepoints, encoder) - - local decoded_tbl, disallowedCodePoints = map(codepoints, useSTD3ASCIIRules, transitionalProcessing) - - if decoded_tbl == nil then - return nil - end - - -- Prints the list of disallowed values in the given input. - if #disallowedCodePoints > 0 then - stdnse.debug(table.concat(disallowedCodePoints, ", ")) - end - - -- Breaks the codepoints into multiple tables using delimiter. - decoded_tbl = punycode.breakInput(decoded_tbl, delimiter) - - if decoded_tbl == nil then - return nil - end - - -- Validates the codepoints and if any invalid codepoint found, returns nil. - if not validate(decoded_tbl, checkHyphens) then - return nil - end - - local stringLabels = {} - - -- Convert the codepoints into Unicode strings before passing them to mapLabels function. - for _, label in ipairs(decoded_tbl) do - table.insert(stringLabels, unicode.encode(label, encoder)) - end - - return punycode.mapLabels(stringLabels, punycode.encode_label, decoder, unicode.encode({0x002E}, encoder)) - -end - --- This function converts the input into Unicode codepoitns based on IDNA rules. --- --- @param codepoints Table of codepoints of decoded input. --- @param transitionalProcessing Boolean value. Default: true. --- @param checkHyphens Boolean flag for checking hyphens presence in input. --- Default: true. --- @param checkBidi Boolean flag to represent if the input is of Bidi type. --- Default: false. --- @param checkJoiners Boolean flag to check for ContextJ rules in input. --- Default: false. --- @param useSTD3ASCIIRules Boolean value to represent ASCII rules. Default: true. --- @param delimiter, codepoint of the character to be used as delimiter. --- @param encoder Encoder function to convert a Unicode codepoint into a --- string of bytes. --- @param An decoder function to decode the input string --- into an array of code points. --- @return Returns the Unicode format of the input based on IDNA rules. --- @return Throws nil, if there is any error in conversion. -function toUnicode(decoded_tbl, transitionalProcessing, checkHyphens, checkBidi, checkJoiners, useSTD3ASCIIRules, delimiter, encoder, decoder) - - -- Assigns default values if not specified. - if transitionalProcessing == nil then - transitionalProcessing = true - end - if checkHyphens == nil then - checkHyphens = true - end - if checkBidi == nil then - checkBidi = false - end - if checkJoiners == nil then - checkJoiners = false - end - if useSTD3ASCIIRules == nil then - useSTD3ASCIIRules = true - end - - delimiter = delimiter or 0x002E - encoder = encoder or unicode.utf8_enc - decoder = decoder or unicode.utf8_dec - - -- Breaks the codepoints into multiple tables using delimiter. - decoded_tbl = punycode.breakInput(decoded_tbl, delimiter) - if decoded_tbl == nil then - return nil - end - - local stringLabels = {} - - -- Format the codepoints into strings before passing to punycode.mapLabels - for _, label in ipairs(decoded_tbl) do - table.insert(stringLabels, unicode.encode(label, encoder)) - end - - return punycode.mapLabels(stringLabels, punycode.decode_label, encoder, unicode.encode({0x002E}, encoder)) - -end - -if not unittest.testing() then - return _ENV -end - --- These are the used for two way testing (both encoding and decoding). -local encodingAndDecodingTestCases = { - { - "\xce\xb1\xcf\x80\xcf\x80\xce\xbb\xce\xb5.\xce\xba\xce\xbf\xce\xbc", - "xn--mxairta.xn--vxaei" - }, - { - "a\xe0\xa5\x8db", - "xn--ab-fsf" - }, - { - "\xd9\x86\xd8\xa7\xd9\x85\xd9\x87\xd8\xa7\xdb\x8c.com", - "xn--mgba3gch31f.com" - }, - { - "\xe0\xb7\x81\xe0\xb7\x8a\xe0\xb6\xbb\xe0\xb7\x93.com", - "xn--10cl1a0b.com" - }, - { - "\xd0\xbf\xd1\x80\xd0\xb0\xd0\xb2\xd0\xb8\xd1\x82\xd0\xb5\xd0\xbb\xd1\x8c\xd1\x81\xd1\x82\xd0\xb2\xd0\xbe.\xd1\x80\xd1\x84", - "xn--80aealotwbjpid2k.xn--p1ai" - }, - { - "\xe0\xa4\x95\xe0\xa4\xbe\xe0\xa4\xb6\xe0\xa5\x80\xe0\xa4\xaa\xe0\xa5\x81\xe0\xa4\xb0.\xe0\xa4\xad\xe0\xa4\xbe\xe0\xa4\xb0\xe0\xa4\xa4", - "xn--11b6bsw3bni.xn--h2brj9c" - }, - { - "rewanthcool.com", - "rewanthcool.com" - }, - { - "\xe3\xaf\x99\xe3\xaf\x9c\xe3\xaf\x99\xe3\xaf\x9f.com", - "xn--domain.com" - } -} - --- These test cases are used for only converting them into ASCII text. -local toASCIITestCases = { - { - "ma\xc3\xb1ana.com", - "xn--maana-pta.com" - }, - { - "RewanthCool.com", - "rewanthcool.com" - }, - { - "\xc3\xb6bb.at", - "xn--bb-eka.at" - }, - { - "\xe3\x83\x89\xe3\x83\xa1\xe3\x82\xa4\xe3\x83\xb3.\xe3\x83\x86\xe3\x82\xb9\xe3\x83\x88", - "xn--eckwd4c7c.xn--zckzah" - }, - { - "\xd0\xb4\xd0\xbe\xd0\xbc\xd0\xb5\xd0\xbd\xd0\xb0.\xd0\xb8\xd1\x81\xd0\xbf\xd1\x8b\xd1\x82\xd0\xb0\xd0\xbd\xd0\xb8\xd0\xb5", - "xn--80ahd1agd.xn--80akhbyknj4f" - }, - { - "\xe6\xb5\x8b\xe8\xaf\x95", - "xn--0zwm56d" - }, - { - "k\xc3\xb6nigsg\xc3\xa4\xc3\x9fchen", - "xn--knigsgsschen-lcb0w" - }, - { - "fa\xc3\x9f.de", - "fass.de" - }, - { - "\xce\xb2\xcf\x8c\xce\xbb\xce\xbf\xcf\x82.com", - "xn--nxasmq6b.com" - }, - { - "mycharity\xe3\x80\x82org", - "mycharity.org" - }, - { - "K\xc3\xb6nigsg\xc3\xa4\xc3\x9fchen", - "xn--knigsgsschen-lcb0w" - }, - { - "B\xc3\xbccher.de", - "xn--bcher-kva.de" - }, - { - "xn--ma\xc3\xb1ana.com", - nil - } -} - --- These test cases are used for only converting them into ASCII text. --- The last two values in a table are outputs for different cases. --- --- Format: --- { --- input unicode string, --- transitional processed output, --transitional=true --- non-transitional processed output --transitional=false --- } -local multipleProcessingTestCases = { - { - "a\xe0\xa5\x8d\xe2\x80\x8cb", - "xn--ab-fsf", - "xn--ab-fsf604u" - }, - { - "A\xe0\xa5\x8d\xe2\x80\x8cb", - "xn--ab-fsf", - "xn--ab-fsf604u" - }, - { - "A\xe0\xa5\x8d\xe2\x80\x8Cb", - "xn--ab-fsf", - "xn--ab-fsf604u" - }, - { - "\xd9\x86\xd8\xa7\xd9\x85\xd9\x87\xe2\x80\x8c\xd8\xa7\xdb\x8c", - "xn--mgba3gch31f", - "xn--mgba3gch31f060k" - }, - { - "\xd9\x86\xd8\xa7\xd9\x85\xd9\x87\xe2\x80\x8c\xd8\xa7\xdb\x8c.com", - "xn--mgba3gch31f.com", - "xn--mgba3gch31f060k.com" - }, - { - "\xc3\x9f\xe0\xa7\x81\xe1\xb7\xad\xe3\x80\x82\xd8\xa085", - "xn--ss-e2f077r.xn--85-psd", - "xn--zca266bwrr.xn--85-psd" - }, - { - "\xc3\x9f\xe0\xa7\x81\xe1\xb7\xad\xe3\x80\x82\xd8\xa08\xe2\x82\x85", - "xn--ss-e2f077r.xn--85-psd", - "xn--zca266bwrr.xn--85-psd" - } -} - -test_suite = unittest.TestSuite:new() - -for _, v in ipairs(toASCIITestCases) do - test_suite:add_test(unittest.equal(toASCII(unicode.decode(v[1], unicode.utf8_dec)), v[2])) -end - -for _, v in ipairs(encodingAndDecodingTestCases) do - test_suite:add_test(unittest.equal(toASCII(unicode.decode(v[1], unicode.utf8_dec)), v[2])) - test_suite:add_test(unittest.equal(toUnicode(unicode.decode(v[2], unicode.utf8_dec)), v[1])) -end - -for _, v in ipairs(multipleProcessingTestCases) do - -- Performs transitional conversion. - test_suite:add_test(unittest.equal(toASCII(unicode.decode(v[1], unicode.utf8_dec)), v[2])) - -- Performs non-transitional conversion. - test_suite:add_test(unittest.equal(toASCII(unicode.decode(v[1], unicode.utf8_dec), false), v[3])) -end - -return _ENV +--- +-- Library methods for handling IDNA domains. +-- +-- Internationalized Domain Names (IDNs) follow a mechanism to process +-- Internationalizing Domain Names in Applications (IDNA) for handling +-- characters outside the ASCII repertoire in a standard fashion. IDNs use +-- characters drawn from a large repertoire (Unicode), but IDNA allows the +-- non-ASCII characters to be represented using only the ASCII characters +-- already allowed in so-called host names today. This backward-compatible +-- representation is required in existing protocols like DNS, so that IDNs can be +-- introduced with no changes to the existing infrastructure. IDNA is +-- only meant for processing domain names, not free text. +-- +-- Client software, such as browsers and emailers, faces a difficult transition +-- from the version of international domain names approved in 2003 (IDNA2003), +-- to the revision approved in 2010 (IDNA2008). The following functions allows +-- the developer and end user to access domains that are valid under either +-- system but the default conversion is set to IDNA2008. +-- +-- IDNA specification solves the problem of extending the repertoire +-- of characters that can be used in domain names to include the Unicode +-- repertoire (with some restrictions). +-- +-- Applications can use IDNA to support internationalized domain names +-- anywhere that ASCII domain names are already supported, including DNS +-- master files and resolver interfaces. The IDNA protocol is contained +-- completely within applications. It is not a client-server or peer-to-peer +-- protocol: everything is done inside the application itself. When used with +-- a DNS resolver library, IDNA is inserted as a "shim" between the application +-- and the resolver library. When used for writing names into a DNS zone, IDNA +-- is used just before the name is committed to the zone. +-- +-- References: +-- * http://ietf.org/rfc/rfc3490.txt +-- * http://tools.ietf.org/html/rfc5890 +-- * https://tools.ietf.org/html/rfc5891 +-- * http://tools.ietf.org/html/rfc5892 +-- * http://www.unicode.org/reports/tr46/ +-- +-- TODO: +-- Add support for mapping right to left scripts for IDNA library. +-- References: +-- * http://tools.ietf.org/html/rfc5893 +-- * http://www.unicode.org/reports/tr9/ +-- * http://www.unicode.org/reports/tr46/#Right_to_Left_Scripts +-- +-- @author Rewanth Cool +-- @copyright Same as Nmap--See https://nmap.org/book/man-legal.html + +local stdnse = require "stdnse" +local string = require "string" +local math = require "math" +local table = require "table" +local unicode = require "unicode" +local unittest = require "unittest" +local punycode = require "punycode" +local idnaMappings = require "data.idnaMappings".tbl + +_ENV = stdnse.module("idna", stdnse.seeall) + +-- Localize few functions for a tiny speed boost, since these will be +-- used frequently. +local floor = math.floor +local byte = string.byte +local char = string.char +local find = string.find +local match = string.match +local reverse = string.reverse +local sub = string.sub + +-- Concatenates the strings and tables (depth = 1) in a given table. +-- +-- @param tbl A table is given as an input which contains values as string +-- or table (depth = 1). +-- @return Returns table after concatinating all the values. +local function concat_table_in_tables(tbl) + + local t = {} + for _, v in ipairs(tbl) do + if type(v) == "table" then + for _, q in ipairs(v) do + table.insert(t, q) + end + else + table.insert(t, v) + end + end + + return t + +end + + +--- Maps the codepoints of the input to their respective +-- codepoints based on the latest IDNA version mapping. +-- +-- @param decoded_tbl Table of Unicode decoded codepoints. +-- @param useSTD3ASCIIRules Boolean value to set the mapping according to IDNA2003 rules. +-- useSTD3ASCIIRules=true refers to IDNA2008. +-- useSTD3ASCIIRules=false refers to IDNA2003. +-- @param transitionalProcessing Processing option to handle deviation codepoints. +-- transitionalProcessing=true maps deviation codepoints to the input. +-- transitionalProcessing=false maintains original input. +-- @param viewDisallowedCodePoints Boolean value to see the list of disallowed codepoints. +-- @return Returns table with the list of mapped codepoints. +function map(decoded_tbl, useSTD3ASCIIRules, transitionalProcessing, viewDisallowedCodePoints) + + -- Assigns default values if not specified. + + -- According to IDNA2008, transitionalProcessing=true (default). + if transitionalProcessing == nil then + transitionalProcessing = true + end + + if useSTD3ASCIIRules == nil then + useSTD3ASCIIRules = true + end + if viewDisallowedCodePoints == nil then + viewDisallowedCodePoints = false + end + + local disallowedCodePoints = {} + + -- Mapping codepoints based on latest IDNA mapping list. + for index, cp in ipairs(decoded_tbl) do + local lookup = idnaMappings[cp] + if type(lookup) == "number" then + decoded_tbl[index] = lookup + -- Handles the IDNA deviated set of codepoints. + elseif transitionalProcessing and lookup.status == "deviation" then + decoded_tbl[index] = lookup[1] + -- Removes the IDNA ignored set of codepoints. + elseif lookup.status == "ignored" then + decoded_tbl[index] = {} + end + end + + decoded_tbl = concat_table_in_tables(decoded_tbl) + + --TODO: + -- Map bidi characters. + -- Right-to-left domain names. + -- References: + -- http://unicode.org/reports/tr9/ + -- http://www.unicode.org/reports/tr46/#Right_to_Left_Scripts + -- http://tools.ietf.org/html/rfc5893 + + -- Saves the list of disallowed codepoints. + if viewDisallowedCodePoints then + for index, cp in ipairs(decoded_tbl) do + local lookup = idnaMappings[cp] + if type(lookup) == "table" then + if lookup.status == "disallowed" then + table.insert(disallowedCodePoints, cp) + end + end + + -- If useSTD3ASCIIRules=true, both the disallowed_STD3_valid and + -- disallowed_STD3_mapped are considered as disallowed codepoints. + -- To use this part of code, add disallowed_STD3_mapped and disallowed_STD3_valid + -- codepoints to idnaMappings.lua. For now, we ignore these because idnaMappings.lua + -- is set to support only for the latest version of IDNA. + if useSTD3ASCIIRules then + if type(lookup) == "table" then + if lookup.status == "disallowed_STD3_valid" or lookup.status == "disallowed_STD3_mapped" then + table.insert(disallowedCodePoints, cp) + end + end + end + end + end + + decoded_tbl = concat_table_in_tables(decoded_tbl) + + -- If useSTD3ASCIIRules=false, then disallowed_STD3_mapped values are considered + -- as mapped codepoints and are mapped with the input. + -- To use this part of code, add disallowed_STD3_mapped and disallowed_STD3_valid + -- codepoints to idnaMappings.lua. For now, we ignore these because idnaMappings.lua + -- is set to support only for the latest version of IDNA. + if not useSTD3ASCIIRules then + for index, cp in ipairs(decoded_tbl) do + local lookup = idnaMappings[cp] + if type(lookup) == "table" then + if lookup.status == "disallowed_STD3_mapped" then + decoded_tbl[index] = lookup[1] + end + end + end + end + + decoded_tbl = concat_table_in_tables(decoded_tbl) + + return decoded_tbl, disallowedCodePoints +end + + +--- Validate the input based on IDNA codepoints validation rules. +-- +-- @param tableOfTables Table of codepoints of the splitted input. +-- @param checkHyphens Boolean flag checks for 0x002D in unusual places. +function validate(tableOfTables, checkHyphens) + + if checkHyphens == nil then + checkHyphens = true + end + + -- Validates the list of input codepoints. + for _, tbl in ipairs(tableOfTables) do + + if checkHyphens then + + -- Checks the 3rd and 4th position of input. + if (tbl[3] and tbl[3] == 0x002D) or (tbl[4] and tbl[4] == 0x002D) then + return false + end + + -- Checks for starting and ending of input. + if tbl[1] == 0x002D or tbl[#tbl] == 0x002D then + return false + end + + end + + for _, v in ipairs(tbl) do + if v == 0x002E then + return false + end + end + + -- TODO: + -- 1. Add validation for checkBidi, checkJoiners (if required). + -- 2. The label must not begin with a combining mark, that is: General_Category=Mark. + end + + return true + +end + +--- Converts the input codepoints into ASCII text based on IDNA rules. +-- +-- @param codepoints Table of codepoints of decoded input. +-- @param tbl Table of optional params. +-- @param transitionalProcessing Boolean value. Default: true. +-- @param checkHyphens Boolean flag for checking hyphens presence in input. +-- Default: true. +-- @param checkBidi Boolean flag to represent if the input is of Bidi type. +-- Default: false. +-- @param checkJoiners Boolean flag to check for ContextJ rules in input. +-- Default: false. +-- @param useSTD3ASCIIRules Boolean value to represent ASCII rules. Default: true. +-- @param delimiter codepoint of the character to be used as delimiter. +-- @param encoder Encoder function to convert a Unicode codepoint into a +-- string of bytes. +-- @param An decoder function to decode the input string +-- into an array of code points. +-- @return Returns the IDNA ASCII format of the input. +-- @return Throws nil, if there is any error in conversion. +function toASCII(codepoints, transitionalProcessing, checkHyphens, checkBidi, checkJoiners, useSTD3ASCIIRules, delimiter, encoder, decoder) + + -- Assigns default values if not specified. + if transitionalProcessing == nil then + transitionalProcessing = true + end + if checkHyphens == nil then + checkHyphens = true + end + + -- Bidi refers to right-to-left scripts. + -- Labels must satisfy all six of the numbered conditions in RFC 5893, Section 2. + -- to use checkBidi functionality. + if checkBidi == nil then + checkBidi = false + end + + -- Labels must satisify the ContextJ rules to use checkJoiners functionality. + if checkJoiners == nil then + checkJoiners = false + end + + if useSTD3ASCIIRules == nil then + useSTD3ASCIIRules = true + end + + delimiter = delimiter or 0x002E + encoder = encoder or unicode.utf8_enc + decoder = decoder or unicode.utf8_dec + + local inputString = unicode.encode(codepoints, encoder) + + local decoded_tbl, disallowedCodePoints = map(codepoints, useSTD3ASCIIRules, transitionalProcessing) + + if decoded_tbl == nil then + return nil + end + + -- Prints the list of disallowed values in the given input. + if #disallowedCodePoints > 0 then + stdnse.debug(table.concat(disallowedCodePoints, ", ")) + end + + -- Breaks the codepoints into multiple tables using delimiter. + decoded_tbl = punycode.breakInput(decoded_tbl, delimiter) + + if decoded_tbl == nil then + return nil + end + + -- Validates the codepoints and if any invalid codepoint found, returns nil. + if not validate(decoded_tbl, checkHyphens) then + return nil + end + + local stringLabels = {} + + -- Convert the codepoints into Unicode strings before passing them to mapLabels function. + for _, label in ipairs(decoded_tbl) do + table.insert(stringLabels, unicode.encode(label, encoder)) + end + + return punycode.mapLabels(stringLabels, punycode.encode_label, decoder, unicode.encode({0x002E}, encoder)) + +end + +--- Converts the input into Unicode codepoitns based on IDNA rules. +-- +-- @param codepoints Table of codepoints of decoded input. +-- @param transitionalProcessing Boolean value. Default: true. +-- @param checkHyphens Boolean flag for checking hyphens presence in input. +-- Default: true. +-- @param checkBidi Boolean flag to represent if the input is of Bidi type. +-- Default: false. +-- @param checkJoiners Boolean flag to check for ContextJ rules in input. +-- Default: false. +-- @param useSTD3ASCIIRules Boolean value to represent ASCII rules. Default: true. +-- @param delimiter, codepoint of the character to be used as delimiter. +-- @param encoder Encoder function to convert a Unicode codepoint into a +-- string of bytes. +-- @param An decoder function to decode the input string +-- into an array of code points. +-- @return Returns the Unicode format of the input based on IDNA rules. +-- @return Throws nil, if there is any error in conversion. +function toUnicode(decoded_tbl, transitionalProcessing, checkHyphens, checkBidi, checkJoiners, useSTD3ASCIIRules, delimiter, encoder, decoder) + + -- Assigns default values if not specified. + if transitionalProcessing == nil then + transitionalProcessing = true + end + if checkHyphens == nil then + checkHyphens = true + end + if checkBidi == nil then + checkBidi = false + end + if checkJoiners == nil then + checkJoiners = false + end + if useSTD3ASCIIRules == nil then + useSTD3ASCIIRules = true + end + + delimiter = delimiter or 0x002E + encoder = encoder or unicode.utf8_enc + decoder = decoder or unicode.utf8_dec + + -- Breaks the codepoints into multiple tables using delimiter. + decoded_tbl = punycode.breakInput(decoded_tbl, delimiter) + if decoded_tbl == nil then + return nil + end + + local stringLabels = {} + + -- Format the codepoints into strings before passing to punycode.mapLabels + for _, label in ipairs(decoded_tbl) do + table.insert(stringLabels, unicode.encode(label, encoder)) + end + + return punycode.mapLabels(stringLabels, punycode.decode_label, encoder, unicode.encode({0x002E}, encoder)) + +end + +if not unittest.testing() then + return _ENV +end + +-- These are the used for two way testing (both encoding and decoding). +local encodingAndDecodingTestCases = { + { + "\xce\xb1\xcf\x80\xcf\x80\xce\xbb\xce\xb5.\xce\xba\xce\xbf\xce\xbc", + "xn--mxairta.xn--vxaei" + }, + { + "a\xe0\xa5\x8db", + "xn--ab-fsf" + }, + { + "\xd9\x86\xd8\xa7\xd9\x85\xd9\x87\xd8\xa7\xdb\x8c.com", + "xn--mgba3gch31f.com" + }, + { + "\xe0\xb7\x81\xe0\xb7\x8a\xe0\xb6\xbb\xe0\xb7\x93.com", + "xn--10cl1a0b.com" + }, + { + "\xd0\xbf\xd1\x80\xd0\xb0\xd0\xb2\xd0\xb8\xd1\x82\xd0\xb5\xd0\xbb\xd1\x8c\xd1\x81\xd1\x82\xd0\xb2\xd0\xbe.\xd1\x80\xd1\x84", + "xn--80aealotwbjpid2k.xn--p1ai" + }, + { + "\xe0\xa4\x95\xe0\xa4\xbe\xe0\xa4\xb6\xe0\xa5\x80\xe0\xa4\xaa\xe0\xa5\x81\xe0\xa4\xb0.\xe0\xa4\xad\xe0\xa4\xbe\xe0\xa4\xb0\xe0\xa4\xa4", + "xn--11b6bsw3bni.xn--h2brj9c" + }, + { + "rewanthcool.com", + "rewanthcool.com" + }, + { + "\xe3\xaf\x99\xe3\xaf\x9c\xe3\xaf\x99\xe3\xaf\x9f.com", + "xn--domain.com" + } +} + +-- These test cases are used for only converting them into ASCII text. +local toASCIITestCases = { + { + "ma\xc3\xb1ana.com", + "xn--maana-pta.com" + }, + { + "RewanthCool.com", + "rewanthcool.com" + }, + { + "\xc3\xb6bb.at", + "xn--bb-eka.at" + }, + { + "\xe3\x83\x89\xe3\x83\xa1\xe3\x82\xa4\xe3\x83\xb3.\xe3\x83\x86\xe3\x82\xb9\xe3\x83\x88", + "xn--eckwd4c7c.xn--zckzah" + }, + { + "\xd0\xb4\xd0\xbe\xd0\xbc\xd0\xb5\xd0\xbd\xd0\xb0.\xd0\xb8\xd1\x81\xd0\xbf\xd1\x8b\xd1\x82\xd0\xb0\xd0\xbd\xd0\xb8\xd0\xb5", + "xn--80ahd1agd.xn--80akhbyknj4f" + }, + { + "\xe6\xb5\x8b\xe8\xaf\x95", + "xn--0zwm56d" + }, + { + "k\xc3\xb6nigsg\xc3\xa4\xc3\x9fchen", + "xn--knigsgsschen-lcb0w" + }, + { + "fa\xc3\x9f.de", + "fass.de" + }, + { + "\xce\xb2\xcf\x8c\xce\xbb\xce\xbf\xcf\x82.com", + "xn--nxasmq6b.com" + }, + { + "mycharity\xe3\x80\x82org", + "mycharity.org" + }, + { + "K\xc3\xb6nigsg\xc3\xa4\xc3\x9fchen", + "xn--knigsgsschen-lcb0w" + }, + { + "B\xc3\xbccher.de", + "xn--bcher-kva.de" + }, + { + "xn--ma\xc3\xb1ana.com", + nil + } +} + +-- These test cases are used for only converting them into ASCII text. +-- The last two values in a table are outputs for different cases. +-- +-- Format: +-- { +-- input unicode string, +-- transitional processed output, --transitional=true +-- non-transitional processed output --transitional=false +-- } +local multipleProcessingTestCases = { + { + "a\xe0\xa5\x8d\xe2\x80\x8cb", + "xn--ab-fsf", + "xn--ab-fsf604u" + }, + { + "A\xe0\xa5\x8d\xe2\x80\x8cb", + "xn--ab-fsf", + "xn--ab-fsf604u" + }, + { + "A\xe0\xa5\x8d\xe2\x80\x8Cb", + "xn--ab-fsf", + "xn--ab-fsf604u" + }, + { + "\xd9\x86\xd8\xa7\xd9\x85\xd9\x87\xe2\x80\x8c\xd8\xa7\xdb\x8c", + "xn--mgba3gch31f", + "xn--mgba3gch31f060k" + }, + { + "\xd9\x86\xd8\xa7\xd9\x85\xd9\x87\xe2\x80\x8c\xd8\xa7\xdb\x8c.com", + "xn--mgba3gch31f.com", + "xn--mgba3gch31f060k.com" + }, + { + "\xc3\x9f\xe0\xa7\x81\xe1\xb7\xad\xe3\x80\x82\xd8\xa085", + "xn--ss-e2f077r.xn--85-psd", + "xn--zca266bwrr.xn--85-psd" + }, + { + "\xc3\x9f\xe0\xa7\x81\xe1\xb7\xad\xe3\x80\x82\xd8\xa08\xe2\x82\x85", + "xn--ss-e2f077r.xn--85-psd", + "xn--zca266bwrr.xn--85-psd" + } +} + +test_suite = unittest.TestSuite:new() + +for _, v in ipairs(toASCIITestCases) do + test_suite:add_test(unittest.equal(toASCII(unicode.decode(v[1], unicode.utf8_dec)), v[2])) +end + +for _, v in ipairs(encodingAndDecodingTestCases) do + test_suite:add_test(unittest.equal(toASCII(unicode.decode(v[1], unicode.utf8_dec)), v[2])) + test_suite:add_test(unittest.equal(toUnicode(unicode.decode(v[2], unicode.utf8_dec)), v[1])) +end + +for _, v in ipairs(multipleProcessingTestCases) do + -- Performs transitional conversion. + test_suite:add_test(unittest.equal(toASCII(unicode.decode(v[1], unicode.utf8_dec)), v[2])) + -- Performs non-transitional conversion. + test_suite:add_test(unittest.equal(toASCII(unicode.decode(v[1], unicode.utf8_dec), false), v[3])) +end + +return _ENV diff --git a/nselib/punycode.lua b/nselib/punycode.lua old mode 100755 new mode 100644 index 612e74125..841767346 --- a/nselib/punycode.lua +++ b/nselib/punycode.lua @@ -1,631 +1,630 @@ ---- --- Library methods for handling punycode strings. --- --- Punycode is a simple and efficient transfer encoding syntax designed --- for use with Internationalized Domain Names in Applications (IDNA). --- It uniquely and reversibly transforms a Unicode string into an ASCII --- string. ASCII characters in the Unicode string are represented --- literally, and non-ASCII characters are represented by ASCII --- characters that are allowed in host name labels (letters, digits, and --- hyphens). This document defines a general algorithm called --- Bootstring that allows a string of basic code points to uniquely --- represent any string of code points drawn from a larger set. --- Punycode is an instance of Bootstring that uses particular parameter --- values specified by this document, appropriate for IDNA. --- --- Advantages of Bootstring algorithm are Completeness, Uniqueness, --- Reversibility, Efficient encoding, Simplicity and Readability. --- --- References: --- * http://ietf.org/rfc/rfc3492.txt --- --- @author Rewanth Cool --- @copyright Same as Nmap--See https://nmap.org/book/man-legal.html - -local stdnse = require "stdnse" -local string = require "string" -local math = require "math" -local table = require "table" -local unicode = require "unicode" -local unittest = require "unittest" - -_ENV = stdnse.module("punycode", stdnse.seeall) - --- Localize few functions for a tiny speed boost, since these will be --- used frequently. -local floor = math.floor -local byte = string.byte -local char = string.char -local find = string.find -local match = string.match -local reverse = string.reverse -local sub = string.sub - --- Highest positive signed 32-bit float value -local maxInt = 0x7FFFFFFF - --- Regular expressions (RFC 3490 separators) -local regexSeparators = { - 0x3002, -- Ideographic full stop - 0xFF0E, -- Fullwidth full stop - 0xFF61 -- Halfwidth ideographic full stop -} - --- Bootstring parameters -local base = 0x24 -local tMin = 0x1 -local tMax = 0x1A -local skew = 0x26 -local damp = 0x2BC -local initialBias = 0x48 -local initialN = 0x80 -local delimiter = char("0x2D") - --- Convenience shortcuts -local baseMinusTMin = base - tMin - --- This function finds and replaces matched values in a table. --- --- @param tbl Table of values. --- @param val Value to to be replaced in the table. --- @param new_val Value to be replaced with. --- @return Returns a new table with new values. -local function find_and_replace(tbl, val, new_val) - - for index, data in pairs(tbl) do - if data == val then - tbl[index] = new_val - end - end - - return tbl - -end - - --- Bias adaptation function as per section 3.4 of RFC 3492. --- https://tools.ietf.org/html/rfc3492#section-3.4 --- The following function is adapted from punycode.js by Mathias Bynens --- under the MIT License. -local function adapt(delta, numPoints, firstTime) - - local k = 0; - - if firstTime then - delta = floor(delta / damp) - else - delta = (delta >> 1) - end - - delta = delta + floor(delta / numPoints) - - while delta > (baseMinusTMin * tMax >> 1) do - delta = floor(delta / baseMinusTMin) - k = k + base - end - - return floor(k + (baseMinusTMin + 1) * delta / (delta + skew)) - -end - --- The following function converts boolean value to integer. --- --- @param status boolean value is given as input. --- @return Returns 0/1 based on the given boolean input. -local function boolToNum(status) - - if status == true then - return 1 - else - return 0 - end - -end - --- This function converts a basic code point into a digit/integer. --- --- @param codePoint The basic numeric code point value. --- @return The numeric value of a basic code point (for use in --- representing integers) in the range `0` to `base - 1`, or `base` if --- the code point does not represent a value. --- The following function is adapted from punycode.js by Mathias Bynens --- under the MIT License. -local function basicToDigit(codePoint) - - if (codePoint - 0x30 < 0x0A) then - return codePoint - 0x16 - end - if (codePoint - 0x41 < 0x1A) then - return codePoint - 0x41 - end - if (codePoint - 0x61 < 0x1A) then - return codePoint - 0x61 - end - - return base - -end - - --- This function converts a digit/integer into a basic code point. --- --- @param digit The numeric value of a basic code point. --- @return The basic code point whose value (when used for --- representing integers) is `digit`, which needs to be in the range --- `0` to `base - 1`. If `flag` is non-zero, the uppercase form is --- used; else, the lowercase form is used. The behavior is undefined --- if `flag` is non-zero and `digit` has no uppercase form. --- The following function is adapted from punycode.js by Mathias Bynens --- under the MIT License. -local function digitToBasic(digit, flag) - -- 0..25 map to ASCII a..z or A..Z - -- 26..35 map to ASCII 0..9 - return digit + 22 + 75 * boolToNum(digit < 26) - (boolToNum((flag ~= 0)) << 5) -end - --- This function creates a string based on an array of numeric code points. --- --- @param input String of input to be encoded. --- @param decoder Sets the decoding format to be used. --- @return The new encoded string --- The following function is adapted from punycode.js by Mathias Bynens --- under the MIT License. -function encode_input(input, decoder) - - local output = {} - - -- Convert the input into an array of Unicode code points. - input = unicode.decode(input, decoder) - - -- Cache the length. - local inputLength = #input - - -- Initialize the state. - local n = initialN - local delta = 0 - local bias = initialBias - - -- Handle the basic code points. - for _, v in ipairs(input) do - if v < 0x80 then - table.insert(output, char(v)) - end - end - - local basicLength = #output - local handledCPCount = basicLength - - -- `handledCPCount` is the number of code points that have been handled - -- `basicLength` is the number of basic code points. - -- Finish the basic string with a delimiter unless it's empty. - if (basicLength > 0) then - table.insert(output, delimiter) - end - - -- Main encoding loop: - while (handledCPCount < inputLength) do - -- All non-basic code points < n have been handled already. Find - -- the next larger one: - local m = maxInt - for _, v in ipairs(input) do - if v >= n and v < m then - m = v - end - end - - -- Increase `delta` enough to advance the decoder's state to - -- , but guard against overflow. - local handledCPCountPlusOne = handledCPCount + 1 - if (m - n > floor((maxInt - delta) / handledCPCountPlusOne)) then - --error('overflow') - return nil, "Overflow exception occurred." - end - - delta = delta + (m - n) * handledCPCountPlusOne - n = m - - for _, currentValue in ipairs(input) do - - if currentValue < n then - delta = delta + 1 --Move this down incase of wrong answer - if delta > maxInt then - --error("overflow") - return nil, "Overflow exception occurred." - end - end - - if (currentValue == n) then - -- Represent delta as a generalized variable-length integer. - local q = delta - local k = base - - repeat - local t - - if k <= bias then - t = tMin - else - if k >= bias + tMax then - t = tMax - else - t = k - bias - end - end - - if q < t then - break - end - - local qMinusT = q - t - local baseMinusT = base - t - local ans = digitToBasic(t + qMinusT % baseMinusT, 0) - - table.insert(output, char(ans)) - - q = floor(qMinusT / baseMinusT) - - k = k + base - until false - - local ans = digitToBasic(q, 0) - table.insert(output, char(ans)) - bias = adapt(delta, handledCPCountPlusOne, handledCPCount == basicLength) - - delta = 0 - handledCPCount = handledCPCount + 1 - end - end - - delta = delta + 1 - n = n + 1 - - end - - return table.concat(output, '') - -end - --- This function converts a Punycode string of ASCII-only symbols to a --- string of Unicode symbols. --- --- @param input The Punycode string of ASCII-only symbols. --- @param encoder Defines the type of encoding format to be used. --- @return The resulting string of Unicode symbols. --- The following function is adapted from punycode.js by Mathias Bynens --- under the MIT License. -function decode_input(input, encoder) - - local output = {} - local inputLength = #input - local i = 0 - local n = initialN - local bias = initialBias - - local basic - if find(reverse(input), delimiter) then - basic = #input - find(reverse(input), delimiter) - else - basic = -1 - end - - if basic < 0 then - basic = 0 - end - - for j = 1, basic do - local c = sub(input, j, j) - local value = byte(c) - - if value >= 0x80 then - --error("Not basic") - return nil, "Not basic exception occurred." - end - table.insert(output, value) - end - - local index - if basic > 0 then - index = basic + 1 - else - index = 0 - end - - while index < inputLength do - local oldi = i - local w = 1 - local k = base - - repeat - - if index >= inputLength then - --error("Invalid input") - return nil, "Invalid input exception occurred." - end - - local c = sub(input, index+1, index+1) - local value = byte(c) - local digit = basicToDigit(value) - - index = index + 1 - - if (digit >= base or digit > floor((maxInt - i) / w)) then - --error('overflow'); - return nil, "Overflow exception occurred." - end - i = i + digit * w; - - local t - if k <= bias then - t = tMin - else - if k >= bias + tMax then - t = tMax - else - t = k - bias - end - end - - if digit < t then - break - end - - local baseMinusT = base - t; - if (w > floor(maxInt / baseMinusT)) then - --error('overflow'); - return nil, "Overflow exception occurred." - end - - w = w * baseMinusT; - k = k + base - - until false - - local out = #output + 1; - - bias = adapt(i - oldi, out, oldi == 0) - - -- `i` was supposed to wrap around from `out` to `0`, - -- incrementing `n` each time, so we'll fix that now: - if (floor(i / out) > maxInt - n) then - --error('overflow'); - return nil, "Overflow exception occurred." - end - - n = n + floor(i / out); - i = i % out; - for temp = #output, i, -1 do - output[temp+1] = output[temp] - end - output[i+1] = n - i = i + 1 - end - - return unicode.encode(output, encoder) - -end - --- The following function looks for non-ASCII characters in a string. --- --- @param s String of input to be encoded. --- @param decoder A decoder function to convert the domain into a --- table of Unicode code points. --- @return Returns encoded string. -function encode_label(s, decoder) - - local flag = false - local decoded_tbl = unicode.decode(s, decoder) - - -- Looks for non-ASCII character - for _, val in pairs(decoded_tbl) do - - if not (val >=0 and val <= 127) then - flag = true - break - end - - end - - if flag then - - local res, err = encode_input(s, decoder) - if err then - return nil, err - end - - return 'xn--' .. res - - else - return s - end - -end - --- The following function validates and decodes the given input. --- --- @param s String of input --- @param encoder An encoder function to convert a Unicode code point --- into a string of bytes. Default: unicode.utf8_enc --- @return Returns decoded string. -function decode_label(s, encoder) - - if match(s, "^xn%-%-") then - - local res, err = decode_input(sub(s, 5):lower(), encoder) - if err then - return nil, err - end - - return res - - else - return s - end - -end - --- The following function splits the domain name and maps it with the --- corresponding data. --- --- @param s The domain name to be processed. --- @param fn The function to be called for every label. --- @param formatter The type of encoder/decoder to be used. --- @param delimiter delimiter character for concatinating output. --- @return Returns encoded/decoded string based on the formatter. --- The following function is adapted from punycode.js by Mathias Bynens --- under the MIT License. -function mapLabels(labels, fn, formatter, delimiter) - - local encoded = {} - - for index, v in ipairs(labels) do - - local res, err = fn(labels[index], formatter) - - if err then - stdnse.debug2(err) - return nil - end - - encoded[index] = res - end - - return table.concat(encoded, delimiter) - -end - --- This function breaks the tables of codepoints using a delimiter. --- --- @param A table is given as an input which contains codepoints. --- @param ASCII value of delimiter is provided. --- @return Returns table of tables after breaking the give table using delimiter. -function breakInput(codepoints, delimiter) - - local tbl = {} - local output = {} - - local delimiter = delimiter or 0x002E - - for _, v in ipairs(codepoints) do - if v == delimiter then - table.insert(output, tbl) - tbl = {} - else - table.insert(tbl, v) - end - end - - table.insert(output, tbl) - - return output - -end - ---- --- This function converts the given domain name or string into a --- ASCII string. --- --- @param input Domain or string to be decoded. --- @param decoder A decoder function to convert the domain into a --- table of Unicode code points. Default: unicode.utf8_dec --- @param encoder An encoder function to convert a Unicode code --- point into a string of bytes. --- @param decoder An decoder function to decode the input string --- into an array of code points. --- @return Returns decoded string in the desired format. --- @return Throws an error, if any. -function encode(input, encoder, decoder) - - decoder = decoder or unicode.utf8_dec - encoder = encoder or unicode.utf8_enc - - local decoded_tbl = unicode.decode(input, decoder) - - -- Works only for punycode. - for _, val in pairs(regexSeparators) do - decoded_tbl = find_and_replace(decoded_tbl, val, byte('.')) - end - - local delimiterCodePoint = 0x002E - -- Expects codepoints and delimiter values. - local codepointLabels = breakInput(decoded_tbl, delimiterCodePoint) - - local stringLabels = {} - - for _, label in ipairs(codepointLabels) do - table.insert(stringLabels, unicode.encode(label, encoder)) - end - - local delimiter = unicode.encode({0x002E}, encoder) - - return mapLabels(stringLabels, encode_label, decoder, delimiter) -end - ---- --- This function converts the given domain name or string into a --- Unicode string. --- --- @param input Domain or string to be encoded. --- @param encoder An encoder function to convert a Unicode code --- point into a string of bytes. --- @param decoder An decoder function to decode the input string --- into an array of code points. --- @return Returns encoded string in the desired format. --- @return Throws an error, if any. -function decode(input, encoder, decoder) - - encoder = encoder or unicode.utf8_enc - decoder = decoder or unicode.utf8_dec - local delimiterCodePoint = 0x002E - local delimiter = unicode.encode({0x002E}, encoder) - - local codepoints = unicode.decode(input, decoder) - local codepointLabels = breakInput(codepoints, delimiterCodePoint) - - local stringLabels = {} - - for _, label in ipairs(codepointLabels) do - table.insert(stringLabels, unicode.encode(label, encoder)) - end - - return mapLabels(stringLabels, decode_label, encoder, delimiter) - -end - ---Ignore the rest if we are not testing. -if not unittest.testing() then - return _ENV -end - --- Table of punycode test cases. -local testCases = { - { - "xn--0zwm56d", - "\xe6\xb5\x8b\xe8\xaf\x95" - }, - { - "xn--knigsgsschen-lcb0w", - "k\xc3\xb6nigsg\xc3\xa4sschen" - }, - { - "xn--ab-fsf", - "a\xe0\xa5\x8db" - }, - { - "xn--maana-pta", - "ma\xc3\xb1ana" - }, - { - "xn----dqo34k", - "\xe2\x98\x83-\xe2\x8c\x98" - } -} - -test_suite = unittest.TestSuite:new() - --- Running test cases against Encoding function. -for i, v in ipairs(testCases) do - test_suite:add_test(unittest.equal(decode(v[1]), v[2])) - test_suite:add_test(unittest.equal(encode(v[2]), v[1])) -end - -return _ENV +--- +-- Library methods for handling punycode strings. +-- +-- Punycode is a simple and efficient transfer encoding syntax designed +-- for use with Internationalized Domain Names in Applications (IDNA). +-- It uniquely and reversibly transforms a Unicode string into an ASCII +-- string. ASCII characters in the Unicode string are represented +-- literally, and non-ASCII characters are represented by ASCII +-- characters that are allowed in host name labels (letters, digits, and +-- hyphens). This document defines a general algorithm called +-- Bootstring that allows a string of basic code points to uniquely +-- represent any string of code points drawn from a larger set. +-- Punycode is an instance of Bootstring that uses particular parameter +-- values specified by this document, appropriate for IDNA. +-- +-- Advantages of Bootstring algorithm are Completeness, Uniqueness, +-- Reversibility, Efficient encoding, Simplicity and Readability. +-- +-- References: +-- * http://ietf.org/rfc/rfc3492.txt +-- +-- @author Rewanth Cool +-- @copyright Same as Nmap--See https://nmap.org/book/man-legal.html + +local stdnse = require "stdnse" +local string = require "string" +local math = require "math" +local table = require "table" +local unicode = require "unicode" +local unittest = require "unittest" + +_ENV = stdnse.module("punycode", stdnse.seeall) + +-- Localize few functions for a tiny speed boost, since these will be +-- used frequently. +local floor = math.floor +local byte = string.byte +local char = string.char +local find = string.find +local match = string.match +local reverse = string.reverse +local sub = string.sub + +-- Highest positive signed 32-bit float value +local maxInt = 0x7FFFFFFF + +-- Regular expressions (RFC 3490 separators) +local regexSeparators = { + 0x3002, -- Ideographic full stop + 0xFF0E, -- Fullwidth full stop + 0xFF61 -- Halfwidth ideographic full stop +} + +-- Bootstring parameters +local base = 0x24 +local tMin = 0x1 +local tMax = 0x1A +local skew = 0x26 +local damp = 0x2BC +local initialBias = 0x48 +local initialN = 0x80 +local delimiter = char("0x2D") + +-- Convenience shortcuts +local baseMinusTMin = base - tMin + +-- This function finds and replaces matched values in a table. +-- +-- @param tbl Table of values. +-- @param val Value to to be replaced in the table. +-- @param new_val Value to be replaced with. +-- @return Returns a new table with new values. +local function find_and_replace(tbl, val, new_val) + + for index, data in pairs(tbl) do + if data == val then + tbl[index] = new_val + end + end + + return tbl + +end + + +-- Bias adaptation function as per section 3.4 of RFC 3492. +-- https://tools.ietf.org/html/rfc3492#section-3.4 +-- The following function is adapted from punycode.js by Mathias Bynens +-- under the MIT License. +local function adapt(delta, numPoints, firstTime) + + local k = 0; + + if firstTime then + delta = floor(delta / damp) + else + delta = (delta >> 1) + end + + delta = delta + floor(delta / numPoints) + + while delta > (baseMinusTMin * tMax >> 1) do + delta = floor(delta / baseMinusTMin) + k = k + base + end + + return floor(k + (baseMinusTMin + 1) * delta / (delta + skew)) + +end + +-- The following function converts boolean value to integer. +-- +-- @param status boolean value is given as input. +-- @return Returns 0/1 based on the given boolean input. +local function boolToNum(status) + + if status == true then + return 1 + else + return 0 + end + +end + +-- This function converts a basic code point into a digit/integer. +-- +-- @param codePoint The basic numeric code point value. +-- @return The numeric value of a basic code point (for use in +-- representing integers) in the range `0` to `base - 1`, or `base` if +-- the code point does not represent a value. +-- The following function is adapted from punycode.js by Mathias Bynens +-- under the MIT License. +local function basicToDigit(codePoint) + + if (codePoint - 0x30 < 0x0A) then + return codePoint - 0x16 + end + if (codePoint - 0x41 < 0x1A) then + return codePoint - 0x41 + end + if (codePoint - 0x61 < 0x1A) then + return codePoint - 0x61 + end + + return base + +end + + +-- This function converts a digit/integer into a basic code point. +-- +-- @param digit The numeric value of a basic code point. +-- @return The basic code point whose value (when used for +-- representing integers) is `digit`, which needs to be in the range +-- `0` to `base - 1`. If `flag` is non-zero, the uppercase form is +-- used; else, the lowercase form is used. The behavior is undefined +-- if `flag` is non-zero and `digit` has no uppercase form. +-- The following function is adapted from punycode.js by Mathias Bynens +-- under the MIT License. +local function digitToBasic(digit, flag) + -- 0..25 map to ASCII a..z or A..Z + -- 26..35 map to ASCII 0..9 + return digit + 22 + 75 * boolToNum(digit < 26) - (boolToNum((flag ~= 0)) << 5) +end + +-- Creates a string based on an array of numeric code points. +-- +-- @param input String of input to be encoded. +-- @param decoder Sets the decoding format to be used. +-- @return The new encoded string +-- The following function is adapted from punycode.js by Mathias Bynens +-- under the MIT License. +function encode_input(input, decoder) + + local output = {} + + -- Convert the input into an array of Unicode code points. + input = unicode.decode(input, decoder) + + -- Cache the length. + local inputLength = #input + + -- Initialize the state. + local n = initialN + local delta = 0 + local bias = initialBias + + -- Handle the basic code points. + for _, v in ipairs(input) do + if v < 0x80 then + table.insert(output, char(v)) + end + end + + local basicLength = #output + local handledCPCount = basicLength + + -- `handledCPCount` is the number of code points that have been handled + -- `basicLength` is the number of basic code points. + -- Finish the basic string with a delimiter unless it's empty. + if (basicLength > 0) then + table.insert(output, delimiter) + end + + -- Main encoding loop: + while (handledCPCount < inputLength) do + -- All non-basic code points < n have been handled already. Find + -- the next larger one: + local m = maxInt + for _, v in ipairs(input) do + if v >= n and v < m then + m = v + end + end + + -- Increase `delta` enough to advance the decoder's state to + -- , but guard against overflow. + local handledCPCountPlusOne = handledCPCount + 1 + if (m - n > floor((maxInt - delta) / handledCPCountPlusOne)) then + --error('overflow') + return nil, "Overflow exception occurred." + end + + delta = delta + (m - n) * handledCPCountPlusOne + n = m + + for _, currentValue in ipairs(input) do + + if currentValue < n then + delta = delta + 1 --Move this down incase of wrong answer + if delta > maxInt then + --error("overflow") + return nil, "Overflow exception occurred." + end + end + + if (currentValue == n) then + -- Represent delta as a generalized variable-length integer. + local q = delta + local k = base + + repeat + local t + + if k <= bias then + t = tMin + else + if k >= bias + tMax then + t = tMax + else + t = k - bias + end + end + + if q < t then + break + end + + local qMinusT = q - t + local baseMinusT = base - t + local ans = digitToBasic(t + qMinusT % baseMinusT, 0) + + table.insert(output, char(ans)) + + q = floor(qMinusT / baseMinusT) + + k = k + base + until false + + local ans = digitToBasic(q, 0) + table.insert(output, char(ans)) + bias = adapt(delta, handledCPCountPlusOne, handledCPCount == basicLength) + + delta = 0 + handledCPCount = handledCPCount + 1 + end + end + + delta = delta + 1 + n = n + 1 + + end + + return table.concat(output, '') + +end + +-- Converts a Punycode string of ASCII-only symbols to a +-- string of Unicode symbols. +-- +-- @param input The Punycode string of ASCII-only symbols. +-- @param encoder Defines the type of encoding format to be used. +-- @return The resulting string of Unicode symbols. +-- The following function is adapted from punycode.js by Mathias Bynens +-- under the MIT License. +function decode_input(input, encoder) + + local output = {} + local inputLength = #input + local i = 0 + local n = initialN + local bias = initialBias + + local basic + if find(reverse(input), delimiter) then + basic = #input - find(reverse(input), delimiter) + else + basic = -1 + end + + if basic < 0 then + basic = 0 + end + + for j = 1, basic do + local c = sub(input, j, j) + local value = byte(c) + + if value >= 0x80 then + --error("Not basic") + return nil, "Not basic exception occurred." + end + table.insert(output, value) + end + + local index + if basic > 0 then + index = basic + 1 + else + index = 0 + end + + while index < inputLength do + local oldi = i + local w = 1 + local k = base + + repeat + + if index >= inputLength then + --error("Invalid input") + return nil, "Invalid input exception occurred." + end + + local c = sub(input, index+1, index+1) + local value = byte(c) + local digit = basicToDigit(value) + + index = index + 1 + + if (digit >= base or digit > floor((maxInt - i) / w)) then + --error('overflow'); + return nil, "Overflow exception occurred." + end + i = i + digit * w; + + local t + if k <= bias then + t = tMin + else + if k >= bias + tMax then + t = tMax + else + t = k - bias + end + end + + if digit < t then + break + end + + local baseMinusT = base - t; + if (w > floor(maxInt / baseMinusT)) then + --error('overflow'); + return nil, "Overflow exception occurred." + end + + w = w * baseMinusT; + k = k + base + + until false + + local out = #output + 1; + + bias = adapt(i - oldi, out, oldi == 0) + + -- `i` was supposed to wrap around from `out` to `0`, + -- incrementing `n` each time, so we'll fix that now: + if (floor(i / out) > maxInt - n) then + --error('overflow'); + return nil, "Overflow exception occurred." + end + + n = n + floor(i / out); + i = i % out; + for temp = #output, i, -1 do + output[temp+1] = output[temp] + end + output[i+1] = n + i = i + 1 + end + + return unicode.encode(output, encoder) + +end + +-- Performs punycode encoding on a label +-- +-- @param s String of input to be encoded. +-- @param decoder A decoder function to convert the domain into a +-- table of Unicode code points. +-- @return Returns encoded string. +function encode_label(s, decoder) + + local flag = false + local decoded_tbl = unicode.decode(s, decoder) + + -- Looks for non-ASCII character + for _, val in pairs(decoded_tbl) do + + if not (val >=0 and val <= 127) then + flag = true + break + end + + end + + if flag then + + local res, err = encode_input(s, decoder) + if err then + return nil, err + end + + return 'xn--' .. res + + else + return s + end + +end + +--- Decodes a punycode-encoded label to Unicode. +-- +-- @param s String of input +-- @param encoder An encoder function to convert a Unicode code point +-- into a string of bytes. Default: unicode.utf8_enc +-- @return Returns decoded string. +function decode_label(s, encoder) + + if match(s, "^xn%-%-") then + + local res, err = decode_input(sub(s, 5):lower(), encoder) + if err then + return nil, err + end + + return res + + else + return s + end + +end + +--- Splits the domain name and maps it with the corresponding data. +-- +-- @param s The domain name to be processed. +-- @param fn The function to be called for every label. +-- @param formatter The type of encoder/decoder to be used. +-- @param delimiter delimiter character for concatinating output. +-- @return Returns encoded/decoded string based on the formatter. +-- The following function is adapted from punycode.js by Mathias Bynens +-- under the MIT License. +function mapLabels(labels, fn, formatter, delimiter) + + local encoded = {} + + for index, v in ipairs(labels) do + + local res, err = fn(labels[index], formatter) + + if err then + stdnse.debug2(err) + return nil + end + + encoded[index] = res + end + + return table.concat(encoded, delimiter) + +end + +--- Breaks the tables of codepoints using a delimiter. +-- +-- @param A table is given as an input which contains codepoints. +-- @param ASCII value of delimiter is provided. +-- @return Returns table of tables after breaking the give table using delimiter. +function breakInput(codepoints, delimiter) + + local tbl = {} + local output = {} + + local delimiter = delimiter or 0x002E + + for _, v in ipairs(codepoints) do + if v == delimiter then + table.insert(output, tbl) + tbl = {} + else + table.insert(tbl, v) + end + end + + table.insert(output, tbl) + + return output + +end + +--- +-- This function converts the given domain name or string into a +-- ASCII string. +-- +-- @param input Domain or string to be decoded. +-- @param decoder A decoder function to convert the domain into a +-- table of Unicode code points. Default: unicode.utf8_dec +-- @param encoder An encoder function to convert a Unicode code +-- point into a string of bytes. +-- @param decoder An decoder function to decode the input string +-- into an array of code points. +-- @return Returns decoded string in the desired format. +-- @return Throws an error, if any. +function encode(input, encoder, decoder) + + decoder = decoder or unicode.utf8_dec + encoder = encoder or unicode.utf8_enc + + local decoded_tbl = unicode.decode(input, decoder) + + -- Works only for punycode. + for _, val in pairs(regexSeparators) do + decoded_tbl = find_and_replace(decoded_tbl, val, byte('.')) + end + + local delimiterCodePoint = 0x002E + -- Expects codepoints and delimiter values. + local codepointLabels = breakInput(decoded_tbl, delimiterCodePoint) + + local stringLabels = {} + + for _, label in ipairs(codepointLabels) do + table.insert(stringLabels, unicode.encode(label, encoder)) + end + + local delimiter = unicode.encode({0x002E}, encoder) + + return mapLabels(stringLabels, encode_label, decoder, delimiter) +end + +--- +-- This function converts the given domain name or string into a +-- Unicode string. +-- +-- @param input Domain or string to be encoded. +-- @param encoder An encoder function to convert a Unicode code +-- point into a string of bytes. +-- @param decoder An decoder function to decode the input string +-- into an array of code points. +-- @return Returns encoded string in the desired format. +-- @return Throws an error, if any. +function decode(input, encoder, decoder) + + encoder = encoder or unicode.utf8_enc + decoder = decoder or unicode.utf8_dec + local delimiterCodePoint = 0x002E + local delimiter = unicode.encode({0x002E}, encoder) + + local codepoints = unicode.decode(input, decoder) + local codepointLabels = breakInput(codepoints, delimiterCodePoint) + + local stringLabels = {} + + for _, label in ipairs(codepointLabels) do + table.insert(stringLabels, unicode.encode(label, encoder)) + end + + return mapLabels(stringLabels, decode_label, encoder, delimiter) + +end + +--Ignore the rest if we are not testing. +if not unittest.testing() then + return _ENV +end + +-- Table of punycode test cases. +local testCases = { + { + "xn--0zwm56d", + "\xe6\xb5\x8b\xe8\xaf\x95" + }, + { + "xn--knigsgsschen-lcb0w", + "k\xc3\xb6nigsg\xc3\xa4sschen" + }, + { + "xn--ab-fsf", + "a\xe0\xa5\x8db" + }, + { + "xn--maana-pta", + "ma\xc3\xb1ana" + }, + { + "xn----dqo34k", + "\xe2\x98\x83-\xe2\x8c\x98" + } +} + +test_suite = unittest.TestSuite:new() + +-- Running test cases against Encoding function. +for i, v in ipairs(testCases) do + test_suite:add_test(unittest.equal(decode(v[1]), v[2])) + test_suite:add_test(unittest.equal(encode(v[2]), v[1])) +end + +return _ENV