From e3ab97215d8575fab936b139375c7d67a823cbe9 Mon Sep 17 00:00:00 2001 From: dmiller Date: Wed, 21 Sep 2022 21:03:57 +0000 Subject: [PATCH] Handle internationalized domain names (IDN) --- CHANGELOG | 3 +++ NmapOps.cc | 5 +++++ NmapOps.h | 1 + main.cc | 2 ++ nselib/http.lua | 54 +++++++++++++++++++++++++++++-------------------- nselib/url.lua | 18 +++++++++++++++-- tcpip.cc | 9 +++++++++ 7 files changed, 68 insertions(+), 24 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 1f0246a42..90a2f7d5a 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,8 @@ #Nmap Changelog ($Id$); -*-text-*- +o [GH#1023] Handle Internationalized Domain Names (IDN) like Яндекс.рф on + platforms where getaddrinfo supports the AI_IDN flag. [Daniel Miller] + o Avoid storing many small strings from IPv4 OS detection results in the global string_pool. These were effectively leaked after a host is done being scanned, since string_pool allocations are not freed until Nmap quits. diff --git a/NmapOps.cc b/NmapOps.cc index 2c4d00175..3fb7b452b 100644 --- a/NmapOps.cc +++ b/NmapOps.cc @@ -125,6 +125,10 @@ NmapOps::~NmapOps() { free(datadir); datadir = NULL; } + if (locale) { + free(locale); + locale = NULL; + } #ifndef NOLUA if (scriptversion || script) @@ -305,6 +309,7 @@ void NmapOps::Initialize() { numhosts_up = 0; numhosts_scanning = 0; noninteractive = false; + locale = NULL; current_scantype = STYPE_UNKNOWN; ipoptions = NULL; ipoptionslen = 0; diff --git a/NmapOps.h b/NmapOps.h index 740d2ddc3..60e93b5ee 100644 --- a/NmapOps.h +++ b/NmapOps.h @@ -356,6 +356,7 @@ class NmapOps { int numhosts_scanning; stype current_scantype; bool noninteractive; + char *locale; bool release_memory; /* suggest to release memory before quitting. used to find memory leaks. 
*/ private: diff --git a/main.cc b/main.cc index 73a47508a..1df32b7ee 100644 --- a/main.cc +++ b/main.cc @@ -63,6 +63,7 @@ /* $Id$ */ #include <signal.h> +#include <locale.h> #include "nmap.h" #include "NmapOps.h" @@ -116,6 +117,7 @@ int main(int argc, char *argv[]) { int ret; int i; + o.locale = strdup(setlocale(LC_CTYPE, NULL)); set_program_name(argv[0]); #ifdef __amigaos__ diff --git a/nselib/http.lua b/nselib/http.lua index e6936ea9c..8be3e8d08 100644 --- a/nselib/http.lua +++ b/nselib/http.lua @@ -145,6 +145,7 @@ local stringaux = require "stringaux" local table = require "table" local tableaux = require "tableaux" local url = require "url" +local ascii_hostname = url.ascii_hostname local smbauth = require "smbauth" local unicode = require "unicode" @@ -187,8 +188,9 @@ local function get_host_field(host, port, scheme) if host_header then return host_header end -- If there's no host, we can't invent a name. if not host then return nil end + local hostname = ascii_hostname(host) -- If there's no port, just return hostname. - if not port then return stdnse.get_hostname(host) end + if not port then return hostname end if type(port) == "string" then port = tonumber(port) assert(port, "Invalid port: not a number or table") @@ -200,7 +202,7 @@ local function get_host_field(host, port, scheme) if scheme then -- Caller provided scheme. If it's default, return just the hostname. if number == get_default_port(scheme) then - return stdnse.get_hostname(host) + return hostname end else scheme = url.get_default_scheme(port) @@ -210,12 +212,12 @@ local function get_host_field(host, port, scheme) if (ssl_port and scheme == 'https') or (not ssl_port and scheme == 'http') then -- If it's SSL and https, or if it's plaintext and http, return just the hostname. - return stdnse.get_hostname(host) + return hostname end end end -- No special cases matched, so include the port number in the host header - return stdnse.get_hostname(host) .. ":" .. number + return hostname .. ":" .. 
number end -- Skip *( SP | HT ) starting at offset. See RFC 2616, section 2.2. @@ -1076,7 +1078,7 @@ local function lookup_cache (method, host, port, path, options) if type(port) == "table" then port = port.number end - local key = stdnse.get_hostname(host)..":"..port..":"..path; + local key = ascii_hostname(host)..":"..port..":"..path; local mutex = nmap.mutex(tostring(lookup_cache)..key); local state = { @@ -1615,7 +1617,7 @@ local redirect_ok_rules = { -- * ccTLDs are not treated as such. The rule will not stop a redirect -- from foo.co.uk to bar.co.uk even though it logically should. function (url, host, port) - local hostname = stdnse.get_hostname(host) + local hostname = ascii_hostname(host) if hostname == host.ip then return url.host == hostname end @@ -1700,7 +1702,7 @@ function parse_redirect(host, port, path, response) local u = url.parse(response.header.location) if ( not(u.host) ) then -- we're dealing with a relative url - u.host = stdnse.get_hostname(host) + u.host = ascii_hostname(host) end -- do port fixup u.port = u.port or get_default_port(u.scheme) or port.number @@ -1811,7 +1813,7 @@ function get_url( u, options ) path = path .. "?" .. parsed.query end - return get( parsed.host, port, path, options ) + return get( parsed.ascii_host or parsed.host, port, path, options ) end ---Fetches a resource with a HEAD request. @@ -2857,7 +2859,7 @@ end --@param contenttype [optional] The content-type value for the path, if it's known. 
function save_path(host, port, path, status, links_to, linked_from, contenttype) -- Make sure we have a proper hostname and port - host = stdnse.get_hostname(host) + host = ascii_hostname(host) if(type(port) == 'table') then port = port['number'] end @@ -2888,42 +2890,50 @@ function save_path(host, port, path, status, links_to, linked_from, contenttype) end end + if parsed.host then + host = parsed.ascii_host or parsed.host + end + + if parsed.port then + port = parsed.port + end + -- Add to the 'all_pages' key - stdnse.registry_add_array({parsed['host'] or host, 'www', parsed['port'] or port, 'all_pages'}, parsed['path']) + stdnse.registry_add_array({host, 'www', port, 'all_pages'}, parsed['path']) -- Add the URL with querystring to all_pages_full_query - stdnse.registry_add_array({parsed['host'] or host, 'www', parsed['port'] or port, 'all_pages_full_query'}, parsed['path_query']) + stdnse.registry_add_array({host, 'www', port, 'all_pages_full_query'}, parsed['path_query']) -- Add the URL to a key matching the response code if(status) then - stdnse.registry_add_array({parsed['host'] or host, 'www', parsed['port'] or port, 'status_codes', status}, parsed['path']) + stdnse.registry_add_array({host, 'www', port, 'status_codes', status}, parsed['path']) end -- If it's a directory, add it to the directories list; otherwise, add it to the files list if(parsed['is_folder']) then - stdnse.registry_add_array({parsed['host'] or host, 'www', parsed['port'] or port, 'directories'}, parsed['path']) + stdnse.registry_add_array({host, 'www', port, 'directories'}, parsed['path']) else - stdnse.registry_add_array({parsed['host'] or host, 'www', parsed['port'] or port, 'files'}, parsed['path']) + stdnse.registry_add_array({host, 'www', port, 'files'}, parsed['path']) end -- If we have an extension, add it to the extensions key if(parsed['extension']) then - stdnse.registry_add_array({parsed['host'] or host, 'www', parsed['port'] or port, 'extensions', parsed['extension']}, 
parsed['path']) + stdnse.registry_add_array({host, 'www', port, 'extensions', parsed['extension']}, parsed['path']) end -- Add an entry for the page and its arguments if(parsed['querystring']) then -- Add all scripts with a querystring to the 'cgi' and 'cgi_full_query' keys - stdnse.registry_add_array({parsed['host'] or host, 'www', parsed['port'] or port, 'cgi'}, parsed['path']) - stdnse.registry_add_array({parsed['host'] or host, 'www', parsed['port'] or port, 'cgi_full_query'}, parsed['path_query']) + stdnse.registry_add_array({host, 'www', port, 'cgi'}, parsed['path']) + stdnse.registry_add_array({host, 'www', port, 'cgi_full_query'}, parsed['path_query']) -- Add the query string alone to the registry (probably not necessary) - stdnse.registry_add_array({parsed['host'] or host, 'www', parsed['port'] or port, 'cgi_querystring', parsed['path'] }, parsed['raw_querystring']) + stdnse.registry_add_array({host, 'www', port, 'cgi_querystring', parsed['path'] }, parsed['raw_querystring']) -- Add the individual arguments for the page, along with their values for key, value in pairs(parsed['querystring']) do - stdnse.registry_add_array({parsed['host'] or host, 'www', parsed['port'] or port, 'cgi_args', parsed['path']}, parsed['querystring']) + stdnse.registry_add_array({host, 'www', port, 'cgi_args', parsed['path']}, parsed['querystring']) end end @@ -2934,7 +2944,7 @@ function save_path(host, port, path, status, links_to, linked_from, contenttype) end for _, v in ipairs(links_to) do - stdnse.registry_add_array({parsed['host'] or host, 'www', parsed['port'] or port, 'links_to', parsed['path_query']}, v) + stdnse.registry_add_array({host, 'www', port, 'links_to', parsed['path_query']}, v) end end @@ -2945,13 +2955,13 @@ function save_path(host, port, path, status, links_to, linked_from, contenttype) end for _, v in ipairs(linked_from) do - stdnse.registry_add_array({parsed['host'] or host, 'www', parsed['port'] or port, 'links_to', v}, parsed['path_query']) + 
stdnse.registry_add_array({host, 'www', port, 'links_to', v}, parsed['path_query']) end end -- Save it as a content-type, if we have one if(contenttype) then - stdnse.registry_add_array({parsed['host'] or host, 'www', parsed['port'] or port, 'content-type', contenttype}, parsed['path_query']) + stdnse.registry_add_array({host, 'www', port, 'content-type', contenttype}, parsed['path_query']) end end diff --git a/nselib/url.lua b/nselib/url.lua index e066a5f10..a0e030878 100644 --- a/nselib/url.lua +++ b/nselib/url.lua @@ -138,6 +138,21 @@ local function normalize_escape (s) return escape(unescape(s)) end +function ascii_hostname(host) + local hostname = stdnse.get_hostname(host) + if hostname:match("[\x80-\xff]") then + -- TODO: Allow other Unicode encodings + local decoded = unicode.decode(hostname, unicode.utf8_dec) + if decoded then + local ascii_host = idna.toASCII(decoded) + if ascii_host then + hostname = ascii_host + end + end + end + return hostname +end + --- -- Parses a URL and returns a table with all its parts according to RFC 3986. -- @@ -219,8 +234,7 @@ function parse(url, default) function(p) parsed.port = tonumber(p); return "" end) if authority ~= "" then parsed.host = authority end if parsed.host then - -- TODO: Allow other Unicode encodings - parsed.ascii_host = idna.toASCII(unicode.decode(parsed.host, unicode.utf8_dec)) + parsed.ascii_host = ascii_hostname(parsed.host) end local userinfo = parsed.userinfo if not userinfo then return parsed end diff --git a/tcpip.cc b/tcpip.cc index 980d54969..40aa09e0c 100644 --- a/tcpip.cc +++ b/tcpip.cc @@ -65,6 +65,7 @@ #include "nmap.h" +#include <locale.h> #include "nbase.h" #include <dnet.h> #include "tcpip.h" @@ -419,7 +420,15 @@ struct addrinfo *resolve_all(const char *hostname, int pf) { hints.ai_family = pf; /* Otherwise we get multiple identical addresses with different socktypes. 
*/ hints.ai_socktype = SOCK_DGRAM; +#ifdef AI_IDN + /* Try resolving internationalized domain names */ + hints.ai_flags = AI_IDN; + setlocale(LC_CTYPE, ""); +#endif rc = getaddrinfo(hostname, NULL, &hints, &result); +#ifdef AI_IDN + setlocale(LC_CTYPE, o.locale); +#endif if (rc != 0){ if (o.debugging > 1) error("Error resolving %s: %s", hostname, gai_strerror(rc));