diff --git a/nselib/httpspider.lua b/nselib/httpspider.lua index 41f1c9d2d..a7139c963 100644 --- a/nselib/httpspider.lua +++ b/nselib/httpspider.lua @@ -131,6 +131,7 @@ Options = { if ( o.withinhost ) then o.withinhost = function(u) local parsed_u = url.parse(tostring(u)) + local host = parsed_u.ascii_host or parsed_u.host if ( o.base_url:getPort() ~= 80 and o.base_url:getPort() ~= 443 ) then if ( parsed_u.port ~= tonumber(o.base_url:getPort()) ) then @@ -139,7 +140,7 @@ Options = { elseif ( parsed_u.scheme ~= o.base_url:getProto() ) then return false -- if urls don't match only on the "www" prefix, then they are probably the same - elseif ( parsed_u.host == nil or removewww(parsed_u.host:lower()) ~= removewww(o.base_url:getHost():lower()) ) then + elseif ( host == nil or removewww(host:lower()) ~= removewww(o.base_url:getHost():lower()) ) then return false end return true @@ -148,13 +149,14 @@ Options = { if ( o.withindomain ) then o.withindomain = function(u) local parsed_u = url.parse(tostring(u)) + local host = parsed_u.ascii_host or parsed_u.host if ( o.base_url:getPort() ~= 80 and o.base_url:getPort() ~= 443 ) then if ( parsed_u.port ~= tonumber(o.base_url:getPort()) ) then return false end elseif ( parsed_u.scheme ~= o.base_url:getProto() ) then return false - elseif ( parsed_u.host == nil or parsed_u.host:sub(-#o.base_url:getDomain()):lower() ~= o.base_url:getDomain():lower() ) then + elseif ( host == nil or host:sub(-#o.base_url:getDomain()):lower() ~= o.base_url:getDomain():lower() ) then return false end return true @@ -422,21 +424,19 @@ URL = { -- URL components -- @return status true on success, false on failure parse = function(self) - self.proto, self.host, self.port, self.file = self.raw:match("^(http[s]?)://([^:/]*)[:]?(%d*)") - if ( self.proto and self.host ) then + local parsed = url.parse(self.raw) + if parsed.scheme and parsed.scheme:match("^https?$") then + self.proto = parsed.scheme + self.host = parsed.ascii_host or parsed.host + self.port = tonumber(parsed.port) or url.get_default_port(self.proto) + -- XXX: This should be parsed via url.lua, but this legacy pattern works + -- and is simpler for now. self.file = self.raw:match("^http[s]?://[^:/]*[:]?%d*(/[^#]*)") or '/' - self.port = tonumber(self.port) or url.get_default_port(self.proto) - - self.path = self.file:match("^([^?]*)[%?]?") + self.path = parsed.path self.dir = self.path:match("^(.+%/)") or "/" + -- TODO: Use public suffix list to extract domain self.domain= self.host:match("^[^%.]-%.(.*)") return true - elseif( self.raw:match("^javascript:") ) then - stdnse.debug2("%s: Skipping javascript url: %s", LIBRARY_NAME, self.raw) - elseif( self.raw:match("^mailto:") ) then - stdnse.debug2("%s: Skipping mailto link: %s", LIBRARY_NAME, self.raw) - else - stdnse.debug2("%s: WARNING: Failed to parse url: %s", LIBRARY_NAME, self.raw) end return false end, @@ -543,6 +543,7 @@ Crawler = { -- @param u URL that points to the resource we want to check. iswithinhost = function(self, u) local parsed_u = url.parse(tostring(u)) + local host = parsed_u.ascii_host or parsed_u.host if ( self.options.base_url:getPort() ~= 80 and self.options.base_url:getPort() ~= 443 ) then if ( parsed_u.port ~= tonumber(self.options.base_url:getPort()) ) then return false @@ -550,7 +551,7 @@ Crawler = { elseif ( parsed_u.scheme ~= self.options.base_url:getProto() ) then return false -- if urls don't match only on the "www" prefix, then they are probably the same - elseif ( parsed_u.host == nil or self.removewww(parsed_u.host:lower()) ~= self.removewww(self.options.base_url:getHost():lower()) ) then + elseif ( host == nil or self.removewww(host:lower()) ~= self.removewww(self.options.base_url:getHost():lower()) ) then return false end return true @@ -560,13 +561,14 @@ Crawler = { -- @param u URL that points to the resource we want to check. iswithindomain = function(self, u) local parsed_u = url.parse(tostring(u)) + local host = parsed_u.ascii_host or parsed_u.host if ( self.options.base_url:getPort() ~= 80 and self.options.base_url:getPort() ~= 443 ) then if ( parsed_u.port ~= tonumber(self.options.base_url:getPort()) ) then return false end elseif ( parsed_u.scheme ~= self.options.base_url:getProto() ) then return false - elseif ( parsed_u.host == nil or parsed_u.host:sub(-#self.options.base_url:getDomain()):lower() ~= self.options.base_url:getDomain():lower() ) then + elseif ( host == nil or host:sub(-#self.options.base_url:getDomain()):lower() ~= self.options.base_url:getDomain():lower() ) then return false end return true