
Fixed a number of bugs and prepared the library to handle the new redirect code being added to the http library. [Patrik]
Author: patrik
Date: 2012-02-02 21:23:19 +00:00
Parent: 0f92825783
Commit: 2d55f8822c
3 changed files with 107 additions and 39 deletions
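
The interesting change is in crawl_thread (third hunk of the first file): when a fetched page answers with a Location header, the crawler now follows the redirect itself, treating an absolute Location as a brand-new URL and a relative one as a path rewrite on the current URL. A minimal standalone sketch of that resolution rule — the resolve_redirect name is hypothetical and for illustration only; URL:new is the library constructor shown in the diff:

  -- Sketch: mirrors the Location handling added to crawl_thread below.
  -- resolve_redirect is a hypothetical helper, not part of the library.
  local function resolve_redirect(url, location)
    if ( location:match("^http") ) then
      -- absolute redirect: start over with a freshly parsed URL
      return URL:new(location)
    else
      -- relative redirect: keep host and port, swap in the new path
      url.path = location
      return url
    end
  end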

File 1 of 3:

@@ -73,19 +73,10 @@ Options = {
     -- set a few default values
     o.timeout = options.timeout or 10000
-    o.withindomain = o.withindomain or false
-    -- we default to withinhost, unless withindomain is set
-    if ( o.withindomain ) then
-      o.withinhost = o.withinhost or false
-    else
-      o.withinhost = o.withinhost or true
-    end
     o.whitelist = o.whitelist or {}
     o.blacklist = o.blacklist or {}
-    if ( o.withinhost or o.withindomain ) then
+    if ( o.withinhost == true or o.withindomain == true ) then
       local host_match, domain_match
       if ( ( o.base_url:getProto() == 'https' and o.base_url:getPort() == 443 ) or
          ( o.base_url:getProto() == 'http' and o.base_url:getPort() == 80 ) ) then
@@ -101,7 +92,6 @@ Options = {
         domain_match = ("%s://.*%s/"):format(o.base_url:getProto(), o.base_url:getDomain() )
       end
     end
     -- set up the appropriate matching functions
     if ( o.withinhost ) then
       o.withinhost = function(url) return string.match(tostring(url), host_match) end
@@ -277,7 +267,7 @@ LinkExtractor = {
           return false
         end
       end
       -- withinhost trumps any whitelisting
       if ( self.options.withinhost ) then
         if ( not(self.options.withinhost(url)) ) then
@@ -410,7 +400,14 @@ URL = {
   -- Gets the domain component of the URL
   -- @return domain string containing the hosts domain
-  getDomain = function(self) return self.domain end,
+  getDomain = function(self)
+    if ( self.domain ) then
+      return self.domain
+    -- fallback to the host, if we can't find a domain
+    else
+      return self.host
+    end
+  end,

   -- Converts the URL to a string
   -- @return url string containing the string representation of the url
@@ -589,7 +586,7 @@ Crawler = {
   crawl_thread = function(self, response_queue)
     local condvar = nmap.condvar(response_queue)
-    if ( self.options.withinhost and self.options.withindomain ) then
+    if ( false ~= self.options.withinhost and false ~= self.options.withindomain ) then
       table.insert(response_queue, { false, { err = true, reason = "Invalid options: withinhost and withindomain can't both be true" } })
       condvar "signal"
       return
@@ -634,13 +631,26 @@ Crawler = {
       -- fetch the url, and then push it to the processed table
       local response = http.get(url:getHost(), url:getPort(), url:getFile(), { timeout = self.options.timeout } )
       self.processed[tostring(url)] = true
-      -- if we have a response, proceed scraping it
-      if ( response.body ) then
-        local links = LinkExtractor:new(url, response.body, self.options):getLinks()
-        self.urlqueue:add(links)
+      if ( response ) then
+        -- were we redirected?
+        if ( response.location ) then
+          -- was the link absolute?
+          if ( response.location:match("^http") ) then
+            url = URL:new(response.location)
+          -- guess not
+          else
+            url.path = response.location
+          end
+        end
+        -- if we have a response, proceed scraping it
+        if ( response.body ) then
+          local links = LinkExtractor:new(url, response.body, self.options):getLinks()
+          self.urlqueue:add(links)
+        end
+      else
+        response = { body = "", headers = {} }
       end
       table.insert(response_queue, { true, { url = url, response = response } } )
       while ( PREFETCH_SIZE < #response_queue ) do
         stdnse.print_debug(2, "%s: Response queue full, waiting ...", LIBRARY_NAME)
@@ -659,28 +669,78 @@ Crawler = {
       return
     end
-    self.options.maxdepth = self.options.maxdepth or tonumber(stdnse.get_script_args(sn .. ".maxdepth"))
-    self.options.maxpagecount = self.options.maxpagecount or tonumber(stdnse.get_script_args(sn .. ".maxpagecount"))
-    self.url = self.url or stdnse.get_script_args(sn .. ".url")
-    self.options.withinhost = self.options.withinhost or stdnse.get_script_args(sn .. ".withinhost")
-    self.options.withindomain = self.options.withindomain or stdnse.get_script_args(sn .. ".withindomain")
-    self.options.noblacklist = self.options.noblacklist or stdnse.get_script_args(sn .. ".noblacklist")
+    if ( nil == self.options.maxdepth ) then
+      self.options.maxdepth = tonumber(stdnse.get_script_args(sn .. ".maxdepth"))
+    end
+    if ( nil == self.options.maxpagecount ) then
+      self.options.maxpagecount = tonumber(stdnse.get_script_args(sn .. ".maxpagecount"))
+    end
+    if ( nil == self.url ) then
+      self.url = stdnse.get_script_args(sn .. ".url")
+    end
+    if ( nil == self.options.withinhost ) then
+      self.options.withinhost = stdnse.get_script_args(sn .. ".withinhost")
+    end
+    if ( nil == self.options.withindomain ) then
+      self.options.withindomain = stdnse.get_script_args(sn .. ".withindomain")
+    end
+    if ( nil == self.options.noblacklist ) then
+      self.options.noblacklist = stdnse.get_script_args(sn .. ".noblacklist")
+    end
   end,

   -- Loads the argument on a library level
   loadLibraryArguments = function(self)
     local ln = LIBRARY_NAME
-    self.options.maxdepth = self.options.maxdepth or tonumber(stdnse.get_script_args(ln .. ".maxdepth"))
-    self.options.maxpagecount = self.options.maxpagecount or tonumber(stdnse.get_script_args(ln .. ".maxpagecount"))
-    self.url = self.url or stdnse.get_script_args(ln .. ".url")
-    self.options.withinhost = self.options.withinhost or stdnse.get_script_args(ln .. ".withinhost")
-    self.options.withindomain = self.options.withindomain or stdnse.get_script_args(ln .. ".withindomain")
-    self.options.noblacklist = self.options.noblacklist or stdnse.get_script_args(ln .. ".noblacklist")
+    if ( nil == self.options.maxdepth ) then
+      self.options.maxdepth = tonumber(stdnse.get_script_args(ln .. ".maxdepth"))
+    end
+    if ( nil == self.options.maxpagecount ) then
+      self.options.maxpagecount = tonumber(stdnse.get_script_args(ln .. ".maxpagecount"))
+    end
+    if ( nil == self.url ) then
+      self.url = stdnse.get_script_args(ln .. ".url")
+    end
+    if ( nil == self.options.withinhost ) then
+      self.options.withinhost = stdnse.get_script_args(ln .. ".withinhost")
+    end
+    if ( nil == self.options.withindomain ) then
+      self.options.withindomain = stdnse.get_script_args(ln .. ".withindomain")
+    end
+    if ( nil == self.options.noblacklist ) then
+      self.options.noblacklist = stdnse.get_script_args(ln .. ".noblacklist")
+    end
   end,

   -- Loads any defaults for arguments that were not set
   loadDefaultArguments = function(self)
+    local function tobool(b)
+      if ( nil == b ) then
+        return
+      end
+      assert("string" == type(b) or "boolean" == type(b), "httpspider: tobool failed, unsupported type")
+      if ( "string" == type(b) ) then
+        if ( "true" == b ) then
+          return true
+        else
+          return false
+        end
+      end
+      return b
+    end
+
+    -- fixup some booleans to make sure they're actually booleans
+    self.options.withinhost = tobool(self.options.withinhost)
+    self.options.withindomain = tobool(self.options.withindomain)
+    self.options.noblacklist = tobool(self.options.noblacklist)
+
+    if ( self.options.withinhost == nil ) then
+      self.options.withinhost = true
+    end
+    if ( self.options.withindomain == nil ) then
+      self.options.withindomain = false
+    end
     self.options.maxdepth = self.options.maxdepth or 3
     self.options.maxpagecount = self.options.maxpagecount or 20
     self.url = self.url or '/'
@@ -690,7 +750,6 @@ Crawler = {
   getLimitations = function(self)
     local o = self.options
     local limits = {}
     if ( o.maxdepth > 0 or o.maxpagecount > 0 or
          o.withinhost or o.wihtindomain ) then
       if ( o.maxdepth > 0 ) then
@@ -700,7 +759,7 @@ Crawler = {
         table.insert(limits, ("maxpagecount=%d"):format(o.maxpagecount))
       end
       if ( o.withindomain ) then
-        table.insert(limits, ("withindomain=%s"):format(o.base_url:getDomain()))
+        table.insert(limits, ("withindomain=%s"):format(o.base_url:getDomain() or o.base_url:getHost()))
       end
       if ( o.withinhost ) then
         table.insert(limits, ("withinhost=%s"):format(o.base_url:getHost()))

File 2 of 3:

@@ -58,6 +58,10 @@ action = function(host, port)
   -- create a new crawler instance
   local crawler = httpspider.Crawler:new( host, port, nil, { scriptname = SCRIPT_NAME } )
+  if ( not(crawler) ) then
+    return
+  end
+
   -- create a table entry in the registry
   nmap.registry.auth_urls = nmap.registry.auth_urls or {}
   crawler:set_timeout(10000)
@@ -70,7 +74,7 @@ action = function(host, port)
     -- most of them are "legitimate" and should not be reason to abort
     if ( not(status) ) then
       if ( r.err ) then
-        return stdnse.format_output(true, "ERROR: %s", r.reason)
+        return stdnse.format_output(true, ("ERROR: %s"):format(r.reason))
       else
         break
       end
@@ -101,6 +105,8 @@ action = function(host, port)
       end
     end
   end
   if ( #auth_urls > 1 ) then
-    return stdnse.format_output(true, tab.dump(auth_urls))
+    local result = { tab.dump(auth_urls) }
+    result.name = crawler:getLimitations()
+    return stdnse.format_output(true, result)
   end
 end
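
The other recurring fix in the scripts (here and in the file below) replaces stdnse.format_output(true, "ERROR: %s", r.reason) with an explicit ("ERROR: %s"):format(r.reason) — presumably because stdnse.format_output takes a status and the output data and does not treat trailing arguments as printf-style format parameters, so the substitution has to happen before the call. The replacement is plain Lua string.format sugar:

  -- ("fmt"):format(...) is the method form of string.format
  local msg = ("ERROR: %s"):format("too many failures")
  -- msg == "ERROR: too many failures"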

File 3 of 3:

@@ -45,6 +45,9 @@ function action(host, port)
     }
   )
+  if ( not(crawler) ) then
+    return
+  end
   crawler:set_timeout(10000)
   local emails = {}
@@ -54,7 +57,7 @@ function action(host, port)
     -- most of them are "legitimate" and should not be reason to abort
     if ( not(status) ) then
       if ( r.err ) then
-        return stdnse.format_output(true, "ERROR: %s", r.reason)
+        return stdnse.format_output(true, ("ERROR: %s"):format(r.reason))
       else
         break
       end