
o [NSE] Modified the httpspider library to prefetch links in the queue and
  change how script arguments are processed. Script and library arguments are
  now processed from within the library. [Patrik]
patrik committed 2011-12-09 15:48:19 +00:00
parent 6cbd5a9a58
commit e20a1b5174
3 changed files with 127 additions and 52 deletions
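
For context, a minimal sketch (not part of the commit) of the script-side pattern this change enables, modeled on the http-email-harvest diff below; the script name http-example and the output handling are illustrative. With scriptname set, the library itself resolves http-example.* script arguments first, then httpspider.* library arguments, then built-in defaults (maxdepth 3, maxpagecount 20, url '/').

-- Sketch only: illustrates the new usage, not code from this commit.
description = [[ Example consumer of the httpspider library. ]]
author = "example"
categories = { "discovery" }

require 'shortport'
require 'stdnse'
require 'httpspider'

portrule = shortport.http

action = function(host, port)
  -- scriptname = SCRIPT_NAME lets the library read http-example.maxdepth,
  -- http-example.maxpagecount, http-example.url, http-example.withinhost,
  -- http-example.withindomain and http-example.noblacklist itself, falling
  -- back to the httpspider.* arguments and finally to the built-in defaults.
  local crawler = httpspider.Crawler:new(host, port, '/', { scriptname = SCRIPT_NAME })
  if ( not(crawler) ) then return end
  crawler:set_timeout(10000)

  local urls = {}
  while ( true ) do
    -- crawl() now pops prefetched pages from a queue filled by a worker thread;
    -- r.url is the crawled URL object, r.response the http response table
    local status, r = crawler:crawl()
    if ( not(status) ) then break end
    table.insert(urls, tostring(r.url))
  end
  return stdnse.format_output(true, urls)
end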

CHANGELOG

@@ -1,5 +1,9 @@
 # Nmap Changelog ($Id$); -*-text-*-
+o [NSE] Modified the httpspider library to prefetch links in the queue and
+  change how script arguments are processed. Script and library arguments are
+  now processed from within the library. [Patrik]
 o The --exclude and --excludefile options can be used together now. [David]
 o [NSE] Added the script http-apache-negotiation that detects if the Apache

nselib/httpspider.lua

@@ -25,6 +25,7 @@ module(... or "httpspider", package.seeall)
 require 'http'
 local LIBRARY_NAME = "httpspider"
+local PREFETCH_SIZE = 5
 -- The Options class, handling all spidering options
 Options = {
@@ -432,7 +433,6 @@ UrlQueue = {
 }
 -- The Crawler class
 Crawler = {
@@ -440,7 +440,16 @@ Crawler = {
 -- @param host table as received by the action method
 -- @param port table as received by the action method
 -- @param url string containing the relative URL
--- @param options table of options
+-- @param options table of options:
+--   <code>noblacklist</code> - do not load default blacklist
+--   <code>base_url</code> - start url to crawl
+--   <code>timeout</code> - timeout for the http request
+--   <code>maxdepth</code> - the maximum directory depth to crawl
+--   <code>maxpagecount</code> - the maximum amount of pages to retrieve
+--   <code>withinhost</code> - stay within the host of the base_url
+--   <code>withindomain</code> - stay within the base_url domain
+--   <code>scriptname</code> - should be set to SCRIPT_NAME to enable
+--     script specific arguments.
 -- @return o new instance of Crawler or nil on failure
 new = function(self, host, port, url, options)
   local o = {
@@ -453,6 +462,10 @@ Crawler = {
   setmetatable(o, self)
   self.__index = self
+  o:loadScriptArguments()
+  o:loadLibraryArguments()
+  o:loadDefaultArguments()
   local response = http.get(o.host, o.port, '/', { timeout = o.options.timeout } )
   if ( not(response) or 'table' ~= type(response) ) then
@@ -536,18 +549,24 @@ Crawler = {
 -- unexpected or requires attention we set the error property accordingly.
 -- This way the script can alert the user of the details by calling
 -- getError()
-crawl = function(self)
+crawl_thread = function(self, response_queue)
+  local condvar = nmap.condvar(response_queue)
   if ( self.options.withinhost and self.options.withindomain ) then
-    return false, { err = true, reason = "Invalid options: withinhost and withindomain can't both be true" }
+    table.insert(response_queue, { false, { err = true, reason = "Invalid options: withinhost and withindomain can't both be true" } })
+    condvar "signal"
+    return
   end
+  while(true) do
     -- in case the user set a max page count to retrieve check how many
     -- pages we have retrieved so far
     local count = self:getPageCount()
     if ( self.options.maxpagecount and
          ( count > self.options.maxpagecount ) ) then
-      return false, { err = false, msg = "Reached max page count" }
+      table.insert(response_queue, { false, { err = false, msg = "Reached max page count" } })
+      condvar "signal"
+      return
     end
     -- pull links from the queue until we get a valid one
@@ -558,7 +577,9 @@ Crawler = {
     -- if no url could be retrieved from the queue, abort ...
     if ( not(url) ) then
-      return false, { err = false, msg = "No more urls" }
+      table.insert(response_queue, { false, { err = false, msg = "No more urls" } })
+      condvar "signal"
+      return
     end
     if ( self.options.maxpagecount ) then
@@ -576,7 +597,68 @@ Crawler = {
       local links = LinkExtractor:new(url, response.body, self.options):getLinks()
       self.urlqueue:add(links)
     end
-    return true, { url = url, response = response }
+    table.insert(response_queue, { true, { url = url, response = response } } )
+    while ( PREFETCH_SIZE < #response_queue ) do
+      stdnse.print_debug(2, "%s: Response queue full, waiting ...", LIBRARY_NAME)
+      condvar "wait"
+    end
+    condvar "signal"
+  end
+end,
+
+-- Loads the argument set on a script level
+loadScriptArguments = function(self)
+  local sn = self.options.scriptname
+  if ( not(sn) ) then
+    stdnse.print_debug("%s: WARNING: Script argument could not be loaded as scriptname was not set", LIBRARY_NAME)
+    return
+  end
+  self.options.maxdepth = tonumber(stdnse.get_script_args(sn .. ".maxdepth"))
+  self.options.maxpagecount = tonumber(stdnse.get_script_args(sn .. ".maxpagecount"))
+  self.url = stdnse.get_script_args(sn .. ".url")
+  self.options.withinhost = stdnse.get_script_args(sn .. ".withinhost")
+  self.options.withindomain = stdnse.get_script_args(sn .. ".withindomain")
+  self.options.noblacklist = stdnse.get_script_args(sn .. ".noblacklist")
+end,
+
+-- Loads the argument on a library level
+loadLibraryArguments = function(self)
+  local ln = LIBRARY_NAME
+  self.options.maxdepth = self.options.maxdepth or tonumber(stdnse.get_script_args(ln .. ".maxdepth"))
+  self.options.maxpagecount = self.options.maxpagecount or tonumber(stdnse.get_script_args(ln .. ".maxpagecount"))
+  self.url = self.url or stdnse.get_script_args(ln .. ".url")
+  self.options.withinhost = self.options.withinhost or stdnse.get_script_args(ln .. ".withinhost")
+  self.options.withindomain = self.options.withindomain or stdnse.get_script_args(ln .. ".withindomain")
+  self.options.noblacklist = self.options.noblacklist or stdnse.get_script_args(ln .. ".noblacklist")
+end,
+
+-- Loads any defaults for arguments that were not set
+loadDefaultArguments = function(self)
+  self.options.maxdepth = self.options.maxdepth or 3
+  self.options.maxpagecount = self.options.maxpagecount or 20
+  self.url = self.url or '/'
+end,
+
+crawl = function(self)
+  self.response_queue = self.response_queue or {}
+  local condvar = nmap.condvar(self.response_queue)
+  if ( not(self.thread) ) then
+    self.thread = stdnse.new_thread(self.crawl_thread, self, self.response_queue)
+  end
+  if ( #self.response_queue == 0 and coroutine.status(self.thread) ~= 'dead') then
+    condvar "wait"
+  end
+  condvar "signal"
+  if ( #self.response_queue == 0 ) then
+    return false, { err = false, msg = "No more urls" }
+  else
+    return unpack(table.remove(self.response_queue, 1))
+  end
 end,
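
The new crawl()/crawl_thread() pair above forms a producer/consumer: crawl_thread() runs as a separate NSE thread, pushes results into a shared queue until PREFETCH_SIZE is exceeded, and both sides coordinate through nmap.condvar on that queue. Below is a stripped-down sketch of the same pattern with hypothetical producer/consume_all helpers; it is an illustration, not code from this commit.

-- Illustration only: the queue-plus-condvar pattern used by crawl_thread()/crawl().
require 'stdnse'

local QUEUE_LIMIT = 5 -- plays the role of PREFETCH_SIZE

-- producer: runs as a separate NSE thread and fills the shared queue
local function producer(queue, items)
  local condvar = nmap.condvar(queue)
  for _, item in ipairs(items) do
    table.insert(queue, item)
    while ( QUEUE_LIMIT < #queue ) do
      condvar "wait"   -- queue full; block until the consumer drains it
    end
    condvar "signal"   -- wake the consumer if it is waiting for data
  end
  condvar "signal"     -- final wake-up so the consumer can see the thread die
end

-- consumer: called from a script's action function
local function consume_all(items)
  local queue = {}
  local condvar = nmap.condvar(queue)
  local thread = stdnse.new_thread(producer, queue, items)
  local results = {}
  while ( true ) do
    if ( #queue == 0 and coroutine.status(thread) ~= 'dead' ) then
      condvar "wait"   -- nothing buffered yet; let the producer run
    end
    condvar "signal"   -- unblock a producer waiting on a full queue
    if ( #queue == 0 ) then break end -- producer finished and queue drained
    table.insert(results, table.remove(queue, 1))
  end
  return results
end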

scripts/http-email-harvest.nse

@@ -39,31 +39,20 @@ portrule = shortport.http
 function action(host, port)
   local EMAIL_PATTERN = "[A-Za-z0-9%.%%%+%-]+@[A-Za-z0-9%.%%%+%-]+%.%w%w%w?%w?"
-  -- by default, we cap the script at a maximum depth of 3
-  local maxdepth = tonumber(stdnse.get_script_args("http-email-harvest.maxdepth")) or 3
-  -- by default, we cap the script at a maximum pagecount of 20
-  local maxpagecount = tonumber(stdnse.get_script_args("http-email-harvest.maxpagecount")) or 20
-  local url = stdnse.get_script_args("http-email-harvest.url") or "/"
-  local withinhost = stdnse.get_script_args("http-email-harvest.withinhost")
-  local withindomain = stdnse.get_script_args("http-email-harvest.withindomain")
+  local crawler = httpspider.Crawler:new(host, port, url or '/', {
+    scriptname = SCRIPT_NAME
+  }
+  )
+  crawler:set_timeout(10000)
+  local maxdepth, maxpagecount = crawler.options.maxdepth, crawler.options.maxpagecount
   if ( maxdepth < 0 ) then maxdepth = nil end
   if ( maxpagecount < 0 ) then maxpagecount = nil end
   stdnse.print_debug(2, "%s: Running crawler maxdepth: %s; maxpagecount: %s",
     SCRIPT_NAME, maxdepth or "[none]", maxpagecount or "[none]")
-  local crawler = httpspider.Crawler:new(host, port, url or '/', {
-    maxdepth = maxdepth,
-    maxpagecount = maxpagecount,
-    withinhost = withinhost,
-    withindomain= withindomain,
-  }
-  )
-  crawler:set_timeout(10000)
   local emails = {}
   while(true) do
     local status, r = crawler:crawl()