diff --git a/CHANGELOG b/CHANGELOG
index 70b8100ad..da38c72a5 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,9 @@
# Nmap Changelog ($Id$); -*-text-*-
+o [NSE] Modified the httpspider library to prefetch queued pages in a separate
+  worker thread and changed how script arguments are processed: script- and
+  library-level arguments are now handled from within the library. [Patrik]
+
o The --exclude and --excludefile options can be used together now. [David]
o [NSE] Added the script http-apache-negotiation that detects if the Apache
diff --git a/nselib/httpspider.lua b/nselib/httpspider.lua
index b83c6ce95..78df27fb7 100644
--- a/nselib/httpspider.lua
+++ b/nselib/httpspider.lua
@@ -25,6 +25,7 @@ module(... or "httpspider", package.seeall)
require 'http'
local LIBRARY_NAME = "httpspider"
+local PREFETCH_SIZE = 5
-- The Options class, handling all spidering options
Options = {
@@ -432,7 +433,6 @@ UrlQueue = {
}
-
-- The Crawler class
Crawler = {
@@ -440,7 +440,16 @@ Crawler = {
-- @param host table as received by the action method
-- @param port table as received by the action method
-- @param url string containing the relative URL
- -- @param options table of options
+ -- @param options table of options (see the usage example below):
+ --   noblacklist - do not load the default blacklist
+ --   base_url - the base URL to start crawling from
+ --   timeout - timeout, in milliseconds, for each HTTP request
+ --   maxdepth - the maximum directory depth to crawl
+ --   maxpagecount - the maximum number of pages to retrieve
+ --   withinhost - stay within the host of the base_url
+ --   withindomain - stay within the domain of the base_url
+ --   scriptname - should be set to SCRIPT_NAME to enable
+ --     script-specific arguments
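+ -- @usage
+ -- -- a minimal sketch of the new interface, modelled on the
+ -- -- http-email-harvest change in this patch (SCRIPT_NAME is set by NSE):
+ -- local crawler = httpspider.Crawler:new(host, port, '/', { scriptname = SCRIPT_NAME })
+ -- crawler:set_timeout(10000)
+ -- while ( true ) do
+ --   local status, r = crawler:crawl()
+ --   if ( not(status) ) then break end
+ --   -- r.url and r.response hold the fetched URL and its HTTP response
+ -- end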
-- @return o new instance of Crawler or nil on failure
new = function(self, host, port, url, options)
local o = {
@@ -453,6 +462,10 @@ Crawler = {
setmetatable(o, self)
self.__index = self
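+ -- collect the spider settings: script-level arguments are read first,
+ -- library-level (httpspider.*) arguments fill in anything still unset,
+ -- and built-in defaults cover the rest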
+ o:loadScriptArguments()
+ o:loadLibraryArguments()
+ o:loadDefaultArguments()
+
local response = http.get(o.host, o.port, '/', { timeout = o.options.timeout } )
if ( not(response) or 'table' ~= type(response) ) then
@@ -536,47 +549,116 @@ Crawler = {
-- unexpected or requires attention we set the error property accordingly.
-- This way the script can alert the user of the details by calling
-- getError()
- crawl = function(self)
+ crawl_thread = function(self, response_queue)
+ local condvar = nmap.condvar(response_queue)
if ( self.options.withinhost and self.options.withindomain ) then
- return false, { err = true, reason = "Invalid options: withinhost and withindomain can't both be true" }
+ table.insert(response_queue, { false, { err = true, reason = "Invalid options: withinhost and withindomain can't both be true" } })
+ condvar "signal"
+ return
end
- -- in case the user set a max page count to retrieve check how many
- -- pages we have retrieved so far
- local count = self:getPageCount()
- if ( self.options.maxpagecount and
- ( count > self.options.maxpagecount ) ) then
- return false, { err = false, msg = "Reached max page count" }
+ while(true) do
+ -- in case the user set a max page count to retrieve check how many
+ -- pages we have retrieved so far
+ local count = self:getPageCount()
+ if ( self.options.maxpagecount and
+ ( count > self.options.maxpagecount ) ) then
+ table.insert(response_queue, { false, { err = false, msg = "Reached max page count" } })
+ condvar "signal"
+ return
+ end
+
+ -- pull links from the queue until we get a valid one
+ local url
+ repeat
+ url = self.urlqueue:getNext()
+ until( not(url) or not(self.processed[tostring(url)]) )
+
+ -- if no url could be retrieved from the queue, abort ...
+ if ( not(url) ) then
+ table.insert(response_queue, { false, { err = false, msg = "No more urls" } })
+ condvar "signal"
+ return
+ end
+
+ if ( self.options.maxpagecount ) then
+ stdnse.print_debug(2, "%s: Fetching url [%d of %d]: %s", LIBRARY_NAME, count, self.options.maxpagecount, tostring(url))
+ else
+ stdnse.print_debug(2, "%s: Fetching url: %s", LIBRARY_NAME, tostring(url))
+ end
+
+ -- fetch the url, and then push it to the processed table
+ local response = http.get(url:getHost(), url:getPort(), url:getFile(), { timeout = self.options.timeout } )
+ self.processed[tostring(url)] = true
+
+ -- if we have a response, proceed scraping it
+ if ( response and response.body ) then
+ local links = LinkExtractor:new(url, response.body, self.options):getLinks()
+ self.urlqueue:add(links)
+ end
+
+ table.insert(response_queue, { true, { url = url, response = response } } )
+ while ( PREFETCH_SIZE < #response_queue ) do
+ stdnse.print_debug(2, "%s: Response queue full, waiting ...", LIBRARY_NAME)
+ condvar "wait"
+ end
+ condvar "signal"
+ end
+ end,
+
+ -- Loads any arguments set at the script level (<scriptname>.<argname>)
+ loadScriptArguments = function(self)
+ local sn = self.options.scriptname
+ if ( not(sn) ) then
+ stdnse.print_debug("%s: WARNING: Script argument could not be loaded as scriptname was not set", LIBRARY_NAME)
+ return
end
- -- pull links from the queue until we get a valid one
- local url
- repeat
- url = self.urlqueue:getNext()
- until( not(url) or not(self.processed[tostring(url)]) )
+ self.options.maxdepth = tonumber(stdnse.get_script_args(sn .. ".maxdepth"))
+ self.options.maxpagecount = tonumber(stdnse.get_script_args(sn .. ".maxpagecount"))
+ self.url = stdnse.get_script_args(sn .. ".url")
+ self.options.withinhost = stdnse.get_script_args(sn .. ".withinhost")
+ self.options.withindomain = stdnse.get_script_args(sn .. ".withindomain")
+ self.options.noblacklist = stdnse.get_script_args(sn .. ".noblacklist")
+ end,
+
+ -- Loads any arguments set at the library level (httpspider.<argname>)
+ loadLibraryArguments = function(self)
+ local ln = LIBRARY_NAME
+
+ self.options.maxdepth = self.options.maxdepth or tonumber(stdnse.get_script_args(ln .. ".maxdepth"))
+ self.options.maxpagecount = self.options.maxpagecount or tonumber(stdnse.get_script_args(ln .. ".maxpagecount"))
+ self.url = self.url or stdnse.get_script_args(ln .. ".url")
+ self.options.withinhost = self.options.withinhost or stdnse.get_script_args(ln .. ".withinhost")
+ self.options.withindomain = self.options.withindomain or stdnse.get_script_args(ln .. ".withindomain")
+ self.options.noblacklist = self.options.noblacklist or stdnse.get_script_args(ln .. ".noblacklist")
+ end,
+
+ -- Loads any defaults for arguments that were not set
+ loadDefaultArguments = function(self)
+ self.options.maxdepth = self.options.maxdepth or 3
+ self.options.maxpagecount = self.options.maxpagecount or 20
+ self.url = self.url or '/'
+ end,
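+
+ -- Illustrative example of the resulting precedence: for a script that sets
+ -- scriptname = "http-email-harvest", running with
+ --   --script-args http-email-harvest.maxdepth=5,httpspider.maxdepth=2
+ -- yields maxdepth = 5; httpspider.* values are only used when the
+ -- script-level argument is absent, and the defaults above cover anything
+ -- still unset.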
+
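+ -- Fetches the next response from the spider. The first call starts
+ -- crawl_thread as a background worker (through stdnse.new_thread), which
+ -- prefetches up to PREFETCH_SIZE responses into self.response_queue; each
+ -- call then pops the next queued result, waiting if the queue is empty and
+ -- the worker is still running.
+ -- @return status true on success, false once crawling has finished or failed
+ -- @return response table containing <code>url</code> and <code>response</code>,
+ --         or, when crawling stops, a table describing why (err, reason or msg)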
+ crawl = function(self)
- -- if no url could be retrieved from the queue, abort ...
- if ( not(url) ) then
+ self.response_queue = self.response_queue or {}
+ local condvar = nmap.condvar(self.response_queue)
+ if ( not(self.thread) ) then
+ self.thread = stdnse.new_thread(self.crawl_thread, self, self.response_queue)
+ end
+
+ if ( #self.response_queue == 0 and coroutine.status(self.thread) ~= 'dead') then
+ condvar "wait"
+ end
+ condvar "signal"
+ if ( #self.response_queue == 0 ) then
return false, { err = false, msg = "No more urls" }
- end
-
- if ( self.options.maxpagecount ) then
- stdnse.print_debug(2, "%s: Fetching url [%d of %d]: %s", LIBRARY_NAME, count, self.options.maxpagecount, tostring(url))
else
- stdnse.print_debug(2, "%s: Fetching url: %s", LIBRARY_NAME, tostring(url))
+ return unpack(table.remove(self.response_queue, 1))
end
-
- -- fetch the url, and then push it to the processed table
- local response = http.get(url:getHost(), url:getPort(), url:getFile(), { timeout = self.options.timeout } )
- self.processed[tostring(url)] = true
-
- -- if we have a response, proceed scraping it
- if ( response.body ) then
- local links = LinkExtractor:new(url, response.body, self.options):getLinks()
- self.urlqueue:add(links)
- end
- return true, { url = url, response = response }
end,
diff --git a/scripts/http-email-harvest.nse b/scripts/http-email-harvest.nse
index 167172db8..3365c65b7 100644
--- a/scripts/http-email-harvest.nse
+++ b/scripts/http-email-harvest.nse
@@ -38,31 +38,20 @@ portrule = shortport.http
function action(host, port)
local EMAIL_PATTERN = "[A-Za-z0-9%.%%%+%-]+@[A-Za-z0-9%.%%%+%-]+%.%w%w%w?%w?"
-
- -- by default, we cap the script at a maximum depth of 3
- local maxdepth = tonumber(stdnse.get_script_args("http-email-harvest.maxdepth")) or 3
- -- by default, we cap the script at a maximum pagecount of 20
- local maxpagecount = tonumber(stdnse.get_script_args("http-email-harvest.maxpagecount")) or 20
-
- local url = stdnse.get_script_args("http-email-harvest.url") or "/"
- local withinhost = stdnse.get_script_args("http-email-harvest.withinhost")
- local withindomain = stdnse.get_script_args("http-email-harvest.withindomain")
+ local crawler = httpspider.Crawler:new(host, port, '/', {
+ scriptname = SCRIPT_NAME
+ }
+ )
+
+ crawler:set_timeout(10000)
+
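+ -- maxdepth, maxpagecount, url, withinhost and withindomain are now read by
+ -- the httpspider library itself (from http-email-harvest.* or httpspider.*
+ -- script arguments); the locals below are only used for the debug message.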
+ local maxdepth, maxpagecount = crawler.options.maxdepth, crawler.options.maxpagecount
if ( maxdepth < 0 ) then maxdepth = nil end
if ( maxpagecount < 0 ) then maxpagecount = nil end
stdnse.print_debug(2, "%s: Running crawler maxdepth: %s; maxpagecount: %s",
SCRIPT_NAME, maxdepth or "[none]", maxpagecount or "[none]")
-
- local crawler = httpspider.Crawler:new(host, port, url or '/', {
- maxdepth = maxdepth,
- maxpagecount = maxpagecount,
- withinhost = withinhost,
- withindomain= withindomain,
- }
- )
-
- crawler:set_timeout(10000)
local emails = {}
while(true) do