diff --git a/CHANGELOG b/CHANGELOG
index 70b8100ad..da38c72a5 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,9 @@
 # Nmap Changelog ($Id$); -*-text-*-
 
+o [NSE] Modified the httpspider library to prefetch links in the queue and
+  change how script arguments are processed. Script and library arguments are
+  now processed from within the library. [Patrik]
+
 o The --exclude and --excludefile options can be used together now. [David]
 
 o [NSE] Added the script http-apache-negotiation that detects if the Apache
diff --git a/nselib/httpspider.lua b/nselib/httpspider.lua
index b83c6ce95..78df27fb7 100644
--- a/nselib/httpspider.lua
+++ b/nselib/httpspider.lua
@@ -25,6 +25,7 @@ module(... or "httpspider", package.seeall)
 require 'http'
 
 local LIBRARY_NAME = "httpspider"
+local PREFETCH_SIZE = 5
 
 -- The Options class, handling all spidering options
 Options = {
@@ -432,7 +433,6 @@ UrlQueue = {
 
 }
 
-
 -- The Crawler class
 Crawler = {
 
@@ -440,7 +440,16 @@ Crawler = {
   -- @param host table as received by the action method
   -- @param port table as received by the action method
   -- @param url string containing the relative URL
-  -- @param options table of options
+  -- @param options table of options:
+  --        noblacklist - do not load default blacklist
+  --        base_url - start url to crawl
+  --        timeout - timeout for the http request
+  --        maxdepth - the maximum directory depth to crawl
+  --        maxpagecount - the maximum amount of pages to retrieve
+  --        withinhost - stay within the host of the base_url
+  --        withindomain - stay within the base_url domain
+  --        scriptname - should be set to SCRIPT_NAME to enable
+  --                     script specific arguments.
   -- @return o new instance of Crawler or nil on failure
   new = function(self, host, port, url, options)
     local o = {
@@ -453,6 +462,10 @@ Crawler = {
     setmetatable(o, self)
     self.__index = self
 
+    o:loadScriptArguments()
+    o:loadLibraryArguments()
+    o:loadDefaultArguments()
+
     local response = http.get(o.host, o.port, '/', { timeout = o.options.timeout } )
 
     if ( not(response) or 'table' ~= type(response) ) then
@@ -536,47 +549,116 @@ Crawler = {
   -- unexpected or requires attention we set the error property accordingly.
   -- This way the script can alert the user of the details by calling
   -- getError()
-  crawl = function(self)
+  crawl_thread = function(self, response_queue)
+    local condvar = nmap.condvar(response_queue)
 
     if ( self.options.withinhost and self.options.withindomain ) then
-      return false, { err = true, reason = "Invalid options: withinhost and withindomain can't both be true" }
+      table.insert(response_queue, { false, { err = true, reason = "Invalid options: withinhost and withindomain can't both be true" } })
+      condvar "signal"
+      return
     end
 
-    -- in case the user set a max page count to retrieve check how many
-    -- pages we have retrieved so far
-    local count = self:getPageCount()
-    if ( self.options.maxpagecount and
-      ( count > self.options.maxpagecount ) ) then
-      return false, { err = false, msg = "Reached max page count" }
+    while(true) do
+      -- in case the user set a max page count to retrieve check how many
+      -- pages we have retrieved so far
+      local count = self:getPageCount()
+      if ( self.options.maxpagecount and
+        ( count > self.options.maxpagecount ) ) then
+        table.insert(response_queue, { false, { err = false, msg = "Reached max page count" } })
+        condvar "signal"
+        return
+      end
+
+      -- pull links from the queue until we get a valid one
+      local url
+      repeat
+        url = self.urlqueue:getNext()
+      until( not(url) or not(self.processed[tostring(url)]) )
+
+      -- if no url could be retrieved from the queue, abort ...
+      if ( not(url) ) then
+        table.insert(response_queue, { false, { err = false, msg = "No more urls" } })
+        condvar "signal"
+        return
+      end
+
+      if ( self.options.maxpagecount ) then
+        stdnse.print_debug(2, "%s: Fetching url [%d of %d]: %s", LIBRARY_NAME, count, self.options.maxpagecount, tostring(url))
+      else
+        stdnse.print_debug(2, "%s: Fetching url: %s", LIBRARY_NAME, tostring(url))
+      end
+
+      -- fetch the url, and then push it to the processed table
+      local response = http.get(url:getHost(), url:getPort(), url:getFile(), { timeout = self.options.timeout } )
+      self.processed[tostring(url)] = true
+
+      -- if we have a response, proceed scraping it
+      if ( response.body ) then
+        local links = LinkExtractor:new(url, response.body, self.options):getLinks()
+        self.urlqueue:add(links)
+      end
+
+      table.insert(response_queue, { true, { url = url, response = response } } )
+      while ( PREFETCH_SIZE < #response_queue ) do
+        stdnse.print_debug(2, "%s: Response queue full, waiting ...", LIBRARY_NAME)
+        condvar "wait"
+      end
+      condvar "signal"
+    end
+  end,
+
+  -- Loads the argument set on a script level
+  loadScriptArguments = function(self)
+    local sn = self.options.scriptname
+    if ( not(sn) ) then
+      stdnse.print_debug("%s: WARNING: Script argument could not be loaded as scriptname was not set", LIBRARY_NAME)
+      return
     end
 
-    -- pull links from the queue until we get a valid one
-    local url
-    repeat
-      url = self.urlqueue:getNext()
-    until( not(url) or not(self.processed[tostring(url)]) )
+    self.options.maxdepth = tonumber(stdnse.get_script_args(sn .. ".maxdepth"))
+    self.options.maxpagecount = tonumber(stdnse.get_script_args(sn .. ".maxpagecount"))
+    self.url = stdnse.get_script_args(sn .. ".url")
+    self.options.withinhost = stdnse.get_script_args(sn .. ".withinhost")
+    self.options.withindomain = stdnse.get_script_args(sn .. ".withindomain")
+    self.options.noblacklist = stdnse.get_script_args(sn .. ".noblacklist")
+  end,
+
+  -- Loads the argument on a library level
+  loadLibraryArguments = function(self)
+    local ln = LIBRARY_NAME
+
+    self.options.maxdepth = self.options.maxdepth or tonumber(stdnse.get_script_args(ln .. ".maxdepth"))
+    self.options.maxpagecount = self.options.maxpagecount or tonumber(stdnse.get_script_args(ln .. ".maxpagecount"))
+    self.url = self.url or stdnse.get_script_args(ln .. ".url")
+    self.options.withinhost = self.options.withinhost or stdnse.get_script_args(ln .. ".withinhost")
+    self.options.withindomain = self.options.withindomain or stdnse.get_script_args(ln .. ".withindomain")
+    self.options.noblacklist = self.options.noblacklist or stdnse.get_script_args(ln .. ".noblacklist")
+  end,
+
+  -- Loads any defaults for arguments that were not set
+  loadDefaultArguments = function(self)
+    self.options.maxdepth = self.options.maxdepth or 3
+    self.options.maxpagecount = self.options.maxpagecount or 20
+    self.url = self.url or '/'
+  end,
+
+  crawl = function(self)
-    -- if no url could be retrieved from the queue, abort ...
-    if ( not(url) ) then
+    self.response_queue = self.response_queue or {}
+    local condvar = nmap.condvar(self.response_queue)
+    if ( not(self.thread) ) then
+      self.thread = stdnse.new_thread(self.crawl_thread, self, self.response_queue)
+    end
+
+    if ( #self.response_queue == 0 and coroutine.status(self.thread) ~= 'dead') then
+      condvar "wait"
+    end
+    condvar "signal"
+    if ( #self.response_queue == 0 ) then
       return false, { err = false, msg = "No more urls" }
-    end
-
-    if ( self.options.maxpagecount ) then
-      stdnse.print_debug(2, "%s: Fetching url [%d of %d]: %s", LIBRARY_NAME, count, self.options.maxpagecount, tostring(url))
     else
-      stdnse.print_debug(2, "%s: Fetching url: %s", LIBRARY_NAME, tostring(url))
+      return unpack(table.remove(self.response_queue, 1))
     end
-
-    -- fetch the url, and then push it to the processed table
-    local response = http.get(url:getHost(), url:getPort(), url:getFile(), { timeout = self.options.timeout } )
-    self.processed[tostring(url)] = true
-
-    -- if we have a response, proceed scraping it
-    if ( response.body ) then
-      local links = LinkExtractor:new(url, response.body, self.options):getLinks()
-      self.urlqueue:add(links)
-    end
-
-    return true, { url = url, response = response }
   end,
diff --git a/scripts/http-email-harvest.nse b/scripts/http-email-harvest.nse
index 167172db8..3365c65b7 100644
--- a/scripts/http-email-harvest.nse
+++ b/scripts/http-email-harvest.nse
@@ -38,31 +38,20 @@ portrule = shortport.http
 
 function action(host, port)
   local EMAIL_PATTERN = "[A-Za-z0-9%.%%%+%-]+@[A-Za-z0-9%.%%%+%-]+%.%w%w%w?%w?"
-
-  -- by default, we cap the script at a maximum depth of 3
-  local maxdepth = tonumber(stdnse.get_script_args("http-email-harvest.maxdepth")) or 3
-  -- by default, we cap the script at a maximum pagecount of 20
-  local maxpagecount = tonumber(stdnse.get_script_args("http-email-harvest.maxpagecount")) or 20
-
-  local url = stdnse.get_script_args("http-email-harvest.url") or "/"
-  local withinhost = stdnse.get_script_args("http-email-harvest.withinhost")
-  local withindomain = stdnse.get_script_args("http-email-harvest.withindomain")
+  local crawler = httpspider.Crawler:new(host, port, url or '/', {
+      scriptname = SCRIPT_NAME
+    }
+  )
+
+  crawler:set_timeout(10000)
+
+  local maxdepth, maxpagecount = crawler.options.maxdepth, crawler.options.maxpagecount
 
   if ( maxdepth < 0 ) then maxdepth = nil end
   if ( maxpagecount < 0 ) then maxpagecount = nil end
 
   stdnse.print_debug(2, "%s: Running crawler maxdepth: %s; maxpagecount: %s", SCRIPT_NAME, maxdepth or "[none]", maxpagecount or "[none]")
-
-  local crawler = httpspider.Crawler:new(host, port, url or '/', {
-      maxdepth = maxdepth,
-      maxpagecount = maxpagecount,
-      withinhost = withinhost,
-      withindomain= withindomain,
-    }
-  )
-
-  crawler:set_timeout(10000)
 
   local emails = {}
   while(true) do
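
For context, the reworked interface is consumed as follows: a script passes scriptname = SCRIPT_NAME to Crawler:new(), the library resolves <scriptname>.* arguments first, then the httpspider.* library arguments, and finally the built-in defaults (maxdepth 3, maxpagecount 20, url '/'); each call to crawl() then returns one page that crawl_thread, running as a worker started with stdnse.new_thread, has prefetched. Below is a minimal consumer sketch based only on the API shown in the diff above; the script skeleton, result handling and output formatting are illustrative and not part of this patch.

description = [[
Illustrative example: spiders the web server and lists the URLs that were
successfully retrieved.
]]

author = "example"
license = "Same as Nmap--See http://nmap.org/book/man-legal.html"
categories = {"discovery", "safe"}

require 'shortport'
require 'stdnse'
require 'httpspider'

portrule = shortport.http

action = function(host, port)
  -- scriptname lets the library pick up <SCRIPT_NAME>.maxdepth, .maxpagecount,
  -- .url, .withinhost, .withindomain and .noblacklist; unset values fall back
  -- to the httpspider.* arguments and then to the library defaults.
  local crawler = httpspider.Crawler:new(host, port, '/', {
      scriptname = SCRIPT_NAME
    }
  )
  crawler:set_timeout(10000)

  local visited = {}
  while ( true ) do
    -- each call pops one prefetched page from the worker's response queue
    local status, r = crawler:crawl()
    if ( not(status) ) then
      -- r.err signals a real error (e.g. conflicting options); otherwise the
      -- queue is simply exhausted
      if ( r.err ) then
        return ("ERROR: %s"):format(r.reason)
      end
      break
    end
    if ( r.response and r.response.body ) then
      table.insert(visited, tostring(r.url))
    end
  end
  return stdnse.format_output(true, visited)
end

The prefetching stays transparent to callers: crawl() keeps the old one-page-per-call contract, while the worker fills the response queue ahead of time and blocks on the condition variable once PREFETCH_SIZE responses are waiting.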