o [NSE] Modified the httpspider library to prefetch links in the queue and
change how script arguments are processed. Script and library arguments are now processed from within the library. [Patrik]
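
For orientation, the sketch below shows roughly how a script is expected to drive the reworked library after this change. It is illustrative only and not part of the commit: the httpspider calls (Crawler:new with a scriptname option, set_timeout, the crawl loop and its status/result pairs) come from the hunks that follow, while the script skeleton around them (description, author, categories, the URL-collecting output) is placeholder scaffolding modelled on the http-email-harvest.nse hunk at the end.

-- Illustrative sketch, not part of this commit: a minimal spider-driven
-- script using the reworked httpspider API. Everything outside the
-- httpspider calls is placeholder scaffolding.
description = [[Placeholder spider-based example script.]]
author = "placeholder"
categories = {"discovery"}

require 'shortport'
require 'stdnse'
require 'httpspider'

portrule = shortport.http

action = function(host, port)
  -- Passing scriptname makes the library read <scriptname>.maxdepth,
  -- <scriptname>.maxpagecount, <scriptname>.url and friends by itself,
  -- then fall back to httpspider.* arguments and finally to its defaults.
  local crawler = httpspider.Crawler:new(host, port, '/', { scriptname = SCRIPT_NAME })
  if ( not(crawler) ) then return end
  crawler:set_timeout(10000)

  local urls = {}
  while ( true ) do
    -- crawl() now hands out responses prefetched by the crawl_thread worker
    local status, r = crawler:crawl()
    if ( not(status) ) then
      if ( r.err ) then return ("ERROR: %s"):format(r.reason) end
      break   -- r.msg is an ordinary stop condition such as "No more urls"
    end
    table.insert(urls, tostring(r.url))
  end
  return table.concat(urls, "\n")
end
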
@@ -1,5 +1,9 @@
 # Nmap Changelog ($Id$); -*-text-*-
 
+o [NSE] Modified the httpspider library to prefetch links in the queue and
+  change how script arguments are processed. Script and library arguments are
+  now processed from within the library. [Patrik]
+
 o The --exclude and --excludefile options can be used together now. [David]
 
 o [NSE] Added the script http-apache-negotiation that detects if the Apache

@@ -25,6 +25,7 @@ module(... or "httpspider", package.seeall)
 require 'http'
 
 local LIBRARY_NAME = "httpspider"
+local PREFETCH_SIZE = 5
 
 -- The Options class, handling all spidering options
 Options = {

@@ -432,7 +433,6 @@ UrlQueue = {
 
 }
 
-
 -- The Crawler class
 Crawler = {
 

@@ -440,7 +440,16 @@ Crawler = {
   -- @param host table as received by the action method
   -- @param port table as received by the action method
   -- @param url string containing the relative URL
-  -- @param options table of options
+  -- @param options table of options:
+  --        <code>noblacklist</code> - do not load default blacklist
+  --        <code>base_url</code> - start url to crawl
+  --        <code>timeout</code> - timeout for the http request
+  --        <code>maxdepth</code> - the maximum directory depth to crawl
+  --        <code>maxpagecount</code> - the maximum amount of pages to retrieve
+  --        <code>withinhost</code> - stay within the host of the base_url
+  --        <code>withindomain</code> - stay within the base_url domain
+  --        <code>scriptname</code> - should be set to SCRIPT_NAME to enable
+  --                                  script specific arguments.
   -- @return o new instance of Crawler or nil on failure
   new = function(self, host, port, url, options)
     local o = {

@@ -453,6 +462,10 @@ Crawler = {
     setmetatable(o, self)
     self.__index = self
 
+    o:loadScriptArguments()
+    o:loadLibraryArguments()
+    o:loadDefaultArguments()
+
     local response = http.get(o.host, o.port, '/', { timeout = o.options.timeout } )
 
     if ( not(response) or 'table' ~= type(response) ) then

@@ -536,47 +549,116 @@ Crawler = {
   -- unexpected or requires attention we set the error property accordingly.
   -- This way the script can alert the user of the details by calling
   -- getError()
-  crawl = function(self)
+  crawl_thread = function(self, response_queue)
+    local condvar = nmap.condvar(response_queue)
+
     if ( self.options.withinhost and self.options.withindomain ) then
-      return false, { err = true, reason = "Invalid options: withinhost and withindomain can't both be true" }
+      table.insert(response_queue, { false, { err = true, reason = "Invalid options: withinhost and withindomain can't both be true" } })
+      condvar "signal"
+      return
     end
 
-    -- in case the user set a max page count to retrieve check how many
-    -- pages we have retrieved so far
-    local count = self:getPageCount()
-    if ( self.options.maxpagecount and
-      ( count > self.options.maxpagecount ) ) then
-      return false, { err = false, msg = "Reached max page count" }
-    end
-
-    -- pull links from the queue until we get a valid one
-    local url
-    repeat
-      url = self.urlqueue:getNext()
-    until( not(url) or not(self.processed[tostring(url)]) )
-
-    -- if no url could be retrieved from the queue, abort ...
-    if ( not(url) ) then
-      return false, { err = false, msg = "No more urls" }
-    end
-
-    if ( self.options.maxpagecount ) then
-      stdnse.print_debug(2, "%s: Fetching url [%d of %d]: %s", LIBRARY_NAME, count, self.options.maxpagecount, tostring(url))
-    else
-      stdnse.print_debug(2, "%s: Fetching url: %s", LIBRARY_NAME, tostring(url))
-    end
-
-    -- fetch the url, and then push it to the processed table
-    local response = http.get(url:getHost(), url:getPort(), url:getFile(), { timeout = self.options.timeout } )
-    self.processed[tostring(url)] = true
-
-    -- if we have a response, proceed scraping it
-    if ( response.body ) then
-      local links = LinkExtractor:new(url, response.body, self.options):getLinks()
-      self.urlqueue:add(links)
-    end
-
-    return true, { url = url, response = response }
+    while(true) do
+      -- in case the user set a max page count to retrieve check how many
+      -- pages we have retrieved so far
+      local count = self:getPageCount()
+      if ( self.options.maxpagecount and
+        ( count > self.options.maxpagecount ) ) then
+        table.insert(response_queue, { false, { err = false, msg = "Reached max page count" } })
+        condvar "signal"
+        return
+      end
+
+      -- pull links from the queue until we get a valid one
+      local url
+      repeat
+        url = self.urlqueue:getNext()
+      until( not(url) or not(self.processed[tostring(url)]) )
+
+      -- if no url could be retrieved from the queue, abort ...
+      if ( not(url) ) then
+        table.insert(response_queue, { false, { err = false, msg = "No more urls" } })
+        condvar "signal"
+        return
+      end
+
+      if ( self.options.maxpagecount ) then
+        stdnse.print_debug(2, "%s: Fetching url [%d of %d]: %s", LIBRARY_NAME, count, self.options.maxpagecount, tostring(url))
+      else
+        stdnse.print_debug(2, "%s: Fetching url: %s", LIBRARY_NAME, tostring(url))
+      end
+
+      -- fetch the url, and then push it to the processed table
+      local response = http.get(url:getHost(), url:getPort(), url:getFile(), { timeout = self.options.timeout } )
+      self.processed[tostring(url)] = true
+
+      -- if we have a response, proceed scraping it
+      if ( response.body ) then
+        local links = LinkExtractor:new(url, response.body, self.options):getLinks()
+        self.urlqueue:add(links)
+      end
+
+      table.insert(response_queue, { true, { url = url, response = response } } )
+      while ( PREFETCH_SIZE < #response_queue ) do
+        stdnse.print_debug(2, "%s: Response queue full, waiting ...", LIBRARY_NAME)
+        condvar "wait"
+      end
+      condvar "signal"
+    end
+  end,
+
+  -- Loads the argument set on a script level
+  loadScriptArguments = function(self)
+    local sn = self.options.scriptname
+    if ( not(sn) ) then
+      stdnse.print_debug("%s: WARNING: Script argument could not be loaded as scriptname was not set", LIBRARY_NAME)
+      return
+    end
+
+    self.options.maxdepth = tonumber(stdnse.get_script_args(sn .. ".maxdepth"))
+    self.options.maxpagecount = tonumber(stdnse.get_script_args(sn .. ".maxpagecount"))
+    self.url = stdnse.get_script_args(sn .. ".url")
+    self.options.withinhost = stdnse.get_script_args(sn .. ".withinhost")
+    self.options.withindomain = stdnse.get_script_args(sn .. ".withindomain")
+    self.options.noblacklist = stdnse.get_script_args(sn .. ".noblacklist")
+  end,
+
+  -- Loads the argument on a library level
+  loadLibraryArguments = function(self)
+    local ln = LIBRARY_NAME
+
+    self.options.maxdepth = self.options.maxdepth or tonumber(stdnse.get_script_args(ln .. ".maxdepth"))
+    self.options.maxpagecount = self.options.maxpagecount or tonumber(stdnse.get_script_args(ln .. ".maxpagecount"))
+    self.url = self.url or stdnse.get_script_args(ln .. ".url")
+    self.options.withinhost = self.options.withinhost or stdnse.get_script_args(ln .. ".withinhost")
+    self.options.withindomain = self.options.withindomain or stdnse.get_script_args(ln .. ".withindomain")
+    self.options.noblacklist = self.options.noblacklist or stdnse.get_script_args(ln .. ".noblacklist")
+  end,
+
+  -- Loads any defaults for arguments that were not set
+  loadDefaultArguments = function(self)
+    self.options.maxdepth = self.options.maxdepth or 3
+    self.options.maxpagecount = self.options.maxpagecount or 20
+    self.url = self.url or '/'
+  end,
+
+  crawl = function(self)
+
+    self.response_queue = self.response_queue or {}
+    local condvar = nmap.condvar(self.response_queue)
+    if ( not(self.thread) ) then
+      self.thread = stdnse.new_thread(self.crawl_thread, self, self.response_queue)
+    end
+
+    if ( #self.response_queue == 0 and coroutine.status(self.thread) ~= 'dead') then
+      condvar "wait"
+    end
+    condvar "signal"
+    if ( #self.response_queue == 0 ) then
+      return false, { err = false, msg = "No more urls" }
+    else
+      return unpack(table.remove(self.response_queue, 1))
+    end
   end,

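Taken together, the three loaders added above give every option a fixed resolution order: a script-scoped argument wins, then a library-wide httpspider.* argument, then the built-in default applied by loadDefaultArguments. The snippet below only restates that chain for maxdepth; it is not code from the commit, the helper name resolved_maxdepth is ours, and sn stands for whatever name the calling script passed as scriptname (for example "http-email-harvest", as in the hunk that follows).

require 'stdnse'

-- Restatement of the precedence implemented by the three loaders, here for
-- maxdepth only. "sn" is the scriptname the script passed in; LIBRARY_NAME
-- is "httpspider".
local function resolved_maxdepth(sn)
  return tonumber(stdnse.get_script_args(sn .. ".maxdepth"))       -- script level wins
      or tonumber(stdnse.get_script_args("httpspider.maxdepth"))   -- then library level
      or 3                                                         -- then the default
end

In practice this means a user can set httpspider.maxpagecount once to affect every spidering script, and still override it for a single script with, say, http-email-harvest.maxpagecount.
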
@@ -39,31 +39,20 @@ portrule = shortport.http
 function action(host, port)
   local EMAIL_PATTERN = "[A-Za-z0-9%.%%%+%-]+@[A-Za-z0-9%.%%%+%-]+%.%w%w%w?%w?"
 
-  -- by default, we cap the script at a maximum depth of 3
-  local maxdepth = tonumber(stdnse.get_script_args("http-email-harvest.maxdepth")) or 3
-  -- by default, we cap the script at a maximum pagecount of 20
-  local maxpagecount = tonumber(stdnse.get_script_args("http-email-harvest.maxpagecount")) or 20
-
-  local url = stdnse.get_script_args("http-email-harvest.url") or "/"
-  local withinhost = stdnse.get_script_args("http-email-harvest.withinhost")
-  local withindomain = stdnse.get_script_args("http-email-harvest.withindomain")
+  local crawler = httpspider.Crawler:new(host, port, url or '/', {
+      scriptname = SCRIPT_NAME
+    }
+  )
+
+  crawler:set_timeout(10000)
 
+  local maxdepth, maxpagecount = crawler.options.maxdepth, crawler.options.maxpagecount
   if ( maxdepth < 0 ) then maxdepth = nil end
   if ( maxpagecount < 0 ) then maxpagecount = nil end
 
   stdnse.print_debug(2, "%s: Running crawler maxdepth: %s; maxpagecount: %s",
     SCRIPT_NAME, maxdepth or "[none]", maxpagecount or "[none]")
 
-  local crawler = httpspider.Crawler:new(host, port, url or '/', {
-    maxdepth = maxdepth,
-    maxpagecount = maxpagecount,
-    withinhost = withinhost,
-    withindomain= withindomain,
-    }
-  )
-
-  crawler:set_timeout(10000)
-
   local emails = {}
   while(true) do
     local status, r = crawler:crawl()

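Two design points in the hunks above are worth spelling out; this note is commentary, not part of the commit. First, crawl_thread is a producer started once via stdnse.new_thread: it keeps fetching until PREFETCH_SIZE (5) responses are queued, then waits on the condition variable, so pages are downloaded ahead of the script consuming them. Second, crawl() keeps the old return contract: it pops one queued entry and unpacks it into the same (status, result) pair the previous blocking implementation returned, which is presumably why callers such as http-email-harvest only need the changes shown above. The entry shapes, as pushed by crawl_thread:

-- Queue entries pushed by crawl_thread and unpacked by crawl():
--   { true,  { url = <url object>, response = <return value of http.get> } }  -- a fetched page
--   { false, { err = false, msg = "No more urls" } }                          -- normal end of crawl
--   { false, { err = false, msg = "Reached max page count" } }                -- page limit hit
--   { false, { err = true,  reason = "Invalid options: ..." } }               -- configuration error
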