
o [NSE] Modified the httpspider library to prefetch links in the queue and
  change how script arguments are processed. Script and library arguments are
  now processed from within the library. [Patrik]
patrik
2011-12-09 15:48:19 +00:00
parent 6cbd5a9a58
commit e20a1b5174
3 changed files with 127 additions and 52 deletions
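
For context, a minimal sketch of how a script is expected to drive the library after this change: the script passes SCRIPT_NAME in the options so the library can resolve its own arguments, then simply loops over crawl(). The option names and result shape are taken from the diff below; the surrounding action scaffolding and the way results are reported are assumptions, not part of the commit.

require 'httpspider'
require 'stdnse'

action = function(host, port)
  -- scriptname lets the library pick up <scriptname>.maxdepth etc. by itself
  local crawler = httpspider.Crawler:new(host, port, nil, { scriptname = SCRIPT_NAME })
  local results = {}
  while ( true ) do
    local status, r = crawler:crawl()
    if ( not(status) ) then
      -- r.err is true for real errors; otherwise the crawl simply finished
      if ( r.err ) then
        return stdnse.format_output(false, r.reason)
      end
      break
    end
    table.insert(results, tostring(r.url))
  end
  return stdnse.format_output(true, results)
end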


@@ -25,6 +25,7 @@ module(... or "httpspider", package.seeall)
 require 'http'
 local LIBRARY_NAME = "httpspider"
+local PREFETCH_SIZE = 5
 -- The Options class, handling all spidering options
 Options = {
@@ -432,7 +433,6 @@ UrlQueue = {
 }
 -- The Crawler class
 Crawler = {
@@ -440,7 +440,16 @@ Crawler = {
   -- @param host table as received by the action method
   -- @param port table as received by the action method
   -- @param url string containing the relative URL
-  -- @param options table of options
+  -- @param options table of options:
+  --   <code>noblacklist</code> - do not load default blacklist
+  --   <code>base_url</code> - start url to crawl
+  --   <code>timeout</code> - timeout for the http request
+  --   <code>maxdepth</code> - the maximum directory depth to crawl
+  --   <code>maxpagecount</code> - the maximum amount of pages to retrieve
+  --   <code>withinhost</code> - stay within the host of the base_url
+  --   <code>withindomain</code> - stay within the base_url domain
+  --   <code>scriptname</code> - should be set to SCRIPT_NAME to enable
+  --     script specific arguments.
   -- @return o new instance of Crawler or nil on failure
   new = function(self, host, port, url, options)
     local o = {
@@ -453,6 +462,10 @@ Crawler = {
     setmetatable(o, self)
     self.__index = self
+    o:loadScriptArguments()
+    o:loadLibraryArguments()
+    o:loadDefaultArguments()
     local response = http.get(o.host, o.port, '/', { timeout = o.options.timeout } )
     if ( not(response) or 'table' ~= type(response) ) then
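
The three loader calls added here give the arguments a fixed precedence once their definitions (further down in this diff) run: a value set at the script level wins over the library-wide httpspider.* value, which in turn falls back to a built-in default (maxdepth 3, maxpagecount 20, url '/'). A sketch of that chain for a single option, using a hypothetical script name:

-- Equivalent of loadScriptArguments -> loadLibraryArguments -> loadDefaultArguments
-- for maxdepth; "http-example" is a hypothetical script name, not a real script.
local maxdepth = tonumber(stdnse.get_script_args("http-example.maxdepth"))
maxdepth = maxdepth or tonumber(stdnse.get_script_args("httpspider.maxdepth"))
maxdepth = maxdepth or 3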
@@ -536,47 +549,116 @@ Crawler = {
   -- unexpected or requires attention we set the error property accordingly.
   -- This way the script can alert the user of the details by calling
   -- getError()
-  crawl = function(self)
+  crawl_thread = function(self, response_queue)
+    local condvar = nmap.condvar(response_queue)
     if ( self.options.withinhost and self.options.withindomain ) then
-      return false, { err = true, reason = "Invalid options: withinhost and withindomain can't both be true" }
+      table.insert(response_queue, { false, { err = true, reason = "Invalid options: withinhost and withindomain can't both be true" } })
+      condvar "signal"
+      return
     end
-    -- in case the user set a max page count to retrieve check how many
-    -- pages we have retrieved so far
-    local count = self:getPageCount()
-    if ( self.options.maxpagecount and
-      ( count > self.options.maxpagecount ) ) then
-      return false, { err = false, msg = "Reached max page count" }
+    while(true) do
+      -- in case the user set a max page count to retrieve check how many
+      -- pages we have retrieved so far
+      local count = self:getPageCount()
+      if ( self.options.maxpagecount and
+        ( count > self.options.maxpagecount ) ) then
+        table.insert(response_queue, { false, { err = false, msg = "Reached max page count" } })
+        condvar "signal"
+        return
+      end
+      -- pull links from the queue until we get a valid one
+      local url
+      repeat
+        url = self.urlqueue:getNext()
+      until( not(url) or not(self.processed[tostring(url)]) )
+      -- if no url could be retrieved from the queue, abort ...
+      if ( not(url) ) then
+        table.insert(response_queue, { false, { err = false, msg = "No more urls" } })
+        condvar "signal"
+        return
+      end
+      if ( self.options.maxpagecount ) then
+        stdnse.print_debug(2, "%s: Fetching url [%d of %d]: %s", LIBRARY_NAME, count, self.options.maxpagecount, tostring(url))
+      else
+        stdnse.print_debug(2, "%s: Fetching url: %s", LIBRARY_NAME, tostring(url))
+      end
+      -- fetch the url, and then push it to the processed table
+      local response = http.get(url:getHost(), url:getPort(), url:getFile(), { timeout = self.options.timeout } )
+      self.processed[tostring(url)] = true
+      -- if we have a response, proceed scraping it
+      if ( response.body ) then
+        local links = LinkExtractor:new(url, response.body, self.options):getLinks()
+        self.urlqueue:add(links)
+      end
+      table.insert(response_queue, { true, { url = url, response = response } } )
+      while ( PREFETCH_SIZE < #response_queue ) do
+        stdnse.print_debug(2, "%s: Response queue full, waiting ...", LIBRARY_NAME)
+        condvar "wait"
+      end
+      condvar "signal"
     end
+  end,
+  -- Loads the argument set on a script level
+  loadScriptArguments = function(self)
+    local sn = self.options.scriptname
+    if ( not(sn) ) then
+      stdnse.print_debug("%s: WARNING: Script argument could not be loaded as scriptname was not set", LIBRARY_NAME)
+      return
+    end
-    -- pull links from the queue until we get a valid one
-    local url
-    repeat
-      url = self.urlqueue:getNext()
-    until( not(url) or not(self.processed[tostring(url)]) )
+    self.options.maxdepth = tonumber(stdnse.get_script_args(sn .. ".maxdepth"))
+    self.options.maxpagecount = tonumber(stdnse.get_script_args(sn .. ".maxpagecount"))
+    self.url = stdnse.get_script_args(sn .. ".url")
+    self.options.withinhost = stdnse.get_script_args(sn .. ".withinhost")
+    self.options.withindomain = stdnse.get_script_args(sn .. ".withindomain")
+    self.options.noblacklist = stdnse.get_script_args(sn .. ".noblacklist")
+  end,
+  -- Loads the argument on a library level
+  loadLibraryArguments = function(self)
+    local ln = LIBRARY_NAME
+    self.options.maxdepth = self.options.maxdepth or tonumber(stdnse.get_script_args(ln .. ".maxdepth"))
+    self.options.maxpagecount = self.options.maxpagecount or tonumber(stdnse.get_script_args(ln .. ".maxpagecount"))
+    self.url = self.url or stdnse.get_script_args(ln .. ".url")
+    self.options.withinhost = self.options.withinhost or stdnse.get_script_args(ln .. ".withinhost")
+    self.options.withindomain = self.options.withindomain or stdnse.get_script_args(ln .. ".withindomain")
+    self.options.noblacklist = self.options.noblacklist or stdnse.get_script_args(ln .. ".noblacklist")
+  end,
+  -- Loads any defaults for arguments that were not set
+  loadDefaultArguments = function(self)
+    self.options.maxdepth = self.options.maxdepth or 3
+    self.options.maxpagecount = self.options.maxpagecount or 20
+    self.url = self.url or '/'
+  end,
+  crawl = function(self)
-    -- if no url could be retrieved from the queue, abort ...
-    if ( not(url) ) then
+    self.response_queue = self.response_queue or {}
+    local condvar = nmap.condvar(self.response_queue)
+    if ( not(self.thread) ) then
+      self.thread = stdnse.new_thread(self.crawl_thread, self, self.response_queue)
+    end
+    if ( #self.response_queue == 0 and coroutine.status(self.thread) ~= 'dead') then
+      condvar "wait"
+    end
+    condvar "signal"
+    if ( #self.response_queue == 0 ) then
       return false, { err = false, msg = "No more urls" }
-    end
-    if ( self.options.maxpagecount ) then
-      stdnse.print_debug(2, "%s: Fetching url [%d of %d]: %s", LIBRARY_NAME, count, self.options.maxpagecount, tostring(url))
     else
-      stdnse.print_debug(2, "%s: Fetching url: %s", LIBRARY_NAME, tostring(url))
+      return unpack(table.remove(self.response_queue, 1))
     end
-    -- fetch the url, and then push it to the processed table
-    local response = http.get(url:getHost(), url:getPort(), url:getFile(), { timeout = self.options.timeout } )
-    self.processed[tostring(url)] = true
-    -- if we have a response, proceed scraping it
-    if ( response.body ) then
-      local links = LinkExtractor:new(url, response.body, self.options):getLinks()
-      self.urlqueue:add(links)
-    end
-    return true, { url = url, response = response }
   end,
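
Taken together, crawl_thread and the new crawl implement a bounded producer/consumer queue on top of nmap.condvar: the worker thread keeps up to PREFETCH_SIZE responses queued ahead of the script, waits when the queue is full, and signals whenever it adds an entry; the consumer starts the worker on first use, blocks while the queue is empty and the worker is still alive, and signals after taking an entry so a full producer can top the queue back up. A stripped-down sketch of the same pattern, with produce_one() standing in for the fetch-and-extract step (it is a placeholder, not a library function):

local PREFETCH_SIZE = 5

-- producer: runs in its own NSE thread and keeps the queue topped up
local function producer(queue)
  local condvar = nmap.condvar(queue)
  while ( true ) do
    local item = produce_one()          -- placeholder for http.get + LinkExtractor
    if ( not(item) ) then
      table.insert(queue, { false, { err = false, msg = "No more urls" } })
      condvar "signal"
      return
    end
    table.insert(queue, { true, item })
    while ( PREFETCH_SIZE < #queue ) do -- queue full, wait for the consumer
      condvar "wait"
    end
    condvar "signal"
  end
end

-- consumer: what a single call to crawl() boils down to
local function consume(state)
  state.queue = state.queue or {}
  local condvar = nmap.condvar(state.queue)
  if ( not(state.thread) ) then         -- lazily start the worker
    state.thread = stdnse.new_thread(producer, state.queue)
  end
  if ( #state.queue == 0 and coroutine.status(state.thread) ~= 'dead' ) then
    condvar "wait"                      -- nothing prefetched yet
  end
  condvar "signal"                      -- wake a producer blocked on a full queue
  if ( #state.queue == 0 ) then
    return false, { err = false, msg = "No more urls" }
  end
  return unpack(table.remove(state.queue, 1))
end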