
o [NSE] Added the script http-grep that attempts to match web pages and urls
  against a given pattern. [Patrik]
patrik
2011-12-11 19:44:26 +00:00
parent 74b53a6a14
commit 4214307364
4 changed files with 149 additions and 4 deletions

CHANGELOG

@@ -1,5 +1,8 @@
# Nmap Changelog ($Id$); -*-text-*-
o [NSE] Added the script http-grep that attempts to match web pages and urls
against a given pattern. [Patrik]
o [NSE] Added stop function to crawler so that scripts can properly shutdown
  the crawler in case they want to end early. [Patrik]

nselib/httpspider.lua

@@ -17,8 +17,43 @@
-- * <code>Crawler</code>
-- ** This class is responsible for the actual crawling.
--
-- The following sample code shows how the spider could be used:
-- <code>
-- local crawler = httpspider.Crawler:new( host, port, '/', { scriptname = SCRIPT_NAME } )
-- crawler:set_timeout(10000)
--
-- local str_match = "<pattern>" -- the Lua pattern being searched for
-- local result
-- while(true) do
-- local status, r = crawler:crawl()
-- if ( not(status) ) then
-- break
-- end
-- if ( r.response.body:match(str_match) ) then
-- crawler:stop()
-- result = r.url
-- break
-- end
-- end
--
-- return result
-- </code>
--
-- @author Patrik Karlsson <patrik@cqure.net>
--
-- @args httpspider.maxdepth the maximum number of directories beneath
-- the initial url to spider. A negative value disables the limit.
-- (default: 3)
-- @args httpspider.maxpagecount the maximum number of pages to visit.
-- A negative value disables the limit (default: 20)
-- @args httpspider.url the url to start spidering. This is a URL
-- relative to the scanned host, e.g. /default.html (default: /)
-- @args httpspider.withinhost only spider URLs within the same host.
-- (default: true)
-- @args httpspider.withindomain only spider URLs within the same
-- domain. This widens the scope from <code>withinhost</code> and
-- cannot be used in combination with it. (default: false)
-- @args httpspider.noblacklist if set, doesn't load the default blacklist
--
module(... or "httpspider", package.seeall)
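The arguments above are resolved by the library itself when a scriptname is passed to Crawler:new, which is why http-grep below can document per-script equivalents such as http-grep.maxdepth. A minimal sketch of reading the same limits by hand, assuming only stdnse.get_script_args; the variable names are illustrative:

    require 'stdnse'

    -- defaults mirror the documented ones: maxdepth 3, maxpagecount 20, url "/"
    local maxdepth     = tonumber(stdnse.get_script_args("httpspider.maxdepth")) or 3
    local maxpagecount = tonumber(stdnse.get_script_args("httpspider.maxpagecount")) or 20
    local start_url    = stdnse.get_script_args("httpspider.url") or "/"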
@@ -679,7 +714,6 @@ Crawler = {
-- does the crawling
crawl = function(self)
  self.response_queue = self.response_queue or {}
  local condvar = nmap.condvar(self.response_queue)
  if ( not(self.thread) ) then
@@ -704,7 +738,4 @@ Crawler = {
condvar "signal" condvar "signal"
condvar "wait" condvar "wait"
end end
} }
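The condvar "signal" / condvar "wait" pair above is the usual NSE hand-off between the crawler's worker thread and the caller of crawl(). An illustrative sketch of the pattern, assuming only nmap.condvar; the queue and function names are invented for the example:

    local queue = {}
    local condvar = nmap.condvar(queue)

    -- producer, running in a separate script thread
    local function produce(response)
      table.insert(queue, response)
      condvar "signal"          -- wake a thread blocked in "wait"
    end

    -- consumer: sleeps until the producer has queued a response
    local function consume()
      while ( #queue == 0 ) do
        condvar "wait"
      end
      return table.remove(queue, 1)
    end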

scripts/http-grep.nse (new file, 110 lines)

@@ -0,0 +1,110 @@
description = [[
Spiders a website and attempts to match all pages and URLs against a given
Lua pattern. Matches are counted and grouped per URL under which they were
discovered.
]]
---
-- @usage
-- nmap -p 80 www.example.com --script http-grep --script-args='http-grep.match="[A-Za-z0-9%.%%%+%-]+@[A-Za-z0-9%.%%%+%-]+%.%w%w%w?%w?",http-grep.breakonmatch'
--
-- @output
-- PORT STATE SERVICE REASON
-- 80/tcp open http syn-ack
-- | http-grep:
-- | (4) http://example.com/name/
-- | + name@example.com
-- | + name@example.com
-- | + name@example.com
-- | + name@example.com
-- | (4) http://example.com/sales.html
-- | + sales@example.com
-- | + sales@example.com
-- | + sales@example.com
-- |__ + sales@example.com
-- @args http-grep.match the Lua pattern to match in URLs and page contents
-- @args http-grep.breakonmatch if set, stops spidering once the first
-- page containing a match has been found
-- @args http-grep.maxdepth the maximum number of directories beneath
-- the initial url to spider. A negative value disables the limit.
-- (default: 3)
-- @args http-grep.maxpagecount the maximum number of pages to visit.
-- A negative value disables the limit (default: 20)
-- @args http-grep.url the url to start spidering. This is a URL
-- relative to the scanned host, e.g. /default.html (default: /)
-- @args http-grep.withinhost only spider URLs within the same host.
-- (default: true)
-- @args http-grep.withindomain only spider URLs within the same
-- domain. This widens the scope from <code>withinhost</code> and
-- cannot be used in combination with it. (default: false)
author = "Patrik Karlsson"
license = "Same as Nmap--See http://nmap.org/book/man-legal.html"
categories = {"discovery", "safe"}
require 'httpspider'
require 'shortport'
require 'stdnse'
require 'url'

portrule = shortport.http
-- Shortens a matching string if it exceeds 60 characters.
-- Characters beyond the 60th are dropped and " ..." is appended.
local function shortenMatch(match)
  if ( #match > 60 ) then
    return match:sub(1, 60) .. " ..."
  else
    return match
  end
end
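For instance (illustrative value), a 70-character match comes back truncated:

    print(shortenMatch(("x"):rep(70)))  -- first 60 "x"s followed by " ..."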
action = function(host, port)

  -- read script specific arguments
  local match = stdnse.get_script_args("http-grep.match")
  local break_on_match = stdnse.get_script_args("http-grep.breakonmatch")
  if ( not(match) ) then
    return stdnse.format_output(true, "ERROR: Argument http-grep.match was not set")
  end

  local crawler = httpspider.Crawler:new(host, port, '/', { scriptname = SCRIPT_NAME } )
  local results = {}

  -- set timeout to 10 seconds
  crawler:set_timeout(10000)

  while(true) do
    local status, r = crawler:crawl()
    -- if the crawler fails it can be due to a number of different reasons,
    -- most of them "legitimate", so they should not abort the script
    if ( not(status) ) then
      if ( r.err ) then
        return stdnse.format_output(true, ("ERROR: %s"):format(r.reason))
      else
        break
      end
    end

    local matches = {}
    local body = r.response.body
    -- try to match the url and body
    if ( body:match( match ) or tostring(r.url):match(match) ) then
      -- count the matches; the empty replacement avoids gsub errors when
      -- the pattern contains '%' escapes, which are invalid in a
      -- replacement string
      local count = select(2, body:gsub(match, ""))
      for m in body:gmatch(match) do
        table.insert(matches, "+ " .. shortenMatch(m))
      end
      matches.name = ("(%d) %s"):format(count, tostring(r.url))
      table.insert(results, matches)

      -- should we continue to search for matches?
      if ( break_on_match ) then
        crawler:stop()
        break
      end
    end
  end

  -- sort the entries on their "(count) url" headers, in descending order
  table.sort(results, function(a, b) return a.name > b.name end)
  return stdnse.format_output(true, results)
end
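The counting idiom in the action above leans on string.gsub returning the number of substitutions as its second value; select(2, ...) discards the rewritten string and keeps the count. A standalone sketch with illustrative sample data:

    -- count occurrences of a Lua pattern without collecting them
    local body = "write to sales@example.com or name@example.com"
    local pat  = "[A-Za-z0-9%.%%%+%-]+@[A-Za-z0-9%.%%%+%-]+%.%w%w%w?%w?"

    -- the empty replacement sidesteps '%'-escape errors that arise when
    -- a pattern is reused as the replacement string
    local count = select(2, body:gsub(pat, ""))
    print(count)  --> 2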

scripts/script.db

@@ -101,6 +101,7 @@ Entry { filename = "http-enum.nse", categories = { "discovery", "intrusive", "vu
Entry { filename = "http-favicon.nse", categories = { "default", "discovery", "safe", } } Entry { filename = "http-favicon.nse", categories = { "default", "discovery", "safe", } }
Entry { filename = "http-form-brute.nse", categories = { "brute", "intrusive", } } Entry { filename = "http-form-brute.nse", categories = { "brute", "intrusive", } }
Entry { filename = "http-google-malware.nse", categories = { "discovery", "external", "malware", "safe", } } Entry { filename = "http-google-malware.nse", categories = { "discovery", "external", "malware", "safe", } }
Entry { filename = "http-grep.nse", categories = { "discovery", "safe", } }
Entry { filename = "http-headers.nse", categories = { "discovery", "safe", } } Entry { filename = "http-headers.nse", categories = { "discovery", "safe", } }
Entry { filename = "http-iis-webdav-vuln.nse", categories = { "intrusive", "vuln", } } Entry { filename = "http-iis-webdav-vuln.nse", categories = { "intrusive", "vuln", } }
Entry { filename = "http-joomla-brute.nse", categories = { "brute", "intrusive", } } Entry { filename = "http-joomla-brute.nse", categories = { "brute", "intrusive", } }