diff --git a/CHANGELOG b/CHANGELOG
index f290d5e92..ab1217a1a 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,8 @@
# Nmap Changelog ($Id$); -*-text-*-
+o [NSE] Added the script http-grep that attempts to match web pages and URLs
+  against a given pattern. [Patrik]
+
o [NSE] Added stop function to crawler so that scripts can properly shutdown
the crawler in case they want to end early. [Patrik]
diff --git a/nselib/httpspider.lua b/nselib/httpspider.lua
index c69f2a6b3..192f5e0c5 100644
--- a/nselib/httpspider.lua
+++ b/nselib/httpspider.lua
@@ -17,8 +17,43 @@
-- * Crawler
-- ** This class is responsible for the actual crawling.
--
+-- The following sample code shows how the spider could be used:
+--
+-- local crawler = httpspider.Crawler:new( host, port, '/', { scriptname = SCRIPT_NAME } )
+-- crawler:set_timeout(10000)
+--
+-- local str_match = "<the pattern to search for>"
+-- local result
+-- while(true) do
+-- local status, r = crawler:crawl()
+-- if ( not(status) ) then
+-- break
+-- end
+-- if ( r.response.body:match(str_match) ) then
+-- crawler:stop()
+-- result = r.url
+-- break
+-- end
+-- end
+--
+-- return result
+--
+--
-- @author Patrik Karlsson
--
+-- @args httpspider.maxdepth the maximum number of directories beneath
+--       the initial url to spider. A negative value disables the limit.
+--       (default: 3)
+-- @args httpspider.maxpagecount the maximum number of pages to visit.
+--       A negative value disables the limit. (default: 20)
+-- @args httpspider.url the url to start spidering. This is a URL
+--       relative to the scanned host, e.g. /default.html (default: /)
+-- @args httpspider.withinhost only spider URLs within the same host.
+--       (default: true)
+-- @args httpspider.withindomain only spider URLs within the same
+--       domain. This widens the scope from withinhost and cannot be
+--       used in combination with it. (default: false)
+-- @args httpspider.noblacklist if set, the default blacklist is not loaded
+--
module(... or "httpspider", package.seeall)
@@ -679,7 +714,6 @@ Crawler = {
-- does the crawling
crawl = function(self)
-
self.response_queue = self.response_queue or {}
local condvar = nmap.condvar(self.response_queue)
if ( not(self.thread) ) then
@@ -704,7 +738,4 @@ Crawler = {
condvar "signal"
condvar "wait"
end
-
-
-
}
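For reference, a minimal sketch of how a consuming script might drive the crawler, using the new stop() call to end crawling early. The option keys maxdepth and maxpagecount passed to Crawler:new are an assumption here (they are expected to mirror the httpspider.maxdepth and httpspider.maxpagecount arguments documented above, but are not shown in this hunk), and the "admin" pattern is purely illustrative:

  require 'httpspider'
  require 'shortport'

  portrule = shortport.http

  action = function(host, port)
    -- assumed option keys, mirroring the documented httpspider.* arguments
    local crawler = httpspider.Crawler:new(host, port, '/', {
      scriptname = SCRIPT_NAME,
      maxdepth = 3,
      maxpagecount = 20,
    })
    crawler:set_timeout(10000)

    while ( true ) do
      local status, r = crawler:crawl()
      if ( not(status) ) then
        break
      end
      -- stop() shuts the crawler down before the page or depth limits are hit
      if ( r.response and r.response.body and r.response.body:match("admin") ) then
        crawler:stop()
        return ("Pattern found at: %s"):format(tostring(r.url))
      end
    end
  end

This mirrors the sample in the library header; the only difference is that the limits are set in code rather than through script arguments.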
diff --git a/scripts/http-grep.nse b/scripts/http-grep.nse
new file mode 100644
index 000000000..4d5fc98fc
--- /dev/null
+++ b/scripts/http-grep.nse
@@ -0,0 +1,110 @@
+description = [[
+Spiders a website and attempts to match all pages and URLs against a given
+Lua pattern. Matches are counted and grouped per URL under which they were
+discovered.
+]]
+
+---
+-- @usage
+-- nmap -p 80 www.example.com --script http-grep --script-args='http-grep.match="[A-Za-z0-9%.%%%+%-]+@[A-Za-z0-9%.%%%+%-]+%.%w%w%w?%w?",http-grep.breakonmatch'
+--
+-- @output
+-- PORT STATE SERVICE REASON
+-- 80/tcp open http syn-ack
+-- | http-grep:
+-- | (4) http://example.com/name/
+-- | + name@example.com
+-- | + name@example.com
+-- | + name@example.com
+-- | + name@example.com
+-- | (4) http://example.com/sales.html
+-- | + sales@example.com
+-- | + sales@example.com
+-- | + sales@example.com
+-- |__ + sales@example.com
+--
+-- @args http-grep.match the Lua pattern to match in URLs and page contents
+-- @args http-grep.breakonmatch if set, the script stops crawling as soon as
+--       the first matching page is found
+-- @args http-grep.maxdepth the maximum number of directories beneath
+--       the initial url to spider. A negative value disables the limit.
+--       (default: 3)
+-- @args http-grep.maxpagecount the maximum number of pages to visit.
+--       A negative value disables the limit. (default: 20)
+-- @args http-grep.url the url to start spidering. This is a URL
+--       relative to the scanned host, e.g. /default.html (default: /)
+-- @args http-grep.withinhost only spider URLs within the same host.
+--       (default: true)
+-- @args http-grep.withindomain only spider URLs within the same
+--       domain. This widens the scope from withinhost and cannot be
+--       used in combination with it. (default: false)
+
+author = "Patrik Karlsson"
+license = "Same as Nmap--See http://nmap.org/book/man-legal.html"
+categories = {"discovery", "safe"}
+
+require 'httpspider'
+require 'shortport'
+require 'stdnse'
+require 'url'
+
+portrule = shortport.http
+
+-- Shortens a matching string if it exceeds 60 characters
+-- All characters after 60 will be replaced with ...
+local function shortenMatch(match)
+ if ( #match > 60 ) then
+ return match:sub(1, 60) .. " ..."
+ else
+ return match
+ end
+end
+
+action = function(host, port)
+
+ -- read script specific arguments
+ local match = stdnse.get_script_args("http-grep.match")
+ local break_on_match = stdnse.get_script_args("http-grep.breakonmatch")
+
+ if ( not(match) ) then
+ return stdnse.format_output(true, "ERROR: Argument http-grep.match was not set")
+ end
+
+ local crawler = httpspider.Crawler:new(host, port, '/', { scriptname = SCRIPT_NAME } )
+ local results = {}
+
+ -- set timeout to 10 seconds
+ crawler:set_timeout(10000)
+
+ while(true) do
+ local status, r = crawler:crawl()
+    -- the crawler can fail for a number of different reasons, most of them
+    -- "legitimate", so a failure is not necessarily a reason to abort
+ if ( not(status) ) then
+ if ( r.err ) then
+        return stdnse.format_output(true, ("ERROR: %s"):format(r.reason))
+ else
+ break
+ end
+ end
+
+    local matches = {}
+    local body = r.response.body
+    -- try to match the url and body
+    if ( body:match( match ) or tostring(r.url):match(match) ) then
+      -- gsub's second return value is the number of matches in the body
+      local count = select(2, body:gsub(match, "%0"))
+      for m in body:gmatch(match) do
+        table.insert(matches, "+ " .. shortenMatch(m))
+      end
+
+      matches.name = ("(%d) %s"):format(count, tostring(r.url))
+      table.insert(results, matches)
+
+ -- should we continue to search for matches?
+ if ( break_on_match ) then
+ crawler:stop()
+ break
+ end
+ end
+ end
+ table.sort(results, function(a,b) return a.name>b.name end)
+ return stdnse.format_output(true, results)
+end
\ No newline at end of file
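As a standalone illustration of the counting and grouping idiom used in the action function above (plain Lua, runnable outside NSE; the sample body and e-mail pattern are made up for the example):

  -- gsub returns the number of substitutions as its second value, which
  -- doubles as a match count; gmatch then collects the individual matches
  local body = "contact sales@example.com or support@example.com"
  local pattern = "[%w%.%%%+%-]+@[%w%.%%%+%-]+%.%w%w%w?%w?"

  local count = select(2, body:gsub(pattern, "%0"))

  local matches = {}
  for m in body:gmatch(pattern) do
    matches[#matches + 1] = "+ " .. m
  end

  print(count)                       --> 2
  print(table.concat(matches, "\n")) --> + sales@example.com
                                     --> + support@example.com

In the script itself the collected matches go into a table whose name field holds the "(count) url" header; stdnse.format_output uses that field as the group header, which is how the per-URL grouping in the sample output is produced.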
diff --git a/scripts/script.db b/scripts/script.db
index bdaa44bdb..35fcc6535 100644
--- a/scripts/script.db
+++ b/scripts/script.db
@@ -101,6 +101,7 @@ Entry { filename = "http-enum.nse", categories = { "discovery", "intrusive", "vu
Entry { filename = "http-favicon.nse", categories = { "default", "discovery", "safe", } }
Entry { filename = "http-form-brute.nse", categories = { "brute", "intrusive", } }
Entry { filename = "http-google-malware.nse", categories = { "discovery", "external", "malware", "safe", } }
+Entry { filename = "http-grep.nse", categories = { "discovery", "safe", } }
Entry { filename = "http-headers.nse", categories = { "discovery", "safe", } }
Entry { filename = "http-iis-webdav-vuln.nse", categories = { "intrusive", "vuln", } }
Entry { filename = "http-joomla-brute.nse", categories = { "brute", "intrusive", } }