---
-- A smallish httpspider library providing basic spidering capabilities.
-- It consists of the following classes:
--
-- * <code>Options</code>
-- ** This class is responsible for handling library options.
--
-- * <code>LinkExtractor</code>
-- ** This class contains code responsible for extracting urls from web pages.
--
-- * <code>URL</code>
-- ** This class contains code to parse and process URLs.
--
-- * <code>UrlQueue</code>
-- ** This class contains a queue of the next links to process.
--
-- * <code>Crawler</code>
-- ** This class is responsible for the actual crawling.
--
-- The following sample code shows how the spider could be used:
-- <code>
--   local crawler = httpspider.Crawler:new( host, port, '/', { scriptname = SCRIPT_NAME } )
--   crawler:set_timeout(10000)
--
--   local result
--   while(true) do
--     local status, r = crawler:crawl()
--     if ( not(status) ) then
--       break
--     end
--     if ( r.response.body:match(str_match) ) then
--       crawler:stop()
--       result = r.url
--       break
--     end
--   end
--
--   return result
-- </code>
--
-- @author Patrik Karlsson <patrik@cqure.net>
--
-- @args httpspider.maxdepth the maximum number of directories beneath
-- the initial url to spider. A negative value disables the limit.
-- (default: 3)
-- @args httpspider.maxpagecount the maximum number of pages to visit.
-- A negative value disables the limit (default: 20)
-- @args httpspider.url the url to start spidering. This is a URL
-- relative to the scanned host, e.g. /default.html (default: /)
-- @args httpspider.withinhost only spider URLs within the same host.
-- (default: true)
-- @args httpspider.withindomain only spider URLs within the same
-- domain. This widens the scope from <code>withinhost</code> and
-- cannot be used in combination with it. (default: false)
-- @args httpspider.noblacklist if set, doesn't load the default blacklist
-- @args httpspider.useheadfornonwebfiles if set, the crawler uses
-- HEAD instead of GET for files that do not have extensions indicating
-- that they are webpages (the list of webpage extensions is located in
-- nselib/data/http-web-files-extensions.lst)
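--
-- Each argument above may also be set per script by prefixing it with the
-- script name (via <code>scriptname</code>) instead of <code>httpspider</code>.
-- An illustrative invocation (any crawling script will do):
--   nmap -p 80 <target> --script <some-crawling-script> \
--     --script-args 'httpspider.maxpagecount=50,httpspider.maxdepth=5'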
--
local coroutine = require "coroutine"
local http = require "http"
local io = require "io"
local nmap = require "nmap"
local stdnse = require "stdnse"
local string = require "string"
local table = require "table"
local url = require "url"
_ENV = stdnse.module("httpspider", stdnse.seeall)
local LIBRARY_NAME = "httpspider"
local PREFETCH_SIZE = 5
-- The Options class, handling all spidering options
Options = {
new = function(self, options)
local o = { }
-- copy all options as class members
for k, v in pairs(options) do o[k] = v end
-- set a few default values
o.timeout = options.timeout or 10000
o.whitelist = o.whitelist or {}
o.blacklist = o.blacklist or {}
local removewww = function(url) return string.gsub(url, "^www%.", "") end
if ( o.withinhost == true or o.withindomain == true ) then
-- set up the appropriate matching functions
if ( o.withinhost ) then
o.withinhost = function(u)
local parsed_u = url.parse(tostring(u))
if ( o.base_url:getPort() ~= 80 and o.base_url:getPort() ~= 443 ) then
if ( tonumber(parsed_u.port) ~= tonumber(o.base_url:getPort()) ) then
return false
end
elseif ( parsed_u.scheme ~= o.base_url:getProto() ) then
return false
-- hosts that differ only by a leading "www." prefix are treated as the same host
elseif ( parsed_u.host == nil or removewww(parsed_u.host:lower()) ~= removewww(o.base_url:getHost():lower()) ) then
return false
end
return true
end
else
o.withindomain = function(u)
local parsed_u = url.parse(tostring(u))
if ( o.base_url:getPort() ~= 80 and o.base_url:getPort() ~= 443 ) then
if ( tonumber(parsed_u.port) ~= tonumber(o.base_url:getPort()) ) then
return false
end
elseif ( parsed_u.scheme ~= o.base_url:getProto() ) then
return false
elseif ( parsed_u.host == nil or parsed_u.host:sub(-#o.base_url:getDomain()):lower() ~= o.base_url:getDomain():lower() ) then
return false
end
return true
end
end
end
setmetatable(o, self)
self.__index = self
return o
end,
addWhitelist = function(self, func) table.insert(self.whitelist, func) end,
addBlacklist = function(self, func) table.insert(self.blacklist, func) end,
}
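-- Illustrative sketch (not taken from any existing script): given a
-- <code>crawler</code> created as in the sample above, a script can register
-- extra filters of its own. A blacklist entry rejects matching URLs, while a
-- whitelist entry re-admits URLs that a blacklist entry would otherwise have
-- rejected. The /static/ path below is purely hypothetical.
-- <code>
--   crawler.options:addBlacklist( function(url)
--     return url:getPath():lower():match("^/static/")
--   end )
--   crawler.options:addWhitelist( function(url)
--     return url:getPath():lower():match("%.php$")
--   end )
-- </code>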
-- Placeholder for form extraction code
FormExtractor = {
}
LinkExtractor = {
-- Creates a new instance of LinkExtractor
-- @return o instance of LinkExtractor
new = function(self, url, html, options)
local o = {
url = url,
html = html,
links = {},
options = options,
}
setmetatable(o, self)
self.__index = self
o:parse()
return o
end,
-- is the link absolute or not?
isAbsolute = function(url)
-- at this point we don't care about the protocol
-- also, we don't add // to cover stuff like:
-- feed:http://example.com/rss.xml
return ( url:match('^%w*:') ~= nil )
end,
-- Creates an absolute link from a relative one based on the base_url
-- The functionality is very simple and does not take any ../../ into
-- consideration.
--
-- @param base_url URL containing the page url from which the links were
-- extracted
-- @param rel_url string containing the relative portion of the URL
-- @return link string containing the absolute link
createAbsolute = function(base_url, rel_url, base_href)
-- is it relative with a leading slash? e.g. /dir1/foo.html
local leading_slash = rel_url:match("^/")
rel_url = rel_url:match("^/?(.*)") or '/'
-- make sure base_href ends with a trailing slash
if ( base_href and not(base_href:match("/$") ) ) then
base_href = base_href .. '/'
end
if ( ( base_url:getProto() == 'https' and base_url:getPort() == 443 ) or
( base_url:getProto() == 'http' and base_url:getPort() == 80 ) ) then
if ( leading_slash ) then
return ("%s://%s/%s"):format(base_url:getProto(), base_url:getHost(), rel_url)
else
if ( base_href ) then
return ("%s%s"):format(base_href, rel_url)
else
return ("%s://%s%s%s"):format(base_url:getProto(), base_url:getHost(), base_url:getDir(), rel_url)
end
end
else
if ( leading_slash ) then
return ("%s://%s:%d/%s"):format(base_url:getProto(), base_url:getHost(), base_url:getPort(), rel_url)
else
if ( base_href ) then
return ("%s%s"):format(base_href, rel_url)
else
return ("%s://%s:%d%s%s"):format(base_url:getProto(), base_url:getHost(), base_url:getPort(), base_url:getDir(), rel_url)
end
end
end
end,
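-- Illustrative examples of how relative links are resolved, assuming a
-- base_url of http://example.com/dir/ on port 80 and no base href:
--   createAbsolute(base_url, "foo.html")  --> http://example.com/dir/foo.html
--   createAbsolute(base_url, "/bar.html") --> http://example.com/bar.html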
-- Gets the depth of the link, relative to our base url, e.g.
-- base_url = http://www.cqure.net/wp/
-- url = http://www.cqure.net/wp/ - depth: 0
-- url = http://www.cqure.net/wp/index.php - depth: 0
-- url = http://www.cqure.net/wp/2011/index.php - depth: 1
-- url = http://www.cqure.net/index.html - depth: -1
--
-- @param url instance of URL
-- @return depth number containing the depth relative to the base_url
getDepth = function(self, url)
local base_dir, url_dir = self.options.base_url:getDir(), url:getDir()
if ( url_dir and base_dir ) then
local m = url_dir:match(base_dir.."(.*)")
if ( not(m) ) then
return -1
else
local _, depth = m:gsub("/", "/")
return depth
end
end
end,
validate_link = function(self, url)
local valid = true
-- if our url is nil, abort, this could be due to a number of
-- reasons such as unsupported protocols: javascript, mail ... or
-- that the URL failed to parse for some reason
if ( url == nil or tostring(url) == nil ) then
return false
end
-- linkdepth trumps whitelisting
if ( self.options.maxdepth and self.options.maxdepth >= 0 ) then
local depth = self:getDepth( url )
if ( -1 == depth or depth > self.options.maxdepth ) then
stdnse.print_debug(3, "%s: Skipping link depth: %d; b_url=%s; url=%s", LIBRARY_NAME, depth, tostring(self.options.base_url), tostring(url))
return false
end
end
-- withindomain trumps any whitelisting
if ( self.options.withindomain ) then
if ( not(self.options.withindomain(url)) ) then
stdnse.print_debug(2, "%s: Link is not within domain: %s", LIBRARY_NAME, tostring(url))
return false
end
end
-- withinhost trumps any whitelisting
if ( self.options.withinhost ) then
if ( not(self.options.withinhost(url)) ) then
stdnse.print_debug(2, "%s: Link is not within host: %s", LIBRARY_NAME, tostring(url))
return false
end
end
-- run through all blacklists
if ( #self.options.blacklist > 0 ) then
for _, func in ipairs(self.options.blacklist) do
if ( func(url) ) then
stdnse.print_debug(2, "%s: Blacklist match: %s", LIBRARY_NAME, tostring(url))
valid = false
break
end
end
end
-- check the url against our whitelist
if ( #self.options.whitelist > 0 ) then
for _, func in ipairs(self.options.whitelist) do
if ( func(url) ) then
stdnse.print_debug(2, "%s: Whitelist match: %s", LIBRARY_NAME, tostring(url))
valid = true
break
end
end
end
return valid
end,
-- Parses an HTML response and extracts all links it can find
-- The function currently supports href, src and action links
-- Also, all behaviour options, such as depth, white- and blacklist, are
-- processed in here.
parse = function(self)
local links = {}
local patterns = {
'[hH][rR][eE][fF]%s*=%s*[\'"]%s*([^"^\']-)%s*[\'"]',
'[hH][rR][eE][fF]%s*=%s*([^\'\"][^%s>]+)',
'[sS][rR][cC]%s*=%s*[\'"]%s*([^"^\']-)%s*[\'"]',
'[sS][rR][cC]%s*=%s*([^\'\"][^%s>]+)',
'[aA][cC][tT][iI][oO][nN]%s*=%s*[\'"]%s*([^"^\']+%s*)[\'"]',
}
local base_hrefs = {
'[Bb][Aa][Ss][Ee]%s*[Hh][Rr][Ee][Ff]%s*=%s*[\'"](%s*[^"^\']+%s*)[\'"]',
'[Bb][Aa][Ss][Ee]%s*[Hh][Rr][Ee][Ff]%s*=%s*([^\'\"][^%s>]+)'
}
local base_href
for _, pattern in ipairs(base_hrefs) do
base_href = self.html:match(pattern)
if ( base_href ) then
break
end
end
for _, pattern in ipairs(patterns) do
for l in self.html:gmatch(pattern) do
local link = l
if ( not(LinkExtractor.isAbsolute(l)) ) then
link = LinkExtractor.createAbsolute(self.url, l, base_href)
end
local url = URL:new(link)
local valid = self:validate_link(url)
if ( valid ) then
stdnse.print_debug(3, "%s: Adding link: %s", LIBRARY_NAME, tostring(url))
links[tostring(url)] = true
elseif ( tostring(url) ) then
stdnse.print_debug(3, "%s: Skipping url: %s", LIBRARY_NAME, link)
end
end
end
for link in pairs(links) do
table.insert(self.links, link)
end
end,
-- Gets a table containing all of the retrieved URLs, after filtering
-- has been applied.
getLinks = function(self) return self.links end,
}
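-- Illustrative sketch of standalone use, mirroring what the Crawler does
-- internally. It assumes <code>response</code> was returned by http.get and
-- that the options carry at least a base_url:
-- <code>
--   local base = URL:new("http://example.com/dir/")
--   local opts = Options:new({ base_url = base })
--   local links = LinkExtractor:new(base, response.body, opts):getLinks()
-- </code>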
-- The URL class, containing code to process URLs
-- This class is heavily inspired by the Java URL class
URL = {
-- Creates a new instance of URL
-- @param url string containing the text representation of a URL
-- @return o instance of URL in case parsing is successful,
-- nil in case parsing fails
new = function(self, url)
local o = {
raw = url,
}
setmetatable(o, self)
self.__index = self
if ( o:parse() ) then
return o
end
end,
-- Parses the string representation of the URL and splits it into different
-- URL components
-- @return status true on success, false on failure
parse = function(self)
self.proto, self.host, self.port, self.file = self.raw:match("^(http[s]?)://([^:/]*)[:]?(%d*)")
if ( self.proto and self.host ) then
self.file = self.raw:match("^http[s]?://[^:/]*[:]?%d*(/[^#]*)") or '/'
self.port = tonumber(self.port)
if ( not(self.port) ) then
if ( self.proto:match("https") ) then
self.port = 443
elseif ( self.proto:match("http")) then
self.port = 80
end
end
self.path = self.file:match("^([^?]*)[%?]?")
self.dir = self.path:match("^(.+%/)") or "/"
self.domain= self.host:match("^[^%.]-%.(.*)")
return true
elseif( self.raw:match("^javascript:") ) then
stdnse.print_debug(2, "%s: Skipping javascript url: %s", LIBRARY_NAME, self.raw)
elseif( self.raw:match("^mailto:") ) then
stdnse.print_debug(2, "%s: Skipping mailto link: %s", LIBRARY_NAME, self.raw)
else
stdnse.print_debug(2, "%s: WARNING: Failed to parse url: %s", LIBRARY_NAME, self.raw)
end
return false
end,
-- Gets the host portion of the URL
-- @return host string containing the hostname
getHost = function(self) return self.host end,
-- Gets the protocol representation of the URL
-- @return proto string containing the protocol (e.g. http, https)
getProto = function(self) return self.proto end,
-- Returns the filename component of the URL.
-- @return file string containing the path and query components of the url
getFile = function(self) return self.file end,
-- Gets the port component of the URL
-- @return port number containing the port of the URL
getPort = function(self) return self.port end,
-- Gets the path component of the URL
-- @return the full path and filename of the URL
getPath = function(self) return self.path end,
-- Gets the directory component of the URL
-- @return directory string containing the directory part of the URL
getDir = function(self) return self.dir end,
-- Gets the domain component of the URL
-- @return domain string containing the hosts domain
getDomain = function(self)
if ( self.domain ) then
return self.domain
-- fallback to the host, if we can't find a domain
else
return self.host
end
end,
-- Converts the URL to a string
-- @return url string containing the string representation of the url
__tostring = function(self) return self.raw end,
}
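-- Illustrative example of the components parse() extracts from a URL:
-- <code>
--   local u = URL:new("https://www.example.com:8443/dir/page.php?x=1")
--   -- u:getProto()  --> "https"
--   -- u:getHost()   --> "www.example.com"
--   -- u:getPort()   --> 8443
--   -- u:getFile()   --> "/dir/page.php?x=1"
--   -- u:getPath()   --> "/dir/page.php"
--   -- u:getDir()    --> "/dir/"
--   -- u:getDomain() --> "example.com"
-- </code>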
-- The UrlQueue class
UrlQueue = {
-- creates a new instance of UrlQueue
-- @param options table containing options
-- @return o new instance of UrlQueue
new = function(self, options)
local o = {
urls = {},
options = options
}
setmetatable(o, self)
self.__index = self
return o
end,
-- gets the next available url in the queue
getNext = function(self)
return table.remove(self.urls,1)
end,
-- adds a new url to the queue
-- @param url can be either a string or a URL or a table of URLs
add = function(self, url)
assert( type(url) == 'string' or type(url) == 'table', "url was neither a string nor a table")
-- normalize the argument to an array of URLs; it can be a string, a
-- single URL instance or an array of URLs/strings
local urls = ( 'string' == type(url) ) and { URL:new(url) } or url
if ( 'table' == type(url) and url.raw ) then
urls = { url }
end
for _, u in ipairs(urls) do
u = ( 'string' == type(u) ) and URL:new(u) or u
if ( u ) then
table.insert(self.urls, u)
else
stdnse.print_debug("ERROR: Invalid URL: %s", url)
end
end
end,
-- dumps the contents of the UrlQueue
dump = function(self)
for _, url in ipairs(self.urls) do
print("url:", url)
end
end,
}
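-- Illustrative sketch of queue usage (the Crawler normally drives the queue
-- itself); <code>options</code> is assumed to be an Options instance:
-- <code>
--   local queue = UrlQueue:new(options)
--   queue:add("http://example.com/")
--   queue:add({ "http://example.com/a", "http://example.com/b" })
--   local next_url = queue:getNext()
-- </code>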
-- The Crawler class
Crawler = {
-- creates a new instance of the Crawler
-- @param host table as received by the action method
-- @param port table as received by the action method
-- @param url string containing the relative URL
-- @param options table of options:
-- <code>noblacklist</code> - do not load default blacklist
-- <code>base_url</code> - start url to crawl
-- <code>timeout</code> - timeout for the http request
-- <code>maxdepth</code> - the maximum directory depth to crawl
-- <code>maxpagecount</code> - the maximum amount of pages to retrieve
-- <code>withinhost</code> - stay within the host of the base_url
-- <code>withindomain</code> - stay within the base_url domain
-- <code>scriptname</code> - should be set to SCRIPT_NAME to enable
-- script specific arguments.
-- <code>redirect_ok</code> - redirect_ok closure to pass to http.get function
-- @return o new instance of Crawler or nil on failure
new = function(self, host, port, url, options)
local o = {
host = host,
port = port,
url = url,
options = options or {},
basethread = stdnse.base(),
}
setmetatable(o, self)
self.__index = self
o:loadScriptArguments()
o:loadLibraryArguments()
o:loadDefaultArguments()
local response = http.get(o.host, o.port, '/', { timeout = o.options.timeout, redirect_ok = o.options.redirect_ok } )
if ( not(response) or 'table' ~= type(response) ) then
return
end
o.url = o.url:match("/?(.*)")
local u_host = o.host.targetname or o.host.name
if ( not(u_host) or 0 == #u_host ) then
u_host = o.host.ip
end
local u = ("%s://%s:%d/%s"):format(response.ssl and "https" or "http", u_host, o.port.number, o.url)
o.options.base_url = URL:new(u)
o.options = Options:new(o.options)
o.urlqueue = UrlQueue:new(o.options)
o.urlqueue:add(o.options.base_url)
o.options.timeout = o.options.timeout or 10000
o.processed = {}
-- script arguments have precedence
if ( not(o.options.maxdepth) ) then
o.options.maxdepth = tonumber(stdnse.get_script_args("httpspider.maxdepth"))
end
-- script arguments have precedence
if ( not(o.options.maxpagecount) ) then
o.options.maxpagecount = tonumber(stdnse.get_script_args("httpspider.maxpagecount"))
end
if ( not(o.options.noblacklist) ) then
o:addDefaultBlacklist()
end
if ( o.options.useheadfornonwebfiles ) then
-- Load web file extensions from a file in the nselib/data folder.
-- For more information on individual file formats, see
-- http://en.wikipedia.org/wiki/List_of_file_formats.
o.web_files_extensions = {}
local f = nmap.fetchfile("nselib/data/http-web-files-extensions.lst")
if f then
for l in io.lines(f) do
table.insert(o.web_files_extensions, l)
end
end
end
stdnse.print_debug(2, "%s: %s", LIBRARY_NAME, o:getLimitations())
return o
end,
-- Sets the timeout used by the http library
-- @param timeout number containing the timeout in ms.
set_timeout = function(self, timeout)
self.options.timeout = timeout
end,
-- Gets the number of pages that have been retrieved
-- @return count number of pages retrieved by the instance
getPageCount = function(self)
local count = 1
for url in pairs(self.processed) do
count = count + 1
end
return count
end,
-- Adds a default blacklist blocking binary files such as images,
-- compressed archives and executable files
addDefaultBlacklist = function(self)
local extensions = {
image_extensions = {"png","jpg","jpeg","gif","bmp"},
video_extensions = {"avi","flv","ogg","mp4","wmv"},
audio_extensions = {"aac","m4a","mp3","wav"},
doc_extensions = {"pdf", "doc", "docx", "docm", "xls", "xlsx", "xlsm",
"ppt", "pptx", "pptm", "odf", "ods", "odp", "ps", "xps"},
archive_extensions = {"zip", "tar.gz", "gz", "rar", "7z", "sit", "sitx",
"tgz", "tar.bz", "tar", "iso"},
exe_extensions = {"exe", "com", "msi", "bin","dmg"}
}
local blacklist = {}
for _, cat in pairs(extensions) do
for _, ext in ipairs(cat) do
table.insert(blacklist, string.format(".%s$", ext))
end
end
self.options:addBlacklist( function(url)
local p = url:getPath():lower()
for _, pat in ipairs(blacklist) do
if ( p:match(pat) ) then
return true
end
end
end )
end,
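-- Illustrative sketch: a script that needs to see every resource can pass
-- <code>noblacklist</code> and, if desired, register a narrower blacklist of
-- its own (the .iso filter below is just an example):
-- <code>
--   local crawler = httpspider.Crawler:new( host, port, '/',
--     { scriptname = SCRIPT_NAME, noblacklist = true } )
--   crawler.options:addBlacklist( function(url)
--     return url:getPath():lower():match("%.iso$")
--   end )
-- </code>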
-- does the heavy crawling
--
-- The crawler may exit for a number of different reasons, including
-- invalid options, reaching the max page count or simply running out of
-- links. We return a false status for all of these, and in case the error
-- was unexpected or requires attention we set the err field of the result
-- accordingly. This way the script can alert the user to the details by
-- inspecting the err and reason fields of the returned table.
crawl_thread = function(self, response_queue)
local condvar = nmap.condvar(response_queue)
if ( false ~= self.options.withinhost and false ~= self.options.withindomain ) then
table.insert(response_queue, { false, { err = true, reason = "Invalid options: withinhost and withindomain can't both be true" } })
condvar "signal"
return
end
while(true) do
if ( self.quit or coroutine.status(self.basethread) == 'dead' ) then
table.insert(response_queue, {false, { err = false, msg = "Quit signalled by crawler" } })
break
end
-- in case the user set a max page count to retrieve check how many
-- pages we have retrieved so far
local count = self:getPageCount()
if ( self.options.maxpagecount and
( count > self.options.maxpagecount ) ) then
table.insert(response_queue, { false, { err = false, msg = "Reached max page count" } })
condvar "signal"
return
end
-- pull links from the queue until we get a valid one
local url
repeat
url = self.urlqueue:getNext()
until( not(url) or not(self.processed[tostring(url)]) )
-- if no url could be retrieved from the queue, abort ...
if ( not(url) ) then
table.insert(response_queue, { false, { err = false, msg = "No more urls" } })
condvar "signal"
return
end
if ( self.options.maxpagecount ) then
stdnse.print_debug(2, "%s: Fetching url [%d of %d]: %s", LIBRARY_NAME, count, self.options.maxpagecount, tostring(url))
else
stdnse.print_debug(2, "%s: Fetching url: %s", LIBRARY_NAME, tostring(url))
end
local response
-- in case we want to use HEAD rather than GET for files with certain extensions
if ( self.options.useheadfornonwebfiles ) then
local is_web_file = false
local file = url:getPath():lower()
-- check if we are at a URL with 'no extension', for example: nmap.org/6
if string.match(file,".*(/[^/%.]*)$") or string.match(file, "/$") then is_web_file = true end
if not is_web_file then
for _,v in pairs(self.web_files_extensions) do
if string.match(file, "%."..v.."$") then
is_web_file = true
break
end
end
end
if is_web_file then
stdnse.print_debug(2, "%s: Using GET: %s", LIBRARY_NAME, file)
response = http.get(url:getHost(), url:getPort(), url:getFile(), { timeout = self.options.timeout, redirect_ok = self.options.redirect_ok } )
else
stdnse.print_debug(2, "%s: Using HEAD: %s", LIBRARY_NAME, file)
response = http.head(url:getHost(), url:getPort(), url:getFile())
end
else
-- fetch the url, and then push it to the processed table
response = http.get(url:getHost(), url:getPort(), url:getFile(), { timeout = self.options.timeout, redirect_ok = self.options.redirect_ok } )
end
self.processed[tostring(url)] = true
if ( response ) then
-- were we redirected?
if ( response.location ) then
-- was the link absolute?
local link = response.location[#response.location]
if ( link:match("^http") ) then
url = URL:new(link)
-- guess not
else
url.path = link
end
end
-- if we got a response body, proceed to scrape it for links
if ( response.body ) then
local links = LinkExtractor:new(url, response.body, self.options):getLinks()
self.urlqueue:add(links)
end
else
response = { body = "", headers = {} }
end
table.insert(response_queue, { true, { url = url, response = response } } )
while ( PREFETCH_SIZE < #response_queue ) do
stdnse.print_debug(2, "%s: Response queue full, waiting ...", LIBRARY_NAME)
condvar "wait"
end
condvar "signal"
end
condvar "signal"
end,
-- Loads the arguments set at the script level
loadScriptArguments = function(self)
local sn = self.options.scriptname
if ( not(sn) ) then
stdnse.print_debug("%s: WARNING: Script argument could not be loaded as scriptname was not set", LIBRARY_NAME)
return
end
if ( nil == self.options.maxdepth ) then
self.options.maxdepth = tonumber(stdnse.get_script_args(sn .. ".maxdepth"))
end
if ( nil == self.options.maxpagecount ) then
self.options.maxpagecount = tonumber(stdnse.get_script_args(sn .. ".maxpagecount"))
end
if ( nil == self.url ) then
self.url = stdnse.get_script_args(sn .. ".url")
end
if ( nil == self.options.withinhost ) then
self.options.withinhost = stdnse.get_script_args(sn .. ".withinhost")
end
if ( nil == self.options.withindomain ) then
self.options.withindomain = stdnse.get_script_args(sn .. ".withindomain")
end
if ( nil == self.options.noblacklist ) then
self.options.noblacklist = stdnse.get_script_args(sn .. ".noblacklist")
end
if ( nil == self.options.useheadfornonwebfiles ) then
self.options.useheadfornonwebfiles = stdnse.get_script_args(sn .. ".useheadfornonwebfiles")
end
end,
-- Loads the arguments set at the library level
loadLibraryArguments = function(self)
local ln = LIBRARY_NAME
if ( nil == self.options.maxdepth ) then
self.options.maxdepth = tonumber(stdnse.get_script_args(ln .. ".maxdepth"))
end
if ( nil == self.options.maxpagecount ) then
self.options.maxpagecount = tonumber(stdnse.get_script_args(ln .. ".maxpagecount"))
end
if ( nil == self.url ) then
self.url = stdnse.get_script_args(ln .. ".url")
end
if ( nil == self.options.withinhost ) then
self.options.withinhost = stdnse.get_script_args(ln .. ".withinhost")
end
if ( nil == self.options.withindomain ) then
self.options.withindomain = stdnse.get_script_args(ln .. ".withindomain")
end
if ( nil == self.options.noblacklist ) then
self.options.noblacklist = stdnse.get_script_args(ln .. ".noblacklist")
end
if ( nil == self.options.useheadfornonwebfiles ) then
self.options.useheadfornonwebfiles = stdnse.get_script_args(ln .. ".useheadfornonwebfiles")
end
end,
-- Loads any defaults for arguments that were not set
loadDefaultArguments = function(self)
local function tobool(b)
if ( nil == b ) then
return
end
assert("string" == type(b) or "boolean" == type(b) or "number" == type(b), "httpspider: tobool failed, unsupported type")
if ( "string" == type(b) ) then
if ( "true" == b ) then
return true
else
return false
end
elseif ( "number" == type(b) ) then
if ( 1 == b ) then
return true
else
return false
end
end
return b
end
-- fixup some booleans to make sure they're actually booleans
self.options.withinhost = tobool(self.options.withinhost)
self.options.withindomain = tobool(self.options.withindomain)
self.options.noblacklist = tobool(self.options.noblacklist)
self.options.useheadfornonwebfiles = tobool(self.options.useheadfornonwebfiles)
if ( self.options.withinhost == nil ) then
if ( self.options.withindomain ~= true ) then
self.options.withinhost = true
else
self.options.withinhost = false
end
end
if ( self.options.withindomain == nil ) then
self.options.withindomain = false
end
self.options.maxdepth = self.options.maxdepth or 3
self.options.maxpagecount = self.options.maxpagecount or 20
self.url = self.url or '/'
end,
-- gets a string of limitations imposed on the crawl
getLimitations = function(self)
local o = self.options
local limits = {}
if ( o.maxdepth > 0 or o.maxpagecount > 0 or
o.withinhost or o.withindomain ) then
if ( o.maxdepth > 0 ) then
table.insert(limits, ("maxdepth=%d"):format(o.maxdepth))
end
if ( o.maxpagecount > 0 ) then
table.insert(limits, ("maxpagecount=%d"):format(o.maxpagecount))
end
if ( o.withindomain ) then
table.insert(limits, ("withindomain=%s"):format(o.base_url:getDomain() or o.base_url:getHost()))
end
if ( o.withinhost ) then
table.insert(limits, ("withinhost=%s"):format(o.base_url:getHost()))
end
end
if ( #limits > 0 ) then
return ("Spidering limited to: %s"):format(stdnse.strjoin("; ", limits))
end
end,
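-- With the default options this yields a debug string along the lines of:
--   Spidering limited to: maxdepth=3; maxpagecount=20; withinhost=example.com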
-- runs the crawl and returns the next result from the response queue
crawl = function(self)
self.response_queue = self.response_queue or {}
local condvar = nmap.condvar(self.response_queue)
if ( not(self.thread) ) then
self.thread = stdnse.new_thread(self.crawl_thread, self, self.response_queue)
end
if ( #self.response_queue == 0 and coroutine.status(self.thread) ~= 'dead') then
condvar "wait"
end
condvar "signal"
if ( #self.response_queue == 0 ) then
return false, { err = false, msg = "No more urls" }
else
return table.unpack(table.remove(self.response_queue, 1))
end
end,
-- signals the crawler to stop
stop = function(self)
local condvar = nmap.condvar(self.response_queue)
self.quit = true
condvar "signal"
if ( coroutine.status(self.thread) == "dead" ) then
return
end
condvar "wait"
end
}
return _ENV;