---
-- A smallish httpspider library providing basic spidering capabilities.
-- It consists of the following classes:
--
-- * <code>Options</code>
-- ** This class is responsible for handling library options.
--
-- * <code>LinkExtractor</code>
-- ** This class contains code responsible for extracting urls from web pages.
--
-- * <code>URL</code>
-- ** This class contains code to parse and process URLs.
--
-- * <code>UrlQueue</code>
-- ** This class contains a queue of the next links to process.
--
-- * <code>Crawler</code>
-- ** This class is responsible for the actual crawling.
--
-- The following sample code shows how the spider could be used:
-- <code>
--   local crawler = httpspider.Crawler:new( host, port, '/', { scriptname = SCRIPT_NAME } )
--   crawler:set_timeout(10000)
--
--   local result
--   while(true) do
--     local status, r = crawler:crawl()
--     if ( not(status) ) then
--       break
--     end
--     if ( r.response.body:match(str_match) ) then
--       crawler:stop()
--       result = r.url
--       break
--     end
--   end
--
--   return result
-- </code>
--
-- @author Patrik Karlsson <patrik@cqure.net>
--
-- @args httpspider.maxdepth the maximum number of directories beneath
--       the initial url to spider. A negative value disables the limit.
--       (default: 3)
-- @args httpspider.maxpagecount the maximum number of pages to visit.
--       A negative value disables the limit. (default: 20)
-- @args httpspider.url the url to start spidering. This is a URL
--       relative to the scanned host, e.g. /default.html (default: /)
-- @args httpspider.withinhost only spider URLs within the same host.
--       (default: true)
-- @args httpspider.withindomain only spider URLs within the same
--       domain. This widens the scope from <code>withinhost</code> and
--       cannot be used in combination with it. (default: false)
-- @args httpspider.noblacklist if set, doesn't load the default blacklist
-- @args httpspider.useheadfornonwebfiles if set, the crawler will use
--       HEAD instead of GET for files that do not have extensions
--       indicating that they are webpages (the list of webpage extensions
--       is located in nselib/data/http-web-files-extensions.lst)
--
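
-- When <code>crawl</code> returns a false status, the accompanying table
-- explains why the crawl ended. The following sketch is illustrative only
-- (not part of the library); it shows one way a script might separate real
-- errors, reported through the <code>err</code> and <code>reason</code>
-- fields, from normal termination reported through <code>msg</code>:
-- <code>
--   local status, r = crawler:crawl()
--   if ( not(status) ) then
--     if ( r.err ) then
--       return stdnse.format_output(false, ("ERROR: %s"):format(r.reason))
--     end
--     -- r.msg holds benign reasons such as "No more urls"
--   end
-- </code>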

local coroutine = require "coroutine"
local http = require "http"
local io = require "io"
local nmap = require "nmap"
local stdnse = require "stdnse"
local string = require "string"
local table = require "table"
local url = require "url"

_ENV = stdnse.module("httpspider", stdnse.seeall)

local LIBRARY_NAME = "httpspider"
local PREFETCH_SIZE = 5

-- The Options class, handling all spidering options
Options = {

  new = function(self, options)
    local o = { }

    -- copy all options as class members
    for k, v in pairs(options) do o[k] = v end

    -- set a few default values
    o.timeout = options.timeout or 10000
    o.whitelist = o.whitelist or {}
    o.blacklist = o.blacklist or {}
    local removewww = function(url) return (string.gsub(url, "^www%.", "")) end

    if ( o.withinhost == true or o.withindomain == true ) then
      -- set up the appropriate matching functions
      if ( o.withinhost ) then
        o.withinhost = function(u)
          local parsed_u = url.parse(tostring(u))

          if ( o.base_url:getPort() ~= 80 and o.base_url:getPort() ~= 443 ) then
            if ( tonumber(parsed_u.port) ~= tonumber(o.base_url:getPort()) ) then
              return false
            end
          elseif ( parsed_u.scheme ~= o.base_url:getProto() ) then
            return false
          -- if the urls differ only in the "www" prefix, they most likely point to the same host
          elseif ( parsed_u.host == nil or removewww(parsed_u.host:lower()) ~= removewww(o.base_url:getHost():lower()) ) then
            return false
          end
          return true
        end
      else
        o.withindomain = function(u)
          local parsed_u = url.parse(tostring(u))
          if ( o.base_url:getPort() ~= 80 and o.base_url:getPort() ~= 443 ) then
            if ( tonumber(parsed_u.port) ~= tonumber(o.base_url:getPort()) ) then
              return false
            end
          elseif ( parsed_u.scheme ~= o.base_url:getProto() ) then
            return false
          elseif ( parsed_u.host == nil or parsed_u.host:sub(-#o.base_url:getDomain()):lower() ~= o.base_url:getDomain():lower() ) then
            return false
          end
          return true
        end
      end
    end
    setmetatable(o, self)
    self.__index = self
    return o
  end,

  addWhitelist = function(self, func) table.insert(self.whitelist, func) end,
  addBlacklist = function(self, func) table.insert(self.blacklist, func) end,

}
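
-- Scripts can refine the filtering by registering their own whitelist and
-- blacklist functions. Each function receives a <code>URL</code> instance and
-- should return a true value on a match; a blacklist match marks a link as
-- invalid, after which a whitelist match can mark it valid again (see
-- <code>LinkExtractor.validate_link</code>). A commented sketch, assuming the
-- usual host/port/SCRIPT_NAME variables of a calling script:
-- <code>
--   local crawler = httpspider.Crawler:new(host, port, '/', { scriptname = SCRIPT_NAME })
--   -- never follow anything below /logout, to keep the session alive
--   crawler.options:addBlacklist(function(url)
--     return url:getPath():lower():match("^/logout")
--   end)
--   -- the default blacklist skips .pdf files; re-allow them here
--   crawler.options:addWhitelist(function(url)
--     return url:getPath():lower():match("%.pdf$")
--   end)
-- </code>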

-- Placeholder for form extraction code
FormExtractor = {

}

LinkExtractor = {

  -- Creates a new instance of LinkExtractor
  -- @param url URL instance of the page the html was retrieved from
  -- @param html string containing the html to parse for links
  -- @param options Options instance, as passed by the Crawler
  -- @return o instance of LinkExtractor
  new = function(self, url, html, options)
    local o = {
      url = url,
      html = html,
      links = {},
      options = options,
    }
    setmetatable(o, self)
    self.__index = self
    o:parse()
    return o
  end,

  -- Is the link absolute or not?
  isAbsolute = function(url)
    -- at this point we don't care about the protocol, and we don't
    -- require "//" after the scheme, so that links such as
    -- feed:http://example.com/rss.xml are treated as absolute as well
    return ( url:match('^%w*:') ~= nil )
  end,

  -- Creates an absolute link from a relative one, based on the base_url.
  -- The functionality is very simple and does not take any ../../ into
  -- consideration.
  --
  -- @param base_url URL containing the page url from which the links were
  --        extracted
  -- @param rel_url string containing the relative portion of the URL
  -- @param base_href string containing the base href of the page, if any
  -- @return link string containing the absolute link
  createAbsolute = function(base_url, rel_url, base_href)

    -- is the link relative with a leading slash? e.g. /dir1/foo.html
    local leading_slash = rel_url:match("^/")
    rel_url = rel_url:match("^/?(.*)") or '/'

    -- make sure the base href ends with a trailing slash
    if ( base_href and not(base_href:match("/$") ) ) then
      base_href = base_href .. '/'
    end

    if ( ( base_url:getProto() == 'https' and base_url:getPort() == 443 ) or
        ( base_url:getProto() == 'http' and base_url:getPort() == 80 ) ) then

      if ( leading_slash ) then
        return ("%s://%s/%s"):format(base_url:getProto(), base_url:getHost(), rel_url)
      else
        if ( base_href ) then
          return ("%s%s"):format(base_href, rel_url)
        else
          return ("%s://%s%s%s"):format(base_url:getProto(), base_url:getHost(), base_url:getDir(), rel_url)
        end
      end
    else
      if ( leading_slash ) then
        return ("%s://%s:%d/%s"):format(base_url:getProto(), base_url:getHost(), base_url:getPort(), rel_url)
      else
        if ( base_href ) then
          return ("%s%s"):format(base_href, rel_url)
        else
          return ("%s://%s:%d%s%s"):format(base_url:getProto(), base_url:getHost(), base_url:getPort(), base_url:getDir(), rel_url)
        end
      end
    end
  end,
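
  -- A few illustrative, commented examples of how createAbsolute resolves
  -- relative links (the example.com urls are made up for the sketch):
  -- <code>
  --   local base = URL:new("http://example.com/dir/index.html")
  --   LinkExtractor.createAbsolute(base, "foo.html")  --> http://example.com/dir/foo.html
  --   LinkExtractor.createAbsolute(base, "/top.html") --> http://example.com/top.html
  --
  --   local alt = URL:new("http://example.com:8080/dir/index.html")
  --   LinkExtractor.createAbsolute(alt, "foo.html")   --> http://example.com:8080/dir/foo.html
  -- </code>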

  -- Gets the depth of the link, relative to our base url, e.g.
  -- base_url = http://www.cqure.net/wp/
  -- url = http://www.cqure.net/wp/ - depth: 0
  -- url = http://www.cqure.net/wp/index.php - depth: 0
  -- url = http://www.cqure.net/wp/2011/index.php - depth: 1
  -- url = http://www.cqure.net/index.html - depth: -1
  --
  -- @param url instance of URL
  -- @return depth number containing the depth relative to the base_url
  getDepth = function(self, url)
    local base_dir, url_dir = self.options.base_url:getDir(), url:getDir()
    if ( url_dir and base_dir ) then
      local m = url_dir:match(base_dir.."(.*)")
      if ( not(m) ) then
        return -1
      else
        local _, depth = m:gsub("/", "/")
        return depth
      end
    end
  end,

  -- Validates a link against the crawling options (depth, scope, white-
  -- and blacklists)
  -- @param url instance of URL
  -- @return valid boolean, true if the link should be followed
  validate_link = function(self, url)
    local valid = true

    -- if our url is nil, abort; this could be due to a number of reasons,
    -- such as unsupported protocols (javascript, mailto, ...) or that the
    -- URL failed to parse for some reason
    if ( url == nil or tostring(url) == nil ) then
      return false
    end

    -- linkdepth trumps whitelisting
    if ( self.options.maxdepth and self.options.maxdepth >= 0 ) then
      local depth = self:getDepth( url )
      if ( -1 == depth or depth > self.options.maxdepth ) then
        stdnse.print_debug(3, "%s: Skipping link depth: %d; b_url=%s; url=%s", LIBRARY_NAME, depth, tostring(self.options.base_url), tostring(url))
        return false
      end
    end

    -- withindomain trumps any whitelisting
    if ( self.options.withindomain ) then
      if ( not(self.options.withindomain(url)) ) then
        stdnse.print_debug(2, "%s: Link is not within domain: %s", LIBRARY_NAME, tostring(url))
        return false
      end
    end

    -- withinhost trumps any whitelisting
    if ( self.options.withinhost ) then
      if ( not(self.options.withinhost(url)) ) then
        stdnse.print_debug(2, "%s: Link is not within host: %s", LIBRARY_NAME, tostring(url))
        return false
      end
    end

    -- run through all blacklists
    if ( #self.options.blacklist > 0 ) then
      for _, func in ipairs(self.options.blacklist) do
        if ( func(url) ) then
          stdnse.print_debug(2, "%s: Blacklist match: %s", LIBRARY_NAME, tostring(url))
          valid = false
          break
        end
      end
    end

    -- check the url against our whitelist
    if ( #self.options.whitelist > 0 ) then
      for _, func in ipairs(self.options.whitelist) do
        if ( func(url) ) then
          stdnse.print_debug(2, "%s: Whitelist match: %s", LIBRARY_NAME, tostring(url))
          valid = true
          break
        end
      end
    end
    return valid
  end,

  -- Parses an HTML response and extracts all links it can find.
  -- The function currently supports href, src and action links.
  -- All behaviour options, such as depth, white- and blacklists, are
  -- processed in here as well.
  parse = function(self)
    local links = {}
    local patterns = {
      '[hH][rR][eE][fF]%s*=%s*[\'"]%s*([^"^\']-)%s*[\'"]',
      '[hH][rR][eE][fF]%s*=%s*([^\'\"][^%s>]+)',
      '[sS][rR][cC]%s*=%s*[\'"]%s*([^"^\']-)%s*[\'"]',
      '[sS][rR][cC]%s*=%s*([^\'\"][^%s>]+)',
      '[aA][cC][tT][iI][oO][nN]%s*=%s*[\'"]%s*([^"^\']+%s*)[\'"]',
    }

    local base_hrefs = {
      '[Bb][Aa][Ss][Ee]%s*[Hh][Rr][Ee][Ff]%s*=%s*[\'"](%s*[^"^\']+%s*)[\'"]',
      '[Bb][Aa][Ss][Ee]%s*[Hh][Rr][Ee][Ff]%s*=%s*([^\'\"][^%s>]+)'
    }

    -- check for a base href tag; if present it overrides the page url when
    -- resolving relative links
    local base_href
    for _, pattern in ipairs(base_hrefs) do
      base_href = self.html:match(pattern)
      if ( base_href ) then
        break
      end
    end

    for _, pattern in ipairs(patterns) do
      for l in self.html:gmatch(pattern) do
        local link = l
        if ( not(LinkExtractor.isAbsolute(l)) ) then
          link = LinkExtractor.createAbsolute(self.url, l, base_href)
        end

        local url = URL:new(link)
        local valid = self:validate_link(url)

        if ( valid ) then
          stdnse.print_debug(3, "%s: Adding link: %s", LIBRARY_NAME, tostring(url))
          links[tostring(url)] = true
        elseif ( tostring(url) ) then
          stdnse.print_debug(3, "%s: Skipping url: %s", LIBRARY_NAME, link)
        end
      end
    end

    for link in pairs(links) do
      table.insert(self.links, link)
    end

  end,

  -- Gets a table containing all of the retrieved URLs, after filtering
  -- has been applied.
  getLinks = function(self) return self.links end,

}
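
-- LinkExtractor is normally driven by the Crawler, but nothing prevents
-- using it on its own. A commented sketch, assuming <code>body</code>
-- holds an HTML page fetched from the made-up base url below:
-- <code>
--   local base = URL:new("http://example.com/dir/")
--   local opts = Options:new({ base_url = base, maxdepth = 3 })
--   local extractor = LinkExtractor:new(base, body, opts)
--   for _, link in ipairs(extractor:getLinks()) do
--     stdnse.print_debug(1, "found link: %s", link)
--   end
-- </code>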

-- The URL class, containing code to process URLs
-- This class is heavily inspired by the Java URL class
URL = {

  -- Creates a new instance of URL
  -- @param url string containing the text representation of a URL
  -- @return o instance of URL if parsing was successful,
  --         nil in case parsing fails
  new = function(self, url)
    local o = {
      raw = url,
    }

    setmetatable(o, self)
    self.__index = self
    if ( o:parse() ) then
      return o
    end
  end,

  -- Parses the string representation of the URL and splits it into its
  -- different components
  -- @return status true on success, false on failure
  parse = function(self)
    self.proto, self.host, self.port = self.raw:match("^(http[s]?)://([^:/]*)[:]?(%d*)")
    if ( self.proto and self.host ) then
      self.file = self.raw:match("^http[s]?://[^:/]*[:]?%d*(/[^#]*)") or '/'
      self.port = tonumber(self.port)
      if ( not(self.port) ) then
        if ( self.proto:match("https") ) then
          self.port = 443
        elseif ( self.proto:match("http") ) then
          self.port = 80
        end
      end

      self.path = self.file:match("^([^?]*)[%?]?")
      self.dir = self.path:match("^(.+%/)") or "/"
      self.domain = self.host:match("^[^%.]-%.(.*)")
      return true
    elseif ( self.raw:match("^javascript:") ) then
      stdnse.print_debug(2, "%s: Skipping javascript url: %s", LIBRARY_NAME, self.raw)
    elseif ( self.raw:match("^mailto:") ) then
      stdnse.print_debug(2, "%s: Skipping mailto link: %s", LIBRARY_NAME, self.raw)
    else
      stdnse.print_debug(2, "%s: WARNING: Failed to parse url: %s", LIBRARY_NAME, self.raw)
    end
    return false
  end,

  -- Gets the host portion of the URL
  -- @return host string containing the hostname
  getHost = function(self) return self.host end,

  -- Gets the protocol portion of the URL
  -- @return proto string containing the protocol (i.e. http, https)
  getProto = function(self) return self.proto end,

  -- Gets the file component of the URL, i.e. the path including any query string
  -- @return file string containing the path and query components of the url
  getFile = function(self) return self.file end,

  -- Gets the port component of the URL
  -- @return port number containing the port of the URL
  getPort = function(self) return self.port end,

  -- Gets the path component of the URL
  -- @return the full path and filename of the URL
  getPath = function(self) return self.path end,

  -- Gets the directory component of the URL
  -- @return directory string containing the directory part of the URL
  getDir = function(self) return self.dir end,

  -- Gets the domain component of the URL
  -- @return domain string containing the hosts domain
  getDomain = function(self)
    if ( self.domain ) then
      return self.domain
    -- fall back to the host, if we can't find a domain
    else
      return self.host
    end
  end,

  -- Converts the URL to a string
  -- @return url string containing the string representation of the url
  __tostring = function(self) return self.raw end,
}
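
-- A short, commented illustration of what URL extracts from a raw url
-- string (the www.example.com address is made up):
-- <code>
--   local u = URL:new("http://www.example.com:8080/dir/file.php?a=1")
--   u:getProto()  --> "http"
--   u:getHost()   --> "www.example.com"
--   u:getPort()   --> 8080
--   u:getFile()   --> "/dir/file.php?a=1"
--   u:getPath()   --> "/dir/file.php"
--   u:getDir()    --> "/dir/"
--   u:getDomain() --> "example.com"
-- </code>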

-- The UrlQueue, a FIFO queue holding the urls waiting to be crawled
UrlQueue = {

  -- Creates a new instance of UrlQueue
  -- @param options table containing options
  -- @return o new instance of UrlQueue
  new = function(self, options)
    local o = {
      urls = {},
      options = options
    }
    setmetatable(o, self)
    self.__index = self
    return o
  end,

  -- Gets the next available url in the queue
  getNext = function(self)
    return table.remove(self.urls, 1)
  end,

  -- Adds a new url to the queue
  -- @param url can be either a string, a URL instance or a table of URLs
  add = function(self, url)
    assert( type(url) == 'string' or type(url) == 'table', "url was neither a string nor a table")

    -- normalize the argument to a table of urls; string elements are
    -- converted to URL instances inside the loop below
    local urls = ( 'string' == type(url) ) and { url } or url

    -- if it's a table, it can be either a single URL or an array of URLs
    if ( 'table' == type(url) and url.raw ) then
      urls = { url }
    end

    for _, u in ipairs(urls) do
      u = ( 'string' == type(u) ) and URL:new(u) or u
      if ( u ) then
        table.insert(self.urls, u)
      else
        stdnse.print_debug("ERROR: Invalid URL: %s", url)
      end
    end
  end,

  -- Dumps the contents of the UrlQueue
  dump = function(self)
    for _, url in ipairs(self.urls) do
      print("url:", url)
    end
  end,

}
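
-- The queue accepts a few different argument forms; a commented sketch
-- with made-up example.com urls:
-- <code>
--   local queue = UrlQueue:new(options)
--   queue:add("http://example.com/")                 -- a string
--   queue:add(URL:new("http://example.com/a.html"))  -- a single URL instance
--   queue:add({ URL:new("http://example.com/b.html"),
--               "http://example.com/c.html" })       -- a table of either
--   local next_url = queue:getNext()
-- </code>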

-- The Crawler class
Crawler = {

  -- Creates a new instance of the Crawler
  -- @param host table as received by the action method
  -- @param port table as received by the action method
  -- @param url string containing the relative URL
  -- @param options table of options:
  --        <code>noblacklist</code> - do not load the default blacklist
  --        <code>base_url</code> - start url to crawl
  --        <code>timeout</code> - timeout for the http request
  --        <code>maxdepth</code> - the maximum directory depth to crawl
  --        <code>maxpagecount</code> - the maximum number of pages to retrieve
  --        <code>withinhost</code> - stay within the host of the base_url
  --        <code>withindomain</code> - stay within the base_url domain
  --        <code>scriptname</code> - should be set to SCRIPT_NAME to enable
  --                                  script specific arguments
  --        <code>redirect_ok</code> - redirect_ok closure to pass to the http.get function
  -- @return o new instance of Crawler or nil on failure
  new = function(self, host, port, url, options)
    local o = {
      host = host,
      port = port,
      url = url,
      options = options or {},
      basethread = stdnse.base(),
    }

    setmetatable(o, self)
    self.__index = self

    o:loadScriptArguments()
    o:loadLibraryArguments()
    o:loadDefaultArguments()

    -- do an initial request against the web root to determine whether the
    -- service talks SSL
    local response = http.get(o.host, o.port, '/', { timeout = o.options.timeout, redirect_ok = o.options.redirect_ok } )

    if ( not(response) or 'table' ~= type(response) ) then
      return
    end

    o.url = o.url:match("/?(.*)")

    local u_host = o.host.targetname or o.host.name
    if ( not(u_host) or 0 == #u_host ) then
      u_host = o.host.ip
    end
    local u = ("%s://%s:%d/%s"):format(response.ssl and "https" or "http", u_host, o.port.number, o.url)
    o.options.base_url = URL:new(u)
    o.options = Options:new(o.options)
    o.urlqueue = UrlQueue:new(o.options)
    o.urlqueue:add(o.options.base_url)

    o.options.timeout = o.options.timeout or 10000
    o.processed = {}

    -- script arguments have precedence
    if ( not(o.options.maxdepth) ) then
      o.options.maxdepth = tonumber(stdnse.get_script_args("httpspider.maxdepth"))
    end

    -- script arguments have precedence
    if ( not(o.options.maxpagecount) ) then
      o.options.maxpagecount = tonumber(stdnse.get_script_args("httpspider.maxpagecount"))
    end

    if ( not(o.options.noblacklist) ) then
      o:addDefaultBlacklist()
    end

    if ( o.options.useheadfornonwebfiles ) then
      -- Load web file extensions from a file in the nselib/data folder.
      -- For more information on individual file formats, see
      -- http://en.wikipedia.org/wiki/List_of_file_formats.
      o.web_files_extensions = {}
      local f = nmap.fetchfile("nselib/data/http-web-files-extensions.lst")
      if f then
        for l in io.lines(f) do
          table.insert(o.web_files_extensions, l)
        end
      end
    end

    stdnse.print_debug(2, "%s: %s", LIBRARY_NAME, o:getLimitations())

    return o
  end,
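
  -- Options passed to the constructor take precedence over both script and
  -- library arguments. A commented sketch with arbitrary example values:
  -- <code>
  --   local crawler = httpspider.Crawler:new(host, port, '/', {
  --     scriptname = SCRIPT_NAME,   -- enables per-script arguments such as <scriptname>.maxdepth
  --     maxdepth = -1,              -- no directory depth limit
  --     maxpagecount = 40,          -- stop after 40 pages
  --     useheadfornonwebfiles = true,
  --   })
  -- </code>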

  -- Sets the timeout used by the http library
  -- @param timeout number containing the timeout in ms.
  set_timeout = function(self, timeout)
    self.options.timeout = timeout
  end,

  -- Gets the number of pages that have been retrieved
  -- @return count number of pages retrieved by the instance
  getPageCount = function(self)
    local count = 1
    for url in pairs(self.processed) do
      count = count + 1
    end
    return count
  end,

  -- Adds a default blacklist blocking binary files such as images,
  -- compressed archives and executable files
  addDefaultBlacklist = function(self)
    local extensions = {
      image_extensions = {"png","jpg","jpeg","gif","bmp"},
      video_extensions = {"avi","flv","ogg","mp4","wmv"},
      audio_extensions = {"aac","m4a","mp3","wav"},
      doc_extensions = {"pdf", "doc", "docx", "docm", "xls", "xlsx", "xlsm",
        "ppt", "pptx", "pptm", "odf", "ods", "odp", "ps", "xps"},
      archive_extensions = {"zip", "tar.gz", "gz", "rar", "7z", "sit", "sitx",
        "tgz", "tar.bz", "tar", "iso"},
      exe_extensions = {"exe", "com", "msi", "bin","dmg"}
    }
    local blacklist = {}
    for _, cat in pairs(extensions) do
      for _, ext in ipairs(cat) do
        -- match a literal dot followed by the extension at the end of the path
        table.insert(blacklist, string.format("%%.%s$", ext))
      end
    end

    self.options:addBlacklist( function(url)
      local p = url:getPath():lower()
      for _, pat in ipairs(blacklist) do
        if ( p:match(pat) ) then
          return true
        end
      end
    end )
  end,

  -- Does the heavy crawling, running in a separate thread started by
  -- <code>crawl</code>
  --
  -- The crawler may exit due to a number of different reasons, including
  -- invalid options, reaching the max page count or simply running out of
  -- links. We return a false status for all of these, and when the error was
  -- unexpected or requires attention we set the <code>err</code> field of
  -- the result and provide the details in <code>reason</code>, so that the
  -- script can alert the user.
  crawl_thread = function(self, response_queue)
    local condvar = nmap.condvar(response_queue)

    if ( false ~= self.options.withinhost and false ~= self.options.withindomain ) then
      table.insert(response_queue, { false, { err = true, reason = "Invalid options: withinhost and withindomain can't both be true" } })
      condvar "signal"
      return
    end

    while(true) do

      if ( self.quit or coroutine.status(self.basethread) == 'dead' ) then
        table.insert(response_queue, {false, { err = false, msg = "Quit signalled by crawler" } })
        break
      end

      -- in case the user set a max page count to retrieve, check how many
      -- pages we have retrieved so far
      local count = self:getPageCount()
      if ( self.options.maxpagecount and
          ( count > self.options.maxpagecount ) ) then
        table.insert(response_queue, { false, { err = false, msg = "Reached max page count" } })
        condvar "signal"
        return
      end

      -- pull links from the queue until we get one we haven't processed yet
      local url
      repeat
        url = self.urlqueue:getNext()
      until( not(url) or not(self.processed[tostring(url)]) )

      -- if no url could be retrieved from the queue, abort ...
      if ( not(url) ) then
        table.insert(response_queue, { false, { err = false, msg = "No more urls" } })
        condvar "signal"
        return
      end

      if ( self.options.maxpagecount ) then
        stdnse.print_debug(2, "%s: Fetching url [%d of %d]: %s", LIBRARY_NAME, count, self.options.maxpagecount, tostring(url))
      else
        stdnse.print_debug(2, "%s: Fetching url: %s", LIBRARY_NAME, tostring(url))
      end

      local response
      -- in case we want to use HEAD rather than GET for files with certain extensions
      if ( self.options.useheadfornonwebfiles ) then
        local is_web_file = false
        local file = url:getPath():lower()
        -- check if we are at a URL with 'no extension', for example: nmap.org/6
        if string.match(file, ".*(/[^/%.]*)$") or string.match(file, "/$") then is_web_file = true end
        if not is_web_file then
          for _, v in pairs(self.web_files_extensions) do
            if string.match(file, "%."..v.."$") then
              is_web_file = true
              break
            end
          end
        end
        if is_web_file then
          stdnse.print_debug(2, "%s: Using GET: %s", LIBRARY_NAME, file)
          response = http.get(url:getHost(), url:getPort(), url:getFile(), { timeout = self.options.timeout, redirect_ok = self.options.redirect_ok } )
        else
          stdnse.print_debug(2, "%s: Using HEAD: %s", LIBRARY_NAME, file)
          response = http.head(url:getHost(), url:getPort(), url:getFile())
        end
      else
        -- fetch the url, and then push it to the processed table
        response = http.get(url:getHost(), url:getPort(), url:getFile(), { timeout = self.options.timeout, redirect_ok = self.options.redirect_ok } )
      end

      self.processed[tostring(url)] = true

      if ( response ) then
        -- were we redirected?
        if ( response.location ) then
          -- was the link absolute?
          local link = response.location[#response.location]
          if ( link:match("^http") ) then
            url = URL:new(link)
          -- guess not
          else
            url.path = link
          end
        end
        -- if we have a response body, proceed with scraping it
        if ( response.body ) then
          local links = LinkExtractor:new(url, response.body, self.options):getLinks()
          self.urlqueue:add(links)
        end
      else
        response = { body = "", headers = {} }
      end
      table.insert(response_queue, { true, { url = url, response = response } } )
      while ( PREFETCH_SIZE < #response_queue ) do
        stdnse.print_debug(2, "%s: Response queue full, waiting ...", LIBRARY_NAME)
        condvar "wait"
      end
      condvar "signal"
    end
    condvar "signal"
  end,

  -- Loads the arguments set at the script level
  loadScriptArguments = function(self)
    local sn = self.options.scriptname
    if ( not(sn) ) then
      stdnse.print_debug("%s: WARNING: Script arguments could not be loaded as scriptname was not set", LIBRARY_NAME)
      return
    end

    if ( nil == self.options.maxdepth ) then
      self.options.maxdepth = tonumber(stdnse.get_script_args(sn .. ".maxdepth"))
    end
    if ( nil == self.options.maxpagecount ) then
      self.options.maxpagecount = tonumber(stdnse.get_script_args(sn .. ".maxpagecount"))
    end
    if ( nil == self.url ) then
      self.url = stdnse.get_script_args(sn .. ".url")
    end
    if ( nil == self.options.withinhost ) then
      self.options.withinhost = stdnse.get_script_args(sn .. ".withinhost")
    end
    if ( nil == self.options.withindomain ) then
      self.options.withindomain = stdnse.get_script_args(sn .. ".withindomain")
    end
    if ( nil == self.options.noblacklist ) then
      self.options.noblacklist = stdnse.get_script_args(sn .. ".noblacklist")
    end
    if ( nil == self.options.useheadfornonwebfiles ) then
      self.options.useheadfornonwebfiles = stdnse.get_script_args(sn .. ".useheadfornonwebfiles")
    end
  end,

  -- Loads the arguments set at the library level
  loadLibraryArguments = function(self)
    local ln = LIBRARY_NAME

    if ( nil == self.options.maxdepth ) then
      self.options.maxdepth = tonumber(stdnse.get_script_args(ln .. ".maxdepth"))
    end
    if ( nil == self.options.maxpagecount ) then
      self.options.maxpagecount = tonumber(stdnse.get_script_args(ln .. ".maxpagecount"))
    end
    if ( nil == self.url ) then
      self.url = stdnse.get_script_args(ln .. ".url")
    end
    if ( nil == self.options.withinhost ) then
      self.options.withinhost = stdnse.get_script_args(ln .. ".withinhost")
    end
    if ( nil == self.options.withindomain ) then
      self.options.withindomain = stdnse.get_script_args(ln .. ".withindomain")
    end
    if ( nil == self.options.noblacklist ) then
      self.options.noblacklist = stdnse.get_script_args(ln .. ".noblacklist")
    end
    if ( nil == self.options.useheadfornonwebfiles ) then
      self.options.useheadfornonwebfiles = stdnse.get_script_args(ln .. ".useheadfornonwebfiles")
    end
  end,

  -- Loads any defaults for arguments that were not set
  loadDefaultArguments = function(self)
    local function tobool(b)
      if ( nil == b ) then
        return
      end
      assert("string" == type(b) or "boolean" == type(b) or "number" == type(b), "httpspider: tobool failed, unsupported type")
      if ( "string" == type(b) ) then
        if ( "true" == b ) then
          return true
        else
          return false
        end
      elseif ( "number" == type(b) ) then
        if ( 1 == b ) then
          return true
        else
          return false
        end
      end
      return b
    end

    -- fix up some booleans to make sure they're actually booleans
    self.options.withinhost = tobool(self.options.withinhost)
    self.options.withindomain = tobool(self.options.withindomain)
    self.options.noblacklist = tobool(self.options.noblacklist)
    self.options.useheadfornonwebfiles = tobool(self.options.useheadfornonwebfiles)

    if ( self.options.withinhost == nil ) then
      if ( self.options.withindomain ~= true ) then
        self.options.withinhost = true
      else
        self.options.withinhost = false
      end
    end
    if ( self.options.withindomain == nil ) then
      self.options.withindomain = false
    end
    self.options.maxdepth = self.options.maxdepth or 3
    self.options.maxpagecount = self.options.maxpagecount or 20
    self.url = self.url or '/'
  end,

  -- Gets a string describing the limitations imposed on the crawl
  getLimitations = function(self)
    local o = self.options
    local limits = {}
    if ( o.maxdepth > 0 or o.maxpagecount > 0 or
        o.withinhost or o.withindomain ) then
      if ( o.maxdepth > 0 ) then
        table.insert(limits, ("maxdepth=%d"):format(o.maxdepth))
      end
      if ( o.maxpagecount > 0 ) then
        table.insert(limits, ("maxpagecount=%d"):format(o.maxpagecount))
      end
      if ( o.withindomain ) then
        table.insert(limits, ("withindomain=%s"):format(o.base_url:getDomain() or o.base_url:getHost()))
      end
      if ( o.withinhost ) then
        table.insert(limits, ("withinhost=%s"):format(o.base_url:getHost()))
      end
    end

    if ( #limits > 0 ) then
      return ("Spidering limited to: %s"):format(stdnse.strjoin("; ", limits))
    end
  end,

  -- Does the crawling; starts the crawl thread if needed and returns the
  -- next result from the response queue
  -- @return status true on success, false on failure or when the crawl is done
  -- @return r table containing <code>url</code> and <code>response</code> on
  --         success, or <code>err</code>/<code>reason</code>/<code>msg</code>
  --         on failure
  crawl = function(self)
    self.response_queue = self.response_queue or {}
    local condvar = nmap.condvar(self.response_queue)
    if ( not(self.thread) ) then
      self.thread = stdnse.new_thread(self.crawl_thread, self, self.response_queue)
    end

    if ( #self.response_queue == 0 and coroutine.status(self.thread) ~= 'dead') then
      condvar "wait"
    end
    condvar "signal"
    if ( #self.response_queue == 0 ) then
      return false, { err = false, msg = "No more urls" }
    else
      return table.unpack(table.remove(self.response_queue, 1))
    end
  end,

  -- Signals the crawler to stop
  stop = function(self)
    local condvar = nmap.condvar(self.response_queue)
    self.quit = true
    condvar "signal"
    if ( coroutine.status(self.thread) == "dead" ) then
      return
    end
    condvar "wait"
  end
}

return _ENV;