mirror of https://github.com/nmap/nmap.git
synced 2025-12-06 04:31:29 +00:00
httpspider was extracting "links" from javascript if there was a variable called "src" or similar. By requiring an open HTML tag, we eliminate this problem, still matching src, href, or action attributes of any tag.
1073 lines
37 KiB
Lua
---
-- A smallish httpspider library providing basic spidering capabilities.
-- It consists of the following classes:
--
-- * <code>Options</code>
-- ** This class is responsible for handling library options.
--
-- * <code>LinkExtractor</code>
-- ** This class contains code responsible for extracting urls from web pages.
--
-- * <code>URL</code>
-- ** This class contains code to parse and process URLs.
--
-- * <code>UrlQueue</code>
-- ** This class contains a queue of the next links to process.
--
-- * <code>Crawler</code>
-- ** This class is responsible for the actual crawling.
--
-- The following sample code shows how the spider could be used:
-- <code>
-- local crawler = httpspider.Crawler:new( host, port, '/', { scriptname = SCRIPT_NAME } )
-- crawler:set_timeout(10000)
--
-- local result
-- while(true) do
--   local status, r = crawler:crawl()
--   if ( not(status) ) then
--     break
--   end
--   if ( r.response.body:match(str_match) ) then
--     crawler:stop()
--     result = r.url
--     break
--   end
-- end
--
-- return result
-- </code>
--
-- For advanced use, the library supports a number of closures (withinhost,
-- withindomain, doscraping). Note that the withinhost and withindomain options
-- also accept boolean values; you will want to override them with closures only
-- for advanced use. You can define such closures using the following utilities:
--
-- * <code>iswithinhost</code>
-- ** Use this utility to check whether the resource exists within the host.
--
-- * <code>iswithindomain</code>
-- ** Use this utility to check whether the resource exists within the domain.
--
-- * <code>isresource</code>
-- ** Use this utility to check the type of the resource (for example "js").
-- ** An optional third argument may hold the characters that are allowed to
-- follow the extension of the resource. By default, these are '#' and '?'.
-- For example, when matching php resources, the function will also match
-- example.php?query=foo and example.php#foo.
--
-- The following sample code shows an example usage. We override the default
-- withinhost method and allow spidering only of resources within the host
-- that are not "js" or "css".
-- <code>
-- crawler.options.withinhost = function(url)
--   if crawler:iswithinhost(url)
--     and not crawler:isresource(url, "js")
--     and not crawler:isresource(url, "css") then
--     return true
--   end
-- end
-- </code>
--
-- @author Patrik Karlsson <patrik@cqure.net>
--
|
|
-- @args httpspider.maxdepth the maximum number of directories beneath
--       the initial url to spider. A negative value disables the limit.
--       (default: 3)
-- @args httpspider.maxpagecount the maximum number of pages to visit.
--       A negative value disables the limit (default: 20)
-- @args httpspider.url the url to start spidering. This is a URL
--       relative to the scanned host eg. /default.html (default: /)
-- @args httpspider.withinhost Closure that overrides the default withinhost
--       function that only spiders URLs within the same host. If this is
--       set to false the crawler will spider URLs both inside and outside
--       the host. See the closure section above to override the default
--       behaviour. (default: true)
-- @args httpspider.withindomain Closure that overrides the default
--       withindomain function that only spiders URLs within the same
--       domain. This widens the scope from <code>withinhost</code> and can
--       not be used in combination with it. See the closure section above to
--       override the default behaviour. (default: false)
-- @args httpspider.noblacklist if set, doesn't load the default blacklist
-- @args httpspider.useheadfornonwebfiles if set, the crawler will use
--       HEAD instead of GET for files that do not have extensions indicating
--       that they are webpages (the list of webpage extensions is located in
--       nselib/data/http-web-files-extensions.lst)
-- @args httpspider.doscraping Closure that overrides the default doscraping
--       function used to check whether the resource should be scraped (in terms
--       of extracting any links within it). See the closure section above to
--       override the default behaviour.
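--
-- As a usage sketch (the script name and target below are placeholders), these
-- arguments are typically supplied on the nmap command line via
-- <code>--script-args</code>:
-- <code>
-- nmap -p 80 --script <some-http-script> \
--   --script-args 'httpspider.maxdepth=2,httpspider.maxpagecount=40' <target>
-- </code>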
---

local coroutine = require "coroutine"
local http = require "http"
local io = require "io"
local nmap = require "nmap"
local stdnse = require "stdnse"
local string = require "string"
local table = require "table"
local url = require "url"
_ENV = stdnse.module("httpspider", stdnse.seeall)

local LIBRARY_NAME = "httpspider"
local PREFETCH_SIZE = 5

-- The Options class, handling all spidering options
|
|
Options = {
|
|
|
|
new = function(self, options)
|
|
local o = { }
|
|
|
|
-- copy all options as class members
|
|
for k, v in pairs(options) do o[k] = v end
|
|
|
|
-- set a few default values
|
|
o.timeout = options.timeout or 10000
|
|
o.whitelist = o.whitelist or {}
|
|
o.blacklist = o.blacklist or {}
|
|
local removewww = function(url) return string.gsub(url, "^www%.", "") end
|
|
|
|
-- set up the appropriate matching functions
|
|
if ( o.withinhost ) then
|
|
o.withinhost = function(u)
|
|
local parsed_u = url.parse(tostring(u))
|
|
local host = parsed_u.ascii_host or parsed_u.host
|
|
|
|
if ( o.base_url:getPort() ~= 80 and o.base_url:getPort() ~= 443 ) then
|
|
if ( parsed_u.port ~= tonumber(o.base_url:getPort()) ) then
|
|
return false
|
|
end
|
|
elseif ( parsed_u.scheme ~= o.base_url:getProto() ) then
|
|
return false
|
|
-- if the urls differ only by the "www" prefix, they are probably the same host
|
|
elseif ( host == nil or removewww(host:lower()) ~= removewww(o.base_url:getHost():lower()) ) then
|
|
return false
|
|
end
|
|
return true
|
|
end
|
|
end
|
|
if ( o.withindomain ) then
|
|
o.withindomain = function(u)
|
|
local parsed_u = url.parse(tostring(u))
|
|
local host = parsed_u.ascii_host or parsed_u.host
|
|
if ( o.base_url:getPort() ~= 80 and o.base_url:getPort() ~= 443 ) then
|
|
if ( parsed_u.port ~= tonumber(o.base_url:getPort()) ) then
|
|
return false
|
|
end
|
|
elseif ( parsed_u.scheme ~= o.base_url:getProto() ) then
|
|
return false
|
|
elseif ( host == nil or host:sub(-#o.base_url:getDomain()):lower() ~= o.base_url:getDomain():lower() ) then
|
|
return false
|
|
end
|
|
return true
|
|
end
|
|
end
|
|
|
|
if (not o.doscraping) then
|
|
|
|
o.doscraping = function(u)
|
|
return true
|
|
end
|
|
end
|
|
|
|
setmetatable(o, self)
|
|
self.__index = self
|
|
return o
|
|
end,
|
|
|
|
addWhitelist = function(self, func) table.insert(self.whitelist, func) end,
|
|
addBlacklist = function(self, func) table.insert(self.blacklist, func) end,
|
|
|
|
}
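
-- The sketch below is illustrative only: it assumes <code>crawler</code> is an
-- instance created with <code>Crawler:new</code>, and shows how a script might
-- register extra whitelist and blacklist closures. Each closure receives a URL
-- instance and should return true when it matches.
-- <code>
-- -- only follow links below /docs/
-- crawler.options:addWhitelist(function(url)
--   return url:getPath():match("^/docs/") ~= nil
-- end)
-- -- never follow anything that looks like a logout link
-- crawler.options:addBlacklist(function(url)
--   return url:getPath():lower():match("logout") ~= nil
-- end)
-- </code>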
|
|
|
|
-- Placeholder for form extraction code
|
|
FormExtractor = {
|
|
|
|
}
|
|
|
|
LinkExtractor = {
|
|
|
|
-- Creates a new instance of LinkExtractor
|
|
-- @return o instance of LinkExtractor
|
|
new = function(self, url, html, options)
|
|
local o = {
|
|
url = url,
|
|
html = html,
|
|
links = {},
|
|
options = options,
|
|
}
|
|
setmetatable(o, self)
|
|
self.__index = self
|
|
o:parse()
|
|
|
|
return o
|
|
end,
|
|
|
|
-- is the link absolute or not?
|
|
isAbsolute = function(url)
|
|
-- at this point we don't care about the protocol
|
|
-- also, we don't add // to cover stuff like:
|
|
-- feed:http://example.com/rss.xml
|
|
return ( url:match('^%w*:') ~= nil )
|
|
end,
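
-- A few illustrative examples of how the pattern above classifies links:
-- <code>
-- LinkExtractor.isAbsolute("http://example.com/")       --> true
-- LinkExtractor.isAbsolute("mailto:user@example.com")   --> true
-- LinkExtractor.isAbsolute("/images/logo.png")          --> false
-- LinkExtractor.isAbsolute("page.html")                 --> false
-- </code>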
|
|
|
|
-- Creates an absolute link from a relative one based on the base_url
|
|
-- The functionality is very simple and does not take any ../../ into
-- consideration.
|
|
--
|
|
-- @param base_url URL containing the page url from which the links were
|
|
-- extracted
|
|
-- @param rel_url string containing the relative portion of the URL
|
|
-- @return link string containing the absolute link
|
|
createAbsolute = function(base_url, rel_url, base_href)
|
|
|
|
-- is protocol-relative?
|
|
if rel_url:match("^//") then
|
|
return ("%s%s%s"):format(base_url:getProto(), ":", rel_url)
|
|
end
|
|
|
|
-- is relative with leading slash? ie /dir1/foo.html
|
|
local leading_slash = rel_url:match("^/")
|
|
rel_url = rel_url:match("^/?(.*)") or '/'
|
|
|
|
-- check for tailing slash
|
|
if ( base_href and not(base_href:match("/$") ) ) then
|
|
base_href = base_href .. '/'
|
|
end
|
|
|
|
if base_url:getPort() == url.get_default_port(base_url:getProto()) then
|
|
if ( leading_slash ) then
|
|
return ("%s://%s/%s"):format(base_url:getProto(), base_url:getHost(), rel_url)
|
|
else
|
|
if ( base_href ) then
|
|
return ("%s%s"):format(base_href, rel_url)
|
|
else
|
|
return ("%s://%s%s%s"):format(base_url:getProto(), base_url:getHost(), base_url:getDir(), rel_url)
|
|
end
|
|
end
|
|
else
|
|
if ( leading_slash ) then
|
|
return ("%s://%s:%d/%s"):format(base_url:getProto(), base_url:getHost(), base_url:getPort(), rel_url)
|
|
else
|
|
if ( base_href ) then
|
|
return ("%s%s"):format(base_href, rel_url)
|
|
else
|
|
return ("%s://%s:%d%s%s"):format(base_url:getProto(), base_url:getHost(), base_href or base_url:getPort(), base_url:getDir(), rel_url)
|
|
end
|
|
end
|
|
end
|
|
end,
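
-- A worked example (illustrative) of how relative links are resolved, assuming
-- <code>base</code> is a URL instance for http://example.com/dir/index.html
-- served on the default port, with no base href set:
-- <code>
-- LinkExtractor.createAbsolute(base, "/top.html")
--   --> http://example.com/top.html
-- LinkExtractor.createAbsolute(base, "sub/page.html")
--   --> http://example.com/dir/sub/page.html
-- LinkExtractor.createAbsolute(base, "//cdn.example.com/app.js")
--   --> http://cdn.example.com/app.js
-- </code>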
|
|
|
|
-- Gets the depth of the link, relative to our base url eg.
|
|
-- base_url = http://www.cqure.net/wp/
|
|
-- url = http://www.cqure.net/wp/ - depth: 0
|
|
-- url = http://www.cqure.net/wp/index.php - depth: 0
|
|
-- url = http://www.cqure.net/wp/2011/index.php - depth: 1
|
|
-- url = http://www.cqure.net/index.html - depth: -1
|
|
--
|
|
-- @param url instance of URL
|
|
-- @return depth number containing the depth relative to the base_url
|
|
getDepth = function(self, url)
|
|
local base_dir, url_dir = self.options.base_url:getDir(), url:getDir()
|
|
if ( url_dir and base_dir ) then
|
|
local m = url_dir:match(base_dir.."(.*)")
|
|
if ( not(m) ) then
|
|
return -1
|
|
else
|
|
local _, depth = m:gsub("/", "/")
|
|
return depth
|
|
end
|
|
end
|
|
end,
|
|
|
|
validate_link = function(self, url)
|
|
local valid = true
|
|
|
|
-- if our url is nil, abort, this could be due to a number of
|
|
-- reasons such as unsupported protocols: javascript, mail ... or
|
|
-- that the URL failed to parse for some reason
|
|
if ( url == nil or tostring(url) == nil ) then
|
|
return false
|
|
end
|
|
|
|
-- linkdepth trumps whitelisting
|
|
if ( self.options.maxdepth and self.options.maxdepth >= 0 ) then
|
|
local depth = self:getDepth( url )
|
|
if ( -1 == depth or depth > self.options.maxdepth ) then
|
|
stdnse.debug3("%s: Skipping link depth: %d; b_url=%s; url=%s", LIBRARY_NAME, depth, tostring(self.options.base_url), tostring(url))
|
|
return false
|
|
end
|
|
end
|
|
|
|
-- withindomain trumps any whitelisting
|
|
if ( self.options.withindomain ) then
|
|
if ( not(self.options.withindomain(url)) ) then
|
|
stdnse.debug2("%s: Link is not within domain: %s", LIBRARY_NAME, tostring(url))
|
|
return false
|
|
end
|
|
end
|
|
|
|
-- withinhost trumps any whitelisting
|
|
if ( self.options.withinhost ) then
|
|
if ( not(self.options.withinhost(url)) ) then
|
|
stdnse.debug2("%s: Link is not within host: %s", LIBRARY_NAME, tostring(url))
|
|
return false
|
|
end
|
|
end
|
|
|
|
-- run through all blacklists
|
|
if ( #self.options.blacklist > 0 ) then
|
|
for _, func in ipairs(self.options.blacklist) do
|
|
if ( func(url) ) then
|
|
stdnse.debug2("%s: Blacklist match: %s", LIBRARY_NAME, tostring(url))
|
|
valid = false
|
|
break
|
|
end
|
|
end
|
|
end
|
|
|
|
-- check the url against our whitelist
|
|
if ( #self.options.whitelist > 0 ) then
|
|
valid = false
|
|
for _, func in ipairs(self.options.whitelist) do
|
|
if ( func(url) ) then
|
|
stdnse.debug2("%s: Whitelist match: %s", LIBRARY_NAME, tostring(url))
|
|
valid = true
|
|
break
|
|
end
|
|
end
|
|
end
|
|
return valid
|
|
end,
|
|
|
|
-- Parses an HTML response and extracts all links it can find.
-- The function currently supports href, src and action links.
-- All behaviour options, such as depth and the white- and black-lists, are
-- processed in here as well.
parse = function(self)
|
|
local links = {}
|
|
local patterns = {
|
|
'<[^>]+[hH][rR][eE][fF]%s*=%s*[\'"]%s*([^"^\']-)%s*[\'"]',
|
|
'<[^>]+[hH][rR][eE][fF]%s*=%s*([^\'\"][^%s>]+)',
|
|
'<[^>]+[sS][rR][cC]%s*=%s*[\'"]%s*([^"^\']-)%s*[\'"]',
|
|
'<[^>]+[sS][rR][cC]%s*=%s*([^\'\"][^%s>]+)',
|
|
'<[^>]+[aA][cC][tT][iI][oO][nN]%s*=%s*[\'"]%s*([^"^\']+%s*)[\'"]',
|
|
}
|
|
|
|
local base_hrefs = {
|
|
'<[^>]+[Bb][Aa][Ss][Ee]%s*[Hh][Rr][Ee][Ff]%s*=%s*[\'"](%s*[^"^\']+%s*)[\'"]',
|
|
'<[^>]+[Bb][Aa][Ss][Ee]%s*[Hh][Rr][Ee][Ff]%s*=%s*([^\'\"][^%s>]+)'
|
|
}
|
|
|
|
local base_href
|
|
for _, pattern in ipairs(base_hrefs) do
|
|
base_href = self.html:match(pattern)
|
|
if ( base_href ) then
|
|
break
|
|
end
|
|
end
|
|
|
|
for _, pattern in ipairs(patterns) do
|
|
for l in self.html:gmatch(pattern) do
|
|
local link = l
|
|
if ( not(LinkExtractor.isAbsolute(l)) ) then
|
|
link = LinkExtractor.createAbsolute(self.url, l, base_href)
|
|
end
|
|
|
|
local url = URL:new(link)
|
|
|
|
local valid = self:validate_link(url)
|
|
|
|
if ( valid ) then
|
|
stdnse.debug3("%s: Adding link: %s", LIBRARY_NAME, tostring(url))
|
|
links[tostring(url)] = true
|
|
elseif ( tostring(url) ) then
|
|
stdnse.debug3("%s: Skipping url: %s", LIBRARY_NAME, link)
|
|
end
|
|
end
|
|
end
|
|
|
|
for link in pairs(links) do
|
|
table.insert(self.links, link)
|
|
end
|
|
|
|
end,
|
|
|
|
-- Gets a table containing all of the retrieved URLs, after filtering
|
|
-- has been applied.
|
|
getLinks = function(self) return self.links end,
|
|
|
|
|
|
}
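
-- A minimal usage sketch (illustrative): <code>page_url</code> is assumed to be
-- a URL instance for the page that was just fetched, <code>body</code> its
-- response body and <code>options</code> an Options instance. This is what the
-- Crawler itself does for every page it scrapes.
-- <code>
-- local extractor = LinkExtractor:new(page_url, body, options)
-- for _, link in ipairs(extractor:getLinks()) do
--   stdnse.debug1("found link: %s", link)
-- end
-- </code>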
|
|
|
|
-- The URL class, containing code to parse and process URLs.
-- This class is heavily inspired by the Java URL class.
URL = {
|
|
|
|
-- Creates a new instance of URL
|
|
-- @param url string containing the text representation of a URL
|
|
-- @return o instance of URL, in case of parsing being successful
|
|
-- nil in case parsing fails
|
|
new = function(self, url)
|
|
local o = {
|
|
raw = url,
|
|
}
|
|
|
|
setmetatable(o, self)
|
|
self.__index = self
|
|
if ( o:parse() ) then
|
|
return o
|
|
end
|
|
end,
|
|
|
|
-- Parses the string representation of the URL and splits it into different
|
|
-- URL components
|
|
-- @return status true on success, false on failure
|
|
parse = function(self)
|
|
local parsed = url.parse(self.raw)
|
|
if parsed.scheme and parsed.scheme:match("^https?$") then
|
|
self.proto = parsed.scheme
|
|
self.host = parsed.ascii_host or parsed.host
|
|
self.port = tonumber(parsed.port) or url.get_default_port(self.proto)
|
|
-- "file" is the path, params, and query, but not the fragment
|
|
local fileparts = {parsed.path}
|
|
if parsed.params then
|
|
fileparts[#fileparts+1] = ";"
|
|
fileparts[#fileparts+1] = parsed.params
|
|
end
|
|
if parsed.query then
|
|
fileparts[#fileparts+1] = "?"
|
|
fileparts[#fileparts+1] = parsed.query
|
|
end
|
|
self.file = table.concat(fileparts)
|
|
self.path = parsed.path
|
|
-- Normalize the values; removes dot and dot-dot path segments
|
|
self.file = url.absolute("", self.file)
|
|
self.path = url.absolute("", self.path)
|
|
self.dir = self.path:match("^(.+%/)") or "/"
|
|
-- TODO: Use public suffix list to extract domain
|
|
self.domain= self.host:match("^[^%.]-%.(.*)")
|
|
return true
|
|
end
|
|
return false
|
|
end,
|
|
|
|
-- Gets the host portion of the URL
|
|
-- @return host string containing the hostname
|
|
getHost = function(self) return self.host end,
|
|
|
|
-- Gets the protocol representation of the URL
|
|
-- @return proto string containing the protocol (ie. http, https)
|
|
getProto = function(self) return self.proto end,
|
|
|
|
-- Returns the filename component of the URL.
|
|
-- @return file string containing the path and query components of the url
|
|
getFile = function(self) return self.file end,
|
|
|
|
-- Gets the port component of the URL
|
|
-- @return port number containing the port of the URL
|
|
getPort = function(self) return self.port end,
|
|
|
|
-- Gets the path component of the URL
|
|
-- @return the full path and filename of the URL
|
|
getPath = function(self) return self.path end,
|
|
|
|
-- Gets the directory component of the URL
|
|
-- @return directory string containing the directory part of the URL
|
|
getDir = function(self) return self.dir end,
|
|
|
|
-- Gets the domain component of the URL
|
|
-- @return domain string containing the hosts domain
|
|
getDomain = function(self)
|
|
if ( self.domain ) then
|
|
return self.domain
|
|
-- fallback to the host, if we can't find a domain
|
|
else
|
|
return self.host
|
|
end
|
|
end,
|
|
|
|
-- Converts the URL to a string
|
|
-- @return url string containing the string representation of the url
|
|
__tostring = function(self)
|
|
return string.format("%s://%s:%s%s",
|
|
self.proto,
|
|
self.host,
|
|
self.port,
|
|
self.file
|
|
)
|
|
end,
|
|
}
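
-- A short illustrative example of what the parser produces:
-- <code>
-- local u = URL:new("http://example.com:8080/dir/page.php?x=1")
-- -- u:getProto() --> "http"            u:getHost() --> "example.com"
-- -- u:getPort()  --> 8080              u:getDir()  --> "/dir/"
-- -- u:getFile()  --> "/dir/page.php?x=1"
-- -- tostring(u)  --> "http://example.com:8080/dir/page.php?x=1"
-- </code>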
|
|
|
|
-- The UrlQueue class, a simple queue of URLs waiting to be retrieved
UrlQueue = {
|
|
|
|
-- creates a new instance of UrlQueue
|
|
-- @param options table containing options
|
|
-- @return o new instance of UrlQueue
|
|
new = function(self, options)
|
|
local o = {
|
|
urls = {},
|
|
options = options
|
|
}
|
|
setmetatable(o, self)
|
|
self.__index = self
|
|
return o
|
|
end,
|
|
|
|
-- gets the next available url in the queue
|
|
getNext = function(self)
|
|
return table.remove(self.urls,1)
|
|
end,
|
|
|
|
-- adds a new url to the queue
|
|
-- @param url can be either a string or a URL or a table of URLs
|
|
add = function(self, url)
  assert( type(url) == 'string' or type(url) == 'table', "url was neither a string nor a table")

  -- normalize the argument into an array of URLs: a bare string and a
  -- single URL instance are both wrapped in a table
  local urls = url
  if ( 'string' == type(url) ) then
    urls = { URL:new(url) }
  elseif ( url.raw ) then
    urls = { url }
  end
|
|
|
|
for _, u in ipairs(urls) do
|
|
u = ( 'string' == type(u) ) and URL:new(u) or u
|
|
if ( u ) then
|
|
table.insert(self.urls, u)
|
|
else
|
|
stdnse.debug1("ERROR: Invalid URL: %s", url)
|
|
end
|
|
end
|
|
end,
|
|
|
|
-- dumps the contents of the UrlQueue
|
|
dump = function(self)
|
|
for _, url in ipairs(self.urls) do
|
|
print("url:", url)
|
|
end
|
|
end,
|
|
|
|
}
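
-- A small usage sketch (illustrative), assuming <code>options</code> is an
-- Options instance:
-- <code>
-- local queue = UrlQueue:new(options)
-- queue:add("http://example.com/")   -- strings are parsed into URL instances
-- queue:add({ URL:new("http://example.com/a"), URL:new("http://example.com/b") })
-- local next_url = queue:getNext()   -- returns a URL instance, or nil when empty
-- </code>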
|
|
|
|
-- The Crawler class
|
|
Crawler = {
|
|
|
|
options = {},
|
|
|
|
removewww = function(url) return string.gsub(url, "^www%.", "") end,
|
|
|
|
-- A utility for defining closures. Checks whether the resource exists within the host.
|
|
-- @param u URL that points to the resource we want to check.
|
|
iswithinhost = function(self, u)
|
|
local parsed_u = url.parse(tostring(u))
|
|
local host = parsed_u.ascii_host or parsed_u.host
|
|
if ( self.options.base_url:getPort() ~= 80 and self.options.base_url:getPort() ~= 443 ) then
|
|
if ( parsed_u.port ~= tonumber(self.options.base_url:getPort()) ) then
|
|
return false
|
|
end
|
|
elseif ( parsed_u.scheme ~= self.options.base_url:getProto() ) then
|
|
return false
|
|
-- if the urls differ only by the "www" prefix, they are probably the same host
|
|
elseif ( host == nil or self.removewww(host:lower()) ~= self.removewww(self.options.base_url:getHost():lower()) ) then
|
|
return false
|
|
end
|
|
return true
|
|
end,
|
|
|
|
-- A utility for defining closures. Checks whether the resource exists within the domain.
|
|
-- @param u URL that points to the resource we want to check.
|
|
iswithindomain = function(self, u)
|
|
local parsed_u = url.parse(tostring(u))
|
|
local host = parsed_u.ascii_host or parsed_u.host
|
|
if ( self.options.base_url:getPort() ~= 80 and self.options.base_url:getPort() ~= 443 ) then
|
|
if ( parsed_u.port ~= tonumber(self.options.base_url:getPort()) ) then
|
|
return false
|
|
end
|
|
elseif ( parsed_u.scheme ~= self.options.base_url:getProto() ) then
|
|
return false
|
|
elseif ( host == nil or host:sub(-#self.options.base_url:getDomain()):lower() ~= self.options.base_url:getDomain():lower() ) then
|
|
return false
|
|
end
|
|
return true
|
|
end,
|
|
|
|
-- A utility for defining closures. Checks the type (extension) of the resource.
|
|
-- @param u URL that points to the resource we want to check.
|
|
-- @param ext the extension of the resource.
|
|
-- @param signs table of signs that may exist after the extension of the resource.
|
|
isresource = function(self, u, ext, signs)
  u = tostring(u)

  -- match a literal dot followed by the extension at the end of the url
  if string.match(u, "%." .. ext .. "$") then
    return true
  end

  -- otherwise allow the extension to be followed by one of the "signs"
  -- (by default '#' or '?') and any trailing characters that do not
  -- contain another dot
  local signstring = ""
  if signs then
    for _, s in ipairs(signs) do
      signstring = signstring .. s
    end
    -- escape any '?' so it is safe inside the pattern character class
    signstring = signstring:gsub("%?", "%%?")
  else
    signstring = "#%?"
  end

  return string.match(u, "%." .. ext .. "[" .. signstring .. "]" .. "[^.]*$")
end,
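
-- A few illustrative calls, assuming <code>crawler</code> is a Crawler instance:
-- <code>
-- crawler:isresource("http://example.com/app.js", "js")          --> matches
-- crawler:isresource("http://example.com/index.php?x=1", "php")  --> matches
-- crawler:isresource("http://example.com/index.html", "php")     --> no match
-- </code>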
|
|
|
|
-- creates a new instance of the Crawler instance
|
|
-- @param host table as received by the action method
|
|
-- @param port table as received by the action method
|
|
-- @param url string containing the relative URL
|
|
-- @param options table of options:
|
|
-- <code>noblacklist</code> - do not load default blacklist
|
|
-- <code>base_url</code> - start url to crawl
|
|
-- <code>timeout</code> - timeout for the http request
|
|
-- <code>maxdepth</code> - the maximum directory depth to crawl
|
|
-- <code>maxpagecount</code> - the maximum amount of pages to retrieve
|
|
-- <code>withinhost</code> - stay within the host of the base_url
|
|
-- <code>withindomain</code> - stay within the base_url domain
|
|
-- <code>doscraping</code> - Permit scraping
|
|
-- <code>scriptname</code> - should be set to SCRIPT_NAME to enable
|
|
-- script specific arguments.
|
|
-- <code>redirect_ok</code> - redirect_ok closure to pass to http.get function
|
|
-- <code>no_cache</code> - no_cache option to pass to http.get function
|
|
-- @return o new instance of Crawler or nil on failure
|
|
new = function(self, host, port, url, options)
|
|
local o = {
|
|
host = host,
|
|
port = port,
|
|
url = url,
|
|
options = options or {},
|
|
basethread = stdnse.base(),
|
|
}
|
|
|
|
setmetatable(o, self)
|
|
self.__index = self
|
|
|
|
self.options = o
|
|
|
|
o:loadScriptArguments()
|
|
o:loadLibraryArguments()
|
|
o:loadDefaultArguments()
|
|
|
|
local response = http.get(o.host, o.port, '/', { timeout = o.options.timeout, redirect_ok = o.options.redirect_ok, no_cache = o.options.no_cache } )
|
|
|
|
if ( not(response) or 'table' ~= type(response) ) then
|
|
return
|
|
end
|
|
|
|
o.url = o.url:match("/?(.*)")
|
|
|
|
local u_host = o.host.targetname or o.host.name
|
|
if ( not(u_host) or 0 == #u_host ) then
|
|
u_host = o.host.ip
|
|
end
|
|
local u = ("%s://%s:%d/%s"):format(response.ssl and "https" or "http", u_host, o.port.number, o.url)
|
|
o.options.base_url = URL:new(u)
|
|
o.options = Options:new(o.options)
|
|
o.urlqueue = UrlQueue:new(o.options)
|
|
o.urlqueue:add(o.options.base_url)
|
|
|
|
o.options.timeout = o.options.timeout or 10000
|
|
o.processed = {}
|
|
|
|
-- script arguments have precedence
|
|
if ( not(o.options.maxdepth) ) then
|
|
o.options.maxdepth = tonumber(stdnse.get_script_args("httpspider.maxdepth"))
|
|
end
|
|
|
|
-- script arguments have precedence
|
|
if ( not(o.options.maxpagecount) ) then
|
|
o.options.maxpagecount = tonumber(stdnse.get_script_args("httpspider.maxpagecount"))
|
|
end
|
|
|
|
if ( not(o.options.noblacklist) ) then
|
|
o:addDefaultBlacklist()
|
|
end
|
|
|
|
if ( o.options.useheadfornonwebfiles ) then
|
|
-- Load web files extensions from a file in nselib/data folder.
|
|
-- For more information on individual file formats, see
|
|
-- http://en.wikipedia.org/wiki/List_of_file_formats.
|
|
o.web_files_extensions = {}
|
|
local f = nmap.fetchfile("nselib/data/http-web-files-extensions.lst")
|
|
if f then
|
|
for l in io.lines(f) do
|
|
table.insert(o.web_files_extensions, l)
|
|
end
|
|
end
|
|
end
|
|
|
|
stdnse.debug2("%s: %s", LIBRARY_NAME, o:getLimitations())
|
|
|
|
return o
|
|
end,
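
-- A construction sketch (illustrative); <code>host</code>, <code>port</code>
-- and SCRIPT_NAME come from the calling script, and the option values are
-- arbitrary examples:
-- <code>
-- local crawler = httpspider.Crawler:new(host, port, '/', {
--   scriptname   = SCRIPT_NAME,
--   maxdepth     = 2,
--   maxpagecount = 40,
--   no_cache     = true,
-- })
-- if ( not(crawler) ) then
--   return stdnse.format_output(false, "Could not create crawler")
-- end
-- </code>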
|
|
|
|
-- Sets the timeout used by the http library
|
|
-- @param timeout number containing the timeout in ms.
|
|
set_timeout = function(self, timeout)
|
|
self.options.timeout = timeout
|
|
end,
|
|
|
|
-- Gets the number of pages that have been retrieved
|
|
-- @return count number of pages retrieved by the instance
|
|
getPageCount = function(self)
|
|
local count = 1
|
|
for url in pairs(self.processed) do
|
|
count = count + 1
|
|
end
|
|
return count
|
|
end,
|
|
|
|
-- Adds a default blacklist blocking binary files such as images,
|
|
-- compressed archives and executable files
|
|
addDefaultBlacklist = function(self)
|
|
-- References:
|
|
--[[
|
|
Image file formats: https://en.wikipedia.org/wiki/Image_file_formats
|
|
Video file formats: https://en.wikipedia.org/wiki/Video_file_format
|
|
Audio file formats: https://en.wikipedia.org/wiki/Audio_file_format
|
|
Doc file extension: https://en.wikipedia.org/wiki/List_of_Microsoft_Office_filename_extensions
|
|
Archive formats: https://en.wikipedia.org/wiki/List_of_archive_formats ,
|
|
https://en.wikipedia.org/wiki/Category:Archive_formats
|
|
]]
|
|
local extensions = {
|
|
image_extensions = {"png", "jpg", "jpeg", "gif", "bmp", "jfif", "exif",
|
|
"tiff", "bmp", "ppm", "pgm", "pbm", "pnm", "webp", "heif", "bpg",
|
|
"cgm", "svg"},
|
|
video_extensions = {"avi", "flv", "ogg", "mp4", "m4p", "m4v", "wmv",
|
|
"vob", "ogv", "mng", "mov", "rmvb", "asf", "nsv", "f4v", "f4p",
|
|
"amv", "webm", "mkv", "mpg", "mp2", "mpeg", "mpv", "svi", "3gp",
|
|
"3g2", "mxf", "roq"},
|
|
audio_extensions = {"aac", "m4a", "mp3", "wav", "aa", "aax", "act", "aiff",
|
|
"amr", "ape", "au", "awb", "dct", "dss", "dvf", "flac", "gsm", "iklax",
|
|
"ivs", "m4a", "m4b", "m4p", "mmf", "mpc", "msc", "ogg", "oga", "mogg",
|
|
"oups", "ra", "raw", "sln", "tta", "vox", "wma", "wv", "webm"},
|
|
doc_extensions = {"pdf", "doc", "docx", "docm", "xla", "xls", "xlsx",
|
|
"xlsm", "ppt", "pptx", "pptm", "odf", "ods", "odp", "ps", "xps", "dot",
|
|
"wbk", "dotx", "dotm", "docb", "xlt", "xlm", "xltx", "xltm", "xlsb",
|
|
"xlam", "xll", "xlw", "pot", "pps", "potx", "potm", "ppam", "ppsx", "ppsm", "pub"},
|
|
archive_extensions = {"zip", "tar.gz", "gz", "rar", "7z", "sit", "sitx",
|
|
"tgz", "tar.bz", "tar", "iso", "a", "ar", "cpio", "shar", "lbr", "iso",
|
|
"mar", "sbx", "bz2", "lz", "lzma", "lzo", "rz", "sz", "s7z", "ace", "afa",
|
|
"alz", "apk", "tar.bz2", "tar.Z", "tar.lzma", "tlz", "tbz2", "xp3", "zz",
|
|
"bzip", "lzip", "lzop", "rzip"},
|
|
exe_extensions = {"exe", "com", "msi", "bin","dmg"}
|
|
}
|
|
local blacklist = {}
|
|
for _, cat in pairs(extensions) do
|
|
for _, ext in ipairs(cat) do
|
|
table.insert(blacklist, string.format(".%s$", ext))
|
|
end
|
|
end
|
|
|
|
self.options:addBlacklist( function(url)
|
|
local p = url:getPath():lower()
|
|
for _, pat in ipairs(blacklist) do
|
|
if ( p:match(pat) ) then
|
|
return true
|
|
end
|
|
end
|
|
end )
|
|
end,
|
|
|
|
-- does the heavy crawling
|
|
--
|
|
-- The crawler may exit for a number of different reasons, including
-- invalid options, reaching the maximum page count or simply running out
-- of links. We return a false status for all of these, and in case the
-- error was unexpected or requires attention we set the error property
-- accordingly. This way the script can alert the user of the details by
-- calling getError()
|
|
crawl_thread = function(self, response_queue)
|
|
local condvar = nmap.condvar(response_queue)
|
|
|
|
if ( false ~= self.options.withinhost and false ~= self.options.withindomain ) then
|
|
table.insert(response_queue, { false, { err = true, reason = "Invalid options: withinhost and withindomain can't both be true" } })
|
|
condvar "signal"
|
|
return
|
|
end
|
|
|
|
while(true) do
|
|
|
|
if ( self.quit or coroutine.status(self.basethread) == 'dead' ) then
|
|
table.insert(response_queue, {false, { err = false, msg = "Quit signalled by crawler" } })
|
|
break
|
|
end
|
|
|
|
-- in case the user set a max page count to retrieve check how many
|
|
-- pages we have retrieved so far
|
|
local count = self:getPageCount()
|
|
if ( self.options.maxpagecount and
|
|
( self.options.maxpagecount > 0 ) and
|
|
( count > self.options.maxpagecount ) ) then
|
|
table.insert(response_queue, { false, { err = false, msg = "Reached max page count" } })
|
|
condvar "signal"
|
|
return
|
|
end
|
|
|
|
-- pull links from the queue until we get a valid one
|
|
local url
|
|
repeat
|
|
url = self.urlqueue:getNext()
|
|
until( not(url) or not(self.processed[tostring(url)]) )
|
|
|
|
-- if no url could be retrieved from the queue, abort ...
|
|
if ( not(url) ) then
|
|
table.insert(response_queue, { false, { err = false, msg = "No more urls" } })
|
|
condvar "signal"
|
|
return
|
|
end
|
|
|
|
if ( self.options.maxpagecount ) then
|
|
stdnse.debug2("%s: Fetching url [%d of %d]: %s", LIBRARY_NAME, count, self.options.maxpagecount, tostring(url))
|
|
else
|
|
stdnse.debug2("%s: Fetching url: %s", LIBRARY_NAME, tostring(url))
|
|
end
|
|
|
|
local scrape = true
|
|
|
|
|
|
if not (self.options.doscraping(url)) then
|
|
stdnse.debug2("%s: Scraping is not allowed for url: %s", LIBRARY_NAME, tostring(url))
|
|
scrape = false
|
|
end
|
|
|
|
local response
|
|
-- in case we want to use HEAD rather than GET for files with certain extensions
|
|
if ( self.options.useheadfornonwebfiles ) then
|
|
local is_web_file = false
|
|
local file = url:getPath():lower()
|
|
-- check if we are at a URL with 'no extension', for example: nmap.org/6
|
|
if string.match(file,".*(/[^/%.]*)$") or string.match(file, "/$") then is_web_file = true end
|
|
if not is_web_file then
|
|
for _,v in pairs(self.web_files_extensions) do
|
|
if string.match(file, "%."..v.."$") then
|
|
is_web_file = true
|
|
break
|
|
end
|
|
end
|
|
end
|
|
if is_web_file then
|
|
stdnse.debug2("%s: Using GET: %s", LIBRARY_NAME, file)
|
|
response = http.get(url:getHost(), url:getPort(), url:getFile(), { timeout = self.options.timeout, redirect_ok = self.options.redirect_ok, no_cache = self.options.no_cache } )
|
|
else
|
|
stdnse.debug2("%s: Using HEAD: %s", LIBRARY_NAME, file)
|
|
response = http.head(url:getHost(), url:getPort(), url:getFile())
|
|
end
|
|
else
|
|
-- fetch the url, and then push it to the processed table
|
|
response = http.get(url:getHost(), url:getPort(), url:getFile(), { timeout = self.options.timeout, redirect_ok = self.options.redirect_ok, no_cache = self.options.no_cache } )
|
|
end
|
|
|
|
self.processed[tostring(url)] = true
|
|
|
|
if ( response ) then
|
|
-- were we redirected?
|
|
if ( response.location ) then
|
|
-- was the link absolute?
|
|
local link = response.location[#response.location]
|
|
if ( link:match("^http") ) then
|
|
url = URL:new(link)
|
|
-- guess not
|
|
else
|
|
url.path = link
|
|
end
|
|
end
|
|
-- if we have a response, proceed scraping it
|
|
if ( response.body ) and scrape then
|
|
local links = LinkExtractor:new(url, response.body, self.options):getLinks()
|
|
self.urlqueue:add(links)
|
|
end
|
|
else
|
|
response = { body = "", headers = {} }
|
|
end
|
|
table.insert(response_queue, { true, { url = url, response = response } } )
|
|
while ( PREFETCH_SIZE < #response_queue ) do
|
|
stdnse.debug2("%s: Response queue full, waiting ...", LIBRARY_NAME)
|
|
condvar "wait"
|
|
end
|
|
condvar "signal"
|
|
end
|
|
condvar "signal"
|
|
end,
|
|
|
|
-- Loads the arguments set at the script level
|
|
loadScriptArguments = function(self)
|
|
local sn = self.options.scriptname
|
|
if ( not(sn) ) then
|
|
stdnse.debug1("%s: WARNING: Script argument could not be loaded as scriptname was not set", LIBRARY_NAME)
|
|
return
|
|
end
|
|
|
|
if ( nil == self.options.maxdepth ) then
|
|
self.options.maxdepth = tonumber(stdnse.get_script_args(sn .. ".maxdepth"))
|
|
end
|
|
if ( nil == self.options.maxpagecount ) then
|
|
self.options.maxpagecount = tonumber(stdnse.get_script_args(sn .. ".maxpagecount"))
|
|
end
|
|
if ( nil == self.url ) then
|
|
self.url = stdnse.get_script_args(sn .. ".url")
|
|
end
|
|
if ( nil == self.options.withinhost ) then
|
|
self.options.withinhost = stdnse.get_script_args(sn .. ".withinhost")
|
|
end
|
|
if ( nil == self.options.withindomain ) then
|
|
self.options.withindomain = stdnse.get_script_args(sn .. ".withindomain")
|
|
end
|
|
if ( nil == self.options.noblacklist ) then
|
|
self.options.noblacklist = stdnse.get_script_args(sn .. ".noblacklist")
|
|
end
|
|
if ( nil == self.options.useheadfornonwebfiles ) then
|
|
self.options.useheadfornonwebfiles = stdnse.get_script_args(sn .. ".useheadfornonwebfiles")
|
|
end
|
|
if ( nil == self.options.doscraping ) then
|
|
self.options.doscraping = stdnse.get_script_args(sn .. ".doscraping")
|
|
end
|
|
|
|
end,
|
|
|
|
-- Loads the arguments set at the library level
|
|
loadLibraryArguments = function(self)
|
|
local ln = LIBRARY_NAME
|
|
|
|
if ( nil == self.options.maxdepth ) then
|
|
self.options.maxdepth = tonumber(stdnse.get_script_args(ln .. ".maxdepth"))
|
|
end
|
|
if ( nil == self.options.maxpagecount ) then
|
|
self.options.maxpagecount = tonumber(stdnse.get_script_args(ln .. ".maxpagecount"))
|
|
end
|
|
if ( nil == self.url ) then
|
|
self.url = stdnse.get_script_args(ln .. ".url")
|
|
end
|
|
if ( nil == self.options.withinhost ) then
|
|
self.options.withinhost = stdnse.get_script_args(ln .. ".withinhost")
|
|
end
|
|
if ( nil == self.options.withindomain ) then
|
|
self.options.withindomain = stdnse.get_script_args(ln .. ".withindomain")
|
|
end
|
|
if ( nil == self.options.noblacklist ) then
|
|
self.options.noblacklist = stdnse.get_script_args(ln .. ".noblacklist")
|
|
end
|
|
if ( nil == self.options.useheadfornonwebfiles ) then
|
|
self.options.useheadfornonwebfiles = stdnse.get_script_args(ln .. ".useheadfornonwebfiles")
|
|
end
|
|
if ( nil == self.options.doscraping ) then
|
|
self.options.doscraping = stdnse.get_script_args(ln .. ".doscraping")
|
|
end
|
|
end,
|
|
|
|
-- Loads any defaults for arguments that were not set
|
|
loadDefaultArguments = function(self)
|
|
local function tobool(b)
|
|
if ( nil == b ) then
|
|
return
|
|
end
|
|
assert("string" == type(b) or "boolean" == type(b) or "number" == type(b), "httpspider: tobool failed, unsupported type")
|
|
if ( "string" == type(b) ) then
|
|
if ( "true" == b ) then
|
|
return true
|
|
else
|
|
return false
|
|
end
|
|
elseif ( "number" == type(b) ) then
|
|
if ( 1 == b ) then
|
|
return true
|
|
else
|
|
return false
|
|
end
|
|
end
|
|
return b
|
|
end
|
|
|
|
if self.options.withinhost == 0 then
|
|
self.options.withinhost = false
|
|
end
|
|
|
|
if self.options.withindomain == 0 then
|
|
self.options.withindomain = false
|
|
end
|
|
|
|
-- fixup some booleans to make sure they're actually booleans
|
|
self.options.noblacklist = tobool(self.options.noblacklist)
|
|
self.options.useheadfornonwebfiles = tobool(self.options.useheadfornonwebfiles)
|
|
|
|
if ( self.options.withinhost == nil ) then
|
|
if ( self.options.withindomain ~= true ) then
|
|
self.options.withinhost = true
|
|
else
|
|
self.options.withinhost = false
|
|
end
|
|
end
|
|
if ( self.options.withindomain == nil ) then
|
|
self.options.withindomain = false
|
|
end
|
|
if ( not ( type(self.options.doscraping) == "function" ) ) then
|
|
self.options.doscraping = false
|
|
end
|
|
self.options.maxdepth = tonumber(self.options.maxdepth) or 3
|
|
self.options.maxpagecount = tonumber(self.options.maxpagecount) or 20
|
|
self.url = self.url or '/'
|
|
end,
|
|
|
|
-- gets a string of limitations imposed on the crawl
|
|
getLimitations = function(self)
|
|
local o = self.options
|
|
local limits = {}
|
|
if ( o.maxdepth > 0 or o.maxpagecount > 0 or
|
|
o.withinhost or o.withindomain ) then
|
|
if ( o.maxdepth > 0 ) then
|
|
table.insert(limits, ("maxdepth=%d"):format(o.maxdepth))
|
|
end
|
|
if ( o.maxpagecount > 0 ) then
|
|
table.insert(limits, ("maxpagecount=%d"):format(o.maxpagecount))
|
|
end
|
|
if ( o.withindomain ) then
|
|
table.insert(limits, ("withindomain=%s"):format(o.base_url:getDomain() or o.base_url:getHost()))
|
|
end
|
|
if ( o.withinhost ) then
|
|
table.insert(limits, ("withinhost=%s"):format(o.base_url:getHost()))
|
|
end
|
|
end
|
|
|
|
if ( #limits > 0 ) then
|
|
return ("Spidering limited to: %s"):format(stdnse.strjoin("; ", limits))
|
|
end
|
|
end,
|
|
|
|
-- does the crawling
|
|
crawl = function(self)
|
|
self.response_queue = self.response_queue or {}
|
|
local condvar = nmap.condvar(self.response_queue)
|
|
if ( not(self.thread) ) then
|
|
self.thread = stdnse.new_thread(self.crawl_thread, self, self.response_queue)
|
|
end
|
|
|
|
if ( #self.response_queue == 0 and coroutine.status(self.thread) ~= 'dead') then
|
|
condvar "wait"
|
|
end
|
|
condvar "signal"
|
|
if ( #self.response_queue == 0 ) then
|
|
return false, { err = false, msg = "No more urls" }
|
|
else
|
|
return table.unpack(table.remove(self.response_queue, 1))
|
|
end
|
|
end,
|
|
|
|
-- signals the crawler to stop
|
|
stop = function(self)
|
|
local condvar = nmap.condvar(self.response_queue)
|
|
self.quit = true
|
|
condvar "signal"
|
|
if ( coroutine.status(self.thread) == "dead" ) then
|
|
return
|
|
end
|
|
condvar "wait"
|
|
end
|
|
}
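
-- A minimal end-to-end sketch (illustrative) of how a script action might drive
-- the crawler; <code>host</code> and <code>port</code> come from the calling
-- script and the output format is just an example:
-- <code>
-- action = function(host, port)
--   local crawler = httpspider.Crawler:new(host, port, '/', { scriptname = SCRIPT_NAME })
--   crawler:set_timeout(10000)
--   local result = {}
--   while ( true ) do
--     local status, r = crawler:crawl()
--     if ( not(status) ) then
--       break
--     end
--     table.insert(result, tostring(r.url))
--   end
--   result.name = crawler:getLimitations()
--   return stdnse.format_output(true, result)
-- end
-- </code>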
|
|
|
|
return _ENV;
|