diff --git a/nselib/httpspider.lua b/nselib/httpspider.lua
index 440afac65..dde84ccba 100644
--- a/nselib/httpspider.lua
+++ b/nselib/httpspider.lua
@@ -74,490 +74,490 @@ local PREFETCH_SIZE = 5
-- The Options class, handling all spidering options
Options = {
-
- new = function(self, options)
- local o = { }
-
- -- copy all options as class members
- for k, v in pairs(options) do o[k] = v end
+
+ new = function(self, options)
+ local o = { }
+
+ -- copy all options as class members
+ for k, v in pairs(options) do o[k] = v end
- -- set a few default values
- o.timeout = options.timeout or 10000
- o.whitelist = o.whitelist or {}
- o.blacklist = o.blacklist or {}
+ -- set a few default values
+ o.timeout = options.timeout or 10000
+ o.whitelist = o.whitelist or {}
+ o.blacklist = o.blacklist or {}
local removewww = function(url) return string.gsub(url, "^www%.", "") end
-
- if ( o.withinhost == true or o.withindomain == true ) then
- -- set up the appropriate matching functions
- if ( o.withinhost ) then
- o.withinhost = function(u)
- local parsed_u = url.parse(tostring(u))
-
- if ( o.base_url:getPort() ~= 80 and o.base_url:getPort() ~= 443 ) then
- if ( tonumber(parsed_u.port) ~= tonumber(o.base_url:getPort()) ) then
- return false
- end
- elseif ( parsed_u.scheme ~= o.base_url:getProto() ) then
- return false
- -- if urls don't match only on the "www" prefix, then they are probably the same
- elseif ( parsed_u.host == nil or removewww(parsed_u.host:lower()) ~= removewww(o.base_url:getHost():lower()) ) then
- return false
- end
- return true
- end
- else
- o.withindomain = function(u)
- local parsed_u = url.parse(tostring(u))
- if ( o.base_url:getPort() ~= 80 and o.base_url:getPort() ~= 443 ) then
- if ( tonumber(parsed_u.port) ~= tonumber(o.base_url:getPort()) ) then
- return false
- end
- elseif ( parsed_u.scheme ~= o.base_url:getProto() ) then
- return false
- elseif ( parsed_u.host == nil or parsed_u.host:sub(-#o.base_url:getDomain()):lower() ~= o.base_url:getDomain():lower() ) then
- return false
- end
- return true
- end
- end
- end
- setmetatable(o, self)
- self.__index = self
- return o
- end,
-
- addWhitelist = function(self, func) table.insert(self.whitelist, func) end,
- addBlacklist = function(self, func) table.insert(self.blacklist, func) end,
+
+ if ( o.withinhost == true or o.withindomain == true ) then
+ -- set up the appropriate matching functions
+ if ( o.withinhost ) then
+ o.withinhost = function(u)
+ local parsed_u = url.parse(tostring(u))
+
+ if ( o.base_url:getPort() ~= 80 and o.base_url:getPort() ~= 443 ) then
+ if ( tonumber(parsed_u.port) ~= tonumber(o.base_url:getPort()) ) then
+ return false
+ end
+ elseif ( parsed_u.scheme ~= o.base_url:getProto() ) then
+ return false
+          -- if the hosts differ only by a leading "www." prefix, treat them as the same host
+ elseif ( parsed_u.host == nil or removewww(parsed_u.host:lower()) ~= removewww(o.base_url:getHost():lower()) ) then
+ return false
+ end
+ return true
+ end
+ else
+ o.withindomain = function(u)
+ local parsed_u = url.parse(tostring(u))
+ if ( o.base_url:getPort() ~= 80 and o.base_url:getPort() ~= 443 ) then
+ if ( tonumber(parsed_u.port) ~= tonumber(o.base_url:getPort()) ) then
+ return false
+ end
+ elseif ( parsed_u.scheme ~= o.base_url:getProto() ) then
+ return false
+ elseif ( parsed_u.host == nil or parsed_u.host:sub(-#o.base_url:getDomain()):lower() ~= o.base_url:getDomain():lower() ) then
+ return false
+ end
+ return true
+ end
+ end
+ end
+ setmetatable(o, self)
+ self.__index = self
+ return o
+ end,
+
+ addWhitelist = function(self, func) table.insert(self.whitelist, func) end,
+ addBlacklist = function(self, func) table.insert(self.blacklist, func) end,
}
-- Placeholder for form extraction code
FormExtractor = {
-
+
}
LinkExtractor = {
-
- -- Creates a new instance of LinkExtractor
- -- @return o instance of LinkExtractor
- new = function(self, url, html, options)
- local o = {
- url = url,
- html = html,
- links = {},
- options = options,
- }
- setmetatable(o, self)
- self.__index = self
- o:parse()
- return o
- end,
-
- -- is the link absolute or not?
- isAbsolute = function(url)
- -- at this point we don't care about the protocol
- -- also, we don't add // to cover stuff like:
- -- feed:http://example.com/rss.xml
- return ( url:match('^%w*:') ~= nil )
- end,
-
- -- Creates an absolute link from a relative one based on the base_url
- -- The functionality is very simple and does not take any ../../ in
- -- consideration.
- --
- -- @param base_url URL containing the page url from which the links were
- -- extracted
- -- @param rel_url string containing the relative portion of the URL
- -- @return link string containing the absolute link
- createAbsolute = function(base_url, rel_url, base_href)
+
+ -- Creates a new instance of LinkExtractor
+ -- @return o instance of LinkExtractor
+ new = function(self, url, html, options)
+ local o = {
+ url = url,
+ html = html,
+ links = {},
+ options = options,
+ }
+ setmetatable(o, self)
+ self.__index = self
+ o:parse()
+ return o
+ end,
+
+ -- is the link absolute or not?
+ isAbsolute = function(url)
+      -- at this point we don't care which protocol it is;
+      -- we also don't require '//', so that links such as
+      -- feed:http://example.com/rss.xml are treated as absolute
+ return ( url:match('^%w*:') ~= nil )
+ end,
+
+ -- Creates an absolute link from a relative one based on the base_url
+    -- The functionality is very simple and does not take any ../../ into
+    -- consideration.
+ --
+ -- @param base_url URL containing the page url from which the links were
+ -- extracted
+ -- @param rel_url string containing the relative portion of the URL
+ -- @return link string containing the absolute link
+ createAbsolute = function(base_url, rel_url, base_href)
- -- is relative with leading slash? ie /dir1/foo.html
- local leading_slash = rel_url:match("^/")
- rel_url = rel_url:match("^/?(.*)") or '/'
+      -- is it relative with a leading slash? e.g. /dir1/foo.html
+ local leading_slash = rel_url:match("^/")
+ rel_url = rel_url:match("^/?(.*)") or '/'
- -- check for tailing slash
- if ( base_href and not(base_href:match("/$") ) ) then
- base_href = base_href .. '/'
- end
+      -- check for trailing slash
+ if ( base_href and not(base_href:match("/$") ) ) then
+ base_href = base_href .. '/'
+ end
- if ( ( base_url:getProto() == 'https' and base_url:getPort() == 443 ) or
- ( base_url:getProto() == 'http' and base_url:getPort() == 80 ) ) then
-
- if ( leading_slash ) then
- return ("%s://%s/%s"):format(base_url:getProto(), base_url:getHost(), rel_url)
- else
- if ( base_href ) then
- return ("%s%s"):format(base_href, rel_url)
- else
- return ("%s://%s%s%s"):format(base_url:getProto(), base_url:getHost(), base_url:getDir(), rel_url)
- end
- end
- else
- if ( leading_slash ) then
- return ("%s://%s:%d/%s"):format(base_url:getProto(), base_url:getHost(), base_url:getPort(), rel_url)
- else
- if ( base_href ) then
- return ("%s%s"):format(base_href, rel_url)
- else
- return ("%s://%s:%d%s%s"):format(base_url:getProto(), base_url:getHost(), base_href or base_url:getPort(), base_url:getDir(), rel_url)
- end
- end
- end
- end,
-
- -- Gets the depth of the link, relative to our base url eg.
- -- base_url = http://www.cqure.net/wp/
- -- url = http://www.cqure.net/wp/ - depth: 0
- -- url = http://www.cqure.net/wp/index.php - depth: 0
- -- url = http://www.cqure.net/wp/2011/index.php - depth: 1
- -- url = http://www.cqure.net/index.html - depth: -1
- --
- -- @param url instance of URL
- -- @return depth number containing the depth relative to the base_url
- getDepth = function(self, url)
- local base_dir, url_dir = self.options.base_url:getDir(), url:getDir()
- if ( url_dir and base_dir ) then
- local m = url_dir:match(base_dir.."(.*)")
- if ( not(m) ) then
- return -1
- else
- local _, depth = m:gsub("/", "/")
- return depth
- end
- end
- end,
-
- validate_link = function(self, url)
- local valid = true
+ if ( ( base_url:getProto() == 'https' and base_url:getPort() == 443 ) or
+ ( base_url:getProto() == 'http' and base_url:getPort() == 80 ) ) then
+
+ if ( leading_slash ) then
+ return ("%s://%s/%s"):format(base_url:getProto(), base_url:getHost(), rel_url)
+ else
+ if ( base_href ) then
+ return ("%s%s"):format(base_href, rel_url)
+ else
+ return ("%s://%s%s%s"):format(base_url:getProto(), base_url:getHost(), base_url:getDir(), rel_url)
+ end
+ end
+ else
+ if ( leading_slash ) then
+ return ("%s://%s:%d/%s"):format(base_url:getProto(), base_url:getHost(), base_url:getPort(), rel_url)
+ else
+ if ( base_href ) then
+ return ("%s%s"):format(base_href, rel_url)
+ else
+ return ("%s://%s:%d%s%s"):format(base_url:getProto(), base_url:getHost(), base_href or base_url:getPort(), base_url:getDir(), rel_url)
+ end
+ end
+ end
+ end,
+
+    -- Gets the depth of the link, relative to our base url, e.g.
+ -- base_url = http://www.cqure.net/wp/
+ -- url = http://www.cqure.net/wp/ - depth: 0
+ -- url = http://www.cqure.net/wp/index.php - depth: 0
+ -- url = http://www.cqure.net/wp/2011/index.php - depth: 1
+ -- url = http://www.cqure.net/index.html - depth: -1
+ --
+ -- @param url instance of URL
+ -- @return depth number containing the depth relative to the base_url
+ getDepth = function(self, url)
+ local base_dir, url_dir = self.options.base_url:getDir(), url:getDir()
+ if ( url_dir and base_dir ) then
+ local m = url_dir:match(base_dir.."(.*)")
+ if ( not(m) ) then
+ return -1
+ else
+ local _, depth = m:gsub("/", "/")
+ return depth
+ end
+ end
+ end,
+
+ validate_link = function(self, url)
+ local valid = true
- -- if our url is nil, abort, this could be due to a number of
- -- reasons such as unsupported protocols: javascript, mail ... or
- -- that the URL failed to parse for some reason
- if ( url == nil or tostring(url) == nil ) then
- return false
- end
+      -- if our url is nil, abort; this could be due to a number of
+      -- reasons, such as unsupported protocols (javascript, mailto, ...) or
+      -- the URL simply failing to parse
+ if ( url == nil or tostring(url) == nil ) then
+ return false
+ end
- -- linkdepth trumps whitelisting
- if ( self.options.maxdepth and self.options.maxdepth >= 0 ) then
- local depth = self:getDepth( url )
- if ( -1 == depth or depth > self.options.maxdepth ) then
- stdnse.print_debug(3, "%s: Skipping link depth: %d; b_url=%s; url=%s", LIBRARY_NAME, depth, tostring(self.options.base_url), tostring(url))
- return false
- end
- end
+ -- linkdepth trumps whitelisting
+ if ( self.options.maxdepth and self.options.maxdepth >= 0 ) then
+ local depth = self:getDepth( url )
+ if ( -1 == depth or depth > self.options.maxdepth ) then
+ stdnse.print_debug(3, "%s: Skipping link depth: %d; b_url=%s; url=%s", LIBRARY_NAME, depth, tostring(self.options.base_url), tostring(url))
+ return false
+ end
+ end
- -- withindomain trumps any whitelisting
- if ( self.options.withindomain ) then
- if ( not(self.options.withindomain(url)) ) then
- stdnse.print_debug(2, "%s: Link is not within domain: %s", LIBRARY_NAME, tostring(url))
- return false
- end
- end
+ -- withindomain trumps any whitelisting
+ if ( self.options.withindomain ) then
+ if ( not(self.options.withindomain(url)) ) then
+ stdnse.print_debug(2, "%s: Link is not within domain: %s", LIBRARY_NAME, tostring(url))
+ return false
+ end
+ end
- -- withinhost trumps any whitelisting
- if ( self.options.withinhost ) then
- if ( not(self.options.withinhost(url)) ) then
- stdnse.print_debug(2, "%s: Link is not within host: %s", LIBRARY_NAME, tostring(url))
- return false
- end
- end
+ -- withinhost trumps any whitelisting
+ if ( self.options.withinhost ) then
+ if ( not(self.options.withinhost(url)) ) then
+ stdnse.print_debug(2, "%s: Link is not within host: %s", LIBRARY_NAME, tostring(url))
+ return false
+ end
+ end
- -- run through all blacklists
- if ( #self.options.blacklist > 0 ) then
- for _, func in ipairs(self.options.blacklist) do
- if ( func(url) ) then
- stdnse.print_debug(2, "%s: Blacklist match: %s", LIBRARY_NAME, tostring(url))
- valid = false
- break
- end
- end
- end
+ -- run through all blacklists
+ if ( #self.options.blacklist > 0 ) then
+ for _, func in ipairs(self.options.blacklist) do
+ if ( func(url) ) then
+ stdnse.print_debug(2, "%s: Blacklist match: %s", LIBRARY_NAME, tostring(url))
+ valid = false
+ break
+ end
+ end
+ end
- -- check the url against our whitelist
- if ( #self.options.whitelist > 0 ) then
- valid = false
- for _, func in ipairs(self.options.whitelist) do
- if ( func(url) ) then
- stdnse.print_debug(2, "%s: Whitelist match: %s", LIBRARY_NAME, tostring(url))
- valid = true
- break
- end
- end
- end
- return valid
- end,
+ -- check the url against our whitelist
+ if ( #self.options.whitelist > 0 ) then
+ valid = false
+ for _, func in ipairs(self.options.whitelist) do
+ if ( func(url) ) then
+ stdnse.print_debug(2, "%s: Whitelist match: %s", LIBRARY_NAME, tostring(url))
+ valid = true
+ break
+ end
+ end
+ end
+ return valid
+ end,
- -- Parses a HTML response and extracts all links it can find
- -- The function currently supports href, src and action links
- -- Also all behaviour options, such as depth, white- and black-list are
- -- processed in here.
- parse = function(self)
- local links = {}
- local patterns = {
- '[hH][rR][eE][fF]%s*=%s*[\'"]%s*([^"^\']-)%s*[\'"]',
- '[hH][rR][eE][fF]%s*=%s*([^\'\"][^%s>]+)',
- '[sS][rR][cC]%s*=%s*[\'"]%s*([^"^\']-)%s*[\'"]',
- '[sS][rR][cC]%s*=%s*([^\'\"][^%s>]+)',
- '[aA][cC][tT][iI][oO][nN]%s*=%s*[\'"]%s*([^"^\']+%s*)[\'"]',
- }
-
- local base_hrefs = {
- '[Bb][Aa][Ss][Ee]%s*[Hh][Rr][Ee][Ff]%s*=%s*[\'"](%s*[^"^\']+%s*)[\'"]',
- '[Bb][Aa][Ss][Ee]%s*[Hh][Rr][Ee][Ff]%s*=%s*([^\'\"][^%s>]+)'
- }
-
- local base_href
- for _, pattern in ipairs(base_hrefs) do
- base_href = self.html:match(pattern)
- if ( base_href ) then
- break
- end
- end
+    -- Parses an HTML response and extracts all links it can find.
+    -- The function currently supports href, src and action links.
+    -- All behaviour options, such as depth, whitelist and blacklist, are
+    -- also processed in here.
+ parse = function(self)
+ local links = {}
+ local patterns = {
+ '[hH][rR][eE][fF]%s*=%s*[\'"]%s*([^"^\']-)%s*[\'"]',
+ '[hH][rR][eE][fF]%s*=%s*([^\'\"][^%s>]+)',
+ '[sS][rR][cC]%s*=%s*[\'"]%s*([^"^\']-)%s*[\'"]',
+ '[sS][rR][cC]%s*=%s*([^\'\"][^%s>]+)',
+ '[aA][cC][tT][iI][oO][nN]%s*=%s*[\'"]%s*([^"^\']+%s*)[\'"]',
+ }
+
+ local base_hrefs = {
+ '[Bb][Aa][Ss][Ee]%s*[Hh][Rr][Ee][Ff]%s*=%s*[\'"](%s*[^"^\']+%s*)[\'"]',
+ '[Bb][Aa][Ss][Ee]%s*[Hh][Rr][Ee][Ff]%s*=%s*([^\'\"][^%s>]+)'
+ }
+
+ local base_href
+ for _, pattern in ipairs(base_hrefs) do
+ base_href = self.html:match(pattern)
+ if ( base_href ) then
+ break
+ end
+ end
- for _, pattern in ipairs(patterns) do
- for l in self.html:gmatch(pattern) do
- local link = l
- if ( not(LinkExtractor.isAbsolute(l)) ) then
- link = LinkExtractor.createAbsolute(self.url, l, base_href)
- end
-
- local url = URL:new(link)
-
- local valid = self:validate_link(url)
-
- if ( valid ) then
- stdnse.print_debug(3, "%s: Adding link: %s", LIBRARY_NAME, tostring(url))
- links[tostring(url)] = true
- elseif ( tostring(url) ) then
- stdnse.print_debug(3, "%s: Skipping url: %s", LIBRARY_NAME, link)
- end
- end
- end
-
- for link in pairs(links) do
- table.insert(self.links, link)
- end
-
- end,
+ for _, pattern in ipairs(patterns) do
+ for l in self.html:gmatch(pattern) do
+ local link = l
+ if ( not(LinkExtractor.isAbsolute(l)) ) then
+ link = LinkExtractor.createAbsolute(self.url, l, base_href)
+ end
+
+ local url = URL:new(link)
+
+ local valid = self:validate_link(url)
+
+ if ( valid ) then
+ stdnse.print_debug(3, "%s: Adding link: %s", LIBRARY_NAME, tostring(url))
+ links[tostring(url)] = true
+ elseif ( tostring(url) ) then
+ stdnse.print_debug(3, "%s: Skipping url: %s", LIBRARY_NAME, link)
+ end
+ end
+ end
+
+ for link in pairs(links) do
+ table.insert(self.links, link)
+ end
+
+ end,
- -- Gets a table containing all of the retrieved URLs, after filtering
- -- has been applied.
- getLinks = function(self) return self.links end,
-
-
+ -- Gets a table containing all of the retrieved URLs, after filtering
+ -- has been applied.
+ getLinks = function(self) return self.links end,
+
+
}
-- The URL class, containing code to process URLS
-- This class is heavily inspired by the Java URL class
URL = {
-
- -- Creates a new instance of URL
- -- @param url string containing the text representation of a URL
- -- @return o instance of URL, in case of parsing being successful
- -- nil in case parsing fails
- new = function(self, url)
- local o = {
- raw = url,
- }
-
- setmetatable(o, self)
- self.__index = self
- if ( o:parse() ) then
- return o
- end
- end,
-
- -- Parses the string representation of the URL and splits it into different
- -- URL components
- -- @return status true on success, false on failure
- parse = function(self)
- self.proto, self.host, self.port, self.file = self.raw:match("^(http[s]?)://([^:/]*)[:]?(%d*)")
- if ( self.proto and self.host ) then
- self.file = self.raw:match("^http[s]?://[^:/]*[:]?%d*(/[^#]*)") or '/'
- self.port = tonumber(self.port)
- if ( not(self.port) ) then
- if ( self.proto:match("https") ) then
- self.port = 443
- elseif ( self.proto:match("http")) then
- self.port = 80
- end
- end
-
- self.path = self.file:match("^([^?]*)[%?]?")
- self.dir = self.path:match("^(.+%/)") or "/"
- self.domain= self.host:match("^[^%.]-%.(.*)")
- return true
- elseif( self.raw:match("^javascript:") ) then
- stdnse.print_debug(2, "%s: Skipping javascript url: %s", LIBRARY_NAME, self.raw)
- elseif( self.raw:match("^mailto:") ) then
- stdnse.print_debug(2, "%s: Skipping mailto link: %s", LIBRARY_NAME, self.raw)
- else
- stdnse.print_debug(2, "%s: WARNING: Failed to parse url: %s", LIBRARY_NAME, self.raw)
- end
- return false
- end,
-
- -- Get's the host portion of the URL
- -- @return host string containing the hostname
- getHost = function(self) return self.host end,
-
- -- Get's the protocol representation of the URL
- -- @return proto string containing the protocol (ie. http, https)
- getProto = function(self) return self.proto end,
-
- -- Returns the filename component of the URL.
- -- @return file string containing the path and query components of the url
- getFile = function(self) return self.file end,
-
- -- Gets the port component of the URL
- -- @return port number containing the port of the URL
- getPort = function(self) return self.port end,
-
- -- Gets the path component of the URL
- -- @return the full path and filename of the URL
- getPath = function(self) return self.path end,
-
- -- Gets the directory component of the URL
- -- @return directory string containing the directory part of the URL
- getDir = function(self) return self.dir end,
-
- -- Gets the domain component of the URL
- -- @return domain string containing the hosts domain
- getDomain = function(self)
- if ( self.domain ) then
- return self.domain
- -- fallback to the host, if we can't find a domain
- else
- return self.host
- end
- end,
-
- -- Converts the URL to a string
- -- @return url string containing the string representation of the url
- __tostring = function(self) return self.raw end,
+
+ -- Creates a new instance of URL
+ -- @param url string containing the text representation of a URL
+  -- @return o instance of URL if parsing was successful,
+ -- nil in case parsing fails
+ new = function(self, url)
+ local o = {
+ raw = url,
+ }
+
+ setmetatable(o, self)
+ self.__index = self
+ if ( o:parse() ) then
+ return o
+ end
+ end,
+
+ -- Parses the string representation of the URL and splits it into different
+ -- URL components
+ -- @return status true on success, false on failure
+ parse = function(self)
+ self.proto, self.host, self.port, self.file = self.raw:match("^(http[s]?)://([^:/]*)[:]?(%d*)")
+ if ( self.proto and self.host ) then
+ self.file = self.raw:match("^http[s]?://[^:/]*[:]?%d*(/[^#]*)") or '/'
+ self.port = tonumber(self.port)
+ if ( not(self.port) ) then
+ if ( self.proto:match("https") ) then
+ self.port = 443
+ elseif ( self.proto:match("http")) then
+ self.port = 80
+ end
+ end
+
+ self.path = self.file:match("^([^?]*)[%?]?")
+ self.dir = self.path:match("^(.+%/)") or "/"
+ self.domain= self.host:match("^[^%.]-%.(.*)")
+ return true
+ elseif( self.raw:match("^javascript:") ) then
+ stdnse.print_debug(2, "%s: Skipping javascript url: %s", LIBRARY_NAME, self.raw)
+ elseif( self.raw:match("^mailto:") ) then
+ stdnse.print_debug(2, "%s: Skipping mailto link: %s", LIBRARY_NAME, self.raw)
+ else
+ stdnse.print_debug(2, "%s: WARNING: Failed to parse url: %s", LIBRARY_NAME, self.raw)
+ end
+ return false
+ end,
+
+  -- Gets the host portion of the URL
+ -- @return host string containing the hostname
+ getHost = function(self) return self.host end,
+
+  -- Gets the protocol representation of the URL
+  -- @return proto string containing the protocol (e.g. http, https)
+ getProto = function(self) return self.proto end,
+
+ -- Returns the filename component of the URL.
+ -- @return file string containing the path and query components of the url
+ getFile = function(self) return self.file end,
+
+ -- Gets the port component of the URL
+ -- @return port number containing the port of the URL
+ getPort = function(self) return self.port end,
+
+ -- Gets the path component of the URL
+ -- @return the full path and filename of the URL
+ getPath = function(self) return self.path end,
+
+ -- Gets the directory component of the URL
+ -- @return directory string containing the directory part of the URL
+ getDir = function(self) return self.dir end,
+
+ -- Gets the domain component of the URL
+  -- @return domain string containing the host's domain
+ getDomain = function(self)
+ if ( self.domain ) then
+ return self.domain
+    -- fall back to the host if we can't find a domain
+ else
+ return self.host
+ end
+ end,
+
+ -- Converts the URL to a string
+ -- @return url string containing the string representation of the url
+ __tostring = function(self) return self.raw end,
}
-- An UrlQueue
UrlQueue = {
-
- -- creates a new instance of UrlQueue
- -- @param options table containing options
- -- @return o new instance of UrlQueue
- new = function(self, options)
- local o = {
- urls = {},
- options = options
- }
- setmetatable(o, self)
- self.__index = self
- return o
- end,
-
- -- get's the next available url in the queue
- getNext = function(self)
- return table.remove(self.urls,1)
- end,
-
- -- adds a new url to the queue
- -- @param url can be either a string or a URL or a table of URLs
- add = function(self, url)
- assert( type(url) == 'string' or type(url) == 'table', "url was neither a string or table")
- local urls = ( 'string' == type(url) ) and URL:new(url) or url
-
- -- if it's a table, it can be either a single URL or an array of URLs
- if ( 'table' == type(url) and url.raw ) then
- urls = { url }
- end
-
- for _, u in ipairs(urls) do
- u = ( 'string' == type(u) ) and URL:new(u) or u
- if ( u ) then
- table.insert(self.urls, u)
- else
- stdnse.print_debug("ERROR: Invalid URL: %s", url)
- end
- end
- end,
-
- -- dumps the contents of the UrlQueue
- dump = function(self)
- for _, url in ipairs(self.urls) do
- print("url:", url)
- end
- end,
-
+
+ -- creates a new instance of UrlQueue
+ -- @param options table containing options
+ -- @return o new instance of UrlQueue
+ new = function(self, options)
+ local o = {
+ urls = {},
+ options = options
+ }
+ setmetatable(o, self)
+ self.__index = self
+ return o
+ end,
+
+  -- gets the next available url in the queue
+ getNext = function(self)
+ return table.remove(self.urls,1)
+ end,
+
+ -- adds a new url to the queue
+ -- @param url can be either a string or a URL or a table of URLs
+ add = function(self, url)
+    assert( type(url) == 'string' or type(url) == 'table', "url was neither a string nor a table")
+ local urls = ( 'string' == type(url) ) and URL:new(url) or url
+
+ -- if it's a table, it can be either a single URL or an array of URLs
+ if ( 'table' == type(url) and url.raw ) then
+ urls = { url }
+ end
+
+ for _, u in ipairs(urls) do
+ u = ( 'string' == type(u) ) and URL:new(u) or u
+ if ( u ) then
+ table.insert(self.urls, u)
+ else
+ stdnse.print_debug("ERROR: Invalid URL: %s", url)
+ end
+ end
+ end,
+
+ -- dumps the contents of the UrlQueue
+ dump = function(self)
+ for _, url in ipairs(self.urls) do
+ print("url:", url)
+ end
+ end,
+
}
-- The Crawler class
Crawler = {
-
- -- creates a new instance of the Crawler instance
- -- @param host table as received by the action method
- -- @param port table as received by the action method
- -- @param url string containing the relative URL
- -- @param options table of options:
- -- noblacklist - do not load default blacklist
- -- base_url - start url to crawl
- -- timeout - timeout for the http request
- -- maxdepth - the maximum directory depth to crawl
- -- maxpagecount - the maximum amount of pages to retrieve
- -- withinhost - stay within the host of the base_url
- -- withindomain - stay within the base_url domain
- -- scriptname - should be set to SCRIPT_NAME to enable
- -- script specific arguments.
- -- redirect_ok - redirect_ok closure to pass to http.get function
- -- no_cache - no_cache option to pass to http.get function
- -- @return o new instance of Crawler or nil on failure
- new = function(self, host, port, url, options)
- local o = {
- host = host,
- port = port,
- url = url,
- options = options or {},
- basethread = stdnse.base(),
- }
+
+  -- creates a new Crawler instance
+ -- @param host table as received by the action method
+ -- @param port table as received by the action method
+ -- @param url string containing the relative URL
+ -- @param options table of options:
+ -- noblacklist - do not load default blacklist
+ -- base_url - start url to crawl
+ -- timeout - timeout for the http request
+ -- maxdepth - the maximum directory depth to crawl
+  --        maxpagecount - the maximum number of pages to retrieve
+ -- withinhost - stay within the host of the base_url
+ -- withindomain - stay within the base_url domain
+ -- scriptname - should be set to SCRIPT_NAME to enable
+  --                     script-specific arguments.
+ -- redirect_ok - redirect_ok closure to pass to http.get function
+ -- no_cache - no_cache option to pass to http.get function
+ -- @return o new instance of Crawler or nil on failure
+ new = function(self, host, port, url, options)
+ local o = {
+ host = host,
+ port = port,
+ url = url,
+ options = options or {},
+ basethread = stdnse.base(),
+ }
- setmetatable(o, self)
- self.__index = self
+ setmetatable(o, self)
+ self.__index = self
- o:loadScriptArguments()
- o:loadLibraryArguments()
- o:loadDefaultArguments()
+ o:loadScriptArguments()
+ o:loadLibraryArguments()
+ o:loadDefaultArguments()
- local response = http.get(o.host, o.port, '/', { timeout = o.options.timeout, redirect_ok = o.options.redirect_ok, no_cache = o.options.no_cache } )
-
- if ( not(response) or 'table' ~= type(response) ) then
- return
- end
-
- o.url = o.url:match("/?(.*)")
-
- local u_host = o.host.targetname or o.host.name
- if ( not(u_host) or 0 == #u_host ) then
- u_host = o.host.ip
- end
- local u = ("%s://%s:%d/%s"):format(response.ssl and "https" or "http", u_host, o.port.number, o.url)
- o.options.base_url = URL:new(u)
- o.options = Options:new(o.options)
- o.urlqueue = UrlQueue:new(o.options)
- o.urlqueue:add(o.options.base_url)
+ local response = http.get(o.host, o.port, '/', { timeout = o.options.timeout, redirect_ok = o.options.redirect_ok, no_cache = o.options.no_cache } )
+
+ if ( not(response) or 'table' ~= type(response) ) then
+ return
+ end
+
+ o.url = o.url:match("/?(.*)")
+
+ local u_host = o.host.targetname or o.host.name
+ if ( not(u_host) or 0 == #u_host ) then
+ u_host = o.host.ip
+ end
+ local u = ("%s://%s:%d/%s"):format(response.ssl and "https" or "http", u_host, o.port.number, o.url)
+ o.options.base_url = URL:new(u)
+ o.options = Options:new(o.options)
+ o.urlqueue = UrlQueue:new(o.options)
+ o.urlqueue:add(o.options.base_url)
- o.options.timeout = o.options.timeout or 10000
- o.processed = {}
-
- -- script arguments have precedense
- if ( not(o.options.maxdepth) ) then
- o.options.maxdepth = tonumber(stdnse.get_script_args("httpspider.maxdepth"))
- end
-
- -- script arguments have precedense
- if ( not(o.options.maxpagecount) ) then
- o.options.maxpagecount = tonumber(stdnse.get_script_args("httpspider.maxpagecount"))
- end
-
- if ( not(o.options.noblacklist) ) then
- o:addDefaultBlacklist()
- end
+ o.options.timeout = o.options.timeout or 10000
+ o.processed = {}
+
+    -- script arguments have precedence
+ if ( not(o.options.maxdepth) ) then
+ o.options.maxdepth = tonumber(stdnse.get_script_args("httpspider.maxdepth"))
+ end
+
+    -- script arguments have precedence
+ if ( not(o.options.maxpagecount) ) then
+ o.options.maxpagecount = tonumber(stdnse.get_script_args("httpspider.maxpagecount"))
+ end
+
+ if ( not(o.options.noblacklist) ) then
+ o:addDefaultBlacklist()
+ end
if ( o.options.useheadfornonwebfiles ) then
-- Load web files extensitons from a file in nselib/data folder.
@@ -571,110 +571,110 @@ Crawler = {
end
end
end
-
- stdnse.print_debug(2, "%s: %s", LIBRARY_NAME, o:getLimitations())
-
- return o
- end,
-
- -- Set's the timeout used by the http library
- -- @param timeout number containing the timeout in ms.
- set_timeout = function(self, timeout)
- self.options.timeout = timeout
- end,
-
- -- Get's the amount of pages that has been retrieved
- -- @return count number of pages retrieved by the instance
- getPageCount = function(self)
- local count = 1
- for url in pairs(self.processed) do
- count = count + 1
- end
- return count
- end,
-
- -- Adds a default blacklist blocking binary files such as images,
- -- compressed archives and executable files
- addDefaultBlacklist = function(self)
- local extensions = {
- image_extensions = {"png","jpg","jpeg","gif","bmp"},
- video_extensions = {"avi","flv","ogg","mp4","wmv"},
- audio_extensions = {"aac","m4a","mp3","wav"},
- doc_extensions = {"pdf", "doc", "docx", "docm", "xls", "xlsx", "xlsm",
- "ppt", "pptx", "pptm", "odf", "ods", "odp", "ps", "xps"},
- archive_extensions = {"zip", "tar.gz", "gz", "rar", "7z", "sit", "sitx",
- "tgz", "tar.bz", "tar", "iso"},
- exe_extensions = {"exe", "com", "msi", "bin","dmg"}
- }
- local blacklist = {}
- for _, cat in pairs(extensions) do
- for _, ext in ipairs(cat) do
- table.insert(blacklist, string.format(".%s$", ext))
- end
- end
+
+ stdnse.print_debug(2, "%s: %s", LIBRARY_NAME, o:getLimitations())
+
+ return o
+ end,
+
+  -- Sets the timeout used by the http library
+ -- @param timeout number containing the timeout in ms.
+ set_timeout = function(self, timeout)
+ self.options.timeout = timeout
+ end,
+
+  -- Gets the number of pages that have been retrieved
+ -- @return count number of pages retrieved by the instance
+ getPageCount = function(self)
+ local count = 1
+ for url in pairs(self.processed) do
+ count = count + 1
+ end
+ return count
+ end,
+
+ -- Adds a default blacklist blocking binary files such as images,
+ -- compressed archives and executable files
+ addDefaultBlacklist = function(self)
+ local extensions = {
+ image_extensions = {"png","jpg","jpeg","gif","bmp"},
+ video_extensions = {"avi","flv","ogg","mp4","wmv"},
+ audio_extensions = {"aac","m4a","mp3","wav"},
+ doc_extensions = {"pdf", "doc", "docx", "docm", "xls", "xlsx", "xlsm",
+ "ppt", "pptx", "pptm", "odf", "ods", "odp", "ps", "xps"},
+ archive_extensions = {"zip", "tar.gz", "gz", "rar", "7z", "sit", "sitx",
+ "tgz", "tar.bz", "tar", "iso"},
+ exe_extensions = {"exe", "com", "msi", "bin","dmg"}
+ }
+ local blacklist = {}
+ for _, cat in pairs(extensions) do
+ for _, ext in ipairs(cat) do
+ table.insert(blacklist, string.format(".%s$", ext))
+ end
+ end
- self.options:addBlacklist( function(url)
- local p = url:getPath():lower()
- for _, pat in ipairs(blacklist) do
- if ( p:match(pat) ) then
- return true
- end
- end
- end )
- end,
-
- -- does the heavy crawling
- --
- -- The crawler may exit due to a number of different reasons, including
- -- invalid options, reaching max count or simply running out of links
- -- We return a false status for all of these and in case the error was
- -- unexpected or requires attention we set the error property accordingly.
- -- This way the script can alert the user of the details by calling
- -- getError()
- crawl_thread = function(self, response_queue)
- local condvar = nmap.condvar(response_queue)
+ self.options:addBlacklist( function(url)
+ local p = url:getPath():lower()
+ for _, pat in ipairs(blacklist) do
+ if ( p:match(pat) ) then
+ return true
+ end
+ end
+ end )
+ end,
+
+ -- does the heavy crawling
+ --
+  -- The crawler may exit for a number of different reasons, including
+  -- invalid options, reaching the max page count or simply running out of links.
+  -- We return a false status for all of these, and in case the error was
+  -- unexpected or requires attention we set the error property accordingly.
+ -- This way the script can alert the user of the details by calling
+ -- getError()
+ crawl_thread = function(self, response_queue)
+ local condvar = nmap.condvar(response_queue)
- if ( false ~= self.options.withinhost and false ~= self.options.withindomain ) then
- table.insert(response_queue, { false, { err = true, reason = "Invalid options: withinhost and withindomain can't both be true" } })
- condvar "signal"
- return
- end
+ if ( false ~= self.options.withinhost and false ~= self.options.withindomain ) then
+ table.insert(response_queue, { false, { err = true, reason = "Invalid options: withinhost and withindomain can't both be true" } })
+ condvar "signal"
+ return
+ end
- while(true) do
-
- if ( self.quit or coroutine.status(self.basethread) == 'dead' ) then
- table.insert(response_queue, {false, { err = false, msg = "Quit signalled by crawler" } })
- break
- end
-
- -- in case the user set a max page count to retrieve check how many
- -- pages we have retrieved so far
- local count = self:getPageCount()
- if ( self.options.maxpagecount and
- ( count > self.options.maxpagecount ) ) then
- table.insert(response_queue, { false, { err = false, msg = "Reached max page count" } })
- condvar "signal"
- return
- end
-
- -- pull links from the queue until we get a valid one
- local url
- repeat
- url = self.urlqueue:getNext()
- until( not(url) or not(self.processed[tostring(url)]) )
+ while(true) do
+
+ if ( self.quit or coroutine.status(self.basethread) == 'dead' ) then
+ table.insert(response_queue, {false, { err = false, msg = "Quit signalled by crawler" } })
+ break
+ end
+
+      -- in case the user set a max page count to retrieve, check how many
+ -- pages we have retrieved so far
+ local count = self:getPageCount()
+ if ( self.options.maxpagecount and
+ ( count > self.options.maxpagecount ) ) then
+ table.insert(response_queue, { false, { err = false, msg = "Reached max page count" } })
+ condvar "signal"
+ return
+ end
+
+ -- pull links from the queue until we get a valid one
+ local url
+ repeat
+ url = self.urlqueue:getNext()
+ until( not(url) or not(self.processed[tostring(url)]) )
- -- if no url could be retrieved from the queue, abort ...
- if ( not(url) ) then
- table.insert(response_queue, { false, { err = false, msg = "No more urls" } })
- condvar "signal"
- return
- end
-
- if ( self.options.maxpagecount ) then
- stdnse.print_debug(2, "%s: Fetching url [%d of %d]: %s", LIBRARY_NAME, count, self.options.maxpagecount, tostring(url))
- else
- stdnse.print_debug(2, "%s: Fetching url: %s", LIBRARY_NAME, tostring(url))
- end
+ -- if no url could be retrieved from the queue, abort ...
+ if ( not(url) ) then
+ table.insert(response_queue, { false, { err = false, msg = "No more urls" } })
+ condvar "signal"
+ return
+ end
+
+ if ( self.options.maxpagecount ) then
+ stdnse.print_debug(2, "%s: Fetching url [%d of %d]: %s", LIBRARY_NAME, count, self.options.maxpagecount, tostring(url))
+ else
+ stdnse.print_debug(2, "%s: Fetching url: %s", LIBRARY_NAME, tostring(url))
+ end
local response
-- in case we want to use HEAD rather than GET for files with certain extensions
@@ -699,198 +699,198 @@ Crawler = {
response = http.head(url:getHost(), url:getPort(), url:getFile())
end
else
- -- fetch the url, and then push it to the processed table
- response = http.get(url:getHost(), url:getPort(), url:getFile(), { timeout = self.options.timeout, redirect_ok = self.options.redirect_ok, no_cache = self.options.no_cache } )
- end
+ -- fetch the url, and then push it to the processed table
+ response = http.get(url:getHost(), url:getPort(), url:getFile(), { timeout = self.options.timeout, redirect_ok = self.options.redirect_ok, no_cache = self.options.no_cache } )
+ end
- self.processed[tostring(url)] = true
+ self.processed[tostring(url)] = true
- if ( response ) then
- -- were we redirected?
- if ( response.location ) then
- -- was the link absolute?
- local link = response.location[#response.location]
- if ( link:match("^http") ) then
- url = URL:new(link)
- -- guess not
- else
- url.path = link
- end
- end
- -- if we have a response, proceed scraping it
- if ( response.body ) then
- local links = LinkExtractor:new(url, response.body, self.options):getLinks()
- self.urlqueue:add(links)
- end
- else
- response = { body = "", headers = {} }
- end
- table.insert(response_queue, { true, { url = url, response = response } } )
- while ( PREFETCH_SIZE < #response_queue ) do
- stdnse.print_debug(2, "%s: Response queue full, waiting ...", LIBRARY_NAME)
- condvar "wait"
- end
- condvar "signal"
- end
- condvar "signal"
- end,
-
- -- Loads the argument set on a script level
- loadScriptArguments = function(self)
- local sn = self.options.scriptname
- if ( not(sn) ) then
- stdnse.print_debug("%s: WARNING: Script argument could not be loaded as scriptname was not set", LIBRARY_NAME)
- return
- end
-
- if ( nil == self.options.maxdepth ) then
- self.options.maxdepth = tonumber(stdnse.get_script_args(sn .. ".maxdepth"))
- end
- if ( nil == self.options.maxpagecount ) then
- self.options.maxpagecount = tonumber(stdnse.get_script_args(sn .. ".maxpagecount"))
- end
- if ( nil == self.url ) then
- self.url = stdnse.get_script_args(sn .. ".url")
- end
- if ( nil == self.options.withinhost ) then
- self.options.withinhost = stdnse.get_script_args(sn .. ".withinhost")
- end
- if ( nil == self.options.withindomain ) then
- self.options.withindomain = stdnse.get_script_args(sn .. ".withindomain")
- end
- if ( nil == self.options.noblacklist ) then
- self.options.noblacklist = stdnse.get_script_args(sn .. ".noblacklist")
- end
- if ( nil == self.options.useheadfornonwebfiles ) then
- self.options.useheadfornonwebfiles = stdnse.get_script_args(sn .. ".useheadfornonwebfiles")
- end
- end,
-
- -- Loads the argument on a library level
- loadLibraryArguments = function(self)
- local ln = LIBRARY_NAME
+ if ( response ) then
+ -- were we redirected?
+ if ( response.location ) then
+ -- was the link absolute?
+ local link = response.location[#response.location]
+ if ( link:match("^http") ) then
+ url = URL:new(link)
+ -- guess not
+ else
+ url.path = link
+ end
+ end
+        -- if we have a response, proceed to scrape it
+ if ( response.body ) then
+ local links = LinkExtractor:new(url, response.body, self.options):getLinks()
+ self.urlqueue:add(links)
+ end
+ else
+ response = { body = "", headers = {} }
+ end
+ table.insert(response_queue, { true, { url = url, response = response } } )
+ while ( PREFETCH_SIZE < #response_queue ) do
+ stdnse.print_debug(2, "%s: Response queue full, waiting ...", LIBRARY_NAME)
+ condvar "wait"
+ end
+ condvar "signal"
+ end
+ condvar "signal"
+ end,
+
+  -- Loads the arguments set at the script level
+ loadScriptArguments = function(self)
+ local sn = self.options.scriptname
+ if ( not(sn) ) then
+ stdnse.print_debug("%s: WARNING: Script argument could not be loaded as scriptname was not set", LIBRARY_NAME)
+ return
+ end
+
+ if ( nil == self.options.maxdepth ) then
+ self.options.maxdepth = tonumber(stdnse.get_script_args(sn .. ".maxdepth"))
+ end
+ if ( nil == self.options.maxpagecount ) then
+ self.options.maxpagecount = tonumber(stdnse.get_script_args(sn .. ".maxpagecount"))
+ end
+ if ( nil == self.url ) then
+ self.url = stdnse.get_script_args(sn .. ".url")
+ end
+ if ( nil == self.options.withinhost ) then
+ self.options.withinhost = stdnse.get_script_args(sn .. ".withinhost")
+ end
+ if ( nil == self.options.withindomain ) then
+ self.options.withindomain = stdnse.get_script_args(sn .. ".withindomain")
+ end
+ if ( nil == self.options.noblacklist ) then
+ self.options.noblacklist = stdnse.get_script_args(sn .. ".noblacklist")
+ end
+ if ( nil == self.options.useheadfornonwebfiles ) then
+ self.options.useheadfornonwebfiles = stdnse.get_script_args(sn .. ".useheadfornonwebfiles")
+ end
+ end,
+
+  -- Loads the arguments set at the library level
+ loadLibraryArguments = function(self)
+ local ln = LIBRARY_NAME
- if ( nil == self.options.maxdepth ) then
- self.options.maxdepth = tonumber(stdnse.get_script_args(ln .. ".maxdepth"))
- end
- if ( nil == self.options.maxpagecount ) then
- self.options.maxpagecount = tonumber(stdnse.get_script_args(ln .. ".maxpagecount"))
- end
- if ( nil == self.url ) then
- self.url = stdnse.get_script_args(ln .. ".url")
- end
- if ( nil == self.options.withinhost ) then
- self.options.withinhost = stdnse.get_script_args(ln .. ".withinhost")
- end
- if ( nil == self.options.withindomain ) then
- self.options.withindomain = stdnse.get_script_args(ln .. ".withindomain")
- end
- if ( nil == self.options.noblacklist ) then
- self.options.noblacklist = stdnse.get_script_args(ln .. ".noblacklist")
- end
- if ( nil == self.options.useheadfornonwebfiles ) then
- self.options.useheadfornonwebfiles = stdnse.get_script_args(ln .. ".useheadfornonwebfiles")
- end
- end,
-
- -- Loads any defaults for arguments that were not set
- loadDefaultArguments = function(self)
- local function tobool(b)
- if ( nil == b ) then
- return
- end
- assert("string" == type(b) or "boolean" == type(b) or "number" == type(b), "httpspider: tobool failed, unsupported type")
- if ( "string" == type(b) ) then
- if ( "true" == b ) then
- return true
- else
- return false
- end
- elseif ( "number" == type(b) ) then
- if ( 1 == b ) then
- return true
- else
- return false
- end
- end
- return b
- end
-
- -- fixup some booleans to make sure they're actually booleans
- self.options.withinhost = tobool(self.options.withinhost)
- self.options.withindomain = tobool(self.options.withindomain)
- self.options.noblacklist = tobool(self.options.noblacklist)
- self.options.useheadfornonwebfiles = tobool(self.options.useheadfornonwebfiles)
+ if ( nil == self.options.maxdepth ) then
+ self.options.maxdepth = tonumber(stdnse.get_script_args(ln .. ".maxdepth"))
+ end
+ if ( nil == self.options.maxpagecount ) then
+ self.options.maxpagecount = tonumber(stdnse.get_script_args(ln .. ".maxpagecount"))
+ end
+ if ( nil == self.url ) then
+ self.url = stdnse.get_script_args(ln .. ".url")
+ end
+ if ( nil == self.options.withinhost ) then
+ self.options.withinhost = stdnse.get_script_args(ln .. ".withinhost")
+ end
+ if ( nil == self.options.withindomain ) then
+ self.options.withindomain = stdnse.get_script_args(ln .. ".withindomain")
+ end
+ if ( nil == self.options.noblacklist ) then
+ self.options.noblacklist = stdnse.get_script_args(ln .. ".noblacklist")
+ end
+ if ( nil == self.options.useheadfornonwebfiles ) then
+ self.options.useheadfornonwebfiles = stdnse.get_script_args(ln .. ".useheadfornonwebfiles")
+ end
+ end,
+
+ -- Loads any defaults for arguments that were not set
+ loadDefaultArguments = function(self)
+ local function tobool(b)
+ if ( nil == b ) then
+ return
+ end
+ assert("string" == type(b) or "boolean" == type(b) or "number" == type(b), "httpspider: tobool failed, unsupported type")
+ if ( "string" == type(b) ) then
+ if ( "true" == b ) then
+ return true
+ else
+ return false
+ end
+ elseif ( "number" == type(b) ) then
+ if ( 1 == b ) then
+ return true
+ else
+ return false
+ end
+ end
+ return b
+ end
+
+ -- fixup some booleans to make sure they're actually booleans
+ self.options.withinhost = tobool(self.options.withinhost)
+ self.options.withindomain = tobool(self.options.withindomain)
+ self.options.noblacklist = tobool(self.options.noblacklist)
+ self.options.useheadfornonwebfiles = tobool(self.options.useheadfornonwebfiles)
- if ( self.options.withinhost == nil ) then
- if ( self.options.withindomain ~= true ) then
- self.options.withinhost = true
- else
- self.options.withinhost = false
- end
- end
- if ( self.options.withindomain == nil ) then
- self.options.withindomain = false
- end
- self.options.maxdepth = self.options.maxdepth or 3
- self.options.maxpagecount = self.options.maxpagecount or 20
- self.url = self.url or '/'
- end,
-
- -- gets a string of limitations imposed on the crawl
- getLimitations = function(self)
- local o = self.options
- local limits = {}
- if ( o.maxdepth > 0 or o.maxpagecount > 0 or
- o.withinhost or o.withindomain ) then
- if ( o.maxdepth > 0 ) then
- table.insert(limits, ("maxdepth=%d"):format(o.maxdepth))
- end
- if ( o.maxpagecount > 0 ) then
- table.insert(limits, ("maxpagecount=%d"):format(o.maxpagecount))
- end
- if ( o.withindomain ) then
- table.insert(limits, ("withindomain=%s"):format(o.base_url:getDomain() or o.base_url:getHost()))
- end
- if ( o.withinhost ) then
- table.insert(limits, ("withinhost=%s"):format(o.base_url:getHost()))
- end
- end
-
- if ( #limits > 0 ) then
- return ("Spidering limited to: %s"):format(stdnse.strjoin("; ", limits))
- end
- end,
-
- -- does the crawling
- crawl = function(self)
- self.response_queue = self.response_queue or {}
- local condvar = nmap.condvar(self.response_queue)
- if ( not(self.thread) ) then
- self.thread = stdnse.new_thread(self.crawl_thread, self, self.response_queue)
- end
+ if ( self.options.withinhost == nil ) then
+ if ( self.options.withindomain ~= true ) then
+ self.options.withinhost = true
+ else
+ self.options.withinhost = false
+ end
+ end
+ if ( self.options.withindomain == nil ) then
+ self.options.withindomain = false
+ end
+ self.options.maxdepth = self.options.maxdepth or 3
+ self.options.maxpagecount = self.options.maxpagecount or 20
+ self.url = self.url or '/'
+ end,
+
+ -- gets a string of limitations imposed on the crawl
+ getLimitations = function(self)
+ local o = self.options
+ local limits = {}
+ if ( o.maxdepth > 0 or o.maxpagecount > 0 or
+ o.withinhost or o.withindomain ) then
+ if ( o.maxdepth > 0 ) then
+ table.insert(limits, ("maxdepth=%d"):format(o.maxdepth))
+ end
+ if ( o.maxpagecount > 0 ) then
+ table.insert(limits, ("maxpagecount=%d"):format(o.maxpagecount))
+ end
+ if ( o.withindomain ) then
+ table.insert(limits, ("withindomain=%s"):format(o.base_url:getDomain() or o.base_url:getHost()))
+ end
+ if ( o.withinhost ) then
+ table.insert(limits, ("withinhost=%s"):format(o.base_url:getHost()))
+ end
+ end
+
+ if ( #limits > 0 ) then
+ return ("Spidering limited to: %s"):format(stdnse.strjoin("; ", limits))
+ end
+ end,
+
+ -- does the crawling
+ crawl = function(self)
+ self.response_queue = self.response_queue or {}
+ local condvar = nmap.condvar(self.response_queue)
+ if ( not(self.thread) ) then
+ self.thread = stdnse.new_thread(self.crawl_thread, self, self.response_queue)
+ end
- if ( #self.response_queue == 0 and coroutine.status(self.thread) ~= 'dead') then
- condvar "wait"
- end
- condvar "signal"
- if ( #self.response_queue == 0 ) then
- return false, { err = false, msg = "No more urls" }
- else
- return table.unpack(table.remove(self.response_queue, 1))
- end
- end,
-
- -- signals the crawler to stop
- stop = function(self)
- local condvar = nmap.condvar(self.response_queue)
- self.quit = true
- condvar "signal"
- if ( coroutine.status(self.thread) == "dead" ) then
- return
- end
- condvar "wait"
- end
+ if ( #self.response_queue == 0 and coroutine.status(self.thread) ~= 'dead') then
+ condvar "wait"
+ end
+ condvar "signal"
+ if ( #self.response_queue == 0 ) then
+ return false, { err = false, msg = "No more urls" }
+ else
+ return table.unpack(table.remove(self.response_queue, 1))
+ end
+ end,
+
+ -- signals the crawler to stop
+ stop = function(self)
+ local condvar = nmap.condvar(self.response_queue)
+ self.quit = true
+ condvar "signal"
+ if ( coroutine.status(self.thread) == "dead" ) then
+ return
+ end
+ condvar "wait"
+ end
}
return _ENV;
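
For reviewers who want to sanity-check the reindented API against a caller, here is a minimal usage sketch of the Crawler/Options interface defined in this file. It is not part of the change; the whitelist filter, timeout value and output handling are illustrative assumptions, and the snippet assumes the usual NSE SCRIPT_NAME global while omitting the rule/description boilerplate.

local httpspider = require "httpspider"
local stdnse = require "stdnse"

action = function(host, port)
  -- crawl from "/", staying within the host (the library default)
  local crawler = httpspider.Crawler:new(host, port, "/", { scriptname = SCRIPT_NAME })
  if ( not(crawler) ) then
    return
  end
  crawler:set_timeout(10000)

  -- hypothetical whitelist: only follow .html and .php resources
  crawler.options:addWhitelist(function(url)
    local p = url:getPath():lower()
    return p:match("%.html$") or p:match("%.php$")
  end)

  local results = {}
  while ( true ) do
    local status, r = crawler:crawl()
    if ( not(status) ) then
      if ( r.err ) then
        return stdnse.format_output(true, ("ERROR: %s"):format(r.reason))
      else
        break -- e.g. "No more urls" or "Reached max page count"
      end
    end
    -- r.url is a URL instance, r.response is the http response table
    table.insert(results, tostring(r.url))
  end
  return stdnse.format_output(true, results)
end

Note that, per validate_link above, whitelist filters only apply to links extracted from responses (the start URL is queued unconditionally), and the maxdepth, withinhost and withindomain checks run before any whitelist is consulted.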