From 1ecec300db5f2692a7dbc7ca30712a9a5923c8cd Mon Sep 17 00:00:00 2001
From: sophron
Date: Thu, 18 Jul 2013 14:03:42 +0000
Subject: [PATCH] Allowed callbacks to 'withinhost' and 'withindomain' options
 and introduced 'doscraping' option.

---
 nselib/httpspider.lua | 226 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 184 insertions(+), 42 deletions(-)

diff --git a/nselib/httpspider.lua b/nselib/httpspider.lua
index dde84ccba..c11dc96b1 100644
--- a/nselib/httpspider.lua
+++ b/nselib/httpspider.lua
@@ -38,6 +38,37 @@
 --   return result
 --
 --
+-- For advanced use, the library currently supports a number of closures (withinhost,
+-- withindomain, doscraping). Please note that the withinhost and withindomain options
+-- also support boolean values. You will want to override them only for advanced use.
+-- You can define them using the following utilities:
+--
+-- * iswithinhost
+-- ** You can use this utility to check if a resource exists within the host.
+--
+-- * iswithindomain
+-- ** You can use this utility to check if a resource exists within the domain.
+--
+-- * isresource
+-- ** You can use this utility to check the type of a resource (for example "js").
+-- ** A third, optional argument may hold a table of signs (characters) that may
+-- ** follow the extension of the resource. By default, these are [#, ?]. For example,
+-- ** when asking for php resources, the function will also match example.php?query=foo
+-- ** or example.php#foo.
+--
+-- The following sample code shows an example usage. We override the default
+-- withinhost method and allow spidering only of resources within the host
+-- that are not "js" or "css".
+--
+-- crawler.options.withinhost = function(url)
+--   if crawler:iswithinhost(url)
+--   and not crawler:isresource(url, "js")
+--   and not crawler:isresource(url, "css") then
+--     return true
+--   end
+-- end
+--
+--
 -- @author Patrik Karlsson
 --
 -- @args httpspider.maxdepth the maximum amount of directories beneath
@@ -47,17 +78,26 @@
 --       A negative value disables the limit (default: 20)
 -- @args httpspider.url the url to start spidering. This is a URL
 --       relative to the scanned host eg. /default.html (default: /)
--- @args httpspider.withinhost only spider URLs within the same host.
---       (default: true)
--- @args httpspider.withindomain only spider URLs within the same
+-- @args httpspider.withinhost Closure that overrides the default withinhost
+--       function that only spiders URLs within the same host. If this is
+--       set to false, the crawler will spider URLs both inside and outside
+--       the host. See the closure section above to override the default
+--       behaviour. (default: true)
+-- @args httpspider.withindomain Closure that overrides the default
+--       withindomain function that only spiders URLs within the same
 --       domain. This widens the scope from withinhost and can
---       not be used in combination. (default: false)
+--       not be used in combination. See the closure section above to
+--       override the default behaviour. (default: false)
 -- @args httpspider.noblacklist if set, doesn't load the default blacklist
 -- @args httpspider.useheadfornonwebfiles if set, the crawler would use
 --       HEAD instead of GET for files that do not have extensions indicating
 --       that they are webpages (the list of webpage extensions is located in
 --       nselib/data/http-web-files-extensions.lst)
---
+-- @args httpspider.doscraping Closure that overrides the default doscraping
+--       function used to check if the resource should be scraped (in terms
+--       of extracting any links within it). See the closure section above to
+--       override the default behaviour.
+---
 
 local coroutine = require "coroutine"
 local http = require "http"
@@ -87,40 +127,47 @@ Options = {
     o.blacklist = o.blacklist or {}
 
     local removewww = function(url) return string.gsub(url, "^www%.", "") end
-    if ( o.withinhost == true or o.withindomain == true ) then
-      -- set up the appropriate matching functions
-      if ( o.withinhost ) then
-        o.withinhost = function(u)
-          local parsed_u = url.parse(tostring(u))
-
-          if ( o.base_url:getPort() ~= 80 and o.base_url:getPort() ~= 443 ) then
-            if ( tonumber(parsed_u.port) ~= tonumber(o.base_url:getPort()) ) then
-              return false
-            end
-          elseif ( parsed_u.scheme ~= o.base_url:getProto() ) then
-            return false
-          -- if urls don't match only on the "www" prefix, then they are probably the same
-          elseif ( parsed_u.host == nil or removewww(parsed_u.host:lower()) ~= removewww(o.base_url:getHost():lower()) ) then
-            return false
-          end
-          return true
-        end
-      else
-        o.withindomain = function(u)
-          local parsed_u = url.parse(tostring(u))
-          if ( o.base_url:getPort() ~= 80 and o.base_url:getPort() ~= 443 ) then
-            if ( tonumber(parsed_u.port) ~= tonumber(o.base_url:getPort()) ) then
-              return false
-            end
-          elseif ( parsed_u.scheme ~= o.base_url:getProto() ) then
-            return false
-          elseif ( parsed_u.host == nil or parsed_u.host:sub(-#o.base_url:getDomain()):lower() ~= o.base_url:getDomain():lower() ) then
-            return false
-          end
-          return true
-        end
-      end
-    end
+    -- set up the appropriate matching functions
+    if ( o.withinhost ) then
+      o.withinhost = function(u)
+        local parsed_u = url.parse(tostring(u))
+
+        if ( o.base_url:getPort() ~= 80 and o.base_url:getPort() ~= 443 ) then
+          if ( tonumber(parsed_u.port) ~= tonumber(o.base_url:getPort()) ) then
+            return false
+          end
+        elseif ( parsed_u.scheme ~= o.base_url:getProto() ) then
+          return false
+        -- if urls don't match only on the "www" prefix, then they are probably the same
+        elseif ( parsed_u.host == nil or removewww(parsed_u.host:lower()) ~= removewww(o.base_url:getHost():lower()) ) then
+          return false
+        end
+        return true
+      end
+    end
+    if ( o.withindomain ) then
+      o.withindomain = function(u)
+        local parsed_u = url.parse(tostring(u))
+        if ( o.base_url:getPort() ~= 80 and o.base_url:getPort() ~= 443 ) then
+          if ( tonumber(parsed_u.port) ~= tonumber(o.base_url:getPort()) ) then
+            return false
+          end
+        elseif ( parsed_u.scheme ~= o.base_url:getProto() ) then
+          return false
+        elseif ( parsed_u.host == nil or parsed_u.host:sub(-#o.base_url:getDomain()):lower() ~= o.base_url:getDomain():lower() ) then
+          return false
+        end
+        return true
+      end
+    end
+
+    if (not o.doscraping) then
+
+      o.doscraping = function(u)
+        return true
+      end
+    end
+
     setmetatable(o, self)
     self.__index = self
     return o
@@ -150,9 +197,10 @@ LinkExtractor = {
     setmetatable(o, self)
     self.__index = self
     o:parse()
+    return o
   end,
-
+
   -- is the link absolute or not?
   isAbsolute = function(url)
     -- at this point we don't care about the protocol
@@ -171,6 +219,11 @@ LinkExtractor = {
   -- @return link string containing the absolute link
   createAbsolute = function(base_url, rel_url, base_href)
 
+    -- is protocol-relative?
+    if rel_url:match("^//") then
+      return ("%s%s%s"):format(base_url:getProto(), ":", rel_url)
+    end
+
     -- is relative with leading slash? ie /dir1/foo.html
     local leading_slash = rel_url:match("^/")
     rel_url = rel_url:match("^/?(.*)") or '/'
@@ -490,6 +543,68 @@ UrlQueue = {
 
 -- The Crawler class
 Crawler = {
+
+  options = {},
+
+  removewww = function(url) return string.gsub(url, "^www%.", "") end,
+
+  -- A utility for defining closures. Checks if the resource exists within the host.
+  -- @param u URL that points to the resource we want to check.
+  iswithinhost = function(self, u)
+    local parsed_u = url.parse(tostring(u))
+    if ( self.options.base_url:getPort() ~= 80 and self.options.base_url:getPort() ~= 443 ) then
+      if ( tonumber(parsed_u.port) ~= tonumber(self.options.base_url:getPort()) ) then
+        return false
+      end
+    elseif ( parsed_u.scheme ~= self.options.base_url:getProto() ) then
+      return false
+    -- if urls don't match only on the "www" prefix, then they are probably the same
+    elseif ( parsed_u.host == nil or self.removewww(parsed_u.host:lower()) ~= self.removewww(self.options.base_url:getHost():lower()) ) then
+      return false
+    end
+    return true
+  end,
+
+  -- A utility for defining closures. Checks if the resource exists within the domain.
+  -- @param u URL that points to the resource we want to check.
+  iswithindomain = function(self, u)
+    local parsed_u = url.parse(tostring(u))
+    if ( self.options.base_url:getPort() ~= 80 and self.options.base_url:getPort() ~= 443 ) then
+      if ( tonumber(parsed_u.port) ~= tonumber(self.options.base_url:getPort()) ) then
+        return false
+      end
+    elseif ( parsed_u.scheme ~= self.options.base_url:getProto() ) then
+      return false
+    elseif ( parsed_u.host == nil or parsed_u.host:sub(-#self.options.base_url:getDomain()):lower() ~= self.options.base_url:getDomain():lower() ) then
+      return false
+    end
+    return true
+  end,
+
+  -- A utility for defining closures. Checks the type of the resource.
+  -- @param u URL that points to the resource we want to check.
+  -- @param ext the extension of the resource.
+  -- @param signs table of signs (characters) that may follow the extension of the resource (default: [#, ?]).
+  isresource = function(self, u, ext, signs)
+    u = tostring(u)
+
+    if string.match(u, "%." .. ext .. "$") then
+      return true
+    end
+
+    -- build the set of characters that may follow the extension (default: #, ?)
+    local signstring = "#%?"
+    if signs then
+      signstring = ""
+      for _, s in ipairs(signs) do
+        signstring = signstring .. s
+      end
+      signstring = signstring:gsub("%?", "%%?")
+    end
+
+    return string.match(u, "%." .. ext .. "[" .. signstring .. "][^.]*$")
+
+  end,
 
   -- creates a new instance of the Crawler instance
   -- @param host table as received by the action method
@@ -503,6 +618,7 @@ Crawler = {
   --        maxpagecount - the maximum amount of pages to retrieve
   --        withinhost - stay within the host of the base_url
   --        withindomain - stay within the base_url domain
+  --        doscraping - closure that decides whether a resource should be scraped for links
   --        scriptname - should be set to SCRIPT_NAME to enable
   --                     script specific arguments.
   --        redirect_ok - redirect_ok closure to pass to http.get function
@@ -520,6 +636,8 @@ Crawler = {
     setmetatable(o, self)
     self.__index = self
 
+    self.options = o
+
     o:loadScriptArguments()
     o:loadLibraryArguments()
     o:loadDefaultArguments()
@@ -674,8 +792,16 @@ Crawler = {
       stdnse.print_debug(2, "%s: Fetching url [%d of %d]: %s", LIBRARY_NAME, count, self.options.maxpagecount, tostring(url))
     else
       stdnse.print_debug(2, "%s: Fetching url: %s", LIBRARY_NAME, tostring(url))
-    end
-
+    end
+
+    local scrape = true
+
+
+    if not (self.options.doscraping(url)) then
+      stdnse.print_debug(2, "%s: Scraping is not allowed for url: %s", LIBRARY_NAME, tostring(url))
+      scrape = false
+    end
+
     local response
     -- in case we want to use HEAD rather than GET for files with certain extensions
     if ( self.options.useheadfornonwebfiles ) then
@@ -718,7 +844,7 @@ Crawler = {
       end
     end
     -- if we have a response, proceed scraping it
-    if ( response.body ) then
+    if ( response.body ) and scrape then
       local links = LinkExtractor:new(url, response.body, self.options):getLinks()
       self.urlqueue:add(links)
     end
@@ -761,9 +887,13 @@ Crawler = {
     if ( nil == self.options.noblacklist ) then
       self.options.noblacklist = stdnse.get_script_args(sn .. ".noblacklist")
     end
-    if ( nil == self.options.useheadfornonwebfiles ) then
+    if ( nil == self.options.useheadfornonwebfiles ) then
       self.options.useheadfornonwebfiles = stdnse.get_script_args(sn .. ".useheadfornonwebfiles")
     end
+    if ( nil == self.options.doscraping ) then
+      self.options.doscraping = stdnse.get_script_args(sn .. ".doscraping")
+    end
+
   end,
 
   -- Loads the argument on a library level
@@ -788,9 +918,12 @@ Crawler = {
     if ( nil == self.options.noblacklist ) then
      self.options.noblacklist = stdnse.get_script_args(ln .. ".noblacklist")
     end
-    if ( nil == self.options.useheadfornonwebfiles ) then
+    if ( nil == self.options.useheadfornonwebfiles ) then
      self.options.useheadfornonwebfiles = stdnse.get_script_args(ln .. ".useheadfornonwebfiles")
     end
+    if ( nil == self.options.doscraping ) then
+      self.options.doscraping = stdnse.get_script_args(ln .. ".doscraping")
+    end
   end,
 
   -- Loads any defaults for arguments that were not set
@@ -816,9 +949,15 @@ Crawler = {
       return b
     end
 
+    if self.options.withinhost == 0 then
+      self.options.withinhost = false
+    end
+
+    if self.options.withindomain == 0 then
+      self.options.withindomain = false
+    end
+
     -- fixup some booleans to make sure they're actually booleans
-    self.options.withinhost = tobool(self.options.withinhost)
-    self.options.withindomain = tobool(self.options.withindomain)
     self.options.noblacklist = tobool(self.options.noblacklist)
     self.options.useheadfornonwebfiles = tobool(self.options.useheadfornonwebfiles)
@@ -832,6 +971,9 @@ Crawler = {
     if ( self.options.withindomain == nil ) then
       self.options.withindomain = false
     end
+    if ( not ( type(self.options.doscraping) == "function" ) ) then
+      self.options.doscraping = false
+    end
    self.options.maxdepth = self.options.maxdepth or 3
    self.options.maxpagecount = self.options.maxpagecount or 20
    self.url = self.url or '/'
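
Usage sketch (not part of the patch): the snippet below shows how a script might combine the new doscraping option with the iswithinhost/isresource utilities introduced above. It assumes the library's existing script-facing API (Crawler:new taking the action's host/port tables plus a scriptname option, and the crawl() loop from the library's own header example); the file extensions and the policy itself are purely illustrative.

  local httpspider = require "httpspider"

  -- host and port are the tables handed to the script's action function
  local crawler = httpspider.Crawler:new(host, port, '/',
    { scriptname = SCRIPT_NAME })

  -- stay on the scanned host, but never follow style or script files
  crawler.options.withinhost = function(url)
    return crawler:iswithinhost(url)
      and not crawler:isresource(url, "css")
      and not crawler:isresource(url, "js")
  end

  -- fetch images, but do not extract further links from them
  crawler.options.doscraping = function(url)
    return not (crawler:isresource(url, "png")
      or crawler:isresource(url, "jpg"))
  end

  while true do
    local status, r = crawler:crawl()
    if not status then
      break
    end
    -- r.url and r.response are available here, as in the library's header example
  end

As with the withinhost example in the header documentation, returning a falsey value from doscraping keeps the page in the crawl (it is still fetched and returned to the script) but skips link extraction for it, which is exactly what the new scrape flag in the fetch loop implements.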