adding BeautifulSoup (BSD) into extras; adding --crawl to options

Miroslav Stampar
2011-06-20 11:32:30 +00:00
parent 8c04aa871a
commit 07e2c72943
9 changed files with 2168 additions and 3 deletions

View File

@@ -589,7 +589,7 @@ def start():
if kb.dataOutputFlag and not conf.multipleTargets:
logger.info("Fetched data logged to text files under '%s'" % conf.outputPath)
-if conf.multipleTargets:
+if conf.multipleTargets and conf.resultsFilename:
infoMsg = "you can find results of scanning in multiple targets "
infoMsg += "mode inside the CSV file '%s'" % conf.resultsFilename
logger.info(infoMsg)

View File

@@ -114,6 +114,7 @@ from lib.request.certhandler import HTTPSCertAuthHandler
from lib.request.rangehandler import HTTPRangeHandler
from lib.request.redirecthandler import SmartRedirectHandler
from lib.request.templates import getPageTemplate
+from lib.utils.crawler import Crawler
from lib.utils.deps import checkDependencies
from lib.utils.google import Google
@@ -388,6 +389,13 @@ def __setRequestFromFile():
__feedTargetsDict(conf.requestFile, addedTargetUrls)
+def __setCrawler():
+    if not conf.crawl:
+        return
+
+    crawler = Crawler()
+    crawler.getTargetUrls()
def __setGoogleDorking():
"""
This function checks if the way to request testable hosts is through
@@ -1278,7 +1286,7 @@ def __cleanupOptions():
if conf.tmpPath:
conf.tmpPath = ntToPosixSlashes(normalizePath(conf.tmpPath))
-if conf.googleDork or conf.logFile or conf.bulkFile or conf.forms:
+if conf.googleDork or conf.logFile or conf.bulkFile or conf.forms or conf.crawl:
conf.multipleTargets = True
if conf.optimize:
@@ -1800,6 +1808,7 @@ def init(inputOptions=advancedDict(), overrideOptions=False):
__setDNSCache()
__setSafeUrl()
__setGoogleDorking()
+__setCrawler()
__setBulkMultipleTargets()
__urllib2Opener()
__findPageForms()

View File

@@ -167,6 +167,8 @@ optDict = {
"beep": "boolean",
"checkPayload": "boolean",
"cleanup": "boolean",
"crawl": "boolean",
"forms": "boolean",
"googlePage": "integer",
"mobile": "boolean",
"pageRank": "boolean",

View File

@@ -511,6 +511,10 @@ def cmdLineParser():
help="Clean up the DBMS by sqlmap specific "
"UDF and tables")
+miscellaneous.add_option("--crawl", dest="crawl",
+action="store_true",
+help="Crawl the website starting from the target url")
miscellaneous.add_option("--forms", dest="forms",
action="store_true",
help="Parse and test forms on target url")
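
At this point --crawl is a plain boolean switch: there is no crawl-depth or URL-filter companion option yet, and Crawler.getTargetUrls() is invoked with its default depth of 1. An illustrative invocation (hypothetical host) would be:

    python sqlmap.py -u "http://www.example.com/" --crawl

Every same-host link with a query string that the crawler finds is queued as an additional target, which is why __cleanupOptions() above now also enables conf.multipleTargets when --crawl is used.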

lib/utils/crawler.py (new file, 95 lines)
View File

@@ -0,0 +1,95 @@
#!/usr/bin/env python
"""
$Id$
Copyright (c) 2006-2011 sqlmap developers (http://sqlmap.sourceforge.net/)
See the file 'doc/COPYING' for copying permission
"""
import re
import threading
import urlparse
from lib.core.common import dataToStdout
from lib.core.data import conf
from lib.core.data import kb
from lib.core.data import logger
from lib.core.exception import sqlmapConnectionException
from lib.core.threads import getCurrentThreadData
from lib.core.threads import runThreads
from lib.request.connect import Connect as Request
from extra.beautifulsoup.beautifulsoup import BeautifulSoup
from extra.oset.pyoset import oset
class Crawler:
    """
    This class defines methods used to perform crawling (command
    line option '--crawl')
    """

    def getTargetUrls(self, depth=1):
        try:
            threadData = getCurrentThreadData()
            threadData.shared.outputs = oset()

            # locks guarding the shared work list and the collected outputs
            lockNames = ('limits', 'outputs')
            for lock in lockNames:
                kb.locks[lock] = threading.Lock()

            def crawlThread():
                threadData = getCurrentThreadData()

                while kb.threadContinue:
                    kb.locks.limits.acquire()
                    if threadData.shared.unprocessed:
                        current = threadData.shared.unprocessed.pop()
                        kb.locks.limits.release()
                    else:
                        kb.locks.limits.release()
                        break

                    content = Request.getPage(url=current)[0]

                    if not kb.threadContinue:
                        break

                    soup = BeautifulSoup(content)

                    for tag in soup('a'):
                        if tag.get("href"):
                            url = urlparse.urljoin(conf.url, tag.get("href"))

                            # flag to know if we are dealing with the same target host
                            target = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], [url, conf.url]))

                            if target:
                                kb.locks.outputs.acquire()
                                threadData.shared.deeper.add(url)
                                if re.search(r"(.*?)\?(.+)", url):
                                    threadData.shared.outputs.add(url)
                                kb.locks.outputs.release()

            threadData.shared.deeper = set()
            threadData.shared.unprocessed = set([conf.url])

            logger.info("starting crawling")

            # breadth-first: links found at the current depth become the
            # work list for the next pass
            for i in xrange(depth):
                numThreads = min(conf.threads, len(threadData.shared.unprocessed))
                logger.debug("processing depth: %d" % i)
                runThreads(numThreads, crawlThread)
                threadData.shared.unprocessed = threadData.shared.deeper

        except KeyboardInterrupt:
            warnMsg = "user aborted during crawling. sqlmap "
            warnMsg += "will use partial list"
            logger.warn(warnMsg)

        except sqlmapConnectionException, e:
            errMsg = "connection exception detected. sqlmap "
            errMsg += "will use partial list"
            errMsg += " ('%s')" % e
            logger.critical(errMsg)

        finally:
            for url in threadData.shared.outputs:
                kb.targetUrls.add(( url, None, None, None ))

            kb.suppressResumeInfo = False
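
Stripped of sqlmap's threading and lock plumbing, getTargetUrls() performs a breadth-first pass over same-host links and keeps only URLs carrying a query string. The following single-threaded sketch is illustrative only: it swaps sqlmap's Request.getPage() wrapper for plain urllib2 and assumes the BeautifulSoup 3 module bundled by this commit is importable (outside the sqlmap tree the equivalent import would be "from BeautifulSoup import BeautifulSoup").

    import re
    import urllib2
    import urlparse

    # bundled copy added by this commit; standalone installs use the
    # top-level BeautifulSoup (3.x) package instead
    from extra.beautifulsoup.beautifulsoup import BeautifulSoup

    def crawl(start_url, depth=1):
        unprocessed = set([start_url])
        outputs = set()

        for _ in xrange(depth):
            deeper = set()

            for current in unprocessed:
                try:
                    content = urllib2.urlopen(current).read()
                except urllib2.URLError:
                    continue

                soup = BeautifulSoup(content)

                for tag in soup('a'):
                    href = tag.get("href")
                    if not href:
                        continue

                    url = urlparse.urljoin(start_url, href)

                    # keep only links pointing back at the starting host
                    same_host = urlparse.urlparse(url).netloc.split(':')[0] == \
                                urlparse.urlparse(start_url).netloc.split(':')[0]

                    if same_host:
                        deeper.add(url)

                        # only URLs with a GET query string become scan targets
                        if re.search(r"(.*?)\?(.+)", url):
                            outputs.add(url)

            # links found at this depth feed the next pass
            unprocessed = deeper

        return outputs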

View File

@@ -60,7 +60,7 @@ class Google:
"""
for match in self.__matches:
-if re.search(r"(.*?)\?(.+)", match, re.I):
+if re.search(r"(.*?)\?(.+)", match):
kb.targetUrls.add(( htmlunescape(htmlunescape(match)), None, None, None ))
elif re.search(URI_INJECTABLE_REGEX, match, re.I):
kb.targetUrls.add(( htmlunescape(htmlunescape("%s" % match)), None, None, None ))
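
The pattern r"(.*?)\?(.+)" is the same one the new crawler uses: it only checks that a URL carries a non-empty query string, and since nothing in it is case-dependent the dropped re.I flag never had any effect. For example (interpreter session, hypothetical URLs):

    >>> import re
    >>> bool(re.search(r"(.*?)\?(.+)", "http://www.example.com/index.php?id=1"))
    True
    >>> bool(re.search(r"(.*?)\?(.+)", "http://www.example.com/index.php"))
    False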