Patch for Issue #169

2026-01-23 06:39:02 +00:00 · 2013-01-09 15:22:21 +01:00
parent 55a552ddc4
commit 3d4f381ab5
2 changed files with 110 additions and 113 deletions
--- a/lib/utils/crawler.py
+++ b/lib/utils/crawler.py
@@ -25,116 +25,110 @@ from lib.request.connect import Connect as Request
 from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup
 from thirdparty.oset.pyoset import oset

-class Crawler(object):
-    """
-    This class defines methods used to perform crawling (command
-    line option '--crawl'
-    """
+def crawl(target):
+    try:
+        threadData = getCurrentThreadData()
+        threadData.shared.value = oset()

-    def getTargetUrls(self):
-        try:
+        def crawlThread():
            threadData = getCurrentThreadData()
-            threadData.shared.value = oset()

-            def crawlThread():
-                threadData = getCurrentThreadData()
-
-                while kb.threadContinue:
-                    with kb.locks.limit:
-                        if threadData.shared.unprocessed:
-                            current = threadData.shared.unprocessed.pop()
-                        else:
-                            break
-
-                    content = None
-                    try:
-                        if current:
-                            content = Request.getPage(url=current, crawling=True, raise404=False)[0]
-                    except SqlmapConnectionException, e:
-                        errMsg = "connection exception detected (%s). skipping " % e
-                        errMsg += "url '%s'" % current
-                        logger.critical(errMsg)
-                    except httplib.InvalidURL, e:
-                        errMsg = "invalid url detected (%s). skipping " % e
-                        errMsg += "url '%s'" % current
-                        logger.critical(errMsg)
-
-                    if not kb.threadContinue:
+            while kb.threadContinue:
+                with kb.locks.limit:
+                    if threadData.shared.unprocessed:
+                        current = threadData.shared.unprocessed.pop()
+                    else:
                        break

-                    if isinstance(content, unicode):
-                        try:
-                            match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
-                            if match:
-                                content = "<html>%s</html>" % match.group(1)
+                content = None
+                try:
+                    if current:
+                        content = Request.getPage(url=current, crawling=True, raise404=False)[0]
+                except SqlmapConnectionException, e:
+                    errMsg = "connection exception detected (%s). skipping " % e
+                    errMsg += "url '%s'" % current
+                    logger.critical(errMsg)
+                except httplib.InvalidURL, e:
+                    errMsg = "invalid url detected (%s). skipping " % e
+                    errMsg += "url '%s'" % current
+                    logger.critical(errMsg)

-                            soup = BeautifulSoup(content)
-                            tags = soup('a')
-
-                            if not tags:
-                                tags = re.finditer(r'(?si)<a[^>]+href="(?P<href>[^>"]+)"', content)
-
-                            for tag in tags:
-                                href = tag.get("href") if hasattr(tag, "get") else tag.group("href")
-
-                                if href:
-                                    url = urlparse.urljoin(conf.url, href)
-
-                                    # flag to know if we are dealing with the same target host
-                                    _ = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], (url, conf.url)))
-
-                                    if conf.scope:
-                                        if not re.search(conf.scope, url, re.I):
-                                            continue
-                                    elif not _:
-                                        continue
-
-                                    if url.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS:
-                                        with kb.locks.value:
-                                            threadData.shared.deeper.add(url)
-                                            if re.search(r"(.*?)\?(.+)", url):
-                                                threadData.shared.value.add(url)
-                        except UnicodeEncodeError: # for non-HTML files
-                            pass
-                        finally:
-                            if conf.forms:
-                                findPageForms(content, current, False, True)
-
-                    if conf.verbose in (1, 2):
-                        threadData.shared.count += 1
-                        status = '%d/%d links visited (%d%s)' % (threadData.shared.count, threadData.shared.length, round(100.0*threadData.shared.count/threadData.shared.length), '%')
-                        dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)
-
-            threadData.shared.deeper = set()
-            threadData.shared.unprocessed = set([conf.url])
-
-            logger.info("starting crawler")
-
-            for i in xrange(conf.crawlDepth):
-                if i > 0 and conf.threads == 1:
-                    singleTimeWarnMessage("running in a single-thread mode. This could take a while.")
-                threadData.shared.count = 0
-                threadData.shared.length = len(threadData.shared.unprocessed)
-                numThreads = min(conf.threads, len(threadData.shared.unprocessed))
-                logger.info("searching for links with depth %d" % (i + 1))
-                runThreads(numThreads, crawlThread)
-                clearConsoleLine(True)
-                if threadData.shared.deeper:
-                    threadData.shared.unprocessed = set(threadData.shared.deeper)
-                else:
+                if not kb.threadContinue:
                    break

-        except KeyboardInterrupt:
-            warnMsg = "user aborted during crawling. sqlmap "
-            warnMsg += "will use partial list"
-            logger.warn(warnMsg)
+                if isinstance(content, unicode):
+                    try:
+                        match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
+                        if match:
+                            content = "<html>%s</html>" % match.group(1)

-        finally:
+                        soup = BeautifulSoup(content)
+                        tags = soup('a')
+
+                        if not tags:
+                            tags = re.finditer(r'(?si)<a[^>]+href="(?P<href>[^>"]+)"', content)
+
+                        for tag in tags:
+                            href = tag.get("href") if hasattr(tag, "get") else tag.group("href")
+
+                            if href:
+                                url = urlparse.urljoin(target, href)
+
+                                # flag to know if we are dealing with the same target host
+                                _ = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], (url, target)))
+
+                                if conf.scope:
+                                    if not re.search(conf.scope, url, re.I):
+                                        continue
+                                elif not _:
+                                    continue
+
+                                if url.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS:
+                                    with kb.locks.value:
+                                        threadData.shared.deeper.add(url)
+                                        if re.search(r"(.*?)\?(.+)", url):
+                                            threadData.shared.value.add(url)
+                    except UnicodeEncodeError: # for non-HTML files
+                        pass
+                    finally:
+                        if conf.forms:
+                            findPageForms(content, current, False, True)
+
+                if conf.verbose in (1, 2):
+                    threadData.shared.count += 1
+                    status = '%d/%d links visited (%d%s)' % (threadData.shared.count, threadData.shared.length, round(100.0*threadData.shared.count/threadData.shared.length), '%')
+                    dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)
+
+        threadData.shared.deeper = set()
+        threadData.shared.unprocessed = set([target])
+
+        logger.info("starting crawler")
+
+        for i in xrange(conf.crawlDepth):
+            if i > 0 and conf.threads == 1:
+                singleTimeWarnMessage("running in a single-thread mode. This could take a while.")
+            threadData.shared.count = 0
+            threadData.shared.length = len(threadData.shared.unprocessed)
+            numThreads = min(conf.threads, len(threadData.shared.unprocessed))
+            logger.info("searching for links with depth %d" % (i + 1))
+            runThreads(numThreads, crawlThread)
            clearConsoleLine(True)
-
-            if not threadData.shared.value:
-                warnMsg = "no usable links found (with GET parameters)"
-                logger.warn(warnMsg)
+            if threadData.shared.deeper:
+                threadData.shared.unprocessed = set(threadData.shared.deeper)
            else:
-                for url in threadData.shared.value:
-                    kb.targets.add(( url, None, None, None ))
+                break
+
+    except KeyboardInterrupt:
+        warnMsg = "user aborted during crawling. sqlmap "
+        warnMsg += "will use partial list"
+        logger.warn(warnMsg)
+
+    finally:
+        clearConsoleLine(True)
+
+        if not threadData.shared.value:
+            warnMsg = "no usable links found (with GET parameters)"
+            logger.warn(warnMsg)
+        else:
+            for url in threadData.shared.value:
+                kb.targets.add((url, None, None, None))