From cb917299133840befbb84623b4a49f1d62385bd5 Mon Sep 17 00:00:00 2001
From: Miroslav Stampar <miroslav.stampar@gmail.com>
Date: Thu, 27 Dec 2012 20:55:37 +0100
Subject: [PATCH] Fix for Issue #324 (crawling when HTML is not well-formed)

---
 lib/utils/crawler.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)
diff --git a/lib/utils/crawler.py b/lib/utils/crawler.py
index b1718b6b9..f19b9fc88 100644
--- a/lib/utils/crawler.py
+++ b/lib/utils/crawler.py
@@ -64,10 +64,21 @@ class Crawler(object):
 
                     if isinstance(content, unicode):
                         try:
+                            match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
+                            if match:
+                                content = "<html>%s</html>" % match.group(1)
+
                             soup = BeautifulSoup(content)
-                            for tag in soup('a'):
-                                if tag.get("href"):
-                                    url = urlparse.urljoin(conf.url, tag.get("href"))
+                            tags = soup('a')
+
+                            if not tags:
+                                tags = re.finditer(r'(?si)<a[^>]+href="(?P<href>[^>"]+)"', content)
+
+                            for tag in tags:
+                                href = tag.get("href") if hasattr(tag, "get") else tag.group("href")
+
+                                if href:
+                                    url = urlparse.urljoin(conf.url, href)
 
                                     # flag to know if we are dealing with the same target host
                                     _ = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], (url, conf.url)))