From cb917299133840befbb84623b4a49f1d62385bd5 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 27 Dec 2012 20:55:37 +0100 Subject: [PATCH] Fix for an Issue #324 (crawling when HTML is not well-formed) --- lib/utils/crawler.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/lib/utils/crawler.py b/lib/utils/crawler.py index b1718b6b9..f19b9fc88 100644 --- a/lib/utils/crawler.py +++ b/lib/utils/crawler.py @@ -64,10 +64,21 @@ class Crawler(object): if isinstance(content, unicode): try: + match = re.search(r"(?si)<html[^>]*>(.+)</html>", content) + if match: + content = "<html>%s</html>" % match.group(1) + soup = BeautifulSoup(content) - for tag in soup('a'): - if tag.get("href"): - url = urlparse.urljoin(conf.url, tag.get("href")) + tags = soup('a') + + if not tags: + tags = re.finditer(r'(?si)<a[^>]+href="(?P<href>[^>"]+)"', content) + + for tag in tags: + href = tag.get("href") if hasattr(tag, "get") else tag.group("href") + + if href: + url = urlparse.urljoin(conf.url, href) # flag to know if we are dealing with the same target host _ = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], (url, conf.url)))