Remove unused imports and apply some general code refactoring

This commit is contained in:
Miroslav Stampar
2012-02-22 10:40:11 +00:00
parent 386e98a0e3
commit b3bd4144f5
104 changed files with 255 additions and 499 deletions

View File

@@ -9,7 +9,6 @@ See the file 'doc/COPYING' for copying permission
import httplib
import re
import threading
import urlparse
import time
@@ -73,16 +72,16 @@ class Crawler:
for tag in soup('a'):
if tag.get("href"):
url = urlparse.urljoin(conf.url, tag.get("href"))
# flag to know if we are dealing with the same target host
target = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], [url, conf.url]))
if conf.scope:
if not re.search(conf.scope, url, re.I):
continue
elif not target:
continue
if url.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS:
kb.locks.outputs.acquire()
threadData.shared.deeper.add(url)