Implementation of crawling results normalization

This commit is contained in:
Miroslav Stampar
2019-10-31 22:07:16 +01:00
parent 273004396c
commit a660828cec
3 changed files with 22 additions and 1 deletion

View File

@@ -195,6 +195,26 @@ def crawl(target):
# Register every crawled URL as a scan target. URLs are decoded using the
# detected page encoding; the four trailing None values are placeholder
# fields of the target tuple (presumably method/POST data/cookie/etc. —
# TODO confirm tuple layout against kb.targets consumers).
for url in threadData.shared.value:
kb.targets.add((urldecode(url, kb.pageEncoding), None, None, None, None))
# Ask the user (only once per run — the answer is cached in
# kb.normalizeCrawlingChoice) whether near-duplicate crawl results
# should be collapsed. Default answer is "Y".
if kb.normalizeCrawlingChoice is None:
message = "do you want to normalize "
message += "crawling results [Y/n] "
kb.normalizeCrawlingChoice = readInput(message, default='Y', boolean=True)
if kb.normalizeCrawlingChoice:
# Deduplicate targets by URL "shape": same final path segment and same
# parameter names count as duplicates even if parameter values differ.
seen = set()
results = OrderedSet()
for target in kb.targets:
# Capture the last path segment together with its query string,
# e.g. "/page.php?id=1&cat=2" (anchored to end of URL).
match = re.search(r"/[^/?]*\?.*\Z", target[0])
if match:
# Blank out parameter values ("id=1&cat=2" -> "id=&cat=") so the
# key depends only on path + parameter names.
key = re.sub(r"=[^=&]*", "=", match.group(0))
# Keep only the first target seen for each normalized key,
# preserving discovery order via OrderedSet.
if key not in seen:
results.add(target)
seen.add(key)
# NOTE(review): as shown here, targets whose URL has no query string never
# match and are therefore absent from `results` — i.e. they are dropped when
# normalization is enabled. Confirm this is intended (the flattened diff may
# hide an else-branch or different nesting).
kb.targets = results
# Persist the (possibly normalized) target list.
storeResultsToFile(kb.targets)
def storeResultsToFile(results):