Minor refactoring

This commit is contained in:
Miroslav Stampar
2018-06-07 01:28:02 +02:00
parent d47c16e196
commit 9b397f00be
3 changed files with 4 additions and 5 deletions

View File

@@ -1866,8 +1866,7 @@ def getFilteredPageContent(page, onlyText=True, split=" "):
# only if the page's charset has been successfully identified
if isinstance(page, unicode):
retVal = re.sub(r"(?si)<script.+?</script>|<!--.+?-->|<style.+?</style>%s" % (r"|<[^>]+>|\t|\n|\r" if onlyText else ""), split, page)
-while retVal.find(2 * split) != -1:
-retVal = retVal.replace(2 * split, split)
+retVal = re.sub(r"%s{2,}" % split, split, retVal)
retVal = htmlunescape(retVal.strip().strip(split))
return retVal