Update for Issue #2384

This commit is contained in:
Miroslav Stampar
2017-02-06 13:28:33 +01:00
parent 15f86e85b1
commit 38f16decef
6 changed files with 40 additions and 11 deletions

View File

@@ -1755,7 +1755,7 @@ def safeStringFormat(format_, params):
break
return retVal
def getFilteredPageContent(page, onlyText=True):
def getFilteredPageContent(page, onlyText=True, split=" "):
"""
Returns filtered page content without script, style and/or comments
or all HTML tags
@@ -1768,10 +1768,10 @@ def getFilteredPageContent(page, onlyText=True):
# only if the page's charset has been successfully identified
if isinstance(page, unicode):
retVal = re.sub(r"(?si)<script.+?</script>|<!--.+?-->|<style.+?</style>%s" % (r"|<[^>]+>|\t|\n|\r" if onlyText else ""), " ", page)
while retVal.find(" ") != -1:
retVal = retVal.replace(" ", " ")
retVal = htmlunescape(retVal.strip())
retVal = re.sub(r"(?si)<script.+?</script>|<!--.+?-->|<style.+?</style>%s" % (r"|<[^>]+>|\t|\n|\r" if onlyText else ""), split, page)
while retVal.find(2 * split) != -1:
retVal = retVal.replace(2 * split, split)
retVal = htmlunescape(retVal.strip().strip(split))
return retVal