Fixed bug reported privately via email

This commit is contained in:
Miroslav Stampar
2017-01-13 14:41:41 +01:00
parent 9a86365d92
commit 750d57ec96
3 changed files with 9 additions and 3 deletions

View File

@@ -314,6 +314,12 @@ def decodePage(page, contentEncoding, contentType):
page = re.sub(r"&([^;]+);", lambda _: chr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 256) < 256 else _.group(0), page)
kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
if kb.pageEncoding and kb.pageEncoding.lower() == "utf-8-sig":
kb.pageEncoding = "utf-8"
if page and page.startswith("\xef\xbb\xbf"): # Reference: https://docs.python.org/2/library/codecs.html (Note: noticed problems when "utf-8-sig" is left to Python for handling)
page = page[3:]
page = getUnicode(page, kb.pageEncoding)
# e.g. &#8217;&#8230;&#8482;