Fixed handling of the UTF-8 BOM in pages detected as "utf-8-sig" (bug reported privately via email)

2026-01-20 13:29:02 +00:00 · 2017-01-13 14:41:41 +01:00
parent 9a86365d92
commit 750d57ec96
3 changed files with 9 additions and 3 deletions
--- a/lib/request/basic.py
+++ b/lib/request/basic.py
@@ -314,6 +314,12 @@ def decodePage(page, contentEncoding, contentType):
            page = re.sub(r"&([^;]+);", lambda _: chr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 256) < 256 else _.group(0), page)

            kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
+
+            if kb.pageEncoding and kb.pageEncoding.lower() == "utf-8-sig":
+                kb.pageEncoding = "utf-8"
+                if page and page.startswith("\xef\xbb\xbf"):  # Reference: https://docs.python.org/2/library/codecs.html (Note: noticed problems when "utf-8-sig" is left to Python for handling)
+                    page = page[3:]
+
            page = getUnicode(page, kb.pageEncoding)

            # e.g. &#8217;&#8230;&#8482;