From fccd69721e9f2fc037ede11a7344fc0ae1688f7c Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Mon, 23 Jul 2012 18:38:46 +0200 Subject: [PATCH] Update for an Issue #111 --- lib/request/basic.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/request/basic.py b/lib/request/basic.py index 506b60e47..12c64b92c 100644 --- a/lib/request/basic.py +++ b/lib/request/basic.py @@ -211,11 +211,17 @@ def decodePage(page, contentEncoding, contentType): # can't do for all responses because we need to support binary files too if contentType and not isinstance(page, unicode) and any(map(lambda x: x in contentType.lower(), ("text/txt", "text/raw", "text/html", "text/xml"))): + # e.g. &#195;&#235;&#224;&#226;&#224; if "&#" in page: - page = re.sub('&#(\d+);', lambda _: chr(int(_.group(1))), page) + page = re.sub('&#(\d+);', lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page) + kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page)) page = getUnicode(page, kb.pageEncoding) + # e.g. &#8217;&#8230;&#8482; + if "&#" in page: + page = re.sub('&#(\d+);', lambda _: unichr(int(_.group(1))), page) + return page def processResponse(page, responseHeaders):