From ab9cb80602ca05d1e8a80bd2b6c5c8e55ac55963 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Mon, 23 Jul 2012 15:14:52 +0200 Subject: [PATCH] Implementing Issue #111 --- lib/core/convert.py | 1 - lib/request/basic.py | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/core/convert.py b/lib/core/convert.py index fa8b6ebd9..5c8c71e2d 100644 --- a/lib/core/convert.py +++ b/lib/core/convert.py @@ -144,5 +144,4 @@ def htmlunescape(value): if value and isinstance(value, basestring): codes = (('&lt;', '<'), ('&gt;', '>'), ('&quot;', '"'), ('&nbsp;', ' '), ('&amp;', '&')) retVal = reduce(lambda x, y: x.replace(y[0], y[1]), codes, retVal) - retVal = re.sub('&#(\d+);', lambda x: getUnicode(chr(x.group(1))), retVal) return retVal diff --git a/lib/request/basic.py b/lib/request/basic.py index 07b718ced..506b60e47 100644 --- a/lib/request/basic.py +++ b/lib/request/basic.py @@ -209,8 +209,10 @@ def decodePage(page, contentEncoding, contentType): else: kb.pageEncoding = conf.charset + # can't do for all responses because we need to support binary files too if contentType and not isinstance(page, unicode) and any(map(lambda x: x in contentType.lower(), ("text/txt", "text/raw", "text/html", "text/xml"))): - # can't do for all responses because we need to support binary files too + if "&#" in page: + page = re.sub('&#(\d+);', lambda _: chr(int(_.group(1))), page) kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page)) page = getUnicode(page, kb.pageEncoding)