From fccd69721e9f2fc037ede11a7344fc0ae1688f7c Mon Sep 17 00:00:00 2001
From: Miroslav Stampar <miroslav.stampar@gmail.com>
Date: Mon, 23 Jul 2012 18:38:46 +0200
Subject: [PATCH] Update for an Issue #111

---
 lib/request/basic.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/lib/request/basic.py b/lib/request/basic.py
index 506b60e47..12c64b92c 100644
--- a/lib/request/basic.py
+++ b/lib/request/basic.py
@@ -211,11 +211,17 @@ def decodePage(page, contentEncoding, contentType):
 
     # can't do for all responses because we need to support binary files too
     if contentType and not isinstance(page, unicode) and any(map(lambda x: x in contentType.lower(), ("text/txt", "text/raw", "text/html", "text/xml"))):
+        # e.g. &#195;&#235;&#224;&#226;&#224;
         if "&#" in page:
-            page = re.sub('&#(\d+);', lambda _: chr(int(_.group(1))), page)
+            page = re.sub('&#(\d+);', lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page)
+
         kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
         page = getUnicode(page, kb.pageEncoding)
 
+        # e.g. &#8217;&#8230;&#8482;
+        if "&#" in page:
+            page = re.sub('&#(\d+);', lambda _: unichr(int(_.group(1))), page)
+
     return page
 
 def processResponse(page, responseHeaders):