major bug fix (different HTTP content charsets are now properly handled)

2026-01-20 13:29:02 +00:00 · 2010-06-09 14:40:36 +00:00
parent 654d707d5d
commit eaef068c90
2 changed files with 18 additions and 16 deletions
--- a/lib/request/basic.py
+++ b/lib/request/basic.py
@@ -89,13 +89,13 @@ def parseResponse(page, headers):
                    kb.absFilePaths.add(absFilePath)
                    

-def decodePage(page, encoding):
+def decodePage(page, contentEncoding, contentType):
    """
-    Decode gzip/deflate HTTP response
+    Decode compressed/charset HTTP response
    """

-    if isinstance(encoding, basestring) and encoding.lower() in ('gzip', 'x-gzip', 'deflate'):
-        if encoding == 'deflate':
+    if isinstance(contentEncoding, basestring) and contentEncoding.lower() in ('gzip', 'x-gzip', 'deflate'):
+        if contentEncoding == 'deflate':
            # http://stackoverflow.com/questions/1089662/python-inflate-and-deflate-implementations
            data = StringIO.StringIO(zlib.decompress(page, -15))
        else:
@@ -103,4 +103,8 @@ def decodePage(page, encoding):

        page = data.read()

+    #http://stackoverflow.com/questions/1020892/python-urllib2-read-to-unicode
+    if contentType and (contentType.find('charset=') != -1):
+        page = unicode(page, contentType.split('charset=')[-1])
+
    return page