Bug fix (disable HTML decoding in XSS checks)

2026-01-20 05:19:02 +00:00 · 2019-11-18 11:40:07 +01:00
parent f550a2281f
commit 0977f6df61
4 changed files with 35 additions and 29 deletions
--- a/lib/request/basic.py
+++ b/lib/request/basic.py
@@ -334,41 +334,44 @@ def decodePage(page, contentEncoding, contentType, percentDecode=True):

    # can't do for all responses because we need to support binary files too
    if isinstance(page, six.binary_type) and "text/" in contentType:
-        # e.g. &#x9;&#195;&#235;&#224;&#226;&#224;
-        if b"&#" in page:
-            page = re.sub(b"&#x([0-9a-f]{1,2});", lambda _: decodeHex(_.group(1) if len(_.group(1)) == 2 else "0%s" % _.group(1)), page)
-            page = re.sub(b"&#(\\d{1,3});", lambda _: six.int2byte(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page)
+        if not kb.disableHtmlDecoding:
+            # e.g. &#x9;&#195;&#235;&#224;&#226;&#224;
+            if b"&#" in page:
+                page = re.sub(b"&#x([0-9a-f]{1,2});", lambda _: decodeHex(_.group(1) if len(_.group(1)) == 2 else "0%s" % _.group(1)), page)
+                page = re.sub(b"&#(\\d{1,3});", lambda _: six.int2byte(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page)

-        # e.g. %20%28%29
-        if percentDecode:
-            if b"%" in page:
-                page = re.sub(b"%([0-9a-fA-F]{2})", lambda _: decodeHex(_.group(1)), page)
+            # e.g. %20%28%29
+            if percentDecode:
+                if b"%" in page:
+                    page = re.sub(b"%([0-9a-fA-F]{2})", lambda _: decodeHex(_.group(1)), page)

-        # e.g. &amp;
-        page = re.sub(b"&([^;]+);", lambda _: six.int2byte(HTML_ENTITIES[getText(_.group(1))]) if HTML_ENTITIES.get(getText(_.group(1)), 256) < 256 else _.group(0), page)
+            # e.g. &amp;
+            page = re.sub(b"&([^;]+);", lambda _: six.int2byte(HTML_ENTITIES[getText(_.group(1))]) if HTML_ENTITIES.get(getText(_.group(1)), 256) < 256 else _.group(0), page)

-        kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
+            kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))

-        if (kb.pageEncoding or "").lower() == "utf-8-sig":
-            kb.pageEncoding = "utf-8"
-            if page and page.startswith("\xef\xbb\xbf"):  # Reference: https://docs.python.org/2/library/codecs.html (Note: noticed problems when "utf-8-sig" is left to Python for handling)
-                page = page[3:]
+            if (kb.pageEncoding or "").lower() == "utf-8-sig":
+                kb.pageEncoding = "utf-8"
+                if page and page.startswith("\xef\xbb\xbf"):  # Reference: https://docs.python.org/2/library/codecs.html (Note: noticed problems when "utf-8-sig" is left to Python for handling)
+                    page = page[3:]

-        page = getUnicode(page, kb.pageEncoding)
+            page = getUnicode(page, kb.pageEncoding)

-        # e.g. &#8217;&#8230;&#8482;
-        if "&#" in page:
-            def _(match):
-                retVal = match.group(0)
-                try:
-                    retVal = _unichr(int(match.group(1)))
-                except (ValueError, OverflowError):
-                    pass
-                return retVal
-            page = re.sub(r"&#(\d+);", _, page)
+            # e.g. &#8217;&#8230;&#8482;
+            if "&#" in page:
+                def _(match):
+                    retVal = match.group(0)
+                    try:
+                        retVal = _unichr(int(match.group(1)))
+                    except (ValueError, OverflowError):
+                        pass
+                    return retVal
+                page = re.sub(r"&#(\d+);", _, page)

-        # e.g. &zeta;
-        page = re.sub(r"&([^;]+);", lambda _: _unichr(HTML_ENTITIES[_.group(1)]) if HTML_ENTITIES.get(_.group(1), 0) > 255 else _.group(0), page)
+            # e.g. &zeta;
+            page = re.sub(r"&([^;]+);", lambda _: _unichr(HTML_ENTITIES[_.group(1)]) if HTML_ENTITIES.get(_.group(1), 0) > 255 else _.group(0), page)
+        else:
+            page = getUnicode(page, kb.pageEncoding)

    return page