Some more DREI stuff

2025-12-08 05:31:32 +00:00 · 2019-04-19 11:24:34 +02:00
parent da15701a55
commit bb7bd51d94
15 changed files with 94 additions and 71 deletions
--- a/thirdparty/beautifulsoup/beautifulsoup.py
+++ b/thirdparty/beautifulsoup/beautifulsoup.py
@@ -91,6 +91,11 @@ import sys

 if sys.version_info >= (3, 0):
    xrange = range
+    text_type = str
+    binary_type = bytes
+else:
+    text_type = unicode
+    binary_type = str

 try:
  from htmlentitydefs import name2codepoint
@@ -434,19 +439,13 @@ class PageElement(object):
    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.
        ."""
-        if isinstance(s, unicode):
+        if isinstance(s, text_type):
            if encoding:
                s = s.encode(encoding)
-        elif isinstance(s, str):
-            if encoding:
-                s = s.encode(encoding)
-            else:
-                s = unicode(s)
+        elif isinstance(s, binary_type):
+            s = s.decode("utf8").encode(encoding or "utf8")  # bytes has no .encode() on Python 3; decode first
        else:
-            if encoding:
-                s  = self.toEncoding(str(s), encoding)
-            else:
-                s = unicode(s)
+            s  = self.toEncoding(str(s), encoding or "utf8")
        return s

    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
@@ -459,7 +458,7 @@ class PageElement(object):
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"


-class NavigableString(unicode, PageElement):
+class NavigableString(text_type, PageElement):

    def __new__(cls, value):
        """Create a new NavigableString.
@@ -469,9 +468,9 @@ class NavigableString(unicode, PageElement):
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
-        if isinstance(value, unicode):
-            return unicode.__new__(cls, value)
-        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+        if isinstance(value, text_type):
+            return text_type.__new__(cls, value)
+        return text_type.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)

    def __getnewargs__(self):
        return (NavigableString.__str__(self),)
@@ -1006,7 +1005,7 @@ class SoupStrainer:
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isinstance(markup, basestring):
-                markup = unicode(markup)
+                markup = text_type(markup)
            #Now we know that chunk is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
@@ -1016,8 +1015,8 @@ class SoupStrainer:
            elif hasattr(matchAgainst, 'items'):
                result = markup.has_key(matchAgainst)
            elif matchAgainst and isinstance(markup, basestring):
-                if isinstance(markup, unicode):
-                    matchAgainst = unicode(matchAgainst)
+                if isinstance(markup, text_type):
+                    matchAgainst = text_type(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

@@ -1181,7 +1180,7 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser):
    def _feed(self, inDocumentEncoding=None, isHTML=False):
        # Convert the document to Unicode.
        markup = self.markup
-        if isinstance(markup, unicode):
+        if isinstance(markup, text_type):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
@@ -1792,9 +1791,9 @@ class UnicodeDammit:
                     self._detectEncoding(markup, isHTML)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
-        if markup == '' or isinstance(markup, unicode):
+        if markup == '' or isinstance(markup, text_type):
            self.originalEncoding = None
-            self.unicode = unicode(markup)
+            self.unicode = text_type(markup)
            return

        u = None
@@ -1807,7 +1806,7 @@ class UnicodeDammit:
                if u: break

        # If no luck and we have auto-detection library, try that:
-        if not u and chardet and not isinstance(self.markup, unicode):
+        if not u and chardet and not isinstance(self.markup, text_type):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
@@ -1880,7 +1879,7 @@ class UnicodeDammit:
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
-        newdata = unicode(data, encoding)
+        newdata = text_type(data, encoding)
        return newdata

    def _detectEncoding(self, xml_data, isHTML=False):
@@ -1893,41 +1892,41 @@ class UnicodeDammit:
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
-                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
+                xml_data = text_type(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
-                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
+                xml_data = text_type(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
-                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
+                xml_data = text_type(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
-                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
+                xml_data = text_type(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
-                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
+                xml_data = text_type(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
-                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
+                xml_data = text_type(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
-                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
+                xml_data = text_type(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
-                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
+                xml_data = text_type(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
-                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
+                xml_data = text_type(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
                pass