Some more DREI stuff

This commit is contained in:
Miroslav Stampar
2019-04-19 11:24:34 +02:00
parent da15701a55
commit bb7bd51d94
15 changed files with 94 additions and 71 deletions

View File

@@ -91,6 +91,11 @@ import sys
if sys.version_info >= (3, 0):
xrange = range
text_type = str
binary_type = bytes
else:
text_type = unicode
binary_type = str
try:
from htmlentitydefs import name2codepoint
@@ -434,19 +439,13 @@ class PageElement(object):
def toEncoding(self, s, encoding=None):
"""Encodes an object to a string in some encoding, or to Unicode.
."""
if isinstance(s, unicode):
if isinstance(s, text_type):
if encoding:
s = s.encode(encoding)
elif isinstance(s, str):
if encoding:
s = s.encode(encoding)
else:
s = unicode(s)
elif isinstance(s, binary_type):
s = s.encode(encoding or "utf8")
else:
if encoding:
s = self.toEncoding(str(s), encoding)
else:
s = unicode(s)
s = self.toEncoding(str(s), encoding or "utf8")
return s
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
@@ -459,7 +458,7 @@ class PageElement(object):
return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
class NavigableString(unicode, PageElement):
class NavigableString(text_type, PageElement):
def __new__(cls, value):
"""Create a new NavigableString.
@@ -469,9 +468,9 @@ class NavigableString(unicode, PageElement):
passed in to the superclass's __new__ or the superclass won't know
how to handle non-ASCII characters.
"""
if isinstance(value, unicode):
return unicode.__new__(cls, value)
return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
if isinstance(value, text_type):
return text_type.__new__(cls, value)
return text_type.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
def __getnewargs__(self):
return (NavigableString.__str__(self),)
@@ -1006,7 +1005,7 @@ class SoupStrainer:
if isinstance(markup, Tag):
markup = markup.name
if markup and not isinstance(markup, basestring):
markup = unicode(markup)
markup = text_type(markup)
#Now we know that chunk is either a string, or None.
if hasattr(matchAgainst, 'match'):
# It's a regexp object.
@@ -1016,8 +1015,8 @@ class SoupStrainer:
elif hasattr(matchAgainst, 'items'):
result = markup.has_key(matchAgainst)
elif matchAgainst and isinstance(markup, basestring):
if isinstance(markup, unicode):
matchAgainst = unicode(matchAgainst)
if isinstance(markup, text_type):
matchAgainst = text_type(matchAgainst)
else:
matchAgainst = str(matchAgainst)
@@ -1181,7 +1180,7 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser):
def _feed(self, inDocumentEncoding=None, isHTML=False):
# Convert the document to Unicode.
markup = self.markup
if isinstance(markup, unicode):
if isinstance(markup, text_type):
if not hasattr(self, 'originalEncoding'):
self.originalEncoding = None
else:
@@ -1792,9 +1791,9 @@ class UnicodeDammit:
self._detectEncoding(markup, isHTML)
self.smartQuotesTo = smartQuotesTo
self.triedEncodings = []
if markup == '' or isinstance(markup, unicode):
if markup == '' or isinstance(markup, text_type):
self.originalEncoding = None
self.unicode = unicode(markup)
self.unicode = text_type(markup)
return
u = None
@@ -1807,7 +1806,7 @@ class UnicodeDammit:
if u: break
# If no luck and we have auto-detection library, try that:
if not u and chardet and not isinstance(self.markup, unicode):
if not u and chardet and not isinstance(self.markup, text_type):
u = self._convertFrom(chardet.detect(self.markup)['encoding'])
# As a last resort, try utf-8 and windows-1252:
@@ -1880,7 +1879,7 @@ class UnicodeDammit:
elif data[:4] == '\xff\xfe\x00\x00':
encoding = 'utf-32le'
data = data[4:]
newdata = unicode(data, encoding)
newdata = text_type(data, encoding)
return newdata
def _detectEncoding(self, xml_data, isHTML=False):
@@ -1893,41 +1892,41 @@ class UnicodeDammit:
elif xml_data[:4] == '\x00\x3c\x00\x3f':
# UTF-16BE
sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
xml_data = text_type(xml_data, 'utf-16be').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
and (xml_data[2:4] != '\x00\x00'):
# UTF-16BE with BOM
sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
xml_data = text_type(xml_data[2:], 'utf-16be').encode('utf-8')
elif xml_data[:4] == '\x3c\x00\x3f\x00':
# UTF-16LE
sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
xml_data = text_type(xml_data, 'utf-16le').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
(xml_data[2:4] != '\x00\x00'):
# UTF-16LE with BOM
sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
xml_data = text_type(xml_data[2:], 'utf-16le').encode('utf-8')
elif xml_data[:4] == '\x00\x00\x00\x3c':
# UTF-32BE
sniffed_xml_encoding = 'utf-32be'
xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
xml_data = text_type(xml_data, 'utf-32be').encode('utf-8')
elif xml_data[:4] == '\x3c\x00\x00\x00':
# UTF-32LE
sniffed_xml_encoding = 'utf-32le'
xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
xml_data = text_type(xml_data, 'utf-32le').encode('utf-8')
elif xml_data[:4] == '\x00\x00\xfe\xff':
# UTF-32BE with BOM
sniffed_xml_encoding = 'utf-32be'
xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
xml_data = text_type(xml_data[4:], 'utf-32be').encode('utf-8')
elif xml_data[:4] == '\xff\xfe\x00\x00':
# UTF-32LE with BOM
sniffed_xml_encoding = 'utf-32le'
xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
xml_data = text_type(xml_data[4:], 'utf-32le').encode('utf-8')
elif xml_data[:3] == '\xef\xbb\xbf':
# UTF-8 with BOM
sniffed_xml_encoding = 'utf-8'
xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
xml_data = text_type(xml_data[3:], 'utf-8').encode('utf-8')
else:
sniffed_xml_encoding = 'ascii'
pass

View File

@@ -21,7 +21,6 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
"""
import io
import mimetools
import mimetypes
import os
import stat