Implementation for Issue #195

This commit is contained in:
Miroslav Stampar
2012-09-25 10:17:25 +02:00
parent 9ca7b3e20e
commit c9e7e71ea2
2 changed files with 11 additions and 8 deletions

View File

@@ -100,14 +100,14 @@ def parseResponse(page, headers):
if page:
htmlParser(page)
def checkCharEncoding(encoding):
def checkCharEncoding(encoding, warn=True):
if encoding:
encoding = encoding.lower()
else:
return encoding
# http://www.destructor.de/charsets/index.htm
translate = { "windows-874": "iso-8859-11", "en_us": "utf8", "macintosh": "iso-8859-1", "euc_tw": "big5_tw", "th": "tis-620", "unicode": "utf8", "utc8": "utf8"}
translate = { "windows-874": "iso-8859-11", "en_us": "utf8", "macintosh": "iso-8859-1", "euc_tw": "big5_tw", "th": "tis-620", "unicode": "utf8", "utc8": "utf8", "ebcdic": "ebcdic-cp-be"}
for delimiter in (';', ',', '('):
if delimiter in encoding:
@@ -156,9 +156,10 @@ def checkCharEncoding(encoding):
try:
codecs.lookup(encoding)
except LookupError:
warnMsg = "unknown web page charset '%s'. " % encoding
warnMsg += "Please report by e-mail to %s." % ML
singleTimeLogMessage(warnMsg, logging.WARN, encoding)
if warn:
warnMsg = "unknown web page charset '%s'. " % encoding
warnMsg += "Please report by e-mail to %s." % ML
singleTimeLogMessage(warnMsg, logging.WARN, encoding)
encoding = None
return encoding