From 63d9b7a1f878b5c9041cea1579953f6420841062 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 20 Dec 2012 12:23:37 +0100 Subject: [PATCH] No character shall be left forgotten (no more '?' in case a character could not be properly decoded by the used charset) --- lib/core/common.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/core/common.py b/lib/core/common.py index 54b159eb3..afaa1c90c 100644 --- a/lib/core/common.py +++ b/lib/core/common.py @@ -1815,7 +1815,11 @@ def getUnicode(value, encoding=None, system=False, noneToNull=False): if isinstance(value, unicode): return value elif isinstance(value, basestring): - return unicode(value, encoding or kb.pageEncoding or UNICODE_ENCODING, "replace") + while True: + try: + return unicode(value, encoding or kb.pageEncoding or UNICODE_ENCODING) + except UnicodeDecodeError, ex: + value = value[:ex.start] + "".join("\\x%02x" % ord(_) for _ in value[ex.start:ex.end]) + value[ex.end:] else: return unicode(value) # encoding ignored for non-basestring instances else: