From 9c093d91f256562dc1ff9d247a1023cbe456ec5a Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 9 Jun 2011 06:14:35 +0000 Subject: [PATCH] minor update --- lib/core/common.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/core/common.py b/lib/core/common.py index e096fcb35..8b07287e8 100644 --- a/lib/core/common.py +++ b/lib/core/common.py @@ -1399,7 +1399,8 @@ def sanitizeAsciiString(subject): def getFilteredPageContent(page, onlyText=True): retVal = page - if isinstance(page, basestring): + # only if the page's charset had been successfully identified + if isinstance(page, unicode): retVal = re.sub(r"(?s)||%s" % (r"|<[^>]+>|\t|\n|\r" if onlyText else ""), " ", page) while retVal.find(" ") != -1: @@ -1412,7 +1413,8 @@ def getFilteredPageContent(page, onlyText=True): def getPageTextWordsSet(page): retVal = None - if isinstance(page, basestring): + # only if the page's charset had been successfully identified + if isinstance(page, unicode): page = getFilteredPageContent(page) retVal = set(re.findall(r"\w+", page))