Definite patch for MemoryError(s) (fixes #1991)

This commit is contained in:
Miroslav Stampar
2016-06-30 14:57:56 +02:00
parent 65a0f15f69
commit cb43c03712
4 changed files with 41 additions and 49 deletions

View File

@@ -21,6 +21,7 @@ from lib.core.settings import DEFAULT_PAGE_ENCODING
from lib.core.settings import DIFF_TOLERANCE
from lib.core.settings import HTML_TITLE_REGEX
from lib.core.settings import MIN_RATIO
from lib.core.settings import MAX_DIFFLIB_SEQUENCE_LENGTH
from lib.core.settings import MAX_RATIO
from lib.core.settings import REFLECTED_VALUE_MARKER
from lib.core.settings import LOWER_RATIO_BOUND
@@ -54,8 +55,6 @@ def _comparison(page, headers, code, getRatioValue, pageLength):
if page is None and pageLength is None:
return None
count = 0
seqMatcher = threadData.seqMatcher
seqMatcher.set_seq1(kb.pageTemplate)
@@ -110,59 +109,37 @@ def _comparison(page, headers, code, getRatioValue, pageLength):
elif isinstance(seqMatcher.a, unicode) and isinstance(page, str):
seqMatcher.a = seqMatcher.a.encode(kb.pageEncoding or DEFAULT_PAGE_ENCODING, 'ignore')
seq1, seq2 = None, None
if conf.titles:
seq1 = extractRegexResult(HTML_TITLE_REGEX, seqMatcher.a)
seq2 = extractRegexResult(HTML_TITLE_REGEX, page)
if seqMatcher.a and page and seqMatcher.a == page:
ratio = 1
elif kb.skipSeqMatcher or seqMatcher.a and page and any(len(_) > MAX_DIFFLIB_SEQUENCE_LENGTH for _ in (seqMatcher.a, page)):
ratio = 1.0 * len(seqMatcher.a) / len(page)
if ratio > 1:
ratio = 1. / ratio
else:
seq1 = getFilteredPageContent(seqMatcher.a, True) if conf.textOnly else seqMatcher.a
seq2 = getFilteredPageContent(page, True) if conf.textOnly else page
seq1, seq2 = None, None
if seq1 is None or seq2 is None:
return None
seq1 = seq1.replace(REFLECTED_VALUE_MARKER, "")
seq2 = seq2.replace(REFLECTED_VALUE_MARKER, "")
while count < min(len(seq1), len(seq2)):
if seq1[count] == seq2[count]:
count += 1
if conf.titles:
seq1 = extractRegexResult(HTML_TITLE_REGEX, seqMatcher.a)
seq2 = extractRegexResult(HTML_TITLE_REGEX, page)
else:
break
seq1 = getFilteredPageContent(seqMatcher.a, True) if conf.textOnly else seqMatcher.a
seq2 = getFilteredPageContent(page, True) if conf.textOnly else page
if count:
try:
_seq1 = seq1[count:]
_seq2 = seq2[count:]
except MemoryError:
pass
else:
seq1 = _seq1
seq2 = _seq2
if seq1 is None or seq2 is None:
return None
while True:
try:
seqMatcher.set_seq1(seq1)
except MemoryError:
seq1 = seq1[:len(seq1) / 1024]
else:
break
seq1 = seq1.replace(REFLECTED_VALUE_MARKER, "")
seq2 = seq2.replace(REFLECTED_VALUE_MARKER, "")
while True:
try:
seqMatcher.set_seq2(seq2)
except MemoryError:
seq2 = seq2[:len(seq2) / 1024]
else:
break
seqMatcher.set_seq1(seq1)
seqMatcher.set_seq2(seq2)
ratio = round(seqMatcher.quick_ratio(), 3)
ratio = round(seqMatcher.quick_ratio(), 3)
# If the URL is stable, the match ratio has not been set yet, and the
# current injected value changes the URL's page content
if kb.matchRatio is None:
if (count or ratio >= LOWER_RATIO_BOUND) and ratio <= UPPER_RATIO_BOUND:
if ratio >= LOWER_RATIO_BOUND and ratio <= UPPER_RATIO_BOUND:
kb.matchRatio = ratio
logger.debug("setting match ratio for current parameter to %.3f" % kb.matchRatio)