Added common pattern value support to bisection algorithm

Bernardo Damele
2010-06-17 11:38:32 +00:00
parent 9bce22683b
commit fd76f048b6
2 changed files with 87 additions and 19 deletions
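
In short, goGoodSamaritan() now also returns a "common pattern value": the longest common prefix shared by the known common outputs that start with the value retrieved so far, computed by the new common_finder_only() helper. The caller can then test that whole prefix in one shot before falling back to character-by-character bisection. Below is a minimal sketch of how the extended return tuple might be consumed; the actual bisection caller is not part of this diff, so the function and its strategy labels are purely illustrative:

# Illustrative only: not part of the commit; shows one way a caller could
# act on the new 4-value result of goGoodSamaritan().
def next_try(singleValue, commonPattern, commonCharset, otherCharset):
    if singleValue is not None:
        # Only one known common output matches: try the whole value at once.
        return "whole-value", singleValue
    if commonPattern is not None:
        # New in this commit: try the shared prefix of the matching outputs.
        return "common-prefix", commonPattern
    if commonCharset:
        # Bisect against the likely characters first, then the rest.
        return "bisect-likely", commonCharset
    return "bisect-full", otherCharset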


@@ -1252,22 +1252,36 @@ def goGoodSamaritan(part, prevValue, originalCharset):
     predictionSet = set()
     wildIndexes = []
     singleValue = None
+    commonPatternValue = None
     countSingleValues = 0
     # If the header (e.g. Databases) we are looking for has common
     # outputs defined
     if part in kb.commonOutputs:
+        commonPartOutputs = kb.commonOutputs[part]
+        commonPatternValue = common_finder_only(prevValue, commonPartOutputs)
+        # If the longest common prefix is the same as previous value then
+        # do not consider it
+        if commonPatternValue and commonPatternValue == prevValue:
+            commonPatternValue = None
         # For each common output
-        for item in kb.commonOutputs[part]:
+        for item in commonPartOutputs:
             # Check if the common output (item) starts with prevValue
             # where prevValue is the enumerated character(s) so far
             if item.startswith(prevValue):
                 singleValue = item
                 countSingleValues += 1
                 if len(item) > len(prevValue):
                     char = item[len(prevValue)]
-                    predictionSet.add(char)
+                    if char not in predictionSet:
+                        predictionSet.add(char)
         # Reset single value if there is more than one possible common
         # output
         if countSingleValues > 1:
             singleValue = None
         commonCharset = []
         otherCharset = []
@@ -1282,12 +1296,9 @@ def goGoodSamaritan(part, prevValue, originalCharset):
         commonCharset.sort()
         if len(commonCharset) > 1:
             return None, commonCharset, otherCharset
         else:
-            return singleValue, commonCharset, originalCharset
+            return singleValue, commonPatternValue, commonCharset, originalCharset
     else:
-        return None, None, originalCharset
+        return None, None, None, originalCharset
 def getCompiledRegex(regex, *args):
     """
@@ -1389,3 +1400,25 @@ class UnicodeRawConfigParser(RawConfigParser):
                     fp.write("%s = %s\n" % (key, getUnicode(value).replace('\n', '\n\t')))
             fp.write("\n")
+# http://boredzo.org/blog/archives/2007-01-06/longest-common-prefix-in-python-2
+def longest_common_prefix(*sequences):
+    if len(sequences) == 1:
+        return sequences[0]
+    sequences = [pair[1] for pair in sorted((len(fi), fi) for fi in sequences)]
+    if not sequences:
+        return None
+    for i, comparison_ch in enumerate(sequences[0]):
+        for fi in sequences[1:]:
+            ch = fi[i]
+            if ch != comparison_ch:
+                return fi[:i]
+    return sequences[0]
+def common_finder_only(initial, sequence):
+    return longest_common_prefix(*filter(lambda x: x.startswith(initial), sequence))
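
For reference, here is a quick standalone run of the two new helpers; their bodies are copied verbatim from the hunk above so the snippet is self-contained, and the sample output list is made up for illustration. It also shows why the commonPatternValue == prevValue guard in goGoodSamaritan() is needed:

# Helpers copied verbatim from the hunk above.
def longest_common_prefix(*sequences):
    if len(sequences) == 1:
        return sequences[0]
    sequences = [pair[1] for pair in sorted((len(fi), fi) for fi in sequences)]
    if not sequences:
        return None
    for i, comparison_ch in enumerate(sequences[0]):
        for fi in sequences[1:]:
            ch = fi[i]
            if ch != comparison_ch:
                return fi[:i]
    return sequences[0]

def common_finder_only(initial, sequence):
    return longest_common_prefix(*filter(lambda x: x.startswith(initial), sequence))

# Made-up list of common outputs for some header (e.g. table names).
outputs = ["information_schema", "users", "user_privileges", "testdb", "testuser"]

# Matching candidates share a prefix longer than the retrieved value, so the
# whole prefix is worth one try before per-character bisection.
print(common_finder_only("us", outputs))    # -> 'user'

# The candidates diverge right after the retrieved value, so the prefix adds
# nothing; goGoodSamaritan() discards it (commonPatternValue == prevValue).
print(common_finder_only("test", outputs))  # -> 'test'

# No candidate matches at all.
print(common_finder_only("xyz", outputs))   # -> None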