Improve "boolean detection" by automatically recognizing a convenient --string candidate

2026-01-22 14:19:03 +00:00 · 2012-04-10 21:48:34 +00:00
parent 698b7a15d9
commit 119eec3598
5 changed files with 32 additions and 3 deletions
--- a/lib/core/common.py
+++ b/lib/core/common.py
@@ -124,6 +124,7 @@ from lib.core.settings import TIME_STDEV_COEFF
 from lib.core.settings import DYNAMICITY_MARK_LENGTH
 from lib.core.settings import REFLECTIVE_MISS_THRESHOLD
 from lib.core.settings import SENSITIVE_DATA_REGEX
+from lib.core.settings import TEXT_TAG_REGEX
 from lib.core.settings import UNION_UNIQUE_FIFO_LENGTH
 from lib.core.settings import URI_INJECTION_MARK_CHAR
 from lib.core.settings import URI_QUESTION_MARKER
@@ -2155,6 +2156,13 @@ def extractRegexResult(regex, content, flags=0):

    return retVal

+def extractTextTagContent(page):
+    """
+    Returns list of text chunks directly following opening "textual" tags (see TEXT_TAG_REGEX); empty list for an empty or None page
+    """
+
+    return [_.group('result') for _ in re.finditer(TEXT_TAG_REGEX, page or "")]
+
 def trimAlphaNum(value):
    """
    Trims alpha numeric characters from start and ending of a given value
--- a/lib/core/settings.py
+++ b/lib/core/settings.py
@@ -62,10 +62,13 @@ URI_QUESTION_MARKER = "__QUESTION_MARK__"

 PAYLOAD_DELIMITER = "\x00"
 CHAR_INFERENCE_MARK = "%c"
-PRINTABLE_CHAR_REGEX = r'[^\x00-\x1f\x7e-\xff]'
+PRINTABLE_CHAR_REGEX = r"[^\x00-\x1f\x7e-\xff]"

 # regular expression used for extracting results from google search
-GOOGLE_REGEX = r'url\?q=(http[^>]+)&amp;sa=U&amp'
+GOOGLE_REGEX = r"url\?q=(http[^>]+)&amp;sa=U&amp"
+
+# regular expression used for extracting content from "textual" tags
+TEXT_TAG_REGEX = r"(?si)<(abbr|acronym|b|blockquote|br|center|cite|code|dt|em|font|h\d|i|li|p|pre|q|strong|sub|sup|td|th|title|tt|u)(?!\w).*?>(?P<result>[^<]+)"

 # dumping characters used in GROUP_CONCAT MySQL technique
 CONCAT_ROW_DELIMITER = ','
--- a/lib/core/threads.py
+++ b/lib/core/threads.py
@@ -43,6 +43,7 @@ class _ThreadData(threading.local):
        self.disableStdOut = False
        self.hashDBCursor = None
        self.inTransaction = False
+        self.lastComparisonPage = None
        self.lastErrorPage = None
        self.lastHTTPError = None
        self.lastRedirectMsg = None