Update of 3rd party library chardet

2025-12-09 14:11:29 +00:00 · 2022-03-03 18:03:01 +01:00
parent 75905e0cd9
commit bacf18832a
42 changed files with 2025 additions and 2959 deletions
--- a/thirdparty/chardet/hebrewprober.py
+++ b/thirdparty/chardet/hebrewprober.py
@@ -26,8 +26,7 @@
 ######################### END LICENSE BLOCK #########################

 from .charsetprober import CharSetProber
-from .constants import eNotMe, eDetecting
-from .compat import wrap_ord
+from .enums import ProbingState

 # This prober doesn't actually recognize a language or a charset.
 # It is a helper prober for the use of the Hebrew model probers
@@ -126,56 +125,59 @@ from .compat import wrap_ord
 # model probers scores. The answer is returned in the form of the name of the
 # charset identified, either "windows-1255" or "ISO-8859-8".

-# windows-1255 / ISO-8859-8 code points of interest
-FINAL_KAF = 0xea
-NORMAL_KAF = 0xeb
-FINAL_MEM = 0xed
-NORMAL_MEM = 0xee
-FINAL_NUN = 0xef
-NORMAL_NUN = 0xf0
-FINAL_PE = 0xf3
-NORMAL_PE = 0xf4
-FINAL_TSADI = 0xf5
-NORMAL_TSADI = 0xf6
-
-# Minimum Visual vs Logical final letter score difference.
-# If the difference is below this, don't rely solely on the final letter score
-# distance.
-MIN_FINAL_CHAR_DISTANCE = 5
-
-# Minimum Visual vs Logical model score difference.
-# If the difference is below this, don't rely at all on the model score
-# distance.
-MIN_MODEL_DISTANCE = 0.01
-
-VISUAL_HEBREW_NAME = "ISO-8859-8"
-LOGICAL_HEBREW_NAME = "windows-1255"
-
-
 class HebrewProber(CharSetProber):
+    # windows-1255 / ISO-8859-8 code points of interest
+    FINAL_KAF = 0xea
+    NORMAL_KAF = 0xeb
+    FINAL_MEM = 0xed
+    NORMAL_MEM = 0xee
+    FINAL_NUN = 0xef
+    NORMAL_NUN = 0xf0
+    FINAL_PE = 0xf3
+    NORMAL_PE = 0xf4
+    FINAL_TSADI = 0xf5
+    NORMAL_TSADI = 0xf6
+
+    # Minimum Visual vs Logical final letter score difference.
+    # If the difference is below this, don't rely solely on the final letter score
+    # distance.
+    MIN_FINAL_CHAR_DISTANCE = 5
+
+    # Minimum Visual vs Logical model score difference.
+    # If the difference is below this, don't rely at all on the model score
+    # distance.
+    MIN_MODEL_DISTANCE = 0.01
+
+    VISUAL_HEBREW_NAME = "ISO-8859-8"
+    LOGICAL_HEBREW_NAME = "windows-1255"
+
    def __init__(self):
-        CharSetProber.__init__(self)
-        self._mLogicalProber = None
-        self._mVisualProber = None
+        super(HebrewProber, self).__init__()
+        self._final_char_logical_score = None
+        self._final_char_visual_score = None
+        self._prev = None
+        self._before_prev = None
+        self._logical_prober = None
+        self._visual_prober = None
        self.reset()

    def reset(self):
-        self._mFinalCharLogicalScore = 0
-        self._mFinalCharVisualScore = 0
+        self._final_char_logical_score = 0
+        self._final_char_visual_score = 0
        # The two last characters seen in the previous buffer, self._prev and
        # self._before_prev, are initialized to space in order to simulate
        # a word delimiter at the beginning of the data
-        self._mPrev = ' '
-        self._mBeforePrev = ' '
+        self._prev = ' '
+        self._before_prev = ' '
        # These probers are owned by the group prober.

    def set_model_probers(self, logicalProber, visualProber):
-        self._mLogicalProber = logicalProber
-        self._mVisualProber = visualProber
+        self._logical_prober = logicalProber
+        self._visual_prober = visualProber

    def is_final(self, c):
-        return wrap_ord(c) in [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE,
-                               FINAL_TSADI]
+        return c in [self.FINAL_KAF, self.FINAL_MEM, self.FINAL_NUN,
+                     self.FINAL_PE, self.FINAL_TSADI]

    def is_non_final(self, c):
        # The normal Tsadi is not a good Non-Final letter due to words like
@@ -188,9 +190,10 @@ class HebrewProber(CharSetProber):
        # for example legally end with a Non-Final Pe or Kaf. However, the
        # benefit of these letters as Non-Final letters outweighs the damage
        # since these words are quite rare.
-        return wrap_ord(c) in [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
+        return c in [self.NORMAL_KAF, self.NORMAL_MEM,
+                     self.NORMAL_NUN, self.NORMAL_PE]

-    def feed(self, aBuf):
+    def feed(self, byte_str):
        # Final letter analysis for logical-visual decision.
        # Look for evidence that the received buffer is either logical Hebrew
        # or visual Hebrew.
@@ -217,67 +220,73 @@ class HebrewProber(CharSetProber):
        # We automatically filter out all 7-bit characters (replace them with
        # spaces) so the word boundary detection works properly. [MAP]

-        if self.get_state() == eNotMe:
+        if self.state == ProbingState.NOT_ME:
            # Both model probers say it's not them. No reason to continue.
-            return eNotMe
+            return ProbingState.NOT_ME

-        aBuf = self.filter_high_bit_only(aBuf)
+        byte_str = self.filter_high_byte_only(byte_str)

-        for cur in aBuf:
+        for cur in byte_str:
            if cur == ' ':
                # We stand on a space - a word just ended
-                if self._mBeforePrev != ' ':
-                    # next-to-last char was not a space so self._mPrev is not a
+                if self._before_prev != ' ':
+                    # next-to-last char was not a space so self._prev is not a
                    # 1 letter word
-                    if self.is_final(self._mPrev):
+                    if self.is_final(self._prev):
                        # case (1) [-2:not space][-1:final letter][cur:space]
-                        self._mFinalCharLogicalScore += 1
-                    elif self.is_non_final(self._mPrev):
+                        self._final_char_logical_score += 1
+                    elif self.is_non_final(self._prev):
                        # case (2) [-2:not space][-1:Non-Final letter][
                        #  cur:space]
-                        self._mFinalCharVisualScore += 1
+                        self._final_char_visual_score += 1
            else:
                # Not standing on a space
-                if ((self._mBeforePrev == ' ') and
-                        (self.is_final(self._mPrev)) and (cur != ' ')):
+                if ((self._before_prev == ' ') and
+                        (self.is_final(self._prev)) and (cur != ' ')):
                    # case (3) [-2:space][-1:final letter][cur:not space]
-                    self._mFinalCharVisualScore += 1
-            self._mBeforePrev = self._mPrev
-            self._mPrev = cur
+                    self._final_char_visual_score += 1
+            self._before_prev = self._prev
+            self._prev = cur

        # Forever detecting, till the end or until both model probers return
-        # eNotMe (handled above)
-        return eDetecting
+        # ProbingState.NOT_ME (handled above)
+        return ProbingState.DETECTING

-    def get_charset_name(self):
+    @property
+    def charset_name(self):
        # Make the decision: is it Logical or Visual?
        # If the final letter score distance is dominant enough, rely on it.
-        finalsub = self._mFinalCharLogicalScore - self._mFinalCharVisualScore
-        if finalsub >= MIN_FINAL_CHAR_DISTANCE:
-            return LOGICAL_HEBREW_NAME
-        if finalsub <= -MIN_FINAL_CHAR_DISTANCE:
-            return VISUAL_HEBREW_NAME
+        finalsub = self._final_char_logical_score - self._final_char_visual_score
+        if finalsub >= self.MIN_FINAL_CHAR_DISTANCE:
+            return self.LOGICAL_HEBREW_NAME
+        if finalsub <= -self.MIN_FINAL_CHAR_DISTANCE:
+            return self.VISUAL_HEBREW_NAME

        # It's not dominant enough, try to rely on the model scores instead.
-        modelsub = (self._mLogicalProber.get_confidence()
-                    - self._mVisualProber.get_confidence())
-        if modelsub > MIN_MODEL_DISTANCE:
-            return LOGICAL_HEBREW_NAME
-        if modelsub < -MIN_MODEL_DISTANCE:
-            return VISUAL_HEBREW_NAME
+        modelsub = (self._logical_prober.get_confidence()
+                    - self._visual_prober.get_confidence())
+        if modelsub > self.MIN_MODEL_DISTANCE:
+            return self.LOGICAL_HEBREW_NAME
+        if modelsub < -self.MIN_MODEL_DISTANCE:
+            return self.VISUAL_HEBREW_NAME

        # Still no good, back to final letter distance, maybe it'll save the
        # day.
        if finalsub < 0.0:
-            return VISUAL_HEBREW_NAME
+            return self.VISUAL_HEBREW_NAME

        # (finalsub > 0 - Logical) or (don't know what to do) default to
        # Logical.
-        return LOGICAL_HEBREW_NAME
+        return self.LOGICAL_HEBREW_NAME

-    def get_state(self):
+    @property
+    def language(self):
+        return 'Hebrew'
+
+    @property
+    def state(self):
        # Remain active as long as any of the model probers are active.
-        if (self._mLogicalProber.get_state() == eNotMe) and \
-           (self._mVisualProber.get_state() == eNotMe):
-            return eNotMe
-        return eDetecting
+        if (self._logical_prober.state == ProbingState.NOT_ME) and \
+           (self._visual_prober.state == ProbingState.NOT_ME):
+            return ProbingState.NOT_ME
+        return ProbingState.DETECTING