Update of 3rd party library chardet

This commit is contained in:
Miroslav Stampar
2022-03-03 18:03:01 +01:00
parent 75905e0cd9
commit bacf18832a
42 changed files with 2025 additions and 2959 deletions

View File

@@ -27,8 +27,7 @@
######################### END LICENSE BLOCK #########################
from .charsetprober import CharSetProber
from .constants import eNotMe
from .compat import wrap_ord
from .enums import ProbingState
FREQ_CAT_NUM = 4
@@ -82,7 +81,7 @@ Latin1_CharToClass = (
# 2 : normal
# 3 : very likely
Latin1ClassModel = (
# UDF OTH ASC ASS ACV ACO ASV ASO
# UDF OTH ASC ASS ACV ACO ASV ASO
0, 0, 0, 0, 0, 0, 0, 0, # UDF
0, 3, 3, 3, 3, 3, 3, 3, # OTH
0, 3, 3, 3, 3, 3, 3, 3, # ASC
@@ -96,40 +95,47 @@ Latin1ClassModel = (
class Latin1Prober(CharSetProber):
def __init__(self):
CharSetProber.__init__(self)
super(Latin1Prober, self).__init__()
self._last_char_class = None
self._freq_counter = None
self.reset()
def reset(self):
self._mLastCharClass = OTH
self._mFreqCounter = [0] * FREQ_CAT_NUM
self._last_char_class = OTH
self._freq_counter = [0] * FREQ_CAT_NUM
CharSetProber.reset(self)
def get_charset_name(self):
return "windows-1252"
@property
def charset_name(self):
return "ISO-8859-1"
def feed(self, aBuf):
aBuf = self.filter_with_english_letters(aBuf)
for c in aBuf:
charClass = Latin1_CharToClass[wrap_ord(c)]
freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM)
+ charClass]
@property
def language(self):
return ""
def feed(self, byte_str):
byte_str = self.filter_with_english_letters(byte_str)
for c in byte_str:
char_class = Latin1_CharToClass[c]
freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM)
+ char_class]
if freq == 0:
self._mState = eNotMe
self._state = ProbingState.NOT_ME
break
self._mFreqCounter[freq] += 1
self._mLastCharClass = charClass
self._freq_counter[freq] += 1
self._last_char_class = char_class
return self.get_state()
return self.state
def get_confidence(self):
if self.get_state() == eNotMe:
if self.state == ProbingState.NOT_ME:
return 0.01
total = sum(self._mFreqCounter)
total = sum(self._freq_counter)
if total < 0.01:
confidence = 0.0
else:
confidence = ((self._mFreqCounter[3] - self._mFreqCounter[1] * 20.0)
confidence = ((self._freq_counter[3] - self._freq_counter[1] * 20.0)
/ total)
if confidence < 0.0:
confidence = 0.0