Update of 3rd party library chardet

This commit is contained in:
Miroslav Stampar
2022-03-03 18:03:01 +01:00
parent 75905e0cd9
commit bacf18832a
42 changed files with 2025 additions and 2959 deletions

View File

@@ -25,68 +25,68 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import sys
from . import constants
from .enums import ProbingState, MachineState
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCJPDistributionAnalysis
from .jpcntx import EUCJPContextAnalysis
from .mbcssm import EUCJPSMModel
from .mbcssm import EUCJP_SM_MODEL
if sys.version_info >= (3, 0):
xrange = range
class EUCJPProber(MultiByteCharSetProber):
def __init__(self):
MultiByteCharSetProber.__init__(self)
self._mCodingSM = CodingStateMachine(EUCJPSMModel)
self._mDistributionAnalyzer = EUCJPDistributionAnalysis()
self._mContextAnalyzer = EUCJPContextAnalysis()
super(EUCJPProber, self).__init__()
self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL)
self.distribution_analyzer = EUCJPDistributionAnalysis()
self.context_analyzer = EUCJPContextAnalysis()
self.reset()
def reset(self):
MultiByteCharSetProber.reset(self)
self._mContextAnalyzer.reset()
super(EUCJPProber, self).reset()
self.context_analyzer.reset()
def get_charset_name(self):
@property
def charset_name(self):
return "EUC-JP"
def feed(self, aBuf):
aLen = len(aBuf)
for i in xrange(0, aLen):
# PY3K: aBuf is a byte array, so aBuf[i] is an int, not a byte
codingState = self._mCodingSM.next_state(aBuf[i])
if codingState == constants.eError:
if constants._debug:
sys.stderr.write(self.get_charset_name()
+ ' prober hit error at byte ' + str(i)
+ '\n')
self._mState = constants.eNotMe
@property
def language(self):
return "Japanese"
def feed(self, byte_str):
for i in range(len(byte_str)):
# PY3K: byte_str is a byte array, so byte_str[i] is an int, not a byte
coding_state = self.coding_sm.next_state(byte_str[i])
if coding_state == MachineState.ERROR:
self.logger.debug('%s %s prober hit error at byte %s',
self.charset_name, self.language, i)
self._state = ProbingState.NOT_ME
break
elif codingState == constants.eItsMe:
self._mState = constants.eFoundIt
elif coding_state == MachineState.ITS_ME:
self._state = ProbingState.FOUND_IT
break
elif codingState == constants.eStart:
charLen = self._mCodingSM.get_current_charlen()
elif coding_state == MachineState.START:
char_len = self.coding_sm.get_current_charlen()
if i == 0:
self._mLastChar[1] = aBuf[0]
self._mContextAnalyzer.feed(self._mLastChar, charLen)
self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
self._last_char[1] = byte_str[0]
self.context_analyzer.feed(self._last_char, char_len)
self.distribution_analyzer.feed(self._last_char, char_len)
else:
self._mContextAnalyzer.feed(aBuf[i - 1:i + 1], charLen)
self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
charLen)
self.context_analyzer.feed(byte_str[i - 1:i + 1],
char_len)
self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
char_len)
self._mLastChar[0] = aBuf[aLen - 1]
self._last_char[0] = byte_str[-1]
if self.get_state() == constants.eDetecting:
if (self._mContextAnalyzer.got_enough_data() and
(self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
self._mState = constants.eFoundIt
if self.state == ProbingState.DETECTING:
if (self.context_analyzer.got_enough_data() and
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
self._state = ProbingState.FOUND_IT
return self.get_state()
return self.state
def get_confidence(self):
contxtCf = self._mContextAnalyzer.get_confidence()
distribCf = self._mDistributionAnalyzer.get_confidence()
return max(contxtCf, distribCf)
context_conf = self.context_analyzer.get_confidence()
distrib_conf = self.distribution_analyzer.get_confidence()
return max(context_conf, distrib_conf)