Adding new version of chardet

This commit is contained in:
Miroslav Stampar
2015-10-09 13:35:48 +02:00
parent d424d4cdc7
commit 439d003753
39 changed files with 1499 additions and 1148 deletions

View File

@@ -13,39 +13,43 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants, sys
from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel
from charsetprober import CharSetProber
from codingstatemachine import CodingStateMachine
from . import constants
from .escsm import (HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel,
ISO2022KRSMModel)
from .charsetprober import CharSetProber
from .codingstatemachine import CodingStateMachine
from .compat import wrap_ord
class EscCharSetProber(CharSetProber):
def __init__(self):
CharSetProber.__init__(self)
self._mCodingSM = [ \
self._mCodingSM = [
CodingStateMachine(HZSMModel),
CodingStateMachine(ISO2022CNSMModel),
CodingStateMachine(ISO2022JPSMModel),
CodingStateMachine(ISO2022KRSMModel)
]
]
self.reset()
def reset(self):
CharSetProber.reset(self)
for codingSM in self._mCodingSM:
if not codingSM: continue
codingSM.active = constants.True
if not codingSM:
continue
codingSM.active = True
codingSM.reset()
self._mActiveSM = len(self._mCodingSM)
self._mDetectedCharset = None
@@ -61,19 +65,22 @@ class EscCharSetProber(CharSetProber):
def feed(self, aBuf):
for c in aBuf:
# PY3K: aBuf is a byte array, so c is an int, not a byte
for codingSM in self._mCodingSM:
if not codingSM: continue
if not codingSM.active: continue
codingState = codingSM.next_state(c)
if not codingSM:
continue
if not codingSM.active:
continue
codingState = codingSM.next_state(wrap_ord(c))
if codingState == constants.eError:
codingSM.active = constants.False
codingSM.active = False
self._mActiveSM -= 1
if self._mActiveSM <= 0:
self._mState = constants.eNotMe
return self.get_state()
elif codingState == constants.eItsMe:
self._mState = constants.eFoundIt
self._mDetectedCharset = codingSM.get_coding_state_machine()
self._mDetectedCharset = codingSM.get_coding_state_machine() # nopep8
return self.get_state()
return self.get_state()