U
    ]9Y                     @   s0   d dl Z d dlZddlmZ G dd deZdS )    N   )ProbingStatec                   @   sn   e Zd ZdZdddZdd Zedd Zd	d
 Zedd Z	dd Z
edd Zedd Zedd ZdS )CharSetProbergffffff?Nc                 C   s   d | _ || _tt| _d S N)_statelang_filterloggingZ	getLogger__name__Zlogger)selfr    r   7/usr/lib/python3/dist-packages/chardet/charsetprober.py__init__'   s    zCharSetProber.__init__c                 C   s   t j| _d S r   )r   Z	DETECTINGr   r
   r   r   r   reset,   s    zCharSetProber.resetc                 C   s   d S r   r   r   r   r   r   charset_name/   s    zCharSetProber.charset_namec                 C   s   d S r   r   )r
   bufr   r   r   feed3   s    zCharSetProber.feedc                 C   s   | j S r   )r   r   r   r   r   state6   s    zCharSetProber.statec                 C   s   dS )Ng        r   r   r   r   r   get_confidence:   s    zCharSetProber.get_confidencec                 C   s   t dd| } | S )Ns   ([ -])+    )resub)r   r   r   r   filter_high_byte_only=   s    z#CharSetProber.filter_high_byte_onlyc                 C   s\   t  }td| }|D ]@}||dd  |dd }| sL|dk rLd}|| q|S )u9  
        We define three types of bytes:
        alphabet: english alphabets [a-zA-Z]
        international: international characters [-ÿ]
        marker: everything else [^a-zA-Z-ÿ]

        The input buffer can be thought to contain a series of words delimited
        by markers. This function works to filter all words that contain at
        least one international character. All contiguous sequences of markers
        are replaced by a single space ascii character.

        This filter applies to all scripts which do not use English characters.
        s%   [a-zA-Z]*[-]+[a-zA-Z]*[^a-zA-Z-]?N   r   )	bytearrayr   findallextendisalpha)r   filteredZwordsZwordZ	last_charr   r   r   filter_international_wordsB   s    z(CharSetProber.filter_international_wordsc                 C   s   t  }d}d}tt| D ]n}| ||d  }|dkr<d}n|dkrHd}|dk r| s||kr|s|| ||  |d |d }q|s|| |d	  |S )
a  
        Returns a copy of ``buf`` that retains only the sequences of English
        alphabet and high byte characters that are not between <> characters.
        Also retains English alphabet and high byte characters immediately
        before occurrences of >.

        This filter can be applied to all scripts which contain both English
        characters and extended ASCII characters, but is currently only used by
        ``Latin1Prober``.
        Fr   r      >   <Tr   r   N)r   rangelenr   r   )r   r   Zin_tagprevZcurrZbuf_charr   r   r   filter_with_english_lettersg   s"    

z)CharSetProber.filter_with_english_letters)N)r	   
__module____qualname__ZSHORTCUT_THRESHOLDr   r   propertyr   r   r   r   staticmethodr   r    r&   r   r   r   r   r   #   s   




$r   )r   r   Zenumsr   objectr   r   r   r   r   <module>   s   