o
    EeL                  
   @   sz  d dl mZ d dlmZ d dlmZmZ ddlmZm	Z	m
Z
 ddlmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ G dd dZG d	d
 d
eZG dd deZG dd deZ G dd deZ!G dd deZ"G dd deZ#G dd deZ$G dd deZ%G dd deZ&edddee' dee' de(fd d!Z)ed"d	$d+d%e'd&e*d'e(de*fd(d)Z+d*S ),    )	lru_cache)	getLogger)ListOptional   )COMMON_SAFE_ASCII_CHARACTERSTRACEUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuated	is_arabicis_arabic_isolated_formis_case_variableis_cjkis_emoticon	is_hangulis_hiraganais_katakanais_latinis_punctuationis_separator	is_symbolis_thaiis_unprintableremove_accentunicode_rangec                   @   sP   e Zd ZdZdedefddZdeddfddZdd	d
Ze	de
fddZdS )MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    	characterreturnc                 C      t )z@
        Determine if given character should be fed in.
        NotImplementedErrorselfr    r#   V/var/www/bmteknikk.ddns.net/venv/lib/python3.10/site-packages/charset_normalizer/md.pyeligible%      zMessDetectorPlugin.eligibleNc                 C   r   )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        r   r!   r#   r#   r$   feed+   s   zMessDetectorPlugin.feedc                 C   r   )zB
        Permit to reset the plugin to the initial state.
        r   r"   r#   r#   r$   reset2   r&   zMessDetectorPlugin.resetc                 C   r   )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        r   r(   r#   r#   r$   ratio8   s   zMessDetectorPlugin.ratior   N)__name__
__module____qualname____doc__strboolr%   r'   r)   propertyfloatr*   r#   r#   r#   r$   r      s    
r   c                   @   V   e Zd ZdddZdedefddZdeddfdd	Zdd
dZe	de
fddZdS ) TooManySymbolOrPunctuationPluginr   Nc                 C   s"   d| _ d| _d| _d | _d| _d S )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_char_frenzy_symbol_in_wordr(   r#   r#   r$   __init__B   s
   
z)TooManySymbolOrPunctuationPlugin.__init__r   c                 C      |  S Nisprintabler!   r#   r#   r$   r%   J      z)TooManySymbolOrPunctuationPlugin.eligiblec                 C   sp   |  j d7  _ || jkr3|tvr3t|r|  jd7  _n| du r3t|r3t|du r3|  jd7  _|| _d S )Nr   F   )	r8   r9   r   r   r6   isdigitr   r   r7   r!   r#   r#   r$   r'   M   s   

z%TooManySymbolOrPunctuationPlugin.feedc                 C   s   d| _ d| _d| _d S Nr   )r6   r8   r7   r(   r#   r#   r$   r)   _      
z&TooManySymbolOrPunctuationPlugin.resetc                 C   s0   | j dkrdS | j| j | j  }|dkr|S dS )Nr           333333?)r8   r6   r7   )r"   ratio_of_punctuationr#   r#   r$   r*   d   s   

z&TooManySymbolOrPunctuationPlugin.ratior+   r,   r-   r.   r;   r0   r1   r%   r'   r)   r2   r3   r*   r#   r#   r#   r$   r5   A   s    

r5   c                   @   r4   )TooManyAccentuatedPluginr   Nc                 C      d| _ d| _d S rC   r8   _accentuated_countr(   r#   r#   r$   r;   q      
z!TooManyAccentuatedPlugin.__init__r   c                 C   r<   r=   )isalphar!   r#   r#   r$   r%   u   r@   z!TooManyAccentuatedPlugin.eligiblec                 C   ,   |  j d7  _ t|r|  jd7  _d S d S Nr   )r8   r
   rL   r!   r#   r#   r$   r'   x      zTooManyAccentuatedPlugin.feedc                 C   rJ   rC   rK   r(   r#   r#   r$   r)   ~   rM   zTooManyAccentuatedPlugin.resetc                 C   s*   | j dk rdS | j| j  }|dkr|S dS )N   rE   gffffff?rK   )r"   ratio_of_accentuationr#   r#   r$   r*      s   
zTooManyAccentuatedPlugin.ratior+   rH   r#   r#   r#   r$   rI   p   s    

rI   c                   @   r4   )UnprintablePluginr   Nc                 C   rJ   rC   )_unprintable_countr8   r(   r#   r#   r$   r;      rM   zUnprintablePlugin.__init__r   c                 C      dS NTr#   r!   r#   r#   r$   r%         zUnprintablePlugin.eligiblec                 C   s(   t |r|  jd7  _|  jd7  _d S rP   )r   rU   r8   r!   r#   r#   r$   r'      s   zUnprintablePlugin.feedc                 C   s
   d| _ d S rC   )rU   r(   r#   r#   r$   r)      s   
zUnprintablePlugin.resetc                 C      | j dkrdS | jd | j  S )Nr   rE   rR   )r8   rU   r(   r#   r#   r$   r*         
zUnprintablePlugin.ratior+   rH   r#   r#   r#   r$   rT      s    

rT   c                   @   r4   )SuspiciousDuplicateAccentPluginr   Nc                 C      d| _ d| _d | _d S rC   _successive_countr8   _last_latin_characterr(   r#   r#   r$   r;      s   
z(SuspiciousDuplicateAccentPlugin.__init__r   c                 C   s   |  ot|S r=   )rN   r   r!   r#   r#   r$   r%      s   z(SuspiciousDuplicateAccentPlugin.eligiblec                 C   st   |  j d7  _ | jd ur5t|r5t| jr5| r%| j r%|  jd7  _t|t| jkr5|  jd7  _|| _d S rP   )r8   r_   r
   isupperr^   r   r!   r#   r#   r$   r'      s   

z$SuspiciousDuplicateAccentPlugin.feedc                 C   r\   rC   r]   r(   r#   r#   r$   r)      rD   z%SuspiciousDuplicateAccentPlugin.resetc                 C   rY   )Nr   rE   rA   )r8   r^   r(   r#   r#   r$   r*      rZ   z%SuspiciousDuplicateAccentPlugin.ratior+   rH   r#   r#   r#   r$   r[      s    

r[   c                   @   r4   )SuspiciousRanger   Nc                 C   r\   rC   )"_suspicious_successive_range_countr8   _last_printable_seenr(   r#   r#   r$   r;      rD   zSuspiciousRange.__init__r   c                 C   r<   r=   r>   r!   r#   r#   r$   r%      r@   zSuspiciousRange.eligiblec                 C   sx   |  j d7  _ | st|s|tv rd | _d S | jd u r"|| _d S t| j}t|}t||r7|  jd7  _|| _d S rP   )r8   isspacer   r   rc   r    is_suspiciously_successive_rangerb   )r"   r   unicode_range_aunicode_range_br#   r#   r$   r'      s    



zSuspiciousRange.feedc                 C   r\   rC   )r8   rb   rc   r(   r#   r#   r$   r)      rD   zSuspiciousRange.resetc                 C   s"   | j dkrdS | jd | j  }|S )N   rE   rA   )r8   rb   )r"   ratio_of_suspicious_range_usager#   r#   r$   r*      s   
zSuspiciousRange.ratior+   rH   r#   r#   r#   r$   ra      s    

ra   c                   @   r4   )SuperWeirdWordPluginr   Nc                 C   s:   d| _ d| _d| _d| _d| _d| _d| _d| _d| _d S )Nr   F )	_word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchr8   _bad_character_count_buffer_buffer_accent_countr(   r#   r#   r$   r;      s   
zSuperWeirdWordPlugin.__init__r   c                 C   rV   rW   r#   r!   r#   r#   r$   r%     rX   zSuperWeirdWordPlugin.eligiblec                 C   s*  |  rH|  j|7  _t|r|  jd7  _| jdu rFt|du s%t|rFt|du rFt|du rFt|du rFt	|du rFt
|du rFd| _d S | jsMd S | sYt|sYt|r| jr|  jd7  _t| j}|  j|7  _|dkr| j| dkr}d| _t| jd r| jd  rtdd | jD du r|  jd7  _d| _|d	kr| jrd
d t| jtd|D }d}|rt|| dkrd}|s|  jd7  _d| _| jr|  jd7  _|  jt| j7  _d| _d| _d| _d| _d S |dvr| du rt|rd| _|  j|7  _d S d S d S d S )Nr   FT   g(\?c                 s   s    | ]}|  V  qd S r=   r`   ).0_r#   r#   r$   	<genexpr>-  s    z,SuperWeirdWordPlugin.feed.<locals>.<genexpr>rh   c                 S   s   g | ]
\}}|  r|qS r#   rv   )rw   cir#   r#   r$   
<listcomp>2  s    z-SuperWeirdWordPlugin.feed.<locals>.<listcomp>r   rF   rk   >   rx   -<=>|~)rN   rr   r
   rs   rp   r   r   r   r   r   r   rd   r   r   rl   lenr8   ro   r`   allrn   ziprangerm   rq   rB   r   )r"   r   buffer_lengthcamel_case_dstprobable_camel_casedr#   r#   r$   r'     sz   



zSuperWeirdWordPlugin.feedc                 C   s4   d| _ d| _d| _d| _d| _d| _d| _d| _d S )Nrk   Fr   )rr   ro   rp   rm   rl   r8   rq   rn   r(   r#   r#   r$   r)   P  s   
zSuperWeirdWordPlugin.resetc                 C   s$   | j dkr| jdkrdS | j| j S )N
   r   rE   )rl   rn   rq   r8   r(   r#   r#   r$   r*   Z  s   zSuperWeirdWordPlugin.ratior+   rH   r#   r#   r#   r$   rj      s    

E
rj   c                   @   sZ   e Zd ZdZdddZdedefddZdeddfd	d
ZdddZ	e
defddZdS )CjkInvalidStopPluginu   
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    r   Nc                 C   rJ   rC   _wrong_stop_count_cjk_character_countr(   r#   r#   r$   r;   h  rM   zCjkInvalidStopPlugin.__init__r   c                 C   rV   rW   r#   r!   r#   r#   r$   r%   l  rX   zCjkInvalidStopPlugin.eligiblec                 C   s8   |dv r|  j d7  _ d S t|r|  jd7  _d S d S )N>      丄   丅r   )r   r   r   r!   r#   r#   r$   r'   o  s   zCjkInvalidStopPlugin.feedc                 C   rJ   rC   r   r(   r#   r#   r$   r)   v  rM   zCjkInvalidStopPlugin.resetc                 C   s   | j dk rdS | j| j  S )N   rE   )r   r   r(   r#   r#   r$   r*   z  s   
zCjkInvalidStopPlugin.ratior+   )r,   r-   r.   r/   r;   r0   r1   r%   r'   r)   r2   r3   r*   r#   r#   r#   r$   r   b  s    

r   c                   @   r4   )ArchaicUpperLowerPluginr   Nc                 C   s.   d| _ d| _d| _d| _d| _d | _d| _d S )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr8   _last_alpha_seen_current_ascii_onlyr(   r#   r#   r$   r;     s   
z ArchaicUpperLowerPlugin.__init__r   c                 C   rV   rW   r#   r!   r#   r#   r$   r%     rX   z ArchaicUpperLowerPlugin.eligiblec                 C   s$  |  ot|}|du }|rC| jdkrC| jdkr+| du r+| jdu r+|  j| j7  _d| _d| _d | _d| _|  j	d7  _	d| _d S | jdu rQ|
 du rQd| _| jd ur| r_| j sh| r|| j r|| jdu rx|  jd7  _d| _nd| _nd| _|  j	d7  _	|  jd7  _|| _d S )NFr   @   r   TrA   )rN   r   r   rB   r   r   r   r   r   r8   isasciir`   islower)r"   r   is_concerned	chunk_sepr#   r#   r$   r'     s@   




zArchaicUpperLowerPlugin.feedc                 C   s.   d| _ d| _d| _d| _d | _d| _d| _d S )Nr   FT)r8   r   r   r   r   r   r   r(   r#   r#   r$   r)     s   
zArchaicUpperLowerPlugin.resetc                 C   s   | j dkrdS | j| j  S )Nr   rE   )r8   r   r(   r#   r#   r$   r*     s   
zArchaicUpperLowerPlugin.ratior+   rH   r#   r#   r#   r$   r     s    

*	r   c                   @   sV   e Zd ZdddZdddZdedefdd	Zdeddfd
dZe	de
fddZdS )ArabicIsolatedFormPluginr   Nc                 C   rJ   rC   r8   _isolated_form_countr(   r#   r#   r$   r;     rM   z!ArabicIsolatedFormPlugin.__init__c                 C   rJ   rC   r   r(   r#   r#   r$   r)     rM   zArabicIsolatedFormPlugin.resetr   c                 C   s   t |S r=   )r   r!   r#   r#   r$   r%     r@   z!ArabicIsolatedFormPlugin.eligiblec                 C   rO   rP   )r8   r   r   r!   r#   r#   r$   r'     rQ   zArabicIsolatedFormPlugin.feedc                 C   s   | j dk rdS | j| j  }|S )NrR   rE   r   )r"   isolated_form_usager#   r#   r$   r*     s   
zArabicIsolatedFormPlugin.ratior+   )r,   r-   r.   r;   r)   r0   r1   r%   r'   r2   r3   r*   r#   r#   r#   r$   r     s    

r      )maxsizerf   rg   r   c                 C   sv  | du s|du r
dS | |krdS d| v rd|v rdS d| v s"d|v r$dS d| v s,d|v r6d| v s4d|v r6dS |  d| d}}|D ]}|tv rJqC||v rQ dS qC| dv |dv }}|s_|rid	| v sgd	|v ridS |ro|rodS d
| v swd
|v rd	| v sd	|v rdS | dks|dkrdS d	| v sd	|v s| dv r|dv rd| v sd|v rdS d| v sd|v rdS | dks|dkrdS dS )za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    NTFLatin	Emoticons	Combining )HiraganaKatakanaCJKHangulzBasic Latin)r   r   PunctuationForms)splitr	   )rf   rg   keywords_range_akeywords_range_belrange_a_jp_charsrange_b_jp_charsr#   r#   r$   re     s^   re   i   皙?Fdecoded_sequencemaximum_thresholddebugc              	   C   sR  dd t  D }t| d }d}|dk rd}n	|dkrd}nd	}t| d
 t|D ]2\}}|D ]}	|	|r<|	| q0|dkrG|| dksM||d kr\tdd |D }||kr\ nq*|rtd}
|
	t
d| d| d|  t| dkr|
	t
d| dd   |
	t
d| dd   |D ]}|
	t
|j d|j  qt|dS )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    c                 S   s   g | ]}| qS r#   r#   )rw   md_classr#   r#   r$   r|   :  s    zmess_ratio.<locals>.<listcomp>r   rE   i       r   r      
r   c                 s   s    | ]}|j V  qd S r=   )r*   )rw   dtr#   r#   r$   ry   Q  s    zmess_ratio.<locals>.<genexpr>charset_normalizerzIMess-detector extended-analysis start. intermediary_mean_mess_ratio_calc=z mean_mess_ratio=z maximum_threshold=r   zStarting with: NzEnding with: iz:    )r   __subclasses__r   r   r   r%   r'   sumr   logr   	__class__r*   round)r   r   r   	detectorslengthmean_mess_ratio!intermediary_mean_mess_ratio_calcr   indexdetectorloggerr   r#   r#   r$   
mess_ratio2  sN   


r   N)r   F),	functoolsr   loggingr   typingr   r   constantr   r   r	   utilsr
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r5   rI   rT   r[   ra   rj   r   r   r   r0   r1   re   r3   r   r#   r#   r#   r$   <module>   sD    L"/%1iLH