
    BqgDN                       d dl mZ d dlmZ d dlmZ ddlmZmZm	Z	 ddl
mZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ  G d d          Z G d	 d
e          Z G d de          Z G d de          Z G d de          Z  G d de          Z! G d de          Z" G d de          Z# G d de          Z$ G d de          Z% ed          d-d"            Z& ed#          	 d.d/d+            Z'd,S )0    )annotations)	lru_cache)	getLogger   )COMMON_SAFE_ASCII_CHARACTERSTRACEUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuated	is_arabicis_arabic_isolated_formis_case_variableis_cjkis_emoticon	is_hangulis_hiraganais_katakanais_latinis_punctuationis_separator	is_symbolis_thaiis_unprintableremove_accentunicode_rangec                  B    e Zd ZdZddZddZdd	Zedd            ZdS )MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    	characterstrreturnboolc                    t           )z@
        Determine if given character should be fed in.
        NotImplementedErrorselfr   s     G/var/www/html/env/lib/python3.11/site-packages/charset_normalizer/md.pyeligiblezMessDetectorPlugin.eligible&   
     "!    Nonec                    t           )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        r"   r$   s     r&   feedzMessDetectorPlugin.feed,   s
    
 "!r)   c                    t           )zB
        Permit to reset the plugin to the initial state.
        r"   r%   s    r&   resetzMessDetectorPlugin.reset3   r(   r)   floatc                    t           )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        r"   r.   s    r&   ratiozMessDetectorPlugin.ratio9   s
     "!r)   Nr   r   r   r    r   r   r   r*   r   r*   r   r0   )	__name__
__module____qualname____doc__r'   r,   r/   propertyr2    r)   r&   r   r       sz         
" " " "" " " "" " " " " " " X" " "r)   r   c                  F    e Zd ZddZddZddZdd	Zedd            ZdS ) TooManySymbolOrPunctuationPluginr   r*   c                L    d| _         d| _        d| _        d | _        d| _        d S )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_char_frenzy_symbol_in_wordr.   s    r&   __init__z)TooManySymbolOrPunctuationPlugin.__init__C   s0    '("#%&04!,1###r)   r   r   r    c                *    |                                 S Nisprintabler$   s     r&   r'   z)TooManySymbolOrPunctuationPlugin.eligibleK       $$&&&r)   c                (   | xj         dz  c_         || j        k    ro|t          vrft          |          r| xj        dz  c_        nF|                                du r0t          |          r!t          |          du r| xj        dz  c_        || _        d S )Nr   F   )	rB   rC   r   r   r@   isdigitr   r   rA   r$   s     r&   r,   z%TooManySymbolOrPunctuationPlugin.feedN   s    " 222!===i(( (''1,'''!!##u,,i(( -	**e33""a'""$-!!!r)   c                0    d| _         d| _        d| _        d S Nr   )r@   rB   rA   r.   s    r&   r/   z&TooManySymbolOrPunctuationPlugin.reset`   s     "# !r)   r0   c                ^    | j         dk    rdS | j        | j        z   | j         z  }|dk    r|ndS )Nr           333333?)rB   r@   rA   )r%   ratio_of_punctuations     r&   r2   z&TooManySymbolOrPunctuationPlugin.ratioe   sK     A%%3 #d&88!'" (<s'B'B##Kr)   Nr5   r3   r4   r6   	r7   r8   r9   rE   r'   r,   r/   r;   r2   r<   r)   r&   r>   r>   B   s        2 2 2 2' ' ' '. . . .$   
 L L L XL L Lr)   r>   c                  F    e Zd ZddZddZddZdd	Zedd            ZdS )TooManyAccentuatedPluginr   r*   c                "    d| _         d| _        d S rO   rB   _accentuated_countr.   s    r&   rE   z!TooManyAccentuatedPlugin.__init__r   s    %&'(r)   r   r   r    c                *    |                                 S rG   )isalphar$   s     r&   r'   z!TooManyAccentuatedPlugin.eligiblev   s      """r)   c                h    | xj         dz  c_         t          |          r| xj        dz  c_        d S d S Nr   )rB   r
   rY   r$   s     r&   r,   zTooManyAccentuatedPlugin.feedy   sJ    ")$$ 	)##q(####	) 	)r)   c                "    d| _         d| _        d S rO   rX   r.   s    r&   r/   zTooManyAccentuatedPlugin.reset   s     !"#r)   r0   c                N    | j         dk     rdS | j        | j         z  }|dk    r|ndS )N   rQ   gffffff?rX   )r%   ratio_of_accentuations     r&   r2   zTooManyAccentuatedPlugin.ratio   s<     1$$3'+'>AV'V(=(E(E$$3Nr)   Nr5   r3   r4   r6   rT   r<   r)   r&   rV   rV   q   s        ) ) ) )# # # #) ) ) )$ $ $ $ O O O XO O Or)   rV   c                  F    e Zd ZddZddZddZdd	Zedd            ZdS )UnprintablePluginr   r*   c                "    d| _         d| _        d S rO   )_unprintable_countrB   r.   s    r&   rE   zUnprintablePlugin.__init__   s    '(%&r)   r   r   r    c                    dS NTr<   r$   s     r&   r'   zUnprintablePlugin.eligible       tr)   c                d    t          |          r| xj        dz  c_        | xj        dz  c_        d S r]   )r   re   rB   r$   s     r&   r,   zUnprintablePlugin.feed   s@    )$$ 	)##q(##"r)   c                    d| _         d S rO   )re   r.   s    r&   r/   zUnprintablePlugin.reset   s    "#r)   r0   c                @    | j         dk    rdS | j        dz  | j         z  S )Nr   rQ   r`   )rB   re   r.   s    r&   r2   zUnprintablePlugin.ratio   s+     A%%3'!+t/DDDr)   Nr5   r3   r4   r6   rT   r<   r)   r&   rc   rc      s        ' ' ' '   # # # #
$ $ $ $ E E E XE E Er)   rc   c                  F    e Zd ZddZddZddZdd	Zedd            ZdS )SuspiciousDuplicateAccentPluginr   r*   c                0    d| _         d| _        d | _        d S rO   _successive_countrB   _last_latin_characterr.   s    r&   rE   z(SuspiciousDuplicateAccentPlugin.__init__   s     &'%&15"""r)   r   r   r    c                H    |                                 ot          |          S rG   )r[   r   r$   s     r&   r'   z(SuspiciousDuplicateAccentPlugin.eligible   s!      "":x	':'::r)   c                l   | xj         dz  c_         | j        t          |          rt          | j                  rr|                                r)| j                                        r| xj        dz  c_        t          |          t          | j                  k    r| xj        dz  c_        || _        d S r]   )rB   rq   r
   isupperrp   r   r$   s     r&   r,   z$SuspiciousDuplicateAccentPlugin.feed   s    "&2y)) 3t9:: 3   "" ,t'A'I'I'K'K ,&&!+&&Y''=9S+T+TTT&&!+&&%."""r)   c                0    d| _         d| _        d | _        d S rO   ro   r.   s    r&   r/   z%SuspiciousDuplicateAccentPlugin.reset   s     !" !%)"""r)   r0   c                @    | j         dk    rdS | j        dz  | j         z  S )Nr   rQ   rL   )rB   rp   r.   s    r&   r2   z%SuspiciousDuplicateAccentPlugin.ratio   s+     A%%3&*d.CCCr)   Nr5   r3   r4   r6   rT   r<   r)   r&   rm   rm      s        6 6 6 6; ; ; ;/ / / /* * * *
 D D D XD D Dr)   rm   c                  F    e Zd ZddZddZddZdd	Zedd            ZdS )SuspiciousRanger   r*   c                0    d| _         d| _        d | _        d S rO   )"_suspicious_successive_range_countrB   _last_printable_seenr.   s    r&   rE   zSuspiciousRange.__init__   s     78/%&04!!!r)   r   r   r    c                *    |                                 S rG   rH   r$   s     r&   r'   zSuspiciousRange.eligible   rJ   r)   c                D   | xj         dz  c_         |                                st          |          s	|t          v r	d | _        d S | j        	|| _        d S t          | j                  }t          |          }t          ||          r| xj        dz  c_        || _        d S r]   )rB   isspacer   r   r{   r    is_suspiciously_successive_rangerz   )r%   r   unicode_range_aunicode_range_bs       r&   r,   zSuspiciousRange.feed   s    " 	i((	 888(,D%F$,(1D%F&3D4M&N&N&3I&>&>+O_MM 	933q833$-!!!r)   c                0    d| _         d| _        d | _        d S rO   )rB   rz   r{   r.   s    r&   r/   zSuspiciousRange.reset   s      !23/$(!!!r)   r0   c                D    | j         dk    rdS | j        dz  | j         z  }|S )N   rQ   rL   )rB   rz   )r%   ratio_of_suspicious_range_usages     r&   r2   zSuspiciousRange.ratio   s8     B&&3 3a7!2"' /.r)   Nr5   r3   r4   r6   rT   r<   r)   r&   rx   rx      s        5 5 5 5
' ' ' '. . . ..) ) ) )
 / / / X/ / /r)   rx   c                  F    e Zd ZddZddZddZdd	Zedd            ZdS )SuperWeirdWordPluginr   r*   c                    d| _         d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _	        d S )Nr   F )
_word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchrB   _bad_character_count_buffer_buffer_accent_count_buffer_glyph_countr.   s    r&   rE   zSuperWeirdWordPlugin.__init__   sW     !$%() */!). %&)*!)*!()   r)   r   r   r    c                    dS rg   r<   r$   s     r&   r'   zSuperWeirdWordPlugin.eligible
  rh   r)   c                   |                                 r| xj        |z  c_        t          |          r| xj        dz  c_        | j        du r|t          |          du st          |          r\t          |          du rKt          |          du r:t          |          du r)t          |          du rt          |          du rd| _        t          |          s<t          |          s-t          |          st          |          st          |          r| xj        dz  c_        d S | j        sd S |                                st          |          st          |          r| j        r| xj        dz  c_        t!          | j                  }| xj        |z  c_        |dk    r| j        |z  dk    rd| _        nt          | j        d                   rW| j        d                                         r8t)          d | j        D                       du r| xj        dz  c_        d| _        n"| j        dk    rd| _        | xj        dz  c_        |dk    ri| j        rbd	 t-          | j        t/          d
|                    D             }d}|rt!          |          |z  dk    rd}|s| xj        dz  c_        d| _        | j        r9| xj        dz  c_        | xj        t!          | j                  z  c_        d| _        d| _        d| _        d
| _        d
| _        d S |dvr>|                                du r*t7          |          rd| _        | xj        |z  c_        d S d S d S d S )Nr   FT   g      ?c              3  >   K   | ]}|                                 V  d S rG   rt   ).0_s     r&   	<genexpr>z,SuperWeirdWordPlugin.feed.<locals>.<genexpr>7  s*      >>AAIIKK>>>>>>r)      c                @    g | ]\  }}|                                 |S r<   r   )r   cis      r&   
<listcomp>z-SuperWeirdWordPlugin.feed.<locals>.<listcomp>?  s:     " " "1yy{{"" " "r)   r   rR   r   >   r   -<=>|~)r[   r   r
   r   r   r   r   r   r   r   r   r   r~   r   r   r   lenrB   r   rt   allr   zipranger   r   rM   r   )r%   r   buffer_lengthcamel_case_dstprobable_camel_caseds        r&   r,   zSuperWeirdWordPlugin.feed  s
    	LLI%LLi(( /))Q.))(E11i((E11^I5N5N19%%..i((E11	**e33	**e33I&&%//+/(y!!.Y''. y)). y))	.
 9%%. ((A-((F| 	F5	&#1)#<#<5	&@LY@W@W5	&l5	& !!$T\!2!2M!!]2!!!!,}<CC04D-- #4<#344	2R(0022	2 >>>>>>>%GG,,1,,04D---2204D-,,1,,""t'?"" " #DL%=2I2I J J" " "
 .3$! 0s>':':]'Jc'Q'Q+/(+ 5,,1,,04D-( 2$$)$$))S->->>)),1)',D$DL()D%'(D$$$@@@!!##u,,)$$ - )-D%LLI%LLLL A@,,,,r)   c                v    d| _         d| _        d| _        d| _        d| _        d| _        d| _        d| _        d S )Nr   Fr   )r   r   r   r   r   rB   r   r   r.   s    r&   r/   zSuperWeirdWordPlugin.reset^  sG    $)!#(   !$%!#$   r)   r0   c                P    | j         dk    r| j        dk    rdS | j        | j        z  S )N
   r   rQ   )r   r   r   rB   r.   s    r&   r2   zSuperWeirdWordPlugin.ratioh  s3    r!!d&>!&C&C3(4+@@@r)   Nr5   r3   r4   r6   rT   r<   r)   r&   r   r      s        * * * *   O& O& O& O&b% % % % A A A XA A Ar)   r   c                  J    e Zd ZdZddZddZdd	Zdd
Zedd            Z	dS )CjkInvalidStopPluginu   
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    r   r*   c                "    d| _         d| _        d S rO   _wrong_stop_count_cjk_character_countr.   s    r&   rE   zCjkInvalidStopPlugin.__init__v  s    &')*!!!r)   r   r   r    c                    dS rg   r<   r$   s     r&   r'   zCjkInvalidStopPlugin.eligiblez  rh   r)   c                t    |dv r| xj         dz  c_         d S t          |          r| xj        dz  c_        d S d S )N>      丄   丅r   )r   r   r   r$   s     r&   r,   zCjkInvalidStopPlugin.feed}  sZ    &&""a'""F) 	+%%*%%%%	+ 	+r)   c                "    d| _         d| _        d S rO   r   r.   s    r&   r/   zCjkInvalidStopPlugin.reset  s    !"$%!!!r)   r0   c                :    | j         dk     rdS | j        | j         z  S )N   rQ   )r   r   r.   s    r&   r2   zCjkInvalidStopPlugin.ratio  s&    $r))3%(AAAr)   Nr5   r3   r4   r6   )
r7   r8   r9   r:   rE   r'   r,   r/   r;   r2   r<   r)   r&   r   r   p  s         
+ + + +   + + + +& & & & B B B XB B Br)   r   c                  F    e Zd ZddZddZddZdd	Zedd            ZdS )ArchaicUpperLowerPluginr   r*   c                h    d| _         d| _        d| _        d| _        d| _        d | _        d| _        d S )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalrB   _last_alpha_seen_current_ascii_onlyr.   s    r&   rE   z ArchaicUpperLowerPlugin.__init__  s?    	45,23*890%&,0)-   r)   r   r   r    c                    dS rg   r<   r$   s     r&   r'   z ArchaicUpperLowerPlugin.eligible  rh   r)   c                   |                                 ot          |          }|du }|r| j        dk    rt| j        dk    r4|                                du r| j        du r| xj        | j        z  c_        d| _        d| _        d | _        d| _        | xj	        dz  c_	        d| _        d S | j        du r|
                                du rd| _        | j        |                                r| j                                        s-|                                rB| j                                        r)| j        du r| xj        dz  c_        d| _        nd| _        nd| _        | xj	        dz  c_	        | xj        dz  c_        || _        d S )NFr   @   r   TrL   )r[   r   r   rM   r   r   r   r   r   rB   isasciirt   islower)r%   r   is_concerned	chunk_seps       r&   r,   zArchaicUpperLowerPlugin.feed  s    ((**J/?	/J/J E)	 	=AA4::%%''500,5588688 23D.34D0$(D!DI!!Q&!!'+D$F#t++	0A0A0C0Cu0L0L',D$ ,!!## 	"(=(E(E(G(G 	"!!##	"(,(=(E(E(G(G	" 9$$66!;66 %DII $DII!	",,1,, )r)   c                h    d| _         d| _        d| _        d| _        d | _        d| _        d| _        d S )Nr   FT)rB   r   r   r   r   r   r   r.   s    r&   r/   zArchaicUpperLowerPlugin.reset  s?     !/0,-.*340 $	#'   r)   r0   c                :    | j         dk    rdS | j        | j         z  S )Nr   rQ   )rB   r   r.   s    r&   r2   zArchaicUpperLowerPlugin.ratio  s&     A%%37$:OOOr)   Nr5   r3   r4   r6   rT   r<   r)   r&   r   r     s        . . . .   (* (* (* (*T( ( ( ( P P P XP P Pr)   r   c                  F    e Zd ZddZddZddZdd	Zedd            ZdS )ArabicIsolatedFormPluginr   r*   c                "    d| _         d| _        d S rO   rB   _isolated_form_countr.   s    r&   rE   z!ArabicIsolatedFormPlugin.__init__  s    %&)*!!!r)   c                "    d| _         d| _        d S rO   r   r.   s    r&   r/   zArabicIsolatedFormPlugin.reset  s     !$%!!!r)   r   r   r    c                     t          |          S rG   )r   r$   s     r&   r'   z!ArabicIsolatedFormPlugin.eligible  s    ###r)   c                h    | xj         dz  c_         t          |          r| xj        dz  c_        d S d S r]   )rB   r   r   r$   s     r&   r,   zArabicIsolatedFormPlugin.feed  sJ    ""9-- 	+%%*%%%%	+ 	+r)   r0   c                >    | j         dk     rdS | j        | j         z  }|S )Nr`   rQ   r   )r%   isolated_form_usages     r&   r2   zArabicIsolatedFormPlugin.ratio  s,     1$$3%)%>AV%V""r)   Nr5   r3   r4   r6   )	r7   r8   r9   rE   r/   r'   r,   r;   r2   r<   r)   r&   r   r     s        + + + +& & & &$ $ $ $+ + + + # # # X# # #r)   r      )maxsizer   
str | Noner   r   r    c                   | |dS | |k    rdS d| v rd|v rdS d| v sd|v rdS d| v sd|v r
d| v sd|v rdS |                      d          |                     d          }}|D ]}|t          v r||v r dS | dv |dv }}|s|r
d	| v sd	|v rdS |r|rdS d
| v sd
|v rd	| v sd	|v rdS | dk    s|dk    rdS d	| v sd	|v s| dv r&|dv r"d| v sd|v rdS d| v sd|v rdS | dk    s|dk    rdS dS )za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    NTFLatin	Emoticons	Combining )HiraganaKatakanaCJKHangulzBasic Latin)r   r   PunctuationForms)splitr	   )r   r   keywords_range_akeywords_range_belrange_a_jp_charsrange_b_jp_charss          r&   r   r     s    /"9t/))u/!!g&@&@uo%%)G)Gu 	?""g&@&@&&+*H*Hu 	c""c"" '
   000!!!55 "
 	
	

 	33 ' 	 ,   E_$<$<u , u?""h/&A&AO##u'?'?5m++-/O/O5 	  E_$<$<333777O++}/O/O5o%%O)C)C5m++-/O/O54r)   i   皙?Fdecoded_sequencer   maximum_thresholdr0   debugc           	     Z   d t                                           D             }t          |           dz   }d}|dk     rd}n|dk    rd}nd}t          | d	z   t	          |                    D ]m\  }}|D ],}	|	                    |          r|	                    |           -|d
k    r	||z  d
k    s	||dz
  k    r!t          d |D                       }||k    r nn|rt          d          }
|
	                    t          d| d| d|            t          |           dk    rL|
	                    t          d| dd                     |
	                    t          d| dd                     |D ],}|
	                    t          |j         d|j                    -t          |d          S )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    c                "    g | ]} |            S r<   r<   )r   md_classs     r&   r   zmess_ratio.<locals>.<listcomp>I  s+     + + +

+ + +r)   r   rQ   i       r   r      
r   c              3  $   K   | ]}|j         V  d S rG   )r2   )r   dts     r&   r   zmess_ratio.<locals>.<genexpr>`  s$      !?!?r"(!?!?!?!?!?!?r)   charset_normalizerzIMess-detector extended-analysis start. intermediary_mean_mess_ratio_calc=z mean_mess_ratio=z maximum_threshold=r   zStarting with: NzEnding with: iz:    )r   __subclasses__r   r   r   r'   r,   sumr   logr   	__class__r2   round)r   r   r   	detectorslengthmean_mess_ratio!intermediary_mean_mess_ratio_calcr   indexdetectorloggerr   s               r&   
mess_ratior  A  s1   + +#5#D#D#F#F+ + +I &''!+F O||13))	4,.)),/) 04 7vGG  	5! 	) 	)H  ++ )i((( AII%"CCqHHfqj  !!?!?Y!?!?!???O"333 =/00

51R5 5et5 5!25 5	
 	
 	
   2%%JJuG0@"0EGGHHHJJuG.>suu.EGGHHH 	= 	=BJJu;;;;<<<<!$$$r)   N)r   r   r   r   r   r    )r   F)r   r   r   r0   r   r    r   r0   )(
__future__r   	functoolsr   loggingr   constantr   r   r	   utilsr
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r>   rV   rc   rm   rx   r   r   r   r   r   r  r<   r)   r&   <module>r     s   " " " " " "                     
                                     *" " " " " " " "D,L ,L ,L ,L ,L'9 ,L ,L ,L^O O O O O1 O O O6E E E E E* E E E0"D "D "D "D "D&8 "D "D "DJ./ ./ ./ ./ ./( ./ ./ ./bsA sA sA sA sA- sA sA sAlB B B B B- B B B>IP IP IP IP IP0 IP IP IPX# # # # #1 # # #8 4F F F FR 4IN4% 4% 4% 4% 4% 4% 4%r)   