a
    SicJ                     @   s  d dl Z d dlZd dlmZ d dlmZmZ d dlmZm	Z	m
Z
mZmZ ddlmZmZmZmZ ddlmZmZmZmZ ddlmZ dd	lmZmZ dd
lmZmZmZm Z m!Z!m"Z"m#Z# e $dZ%e & Z'e'(e )d de*e+e+e,ee
e-  ee
e-  e.e.ed	ddZ/d e	e+e+e,ee
e-  ee
e-  e.e.ed	ddZ0d!de+e+e,ee
e-  ee
e-  e.e.ed	ddZ1d"de+e+e,ee
e-  ee
e-  e.edddZ2dS )#    N)PathLike)basenamesplitext)AnyBinaryIOListOptionalSet   )coherence_ratioencoding_languagesmb_encoding_languagesmerge_coherence_ratios)IANA_SUPPORTEDTOO_BIG_SEQUENCETOO_SMALL_SEQUENCETRACE)
mess_ratio)CharsetMatchCharsetMatches)any_specified_encodingcut_sequence_chunks	iana_nameidentify_sig_or_bomis_cp_similaris_multi_byte_encodingshould_strip_sig_or_bomcharset_normalizerz)%(asctime)s | %(levelname)s | %(message)s      皙?TF)		sequencessteps
chunk_size	thresholdcp_isolationcp_exclusionpreemptive_behaviourexplainreturnc           -      C   s  t | ttfs tdt| |r>tj}tt	 t
t t| }	|	dkrtd |rvtt	 t
|prtj tt| dddg dgS |durttd	d
| dd |D }ng }|durttdd
| dd |D }ng }|	|| krttd|||	 d}|	}|dkr:|	| |k r:t|	| }t| tk }
t| tk}|
rlttd|	 n|rttd|	 g }|rt| nd}|dur|| ttd| t }g }g }d}d}d}t }t| \}}|dur|| ttdt|| |d d|vr.|d |t D ]}|rP||vrPq6|rd||v rdq6||v rrq6|| d}||k}|ot|}|dv r|sttd| q6zt|}W n, t t!fy   ttd| Y q6Y n0 zr|r<|du r<t"|du r | dtd n| t|td |d n&t"|du rL| n| t|d |d}W nb t#t$fy } zDt |t$sttd|t"| || W Y d}~q6W Y d}~n
d}~0 0 d}|D ]}t%||rd} qq|rttd|| q6t&|sdnt||	t|	| }|oD|duoDt||	k } | rZttd| tt|d }!t'|!d }!d}"d}#g }$g }%ztt(| ||||||||	D ]V}&|$|& |%t)|&| |%d! |kr|"d7 }"|"|!ks|r|du r qqW nB t#y: } z(ttd"|t"| |!}"d}#W Y d}~n
d}~0 0 |#s|r|sz| td#d j*|d$d% W nR t#y } z8ttd&|t"| || W Y d}~q6W Y d}~n
d}~0 0 |%rt+|%t|% nd}'|'|ks|"|!krf|| ttd'||"t,|'d( d)d* |dd|fv r6|#s6t| ||dg |}(||krN|(}n|dkr^|(}n|(}q6ttd+|t,|'d( d)d* |st-|})nt.|})|)rttd,|t"|) g }*|dkr|$D ],}&t/|&d-|)rd.|)nd}+|*|+ qt0|*},|,rttd/|,| |t| ||'||,| ||ddfv r|'d-k rtd0| |rrtt	 t
| t|| g  S ||kr6td1| |rtt	 t
| t|| g  S q6t|dkrz|s|s|rttd2 |rtd3|j1 || nd|r&|du sJ|r@|r@|j2|j2ksJ|dur`td4 || n|rztd5 || |rtd6|3 j1t|d  n
td7 |rtt	 t
| |S )8ae  
    Given a raw bytes sequence, return the best possibles charset usable to render str objects.
    If there is no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocs of 512o each to assess the mess and coherence of a given sequence.
    And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.

    The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
    but never take it for granted. Can improve the performance.

    You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
    purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
    toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
    Custom logging format and handler can be set manually.
    z4Expected object of type bytes or bytearray, got: {0}r   z<Encoding detection on empty bytes, assuming utf_8 intention.utf_8g        F Nz`cp_isolation is set. use this flag for debugging purpose. limited list of encoding allowed : %s.z, c                 S   s   g | ]}t |d qS Fr   .0cp r1   R/var/www/html/django/DPS/env/lib/python3.9/site-packages/charset_normalizer/api.py
<listcomp>[       zfrom_bytes.<locals>.<listcomp>zacp_exclusion is set. use this flag for debugging purpose. limited list of encoding excluded : %s.c                 S   s   g | ]}t |d qS r,   r-   r.   r1   r1   r2   r3   f   r4   z^override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.r
   z>Trying to detect encoding from a tiny portion of ({}) byte(s).zIUsing lazy str decoding because the payload is quite large, ({}) byte(s).z@Detected declarative mark in sequence. Priority +1 given for %s.zIDetected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.ascii>   utf_32utf_16z[Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.z2Encoding %s does not provide an IncrementalDecoderg    A)encodingz9Code page %s does not fit given bytes sequence at ALL. %sTzW%s is deemed too similar to code page %s and was consider unsuited already. Continuing!zpCode page %s is a multi byte encoding table and it appear that at least one character was encoded using n-bytes.      zaLazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %sg     j@strict)errorsz^LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %szc%s was excluded because of initial chaos probing. Gave up %i time(s). Computed mean chaos is %f %%.d      )ndigitsz=%s passed initial chaos probing. Mean measured chaos is %f %%z&{} should target any language(s) of {}g?,z We detected language {} using {}z.Encoding detection: %s is most likely the one.zoEncoding detection: %s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.zONothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.z7Encoding detection: %s will be used as a fallback matchz:Encoding detection: utf_8 will be used as a fallback matchz:Encoding detection: ascii will be used as a fallback matchz]Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.z=Encoding detection: Unable to determine any suitable charset.)4
isinstance	bytearraybytes	TypeErrorformattypeloggerlevel
addHandlerexplain_handlersetLevelr   lendebugremoveHandlerloggingWARNINGr   r   logjoinintr   r   r   appendsetr   r   addr   r   ModuleNotFoundErrorImportErrorstrUnicodeDecodeErrorLookupErrorr   rangemaxr   r   decodesumroundr   r   r   r   r8   fingerprintbest)-r!   r"   r#   r$   r%   r&   r'   r(   Zprevious_logger_levellengthZis_too_small_sequenceZis_too_large_sequenceZprioritized_encodingsZspecified_encodingZtestedZtested_but_hard_failureZtested_but_soft_failureZfallback_asciiZfallback_u8Zfallback_specifiedresultsZsig_encodingZsig_payloadZencoding_ianaZdecoded_payloadZbom_or_sig_availableZstrip_sig_or_bomZis_multi_byte_decodereZsimilar_soft_failure_testZencoding_soft_failedZr_Zmulti_byte_bonusZmax_chunk_gave_upZearly_stop_countZlazy_str_hard_failureZ	md_chunksZ	md_ratioschunkZmean_mess_ratioZfallback_entryZtarget_languagesZ	cd_ratiosZchunk_languagesZcd_ratios_mergedr1   r1   r2   
from_bytes#   s   














$






$











	






rh   )	fpr"   r#   r$   r%   r&   r'   r(   r)   c              	   C   s   t |  |||||||S )z
    Same thing than the function from_bytes but using a file pointer that is already ready.
    Will not close the file pointer.
    )rh   read)ri   r"   r#   r$   r%   r&   r'   r(   r1   r1   r2   from_fp  s    rk   zPathLike[Any])	pathr"   r#   r$   r%   r&   r'   r(   r)   c           	   
   C   sD   t | d&}t||||||||W  d   S 1 s60    Y  dS )z
    Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
    Can raise IOError.
    rbN)openrk   )	rl   r"   r#   r$   r%   r&   r'   r(   ri   r1   r1   r2   	from_path  s    ro   )rl   r"   r#   r$   r%   r&   r'   r)   c              	   C   s   t dt t| ||||||}t| }tt|}	t|dkrNtd	||
 }
|	d  d|
j 7  < td	t| |d|	d}||
  W d   n1 s0    Y  |
S )	zi
    Take a (text-based) file path and try to create another file next to it, this time using UTF-8.
    z2normalize is deprecated and will be removed in 3.0r   z;Unable to normalize "{}", no encoding charset seems to fit.-z{}r+   wbN)warningswarnDeprecationWarningro   r   listr   rM   IOErrorrF   rc   r8   rn   rZ   replacerS   writeoutput)rl   r"   r#   r$   r%   r&   r'   re   filenameZtarget_extensionsresultri   r1   r1   r2   	normalize  s:    
,r|   )r   r   r    NNTF)r   r   r    NNTF)r   r   r    NNTF)r   r   r    NNT)3rP   rr   osr   Zos.pathr   r   typingr   r   r   r   r	   cdr   r   r   r   constantr   r   r   r   mdr   modelsr   r   utilsr   r   r   r   r   r   r   	getLoggerrH   StreamHandlerrK   setFormatter	FormatterrD   rT   floatrZ   boolrh   rk   ro   r|   r1   r1   r1   r2   <module>   s   $
       

   G       

       

      

