# -*- coding: utf-8 -*-
"""Utilities for text input preprocessing."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import json
import string
import sys
import warnings
from collections import OrderedDict
from collections import defaultdict
from hashlib import md5

import numpy as np
from six.moves import range
from six.moves import zip

if sys.version_info < (3,):
    maketrans = string.maketrans
else:
    maketrans = str.maketrans


def text_to_word_sequence(text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True, split=" "):
    """Converts a text to a sequence of words (or tokens).

    # Arguments
        text: Input text (string).
        filters: list (or concatenation) of characters to filter out, such as
            punctuation. Default: ``!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n``,
            includes basic punctuation, tabs, and newlines.
        lower: boolean. Whether to convert the input to lowercase.
        split: str. Separator for word splitting.

    # Returns
        A list of words (or tokens).
    """
    if lower:
        text = text.lower()

    if sys.version_info < (3,):
        if isinstance(text, unicode):
            translate_map = {ord(c): unicode(split) for c in filters}
            text = text.translate(translate_map)
        elif len(split) == 1:
            translate_map = maketrans(filters, split * len(filters))
            text = text.translate(translate_map)
        else:
            for c in filters:
                text = text.replace(c, split)
    else:
        translate_dict = {c: split for c in filters}
        translate_map = maketrans(translate_dict)
        text = text.translate(translate_map)

    seq = text.split(split)
    return [i for i in seq if i]
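
# A minimal usage sketch (illustrative, not part of the original module):
# with the default arguments, punctuation is stripped and the text is
# lowercased before splitting on spaces.
#
#     >>> text_to_word_sequence('The cat sat on the mat.')
#     ['the', 'cat', 'sat', 'on', 'the', 'mat']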


def one_hot(text, n,
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
            lower=True,
            split=' '):
    """One-hot encodes a text into a list of word indexes of size n.

    This is a wrapper to the `hashing_trick` function using `hash` as the
    hashing function; unicity of word to index mapping non-guaranteed.

    # Arguments
        text: Input text (string).
        n: int. Size of vocabulary.
        filters: list (or concatenation) of characters to filter out, such as
            punctuation. Default: ``!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n``,
            includes basic punctuation, tabs, and newlines.
        lower: boolean. Whether to set the text to lowercase.
        split: str. Separator for word splitting.

    # Returns
        List of integers in [1, n]. Each integer encodes a word
        (unicity non-guaranteed).
    """
    return hashing_trick(text, n,
                         hash_function=hash,
                         filters=filters,
                         lower=lower,
                         split=split)


def hashing_trick(text, n,
                  hash_function=None,
                  filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                  lower=True,
                  split=' '):
    """Converts a text to a sequence of indexes in a fixed-size hashing space.

    # Arguments
        text: Input text (string).
        n: Dimension of the hashing space.
        hash_function: defaults to python `hash` function, can be 'md5' or
            any function that takes a string as input and returns an int.
            Note that 'hash' is not a stable hashing function, so
            it is not consistent across different runs, while 'md5'
            is a stable hashing function.
        filters: list (or concatenation) of characters to filter out, such as
            punctuation. Default: ``!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n``,
            includes basic punctuation, tabs, and newlines.
        lower: boolean. Whether to set the text to lowercase.
        split: str. Separator for word splitting.

    # Returns
        A list of integer word indices (unicity non-guaranteed).

    `0` is a reserved index that won't be assigned to any word.

    Two or more words may be assigned to the same index, due to possible
    collisions by the hashing function.
    The [probability](
        https://en.wikipedia.org/wiki/Birthday_problem#Probability_table)
    of a collision is in relation to the dimension of the hashing space and
    the number of distinct objects.
    """
    if hash_function is None:
        hash_function = hash
    elif hash_function == 'md5':
        def hash_function(w):
            return int(md5(w.encode()).hexdigest(), 16)

    seq = text_to_word_sequence(text,
                                filters=filters,
                                lower=lower,
                                split=split)
    return [(hash_function(w) % (n - 1) + 1) for w in seq]
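
# A minimal usage sketch (illustrative, not part of the original module):
# `one_hot` and `hashing_trick` map words directly to integers in [1, n]
# without building a vocabulary, so distinct words may collide. Passing
# hash_function='md5' makes the indices stable across runs:
#
#     >>> seq = hashing_trick('the cat sat on the mat', 20,
#     ...                     hash_function='md5')
#     >>> seq[0] == seq[4]  # both occurrences of 'the' get the same index
#     True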


class Tokenizer(object):
    """Text tokenization utility class.

    This class allows to vectorize a text corpus, by turning each
    text into either a sequence of integers (each integer being the index
    of a token in a dictionary) or into a vector where the coefficient
    for each token could be binary, based on word count, based on tf-idf...

    # Arguments
        num_words: the maximum number of words to keep, based
            on word frequency. Only the most common `num_words-1` words will
            be kept.
        filters: a string where each element is a character that will be
            filtered from the texts. The default is all punctuation, plus
            tabs and line breaks, minus the `'` character.
        lower: boolean. Whether to convert the texts to lowercase.
        split: str. Separator for word splitting.
        char_level: if True, every character will be treated as a token.
        oov_token: if given, it will be added to word_index and used to
            replace out-of-vocabulary words during `texts_to_sequences` calls.

    By default, all punctuation is removed, turning the texts into
    space-separated sequences of words
    (words may include the `'` character). These sequences are then
    split into lists of tokens. They will then be indexed or vectorized.

    `0` is a reserved index that won't be assigned to any word.
    """

    def __init__(self, num_words=None,
                 filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                 lower=True,
                 split=' ',
                 char_level=False,
                 oov_token=None,
                 document_count=0,
                 **kwargs):
        # Legacy support for the renamed `nb_words` argument.
        if 'nb_words' in kwargs:
            warnings.warn('The `nb_words` argument in `Tokenizer` '
                          'has been renamed `num_words`.')
            num_words = kwargs.pop('nb_words')
        if kwargs:
            raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))

        self.word_counts = OrderedDict()
        self.word_docs = defaultdict(int)
        self.filters = filters
        self.split = split
        self.lower = lower
        self.num_words = num_words
        self.document_count = document_count
        self.char_level = char_level
        self.oov_token = oov_token
        self.index_docs = defaultdict(int)
        self.word_index = {}
        self.index_word = {}

    def fit_on_texts(self, texts):
        """Updates internal vocabulary based on a list of texts.

        In the case where texts contains lists,
        we assume each entry of the lists to be a token.

        Required before using `texts_to_sequences` or `texts_to_matrix`.

        # Arguments
            texts: can be a list of strings,
                a generator of strings (for memory-efficiency),
                or a list of list of strings.
        """
        for text in texts:
            self.document_count += 1
            if self.char_level or isinstance(text, list):
                if self.lower:
                    if isinstance(text, list):
                        text = [text_elem.lower() for text_elem in text]
                    else:
                        text = text.lower()
                seq = text
            else:
                seq = text_to_word_sequence(text,
                                            self.filters,
                                            self.lower,
                                            self.split)
            for w in seq:
                if w in self.word_counts:
                    self.word_counts[w] += 1
                else:
                    self.word_counts[w] = 1
            for w in set(seq):
                # In how many documents each word occurs.
                self.word_docs[w] += 1

        wcounts = list(self.word_counts.items())
        wcounts.sort(key=lambda x: x[1], reverse=True)
        # Force the OOV token to index 1 if it exists.
        if self.oov_token is None:
            sorted_voc = []
        else:
            sorted_voc = [self.oov_token]
        sorted_voc.extend(wc[0] for wc in wcounts)

        # Note that index 0 is reserved and never assigned to a word.
        self.word_index = dict(
            zip(sorted_voc, list(range(1, len(sorted_voc) + 1))))

        self.index_word = {c: w for w, c in self.word_index.items()}

        for w, c in list(self.word_docs.items()):
            self.index_docs[self.word_index[w]] = c
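
    # A minimal usage sketch (illustrative, not part of the original module):
    # after fitting, `word_index` maps each word to its frequency rank.
    # Index 0 is reserved, and the OOV token, if given, is forced to index 1.
    #
    #     >>> t = Tokenizer(oov_token='<unk>')
    #     >>> t.fit_on_texts(['the cat sat on the mat'])
    #     >>> t.word_index['<unk>'], t.word_index['the']
    #     (1, 2)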
zTokenizer.fit_on_textsc                 C   sD   |  j t|7  _ |D ](}t|}|D ]}| j|  d7  < q&qdS )a%  Updates internal vocabulary based on a list of sequences.

        Required before using `sequences_to_matrix`
        (if `fit_on_texts` was never called).

        # Arguments
            sequences: A list of sequence.
                A "sequence" is a list of integer word indices.
        """
        self.document_count += len(sequences)
        for seq in sequences:
            seq = set(seq)
            for i in seq:
                self.index_docs[i] += 1
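
    # A minimal usage sketch (illustrative, not part of the original module):
    # fitting on integer sequences only updates the document counts needed
    # by `sequences_to_matrix`.
    #
    #     >>> t = Tokenizer(num_words=5)
    #     >>> t.fit_on_sequences([[1, 2], [1, 3]])
    #     >>> t.document_count
    #     2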
zTokenizer.fit_on_sequencesc                 C   s   t | |S )aN  Transforms each text in texts to a sequence of integers.

        Only top `num_words-1` most frequent words will be taken into account.
        Only words known by the tokenizer will be taken into account.

        # Arguments
            texts: A list of texts (strings).

        # Returns
            A list of sequences.
        """
        return list(self.texts_to_sequences_generator(texts))

    def texts_to_sequences_generator(self, texts):
        """Transforms each text in `texts` to a sequence of integers.

        Each item in texts can also be a list,
        in which case we assume each item of that list to be a token.

        Only top `num_words-1` most frequent words will be taken into account.
        Only words known by the tokenizer will be taken into account.

        # Arguments
            texts: A list of texts (strings).

        # Yields
            Yields individual sequences.
        """
        num_words = self.num_words
        oov_token_index = self.word_index.get(self.oov_token)
        for text in texts:
            if self.char_level or isinstance(text, list):
                if self.lower:
                    if isinstance(text, list):
                        text = [text_elem.lower() for text_elem in text]
                    else:
                        text = text.lower()
                seq = text
            else:
                seq = text_to_word_sequence(text,
                                            self.filters,
                                            self.lower,
                                            self.split)
            vect = []
            for w in seq:
                i = self.word_index.get(w)
                if i is not None:
                    if num_words and i >= num_words:
                        if oov_token_index is not None:
                            vect.append(oov_token_index)
                    else:
                        vect.append(i)
                elif self.oov_token is not None:
                    vect.append(oov_token_index)
            yield vect
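
    # A minimal usage sketch (illustrative, not part of the original module):
    # words never seen during fitting are silently dropped unless an
    # `oov_token` was given.
    #
    #     >>> t = Tokenizer()
    #     >>> t.fit_on_texts(['the cat sat on the mat'])
    #     >>> t.texts_to_sequences(['the dog sat'])
    #     [[1, 3]]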

    def sequences_to_texts(self, sequences):
        """Transforms each sequence into a text (string).

        Only top `num_words-1` most frequent words will be taken into account.
        Only words known by the tokenizer will be taken into account.

        # Arguments
            sequences: A list of sequences (list of integers).

        # Returns
            A list of texts (strings)
        """
        return list(self.sequences_to_texts_generator(sequences))

    def sequences_to_texts_generator(self, sequences):
        """Transforms each sequence in `sequences` to a text (string).

        Each sequence has to be a list of integers.
        In other words, `sequences` should be a list of sequences.

        Only top `num_words-1` most frequent words will be taken into account.
        Only words known by the tokenizer will be taken into account.

        # Arguments
            sequences: A list of sequences.

        # Yields
            Yields individual texts.
        """
        num_words = self.num_words
        oov_token_index = self.word_index.get(self.oov_token)
        for seq in sequences:
            vect = []
            for num in seq:
                word = self.index_word.get(num)
                if word is not None:
                    if num_words and num >= num_words:
                        if oov_token_index is not None:
                            vect.append(self.index_word[oov_token_index])
                    else:
                        vect.append(word)
                elif self.oov_token is not None:
                    vect.append(self.index_word[oov_token_index])
            vect = ' '.join(vect)
            yield vect
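
    # A minimal usage sketch (illustrative, not part of the original module):
    # `sequences_to_texts` inverts the mapping, joining looked-up words
    # with spaces.
    #
    #     >>> t = Tokenizer()
    #     >>> t.fit_on_texts(['the cat sat on the mat'])
    #     >>> t.sequences_to_texts([[1, 3]])
    #     ['the sat']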

    def texts_to_matrix(self, texts, mode='binary'):
        """Converts a list of texts to a Numpy matrix.

        # Arguments
            texts: list of strings.
            mode: one of "binary", "count", "tfidf", "freq".

        # Returns
            A Numpy matrix.
        )mode)rX   sequences_to_matrix)rB   rS   rc   rU   r   r   r   texts_to_matrixt  s    

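
    # A minimal usage sketch (illustrative, not part of the original module):
    # the result has one row per text and one column per word index, i.e.
    # shape (len(texts), num_words).
    #
    #     >>> t = Tokenizer(num_words=4)
    #     >>> t.fit_on_texts(['the cat sat', 'the dog ran'])
    #     >>> t.texts_to_matrix(['the cat']).shape
    #     (1, 4)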
zTokenizer.texts_to_matrixc                 C   sR  | j s&| jrt| jd }q,tdn| j }|dkrB| jsBtdtt||f}t|D ]\}}|sjq\tt	}|D ]}||krqv||  d7  < qvt
| D ]\}}	|dkr|	|| |< q|dkr|	t| || |< q|dkrd|| |< q|dkr@dt|	 }
td| jd| j|d   }|
| || |< qtd	|qq\|S )
a  Converts a list of sequences into a Numpy matrix.

        # Arguments
            sequences: list of sequences
                (a sequence is a list of integer word indices).
            mode: one of "binary", "count", "tfidf", "freq"

        # Returns
            A Numpy matrix.

        # Raises
            ValueError: In case of invalid `mode` argument,
                or if the Tokenizer has not been fit on text data.
        """
        if not self.num_words:
            if self.word_index:
                num_words = len(self.word_index) + 1
            else:
                raise ValueError('Specify a dimension (`num_words` argument), '
                                 'or fit on some text data first.')
        else:
            num_words = self.num_words

        if mode == 'tfidf' and not self.document_count:
            raise ValueError('Fit the Tokenizer on some data '
                             'before using tfidf mode.')

        x = np.zeros((len(sequences), num_words))
        for i, seq in enumerate(sequences):
            if not seq:
                continue
            counts = defaultdict(int)
            for j in seq:
                if j >= num_words:
                    continue
                counts[j] += 1
            for j, c in list(counts.items()):
                if mode == 'count':
                    x[i][j] = c
                elif mode == 'freq':
                    x[i][j] = c / len(seq)
                elif mode == 'binary':
                    x[i][j] = 1
                elif mode == 'tfidf':
                    # Use weighting scheme 2 in
                    # https://en.wikipedia.org/wiki/Tf%E2%80%93idf
                    tf = 1 + np.log(c)
                    idf = np.log(1 + self.document_count /
                                 (1 + self.index_docs.get(j, 0)))
                    x[i][j] = tf * idf
                else:
                    raise ValueError('Unknown vectorization mode:', mode)
        return x
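
    # Note on the 'tfidf' mode above (summary, not part of the original
    # module): with c the in-row count of word j, N the number of documents
    # seen during fitting, and df_j the number of documents containing j,
    # each cell is
    #
    #     tf = 1 + log(c)
    #     idf = log(1 + N / (1 + df_j))
    #     x[i][j] = tf * idf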
zTokenizer.sequences_to_matrixc                 C   sh   t | j}t | j}t | j}t | j}t | j}| j| j| j	| j
| j| j| j|||||dS )a:  Returns the tokenizer configuration as Python dictionary.
        The word count dictionaries used by the tokenizer get serialized
        into plain JSON, so that the configuration can be read by other
        projects.

        # Returns
            A Python dictionary with the tokenizer configuration.
        """
        json_word_counts = json.dumps(self.word_counts)
        json_word_docs = json.dumps(self.word_docs)
        json_index_docs = json.dumps(self.index_docs)
        json_word_index = json.dumps(self.word_index)
        json_index_word = json.dumps(self.index_word)

        return {
            'num_words': self.num_words,
            'filters': self.filters,
            'lower': self.lower,
            'split': self.split,
            'char_level': self.char_level,
            'oov_token': self.oov_token,
            'document_count': self.document_count,
            'word_counts': json_word_counts,
            'word_docs': json_word_docs,
            'index_docs': json_index_docs,
            'index_word': json_index_word,
            'word_index': json_word_index
        }

    def to_json(self, **kwargs):
        """Returns a JSON string containing the tokenizer configuration.
        To load a tokenizer from a JSON string, use
        `keras.preprocessing.text.tokenizer_from_json(json_string)`.

        # Arguments
            **kwargs: Additional keyword arguments
                to be passed to `json.dumps()`.

        # Returns
            A JSON string containing the tokenizer configuration.
        """
        config = self.get_config()
        tokenizer_config = {
            'class_name': self.__class__.__name__,
            'config': config
        }
        return json.dumps(tokenizer_config, **kwargs)


def tokenizer_from_json(json_string):
    """Parses a JSON tokenizer configuration string and returns a
    tokenizer instance.

    # Arguments
        json_string: JSON string encoding a tokenizer configuration.

    # Returns
        A Keras Tokenizer instance
    """
    tokenizer_config = json.loads(json_string)
    config = tokenizer_config.get('config')

    word_counts = json.loads(config.pop('word_counts'))
    word_docs = json.loads(config.pop('word_docs'))
    index_docs = json.loads(config.pop('index_docs'))
    # Integer keys were serialized as strings by `json.dumps()`;
    # convert them back to integers.
    index_docs = {int(k): v for k, v in index_docs.items()}
    index_word = json.loads(config.pop('index_word'))
    index_word = {int(k): v for k, v in index_word.items()}
    word_index = json.loads(config.pop('word_index'))

    tokenizer = Tokenizer(**config)
    tokenizer.word_counts = word_counts
    tokenizer.word_docs = word_docs
    tokenizer.index_docs = index_docs
    tokenizer.word_index = word_index
    tokenizer.index_word = index_word

    return tokenizer
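

if __name__ == '__main__':
    # Minimal end-to-end sketch (illustrative, not part of the original
    # module): fit a tokenizer, vectorize a tiny corpus, and round-trip
    # the tokenizer through its JSON serialization.
    corpus = ['The cat sat on the mat.', 'The dog ate my homework.']

    tokenizer = Tokenizer(num_words=10, oov_token='<unk>')
    tokenizer.fit_on_texts(corpus)
    print(tokenizer.word_index)
    print(tokenizer.texts_to_sequences(corpus))
    print(tokenizer.texts_to_matrix(corpus, mode='tfidf'))

    restored = tokenizer_from_json(tokenizer.to_json())
    assert (restored.texts_to_sequences(corpus) ==
            tokenizer.texts_to_sequences(corpus))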