a
    Sic                     @   s   d Z ddlZddlZddlm  mZ ddlm	Z	 ddl
mZ ddl
mZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ejZejZejZejZejZdZdZG dd dejj Z!G dd dej"Z#G dd dej$Z%dS )z'Keras index lookup preprocessing layer.    N)backend)base_layer_utils)base_preprocessing_layer)preprocessing_utils)layer_serialization)layer_utils)tf_utils)
tf_loggingvocabidf_weightsc                   @   s8   e Zd ZdZdd Zedd Zedd Zdd	 Zd
S )NullInitializerzEA placeholder initializer for restoring this layer from a SavedModel.c                 C   s   || _ || _dS )zConstruct a table initializer object.

        Args:
          key_dtype: Type of the table keys.
          value_dtype: Type of the table values.
        N)
_key_dtype_value_dtype)self	key_dtypevalue_dtype r   c/var/www/html/django/DPS/env/lib/python3.9/site-packages/keras/layers/preprocessing/index_lookup.py__init__/   s    zNullInitializer.__init__c                 C   s   | j S )zThe expected table key dtype.)r   r   r   r   r   r   9   s    zNullInitializer.key_dtypec                 C   s   | j S )zThe expected table value dtype.)r   r   r   r   r   r   >   s    zNullInitializer.value_dtypec                 C   s   dS )z$Returns the table initialization op.Nr   )r   tabler   r   r   
initializeC   s    zNullInitializer.initializeN)	__name__
__module____qualname____doc__r   propertyr   r   r   r   r   r   r   r   ,   s   


r   c                   @   s4   e Zd ZdZdd Zedd Zdd Zdd	 Zd
S )VocabWeightHandlerz;Adds the vocabulary as a layer weight during serialization.c                 C   s   || _ |j| _tj | _d S N)_layervocabulary_dtype_dtypetf
distributeget_strategy_distribute_strategy)r   Zlookup_layerr   r   r   r   K   s    zVocabWeightHandler.__init__c                 C   s   dS )N   r   r   r   r   r   num_tensorsP   s    zVocabWeightHandler.num_tensorsc                 C   s&   t |d | j}| j|| j_d S Nr   )r"   convert_to_tensorr!   r   _lookup_table_from_tokenslookup_table)r   weightstokensr   r   r   set_weightsT   s    zVocabWeightHandler.set_weightsc                 C   s"   | j jdd}t|| j}|gS )NF)include_special_tokens)r   get_vocabularyr"   r)   r!   )r   r-   r   r   r   get_tensorsX   s    zVocabWeightHandler.get_tensorsN)	r   r   r   r   r   r   r'   r.   r1   r   r   r   r   r   H   s   
r   c                       s   e Zd ZdZd: fdd	Zdd Zd	d
 Zd;ddZdd Zdd Z	 fddZ
d<ddZdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Zd*d+ Zd,d- Zd.d/ Zd0d1 Zd2d3 Zd4d5 Zed6d7 Zd8d9 Z  ZS )=IndexLookupa  Maps values from a vocabulary to integer indices.

    This layer translates a set of arbitrary hashables into an integer output
    via a table-based lookup, with optional out-of-vocabulary handling. This is
    the basis layer for both IntegerLookup and StringLookup; it holds the common
    logic but is not intended to be exported as part of the Keras API.

    Args:
      max_tokens: The maximum size of the vocabulary for this layer. If None,
        there is no cap on the size of the vocabulary. Note that this size
        includes the OOV and mask tokens.
      num_oov_indices: The number of out-of-vocabulary tokens to use. If this
        value is more than 1, OOV inputs are hashed to determine their OOV
        value. If this value is 0, OOV inputs will cause an error when calling
        the layer.
      mask_token: A token that represents masked inputs. When `output_mode` is
        `"int"`, the token is included in vocabulary and mapped to index 0. In
        other output modes, the token will not appear in the vocabulary and
        instances of the mask token in the input will be dropped. If set to
        None, no mask term will be added.
      oov_token: Only used when `invert` is True. The token to return for OOV
        indices.
      vocabulary: Optional. Either an array or a string path to a text file. If
        passing an array, can pass a tuple, list, 1D numpy array, or 1D tensor
        containing the vocbulary terms. If passing a file path, the file should
        contain one line per term in the vocabulary. If this argument is set,
        there is no need to `adapt` the layer.
      vocabulary_dtype: The dtype of the vocabulary terms. For example,
        `"int64"` or `"string"`.
      idf_weights: Only valid when `output_mode` is `"tf_idf"`. A tuple, list,
        1D numpy array, or 1D tensor or the same length as the vocabulary,
        containing the floating point inverse document frequency weights, which
        will be multiplied by per sample term counts for the final `tf_idf`
        weight. If the `vocabulary` argument is set, and `output_mode` is
        `"tf_idf"`, this argument must be supplied.
      invert: Only valid when `output_mode` is `"int"`. If True, this layer will
        map indices to vocabulary items instead of mapping vocabulary items to
        indices. Default to False.
      output_mode: Specification for the output of the layer. Defaults to
        `"int"`.  Values can be `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`,
        or `"tf_idf"` configuring the layer as follows:
          - `"int"`: Return the raw integer indices of the input tokens.
          - `"one_hot"`: Encodes each individual element in the input into an
            array the same size as the vocabulary, containing a 1 at the element
            index. If the last dimension is size 1, will encode on that
            dimension.  If the last dimension is not size 1, will append a new
            dimension for the encoded output.
          - `"multi_hot"`: Encodes each sample in the input into a single array
            the same size as the vocabulary, containing a 1 for each vocabulary
            term present in the sample. Treats the last dimension as the sample
            dimension, if input shape is (..., sample_length), output shape will
            be (..., num_tokens).
          - `"count"`: As `"multi_hot"`, but the int array contains a count of
            the number of times the token at that index appeared in the sample.
          - `"tf_idf"`: As `"multi_hot"`, but the TF-IDF algorithm is applied to
            find the value in each token slot.
      pad_to_max_tokens: Only valid when `output_mode` is `"multi_hot"`,
        `"count"`, or `"tf_idf"`. If True, the output will have its feature axis
        padded to `max_tokens` even if the number of unique tokens in the
        vocabulary is less than max_tokens, resulting in a tensor of shape
        [batch_size, max_tokens] regardless of vocabulary size. Defaults to
        False.
      sparse: Boolean. Only applicable to `"one_hot"`, `"multi_hot"`, `"count"`
        and `"tf-idf"` output modes. If True, returns a `SparseTensor` instead
        of a dense `Tensor`. Defaults to False.
    NFintc                    sb  |d ur|dkrt d| |r8|d u r8t d| |dk rNt d| |	dkrZt}	|	dkrft}	tj|	tttttf| jj	dd	 |r|	tkrt d
|	 |
r|	tkrt d|
 d|	 |d ur|	tkrt d| d|	 || _
|| _|| _|| _|| _|	| _|
| _|| _|| _d | _|| _|| _|d|d u| _|dd  |dd  d|vr||	tkrptjnt |d< t jf i | |	tkrt| jj s|d }t d| |r|	tkr| j!ntj| _"t| j| _#d}|}| j| _$ntt| j| _"|	tkr| j!ntj| _#|}| jtkr6dn| j#j%}| jdkrRd| _$n| jdkrj| & | _$nd| _$| jd urt'|| j"| _(t'|| j#| _)| jtkrtj*dg| +  d| jdd| _,| j,- | _.|d ur| /|| n
| 0 | _1| js^| 2t3| d tj4j5j6|tjdd| _7| jtkr^tj4j5j6|tjdd| _8tj*dtjdd| _9d S )Nr&   zBIf set, `max_tokens` must be greater than 1. Received: max_tokens=zJIf pad_to_max_tokens is True, must set `max_tokens`. Received: max_tokens=r   zP`num_oov_indices` must be greater than or equal to 0. Received: num_oov_indices=binaryztf-idfoutput_mode)allowable_strings
layer_namearg_namezK`output_mode` must be `'int'` when `invert` is true. Received: output_mode=zt`sparse` may only be true if `output_mode` is `'one_hot'`, `'multi_hot'`, `'count'` or `'tf_idf'`. Received: sparse=z and output_mode=zW`idf_weights` should only be set if `output_mode` is `'tf_idf'`. Received: idf_weights=has_input_vocabularyvocabulary_sizehas_static_tabledtypezMWhen `output_mode='int'`, `dtype` should be an integer type. Received: dtype=r   F)shaper<   	trainable)r   r   default_value)r<   r?   ):
ValueError	MULTI_HOTTF_IDFr   validate_string_argINTONE_HOTCOUNT	__class__r   invert
max_tokensnum_oov_indices
mask_token	oov_tokenr5   sparsepad_to_max_tokensr    _frozen_vocab_sizeinput_vocabularyinput_idf_weightspop_has_input_vocabularyr"   int64r   floatxsuperr   as_dtypecompute_dtype
is_integerr<   r   r   _default_valuemax_oov_start_indexr)   	_mask_key_mask_valueVariable_token_start_indexr   valueidf_weights_constset_vocabulary_uninitialized_lookup_tabler+   _add_trackabler   lookupexperimentalMutableHashTabletoken_countstoken_document_countsnum_documents)r   rJ   rK   rL   rM   r    
vocabularyr   rI   r5   rN   rO   kwargsinput_dtypeZmask_key
mask_valuerH   r   r   r      s   



zIndexLookup.__init__c                 C   s2   | j tkr|S | jr| jn| j}t|d |gS r(   )r5   rE   rO   rJ   rP   r"   TensorShape)r   input_shapedepthr   r   r   compute_output_shape^  s    
z IndexLookup.compute_output_shapec                 C   s0   |  |j }| jr| jn| j}tj||dS )N)r>   r<   )ru   r>   as_listrI   r    rY   r"   
TensorSpec)r   
input_specoutput_shapeoutput_dtyper   r   r   compute_output_signatureh  s    z$IndexLookup.compute_output_signatureTc                    s   j  dkrg g  }}n<j  \}}jr6||fn||f\}}||  }}tfddt||  fddt	
 D }jdurjtkrj|d< |s| d }|S )aq  Returns the current vocabulary of the layer.

        Args:
          include_special_tokens: If True, the returned vocabulary will include
            mask and OOV tokens, and a term's index in the vocabulary will equal
            the term's index when calling the layer. If False, the returned
            vocabulary will not include any mask or OOV tokens.
        r   c                      s    j S r   )rM   r   r   r   r   <lambda>      z,IndexLookup.get_vocabulary.<locals>.<lambda>c                    s   g | ]} | qS r   r   ).0x)rg   r   r   
<listcomp>  r}   z.IndexLookup.get_vocabulary.<locals>.<listcomp>N)r+   sizeexportrI   _tensor_vocab_to_numpynumpycollectionsdefaultdictzipranger:   rL   r5   rE   ra   )r   r/   r
   indiceskeysvaluesr   )rg   r   r   r0   o  s     
zIndexLookup.get_vocabularyc                 C   s8   t  r"t| j  |   S | j |   S dS )zGets the current size of the layer's vocabulary.

        Returns:
          The integer size of the vocabulary, including optional mask and oov
          indices.
        N)r"   executing_eagerlyr3   r+   r   r   ra   r   r   r   r   r:     s    zIndexLookup.vocabulary_sizec                 C   s   t d |  S )Nz5vocab_size is deprecated, please use vocabulary_size.)loggingwarningr:   r   r   r   r   
vocab_size  s    
zIndexLookup.vocab_sizec                    sd   | j | j| j| j| j| j| j| jt	| j
| jt	| jd}t  }tt| t|  S )N)rI   rJ   rK   rM   rL   r5   rN   rO   rm   r    r   )rI   rJ   rK   rM   rL   r5   rN   rO   utilslistify_tensorsrQ   r    rR   rW   
get_configdictlistitems)r   configbase_configrq   r   r   r     s    


zIndexLookup.get_configc                 C   sh  | j tkr(|dur(td| j  d| t|trptjj|sNtd	|| j tkr`td| 
|| _dS t st|st|rtd	| jj| jt|r| |}nt|ttfrt|}t|r| }nt|ttfrt|}|jdkrtd	||  }|  }| jg| | jg| j  }t||d| }|rh||d }n|}| |}|rtd		|| jdur| j|v rt || jkd
 }	td| d| j d|	 | jdur"| j!r"| j|v r"t || jkd
 }
td| d| j d|
 |t"| }| j#durX|| j#krXtd	|| j#| $|| _| j tkrd|du rtdt"|t"|krtd	t"|t"|| %|}|j&dkrtd	t'||rd}d}n|}t(|}d}| j)r| j#dur| j#| t"| }nd}tj*|||fd||fd}tj+|| j,d}| j-.| | j-/ | _0dS )a  Sets vocabulary (and optionally document frequency) data for this layer.

        This method sets the vocabulary and idf weights for this layer directly,
        instead of analyzing a dataset through `adapt`. It should be used
        whenever the vocab (and optionally document frequency) information is
        already known.  If vocabulary data is already present in the layer, this
        method will replace it.

        Args:
          vocabulary: Either an array or a string path to a text file. If
            passing an array, can pass a tuple, list, 1D numpy array, or 1D
            tensor containing the vocbulary terms. If passing a file path, the
            file should contain one line per term in the vocabulary.
          idf_weights: A tuple, list, 1D numpy array, or 1D tensor of inverse
            document frequency weights with equal length to vocabulary. Must be
            set if `output_mode` is `"tf_idf"`. Should not be set otherwise.

        Raises:
          ValueError: If there are too many inputs, the inputs do not match, or
            input data is missing.
          RuntimeError: If the vocabulary cannot be set when this function is
            called. This happens when `"multi_hot"`, `"count"`, and `"tf_idf"`
            modes, if `pad_to_max_tokens` is False and the layer itself has
            already been called.
          RuntimeError: If a tensor vocabulary is passed outside of eager
            execution.
        NzU`idf_weights` should only be set if output_mode is `'tf_idf'`. Received: output_mode=z and idf_weights=z"Vocabulary file {} does not exist.zGoutput_mode `'tf_idf'` does not support loading a vocabulary from file.zCannot set a tensor vocabulary on {} layer {} when not executing eagerly. Create this layer or call `set_vocabulary` outside of any `tf.function`s and with eager execution enabled.r   z.Cannot set an empty vocabulary, you passed {}.zmThe passed vocabulary has at least one repeated term. Please uniquify your dataset. The repeated terms are {}r=   a  Found reserved mask token at unexpected location in `vocabulary`. Note that passed `vocabulary` does not need to include the OOV and mask tokens. Either remove all mask and OOV tokens, or include them only at the start of the vocabulary in precisely this order: z. Received: mask_token=z at vocabulary index a  Found reserved OOV token at unexpected location in `vocabulary`. Note that passed `vocabulary` does not need to include the OOV and mask tokens. Either remove all mask and OOV tokens, or include them only at the start of the vocabulary in precisely this order: z. Received: oov_token=zpAttempted to set a vocabulary larger than the maximum vocab size. Passed vocab size is {}, max vocab size is {}.z2`idf_weights` must be set if output_mode is TF_IDFzb`idf_weights` must be the same length as vocabulary. len(idf_weights) is {}, len(vocabulary) is {}r&   z4TF-IDF data must be a 1-index array, but received {}constantconstant_valuesr<   )1r5   rC   rA   
isinstancestrr"   iogfileexistsformat_lookup_table_from_filer+   r   	is_tensorRuntimeErrorrH   r   namer   r   tuplenparrayr   r   r]   ra   rL   rM   rK   array_equal_find_repeated_tokensargwhererI   lenrJ   r*   _convert_to_ndarrayndimtypeaveragerO   padr)   rY   r   assignrb   rc   )r   rm   r   Z	oov_starttoken_startZspecial_tokensZfound_special_tokensr-   Zrepeated_tokensZ
mask_indexZ	oov_indexnew_vocab_sizeZfront_paddingZfront_padding_valueZback_padding_valueZback_paddingr,   r   r   r   rd     s   













	
zIndexLookup.set_vocabularyc                 C   s   | j rtd| jjtj|| jd}|jj	dkr@t
|d}|jj	dkrXt
|d}| |\}}| j||| j|  | jtkrt
dd |}| |\}}| j||| j|  t|r| j|  n| jt
j|t
jdd  d S )Nz^Cannot adapt {} layer after setting a static vocabulary via init argument or `set_vocabulary`.r   r   r&   c                 S   s   t | d S r(   )r"   unique)r   r   r   r   r|     r}   z*IndexLookup.update_state.<locals>.<lambda>)out_type)rT   rA   r   rH   r   r   ensure_tensorr    r>   rankr"   expand_dims_num_tokensrj   insertrg   r5   rC   map_fnrk   r   	is_raggedrl   
assign_addnrowsrU   )r   datar-   countsZdeduped_doc_dataZ
doc_countsr   r   r   update_statem  s4    

zIndexLookup.update_statec                 C   s  | j st| j dr2| jtkr.| j | _	d S | j
d urV| jt| j
g| j | jd urz| jt| jg| j | j \}}t| | fd d d }|  }| jr| j| }|d | }t||}| || _| jtkr| j|}| || j}t|| j}tj||  dggt|d}| j rn| jd urntj|d| jt| ggdd}| j!| | j | _	| "  d S )Nr   r=   r   )#rT   r"   equalrj   r   r5   rC   r   rb   rc   rL   remover)   r    rM   r   r   lexsortr   ra   rJ   gatherr*   r+   rk   rg   _inverse_document_frequencyrl   castrY   r   reduce_meanrO   r   reset_state)r   r-   r   sorted_indicesr   Zmax_learned_tokensrk   r   r   r   r   finalize_state  sP    


 
zIndexLookup.finalize_statec                 C   sP   | j r
d S | j| j d  | jtkrL| j| j d  | jd d S r(   )	rT   rj   r   r   r5   rC   rk   rl   r   r   r   r   r   r     s    
zIndexLookup.reset_statec                 C   s   |    tj|| jd}|j}|jjdkr6| |d}t|r\t	
|j| |j|j}n&t|rxt	j| j|}n
| |}| jtkr|jdkrt	|d}|S | jr| jn| j}| jtkr| jnd }tj|| j|| j| j|dS )Nr   r   r=   )r5   rt   r<   rN   r   )_maybe_freeze_vocab_sizer   r   r   r>   r   _expand_dimsr   	is_sparser"   SparseTensorr   _lookup_denser   dense_shaper   raggedmap_flat_valuesr5   rE   squeezerO   rJ   rP   rC   rc   encode_categorical_inputsrY   rN   )r   inputsoriginal_shapelookupsrt   r   r   r   r   call  s@    





zIndexLookup.callc           
      C   s`  t  r$t|r$t j|| jd}n| j|}| jdurXt 	|| j
}t || j|}| jrb|S g }| jdkrt t 	|d}t ||}t jd|f}t t 	t |d|g}|| n`| jdkr&| jjrt j|| j}nt jj|| jd}||   }t 	|| j}	t |	||}t | t |W  d   S 1 sR0    Y  dS )zALookup table values for a dense Tensor, handling masking and OOV.r   Nr   r=   zwWhen `num_oov_indices=0` all inputs should be in vocabulary, found OOV values {}, consider setting `num_oov_indices=1`.r&   )num_buckets)r"   r   r   is_keras_tensor
zeros_liker   r+   rg   rL   r   r^   wherer_   rI   rK   	gather_ndstringsr   Assertr   appendr   rZ   mathfloormodto_hash_bucket_fastr]   r[   control_dependenciesidentity)
r   r   r   Zmask_locationsZlookup_checksZoov_indicesZ
oov_inputsmsg	assertionZoov_locationsr   r   r   r     s:    

zIndexLookup._lookup_densec                 C   sJ   t  . t| j| j}t j|| jW  d    S 1 s<0    Y  d S r   )r"   
init_scoper   r   r   rg   StaticHashTabler[   )r   initializerr   r   r   re   2  s    
z'IndexLookup._uninitialized_lookup_tablec           	      C   s   t   |  }|t | }| jr,| jn| j}t j|||d}| jrP||fn||f\}}t j	||| j| j}t j
|| jW  d    S 1 s0    Y  d S )Nr   )r"   r   ra   r   rI   r   r   r   rg   KeyValueTensorInitializerr   r[   )	r   r-   r   Z	token_endindices_dtyper   r   r   r   r   r   r   r*   7  s    
z%IndexLookup._lookup_table_from_tokensc              	   C   s   | j rtjjj}tjjj}ntjjj}tjjj}t @ tjj|| j|| j	|| 
 d}tj|| jW  d    S 1 s~0    Y  d S )N)filenamer   	key_indexr   value_indexvalue_index_offset)rI   r"   rg   TextFileIndexLINE_NUMBER
WHOLE_LINEr   TextFileInitializerr   r   ra   r   r[   )r   r   r   r   r   r   r   r   r   G  s    



z#IndexLookup._lookup_table_from_filec                 C   s   t |ttfrt|S |S r   )r   r   r   r   r   )r   r   r   r   r   r   Y  s    zIndexLookup._convert_to_ndarrayc                 C   s(   t |rtj||S t||S d S r   )r   r   r"   rN   r   )r   r   axisr   r   r   r   \  s    
zIndexLookup._expand_dimsc                 C   s   | j d ur| jtkrdS dS )Nr&   r   )rL   r5   rE   r   r   r   r   r]   b  s    zIndexLookup._oov_start_indexc                 C   s   |   | j S r   )r]   rK   r   r   r   r   ra   g  s    zIndexLookup._token_start_indexc                 C   s   | j tks| jrd S t 0 t s6td| j |  }W d    n1 sR0    Y  || 	 krztd| j n*| j
d ur|| j
krtd| j | j
||| _
d S )Nz<When using `output_mode={}` eager execution must be enabled.zWhen using `output_mode={}` and `pad_to_max_tokens=False`, you must set the layer's vocabulary before calling it. Either pass a `vocabulary` argument to the layer, or call `adapt` with some sample data.zWhen using `output_mode={}` and `pad_to_max_tokens=False`, the vocabulary size cannot be changed after the layer is called. Vocab size is {}, new vocab size is {})r5   rE   rO   r"   r   r   r   r   r:   ra   rP   )r   r   r   r   r   r   j  s4    
&
z$IndexLookup._maybe_freeze_vocab_sizec                 C   s8   t |}t|t|kr0dd t| D S g S dS )z+Return all repeated tokens in a vocabulary.c                 S   s   g | ]\}}|d kr|qS )r&   r   )r~   itemcountr   r   r   r     s   z5IndexLookup._find_repeated_tokens.<locals>.<listcomp>N)setr   r   Counterr   )r   rm   Zvocabulary_setr   r   r   r     s    z!IndexLookup._find_repeated_tokensc                 C   sP   t |r|j}n t |r$|j}nt|dg}tj|tjd\}}}||fS )z?Count the number of tokens in a ragged, sparse or dense tensor.r=   )out_idx)	r   r   r   r   flat_valuesr"   reshapeunique_with_countsrU   )r   r   r   r-   _r   r   r   r   r     s    

zIndexLookup._num_tokensc                 C   s   t jd|d|   S )a  Computes the inverse-document-frequency (IDF) component of "tf_idf".

        Uses the default weighting scheme described in
        https://en.wikipedia.org/wiki/Tf%E2%80%93idf.

        Args:
          token_document_counts: An array of the # of documents each token
            appears in.
          num_documents: An int representing the total number of documents

        Returns:
          An array of "inverse document frequency" weights.
        r&   )r"   r   log)r   rk   rl   r   r   r   r     s    z'IndexLookup._inverse_document_frequencyc                 C   s
   t | S r   )r   VocabularySavedModelSaverr   r   r   r   _trackable_saved_model_saver  s    z(IndexLookup._trackable_saved_model_saverc                 C   s   |  S )z3Converts a tensor vocabulary to a numpy vocabulary.)r   )r   rm   r   r   r   r     s    z"IndexLookup._tensor_vocab_to_numpy)NNFr3   FF)T)N) r   r   r   r   r   ru   r{   r0   r:   r   r   rd   r   r   r   r   r   re   r*   r   r   r   r]   ra   r   r   r   r   r   r  r   __classcell__r   r   rq   r   r2   _   sH   J       <


 ;$;+0
r2   )&r   r   r   r   tensorflow.compat.v2compatv2r"   kerasr   keras.enginer   r   keras.layers.preprocessingr   r   keras.saving.saved_modelr   keras.utilsr   r   tensorflow.python.platformr	   r   rE   rB   rF   rG   rC   Z_VOCAB_NAMEZ_IDF_WEIGHTS_NAMErg   r   r   TrackableWeightHandlerr   PreprocessingLayerr2   r   r   r   r   <module>   s*   