a
    +=icA                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlZddl	m
Z
 ddlZdd
dZdddZdddZdd ZG dd deZdd ZdS )z+Utilities for preprocessing sequence data.
    )absolute_import)division)print_functionN)rangeint32pre        c              	   C   s  t | dstdt| }g }d}d}	| D ]^}
z6|t|
 |	rbt|
rbt|
jdd }d}	W q* ty   tdt|
 Y q*0 q*|du rt	|}t
|tjpt
|tj}t|tjr|tkr|std	|t|tj||f| ||d
}
t| D ]\}}t|sq|dkr4|| d }n$|dkrL|d| }ntd| tj||d
}|jdd |krtd|jdd ||f |dkr||
|dt|f< n.|dkr||
|t| df< ntd| q|
S )a  Pads sequences to the same length.

    This function transforms a list of
    `num_samples` sequences (lists of integers)
    into a 2D Numpy array of shape `(num_samples, num_timesteps)`.
    `num_timesteps` is either the `maxlen` argument if provided,
    or the length of the longest sequence otherwise.

    Sequences that are shorter than `num_timesteps`
    are padded with `value` at the beginning or the end
    if padding='post.

    Sequences longer than `num_timesteps` are truncated
    so that they fit the desired length.
    The position where padding or truncation happens is determined by
    the arguments `padding` and `truncating`, respectively.

    Pre-padding is the default.

    # Arguments
        sequences: List of lists, where each element is a sequence.
        maxlen: Int, maximum length of all sequences.
        dtype: Type of the output sequences.
            To pad sequences with variable length strings, you can use `object`.
        padding: String, 'pre' or 'post':
            pad either before or after each sequence.
        truncating: String, 'pre' or 'post':
            remove values from sequences larger than
            `maxlen`, either at the beginning or at the end of the sequences.
        value: Float or String, padding value.

    # Returns
        x: Numpy array with shape `(len(sequences), maxlen)`

    # Raises
        ValueError: In case of invalid values for `truncating` or `padding`,
            or in case of invalid shape for a `sequences` entry.
    __len__z`sequences` must be iterable. T   NFz=`sequences` must be a list of iterables. Found non-iterable: zo`dtype` {} is not compatible with `value`'s type: {}
You should set `dtype=object` for variable length strings.)dtyper   postz#Truncating type "%s" not understoodzQShape of sample %s of sequence at position %s is different from expected shape %sz Padding type "%s" not understood)hasattr
ValueErrorlenappendnpZasarrayshape	TypeErrorstrmaxZ
issubdtypeZstr_Zunicode_
isinstancesixstring_typesobjectformattypefull	enumerate)	sequencesmaxlenr   paddingZ
truncatingvalueZnum_sampleslengthsZsample_shapeflagxZis_dtype_stridxstruncr
   r
   m/home/droni/.local/share/virtualenvs/DPS-5Je3_V2c/lib/python3.9/site-packages/keras_preprocessing/sequence.pypad_sequences   sZ    (






r*   h㈵>c                 C   sV   d}t | }d|d< |t ||  d dd|   }|| }t d|t | S )a9  Generates a word rank-based probabilistic sampling table.

    Used for generating the `sampling_table` argument for `skipgrams`.
    `sampling_table[i]` is the probability of sampling
    the word i-th most common word in a dataset
    (more common words should be sampled less frequently, for balance).

    The sampling probabilities are generated according
    to the sampling distribution used in word2vec:

    ```
    p(word) = (min(1, sqrt(word_frequency / sampling_factor) /
        (word_frequency / sampling_factor)))
    ```

    We assume that the word frequencies follow Zipf's law (s=1) to derive
    a numerical approximation of frequency(rank):

    `frequency(rank) ~ 1/(rank * (log(rank) + gamma) + 1/2 - 1/(12*rank))`
    where `gamma` is the Euler-Mascheroni constant.

    # Arguments
        size: Int, number of possible words to sample.
        sampling_factor: The sampling factor in the word2vec formula.

    # Returns
        A 1D Numpy array of length `size` where the ith entry
        is the probability that a word of rank i should be sampled.
    gX9v?r   r   g      ?      ?g      (@)r   arangelogminimumsqrt)sizeZsampling_factorgammaZrankZinv_fqfr
   r
   r)   make_sampling_tableq   s    
"r4      r,   TFc                    sr  g }g }	t | D ]\}
}|sq|dur8|| t k r8qtd|
| }tt| |
| d }t||D ]F}||
krf| | }|sqf|||g |r|	ddg qf|	d qfq|dkr&tt|	| }dd |D t | fddt|D 7 }|r|	ddgg| 7 }	n|	dg| 7 }	|rj|du rBt	dd}t
| t| t
| t|	 ||	fS )a{  Generates skipgram word pairs.

    This function transforms a sequence of word indexes (list of integers)
    into tuples of words of the form:

    - (word, word in the same window), with label 1 (positive samples).
    - (word, random word from the vocabulary), with label 0 (negative samples).

    Read more about Skipgram in this gnomic paper by Mikolov et al.:
    [Efficient Estimation of Word Representations in
    Vector Space](http://arxiv.org/pdf/1301.3781v3.pdf)

    # Arguments
        sequence: A word sequence (sentence), encoded as a list
            of word indices (integers). If using a `sampling_table`,
            word indices are expected to match the rank
            of the words in a reference dataset (e.g. 10 would encode
            the 10-th most frequently occurring token).
            Note that index 0 is expected to be a non-word and will be skipped.
        vocabulary_size: Int, maximum possible word index + 1
        window_size: Int, size of sampling windows (technically half-window).
            The window of a word `w_i` will be
            `[i - window_size, i + window_size+1]`.
        negative_samples: Float >= 0. 0 for no negative (i.e. random) samples.
            1 for same number as positive samples.
        shuffle: Whether to shuffle the word couples before returning them.
        categorical: bool. if False, labels will be
            integers (eg. `[0, 1, 1 .. ]`),
            if `True`, labels will be categorical, e.g.
            `[[1,0],[0,1],[0,1] .. ]`.
        sampling_table: 1D array of size `vocabulary_size` where the entry i
            encodes the probability to sample a word of rank i.
        seed: Random seed.

    # Returns
        couples, labels: where `couples` are int pairs and
            `labels` are either 0 or 1.

    # Note
        By convention, index 0 in the vocabulary is
        a non-word and will be skipped.
    Nr   r   c                 S   s   g | ]}|d  qS )r   r
   ).0cr
   r
   r)   
<listcomp>       zskipgrams.<locals>.<listcomp>c                    s,   g | ]$}|t   td  d  gqS )r   )r   randomrandint)r6   ivocabulary_sizewordsr
   r)   r8      s   g    cA)r   r:   r   minr   r   r   intshuffler;   seed)sequencer>   Zwindow_sizeZnegative_samplesrB   ZcategoricalZsampling_tablerC   Zcoupleslabelsr<   ZwiZwindow_startZ
window_endjZwjZnum_negative_samplesr
   r=   r)   	skipgrams   sJ    -






rG   c                 C   sF   g g  }}t ||D ](\}}t|| k r|| || q||fS )aJ  Removes sequences that exceed the maximum length.

    # Arguments
        maxlen: Int, maximum length of the output sequences.
        seq: List of lists, where each sublist is a sequence.
        label: List where each element is an integer.

    # Returns
        new_seq, new_label: shortened lists for `seq` and `label`.
    )zipr   r   )r    seqlabelZnew_seqZ	new_labelr%   yr
   r
   r)   _remove_long_seq   s    

rL   c                   @   s:   e Zd ZdZdddZd	d
 Zdd Zdd Zdd ZdS )TimeseriesGeneratora+
  Utility class for generating batches of temporal data.

    This class takes in a sequence of data-points gathered at
    equal intervals, along with time series parameters such as
    stride, length of history, etc., to produce batches for
    training/validation.

    # Arguments
        data: Indexable generator (such as list or Numpy array)
            containing consecutive data points (timesteps).
            The data should be at 2D, and axis 0 is expected
            to be the time dimension.
        targets: Targets corresponding to timesteps in `data`.
            It should have same length as `data`.
        length: Length of the output sequences (in number of timesteps).
        sampling_rate: Period between successive individual timesteps
            within sequences. For rate `r`, timesteps
            `data[i]`, `data[i-r]`, ... `data[i - length]`
            are used for create a sample sequence.
        stride: Period between successive output sequences.
            For stride `s`, consecutive output samples would
            be centered around `data[i]`, `data[i+s]`, `data[i+2*s]`, etc.
        start_index: Data points earlier than `start_index` will not be used
            in the output sequences. This is useful to reserve part of the
            data for test or validation.
        end_index: Data points later than `end_index` will not be used
            in the output sequences. This is useful to reserve part of the
            data for test or validation.
        shuffle: Whether to shuffle output samples,
            or instead draw them in chronological order.
        reverse: Boolean: if `true`, timesteps in each output sample will be
            in reverse chronological order.
        batch_size: Number of timeseries samples in each batch
            (except maybe the last one).

    # Returns
        A [Sequence](/utils/#sequence) instance.

    # Examples

    ```python
    from keras.preprocessing.sequence import TimeseriesGenerator
    import numpy as np

    data = np.array([[i] for i in range(50)])
    targets = np.array([[i] for i in range(50)])

    data_gen = TimeseriesGenerator(data, targets,
                                   length=10, sampling_rate=2,
                                   batch_size=2)
    assert len(data_gen) == 20

    batch_0 = data_gen[0]
    x, y = batch_0
    assert np.array_equal(x,
                          np.array([[[0], [2], [4], [6], [8]],
                                    [[1], [3], [5], [7], [9]]]))
    assert np.array_equal(y,
                          np.array([[10], [11]]))
    ```
    r   r   NF   c                 C   s   t |t |kr4tddt | dt | || _|| _|| _|| _|| _|| | _|d u rpt |d }|| _	|| _
|	| _|
| _| j| j	krtd| j| j	f d S )NzData and targets have to bez" of same length. Data length is {}z while target length is {}r   zz`start_index+length=%i > end_index=%i` is disallowed, as no part of the sequence would be left to be used as current step.)r   r   r   datatargetslengthsampling_ratestridestart_index	end_indexrB   reverse
batch_size)selfrO   rP   rQ   rR   rS   rT   rU   rB   rV   rW   r
   r
   r)   __init__E  s0    	

zTimeseriesGenerator.__init__c                 C   s$   | j | j | j| j  | j| j  S )N)rU   rT   rW   rS   rX   r
   r
   r)   r	   g  s
    


zTimeseriesGenerator.__len__c                    s    j r$tjj j jd  jd}n> j j j |  }t|t	| j j   jd  j}t
 fdd|D }t
 fdd|D } jr|d d d d ddf |fS ||fS )Nr   )r1   c                    s$   g | ]} j | j | j qS r
   )rO   rQ   rR   r6   rowrZ   r
   r)   r8   t  s   z3TimeseriesGenerator.__getitem__.<locals>.<listcomp>c                    s   g | ]} j | qS r
   )rP   r[   rZ   r
   r)   r8   v  r9   .)rB   r   r:   r;   rT   rU   rW   rS   r-   r@   arrayrV   )rX   indexrowsr<   ZsamplesrP   r
   rZ   r)   __getitem__k  s&    zTimeseriesGenerator.__getitem__c                 C   s   | j }t| j jtjkr"| j  }zt|}W n tyL   td|Y n0 | j	}t| j	jtjkrp| j	 }zt|}W n ty   td|Y n0 ||| j
| j| j| j| j| j| j| jd
S )zReturns the TimeseriesGenerator configuration as Python dictionary.

        # Returns
            A Python dictionary with the TimeseriesGenerator configuration.
        zData not JSON Serializable:zTargets not JSON Serializable:)
rO   rP   rQ   rR   rS   rT   rU   rB   rV   rW   )rO   r   
__module__r   __name__tolistjsondumpsr   rP   rQ   rR   rS   rT   rU   rB   rV   rW   )rX   rO   Z	json_datarP   Zjson_targetsr
   r
   r)   
get_config|  s2    

zTimeseriesGenerator.get_configc                 K   s(   |   }| jj|d}tj|fi |S )a  Returns a JSON string containing the timeseries generator
        configuration. To load a generator from a JSON string, use
        `keras.preprocessing.sequence.timeseries_generator_from_json(json_string)`.

        # Arguments
            **kwargs: Additional keyword arguments
                to be passed to `json.dumps()`.

        # Returns
            A JSON string containing the tokenizer configuration.
        )
class_nameconfig)rg   	__class__rc   re   rf   )rX   kwargsri   Ztimeseries_generator_configr
   r
   r)   to_json  s
    zTimeseriesGenerator.to_json)r   r   r   NFFrN   )	rc   rb   __qualname____doc__rY   r	   ra   rg   rl   r
   r
   r
   r)   rM     s   ?       
"#rM   c                 C   sR   t | }|d}t |d}||d< t |d}||d< tf i |S )a  Parses a JSON timeseries generator configuration file and
    returns a timeseries generator instance.

    # Arguments
        json_string: JSON string encoding a timeseries
            generator configuration.

    # Returns
        A Keras TimeseriesGenerator instance
    ri   rO   rP   )re   loadsgetpoprM   )Zjson_stringZfull_configri   rO   rP   r
   r
   r)   timeseries_generator_from_json  s    

rr   )Nr   r   r   r   )r+   )r5   r,   TFNN)rn   
__future__r   r   r   numpyr   r:   re   Z	six.movesr   r   r*   r4   rG   rL   r   rM   rr   r
   r
   r
   r)   <module>   s&     
b
(  
[ .