import math
import warnings
import numbers
from typing import List, Tuple, Optional, overload

import torch
from torch import Tensor
from .module import Module
from ..parameter import Parameter
from ..utils.rnn import PackedSequence
from .. import init
from ... import _VF

_rnn_impls = {
    'RNN_TANH': _VF.rnn_tanh,
    'RNN_RELU': _VF.rnn_relu,
}


def apply_permutation(tensor: Tensor, permutation: Tensor, dim: int = 1) -> Tensor:
    return tensor.index_select(dim, permutation)


class RNNBase(Module):
    __constants__ = ['mode', 'input_size', 'hidden_size', 'num_layers', 'bias',
                     'batch_first', 'dropout', 'bidirectional', 'proj_size']
    __jit_unused_properties__ = ['all_weights']

    mode: str
    input_size: int
    hidden_size: int
    num_layers: int
    bias: bool
    batch_first: bool
    dropout: float
    bidirectional: bool
    proj_size: int

    def __init__(self, mode: str, input_size: int, hidden_size: int,
                 num_layers: int = 1, bias: bool = True, batch_first: bool = False,
                 dropout: float = 0., bidirectional: bool = False, proj_size: int = 0,
                 device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super(RNNBase, self).__init__()
        self.mode = mode
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bias = bias
        self.batch_first = batch_first
        self.dropout = float(dropout)
        self.bidirectional = bidirectional
        self.proj_size = proj_size
        num_directions = 2 if bidirectional else 1

        if not isinstance(dropout, numbers.Number) or not 0 <= dropout <= 1 or \
                isinstance(dropout, bool):
            raise ValueError("dropout should be a number in range [0, 1] "
                             "representing the probability of an element being zeroed")
        if dropout > 0 and num_layers == 1:
            warnings.warn("dropout option adds dropout after all but last "
                          "recurrent layer, so non-zero dropout expects "
                          "num_layers greater than 1, but got dropout={} and "
                          "num_layers={}".format(dropout, num_layers))
        if proj_size < 0:
            raise ValueError("proj_size should be a positive integer or zero to disable projections")
        if proj_size >= hidden_size:
            raise ValueError("proj_size has to be smaller than hidden_size")

        if mode == 'LSTM':
            gate_size = 4 * hidden_size
        elif mode == 'GRU':
            gate_size = 3 * hidden_size
        elif mode == 'RNN_TANH':
            gate_size = hidden_size
        elif mode == 'RNN_RELU':
            gate_size = hidden_size
        else:
            raise ValueError("Unrecognized RNN mode: " + mode)

        self._flat_weights_names = []
        self._all_weights = []
        for layer in range(num_layers):
            for direction in range(num_directions):
                real_hidden_size = proj_size if proj_size > 0 else hidden_size
                layer_input_size = input_size if layer == 0 else real_hidden_size * num_directions

                w_ih = Parameter(torch.empty((gate_size, layer_input_size), **factory_kwargs))
                w_hh = Parameter(torch.empty((gate_size, real_hidden_size), **factory_kwargs))
                b_ih = Parameter(torch.empty(gate_size, **factory_kwargs))
                # Second bias vector kept for cuDNN compatibility; only one bias is
                # needed in the standard definition.
                b_hh = Parameter(torch.empty(gate_size, **factory_kwargs))
                if self.proj_size == 0:
                    layer_params = (w_ih, w_hh, b_ih, b_hh) if bias else (w_ih, w_hh)
                else:
                    w_hr = Parameter(torch.empty((proj_size, hidden_size), **factory_kwargs))
                    layer_params = (w_ih, w_hh, b_ih, b_hh, w_hr) if bias else (w_ih, w_hh, w_hr)

                suffix = '_reverse' if direction == 1 else ''
                param_names = ['weight_ih_l{}{}', 'weight_hh_l{}{}']
                if bias:
                    param_names += ['bias_ih_l{}{}', 'bias_hh_l{}{}']
                if self.proj_size > 0:
                    param_names += ['weight_hr_l{}{}']
                param_names = [x.format(layer, suffix) for x in param_names]

                for name, param in zip(param_names, layer_params):
                    setattr(self, name, param)
                self._flat_weights_names.extend(param_names)
                self._all_weights.append(param_names)

        self._flat_weights = [(lambda wn: getattr(self, wn) if hasattr(self, wn) else None)(wn)
                              for wn in self._flat_weights_names]
        self.flatten_parameters()
        self.reset_parameters()

    def __setattr__(self, attr, value):
        if hasattr(self, "_flat_weights_names") and attr in self._flat_weights_names:
            # Keep self._flat_weights up to date when an individual weight is reassigned.
            idx = self._flat_weights_names.index(attr)
            self._flat_weights[idx] = value
        super(RNNBase, self).__setattr__(attr, value)

    def flatten_parameters(self) -> None:
        """Resets parameter data pointer so that they can use faster code paths.

        Right now, this works only if the module is on the GPU and cuDNN is enabled.
        Otherwise, it's a no-op.
        """
        # Short-circuits if _flat_weights is only partially instantiated.
        if len(self._flat_weights) != len(self._flat_weights_names):
            return
        for w in self._flat_weights:
            if not isinstance(w, Tensor):
                return
        # Short-circuits if any tensor is not acceptable to cuDNN or the dtypes differ.
        first_fw = self._flat_weights[0]
        dtype = first_fw.dtype
        for fw in self._flat_weights:
            if (not isinstance(fw.data, Tensor) or not (fw.data.dtype == dtype) or
                    not fw.data.is_cuda or
                    not torch.backends.cudnn.is_acceptable(fw.data)):
                return
        # If any parameters alias, fall back to the slower, copying code path.
        unique_data_ptrs = set(p.data_ptr() for p in self._flat_weights)
        if len(unique_data_ptrs) != len(self._flat_weights):
            return

        with torch.cuda.device_of(first_fw):
            import torch.backends.cudnn.rnn as rnn

            # no_grad() is necessary since _cudnn_rnn_flatten_weight is
            # an in-place operation on self._flat_weights.
            with torch.no_grad():
                if torch._use_cudnn_rnn_flatten_weight():
                    num_weights = 4 if self.bias else 2
                    if self.proj_size > 0:
                        num_weights += 1
                    torch._cudnn_rnn_flatten_weight(
                        self._flat_weights, num_weights,
                        self.input_size, rnn.get_cudnn_mode(self.mode),
                        self.hidden_size, self.proj_size, self.num_layers,
                        self.batch_first, bool(self.bidirectional))

    def _apply(self, fn):
        ret = super(RNNBase, self)._apply(fn)
        # _apply() may create new tensors for the parameters, so rebuild the
        # flat-weights cache and re-flatten.
        self._flat_weights = [(lambda wn: getattr(self, wn) if hasattr(self, wn) else None)(wn)
                              for wn in self._flat_weights_names]
        self.flatten_parameters()
        return ret

    def reset_parameters(self) -> None:
        stdv = 1.0 / math.sqrt(self.hidden_size) if self.hidden_size > 0 else 0
        for weight in self.parameters():
            init.uniform_(weight, -stdv, stdv)

    def check_input(self, input: Tensor, batch_sizes: Optional[Tensor]) -> None:
        expected_input_dim = 2 if batch_sizes is not None else 3
        if input.dim() != expected_input_dim:
            raise RuntimeError(
                'input must have {} dimensions, got {}'.format(
                    expected_input_dim, input.dim()))
        if self.input_size != input.size(-1):
            raise RuntimeError(
                'input.size(-1) must be equal to input_size. Expected {}, got {}'.format(
                    self.input_size, input.size(-1)))

    def get_expected_hidden_size(self, input: Tensor,
                                 batch_sizes: Optional[Tensor]) -> Tuple[int, int, int]:
        if batch_sizes is not None:
            mini_batch = int(batch_sizes[0])
        else:
            mini_batch = input.size(0) if self.batch_first else input.size(1)
        num_directions = 2 if self.bidirectional else 1
        if self.proj_size > 0:
            expected_hidden_size = (self.num_layers * num_directions,
                                    mini_batch, self.proj_size)
        else:
            expected_hidden_size = (self.num_layers * num_directions,
                                    mini_batch, self.hidden_size)
        return expected_hidden_size

    def check_hidden_size(self, hx: Tensor, expected_hidden_size: Tuple[int, int, int],
                          msg: str = 'Expected hidden size {}, got {}') -> None:
        if hx.size() != expected_hidden_size:
            raise RuntimeError(msg.format(expected_hidden_size, list(hx.size())))

    def check_forward_args(self, input: Tensor, hidden: Tensor, batch_sizes: Optional[Tensor]):
        self.check_input(input, batch_sizes)
        expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes)
        self.check_hidden_size(hidden, expected_hidden_size)

    def permute_hidden(self, hx: Tensor, permutation: Optional[Tensor]):
        if permutation is None:
            return hx
        return apply_permutation(hx, permutation)

    def extra_repr(self) -> str:
        s = '{input_size}, {hidden_size}'
        if self.proj_size != 0:
            s += ', proj_size={proj_size}'
        if self.num_layers != 1:
            s += ', num_layers={num_layers}'
        if self.bias is not True:
            s += ', bias={bias}'
        if self.batch_first is not False:
            s += ', batch_first={batch_first}'
        if self.dropout != 0:
            s += ', dropout={dropout}'
        if self.bidirectional is not False:
            s += ', bidirectional={bidirectional}'
        return s.format(**self.__dict__)

    def __setstate__(self, d):
        super(RNNBase, self).__setstate__(d)
        if 'all_weights' in d:
            self._all_weights = d['all_weights']
        # Checkpoints saved before proj_size existed will not carry it.
        if 'proj_size' not in d:
            self.proj_size = 0

        if isinstance(self._all_weights[0][0], str):
            return
        # Rebuild the per-layer weight-name lists for old-format checkpoints.
        num_layers = self.num_layers
        num_directions = 2 if self.bidirectional else 1
        self._flat_weights_names = []
        self._all_weights = []
        for layer in range(num_layers):
            for direction in range(num_directions):
                suffix = '_reverse' if direction == 1 else ''
                weights = ['weight_ih_l{}{}', 'weight_hh_l{}{}', 'bias_ih_l{}{}',
                           'bias_hh_l{}{}', 'weight_hr_l{}{}']
                weights = [x.format(layer, suffix) for x in weights]
                if self.bias:
                    if self.proj_size > 0:
                        self._all_weights += [weights]
                        self._flat_weights_names.extend(weights)
                    else:
                        self._all_weights += [weights[:4]]
                        self._flat_weights_names.extend(weights[:4])
                else:
                    if self.proj_size > 0:
                        self._all_weights += [weights[:2] + weights[-1:]]
                        self._flat_weights_names.extend(weights[:2] + weights[-1:])
                    else:
                        self._all_weights += [weights[:2]]
                        self._flat_weights_names.extend(weights[:2])
        self._flat_weights = [(lambda wn: getattr(self, wn) if hasattr(self, wn) else None)(wn)
                              for wn in self._flat_weights_names]

    @property
    def all_weights(self) -> List[List[Parameter]]:
        return [[getattr(self, weight) for weight in weights] for weights in self._all_weights]

    def _replicate_for_data_parallel(self):
        replica = super(RNNBase, self)._replicate_for_data_parallel()
        # Copy these caches so the replica does not share the flat-weights list.
        replica._flat_weights = replica._flat_weights[:]
        replica._flat_weights_names = replica._flat_weights_names[:]
        return replica


class RNN(RNNBase):
    r"""Applies a multi-layer Elman RNN with :math:`\tanh` or :math:`\text{ReLU}` non-linearity to an
    input sequence.


    For each element in the input sequence, each layer computes the following
    function:

    .. math::
        h_t = \tanh(x_t W_{ih}^T + b_{ih} + h_{t-1}W_{hh}^T + b_{hh})

    where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is
    the input at time `t`, and :math:`h_{(t-1)}` is the hidden state of the
    previous layer at time `t-1` or the initial hidden state at time `0`.
    If :attr:`nonlinearity` is ``'relu'``, then :math:`\text{ReLU}` is used instead of :math:`\tanh`.

    Args:
        input_size: The number of expected features in the input `x`
        hidden_size: The number of features in the hidden state `h`
        num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
            would mean stacking two RNNs together to form a `stacked RNN`,
            with the second RNN taking in outputs of the first RNN and
            computing the final results. Default: 1
        nonlinearity: The non-linearity to use. Can be either ``'tanh'`` or ``'relu'``. Default: ``'tanh'``
        bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
            Default: ``True``
        batch_first: If ``True``, then the input and output tensors are provided
            as `(batch, seq, feature)` instead of `(seq, batch, feature)`.
            Note that this does not apply to hidden or cell states. See the
            Inputs/Outputs sections below for details.  Default: ``False``
        dropout: If non-zero, introduces a `Dropout` layer on the outputs of each
            RNN layer except the last layer, with dropout probability equal to
            :attr:`dropout`. Default: 0
        bidirectional: If ``True``, becomes a bidirectional RNN. Default: ``False``

    Inputs: input, h_0
        * **input**: tensor of shape :math:`(L, H_{in})` for unbatched input,
          :math:`(L, N, H_{in})` when ``batch_first=False`` or
          :math:`(N, L, H_{in})` when ``batch_first=True`` containing the features of
          the input sequence.  The input can also be a packed variable length sequence.
          See :func:`torch.nn.utils.rnn.pack_padded_sequence` or
          :func:`torch.nn.utils.rnn.pack_sequence` for details.
        * **h_0**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` for unbatched input or
          :math:`(D * \text{num\_layers}, N, H_{out})` containing the initial hidden
          state for the input sequence batch. Defaults to zeros if not provided.

        where:

        .. math::
            \begin{aligned}
                N ={} & \text{batch size} \\
                L ={} & \text{sequence length} \\
                D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\
                H_{in} ={} & \text{input\_size} \\
                H_{out} ={} & \text{hidden\_size}
            \end{aligned}

    Outputs: output, h_n
        * **output**: tensor of shape :math:`(L, D * H_{out})` for unbatched input,
          :math:`(L, N, D * H_{out})` when ``batch_first=False`` or
          :math:`(N, L, D * H_{out})` when ``batch_first=True`` containing the output features
          `(h_t)` from the last layer of the RNN, for each `t`. If a
          :class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output
          will also be a packed sequence.
        * **h_n**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` for unbatched input or
          :math:`(D * \text{num\_layers}, N, H_{out})` containing the final hidden state
          for each element in the batch.

    Attributes:
        weight_ih_l[k]: the learnable input-hidden weights of the k-th layer,
            of shape `(hidden_size, input_size)` for `k = 0`. Otherwise, the shape is
            `(hidden_size, num_directions * hidden_size)`
        weight_hh_l[k]: the learnable hidden-hidden weights of the k-th layer,
            of shape `(hidden_size, hidden_size)`
        bias_ih_l[k]: the learnable input-hidden bias of the k-th layer,
            of shape `(hidden_size)`
        bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer,
            of shape `(hidden_size)`

    .. note::
        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
        where :math:`k = \frac{1}{\text{hidden\_size}}`

    .. note::
        For bidirectional RNNs, forward and backward are directions 0 and 1 respectively.
        Example of splitting the output layers when ``batch_first=False``:
        ``output.view(seq_len, batch, num_directions, hidden_size)``.

    .. note::
        ``batch_first`` argument is ignored for unbatched inputs.

    .. include:: ../cudnn_rnn_determinism.rst

    .. include:: ../cudnn_persistent_rnn.rst

    Examples::

        >>> rnn = nn.RNN(10, 20, 2)
        >>> input = torch.randn(5, 3, 10)
        >>> h0 = torch.randn(2, 3, 20)
        >>> output, hn = rnn(input, h0)
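
    A minimal sketch of the same module applied to a packed variable-length
    batch (the lengths ``[5, 5, 3]`` are illustrative; the packing helpers live
    in :mod:`torch.nn.utils.rnn`)::

        >>> from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
        >>> packed = pack_padded_sequence(input, lengths=[5, 5, 3], enforce_sorted=False)
        >>> packed_output, hn = rnn(packed)
        >>> output, lengths = pad_packed_sequence(packed_output)  # output: (5, 3, 20)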
    """

    def __init__(self, *args, **kwargs):
        if 'proj_size' in kwargs:
            raise ValueError("proj_size argument is only supported for LSTM, not RNN or GRU")
        self.nonlinearity = kwargs.pop('nonlinearity', 'tanh')
        if self.nonlinearity == 'tanh':
            mode = 'RNN_TANH'
        elif self.nonlinearity == 'relu':
            mode = 'RNN_RELU'
        else:
            raise ValueError("Unknown nonlinearity '{}'".format(self.nonlinearity))
        super(RNN, self).__init__(mode, *args, **kwargs)

    @overload
    @torch._jit_internal._overload_method  # noqa: F811
    def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]:
        pass

    @overload
    @torch._jit_internal._overload_method  # noqa: F811
    def forward(self, input: PackedSequence, hx: Optional[Tensor] = None) -> Tuple[PackedSequence, Tensor]:
        pass

    def forward(self, input, hx=None):  # noqa: F811
        orig_input = input
        if isinstance(orig_input, PackedSequence):
            input, batch_sizes, sorted_indices, unsorted_indices = input
            max_batch_size = int(batch_sizes[0])
        else:
            batch_sizes = None
            is_batched = input.dim() == 3
            batch_dim = 0 if self.batch_first else 1
            if not is_batched:
                input = input.unsqueeze(batch_dim)
                if hx is not None:
                    if hx.dim() != 2:
                        raise RuntimeError(
                            f"For unbatched 2-D input, hx should also be 2-D but got {hx.dim()}-D tensor")
                    hx = hx.unsqueeze(1)
            else:
                if hx is not None and hx.dim() != 3:
                    raise RuntimeError(
                        f"For batched 3-D input, hx should also be 3-D but got {hx.dim()}-D tensor")
            max_batch_size = input.size(0) if self.batch_first else input.size(1)
            sorted_indices = None
            unsorted_indices = None

        if hx is None:
            num_directions = 2 if self.bidirectional else 1
            hx = torch.zeros(self.num_layers * num_directions,
                             max_batch_size, self.hidden_size,
                             dtype=input.dtype, device=input.device)
        else:
            # Each batch of the hidden state should match the input sequence that
            # the user believes is being passed in.
            hx = self.permute_hidden(hx, sorted_indices)

        assert hx is not None
        self.check_forward_args(input, hx, batch_sizes)
        assert self.mode == 'RNN_TANH' or self.mode == 'RNN_RELU'
        if batch_sizes is None:
            if self.mode == 'RNN_TANH':
                result = _VF.rnn_tanh(input, hx, self._flat_weights, self.bias, self.num_layers,
                                      self.dropout, self.training, self.bidirectional,
                                      self.batch_first)
            else:
                result = _VF.rnn_relu(input, hx, self._flat_weights, self.bias, self.num_layers,
                                      self.dropout, self.training, self.bidirectional,
                                      self.batch_first)
        else:
            if self.mode == 'RNN_TANH':
                result = _VF.rnn_tanh(input, batch_sizes, hx, self._flat_weights, self.bias,
                                      self.num_layers, self.dropout, self.training,
                                      self.bidirectional)
            else:
                result = _VF.rnn_relu(input, batch_sizes, hx, self._flat_weights, self.bias,
                                      self.num_layers, self.dropout, self.training,
                                      self.bidirectional)

        output = result[0]
        hidden = result[1]

        if isinstance(orig_input, PackedSequence):
            output_packed = PackedSequence(output, batch_sizes, sorted_indices, unsorted_indices)
            return output_packed, self.permute_hidden(hidden, unsorted_indices)

        if not is_batched:
            output = output.squeeze(batch_dim)
            hidden = hidden.squeeze(1)
        return output, self.permute_hidden(hidden, unsorted_indices)


class LSTM(RNNBase):
    r"""Applies a multi-layer long short-term memory (LSTM) RNN to an input
    sequence.


    For each element in the input sequence, each layer computes the following
    function:

    .. math::
        \begin{array}{ll} \\
            i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{t-1} + b_{hi}) \\
            f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{t-1} + b_{hf}) \\
            g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{t-1} + b_{hg}) \\
            o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{t-1} + b_{ho}) \\
            c_t = f_t \odot c_{t-1} + i_t \odot g_t \\
            h_t = o_t \odot \tanh(c_t) \\
        \end{array}

    where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell
    state at time `t`, :math:`x_t` is the input at time `t`, :math:`h_{t-1}`
    is the hidden state of the layer at time `t-1` or the initial hidden
    state at time `0`, and :math:`i_t`, :math:`f_t`, :math:`g_t`,
    :math:`o_t` are the input, forget, cell, and output gates, respectively.
    :math:`\sigma` is the sigmoid function, and :math:`\odot` is the Hadamard product.

    In a multilayer LSTM, the input :math:`x^{(l)}_t` of the :math:`l` -th layer
    (:math:`l >= 2`) is the hidden state :math:`h^{(l-1)}_t` of the previous layer multiplied by
    dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random
    variable which is :math:`0` with probability :attr:`dropout`.

    If ``proj_size > 0`` is specified, LSTM with projections will be used. This changes
    the LSTM cell in the following way. First, the dimension of :math:`h_t` will be changed from
    ``hidden_size`` to ``proj_size`` (dimensions of :math:`W_{hi}` will be changed accordingly).
    Second, the output hidden state of each layer will be multiplied by a learnable projection
    matrix: :math:`h_t = W_{hr}h_t`. Note that as a consequence of this, the output
    of the LSTM network will be of a different shape as well. See the Inputs/Outputs sections below for exact
    dimensions of all variables. You can find more details in https://arxiv.org/abs/1402.1128.

    Args:
        input_size: The number of expected features in the input `x`
        hidden_size: The number of features in the hidden state `h`
        num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
            would mean stacking two LSTMs together to form a `stacked LSTM`,
            with the second LSTM taking in outputs of the first LSTM and
            computing the final results. Default: 1
        bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
            Default: ``True``
        batch_first: If ``True``, then the input and output tensors are provided
            as `(batch, seq, feature)` instead of `(seq, batch, feature)`.
            Note that this does not apply to hidden or cell states. See the
            Inputs/Outputs sections below for details.  Default: ``False``
        dropout: If non-zero, introduces a `Dropout` layer on the outputs of each
            LSTM layer except the last layer, with dropout probability equal to
            :attr:`dropout`. Default: 0
        bidirectional: If ``True``, becomes a bidirectional LSTM. Default: ``False``
        proj_size: If ``> 0``, will use LSTM with projections of corresponding size. Default: 0

    Inputs: input, (h_0, c_0)
        * **input**: tensor of shape :math:`(L, H_{in})` for unbatched input,
          :math:`(L, N, H_{in})` when ``batch_first=False`` or
          :math:`(N, L, H_{in})` when ``batch_first=True`` containing the features of
          the input sequence.  The input can also be a packed variable length sequence.
          See :func:`torch.nn.utils.rnn.pack_padded_sequence` or
          :func:`torch.nn.utils.rnn.pack_sequence` for details.
        * **h_0**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` for unbatched input or
          :math:`(D * \text{num\_layers}, N, H_{out})` containing the
          initial hidden state for each element in the input sequence.
          Defaults to zeros if (h_0, c_0) is not provided.
        * **c_0**: tensor of shape :math:`(D * \text{num\_layers}, H_{cell})` for unbatched input or
          :math:`(D * \text{num\_layers}, N, H_{cell})` containing the
          initial cell state for each element in the input sequence.
          Defaults to zeros if (h_0, c_0) is not provided.

        where:

        .. math::
            \begin{aligned}
                N ={} & \text{batch size} \\
                L ={} & \text{sequence length} \\
                D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\
                H_{in} ={} & \text{input\_size} \\
                H_{cell} ={} & \text{hidden\_size} \\
                H_{out} ={} & \text{proj\_size if } \text{proj\_size}>0 \text{ otherwise hidden\_size} \\
            \end{aligned}

    Outputs: output, (h_n, c_n)
        * **output**: tensor of shape :math:`(L, D * H_{out})` for unbatched input,
          :math:`(L, N, D * H_{out})` when ``batch_first=False`` or
          :math:`(N, L, D * H_{out})` when ``batch_first=True`` containing the output features
          `(h_t)` from the last layer of the LSTM, for each `t`. If a
          :class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output
          will also be a packed sequence. When ``bidirectional=True``, `output` will contain
          a concatenation of the forward and reverse hidden states at each time step in the sequence.
        * **h_n**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` for unbatched input or
          :math:`(D * \text{num\_layers}, N, H_{out})` containing the
          final hidden state for each element in the sequence. When ``bidirectional=True``,
          `h_n` will contain a concatenation of the final forward and reverse hidden states, respectively.
        * **c_n**: tensor of shape :math:`(D * \text{num\_layers}, H_{cell})` for unbatched input or
          :math:`(D * \text{num\_layers}, N, H_{cell})` containing the
          final cell state for each element in the sequence. When ``bidirectional=True``,
          `c_n` will contain a concatenation of the final forward and reverse cell states, respectively.

    Attributes:
        weight_ih_l[k] : the learnable input-hidden weights of the :math:`\text{k}^{th}` layer
            `(W_ii|W_if|W_ig|W_io)`, of shape `(4*hidden_size, input_size)` for `k = 0`.
            Otherwise, the shape is `(4*hidden_size, num_directions * hidden_size)`. If
            ``proj_size > 0`` was specified, the shape will be
            `(4*hidden_size, num_directions * proj_size)` for `k > 0`
        weight_hh_l[k] : the learnable hidden-hidden weights of the :math:`\text{k}^{th}` layer
            `(W_hi|W_hf|W_hg|W_ho)`, of shape `(4*hidden_size, hidden_size)`. If ``proj_size > 0``
            was specified, the shape will be `(4*hidden_size, proj_size)`.
        bias_ih_l[k] : the learnable input-hidden bias of the :math:`\text{k}^{th}` layer
            `(b_ii|b_if|b_ig|b_io)`, of shape `(4*hidden_size)`
        bias_hh_l[k] : the learnable hidden-hidden bias of the :math:`\text{k}^{th}` layer
            `(b_hi|b_hf|b_hg|b_ho)`, of shape `(4*hidden_size)`
        weight_hr_l[k] : the learnable projection weights of the :math:`\text{k}^{th}` layer
            of shape `(proj_size, hidden_size)`. Only present when ``proj_size > 0`` was
            specified.
        weight_ih_l[k]_reverse: Analogous to `weight_ih_l[k]` for the reverse direction.
            Only present when ``bidirectional=True``.
        weight_hh_l[k]_reverse:  Analogous to `weight_hh_l[k]` for the reverse direction.
            Only present when ``bidirectional=True``.
        bias_ih_l[k]_reverse:  Analogous to `bias_ih_l[k]` for the reverse direction.
            Only present when ``bidirectional=True``.
        bias_hh_l[k]_reverse:  Analogous to `bias_hh_l[k]` for the reverse direction.
            Only present when ``bidirectional=True``.
        weight_hr_l[k]_reverse:  Analogous to `weight_hr_l[k]` for the reverse direction.
            Only present when ``bidirectional=True`` and ``proj_size > 0`` was specified.

    .. note::
        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
        where :math:`k = \frac{1}{\text{hidden\_size}}`

    .. note::
        For bidirectional LSTMs, forward and backward are directions 0 and 1 respectively.
        Example of splitting the output layers when ``batch_first=False``:
        ``output.view(seq_len, batch, num_directions, hidden_size)``.

    .. note::
        For bidirectional LSTMs, `h_n` is not equivalent to the last element of `output`; the
        former contains the final forward and reverse hidden states, while the latter contains the
        final forward hidden state and the initial reverse hidden state.

    .. note::
        ``batch_first`` argument is ignored for unbatched inputs.

    .. include:: ../cudnn_rnn_determinism.rst

    .. include:: ../cudnn_persistent_rnn.rst

    Examples::

        >>> rnn = nn.LSTM(10, 20, 2)
        >>> input = torch.randn(5, 3, 10)
        >>> h0 = torch.randn(2, 3, 20)
        >>> c0 = torch.randn(2, 3, 20)
        >>> output, (hn, cn) = rnn(input, (h0, c0))
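
    A minimal sketch of an LSTM with projections (``proj_size=10`` is an
    illustrative value); ``output`` and ``h_n`` use ``proj_size`` while ``c_n``
    keeps ``hidden_size``::

        >>> rnn = nn.LSTM(10, 20, 2, proj_size=10)
        >>> input = torch.randn(5, 3, 10)
        >>> h0 = torch.randn(2, 3, 10)
        >>> c0 = torch.randn(2, 3, 20)
        >>> output, (hn, cn) = rnn(input, (h0, c0))  # output: (5, 3, 10)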
    """

    def __init__(self, *args, **kwargs):
        super(LSTM, self).__init__('LSTM', *args, **kwargs)

    def get_expected_cell_size(self, input: Tensor,
                               batch_sizes: Optional[Tensor]) -> Tuple[int, int, int]:
        if batch_sizes is not None:
            mini_batch = int(batch_sizes[0])
        else:
            mini_batch = input.size(0) if self.batch_first else input.size(1)
        num_directions = 2 if self.bidirectional else 1
        expected_hidden_size = (self.num_layers * num_directions,
                                mini_batch, self.hidden_size)
        return expected_hidden_size

    def check_forward_args(self, input: Tensor, hidden: Tuple[Tensor, Tensor],
                           batch_sizes: Optional[Tensor]):
        self.check_input(input, batch_sizes)
        self.check_hidden_size(hidden[0], self.get_expected_hidden_size(input, batch_sizes),
                               'Expected hidden[0] size {}, got {}')
        self.check_hidden_size(hidden[1], self.get_expected_cell_size(input, batch_sizes),
                               'Expected hidden[1] size {}, got {}')

    def permute_hidden(self, hx: Tuple[Tensor, Tensor],
                       permutation: Optional[Tensor]) -> Tuple[Tensor, Tensor]:
        if permutation is None:
            return hx
        return apply_permutation(hx[0], permutation), apply_permutation(hx[1], permutation)

    @overload
    @torch._jit_internal._overload_method  # noqa: F811
    def forward(self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = None
                ) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
        pass

    @overload
    @torch._jit_internal._overload_method  # noqa: F811
    def forward(self, input: PackedSequence, hx: Optional[Tuple[Tensor, Tensor]] = None
                ) -> Tuple[PackedSequence, Tuple[Tensor, Tensor]]:
        pass

    def forward(self, input, hx=None):  # noqa: F811
        orig_input = input
        if isinstance(orig_input, PackedSequence):
            input, batch_sizes, sorted_indices, unsorted_indices = input
            max_batch_size = int(batch_sizes[0])
        else:
            batch_sizes = None
            is_batched = input.dim() == 3
            batch_dim = 0 if self.batch_first else 1
            if not is_batched:
                input = input.unsqueeze(batch_dim)
            max_batch_size = input.size(0) if self.batch_first else input.size(1)
            sorted_indices = None
            unsorted_indices = None

        if hx is None:
            num_directions = 2 if self.bidirectional else 1
            real_hidden_size = self.proj_size if self.proj_size > 0 else self.hidden_size
            h_zeros = torch.zeros(self.num_layers * num_directions,
                                  max_batch_size, real_hidden_size,
                                  dtype=input.dtype, device=input.device)
            c_zeros = torch.zeros(self.num_layers * num_directions,
                                  max_batch_size, self.hidden_size,
                                  dtype=input.dtype, device=input.device)
            hx = (h_zeros, c_zeros)
        else:
            if batch_sizes is None:  # not a PackedSequence: validate hx/cx dimensionality
                if is_batched:
                    if hx[0].dim() != 3 or hx[1].dim() != 3:
                        msg = ("For batched 3-D input, hx and cx should also be 3-D "
                               f"but got ({hx[0].dim()}-D, {hx[1].dim()}-D) tensors")
                        raise RuntimeError(msg)
                else:
                    if hx[0].dim() != 2 or hx[1].dim() != 2:
                        msg = ("For unbatched 2-D input, hx and cx should also be 2-D "
                               f"but got ({hx[0].dim()}-D, {hx[1].dim()}-D) tensors")
                        raise RuntimeError(msg)
                    hx = (hx[0].unsqueeze(1), hx[1].unsqueeze(1))
            # Each batch of the hidden state should match the input sequence that
            # the user believes is being passed in.
            hx = self.permute_hidden(hx, sorted_indices)

        self.check_forward_args(input, hx, batch_sizes)
        if batch_sizes is None:
            result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,
                              self.dropout, self.training, self.bidirectional, self.batch_first)
        else:
            result = _VF.lstm(input, batch_sizes, hx, self._flat_weights, self.bias,
                              self.num_layers, self.dropout, self.training, self.bidirectional)
        output = result[0]
        hidden = result[1:]

        if isinstance(orig_input, PackedSequence):
            output_packed = PackedSequence(output, batch_sizes, sorted_indices, unsorted_indices)
            return output_packed, self.permute_hidden(hidden, unsorted_indices)
        else:
            if not is_batched:
                output = output.squeeze(batch_dim)
                hidden = (hidden[0].squeeze(1), hidden[1].squeeze(1))
            return output, self.permute_hidden(hidden, unsorted_indices)


class GRU(RNNBase):
    r"""Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence.


    For each element in the input sequence, each layer computes the following
    function:

    .. math::
        \begin{array}{ll}
            r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
            z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\
            n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\
            h_t = (1 - z_t) * n_t + z_t * h_{(t-1)}
        \end{array}

    where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the input
    at time `t`, :math:`h_{(t-1)}` is the hidden state of the layer
    at time `t-1` or the initial hidden state at time `0`, and :math:`r_t`,
    :math:`z_t`, :math:`n_t` are the reset, update, and new gates, respectively.
    :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product.

    In a multilayer GRU, the input :math:`x^{(l)}_t` of the :math:`l` -th layer
    (:math:`l >= 2`) is the hidden state :math:`h^{(l-1)}_t` of the previous layer multiplied by
    dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random
    variable which is :math:`0` with probability :attr:`dropout`.

    Args:
        input_size: The number of expected features in the input `x`
        hidden_size: The number of features in the hidden state `h`
        num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
            would mean stacking two GRUs together to form a `stacked GRU`,
            with the second GRU taking in outputs of the first GRU and
            computing the final results. Default: 1
        bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
            Default: ``True``
        batch_first: If ``True``, then the input and output tensors are provided
            as `(batch, seq, feature)` instead of `(seq, batch, feature)`.
            Note that this does not apply to hidden or cell states. See the
            Inputs/Outputs sections below for details.  Default: ``False``
        dropout: If non-zero, introduces a `Dropout` layer on the outputs of each
            GRU layer except the last layer, with dropout probability equal to
            :attr:`dropout`. Default: 0
        bidirectional: If ``True``, becomes a bidirectional GRU. Default: ``False``

    Inputs: input, h_0
        * **input**: tensor of shape :math:`(L, H_{in})` for unbatched input,
          :math:`(L, N, H_{in})` when ``batch_first=False`` or
          :math:`(N, L, H_{in})` when ``batch_first=True`` containing the features of
          the input sequence.  The input can also be a packed variable length sequence.
          See :func:`torch.nn.utils.rnn.pack_padded_sequence` or
          :func:`torch.nn.utils.rnn.pack_sequence` for details.
        * **h_0**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` or
          :math:`(D * \text{num\_layers}, N, H_{out})`
          containing the initial hidden state for the input sequence. Defaults to zeros if not provided.

        where:

        .. math::
            \begin{aligned}
                N ={} & \text{batch size} \\
                L ={} & \text{sequence length} \\
                D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\
                H_{in} ={} & \text{input\_size} \\
                H_{out} ={} & \text{hidden\_size}
            \end{aligned}

    Outputs: output, h_n
        * **output**: tensor of shape :math:`(L, D * H_{out})` for unbatched input,
          :math:`(L, N, D * H_{out})` when ``batch_first=False`` or
          :math:`(N, L, D * H_{out})` when ``batch_first=True`` containing the output features
          `(h_t)` from the last layer of the GRU, for each `t`. If a
          :class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output
          will also be a packed sequence.
        * **h_n**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` or
          :math:`(D * \text{num\_layers}, N, H_{out})` containing the final hidden state
          for the input sequence.

    Attributes:
        weight_ih_l[k] : the learnable input-hidden weights of the :math:`\text{k}^{th}` layer
            (W_ir|W_iz|W_in), of shape `(3*hidden_size, input_size)` for `k = 0`.
            Otherwise, the shape is `(3*hidden_size, num_directions * hidden_size)`
        weight_hh_l[k] : the learnable hidden-hidden weights of the :math:`\text{k}^{th}` layer
            (W_hr|W_hz|W_hn), of shape `(3*hidden_size, hidden_size)`
        bias_ih_l[k] : the learnable input-hidden bias of the :math:`\text{k}^{th}` layer
            (b_ir|b_iz|b_in), of shape `(3*hidden_size)`
        bias_hh_l[k] : the learnable hidden-hidden bias of the :math:`\text{k}^{th}` layer
            (b_hr|b_hz|b_hn), of shape `(3*hidden_size)`

    .. note::
        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
        where :math:`k = \frac{1}{\text{hidden\_size}}`

    .. note::
        For bidirectional GRUs, forward and backward are directions 0 and 1 respectively.
        Example of splitting the output layers when ``batch_first=False``:
        ``output.view(seq_len, batch, num_directions, hidden_size)``.

    .. note::
        ``batch_first`` argument is ignored for unbatched inputs.

    .. include:: ../cudnn_persistent_rnn.rst

    Examples::

        >>> rnn = nn.GRU(10, 20, 2)
        >>> input = torch.randn(5, 3, 10)
        >>> h0 = torch.randn(2, 3, 20)
        >>> output, hn = rnn(input, h0)
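
    A minimal sketch of the bidirectional case (sizes are illustrative); the
    output feature dimension doubles and ``h_n`` stacks both directions along
    its first dimension::

        >>> rnn = nn.GRU(10, 20, 2, bidirectional=True)
        >>> input = torch.randn(5, 3, 10)
        >>> h0 = torch.randn(4, 3, 20)
        >>> output, hn = rnn(input, h0)  # output: (5, 3, 40), hn: (4, 3, 20)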
    """

    def __init__(self, *args, **kwargs):
        if 'proj_size' in kwargs:
            raise ValueError("proj_size argument is only supported for LSTM, not RNN or GRU")
        super(GRU, self).__init__('GRU', *args, **kwargs)

    @overload
    @torch._jit_internal._overload_method  # noqa: F811
    def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]:
        pass

    @overload
    @torch._jit_internal._overload_method  # noqa: F811
    def forward(self, input: PackedSequence, hx: Optional[Tensor] = None) -> Tuple[PackedSequence, Tensor]:
        pass

    def forward(self, input, hx=None):  # noqa: F811
        orig_input = input
        if isinstance(orig_input, PackedSequence):
            input, batch_sizes, sorted_indices, unsorted_indices = input
            max_batch_size = int(batch_sizes[0])
        else:
            batch_sizes = None
            is_batched = input.dim() == 3
            batch_dim = 0 if self.batch_first else 1
            if not is_batched:
                input = input.unsqueeze(batch_dim)
                if hx is not None:
                    if hx.dim() != 2:
                        raise RuntimeError(
                            f"For unbatched 2-D input, hx should also be 2-D but got {hx.dim()}-D tensor")
                    hx = hx.unsqueeze(1)
            else:
                if hx is not None and hx.dim() != 3:
                    raise RuntimeError(
                        f"For batched 3-D input, hx should also be 3-D but got {hx.dim()}-D tensor")
            max_batch_size = input.size(0) if self.batch_first else input.size(1)
            sorted_indices = None
            unsorted_indices = None

        if hx is None:
            num_directions = 2 if self.bidirectional else 1
            hx = torch.zeros(self.num_layers * num_directions,
                             max_batch_size, self.hidden_size,
                             dtype=input.dtype, device=input.device)
        else:
            # Each batch of the hidden state should match the input sequence that
            # the user believes is being passed in.
            hx = self.permute_hidden(hx, sorted_indices)

        self.check_forward_args(input, hx, batch_sizes)
        if batch_sizes is None:
            result = _VF.gru(input, hx, self._flat_weights, self.bias, self.num_layers,
                             self.dropout, self.training, self.bidirectional, self.batch_first)
        else:
            result = _VF.gru(input, batch_sizes, hx, self._flat_weights, self.bias,
                             self.num_layers, self.dropout, self.training, self.bidirectional)
        output = result[0]
        hidden = result[1]

        if isinstance(orig_input, PackedSequence):
            output_packed = PackedSequence(output, batch_sizes, sorted_indices, unsorted_indices)
            return output_packed, self.permute_hidden(hidden, unsorted_indices)
        else:
            if not is_batched:
                output = output.squeeze(batch_dim)
                hidden = hidden.squeeze(1)
            return output, self.permute_hidden(hidden, unsorted_indices)


class RNNCellBase(Module):
    __constants__ = ['input_size', 'hidden_size', 'bias']

    input_size: int
    hidden_size: int
    bias: bool
    weight_ih: Tensor
    weight_hh: Tensor

    def __init__(self, input_size: int, hidden_size: int, bias: bool, num_chunks: int,
                 device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super(RNNCellBase, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        self.weight_ih = Parameter(torch.empty((num_chunks * hidden_size, input_size), **factory_kwargs))
        self.weight_hh = Parameter(torch.empty((num_chunks * hidden_size, hidden_size), **factory_kwargs))
        if bias:
            self.bias_ih = Parameter(torch.empty(num_chunks * hidden_size, **factory_kwargs))
            self.bias_hh = Parameter(torch.empty(num_chunks * hidden_size, **factory_kwargs))
        else:
            self.register_parameter('bias_ih', None)
            self.register_parameter('bias_hh', None)
        self.reset_parameters()

    def extra_repr(self) -> str:
        s = '{input_size}, {hidden_size}'
        if 'bias' in self.__dict__ and self.bias is not True:
            s += ', bias={bias}'
        if 'nonlinearity' in self.__dict__ and self.nonlinearity != "tanh":
            s += ', nonlinearity={nonlinearity}'
        return s.format(**self.__dict__)

    def reset_parameters(self) -> None:
        stdv = 1.0 / math.sqrt(self.hidden_size) if self.hidden_size > 0 else 0
        for weight in self.parameters():
            init.uniform_(weight, -stdv, stdv)


class RNNCell(RNNCellBase):
    r"""An Elman RNN cell with tanh or ReLU non-linearity.

    .. math::

        h' = \tanh(W_{ih} x + b_{ih}  +  W_{hh} h + b_{hh})

    If :attr:`nonlinearity` is `'relu'`, then ReLU is used in place of tanh.

    Args:
        input_size: The number of expected features in the input `x`
        hidden_size: The number of features in the hidden state `h`
        bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
            Default: ``True``
        nonlinearity: The non-linearity to use. Can be either ``'tanh'`` or ``'relu'``. Default: ``'tanh'``

    Inputs: input, hidden
        - **input**: tensor containing input features
        - **hidden**: tensor containing the initial hidden state
          Defaults to zero if not provided.

    Outputs: h'
        - **h'** of shape `(batch, hidden_size)`: tensor containing the next hidden state
          for each element in the batch

    Shape:
        - input: :math:`(N, H_{in})` or :math:`(H_{in})` tensor containing input features where
          :math:`H_{in}` = `input_size`.
        - hidden: :math:`(N, H_{out})` or :math:`(H_{out})` tensor containing the initial hidden
          state where :math:`H_{out}` = `hidden_size`. Defaults to zero if not provided.
        - output: :math:`(N, H_{out})` or :math:`(H_{out})` tensor containing the next hidden state.

    Attributes:
        weight_ih: the learnable input-hidden weights, of shape
            `(hidden_size, input_size)`
        weight_hh: the learnable hidden-hidden weights, of shape
            `(hidden_size, hidden_size)`
        bias_ih: the learnable input-hidden bias, of shape `(hidden_size)`
        bias_hh: the learnable hidden-hidden bias, of shape `(hidden_size)`

    .. note::
        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
        where :math:`k = \frac{1}{\text{hidden\_size}}`

    Examples::

        >>> rnn = nn.RNNCell(10, 20)
        >>> input = torch.randn(6, 3, 10)
        >>> hx = torch.randn(3, 20)
        >>> output = []
        >>> for i in range(6):
        ...     hx = rnn(input[i], hx)
        ...     output.append(hx)
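
    A minimal unbatched sketch (1-D input and hidden state) reusing the cell
    constructed above::

        >>> x = torch.randn(10)
        >>> h = torch.zeros(20)
        >>> h = rnn(x, h)  # h: (20,)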
    """
    __constants__ = ['input_size', 'hidden_size', 'bias', 'nonlinearity']
    nonlinearity: str

    def __init__(self, input_size: int, hidden_size: int, bias: bool = True,
                 nonlinearity: str = "tanh", device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super(RNNCell, self).__init__(input_size, hidden_size, bias, num_chunks=1, **factory_kwargs)
        self.nonlinearity = nonlinearity

    def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor:
        assert input.dim() in (1, 2), \
            f"RNNCell: Expected input to be 1-D or 2-D but received {input.dim()}-D tensor"
        is_batched = input.dim() == 2
        if not is_batched:
            input = input.unsqueeze(0)

        if hx is None:
            hx = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device)
        else:
            hx = hx.unsqueeze(0) if not is_batched else hx

        if self.nonlinearity == "tanh":
            ret = _VF.rnn_tanh_cell(
                input, hx,
                self.weight_ih, self.weight_hh,
                self.bias_ih, self.bias_hh,
            )
        elif self.nonlinearity == "relu":
            ret = _VF.rnn_relu_cell(
                input, hx,
                self.weight_ih, self.weight_hh,
                self.bias_ih, self.bias_hh,
            )
        else:
            ret = input  # assigned for TorchScript's benefit; unreachable after the raise below
            raise RuntimeError(
                "Unknown nonlinearity: {}".format(self.nonlinearity))

        if not is_batched:
            ret = ret.squeeze(0)
        return ret


class LSTMCell(RNNCellBase):
    r"""A long short-term memory (LSTM) cell.

    .. math::

        \begin{array}{ll}
        i = \sigma(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\
        f = \sigma(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\
        g = \tanh(W_{ig} x + b_{ig} + W_{hg} h + b_{hg}) \\
        o = \sigma(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\
        c' = f * c + i * g \\
        h' = o * \tanh(c') \\
        \end{array}

    where :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product.

    Args:
        input_size: The number of expected features in the input `x`
        hidden_size: The number of features in the hidden state `h`
        bias: If ``False``, then the layer does not use bias weights `b_ih` and
            `b_hh`. Default: ``True``

    Inputs: input, (h_0, c_0)
        - **input** of shape `(batch, input_size)` or `(input_size)`: tensor containing input features
        - **h_0** of shape `(batch, hidden_size)` or `(hidden_size)`: tensor containing the initial hidden state
        - **c_0** of shape `(batch, hidden_size)` or `(hidden_size)`: tensor containing the initial cell state

          If `(h_0, c_0)` is not provided, both **h_0** and **c_0** default to zero.

    Outputs: (h_1, c_1)
        - **h_1** of shape `(batch, hidden_size)` or `(hidden_size)`: tensor containing the next hidden state
        - **c_1** of shape `(batch, hidden_size)` or `(hidden_size)`: tensor containing the next cell state

    Attributes:
        weight_ih: the learnable input-hidden weights, of shape
            `(4*hidden_size, input_size)`
        weight_hh: the learnable hidden-hidden weights, of shape
            `(4*hidden_size, hidden_size)`
        bias_ih: the learnable input-hidden bias, of shape `(4*hidden_size)`
        bias_hh: the learnable hidden-hidden bias, of shape `(4*hidden_size)`

    .. note::
        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
        where :math:`k = \frac{1}{\text{hidden\_size}}`

    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.

    Examples::

        >>> rnn = nn.LSTMCell(10, 20) # (input_size, hidden_size)
        >>> input = torch.randn(2, 3, 10) # (time_steps, batch, input_size)
        >>> hx = torch.randn(3, 20) # (batch, hidden_size)
        >>> cx = torch.randn(3, 20)
        >>> output = []
        >>> for i in range(input.size()[0]):
        ...     hx, cx = rnn(input[i], (hx, cx))
        ...     output.append(hx)
        >>> output = torch.stack(output, dim=0)
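
    A minimal unbatched sketch (1-D input, hidden, and cell states) reusing the
    cell constructed above::

        >>> x = torch.randn(10)
        >>> h, c = torch.zeros(20), torch.zeros(20)
        >>> h, c = rnn(x, (h, c))  # h: (20,), c: (20,)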
    """

    def __init__(self, input_size: int, hidden_size: int, bias: bool = True,
                 device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super(LSTMCell, self).__init__(input_size, hidden_size, bias, num_chunks=4, **factory_kwargs)

    def forward(self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = None) -> Tuple[Tensor, Tensor]:
        assert input.dim() in (1, 2), \
            f"LSTMCell: Expected input to be 1-D or 2-D but received {input.dim()}-D tensor"
        is_batched = input.dim() == 2
        if not is_batched:
            input = input.unsqueeze(0)

        if hx is None:
            zeros = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device)
            hx = (zeros, zeros)
        else:
            hx = (hx[0].unsqueeze(0), hx[1].unsqueeze(0)) if not is_batched else hx

        ret = _VF.lstm_cell(
            input, hx,
            self.weight_ih, self.weight_hh,
            self.bias_ih, self.bias_hh,
        )

        if not is_batched:
            ret = (ret[0].squeeze(0), ret[1].squeeze(0))
        return ret


class GRUCell(RNNCellBase):
    r"""A gated recurrent unit (GRU) cell

    .. math::

        \begin{array}{ll}
        r = \sigma(W_{ir} x + b_{ir} + W_{hr} h + b_{hr}) \\
        z = \sigma(W_{iz} x + b_{iz} + W_{hz} h + b_{hz}) \\
        n = \tanh(W_{in} x + b_{in} + r * (W_{hn} h + b_{hn})) \\
        h' = (1 - z) * n + z * h
        \end{array}

    where :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product.

    Args:
        input_size: The number of expected features in the input `x`
        hidden_size: The number of features in the hidden state `h`
        bias: If ``False``, then the layer does not use bias weights `b_ih` and
            `b_hh`. Default: ``True``

    Inputs: input, hidden
        - **input** : tensor containing input features
        - **hidden** : tensor containing the initial hidden
          state for each element in the batch.
          Defaults to zero if not provided.

    Outputs: h'
        - **h'** : tensor containing the next hidden state
          for each element in the batch

    Shape:
        - input: :math:`(N, H_{in})` or :math:`(H_{in})` tensor containing input features where
          :math:`H_{in}` = `input_size`.
        - hidden: :math:`(N, H_{out})` or :math:`(H_{out})` tensor containing the initial hidden
          state where :math:`H_{out}` = `hidden_size`. Defaults to zero if not provided.
        - output: :math:`(N, H_{out})` or :math:`(H_{out})` tensor containing the next hidden state.

    Attributes:
        weight_ih: the learnable input-hidden weights, of shape
            `(3*hidden_size, input_size)`
        weight_hh: the learnable hidden-hidden weights, of shape
            `(3*hidden_size, hidden_size)`
        bias_ih: the learnable input-hidden bias, of shape `(3*hidden_size)`
        bias_hh: the learnable hidden-hidden bias, of shape `(3*hidden_size)`

    .. note::
        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
        where :math:`k = \frac{1}{\text{hidden\_size}}`

    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.

    Examples::

        >>> rnn = nn.GRUCell(10, 20)
        >>> input = torch.randn(6, 3, 10)
        >>> hx = torch.randn(3, 20)
        >>> output = []
        >>> for i in range(6):
        ...     hx = rnn(input[i], hx)
        ...     output.append(hx)
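
    A minimal sketch of a single step without an explicit initial state
    (``hx`` defaults to zeros when omitted)::

        >>> x = torch.randn(3, 10)
        >>> h = rnn(x)  # h: (3, 20)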
    """

    def __init__(self, input_size: int, hidden_size: int, bias: bool = True,
                 device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super(GRUCell, self).__init__(input_size, hidden_size, bias, num_chunks=3, **factory_kwargs)

    def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor:
        assert input.dim() in (1, 2), \
            f"GRUCell: Expected input to be 1-D or 2-D but received {input.dim()}-D tensor"
        is_batched = input.dim() == 2
        if not is_batched:
            input = input.unsqueeze(0)

        if hx is None:
            hx = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device)
        else:
            hx = hx.unsqueeze(0) if not is_batched else hx

        ret = _VF.gru_cell(
            input, hx,
            self.weight_ih, self.weight_hh,
            self.bias_ih, self.bias_hh,
        )

        if not is_batched:
            ret = ret.squeeze(0)
        return ret