a
    8Sic.3                     @   s  d dl Z d dlZd dlmZ ddlmZ d dlmZmZ G dd deZdee ee ee ee ee ee e	e
e
e
e
e
e
dd	d
Zee ee ee ee ee ee e
e
e
e
e
e
dddZee ee ee ee ee ee e
e
e
e
e
e
dddZdS )    N)Tensor   )	Optimizer)ListOptionalc                       sJ   e Zd ZdZdee d fd	d
Z fddZe	 dddZ
  ZS )NAdama  Implements NAdam algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma_t \text{ (lr)}, \: \beta_1,\beta_2 \text{ (betas)},
                \: \theta_0 \text{ (params)}, \: f(\theta) \text{ (objective)}                   \\
            &\hspace{13mm} \: \lambda \text{ (weight decay)}, \:\psi \text{ (momentum decay)}    \\
            &\textbf{initialize} :  m_0 \leftarrow 0 \text{ ( first moment)},
                v_0 \leftarrow 0 \text{ ( second moment)}                                 \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\
            &\hspace{5mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})           \\
            &\hspace{5mm}if \: \lambda \neq 0                                                    \\
            &\hspace{10mm} g_t \leftarrow g_t + \lambda \theta_{t-1}                             \\
            &\hspace{5mm} \mu_t \leftarrow \beta_1 \big(1 - \frac{1}{2}  0.96^{t \psi} \big)     \\
            &\hspace{5mm} \mu_{t+1} \leftarrow \beta_1 \big(1 - \frac{1}{2} 0.96^{(t+1)\psi}\big)\\
            &\hspace{5mm}m_t           \leftarrow   \beta_1 m_{t-1} + (1 - \beta_1) g_t          \\
            &\hspace{5mm}v_t           \leftarrow   \beta_2 v_{t-1} + (1-\beta_2) g^2_t          \\
            &\hspace{5mm}\widehat{m_t} \leftarrow \mu_{t+1} m_t/(1-\prod_{i=1}^{t+1}\mu_i)\\[-1.ex]
            & \hspace{11mm} + (1-\mu_t) g_t /(1-\prod_{i=1}^{t} \mu_{i})                         \\
            &\hspace{5mm}\widehat{v_t} \leftarrow   v_t/\big(1-\beta_2^t \big)                   \\
            &\hspace{5mm}\theta_t \leftarrow \theta_{t-1} - \gamma \widehat{m_t}/
                \big(\sqrt{\widehat{v_t}} + \epsilon \big)                                       \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to `Incorporating Nesterov Momentum into Adam`_.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 2e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        momentum_decay (float, optional): momentum momentum_decay (default: 4e-3)
        foreach (bool, optional): whether foreach implementation of optimizer
            is used (default: None)

    .. _Incorporating Nesterov Momentum into Adam:
        https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ
    Mb`?g?g+?:0yE>r   Mbp?N)foreachc           	         s   d|kst d|d|ks,t d|d|d   krDdk sXn t d|d d|d   krpdk sn t d|d d|kst d	|d|kst d
|t||||||d}tt| || d S )N        zInvalid learning rate: {}zInvalid epsilon value: {}r         ?z%Invalid beta parameter at index 0: {}r   z%Invalid beta parameter at index 1: {}zInvalid weight_decay value: {}z Invalid momentum_decay value: {})lrbetasepsweight_decaymomentum_decayr   )
ValueErrorformatdictsuperr   __init__)	selfparamsr   r   r   r   r   r   defaults	__class__ M/var/www/html/django/DPS/env/lib/python3.9/site-packages/torch/optim/nadam.pyr   9   s"    zNAdam.__init__c                    s   t  | | jD ]}|dd  qt| j }t|dkoNt	|d d }|st|D ]}t
t|d |d< qXt|dkot	|d d }|s|D ]}t
|d |d< qd S )Nr   r   step
mu_product)r   __setstate__param_groups
setdefaultliststatevalueslentorch	is_tensortensorfloat)r   r&   groupstate_valuesstep_is_tensorsZmu_product_is_tensorr   r   r   r"   L   s    
zNAdam.__setstate__c                 C   sx  d}|dur:t   | }W d   n1 s00    Y  | jD ]0}g }g }g }g }g }g }	|d \}
}|d D ]}|jdurr|| |jjrtd||j | j| }t|dkrt 	d|d< t 	d|d	< t j
|t jd
|d< t j
|t jd
|d< ||d  ||d  ||d	  |	|d  qrt||||||	|
||d |d |d |d |d d q@|S )zPerforms a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr   r   z'NAdam does not support sparse gradientsr   r   r    r   r!   )memory_formatexp_avg
exp_avg_sqr   r   r   r   r   )beta1beta2r   r   r   r   r   )r)   enable_gradr#   gradappend	is_sparseRuntimeErrorr&   r(   r+   
zeros_likepreserve_formatnadam)r   closurelossr-   params_with_gradgradsexp_avgsexp_avg_sqsmu_productsstate_stepsr4   r5   pr&   r   r   r   r    Z   sV    
$


z
NAdam.step)r   r	   r
   r   r   N)N)__name__
__module____qualname____doc__r   boolr   r"   r)   no_gradr    __classcell__r   r   r   r   r      s   0  r   )r   rA   rB   rC   rD   rE   r   r4   r5   r   r   r   r   c                C   s   t dd |D stdt dd |D s4td|du r@d}|rVtj rVtd|rjtj sjt}nt}|| ||||||||	|
||d	 dS )
zpFunctional API that performs NAdam algorithm computation.

    See :class:`~torch.optim.NAdam` for details.
    c                 S   s   g | ]}t |tjqS r   
isinstancer)   r   .0tr   r   r   
<listcomp>       znadam.<locals>.<listcomp>zPAPI has changed, `state_steps` argument must contain a list of singleton tensorsc                 S   s   g | ]}t |tjqS r   rN   rP   r   r   r   rS      rT   zPAPI has changed, `mu_products` argument must contain a list of singleton tensorsNFz6torch.jit.script not supported with foreach optimizers)r4   r5   r   r   r   r   )allr:   r)   jitis_scripting_multi_tensor_nadam_single_tensor_nadam)r   rA   rB   rC   rD   rE   r   r4   r5   r   r   r   r   funcr   r   r   r=      s0    r=   )r   rA   rB   rC   rD   rE   r4   r5   r   r   r   r   c                C   sH  t | D ]8\}}|| }|| }|| }|| }|| }|d7 }| }d||  }|	dkrl|j||	d}|ddd||
     }|ddd|d |
     }||9 }|| | }||j|d| d ||j||d| d || |}|j||| d|  d|   d |j||| | d|   d qd S )Nr   r   alphar         ?Q?)value)		enumerateitemaddmul_add_addcmul_divsqrtaddcdiv_)r   rA   rB   rC   rD   rE   r4   r5   r   r   r   r   iparamr7   r2   r3   r!   step_tr    bias_correction2mumu_nextZmu_product_nextdenomr   r   r   rY      s(    &rY   c                   sP  t | dkrd S t|d  fdd|D }fdd|D } fdd|D } fdd|D }t|| |	dkrtj|| |	d t|  tj||d  d t| t|||d  t|}d	d |D }t|| t||}fd
dt||D }fddt||D }t	| ||| t	| ||| d S )Nr   r   c                    s   g | ]}d  |    qS r   ra   rQ   r    )r4   r   r   rS     rT   z'_multi_tensor_nadam.<locals>.<listcomp>c                    s   g | ]}d  |    qS rp   rq   rr   )r5   r   r   rS     rT   c                    s(   g | ] } d dd|       qS )r   r]   r^   rq   rr   r4   r   r   r   rS     rT   c                    s,   g | ]$} d dd|  d      qS )r   r]   r^   r   rq   rr   rs   r   r   rS     s   r[   c                 S   s   g | ]}t |qS r   )mathrg   )rQ   bcr   r   r   rS   "  rT   c                    s,   g | ]$\}} d |  d |    d qS r   rq   )rQ   r!   rm   r   r   r   rS   &  s   c                    s,   g | ]$\}} | d |  |   d qS rv   rq   )rQ   r!   rn   rx   r   r   rS   (  s   )
r(   r)   _foreach_add__foreach_mul__foreach_addcmul__foreach_sqrt_foreach_div__foreach_addzip_foreach_addcdiv_)r   rA   rB   rC   rD   rE   r4   r5   r   r   r   r   bias_correction1rl   musZmu_nextsexp_avg_sq_sqrtZbias_correction_sqrtro   Zstep_size_gradsZstep_size_expavgr   )r4   r5   r   r   r   rX      s8    


rX   )N)rt   r)   r   	optimizerr   typingr   r   r   rK   r,   r=   rY   rX   r   r   r   r   <module>   s^     5.