import math
import torch
from torch import Tensor
from .optimizer import Optimizer
from typing import List, Optional


class Adam(Optimizer):
    r"""Implements Adam algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma \text{ (lr)}, \beta_1, \beta_2
                \text{ (betas)},\theta_0 \text{ (params)},f(\theta) \text{ (objective)}          \\
            &\hspace{13mm}      \lambda \text{ (weight decay)},  \: \textit{amsgrad},
                \:\textit{maximize}                                                              \\
            &\textbf{initialize} :  m_0 \leftarrow 0 \text{ (first moment)},
                v_0\leftarrow 0 \text{ (second moment)},\: \widehat{v_0}^{max}\leftarrow 0\\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\

            &\hspace{5mm}\textbf{if} \: \textit{maximize}:                                       \\
            &\hspace{10mm}g_t           \leftarrow   -\nabla_{\theta} f_t (\theta_{t-1})         \\
            &\hspace{5mm}\textbf{else}                                                           \\
            &\hspace{10mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})          \\
            &\hspace{5mm}\textbf{if} \: \lambda \neq 0                                           \\
            &\hspace{10mm} g_t \leftarrow g_t + \lambda  \theta_{t-1}                            \\
            &\hspace{5mm}m_t           \leftarrow   \beta_1 m_{t-1} + (1 - \beta_1) g_t          \\
            &\hspace{5mm}v_t           \leftarrow   \beta_2 v_{t-1} + (1-\beta_2) g^2_t          \\
            &\hspace{5mm}\widehat{m_t} \leftarrow   m_t/\big(1-\beta_1^t \big)                   \\
            &\hspace{5mm}\widehat{v_t} \leftarrow   v_t/\big(1-\beta_2^t \big)                   \\
            &\hspace{5mm}\textbf{if} \: amsgrad                                                  \\
            &\hspace{10mm}\widehat{v_t}^{max} \leftarrow \mathrm{max}(\widehat{v_t}^{max},
                \widehat{v_t})                                                                   \\
            &\hspace{10mm}\theta_t \leftarrow \theta_{t-1} - \gamma \widehat{m_t}/
                \big(\sqrt{\widehat{v_t}^{max}} + \epsilon \big)                                 \\
            &\hspace{5mm}\textbf{else}                                                           \\
            &\hspace{10mm}\theta_t \leftarrow \theta_{t-1} - \gamma \widehat{m_t}/
                \big(\sqrt{\widehat{v_t}} + \epsilon \big)                                       \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to `Adam: A Method for Stochastic Optimization`_.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)
        foreach (bool, optional): whether the foreach (multi-tensor) implementation
            of the optimizer is used (default: None)
        maximize (bool, optional): maximize the objective with respect to the params,
            instead of minimizing (default: False)
        capturable (bool, optional): whether this instance is safe to capture in a CUDA graph.
            Passing True can impair ungraphed performance, so if you don't intend to
            graph capture this instance, leave it False (default: False)

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
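
    Example (a minimal usage sketch; the model and the tensors ``x`` and ``y``
    below are illustrative placeholders, not part of this module)::

        >>> model = torch.nn.Linear(10, 1)
        >>> optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
        >>> loss = torch.nn.functional.mse_loss(model(x), y)
        >>> optimizer.zero_grad()
        >>> loss.backward()
        >>> optimizer.step()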
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False, *, foreach: Optional[bool] = None,
                 maximize: bool = False, capturable: bool = False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad,
                        maximize=maximize, foreach=foreach, capturable=capturable)
        super(Adam, self).__init__(params, defaults)

    def __setstate__(self, state):
        super().__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)
            group.setdefault('maximize', False)
            group.setdefault('foreach', None)
            group.setdefault('capturable', False)
        state_values = list(self.state.values())
        step_is_tensor = (len(state_values) != 0) and torch.is_tensor(state_values[0]['step'])
        if not step_is_tensor:
            for s in state_values:
                s['step'] = torch.tensor(float(s['step']))

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        self._cuda_graph_capture_health_check()

        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            params_with_grad = []
            grads = []
            exp_avgs = []
            exp_avg_sqs = []
            max_exp_avg_sqs = []
            state_steps = []
            beta1, beta2 = group['betas']

            for p in group['params']:
                if p.grad is None:
                    continue
                params_with_grad.append(p)
                if p.grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, '
                                       'please consider SparseAdam instead')
                grads.append(p.grad)

                state = self.state[p]
                # Lazy state initialization
                if len(state) == 0:
                    state['step'] = torch.zeros((1,), dtype=torch.float, device=p.device) \
                        if self.defaults['capturable'] else torch.tensor(0.)
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    if group['amsgrad']:
                        # Maintains the max of all exp. moving averages of squared gradients
                        state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)

                exp_avgs.append(state['exp_avg'])
                exp_avg_sqs.append(state['exp_avg_sq'])
                if group['amsgrad']:
                    max_exp_avg_sqs.append(state['max_exp_avg_sq'])
                state_steps.append(state['step'])

            adam(params_with_grad, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps,
                 amsgrad=group['amsgrad'], beta1=beta1, beta2=beta2, lr=group['lr'],
                 weight_decay=group['weight_decay'], eps=group['eps'],
                 maximize=group['maximize'], foreach=group['foreach'],
                 capturable=group['capturable'])

        return loss
|||||d dS )zmFunctional API that performs Adam algorithm computation.
    See :class:`~torch.optim.Adam` for details.
    c                 S   s   g | ]}t |tjƒ‘qS r   )Ú
isinstancer(   r   )Ú.0Útr   r   r   Ú
<listcomp>Æ   ó    zadam.<locals>.<listcomp>zPAPI has changed, `state_steps` argument must contain a list of singleton tensorsNFz6torch.jit.script not supported with foreach optimizers)r   r6   r7   r   r   r   r   r   )Úallr=   r(   ÚjitÚis_scriptingÚ_multi_tensor_adamÚ_single_tensor_adam)r   rE   rF   rG   rH   rI   r   r   r   r6   r7   r   r   r   r   Úfuncr   r   r   rA   °   s0    órA   )r   rE   rF   rG   rH   rI   r   r6   r7   r   r   r   r   r   c                C   sæ  t | ƒD ]Ö\}}|s|| n||  }|| }|| }|| }|rX|jrP|jsXJ dƒ‚|d7 }|
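

# Illustrative sketch: the functional `adam` above operates on explicitly collected
# state lists rather than on an Optimizer instance.  Assuming `params`, `grads`,
# `exp_avgs`, `exp_avg_sqs` and `state_steps` have been gathered the same way
# Adam.step() gathers them, a direct call could look like:
#
#     adam(params, grads, exp_avgs, exp_avg_sqs, [], state_steps,
#          foreach=False, capturable=False, amsgrad=False, beta1=0.9, beta2=0.999,
#          lr=1e-3, weight_decay=0.0, eps=1e-8, maximize=False)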
dkrv|j||
d}| |¡j|d| d | |¡j|| ¡ d| d |rT|}dt ||¡ }dt ||¡ }|	| }| 	¡ }| 
¡ }|r,tj|| ||| d ||  
¡ ||   || ¡}n| 
¡ ||   || ¡}| ||¡ q| ¡ }d||  }d||  }|	| }t 
|¡}|r¼tj|| ||| d ||  
¡ |  |¡}n| 
¡ |  |¡}|j||| d qd S )Nú@If capturable=True, params and state_steps must be CUDA tensors.r   r   ©Úalpha)Úvalue)Úout)Ú	enumerateÚis_cudaÚaddÚmul_Úadd_Úaddcmul_Úconjr(   ÚpowÚnegÚsqrtÚmaximumÚaddcdiv_ÚitemÚmath)r   rE   rF   rG   rH   rI   r   r6   r7   r   r   r   r   r   ÚiÚparamr:   r4   r5   Ústep_tr    Úbias_correction1Úbias_correction2Ú	step_sizeZstep_size_negÚbias_correction2_sqrtÚdenomr   r   r   r[   å   sD     
r[   c                   s~  t | ƒdkrd S |r4tdd„ t| |ƒD ƒƒs4J dƒ‚|rFt t|ƒ¡}t |d¡ |
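

# Illustrative sanity check of the update rule (example values only, not exercised by
# the module): on the very first step the bias-corrected ratio m_hat / (sqrt(v_hat) + eps)
# has magnitude ~1, so each parameter element initially moves by roughly `lr`:
#
#     p, g = torch.tensor([1.0]), torch.tensor([0.5])
#     _single_tensor_adam([p], [g], [torch.zeros(1)], [torch.zeros(1)], [],
#                         [torch.tensor(0.)], amsgrad=False, beta1=0.9, beta2=0.999,
#                         lr=1e-3, weight_decay=0.0, eps=1e-8, maximize=False,
#                         capturable=False)
#     # p is now approximately 1.0 - 1e-3 = 0.999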
dkrjtj|| |
d t |ˆ ¡ tj||dˆ  d t |ˆ¡ t |||dˆ ¡ |rÎ‡ fdd„|D ƒ}‡fd	d„|D ƒ}t 	|d¡ t 	|d¡ t 
|¡ t 
|¡ t |ˆ¡}t |¡ t 
|¡ t |¡}|r~t ||¡}t |¡}t |t ||¡¡ t ||¡}t |¡ t ||¡}n@t |¡}t |t ||¡¡ t ||¡}t |¡ t ||¡}t | ||¡ n¬‡ fd
d„|D ƒ}‡fdd„|D ƒ}‡fdd„|D ƒ}dd„ |D ƒ}|rHt ||¡}t |¡}t ||¡ t ||¡}n"t |¡}t ||¡ t ||¡}t | |||¡ d S )Nr   c                 s   s   | ]\}}|j o|j V  qd S )N)rc   )rS   rJ   r    r   r   r   Ú	<genexpr>I  rV   z%_multi_tensor_adam.<locals>.<genexpr>r]   r   r^   c                    s   g | ]}t  ˆ |¡‘qS r   ©r(   ri   ©rS   r    ©r6   r   r   rU   ^  rV   z&_multi_tensor_adam.<locals>.<listcomp>c                    s   g | ]}t  ˆ |¡‘qS r   ry   rz   ©r7   r   r   rU   _  rV   c                    s   g | ]}d ˆ |  ¡   ‘qS r0   ©rn   rz   r{   r   r   rU   ‚  rV   c                    s   g | ]}d ˆ |  ¡   ‘qS r0   r}   rz   r|   r   r   rU   ƒ  rV   c                    s   g | ]}ˆ | d  ‘qS )éÿÿÿÿr   ©rS   Úbc)r   r   r   rU   …  rV   c                 S   s   g | ]}t  |¡‘qS r   )ro   rk   r   r   r   r   rU   ‡  rV   )r'   rW   Úzipr(   Ú_foreach_negÚtupleÚ_foreach_add_Ú_foreach_mul_Ú_foreach_addcmul_Z_foreach_sub_Z_foreach_neg_Ú_foreach_divZ_foreach_reciprocal_Ú_foreach_sqrtZ_foreach_maximumÚ_foreach_div_Ú_foreach_mulÚ_foreach_addZ_foreach_addcdiv_)r   rE   rF   rG   rH   rI   r   r6   r7   r   r   r   r   r   rs   rt   ru   rv   Zmax_exp_avg_sq_sqrtZeps_over_step_sizerw   Zexp_avg_sq_sqrtr   )r6   r7   r   r   rZ   6  sh    ÿ










rZ   )NF)ro   r(   r   Ú	optimizerr   Útypingr   r   r   rO   r+   rA   r[   rZ   r   r   r   r   Ú<module>   sl    1  ÷ï5òQò