import math
import torch
from torch import Tensor
from .optimizer import Optimizer
from typing import List, Optional


class AdamW(Optimizer):
    r"""Implements AdamW algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma \text{(lr)}, \: \beta_1, \beta_2
                \text{(betas)}, \: \theta_0 \text{(params)}, \: f(\theta) \text{(objective)},
                \: \epsilon \text{ (epsilon)}                                                    \\
            &\hspace{13mm}      \lambda \text{(weight decay)},  \: \textit{amsgrad},
                \: \textit{maximize}                                                             \\
            &\textbf{initialize} : m_0 \leftarrow 0 \text{ (first moment)}, v_0 \leftarrow 0
                \text{ (second moment)}, \: \widehat{v_0}^{max}\leftarrow 0               \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\

            &\hspace{5mm}\textbf{if} \: \textit{maximize}:                                       \\
            &\hspace{10mm}g_t           \leftarrow   -\nabla_{\theta} f_t (\theta_{t-1})          \\
            &\hspace{5mm}\textbf{else}                                                           \\
            &\hspace{10mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})           \\
            &\hspace{5mm} \theta_t \leftarrow \theta_{t-1} - \gamma \lambda \theta_{t-1}         \\
            &\hspace{5mm}m_t           \leftarrow   \beta_1 m_{t-1} + (1 - \beta_1) g_t          \\
            &\hspace{5mm}v_t           \leftarrow   \beta_2 v_{t-1} + (1-\beta_2) g^2_t          \\
            &\hspace{5mm}\widehat{m_t} \leftarrow   m_t/\big(1-\beta_1^t \big)                   \\
            &\hspace{5mm}\widehat{v_t} \leftarrow   v_t/\big(1-\beta_2^t \big)                   \\
            &\hspace{5mm}\textbf{if} \: amsgrad                                                  \\
            &\hspace{10mm}\widehat{v_t}^{max} \leftarrow \mathrm{max}(\widehat{v_t}^{max},
                \widehat{v_t})                                                                   \\
            &\hspace{10mm}\theta_t \leftarrow \theta_t - \gamma \widehat{m_t}/
                \big(\sqrt{\widehat{v_t}^{max}} + \epsilon \big)                                 \\
            &\hspace{5mm}\textbf{else}                                                           \\
            &\hspace{10mm}\theta_t \leftarrow \theta_t - \gamma \widehat{m_t}/
                \big(\sqrt{\widehat{v_t}} + \epsilon \big)                                       \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}
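
    In code, the decoupled weight decay above multiplies the parameter by
    :math:`1 - \gamma\lambda` *before* the Adam-style update. A rough sketch of a
    single non-AMSGrad update for one tensor, where ``param``, ``grad``,
    ``exp_avg`` and ``exp_avg_sq`` are assumed to be existing tensors and ``lr``,
    ``weight_decay``, ``beta1``, ``beta2``, ``eps`` and ``step`` plain Python
    numbers (illustration only, not the exact implementation below)::

        param.mul_(1 - lr * weight_decay)                              # decoupled weight decay
        exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)                # first moment
        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)   # second moment
        denom = (exp_avg_sq / (1 - beta2 ** step)).sqrt_().add_(eps)   # sqrt(v_hat) + eps
        param.addcdiv_(exp_avg, denom, value=-lr / (1 - beta1 ** step))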

    For further details regarding the algorithm we refer to `Decoupled Weight Decay Regularization`_.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay coefficient (default: 1e-2)
        amsgrad (bool, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)
        maximize (bool, optional): maximize the params based on the objective, instead of
            minimizing (default: False)
        foreach (bool, optional): whether the foreach (multi-tensor) implementation
            of the optimizer is used (default: None)
        capturable (bool, optional): whether this instance is safe to capture in a CUDA graph.
            Passing True can impair ungraphed performance, so if you don't intend to
            graph capture this instance, leave it False (default: False)

    .. _Decoupled Weight Decay Regularization:
        https://arxiv.org/abs/1711.05101
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
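
    Example:
        A minimal usage sketch; ``model``, ``loss_fn``, ``input`` and ``target``
        are placeholder names rather than objects provided by this module::

            optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)

            def closure():
                # The closure re-evaluates the model and returns the loss,
                # which is what ``step(closure)`` expects.
                optimizer.zero_grad()
                loss = loss_fn(model(input), target)
                loss.backward()
                return loss

            optimizer.step(closure)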
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=1e-2, amsgrad=False, *, maximize: bool = False,
                 foreach: Optional[bool] = None, capturable: bool = False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad,
                        foreach=foreach, maximize=maximize, capturable=capturable)
        super(AdamW, self).__init__(params, defaults)

    def __setstate__(self, state):
        super().__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)
            group.setdefault('maximize', False)
            group.setdefault('foreach', None)
            group.setdefault('capturable', False)
        # Older checkpoints stored 'step' as a plain number; convert it to a tensor
        # so the capturable/foreach code paths can rely on tensor steps.
        state_values = list(self.state.values())
        step_is_tensor = (len(state_values) != 0) and torch.is_tensor(state_values[0]['step'])
        if not step_is_tensor:
            for s in state_values:
                s['step'] = torch.tensor(float(s['step']))
zAdamW.__setstate__c                 C   sÂ  |   ¡  d}|durBt ¡  |ƒ }W d  ƒ n1 s80    Y  | jD ]r}g }g }g }g }g }g }	|d }
|d \}}|d D ]ú}|jdu r’q‚| |¡ |jjr¬tdƒ‚| |j¡ | j| }t	|ƒdkr>| j
d rîtjdtj|jd	nt d
¡|d< tj|tjd|d< tj|tjd|d< |
r>tj|tjd|d< | |d ¡ | |d ¡ |
rn| |d ¡ |	 |d ¡ q‚t||||||	|
|||d |d |d |d |d |d d qH|S )z±Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr   r   r   z'AdamW does not support sparse gradientsr   r   ©r   )ÚdtypeÚdevicer   r!   )Úmemory_formatÚexp_avgÚ
exp_avg_sqÚmax_exp_avg_sqr   r   r   r   r   )	r   Úbeta1Úbeta2r   r   r   r   r   r   )Ú _cuda_graph_capture_health_checkr)   Úenable_gradr#   ÚgradÚappendÚ	is_sparseÚRuntimeErrorr&   r(   r   Úzerosr,   r3   r+   Ú
zeros_likeÚpreserve_formatÚadamw)r   ÚclosureÚlossr-   Úparams_with_gradÚgradsÚexp_avgsÚexp_avg_sqsÚmax_exp_avg_sqsÚstate_stepsr   r8   r9   Úpr&   r   r   r    r!   j   sj    
$


ÿÿòz
AdamW.step)r   r	   r
   r   F)N)Ú__name__Ú
__module__Ú__qualname__Ú__doc__Úboolr   r   r"   r)   Úno_gradr!   Ú__classcell__r   r   r   r    r      s   A  ÿýýr   F)r   rG   rH   rI   rJ   rK   r   r   r   r8   r9   r   r   r   r   c                C   s|   t dd„ |D ƒƒstdƒ‚|du r&d}|r<tj ¡ r<tdƒ‚|rPtj ¡ sPt}nt}|| |||||||	|


def adamw(params: List[Tensor], grads: List[Tensor], exp_avgs: List[Tensor],
          exp_avg_sqs: List[Tensor], max_exp_avg_sqs: List[Tensor], state_steps: List[Tensor],
          # foreach and capturable carry defaults but are not keyword-only, since
          # TorchScript does not support keyword-only arguments with defaults.
          foreach: bool = None,
          capturable: bool = False,
          *,
          amsgrad: bool, beta1: float, beta2: float, lr: float,
          weight_decay: float, eps: float, maximize: bool):
    r"""Functional API that performs AdamW algorithm computation.

    See :class:`~torch.optim.AdamW` for details.
    """
    if not all([isinstance(t, torch.Tensor) for t in state_steps]):
        raise RuntimeError("API has changed, `state_steps` argument must contain a list of singleton tensors")

    if foreach is None:
        foreach = False

    if foreach and torch.jit.is_scripting():
        raise RuntimeError('torch.jit.script not supported with foreach optimizers')

    if foreach and not torch.jit.is_scripting():
        func = _multi_tensor_adamw
    else:
        func = _single_tensor_adamw

    func(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps,
         amsgrad=amsgrad, beta1=beta1, beta2=beta2, lr=lr, weight_decay=weight_decay,
         eps=eps, maximize=maximize, capturable=capturable)


def _single_tensor_adamw(params: List[Tensor], grads: List[Tensor], exp_avgs: List[Tensor],
                         exp_avg_sqs: List[Tensor], max_exp_avg_sqs: List[Tensor],
                         state_steps: List[Tensor], *,
                         amsgrad: bool, beta1: float, beta2: float, lr: float,
                         weight_decay: float, eps: float, maximize: bool, capturable: bool):

    for i, param in enumerate(params):
        grad = grads[i] if not maximize else -grads[i]
        exp_avg = exp_avgs[i]
        exp_avg_sq = exp_avg_sqs[i]
        step_t = state_steps[i]

        if capturable:
            assert param.is_cuda and step_t.is_cuda, \
                "If capturable=True, params and state_steps must be CUDA tensors."

        # Update step count.
        step_t += 1

        # Perform decoupled (stepweight) decay.
        param.mul_(1 - lr * weight_decay)

        # Decay the first and second moment running average coefficients.
        exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

        if capturable:
            step = step_t

            # Keep the bias corrections as tensors so the math stays CUDA-graph capturable.
            bias_correction1 = 1 - torch.pow(beta1, step)
            bias_correction2 = 1 - torch.pow(beta2, step)

            step_size = lr / bias_correction1
            step_size_neg = step_size.neg()

            bias_correction2_sqrt = bias_correction2.sqrt()

            if amsgrad:
                # Maintain the maximum of all second moment running averages so far.
                torch.maximum(max_exp_avg_sqs[i], exp_avg_sq, out=max_exp_avg_sqs[i])
                # Fold the (1-element) step_size math into the denominator to avoid an
                # extra parameter-sized read/write.
                denom = (max_exp_avg_sqs[i].sqrt() / (bias_correction2_sqrt * step_size_neg)).add_(eps / step_size_neg)
            else:
                denom = (exp_avg_sq.sqrt() / (bias_correction2_sqrt * step_size_neg)).add_(eps / step_size_neg)

            param.addcdiv_(exp_avg, denom)
        else:
            step = step_t.item()

            bias_correction1 = 1 - beta1 ** step
            bias_correction2 = 1 - beta2 ** step

            step_size = lr / bias_correction1
            bias_correction2_sqrt = math.sqrt(bias_correction2)

            if amsgrad:
                # Maintain the maximum of all second moment running averages so far.
                torch.maximum(max_exp_avg_sqs[i], exp_avg_sq, out=max_exp_avg_sqs[i])
                denom = (max_exp_avg_sqs[i].sqrt() / bias_correction2_sqrt).add_(eps)
            else:
                denom = (exp_avg_sq.sqrt() / bias_correction2_sqrt).add_(eps)

            param.addcdiv_(exp_avg, denom, value=-step_size)


def _multi_tensor_adamw(params: List[Tensor], grads: List[Tensor], exp_avgs: List[Tensor],
                        exp_avg_sqs: List[Tensor], max_exp_avg_sqs: List[Tensor],
                        state_steps: List[Tensor], *,
                        amsgrad: bool, beta1: float, beta2: float, lr: float,
                        weight_decay: float, eps: float, maximize: bool, capturable: bool):
    if len(params) == 0:
        return

    if capturable:
        assert all(p.is_cuda and step.is_cuda for p, step in zip(params, state_steps)), \
            "If capturable=True, params and state_steps must be CUDA tensors."

    if maximize:
        grads = torch._foreach_neg(tuple(grads))  # type: ignore[assignment]

    # Update step counts.
    torch._foreach_add_(state_steps, 1)

    # Perform decoupled (stepweight) decay.
    torch._foreach_mul_(params, 1 - lr * weight_decay)

    # Decay the first and second moment running average coefficients.
    torch._foreach_mul_(exp_avgs, beta1)
    torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1)

    torch._foreach_mul_(exp_avg_sqs, beta2)
    torch._foreach_addcmul_(exp_avg_sqs, grads, grads, 1 - beta2)

    if capturable:
        # Keep everything as tensors so the computation stays CUDA-graph capturable.
        bias_correction1 = [torch.pow(beta1, step) for step in state_steps]
        bias_correction2 = [torch.pow(beta2, step) for step in state_steps]
        # foreach_sub/div do not accept a scalar as the first argument, hence the dance below.
        torch._foreach_sub_(bias_correction1, 1)
        torch._foreach_sub_(bias_correction2, 1)
        torch._foreach_neg_(bias_correction1)
        torch._foreach_neg_(bias_correction2)

        step_size = torch._foreach_div(bias_correction1, lr)
        torch._foreach_reciprocal_(step_size)
        torch._foreach_neg_(step_size)

        bias_correction2_sqrt = torch._foreach_sqrt(bias_correction2)

        if amsgrad:
            # Maintain the maximum of all second moment running averages so far.
            max_exp_avg_sqs = torch._foreach_maximum(max_exp_avg_sqs, exp_avg_sqs)  # type: ignore[assignment]
            max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs)
            # Fold the (1-element) step_size math into the denominator to avoid an
            # extra parameter-sized read/write.
            torch._foreach_div_(max_exp_avg_sq_sqrt, torch._foreach_mul(bias_correction2_sqrt, step_size))
            eps_over_step_size = torch._foreach_div(step_size, eps)
            torch._foreach_reciprocal_(eps_over_step_size)
            denom = torch._foreach_add(max_exp_avg_sq_sqrt, eps_over_step_size)
        else:
            exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs)
            torch._foreach_div_(exp_avg_sq_sqrt, torch._foreach_mul(bias_correction2_sqrt, step_size))
            eps_over_step_size = torch._foreach_div(step_size, eps)
            torch._foreach_reciprocal_(eps_over_step_size)
            denom = torch._foreach_add(exp_avg_sq_sqrt, eps_over_step_size)

        torch._foreach_addcdiv_(params, exp_avgs, denom)
    else:
        bias_correction1 = [1 - beta1 ** step.item() for step in state_steps]
        bias_correction2 = [1 - beta2 ** step.item() for step in state_steps]

        step_size = [(lr / bc) * -1 for bc in bias_correction1]
        bias_correction2_sqrt = [math.sqrt(bc) for bc in bias_correction2]

        if amsgrad:
            # Maintain the maximum of all second moment running averages so far.
            max_exp_avg_sqs = torch._foreach_maximum(max_exp_avg_sqs, exp_avg_sqs)
            max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs)
            torch._foreach_div_(max_exp_avg_sq_sqrt, bias_correction2_sqrt)
            denom = torch._foreach_add(max_exp_avg_sq_sqrt, eps)
        else:
            exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs)
            torch._foreach_div_(exp_avg_sq_sqrt, bias_correction2_sqrt)
            denom = torch._foreach_add(exp_avg_sq_sqrt, eps)

        torch._foreach_addcdiv_(params, exp_avgs, denom, step_size)