import torch
from torch import Tensor
from .optimizer import Optimizer, required
from typing import List, Optional


class SGD(Optimizer):
    r"""Implements stochastic gradient descent (optionally with momentum).

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma \text{ (lr)}, \: \theta_0 \text{ (params)}, \: f(\theta)
                \text{ (objective)}, \: \lambda \text{ (weight decay)},                          \\
            &\hspace{13mm} \:\mu \text{ (momentum)}, \:\tau \text{ (dampening)},
            \:\textit{ nesterov,}\:\textit{ maximize}                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\
            &\hspace{5mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})           \\
            &\hspace{5mm}\textbf{if} \: \lambda \neq 0                                           \\
            &\hspace{10mm} g_t \leftarrow g_t + \lambda  \theta_{t-1}                            \\
            &\hspace{5mm}\textbf{if} \: \mu \neq 0                                               \\
            &\hspace{10mm}\textbf{if} \: t > 1                                                   \\
            &\hspace{15mm} \textbf{b}_t \leftarrow \mu \textbf{b}_{t-1} + (1-\tau) g_t           \\
            &\hspace{10mm}\textbf{else}                                                          \\
            &\hspace{15mm} \textbf{b}_t \leftarrow g_t                                           \\
            &\hspace{10mm}\textbf{if} \: \textit{nesterov}                                       \\
            &\hspace{15mm} g_t \leftarrow g_{t} + \mu \textbf{b}_t                             \\
            &\hspace{10mm}\textbf{else}                                                   \\[-1.ex]
            &\hspace{15mm} g_t  \leftarrow  \textbf{b}_t                                         \\
            &\hspace{5mm}\textbf{if} \: \textit{maximize}                                          \\
            &\hspace{10mm}\theta_t \leftarrow \theta_{t-1} + \gamma g_t                   \\[-1.ex]
            &\hspace{5mm}\textbf{else}                                                    \\[-1.ex]
            &\hspace{10mm}\theta_t \leftarrow \theta_{t-1} - \gamma g_t                   \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    Nesterov momentum is based on the formula from
    `On the importance of initialization and momentum in deep learning`__.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float): learning rate
        momentum (float, optional): momentum factor (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        dampening (float, optional): dampening for momentum (default: 0)
        nesterov (bool, optional): enables Nesterov momentum (default: False)
        maximize (bool, optional): maximize the params based on the objective, instead of
            minimizing (default: False)
        foreach (bool, optional): whether the foreach (multi-tensor)
            implementation of the optimizer is used (default: None)

    Example:
        >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
        >>> optimizer.zero_grad()
        >>> loss_fn(model(input), target).backward()
        >>> optimizer.step()

    __ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf

    .. note::
        The implementation of SGD with Momentum/Nesterov subtly differs from
        Sutskever et al. and implementations in some other frameworks.

        Considering the specific case of Momentum, the update can be written as

        .. math::
            \begin{aligned}
                v_{t+1} & = \mu * v_{t} + g_{t+1}, \\
                p_{t+1} & = p_{t} - \text{lr} * v_{t+1},
            \end{aligned}

        where :math:`p`, :math:`g`, :math:`v` and :math:`\mu` denote the
        parameters, gradient, velocity, and momentum respectively.

        This is in contrast to Sutskever et al. and
        other frameworks which employ an update of the form

        .. math::
            \begin{aligned}
                v_{t+1} & = \mu * v_{t} + \text{lr} * g_{t+1}, \\
                p_{t+1} & = p_{t} - v_{t+1}.
            \end{aligned}

        The Nesterov version is analogously modified.
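
        For a constant learning rate the two formulations produce the same
        iterates: starting from :math:`v_0 = 0`, an induction over :math:`t`
        shows that the velocity of the second form is exactly :math:`\text{lr}`
        times the velocity of the first, so the parameter updates coincide.
        The behaviours only diverge when the learning rate changes during
        training, because this implementation rescales the entire accumulated
        velocity by the current learning rate, whereas the Sutskever form only
        scales the newly added gradient term.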
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False, *, maximize=False,
                 foreach: Optional[bool] = None):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov,
                        maximize=maximize, foreach=foreach)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD, self).__init__(params, defaults)

    def __setstate__(self, state):
        super().__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)
            group.setdefault('maximize', False)
            group.setdefault('foreach', None)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            params_with_grad = []
            d_p_list = []
            momentum_buffer_list = []
            has_sparse_grad = False

            # Collect the parameters that received gradients, their gradients,
            # and any existing momentum buffers for this parameter group.
            for p in group['params']:
                if p.grad is not None:
                    params_with_grad.append(p)
                    d_p_list.append(p.grad)
                    if p.grad.is_sparse:
                        has_sparse_grad = True

                    state = self.state[p]
                    if 'momentum_buffer' not in state:
                        momentum_buffer_list.append(None)
                    else:
                        momentum_buffer_list.append(state['momentum_buffer'])

            sgd(params_with_grad, d_p_list, momentum_buffer_list,
                weight_decay=group['weight_decay'], momentum=group['momentum'],
                lr=group['lr'], dampening=group['dampening'],
                nesterov=group['nesterov'], maximize=group['maximize'],
                has_sparse_grad=has_sparse_grad, foreach=group['foreach'])

            # update momentum_buffers in state
            for p, momentum_buffer in zip(params_with_grad, momentum_buffer_list):
                state = self.state[p]
                state['momentum_buffer'] = momentum_buffer

        return loss


def sgd(params: List[Tensor],
        d_p_list: List[Tensor],
        momentum_buffer_list: List[Optional[Tensor]],
        has_sparse_grad: bool = None,
        foreach: bool = None,
        *,
        weight_decay: float,
        momentum: float,
        lr: float,
        dampening: float,
        nesterov: bool,
        maximize: bool):
    r"""Functional API that performs SGD algorithm computation.

    See :class:`~torch.optim.SGD` for details.
    """

    if foreach is None:
        foreach = False

    if foreach and torch.jit.is_scripting():
        raise RuntimeError('torch.jit.script not supported with foreach optimizers')

    # Dispatch to the multi-tensor (foreach) implementation when requested,
    # otherwise fall back to the single-tensor loop.
    if foreach and not torch.jit.is_scripting():
        func = _multi_tensor_sgd
    else:
        func = _single_tensor_sgd

    func(params,
         d_p_list,
         momentum_buffer_list,
         weight_decay=weight_decay,
         momentum=momentum,
         lr=lr,
         dampening=dampening,
         nesterov=nesterov,
         has_sparse_grad=has_sparse_grad,
         maximize=maximize)


def _single_tensor_sgd(params: List[Tensor],
                       d_p_list: List[Tensor],
                       momentum_buffer_list: List[Optional[Tensor]],
                       *,
                       weight_decay: float,
                       momentum: float,
                       lr: float,
                       dampening: float,
                       nesterov: bool,
                       maximize: bool,
                       has_sparse_grad: bool):

    for i, param in enumerate(params):

        d_p = d_p_list[i]
        if weight_decay != 0:
            d_p = d_p.add(param, alpha=weight_decay)

        if momentum != 0:
            buf = momentum_buffer_list[i]

            if buf is None:
                # First step for this parameter: initialize the buffer with the gradient.
                buf = torch.clone(d_p).detach()
                momentum_buffer_list[i] = buf
            else:
                buf.mul_(momentum).add_(d_p, alpha=1 - dampening)

            if nesterov:
                d_p = d_p.add(buf, alpha=momentum)
            else:
                d_p = buf

        alpha = lr if maximize else -lr
        param.add_(d_p, alpha=alpha)
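

# The multi-tensor implementation below relies on the batched ``torch._foreach_*``
# kernels, which update every parameter tensor in a single fused call rather than
# looping over tensors in Python; the intent, as with other foreach optimizers, is
# to reduce per-tensor dispatch overhead.  These kernels do not handle sparse
# gradients, which is why the function falls back to a plain per-parameter loop
# whenever ``has_sparse_grad`` is set.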
def _multi_tensor_sgd(params: List[Tensor],
                      grads: List[Tensor],
                      momentum_buffer_list: List[Optional[Tensor]],
                      *,
                      weight_decay: float,
                      momentum: float,
                      lr: float,
                      dampening: float,
                      nesterov: bool,
                      maximize: bool,
                      has_sparse_grad: bool):

    if len(params) == 0:
        return

    if has_sparse_grad is None:
        has_sparse_grad = any([grad.is_sparse for grad in grads])

    if weight_decay != 0:
        grads = torch._foreach_add(grads, params, alpha=weight_decay)

    if momentum != 0:
        bufs = []

        all_states_with_momentum_buffer = True
        for i in range(len(momentum_buffer_list)):
            if momentum_buffer_list[i] is None:
                all_states_with_momentum_buffer = False
                break
            else:
                bufs.append(momentum_buffer_list[i])

        if all_states_with_momentum_buffer:
            # Every parameter already has a buffer: update them all in one fused call.
            torch._foreach_mul_(bufs, momentum)
            torch._foreach_add_(bufs, grads, alpha=1 - dampening)
        else:
            bufs = []
            for i in range(len(momentum_buffer_list)):
                if momentum_buffer_list[i] is None:
                    buf = momentum_buffer_list[i] = torch.clone(grads[i]).detach()
                else:
                    buf = momentum_buffer_list[i]
                    buf.mul_(momentum).add_(grads[i], alpha=1 - dampening)

                bufs.append(buf)

        if nesterov:
            torch._foreach_add_(grads, bufs, alpha=momentum)
        else:
            grads = bufs

    alpha = lr if maximize else -lr
    if not has_sparse_grad:
        torch._foreach_add_(params, grads, alpha=alpha)
    else:
        # foreach APIs don't support sparse gradients, so fall back to a loop
        for i in range(len(params)):
            params[i].add_(grads[i], alpha=alpha)
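

# A minimal sanity check of the momentum update documented in the class
# docstring, assuming a working ``torch`` install.  Run it with
# ``python -m torch.optim.sgd``; invoking the file directly would break the
# relative import of ``Optimizer`` above.  With lr=0.1, momentum=0.9 and the
# objective 0.5 * p**2 (so the gradient equals p), the expected trajectory for
# a scalar parameter starting at 1.0 is:
#     step 1: b_1 = g_1 = 1.0,              p_1 = 1.0 - 0.1 * 1.0 = 0.9
#     step 2: b_2 = 0.9 * 1.0 + 0.9 = 1.8,  p_2 = 0.9 - 0.1 * 1.8 = 0.72
if __name__ == "__main__":
    p = torch.tensor([1.0], requires_grad=True)
    opt = SGD([p], lr=0.1, momentum=0.9)

    (0.5 * p ** 2).sum().backward()
    opt.step()
    assert torch.allclose(p, torch.tensor([0.9]))

    opt.zero_grad()
    (0.5 * p ** 2).sum().backward()
    opt.step()
    assert torch.allclose(p, torch.tensor([0.72]))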
*
$
