a
    yfh                     @   s  d Z ddlZddlmZ ddlm  mZ ddlmZ ddl	m
Z
mZmZmZmZmZ ddlmZ dZG dd	 d	ejZG d
d dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd deZG dd dejZG d d! d!eZ G d"d# d#eZ!G d$d% d%ejZ"G d&d' d'ejZ#G d(d) d)ejZ$G d*d+ d+ejZ%G d,d- d-ejZ&G d.d/ d/ejZ'G d0d1 d1ejZ(G d2d3 d3ejZ)G d4d5 d5ejZ*G d6d7 d7ejZ+G d8d9 d9e#Z,G d:d; d;eZ-G d<d= d=ejZ.G d>d? d?e.Z/G d@dA dAejZ0G dBdC dCejZ1G dDdE dEejZ2G dFdG dGejZ3G dHdI dIejZ4G dJdK dKejZ5G dLdM dMeZ6G dNdO dOeZ7G dPdQ dQejjZ8G dRdS dSejZ9G dTdU dUeZ:G dVdW dWejZ;G dXdY dYejZ<G dZd[ d[ejZ=G d\d] d]ejZ>G d^d_ d_eZ?G d`da daejZ@dS )bzBlock modules.    N)fuse_conv_and_bn   )ConvDWConv	GhostConv	LightConvRepConvautopad)TransformerBlock)&DFLHGBlockHGStemSPPSPPFC1C2C3C2fC2fAttnImagePoolingAttnContrastiveHeadBNContrastiveHeadC3xC3TRC3GhostGhostBottleneck
BottleneckBottleneckCSPProtoRepC3ResNetLayerRepNCSPELAN4ELAN1ADownAConvSPPELANCBFuseCBLinearC3k2C2fPSAC2PSARepVGGDWCIBC2fCIB	AttentionPSASCDownc                       s*   e Zd ZdZd fdd	Zdd Z  ZS )r   z
    Integral module of Distribution Focal Loss (DFL).

    Proposed in Generalized Focal Loss https://ieeexplore.ieee.org/document/9792391
       c                    sb   t    tj|ddddd| _tj|tjd}t	|
d|dd| jjjdd< || _dS )zGInitialize a convolutional layer with a given number of input channels.r   Fbias)ZdtypeN)super__init__nnConv2dZrequires_grad_convtorchZarangefloat	Parameterviewweightdatac1)selfr?   x	__class__ X/var/www/html/django/DPS/env/lib/python3.9/site-packages/ultralytics/nn/modules/block.pyr5   >   s
    
$zDFL.__init__c                 C   s<   |j \}}}| ||d| j|ddd|d|S )zEApplies a transformer layer on input tensor 'x' and returns a tensor.      r   )shaper8   r<   r?   	transposesoftmax)r@   rA   b_arD   rD   rE   forwardF   s    zDFL.forward)r1   __name__
__module____qualname____doc__r5   rN   __classcell__rD   rD   rB   rE   r   7   s   r   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	r   z1YOLOv8 mask Proto module for segmentation models.       c                    sR   t    t||dd| _tj||ddddd| _t||dd| _t||| _dS )z
        Initializes the YOLOv8 mask Proto module with specified number of protos and masks.

        Input arguments are ch_in, number of protos, number of masks.
           krG   r   Tr2   N)	r4   r5   r   cv1r6   ZConvTranspose2dupsamplecv2cv3)r@   r?   c_c2rB   rD   rE   r5   P   s
    
zProto.__init__c              	   C   s   |  | | | |S )zFPerforms a forward pass through layers using an upsampled input image.)r]   r\   r[   rZ   r@   rA   rD   rD   rE   rN   \   s    zProto.forward)rU   rV   rO   rD   rD   rB   rE   r   M   s   r   c                       s(   e Zd ZdZ fddZdd Z  ZS )r   z
    StemBlock of PPHGNetV2 with 5 convolutions and one maxpool2d.

    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
    c                    s   t    t||ddt d| _t||d dddt d| _t|d |dddt d| _t|d |ddt d| _t||ddt d| _	tj
ddddd| _dS )	z_Initialize the SPP layer with input/output channels and specified kernel sizes for max pooling.rW   rG   actr   r   T)kernel_sizestridepaddingZ	ceil_modeN)r4   r5   r   r6   ReLUstem1stem2astem2bstem3stem4	MaxPool2dpool)r@   r?   cmr_   rB   rD   rE   r5   h   s    
zHGStem.__init__c                 C   sr   |  |}t|g d}| |}t|g d}| |}| |}tj||gdd}| |}| 	|}|S )+Forward pass of a PPHGNetV2 backbone layer.)r   r   r   r   r   dim)
rg   Fpadrh   ri   rm   r9   catrj   rk   )r@   rA   x2x1rD   rD   rE   rN   r   s    





zHGStem.forwardrO   rD   rD   rB   rE   r   a   s   
r   c                       s8   e Zd ZdZdddde f fdd	Zdd Z  ZS )	r   z
    HG_Block of PPHGNetV2 with 2 convolutions and LightConv.

    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
    rW      Fc	           	         s   t    |rtntt fddt|D | _t|  |d dd d| _t|d |dd d| _	|o~|k| _
dS )zZInitializes a CSP Bottleneck with 1 convolution using specified input and output channels.c                 3   s*   | ]"}|d krn dV  qdS )r   rY   rb   NrD   ).0irb   blockr?   rn   rY   rD   rE   	<genexpr>       z#HGBlock.__init__.<locals>.<genexpr>rG   r   ra   N)r4   r5   r   r   r6   
ModuleListrangemscecadd)	r@   r?   rn   r_   rY   nZ	lightconvshortcutrb   rB   r{   rE   r5      s    
& zHGBlock.__init__c                    sJ   |g    fdd| jD  | | t d | jrF | S  S )ro   c                 3   s   | ]}| d  V  qdS NrD   ry   r   yrD   rE   r}      r~   z"HGBlock.forward.<locals>.<genexpr>r   )extendr   r   r   r9   rt   r   r`   rD   r   rE   rN      s    zHGBlock.forward)	rP   rQ   rR   rS   r6   rf   r5   rN   rT   rD   rD   rB   rE   r      s   	r   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )r   zDSpatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729.   	      c                    sX   t    |d }t||dd| _t|t|d  |dd| _tdd |D | _dS )zMInitialize the SPP layer with input/output channels and pooling kernel sizes.rG   r   c                 S   s    g | ]}t j|d |d dqS )r   rG   rc   rd   re   )r6   rl   )ry   rA   rD   rD   rE   
<listcomp>   r~   z SPP.__init__.<locals>.<listcomp>N)	r4   r5   r   rZ   lenr\   r6   r   r   r@   r?   r_   rY   r^   rB   rD   rE   r5      s
    
zSPP.__init__c                    s2   |    | t g fdd| jD  dS )zBForward pass of the SPP layer, performing spatial pyramid pooling.c                    s   g | ]}| qS rD   rD   r   rA   rD   rE   r      r~   zSPP.forward.<locals>.<listcomp>r   )rZ   r\   r9   rt   r   r`   rD   r   rE   rN      s    
zSPP.forward)r   rO   rD   rD   rB   rE   r      s   r   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )r   zGSpatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher.r   c                    sP   t    |d }t||dd| _t|d |dd| _tj|d|d d| _dS )z
        Initializes the SPPF layer with given input/output channels and kernel size.

        This module is equivalent to SPP(k=(5, 9, 13)).
        rG   r   rF   r   N)r4   r5   r   rZ   r\   r6   rl   r   r   rB   rD   rE   r5      s
    
zSPPF.__init__c                    s<     |g fddtdD   tdS )z-Forward pass through Ghost Convolution block.c                 3   s   | ]}  d  V  qdS r   )r   ry   rL   r@   r   rD   rE   r}      r~   zSPPF.forward.<locals>.<genexpr>rW   r   )rZ   r   r   r\   r9   rt   r`   rD   r   rE   rN      s    zSPPF.forward)r   rO   rD   rD   rB   rE   r      s   r   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )r   z"CSP Bottleneck with 1 convolution.r   c                    s<   t    t| dd| _tj fddt|D  | _dS )zjInitializes the CSP Bottleneck with configurations for 1 convolution with arguments ch_in, ch_out, number.r   c                 3   s   | ]}t   d V  qdS )rW   N)r   r   r_   rD   rE   r}      r~   zC1.__init__.<locals>.<genexpr>N)r4   r5   r   rZ   r6   
Sequentialr   r   )r@   r?   r_   r   rB   r   rE   r5      s    
zC1.__init__c                 C   s   |  |}| || S )z5Applies cross-convolutions to input in the C3 module.)rZ   r   )r@   rA   r   rD   rD   rE   rN      s    
z
C1.forward)r   rO   rD   rD   rB   rE   r      s   r   c                       s*   e Zd ZdZd	 fdd	Zdd Z  ZS )
r   z#CSP Bottleneck with 2 convolutions.r   T      ?c                    sh   t    t|| _t|dj dd_tdj |d_tj fddt	|D  _
dS )zRInitializes a CSP Bottleneck with 2 convolutions and optional shortcut connection.rG   r   c              	   3   s&   | ]}t jj d ddV  qdS )rW   rW   r         ?rY   eNr   cr   gr@   r   rD   rE   r}      r~   zC2.__init__.<locals>.<genexpr>Nr4   r5   intr   r   rZ   r\   r6   r   r   r   r@   r?   r_   r   r   r   r   rB   r   rE   r5      s
    
zC2.__init__c                 C   s2   |  |dd\}}| t| ||fdS )<Forward pass through the CSP bottleneck with 2 convolutions.rG   r   )rZ   chunkr\   r9   rt   r   r@   rA   rM   rK   rD   rD   rE   rN      s    z
C2.forward)r   Tr   r   rO   rD   rD   rB   rE   r      s   	r   c                       s2   e Zd ZdZd fdd	Zdd Zd	d
 Z  ZS )r   <Faster Implementation of CSP Bottleneck with 2 convolutions.r   Fr   c                    sl   t    t|| _t|dj dd_td| j |d_t fddt	|D _
dS )z_Initializes a CSP bottleneck with 2 convolutions and n Bottleneck blocks for faster processing.rG   r   c              	   3   s&   | ]}t jj d ddV  qdS r   r   r   r   rD   rE   r}      r~   zC2f.__init__.<locals>.<genexpr>N)r4   r5   r   r   r   rZ   r\   r6   r   r   r   r   rB   r   rE   r5      s
    
zC2f.__init__c                    sB   t | |dd   fdd| jD  | t dS )Forward pass through C2f layer.rG   r   c                 3   s   | ]}| d  V  qdS r   rD   r   r   rD   rE   r}      r~   zC2f.forward.<locals>.<genexpr>)listrZ   r   r   r   r\   r9   rt   r`   rD   r   rE   rN      s    zC2f.forwardc                    sJ   t | || j| jfd   fdd| jD  | t dS ).Forward pass using split() instead of chunk().r   c                 3   s   | ]}| d  V  qdS r   rD   r   r   rD   rE   r}      r~   z$C2f.forward_split.<locals>.<genexpr>)	r   rZ   splitr   r   r   r\   r9   rt   r`   rD   r   rE   forward_split   s    zC2f.forward_split)r   Fr   r   rP   rQ   rR   rS   r5   rN   r   rT   rD   rD   rB   rE   r      s   r   c                       s*   e Zd ZdZd	 fdd	Zdd Z  ZS )
r   z#CSP Bottleneck with 3 convolutions.r   Tr   c                    sn   t    t||  t| dd| _t| dd| _td  |d| _tj fddt	|D  | _
dS )zbInitialize the CSP Bottleneck with given channels, number, shortcut, groups, and expansion values.r   rG   c              	   3   s"   | ]}t   d ddV  qdS )))r   r   r   r   r   Nr   r   r^   r   r   rD   rE   r}     r~   zC3.__init__.<locals>.<genexpr>N)r4   r5   r   r   rZ   r\   r]   r6   r   r   r   r   rB   r   rE   r5      s    
zC3.__init__c              	   C   s(   |  t| | || |fdS )r   r   )r]   r9   rt   r   rZ   r\   r`   rD   rD   rE   rN     s    z
C3.forward)r   Tr   r   rO   rD   rD   rB   rE   r      s   	r   c                       s"   e Zd ZdZd fdd	Z  ZS )r   z"C3 module with cross-convolutions.r   Tr   c                    sJ   t  ||| | t|| _tj fddt|D  _dS )z4Initialize C3TR instance and set default parameters.c              	   3   s&   | ]}t jj d ddV  qdS )))r   rW   )rW   r   r   r   N)r   r^   r   r   rD   rE   r}     r~   zC3x.__init__.<locals>.<genexpr>N)r4   r5   r   r^   r6   r   r   r   r   rB   r   rE   r5     s    zC3x.__init__)r   Tr   r   rP   rQ   rR   rS   r5   rT   rD   rD   rB   rE   r   	  s   r   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	r   zRep C3.rW   r   c                    sx   t    t||  t||dd| _t||dd| _tj fddt|D  | _	 |krjt |ddnt
 | _dS )zfInitialize CSP Bottleneck with a single convolution using input channels, output channels, and number.r   c                    s   g | ]}t   qS rD   )r   r   r^   rD   rE   r     r~   z"RepC3.__init__.<locals>.<listcomp>N)r4   r5   r   r   rZ   r\   r6   r   r   r   Identityr]   r@   r?   r_   r   r   rB   r   rE   r5     s    
zRepC3.__init__c                 C   s    |  | | || | S )z#Forward pass of RT-DETR neck layer.)r]   r   rZ   r\   r`   rD   rD   rE   rN     s    zRepC3.forward)rW   r   rO   rD   rD   rB   rE   r     s   	r   c                       s"   e Zd ZdZd fdd	Z  ZS )r   z"C3 module with TransformerBlock().r   Tr   c                    s6   t  |||||| t|| }t||d|| _dS )z1Initialize C3Ghost module with GhostBottleneck().rF   N)r4   r5   r   r
   r   )r@   r?   r_   r   r   r   r   r^   rB   rD   rE   r5   '  s    zC3TR.__init__)r   Tr   r   r   rD   rD   rB   rE   r   $  s   r   c                       s"   e Zd ZdZd fdd	Z  ZS )r   z!C3 module with GhostBottleneck().r   Tr   c                    sD   t  |||||| t||  tj fddt|D  | _dS )zOInitialize 'SPP' module with various pooling sizes for spatial pyramid pooling.c                 3   s   | ]}t   V  qd S )N)r   r   r   rD   rE   r}   5  r~   z#C3Ghost.__init__.<locals>.<genexpr>Nr4   r5   r   r6   r   r   r   r   rB   r   rE   r5   1  s    zC3Ghost.__init__)r   Tr   r   r   rD   rD   rB   rE   r   .  s   r   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	r   z9Ghost Bottleneck https://github.com/huawei-noah/ghostnet.rW   r   c                    s   t    |d }tt||dd|dkr<t||||ddnt t||dddd| _|dkrtt||||ddt||ddddnt | _	dS )zPInitializes GhostBottleneck module with arguments ch_in, ch_out, kernel, stride.rG   r   Fra   N)
r4   r5   r6   r   r   r   r   r8   r   r   )r@   r?   r_   rY   sr^   rB   rD   rE   r5   ;  s    
 6zGhostBottleneck.__init__c                 C   s   |  || | S )z:Applies skip connection and concatenation to input tensor.)r8   r   r`   rD   rD   rE   rN   H  s    zGhostBottleneck.forward)rW   r   rO   rD   rD   rB   rE   r   8  s   r   c                       s*   e Zd ZdZd
 fdd	Zdd	 Z  ZS )r   zStandard bottleneck.Tr   r   r   c                    sT   t    t|| }t|||d d| _t|||d d|d| _|oL||k| _dS )zgInitializes a standard bottleneck module with optional shortcut connection and configurable parameters.r   r   r   N)r4   r5   r   r   rZ   r\   r   r@   r?   r_   r   r   rY   r   r^   rB   rD   rE   r5   P  s
    
zBottleneck.__init__c                 C   s*   | j r|| | | S | | |S )z#Applies the YOLO FPN to input data.)r   r\   rZ   r`   rD   rD   rE   rN   X  s    zBottleneck.forward)Tr   r   r   rO   rD   rD   rB   rE   r   M  s   r   c                       s*   e Zd ZdZd	 fdd	Zdd Z  ZS )
r   zGCSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks.r   Tr   c                    s   t    t||  t| dd| _tj| dddd| _tj  dddd| _td  |dd| _	t
d  | _t | _tj fddt|D  | _dS )zfInitializes the CSP Bottleneck given arguments for ch_in, ch_out, number, shortcut, groups, expansion.r   Fr2   rG   c                 3   s    | ]}t   d dV  qdS r   r   Nr   r   r   rD   rE   r}   j  r~   z)BottleneckCSP.__init__.<locals>.<genexpr>N)r4   r5   r   r   rZ   r6   r7   r\   r]   cv4BatchNorm2dbnSiLUrb   r   r   r   r   rB   r   rE   r5   `  s    

zBottleneckCSP.__init__c              
   C   sB   |  | | |}| |}| | | t||fdS )z-Applies a CSP bottleneck with 3 convolutions.r   )	r]   r   rZ   r\   r   rb   r   r9   rt   )r@   rA   y1y2rD   rD   rE   rN   l  s    
zBottleneckCSP.forward)r   Tr   r   rO   rD   rD   rB   rE   r   ]  s   r   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	ResNetBlockz.ResNet block with standard convolution layers.r   rF   c              	      s   t    || }t||dddd| _t||d|ddd| _t||ddd| _|dks^||krvtt||d|ddnt | _	dS )	z-Initialize convolution with given parameters.r   TrY   r   rb   rW   rY   r   prb   Frx   N)
r4   r5   r   rZ   r\   r]   r6   r   r   r   )r@   r?   r_   r   r   c3rB   rD   rE   r5   v  s    
zResNetBlock.__init__c              	   C   s&   t | | | || | S )z&Forward pass through the ResNet block.)rr   Zrelur]   r\   rZ   r   r`   rD   rD   rE   rN     s    zResNetBlock.forward)r   rF   rO   rD   rD   rB   rE   r   s  s   	r   c                       s*   e Zd ZdZd	 fdd	Zdd Z  ZS )
r    z)ResNet layer with multiple ResNet blocks.r   FrF   c              
      s   t    || _| jrBtt| dddddtjdddd| _n@t| |dg}|	 fd	d
t
|d D  tj| | _dS )z,Initializes the ResNetLayer given arguments.   rG   rW   Tr   r   r   r   c                    s    g | ]}t    d dqS )r   r   )r   r   r_   r   rD   rE   r     r~   z(ResNetLayer.__init__.<locals>.<listcomp>N)r4   r5   is_firstr6   r   r   rl   layerr   r   r   )r@   r?   r_   r   r   r   r   blocksrB   r   rE   r5     s    
 "zResNetLayer.__init__c                 C   s
   |  |S )z&Forward pass through the ResNet layer.)r   r`   rD   rD   rE   rN     s    zResNetLayer.forward)r   Fr   rF   rO   rD   rD   rB   rE   r      s   r    c                       s*   e Zd ZdZd
 fdd	Zdd	 Z  ZS )MaxSigmoidAttnBlockzMax Sigmoid attention block.r         Fc                    s   t    || _|| | _||kr2t||dddnd| _t||| _t	t
|| _t||dddd| _|rt	t
d|ddnd| _dS )z9Initializes MaxSigmoidAttnBlock with specified arguments.r   Frx   NrW   r   r   )r4   r5   nhhcr   r   r6   Linearglr;   r9   Zzerosr3   	proj_convonesscale)r@   r?   r_   r   r   gcr   rB   rD   rE   r5     s    

zMaxSigmoidAttnBlock.__init__c           	      C   s   |j \}}}}| |}||d| j| j}| jdur@| |n|}||| j| j||}td||}|jddd }|| jd  }|| j	dddddf  }|
 | j }| |}||| jd||}||d }||d||S )zForward process.r   Nzbmchw,bnmc->bmhwnrp   r   r   rG   )rH   r   r<   r   r   r   r9   einsummaxr3   Zsigmoidr   r   Z	unsqueeze)	r@   rA   guidebsrL   hwZembedawrD   rD   rE   rN     s    

zMaxSigmoidAttnBlock.forward)r   r   r   FrO   rD   rD   rB   rE   r     s   r   c                       s2   e Zd ZdZd fdd	Zd	d
 Zdd Z  ZS )r   z*C2f module with an additional attn module.r   r   r   Fr   c
           
         s   t    t||	 _t|dj dd_td| j |d_t fddt	|D _
tjj|||d_dS )z_Initializes C2f module with attention mechanism for enhanced feature extraction and processing.rG   r   rW   c              	   3   s&   | ]}t jj d ddV  qdS r   r   r   r   rD   rE   r}     r~   z#C2fAttn.__init__.<locals>.<genexpr>)r   r   r   N)r4   r5   r   r   r   rZ   r\   r6   r   r   r   r   attn)
r@   r?   r_   r   r   r   r   r   r   r   rB   r   rE   r5     s    
"zC2fAttn.__init__c                    sX   t | |dd   fdd| jD   |  d | | t	 dS )r   rG   r   c                 3   s   | ]}| d  V  qdS r   rD   r   r   rD   rE   r}     r~   z"C2fAttn.forward.<locals>.<genexpr>r   )
r   rZ   r   r   r   appendr   r\   r9   rt   r@   rA   r   rD   r   rE   rN     s    zC2fAttn.forwardc                    s`   t | || j| jfd   fdd| jD   |  d | | t	
 dS )r   r   c                 3   s   | ]}| d  V  qdS r   rD   r   r   rD   rE   r}     r~   z(C2fAttn.forward_split.<locals>.<genexpr>r   )r   rZ   r   r   r   r   r   r   r\   r9   rt   r   rD   r   rE   r     s    zC2fAttn.forward_split)r   r   r   r   Fr   r   r   rD   rD   rB   rE   r     s   	r   c                       s*   e Zd ZdZd fdd		Zd
d Z  ZS )r   zKImagePoolingAttn: Enhance the text embeddings with image-aware information.rU   rD   r      rW   Fc                    s   t    t|}tt|t| | _tt t  | _tt t  | _	t || _
|rtjtdgddnd| _t fdd|D | _tfddt|D | _ | _|| _|| _ | | _| _dS )	z6Initializes ImagePoolingAttn with specified arguments.g        T)Zrequires_gradr   c                    s   g | ]}t j| d dqS )r   )rc   )r6   r7   )ry   Zin_channels)r   rD   rE   r     r~   z-ImagePoolingAttn.__init__.<locals>.<listcomp>c                    s   g | ]}t   fqS rD   )r6   ZAdaptiveMaxPool2dr   rX   rD   rE   r     r~   N)r4   r5   r   r6   r   Z	LayerNormr   querykeyvalueprojr;   r9   tensorr   r   projectionsr   im_poolsr   r   nfr   rY   )r@   r   chctr   rY   r   r   rB   )r   rY   rE   r5     s    
 
zImagePoolingAttn.__init__c                    s  |d j d  t|| jks J | jd  fddt|| j| jD }tj|dd	dd}| 
|}| |}| |}| d| j| j}| d| j| j}| d| j| j}td||}|| jd	  }tj|dd}td
||}| | d| j}|| j | S )z@Executes attention mechanism on input tensor x and guide tensor.r   rG   c                    s(   g | ] \}}}|||  d qS )r   )r<   )ry   rA   r   rm   r   Znum_patchesrD   rE   r     r~   z,ImagePoolingAttn.forward.<locals>.<listcomp>r   rp   r   zbnmc,bkmc->bmnkr   zbmnk,bkmc->bnmc)rH   r   r   rY   zipr   r   r9   rt   rI   r   r   r   reshaper   r   r   rr   rJ   r   r   r   )r@   rA   textqrY   vr   rD   r   rE   rN     s"    
 


zImagePoolingAttn.forward)rU   rD   r   r   rW   FrO   rD   rD   rB   rE   r     s   r   c                       s(   e Zd ZdZ fddZdd Z  ZS )r   zZImplements contrastive learning head for region-text similarity in vision-language models.c                    sB   t    ttdg| _ttg td  | _	dS )zMInitializes ContrastiveHead with specified region-text similarity parameters.      $g$I$I,@N)
r4   r5   r6   r;   r9   r   r3   r   loglogit_scaler@   rB   rD   rE   r5   	  s    
zContrastiveHead.__init__c                 C   sB   t j|ddd}t j|ddd}td||}|| j  | j S ))Forward function of contrastive learning.r   rG   rq   r   r   bchw,bkc->bkhw)rr   	normalizer9   r   r   expr3   r@   rA   r   rD   rD   rE   rN     s    zContrastiveHead.forwardrO   rD   rD   rB   rE   r     s   r   c                       s.   e Zd ZdZed fddZdd Z  ZS )r   z
    Batch Norm Contrastive Head for YOLO-World using batch norm instead of l2-normalization.

    Args:
        embed_dims (int): Embed dimensions of text and image features.
    )
embed_dimsc                    sD   t    t|| _ttdg| _tdt	g  | _
dS )zBInitialize ContrastiveHead with region-text similarity parameters.r   g      N)r4   r5   r6   r   normr;   r9   r   r3   r   r   )r@   r   rB   rD   rE   r5      s    
zBNContrastiveHead.__init__c                 C   s<   |  |}tj|ddd}td||}|| j  | j S )r   r   rG   r   r   )r   rr   r   r9   r   r   r   r3   r   rD   rD   rE   rN   )  s    
zBNContrastiveHead.forward)rP   rQ   rR   rS   r   r5   rN   rT   rD   rD   rB   rE   r     s   	r   c                       s"   e Zd ZdZd fdd	Z  ZS )	RepBottleneckzRep bottleneck.Tr   r   r   c                    s:   t  |||||| t|| }t|||d d| _dS )zfInitializes a RepBottleneck module with customizable in/out channels, shortcuts, groups and expansion.r   r   N)r4   r5   r   r   rZ   r   rB   rD   rE   r5   4  s    zRepBottleneck.__init__)Tr   r   r   r   rD   rD   rB   rE   r   1  s   r   c                       s"   e Zd ZdZd fdd	Z  ZS )RepCSPzXRepeatable Cross Stage Partial Network (RepCSP) module for efficient feature extraction.r   Tr   c                    sH   t  |||| t||  tj fddt|D  | _dS )z`Initializes RepCSP layer with given channels, repetitions, shortcut, groups and expansion ratio.c                 3   s    | ]}t   d dV  qdS r   )r   r   r   rD   rE   r}   B  r~   z"RepCSP.__init__.<locals>.<genexpr>Nr   r   rB   r   rE   r5   >  s    zRepCSP.__init__)r   Tr   r   r   rD   rD   rB   rE   r   ;  s   r   c                       s2   e Zd ZdZd	 fdd	Zdd Zdd Z  ZS )
r!   z	CSP-ELAN.r   c                    s   t    |d | _t||dd| _tt|d ||t||dd| _tt|||t||dd| _	t|d|  |dd| _
dS )zWInitializes CSP-ELAN layer with specified channel sizes, repetitions, and convolutions.rG   r   rW   N)r4   r5   r   r   rZ   r6   r   r   r\   r]   r   )r@   r?   r_   r   c4r   rB   rD   rE   r5   H  s    

$ zRepNCSPELAN4.__init__c                    sH   t | |dd   fdd| j| jfD  | t dS )z(Forward pass through RepNCSPELAN4 layer.rG   r   c                 3   s   | ]}| d  V  qdS r   rD   r   r   rD   rE   r}   T  r~   z'RepNCSPELAN4.forward.<locals>.<genexpr>)	r   rZ   r   r   r\   r]   r   r9   rt   r`   rD   r   rE   rN   Q  s     zRepNCSPELAN4.forwardc                    sP   t | || j| jfd   fdd| j| jfD  | t	 dS )r   r   c                 3   s   | ]}| d  V  qdS r   rD   r   r   rD   rE   r}   Z  r~   z-RepNCSPELAN4.forward_split.<locals>.<genexpr>)
r   rZ   r   r   r   r\   r]   r   r9   rt   r`   rD   r   rE   r   W  s     zRepNCSPELAN4.forward_split)r   r   rD   rD   rB   rE   r!   E  s   	r!   c                       s    e Zd ZdZ fddZ  ZS )r"   z!ELAN1 module with 4 convolutions.c                    sl   t  |||| |d | _t||dd| _t|d |dd| _t||dd| _t|d|  |dd| _dS )z5Initializes ELAN1 layer with specified channel sizes.rG   r   rW   N)r4   r5   r   r   rZ   r\   r]   r   )r@   r?   r_   r   r  rB   rD   rE   r5   a  s    
zELAN1.__init__r   rD   rD   rB   rE   r"   ^  s   r"   c                       s(   e Zd ZdZ fddZdd Z  ZS )r$   zAConv.c                    s    t    t||ddd| _dS )z1Initializes AConv module with convolution layers.rW   rG   r   N)r4   r5   r   rZ   r@   r?   r_   rB   rD   rE   r5   n  s    
zAConv.__init__c                 C   s"   t jj|ddddd}| |S )z!Forward pass through AConv layer.rG   r   r   FT)r9   r6   
functional
avg_pool2drZ   r`   rD   rD   rE   rN   s  s    zAConv.forwardrO   rD   rD   rB   rE   r$   k  s   r$   c                       s(   e Zd ZdZ fddZdd Z  ZS )r#   zADown.c                    sH   t    |d | _t|d | jddd| _t|d | jddd| _dS )z\Initializes ADown module with convolution layers to downsample input from channels c1 to c2.rG   rW   r   r   N)r4   r5   r   r   rZ   r\   r  rB   rD   rE   r5   |  s    

zADown.__init__c                 C   s`   t jj|ddddd}|dd\}}| |}t jj|ddd}| |}t ||fdS )z!Forward pass through ADown layer.rG   r   r   FTrW   )	r9   r6   r  r  r   rZ   Z
max_pool2dr\   rt   )r@   rA   rv   ru   rD   rD   rE   rN     s    

zADown.forwardrO   rD   rD   rB   rE   r#   y  s   r#   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )r%   z	SPP-ELAN.r   c                    sz   t    || _t||dd| _tj|d|d d| _tj|d|d d| _tj|d|d d| _	td| |dd| _
dS )z_Initializes SPP-ELAN block with convolution and max pooling layers for spatial pyramid pooling.r   rG   r   rF   N)r4   r5   r   r   rZ   r6   rl   r\   r]   r   cv5)r@   r?   r_   r   rY   rB   rD   rE   r5     s    
zSPPELAN.__init__c                    sB   |  |g   fdd| j| j| jfD  | t dS )z#Forward pass through SPPELAN layer.c                 3   s   | ]}| d  V  qdS r   rD   r   r   rD   rE   r}     r~   z"SPPELAN.forward.<locals>.<genexpr>r   )rZ   r   r\   r]   r   r  r9   rt   r`   rD   r   rE   rN     s    $zSPPELAN.forward)r   rO   rD   rD   rB   rE   r%     s   
r%   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	r'   z	CBLinear.r   Nc              	      s8   t    || _tj|t|||t|||dd| _dS )z:Initializes the CBLinear module, passing inputs unchanged.T)groupsr3   N)r4   r5   c2sr6   r7   sumr	   r8   )r@   r?   r  rY   r   r   r   rB   rD   rE   r5     s    
zCBLinear.__init__c                 C   s   |  |j| jddS )z$Forward pass through CBLinear layer.r   rp   )r8   r   r  r`   rD   rD   rE   rN     s    zCBLinear.forward)r   r   Nr   rO   rD   rD   rB   rE   r'     s   r'   c                       s(   e Zd ZdZ fddZdd Z  ZS )r&   zCBFuse.c                    s   t    || _dS )zHInitializes CBFuse module with layer index for selective feature fusion.N)r4   r5   idx)r@   r	  rB   rD   rE   r5     s    
zCBFuse.__init__c                    sR   |d j dd  fddt|dd D }tjt||dd  ddS )z"Forward pass through CBFuse layer.r   rG   Nc                    s*   g | ]"\}}t j| j|  d dqS )Znearest)sizemode)rr   Zinterpolater	  )ry   rz   rA   r@   Ztarget_sizerD   rE   r     r~   z"CBFuse.forward.<locals>.<listcomp>r   rp   )rH   	enumerater9   r  stack)r@   ZxsresrD   r  rE   rN     s     zCBFuse.forwardrO   rD   rD   rB   rE   r&     s   r&   c                       s*   e Zd ZdZd	 fdd	Zdd Z  ZS )
C3fr   r   Fr   c                    sr   t    t||  t| dd| _t| dd| _td|   |d| _t fddt	|D | _
dS )zInitialize CSP bottleneck layer with two convolutions with arguments ch_in, ch_out, number, shortcut, groups,
        expansion.
        r   rG   c              	   3   s"   | ]}t   d ddV  qdS r   r   r   r   rD   rE   r}     r~   zC3f.__init__.<locals>.<genexpr>N)r4   r5   r   r   rZ   r\   r]   r6   r   r   r   r   rB   r   rE   r5     s    
zC3f.__init__c                    s@   |  || |g   fdd| jD  | t dS )r   c                 3   s   | ]}| d  V  qdS r   rD   r   r   rD   rE   r}     r~   zC3f.forward.<locals>.<genexpr>r   )r\   rZ   r   r   r]   r9   rt   r`   rD   r   rE   rN     s    zC3f.forward)r   Fr   r   rO   rD   rD   rB   rE   r    s   r  c                       s"   e Zd ZdZd fdd	Z  ZS )	r(   r   r   Fr   Tc                    s>   t  |||| t fddt|D _dS )zaInitializes the C3k2 module, a faster CSP Bottleneck with 2 convolutions and optional C3k blocks.c                 3   s8   | ]0} rt jjd ntjjV  qdS )rG   N)C3kr   r   r   c3kr   r@   r   rD   rE   r}     s   z C3k2.__init__.<locals>.<genexpr>Nr4   r5   r6   r   r   r   )r@   r?   r_   r   r  r   r   r   rB   r  rE   r5     s    zC3k2.__init__)r   Fr   r   Tr   rD   rD   rB   rE   r(     s   r(   c                       s"   e Zd ZdZd fdd	Z  ZS )	r  zhC3k is a CSP bottleneck module with customizable kernel sizes for feature extraction in neural networks.r   Tr   rW   c                    sJ   t  |||| t||  tj fddt|D  | _dS )zYInitializes the C3k module with specified channels, number of layers, and configurations.c              	   3   s&   | ]}t   fd dV  qdS )r   r   Nr   r   r^   r   rY   r   rD   rE   r}     r~   zC3k.__init__.<locals>.<genexpr>Nr   )r@   r?   r_   r   r   r   r   rY   rB   r  rE   r5     s    zC3k.__init__)r   Tr   r   rW   r   rD   rD   rB   rE   r    s   r  c                       sF   e Zd ZdZdd fddZdd Zdd	 Ze d
d Z	  Z
S )r+   zfRepVGGDW is a class that represents a depth wise separable convolutional block in RepVGG architecture.Nreturnc              	      sN   t    t||ddd|dd| _t||ddd|dd| _|| _t | _dS )z\Initializes RepVGGDW with depthwise separable convolutional layers for efficient processing.r   r   rW   Fr   rb   N)	r4   r5   r   r8   conv1rq   r6   r   rb   )r@   ZedrB   rD   rE   r5     s
    
zRepVGGDW.__init__c                 C   s   |  | || | S )z
        Performs a forward pass of the RepVGGDW block.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after applying the depth wise separable convolution.
        )rb   r8   r  r`   rD   rD   rE   rN     s    
zRepVGGDW.forwardc                 C   s   |  | |S )a  
        Performs a forward pass of the RepVGGDW block without fusing the convolutions.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after applying the depth wise separable convolution.
        )rb   r8   r`   rD   rD   rE   forward_fuse   s    
zRepVGGDW.forward_fusec           	      C   s   t | jj| jj}t | jj| jj}|j}|j}|j}|j}tjj	|g d}|| }|| }|jj
| |jj
| || _| `dS )z
        Fuses the convolutional layers in the RepVGGDW block.

        This method fuses the convolutional layers and updates the weights and biases accordingly.
        )rG   rG   rG   rG   N)r   r8   r   r  r=   r3   r9   r6   r  rs   r>   Zcopy_)	r@   r8   r  Zconv_wZconv_bZconv1_wZconv1_bZfinal_conv_wZfinal_conv_brD   rD   rE   fuse  s    zRepVGGDW.fuse)rP   rQ   rR   rS   r5   rN   r  r9   Zno_gradr  rT   rD   rD   rB   rE   r+     s   r+   c                       s*   e Zd ZdZd	 fdd	Zdd Z  ZS )
r,   a  
    Conditional Identity Block (CIB) module.

    Args:
        c1 (int): Number of input channels.
        c2 (int): Number of output channels.
        shortcut (bool, optional): Whether to add a shortcut connection. Defaults to True.
        e (float, optional): Scaling factor for the hidden channels. Defaults to 0.5.
        lk (bool, optional): Whether to use RepVGGDW for the third convolutional layer. Defaults to False.
    Tr   Fc                    s   t    t|| }tt||d|dt|d| d|rFtd| ntd| d| dd| dtd| |dt||d|d| _|o||k| _dS )zXInitializes the custom model with optional shortcut, scaling factor, and RepVGGDW layer.rW   r   rG   r   N)	r4   r5   r   r6   r   r   r+   rZ   r   )r@   r?   r_   r   r   lkr^   rB   rD   rE   r5   3  s    
*zCIB.__init__c                 C   s   | j r|| | S | |S )z
        Forward pass of the CIB module.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor.
        )r   rZ   r`   rD   rD   rE   rN   A  s    
zCIB.forward)Tr   FrO   rD   rD   rB   rE   r,   '  s   r,   c                       s"   e Zd ZdZd fdd	Z  ZS )r-   aQ  
    C2fCIB class represents a convolutional block with C2f and CIB modules.

    Args:
        c1 (int): Number of input channels.
        c2 (int): Number of output channels.
        n (int, optional): Number of CIB modules to stack. Defaults to 1.
        shortcut (bool, optional): Whether to use shortcut connection. Defaults to False.
        lk (bool, optional): Whether to use local key connection. Defaults to False.
        g (int, optional): Number of groups for grouped convolution. Defaults to 1.
        e (float, optional): Expansion ratio for CIB modules. Defaults to 0.5.
    r   Fr   c                    s<   t  ||||| t fddt|D _dS )ziInitializes the module with specified parameters for channel, shortcut, local key, groups, and expansion.c                 3   s$   | ]}t jjd  dV  qdS )r   )r   r  N)r,   r   r   r  r@   r   rD   rE   r}   _  r~   z"C2fCIB.__init__.<locals>.<genexpr>Nr  )r@   r?   r_   r   r   r  r   r   rB   r  rE   r5   \  s    zC2fCIB.__init__)r   FFr   r   r   rD   rD   rB   rE   r-   N  s   r-   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	r.   a  
    Attention module that performs self-attention on the input tensor.

    Args:
        dim (int): The input tensor dimension.
        num_heads (int): The number of attention heads.
        attn_ratio (float): The ratio of the attention key dimension to the head dimension.

    Attributes:
        num_heads (int): The number of attention heads.
        head_dim (int): The dimension of each attention head.
        key_dim (int): The dimension of the attention key.
        scale (float): The scaling factor for the attention scores.
        qkv (Conv): Convolutional layer for computing the query, key, and value.
        proj (Conv): Convolutional layer for projecting the attended values.
        pe (Conv): Convolutional layer for positional encoding.
    r   r   c                    s   t    || _|| | _t| j| | _| jd | _| j| }||d  }t||ddd| _t||ddd| _	t||dd|dd| _
dS )	zhInitializes multi-head attention module with query, key, and value convolutions and positional encoding.g      rG   r   Fra   rW   r  N)r4   r5   	num_headshead_dimr   key_dimr   r   qkvr   pe)r@   rq   r  
attn_ratioZnh_kdr   rB   rD   rE   r5   u  s    


zAttention.__init__c              	   C   s   |j \}}}}|| }| |}||| j| jd | j |j| j| j| jgdd\}}	}
|dd|	 | j }|j	dd}|
|dd ||||| 
|
|||| }| |}|S )z
        Forward pass of the Attention module.

        Args:
            x (torch.Tensor): The input tensor.

        Returns:
            (torch.Tensor): The output tensor after self-attention.
        rG   rp   r   )rH   r!  r<   r  r   r  r   rI   r   rJ   r"  r   r   )r@   rA   BCHWNr!  r   rY   r   r   rD   rD   rE   rN     s    

2
zAttention.forward)r   r   rO   rD   rD   rB   rE   r.   b  s   r.   c                       s0   e Zd ZdZddd fddZd	d
 Z  ZS )PSABlockaK  
    PSABlock class implementing a Position-Sensitive Attention block for neural networks.

    This class encapsulates the functionality for applying multi-head attention and feed-forward neural network layers
    with optional shortcut connections.

    Attributes:
        attn (Attention): Multi-head attention module.
        ffn (nn.Sequential): Feed-forward neural network module.
        add (bool): Flag indicating whether to add shortcut connections.

    Methods:
        forward: Performs a forward pass through the PSABlock, applying attention and feed-forward layers.

    Examples:
        Create a PSABlock and perform a forward pass
        >>> psablock = PSABlock(c=128, attn_ratio=0.5, num_heads=4, shortcut=True)
        >>> input_tensor = torch.randn(1, 128, 32, 32)
        >>> output_tensor = psablock(input_tensor)
    r   rF   TNr  c              	      sN   t    t|||d| _tt||d dt|d |ddd| _|| _dS )z`Initializes the PSABlock with attention and feed-forward layers for enhanced feature extraction.r#  r  rG   r   Fra   N)	r4   r5   r.   r   r6   r   r   ffnr   )r@   r   r#  r  r   rB   rD   rE   r5     s    
*zPSABlock.__init__c                 C   s@   | j r|| | n| |}| j r2|| | n| |}|S )ziExecutes a forward pass through PSABlock, applying attention and feed-forward layers to the input tensor.)r   r   r,  r`   rD   rD   rE   rN     s    zPSABlock.forward)r   rF   TrO   rD   rD   rB   rE   r*    s   r*  c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )r/   a  
    PSA class for implementing Position-Sensitive Attention in neural networks.

    This class encapsulates the functionality for applying position-sensitive attention and feed-forward networks to
    input tensors, enhancing feature extraction and processing capabilities.

    Attributes:
        c (int): Number of hidden channels after applying the initial convolution.
        cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
        cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
        attn (Attention): Attention module for position-sensitive attention.
        ffn (nn.Sequential): Feed-forward network for further processing.

    Methods:
        forward: Applies position-sensitive attention and feed-forward network to the input tensor.

    Examples:
        Create a PSA module and apply it to an input tensor
        >>> psa = PSA(c1=128, c2=128, e=0.5)
        >>> input_tensor = torch.randn(1, 128, 64, 64)
        >>> output_tensor = psa.forward(input_tensor)
    r   c              	      s   t    ||ksJ t|| | _t|d| j dd| _td| j |d| _t| jd| jd d| _t	
t| j| jd dt| jd | jddd| _dS )	zeInitializes the PSA module with input/output channels and attention mechanism for feature extraction.rG   r   r   @   r+  Fra   N)r4   r5   r   r   r   rZ   r\   r.   r   r6   r   r,  )r@   r?   r_   r   rB   rD   rE   r5     s    
zPSA.__init__c                 C   sR   |  |j| j| jfdd\}}|| | }|| | }| t||fdS )zdExecutes forward pass in PSA module, applying attention and feed-forward layers to the input tensor.r   rp   )rZ   r   r   r   r,  r\   r9   rt   r   rD   rD   rE   rN     s     zPSA.forward)r   rO   rD   rD   rB   rE   r/     s   r/   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	r*   aL  
    C2PSA module with attention mechanism for enhanced feature extraction and processing.

    This module implements a convolutional block with attention mechanisms to enhance feature extraction and processing
    capabilities. It includes a series of PSABlock modules for self-attention and feed-forward operations.

    Attributes:
        c (int): Number of hidden channels.
        cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
        cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
        m (nn.Sequential): Sequential container of PSABlock modules for attention and feed-forward operations.

    Methods:
        forward: Performs a forward pass through the C2PSA module, applying attention and feed-forward operations.

    Notes:
        This module essentially is the same as PSA module, but refactored to allow stacking more PSABlock modules.

    Examples:
        >>> c2psa = C2PSA(c1=256, c2=256, n=3, e=0.5)
        >>> input_tensor = torch.randn(1, 256, 64, 64)
        >>> output_tensor = c2psa(input_tensor)
    r   r   c                    sp   t    ||ksJ t||  _t|d j dd _td j |d _tj fddt	|D   _
dS )ziInitializes the C2PSA module with specified input/output channels, number of layers, and expansion ratio.rG   r   c                 3   s$   | ]}t  jd  jd dV  qdS r   r-  r+  Nr*  r   r   r   rD   rE   r}     r~   z!C2PSA.__init__.<locals>.<genexpr>Nr   r   rB   r   rE   r5     s    
zC2PSA.__init__c                 C   s@   |  |j| j| jfdd\}}| |}| t||fdS )zaProcesses the input tensor 'x' through a series of PSA blocks and returns the transformed tensor.r   rp   )rZ   r   r   r   r\   r9   rt   r   rD   rD   rE   rN     s     
zC2PSA.forward)r   r   rO   rD   rD   rB   rE   r*     s   
r*   c                       s"   e Zd ZdZd fdd	Z  ZS )r)   a  
    C2fPSA module with enhanced feature extraction using PSA blocks.

    This class extends the C2f module by incorporating PSA blocks for improved attention mechanisms and feature extraction.

    Attributes:
        c (int): Number of hidden channels.
        cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
        cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
        m (nn.ModuleList): List of PSA blocks for feature extraction.

    Methods:
        forward: Performs a forward pass through the C2fPSA module.
        forward_split: Performs a forward pass using split() instead of chunk().

    Examples:
        >>> import torch
        >>> from ultralytics.models.common import C2fPSA
        >>> model = C2fPSA(c1=64, c2=64, n=3, e=0.5)
        >>> x = torch.randn(1, 64, 128, 128)
        >>> output = model(x)
        >>> print(output.shape)
    r   r   c                    sB   ||ksJ t  j||||d t fddt|D  _dS )z`Initializes the C2fPSA module, a variant of C2f with PSA blocks for enhanced feature extraction.)r   r   c                 3   s$   | ]}t  jd  jd dV  qdS r.  r/  r   r   rD   rE   r}   1  r~   z"C2fPSA.__init__.<locals>.<genexpr>Nr  r   rB   r   rE   r5   -  s    zC2fPSA.__init__)r   r   r   rD   rD   rB   rE   r)     s   r)   c                       s(   e Zd ZdZ fddZdd Z  ZS )r0   a<  
    SCDown module for downsampling with separable convolutions.

    This module performs downsampling using a combination of pointwise and depthwise convolutions, which helps in
    efficiently reducing the spatial dimensions of the input tensor while maintaining the channel information.

    Attributes:
        cv1 (Conv): Pointwise convolution layer that reduces the number of channels.
        cv2 (Conv): Depthwise convolution layer that performs spatial downsampling.

    Methods:
        forward: Applies the SCDown module to the input tensor.

    Examples:
        >>> import torch
        >>> from ultralytics import SCDown
        >>> model = SCDown(c1=64, c2=128, k=3, s=2)
        >>> x = torch.randn(1, 64, 128, 128)
        >>> y = model(x)
        >>> print(y.shape)
        torch.Size([1, 128, 64, 64])
    c                    s4   t    t||dd| _t|||||dd| _dS )z\Initializes the SCDown module with specified input/output channels, kernel size, and stride.r   F)rY   r   r   rb   N)r4   r5   r   rZ   r\   )r@   r?   r_   rY   r   rB   rD   rE   r5   L  s    
zSCDown.__init__c                 C   s   |  | |S )zNApplies convolution and downsampling to the input tensor in the SCDown module.)r\   rZ   r`   rD   rD   rE   rN   R  s    zSCDown.forwardrO   rD   rD   rB   rE   r0   4  s   r0   )ArS   r9   Ztorch.nnr6   Ztorch.nn.functionalr  rr   Zultralytics.utils.torch_utilsr   r8   r   r   r   r   r   r	   Ztransformerr
   __all__Moduler   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r   r   r   r   r   r   r   r!   r"   r$   r#   r%   r'   r&   r  r(   r  r+   r,   r-   r.   r*  r/   r*   r)   r0   rD   rD   rD   rE   <module>   sh    *


#.

>'8%+* 