from typing import List, Optional, Tuple, Type

import torch
from torch import nn

from ultralytics.nn.modules import MLP, LayerNorm2d


class MaskDecoder(nn.Module):
    """
    Decoder module for generating masks and their associated quality scores using a transformer architecture.

    This class predicts masks given image and prompt embeddings, utilizing a transformer to process the inputs and
    generate mask predictions along with their quality scores.

    Attributes:
        transformer_dim (int): Channel dimension for the transformer module.
        transformer (nn.Module): Transformer module used for mask prediction.
        num_multimask_outputs (int): Number of masks to predict for disambiguating masks.
        iou_token (nn.Embedding): Embedding for the IoU token.
        num_mask_tokens (int): Number of mask tokens.
        mask_tokens (nn.Embedding): Embedding for the mask tokens.
        output_upscaling (nn.Sequential): Neural network sequence for upscaling the output.
        output_hypernetworks_mlps (nn.ModuleList): Hypernetwork MLPs for generating masks.
        iou_prediction_head (nn.Module): MLP for predicting mask quality.

    Methods:
        forward: Predicts masks given image and prompt embeddings.
        predict_masks: Internal method for mask prediction.

    Examples:
        >>> decoder = MaskDecoder(transformer_dim=256, transformer=transformer_module)
        >>> masks, iou_pred = decoder(
        ...     image_embeddings, image_pe, sparse_prompt_embeddings, dense_prompt_embeddings, multimask_output=True
        ... )
        >>> print(f"Predicted masks shape: {masks.shape}, IoU predictions shape: {iou_pred.shape}")
          N)transformer_dimtransformernum_multimask_outputs
activationiou_head_depthiou_head_hidden_dimreturnc                    s   t     | _|| _|| _td | _|d | _t| j | _	t
tj  d dddt d | tj d  d ddd| | _t fddt| jD | _t || j|| _dS )	ay  
        Initializes the MaskDecoder module for generating masks and their quality scores.

        Args:
            transformer_dim (int): Channel dimension for the transformer module.
            transformer (nn.Module): Transformer module used for mask prediction.
            num_multimask_outputs (int): Number of masks to predict for disambiguating masks.
            activation (Type[nn.Module]): Type of activation to use when upscaling masks.
            iou_head_depth (int): Depth of the MLP used to predict mask quality.
            iou_head_hidden_dim (int): Hidden dimension of the MLP used to predict mask quality.

        Examples:
            >>> transformer = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=256, nhead=8), num_layers=6)
            >>> decoder = MaskDecoder(transformer_dim=256, transformer=transformer)
            >>> print(decoder)
        """
        super().__init__()
        self.transformer_dim = transformer_dim
        self.transformer = transformer

        self.num_multimask_outputs = num_multimask_outputs

        self.iou_token = nn.Embedding(1, transformer_dim)
        self.num_mask_tokens = num_multimask_outputs + 1
        self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim)

        self.output_upscaling = nn.Sequential(
            nn.ConvTranspose2d(transformer_dim, transformer_dim // 4, kernel_size=2, stride=2),
            LayerNorm2d(transformer_dim // 4),
            activation(),
            nn.ConvTranspose2d(transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2),
            activation(),
        )
        self.output_hypernetworks_mlps = nn.ModuleList(
            [MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3) for _ in range(self.num_mask_tokens)]
        )

        self.iou_prediction_head = MLP(transformer_dim, iou_head_hidden_dim, self.num_mask_tokens, iou_head_depth)

    def forward(
        self,
        image_embeddings: torch.Tensor,
        image_pe: torch.Tensor,
        sparse_prompt_embeddings: torch.Tensor,
        dense_prompt_embeddings: torch.Tensor,
        multimask_output: bool,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Predicts masks given image and prompt embeddings.

        Args:
            image_embeddings (torch.Tensor): Embeddings from the image encoder.
            image_pe (torch.Tensor): Positional encoding with the shape of image_embeddings.
            sparse_prompt_embeddings (torch.Tensor): Embeddings of the points and boxes.
            dense_prompt_embeddings (torch.Tensor): Embeddings of the mask inputs.
            multimask_output (bool): Whether to return multiple masks or a single mask.

        Returns:
            (Tuple[torch.Tensor, torch.Tensor]): A tuple containing:
                - masks (torch.Tensor): Batched predicted masks.
                - iou_pred (torch.Tensor): Batched predictions of mask quality.

        Examples:
            >>> decoder = MaskDecoder(transformer_dim=256, transformer=transformer_module)
            >>> image_emb = torch.rand(1, 256, 64, 64)
            >>> image_pe = torch.rand(1, 256, 64, 64)
            >>> sparse_emb = torch.rand(1, 2, 256)
            >>> dense_emb = torch.rand(1, 256, 64, 64)
            >>> masks, iou_pred = decoder(image_emb, image_pe, sparse_emb, dense_emb, multimask_output=True)
            >>> print(f"Masks shape: {masks.shape}, IoU predictions shape: {iou_pred.shape}")
        )r2   r3   r4   r5   r   Nr   )predict_masksslice)	r/   r2   r3   r4   r5   r6   masksiou_predZ
mask_slicer   r   r   forwardY   s     
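
    # NOTE: mask token 0 is the single-mask output, while tokens
    # 1..num_multimask_outputs are the alternative outputs returned when
    # multimask_output=True, used to disambiguate ambiguous prompts such as a
    # single click.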
zMaskDecoder.forward)r2   r3   r4   r5   r   c                    s\  t jjjjjgdd}|d|jd dd}t j||fdd}t j||jd dd}|| }t j||jd dd}|j\}	}
}}	|||\}}|dddddf }|ddddj
 ddf  |dd|	|
||}|} fddtj
D }t j|dd}|j\}	}
}}|||	|
||  |	d||}|}||fS )	zaPredicts masks and quality scores using image and prompt embeddings via transformer architecture.r   dimr   Nr   c                    s,   g | ]$}j |  d d |d d f qS Nr-   r   imask_tokens_outr/   r   r   r       s   z-MaskDecoder.predict_masks.<locals>.<listcomp>)torchcatr%   weightr'   	unsqueezeexpandshaperepeat_interleaver   r&   	transposeviewr*   r,   stackr.   )r/   r2   r3   r4   r5   output_tokenstokenssrcpos_srcbchwhsiou_token_outupscaled_embeddinghyper_in_listhyper_inr9   r:   r   rC   r   r7      s(    	 
"
zMaskDecoder.predict_masks)__name__


class SAM2MaskDecoder(nn.Module):
    """
    Transformer-based decoder for predicting instance segmentation masks from image and prompt embeddings.

    This class extends the functionality of the MaskDecoder, incorporating additional features such as
    high-resolution feature processing, dynamic multimask output, and object score prediction.

    Attributes:
        transformer_dim (int): Channel dimension of the transformer.
        transformer (nn.Module): Transformer used to predict masks.
        num_multimask_outputs (int): Number of masks to predict when disambiguating masks.
        iou_token (nn.Embedding): Embedding for the IoU token.
        num_mask_tokens (int): Total number of mask tokens.
        mask_tokens (nn.Embedding): Embedding for mask tokens.
        pred_obj_scores (bool): Whether to predict object scores.
        obj_score_token (nn.Embedding): Embedding for object score token.
        use_multimask_token_for_obj_ptr (bool): Whether to use multimask token for object pointer.
        output_upscaling (nn.Sequential): Upscaling layers for output.
        use_high_res_features (bool): Whether to use high-resolution features.
        conv_s0 (nn.Conv2d): Convolutional layer for high-resolution features (s0).
        conv_s1 (nn.Conv2d): Convolutional layer for high-resolution features (s1).
        output_hypernetworks_mlps (nn.ModuleList): List of MLPs for output hypernetworks.
        iou_prediction_head (MLP): MLP for IoU prediction.
        pred_obj_score_head (nn.Linear | MLP): Linear layer or MLP for object score prediction.
        dynamic_multimask_via_stability (bool): Whether to use dynamic multimask via stability.
        dynamic_multimask_stability_delta (float): Delta value for dynamic multimask stability.
        dynamic_multimask_stability_thresh (float): Threshold for dynamic multimask stability.

    Methods:
        forward: Predicts masks given image and prompt embeddings.
        predict_masks: Predicts instance segmentation masks from image and prompt embeddings.
        _get_stability_scores: Computes mask stability scores based on IoU between thresholds.
        _dynamic_multimask_via_stability: Dynamically selects the most stable mask output.

    Examples:
        >>> image_embeddings = torch.rand(1, 256, 64, 64)
        >>> image_pe = torch.rand(1, 256, 64, 64)
        >>> sparse_prompt_embeddings = torch.rand(1, 2, 256)
        >>> dense_prompt_embeddings = torch.rand(1, 256, 64, 64)
        >>> decoder = SAM2MaskDecoder(256, transformer)
        >>> masks, iou_pred, sam_tokens_out, obj_score_logits = decoder.forward(
        ...     image_embeddings, image_pe, sparse_prompt_embeddings, dense_prompt_embeddings, True, False
        ... )
    """

    def __init__(
        self,
        transformer_dim: int,
        transformer: nn.Module,
        num_multimask_outputs: int = 3,
        activation: Type[nn.Module] = nn.GELU,
        iou_head_depth: int = 3,
        iou_head_hidden_dim: int = 256,
        use_high_res_features: bool = False,
        iou_prediction_use_sigmoid: bool = False,
        dynamic_multimask_via_stability: bool = False,
        dynamic_multimask_stability_delta: float = 0.05,
        dynamic_multimask_stability_thresh: float = 0.98,
        pred_obj_scores: bool = False,
        pred_obj_scores_mlp: bool = False,
        use_multimask_token_for_obj_ptr: bool = False,
    ) -> None:
        """
        Initializes the SAM2MaskDecoder module for predicting instance segmentation masks.

        This decoder extends the functionality of MaskDecoder, incorporating additional features such as
        high-resolution feature processing, dynamic multimask output, and object score prediction.

        Args:
            transformer_dim (int): Channel dimension of the transformer.
            transformer (nn.Module): Transformer used to predict masks.
            num_multimask_outputs (int): Number of masks to predict when disambiguating masks.
            activation (Type[nn.Module]): Type of activation to use when upscaling masks.
            iou_head_depth (int): Depth of the MLP used to predict mask quality.
            iou_head_hidden_dim (int): Hidden dimension of the MLP used to predict mask quality.
            use_high_res_features (bool): Whether to use high-resolution features.
            iou_prediction_use_sigmoid (bool): Whether to use sigmoid for IOU prediction.
            dynamic_multimask_via_stability (bool): Whether to use dynamic multimask via stability.
            dynamic_multimask_stability_delta (float): Delta value for dynamic multimask stability.
            dynamic_multimask_stability_thresh (float): Threshold for dynamic multimask stability.
            pred_obj_scores (bool): Whether to predict object scores.
            pred_obj_scores_mlp (bool): Whether to use MLP for object score prediction.
            use_multimask_token_for_obj_ptr (bool): Whether to use multimask token for object pointer.

        Examples:
            >>> transformer = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=256, nhead=8), num_layers=6)
            >>> decoder = SAM2MaskDecoder(transformer_dim=256, transformer=transformer)
            >>> print(decoder)
        """
        super().__init__()
        self.transformer_dim = transformer_dim
        self.transformer = transformer
        self.num_multimask_outputs = num_multimask_outputs

        self.iou_token = nn.Embedding(1, transformer_dim)
        self.num_mask_tokens = num_multimask_outputs + 1
        self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim)

        self.pred_obj_scores = pred_obj_scores
        if self.pred_obj_scores:
            self.obj_score_token = nn.Embedding(1, transformer_dim)
        self.use_multimask_token_for_obj_ptr = use_multimask_token_for_obj_ptr

        self.output_upscaling = nn.Sequential(
            nn.ConvTranspose2d(transformer_dim, transformer_dim // 4, kernel_size=2, stride=2),
            LayerNorm2d(transformer_dim // 4),
            activation(),
            nn.ConvTranspose2d(transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2),
            activation(),
        )
        self.use_high_res_features = use_high_res_features
        if use_high_res_features:
            self.conv_s0 = nn.Conv2d(transformer_dim, transformer_dim // 8, kernel_size=1, stride=1)
            self.conv_s1 = nn.Conv2d(transformer_dim, transformer_dim // 4, kernel_size=1, stride=1)

        self.output_hypernetworks_mlps = nn.ModuleList(
            [MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3) for _ in range(self.num_mask_tokens)]
        )

        self.iou_prediction_head = MLP(
            transformer_dim,
            iou_head_hidden_dim,
            self.num_mask_tokens,
            iou_head_depth,
            sigmoid=iou_prediction_use_sigmoid,
        )
        if self.pred_obj_scores:
            self.pred_obj_score_head = nn.Linear(transformer_dim, 1)
            if pred_obj_scores_mlp:
                self.pred_obj_score_head = MLP(transformer_dim, transformer_dim, 1, 3)

        # When outputting a single mask, we can dynamically fall back to the best
        # multimask output token if the single-mask output has a low stability score
        self.dynamic_multimask_via_stability = dynamic_multimask_via_stability
        self.dynamic_multimask_stability_delta = dynamic_multimask_stability_delta
        self.dynamic_multimask_stability_thresh = dynamic_multimask_stability_thresh

    def forward(
        self,
        image_embeddings: torch.Tensor,
        image_pe: torch.Tensor,
        sparse_prompt_embeddings: torch.Tensor,
        dense_prompt_embeddings: torch.Tensor,
        multimask_output: bool,
        repeat_image: bool,
        high_res_features: Optional[List[torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Predicts masks given image and prompt embeddings.

        Args:
            image_embeddings (torch.Tensor): Embeddings from the image encoder with shape (B, C, H, W).
            image_pe (torch.Tensor): Positional encoding with the shape of image_embeddings (B, C, H, W).
            sparse_prompt_embeddings (torch.Tensor): Embeddings of the points and boxes with shape (B, N, C).
            dense_prompt_embeddings (torch.Tensor): Embeddings of the mask inputs with shape (B, C, H, W).
            multimask_output (bool): Whether to return multiple masks or a single mask.
            repeat_image (bool): Flag to repeat the image embeddings.
            high_res_features (List[torch.Tensor] | None): Optional high-resolution features.

        Returns:
            (Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]): A tuple containing:
                - masks (torch.Tensor): Batched predicted masks with shape (B, N, H, W).
                - iou_pred (torch.Tensor): Batched predictions of mask quality with shape (B, N).
                - sam_tokens_out (torch.Tensor): Batched SAM token for mask output with shape (B, N, C).
                - object_score_logits (torch.Tensor): Batched object score logits with shape (B, 1).

        Examples:
            >>> image_embeddings = torch.rand(1, 256, 64, 64)
            >>> image_pe = torch.rand(1, 256, 64, 64)
            >>> sparse_prompt_embeddings = torch.rand(1, 2, 256)
            >>> dense_prompt_embeddings = torch.rand(1, 256, 64, 64)
            >>> decoder = SAM2MaskDecoder(256, transformer)
            >>> masks, iou_pred, sam_tokens_out, obj_score_logits = decoder.forward(
            ...     image_embeddings, image_pe, sparse_prompt_embeddings, dense_prompt_embeddings, True, False
            ... )
        """
        masks, iou_pred, mask_tokens_out, object_score_logits = self.predict_masks(
            image_embeddings=image_embeddings,
            image_pe=image_pe,
            sparse_prompt_embeddings=sparse_prompt_embeddings,
            dense_prompt_embeddings=dense_prompt_embeddings,
            repeat_image=repeat_image,
            high_res_features=high_res_features,
        )

        # Select the correct mask or masks for output
        if multimask_output:
            masks = masks[:, 1:, :, :]
            iou_pred = iou_pred[:, 1:]
        elif self.dynamic_multimask_via_stability and not self.training:
            masks, iou_pred = self._dynamic_multimask_via_stability(masks, iou_pred)
        else:
            masks = masks[:, 0:1, :, :]
            iou_pred = iou_pred[:, 0:1]

        if multimask_output and self.use_multimask_token_for_obj_ptr:
            sam_tokens_out = mask_tokens_out[:, 1:]  # [b, 3, c] shape
        else:
            # Take the mask output token. Here we *always* use the token for single mask output.
            sam_tokens_out = mask_tokens_out[:, 0:1]  # [b, 1, c] shape

        return masks, iou_pred, sam_tokens_out, object_score_logits
||	jd dd}
n|jd |	jd ksJ |}
|
| }
|	ddksJ dtj
||	jd dd}|
j\}}}}|
||	\}}
|dd|ddf }|dd|d |d j ddf  |
dd||||}
jsl|
}n<j\}}}}}|\}}||||
| }|||| } fdd	tjD }tj|dd}|j\}}}}||||||  |d||}|}j r>|dks J |dddddf }nd
||jd d }|| |fS )zZPredicts instance segmentation masks from image and prompt embeddings using a transformer.r   r<   r   r>   z@image_pe should have size 1 in batch dim (from `get_dense_pe()`)Nr   c                    s,   g | ]$}j |  d d |d d f qS r?   r@   rA   rC   r   r   r      s   z1SAM2MaskDecoder.predict_masks.<locals>.<listcomp>g      $@)rh   rE   rF   rk   rG   r%   r'   rH   rI   sizerK   rJ   r   r&   rL   rM   rg   r*   r,   rN   r.   rl   Znew_ones)r/   r2   r3   r4   r5   rp   rq   srO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   Zdc1Zln1Zact1Zdc2Zact2Zfeat_s0Zfeat_s1rZ   r[   r9   r:   rs   r   rC   r   r7     sV    ("
zSAM2MaskDecoder.predict_masksc                 C   sT   | d}| j}tj||kdd }tj|| kdd }t|dk|| dS )zOComputes mask stability scores based on IoU between upper and lower thresholds.r>   r<   r   g      ?)flattenrn   rE   sumfloatwhere)r/   Zmask_logitsZstability_deltaZarea_iZarea_ur   r   r   _get_stability_scores  s
    
z%SAM2MaskDecoder._get_stability_scoresc                 C   s   |ddddddddf }|ddddf }t j|dd}t j|d|jd}|||f }|d}|||f }|d}|ddddddddf }	|ddddf }
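
    # Worked example for the stability score above: with delta = 0.05, if
    # 9,000 pixel logits exceed +0.05 (area_i) while 10,000 exceed -0.05
    # (area_u), the score is 9000 / 10000 = 0.9. Against the default threshold
    # of 0.98 this mask counts as unstable, so the method below would fall
    # back to the best multimask output.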

    def _dynamic_multimask_via_stability(self, all_mask_logits, all_iou_scores):
        """
        Dynamically selects the most stable mask output based on stability scores and IoU predictions.

        This method is used when outputting a single mask. If the stability score from the current single-mask
        output (based on output token 0) falls below a threshold, it instead selects from multi-mask outputs
        (based on output tokens 1-3) the mask with the highest predicted IoU score. This ensures a valid mask
        for both clicking and tracking scenarios.

        Args:
            all_mask_logits (torch.Tensor): Logits for all predicted masks, shape (B, N, H, W) where B is
                batch size, N is number of masks (typically 4), and H, W are mask dimensions.
            all_iou_scores (torch.Tensor): Predicted IoU scores for all masks, shape (B, N).

        Returns:
            (Tuple[torch.Tensor, torch.Tensor]):
                - mask_logits_out (torch.Tensor): Selected mask logits, shape (B, 1, H, W).
                - iou_scores_out (torch.Tensor): Selected IoU scores, shape (B, 1).

        Examples:
            >>> decoder = SAM2MaskDecoder(...)
            >>> all_mask_logits = torch.rand(2, 4, 256, 256)  # 2 images, 4 masks each
            >>> all_iou_scores = torch.rand(2, 4)
            >>> mask_logits, iou_scores = decoder._dynamic_multimask_via_stability(all_mask_logits, all_iou_scores)
            >>> print(mask_logits.shape, iou_scores.shape)
            torch.Size([2, 1, 256, 256]) torch.Size([2, 1])
        """
        # The best mask from multimask output tokens (1~3)
        multimask_logits = all_mask_logits[:, 1:, :, :]
        multimask_iou_scores = all_iou_scores[:, 1:]
        best_scores_inds = torch.argmax(multimask_iou_scores, dim=-1)
        batch_inds = torch.arange(multimask_iou_scores.size(0), device=all_iou_scores.device)
        best_multimask_logits = multimask_logits[batch_inds, best_scores_inds]
        best_multimask_logits = best_multimask_logits.unsqueeze(1)
        best_multimask_iou_scores = multimask_iou_scores[batch_inds, best_scores_inds]
        best_multimask_iou_scores = best_multimask_iou_scores.unsqueeze(1)

        # The mask from singlemask output token 0 and its stability score
        singlemask_logits = all_mask_logits[:, 0:1, :, :]
        singlemask_iou_scores = all_iou_scores[:, 0:1]
        stability_scores = self._get_stability_scores(singlemask_logits)
        is_stable = stability_scores >= self.dynamic_multimask_stability_thresh

        # Dynamically fall back to the best multimask output upon low stability scores
        mask_logits_out = torch.where(
            is_stable[..., None, None].expand_as(singlemask_logits),
            singlemask_logits,
            best_multimask_logits,
        )
        iou_scores_out = torch.where(
            is_stable.expand_as(singlemask_iou_scores),
            singlemask_iou_scores,
            best_multimask_iou_scores,
        )
        return mask_logits_out, iou_scores_out