from typing import List

import torch
import torch.nn.functional as F
from torch import nn
from torch.nn.init import trunc_normal_

from ultralytics.nn.modules import MLP

from .blocks import SAM2TwoWayTransformer
from .decoders import MaskDecoder, SAM2MaskDecoder
from .encoders import ImageEncoderViT, PromptEncoder
from .utils import get_1d_sine_pe, select_closest_cond_frames

# Placeholder score used in place of mask logits on frames where no object is predicted.
NO_OBJ_SCORE = -1024.0


class SAMModel(nn.Module):
    """
    Segment Anything Model (SAM) for object segmentation tasks.

    This class combines image encoders, prompt encoders, and mask decoders to predict object masks from images
    and input prompts.

    Attributes:
        mask_threshold (float): Threshold value for mask prediction.
        image_encoder (ImageEncoderViT): Backbone for encoding images into embeddings.
        prompt_encoder (PromptEncoder): Encoder for various types of input prompts.
        mask_decoder (MaskDecoder): Predicts object masks from image and prompt embeddings.
        pixel_mean (torch.Tensor): Mean pixel values for image normalization, shape (3, 1, 1).
        pixel_std (torch.Tensor): Standard deviation values for image normalization, shape (3, 1, 1).

    Methods:
        __init__: Initializes the SAMModel with encoders, decoder, and normalization parameters.

    Examples:
        >>> image_encoder = ImageEncoderViT(...)
        >>> prompt_encoder = PromptEncoder(...)
        >>> mask_decoder = MaskDecoder(...)
        >>> sam_model = SAMModel(image_encoder, prompt_encoder, mask_decoder)
        >>> # Further usage depends on SAMPredictor class

    Notes:
        All forward() operations are implemented in the SAMPredictor class.
    """

    mask_threshold: float = 0.0

    def __init__(
        self,
        image_encoder: ImageEncoderViT,
        prompt_encoder: PromptEncoder,
        mask_decoder: MaskDecoder,
        pixel_mean: List[float] = (123.675, 116.28, 103.53),
        pixel_std: List[float] = (58.395, 57.12, 57.375),
    ) -> None:
        """
        Initialize the SAMModel class to predict object masks from an image and input prompts.

        Args:
            image_encoder (ImageEncoderViT): The backbone used to encode the image into image embeddings.
            prompt_encoder (PromptEncoder): Encodes various types of input prompts.
            mask_decoder (MaskDecoder): Predicts masks from the image embeddings and encoded prompts.
            pixel_mean (List[float]): Mean values for normalizing pixels in the input image.
            pixel_std (List[float]): Std values for normalizing pixels in the input image.

        Examples:
            >>> image_encoder = ImageEncoderViT(...)
            >>> prompt_encoder = PromptEncoder(...)
            >>> mask_decoder = MaskDecoder(...)
            >>> sam_model = SAMModel(image_encoder, prompt_encoder, mask_decoder)
            >>> # Further usage depends on SAMPredictor class

        Notes:
            All forward() operations moved to SAMPredictor.
        """
        super().__init__()
        self.image_encoder = image_encoder
        self.prompt_encoder = prompt_encoder
        self.mask_decoder = mask_decoder
        self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)

    def set_imgsz(self, imgsz):
        """
        Set image size to make model compatible with different image sizes.

        Args:
            imgsz (Tuple[int, int]): The size of the input image.
        """
        if hasattr(self.image_encoder, "set_imgsz"):
            self.image_encoder.set_imgsz(imgsz)
        self.prompt_encoder.input_image_size = imgsz
        self.prompt_encoder.image_embedding_size = [x // 16 for x in imgsz]  # 16 is the ViT patch size
        self.image_encoder.img_size = imgsz[0]


class SAM2Model(torch.nn.Module):
    """
    SAM2Model class for Segment Anything Model 2 with memory-based video object segmentation capabilities.

    This class extends the functionality of SAM to handle video sequences, incorporating memory mechanisms
    for temporal consistency and efficient tracking of objects across frames.

    Attributes:
        mask_threshold (float): Threshold value for mask prediction.
        image_encoder (ImageEncoderViT): Visual encoder for extracting image features.
        memory_attention (nn.Module): Module for attending to memory features.
        memory_encoder (nn.Module): Encoder for generating memory representations.
        num_maskmem (int): Number of accessible memory frames.
        image_size (int): Size of input images.
        backbone_stride (int): Stride of the backbone network output.
        sam_prompt_embed_dim (int): Dimension of SAM prompt embeddings.
        sam_image_embedding_size (int): Size of SAM image embeddings.
        sam_prompt_encoder (PromptEncoder): Encoder for processing input prompts.
        sam_mask_decoder (SAM2MaskDecoder): Decoder for generating object masks.
        obj_ptr_proj (nn.Module): Projection layer for object pointers.
        obj_ptr_tpos_proj (nn.Module): Projection for temporal positional encoding in object pointers.

    Methods:
        forward_image: Processes image batch through encoder to extract multi-level features.
        track_step: Performs a single tracking step, updating object masks and memory features.

    Examples:
        >>> model = SAM2Model(image_encoder, memory_attention, memory_encoder)
        >>> image_batch = torch.rand(1, 3, 512, 512)
        >>> features = model.forward_image(image_batch)
        >>> track_results = model.track_step(0, True, features, None, None, None, {})
    """

    mask_threshold: float = 0.0

    def __init__(
        self,
        image_encoder,
        memory_attention,
        memory_encoder,
        num_maskmem=7,
        image_size=512,
        backbone_stride=16,
        sigmoid_scale_for_mem_enc=1.0,
        sigmoid_bias_for_mem_enc=0.0,
        binarize_mask_from_pts_for_mem_enc=False,
        use_mask_input_as_output_without_sam=False,
        max_cond_frames_in_attn=-1,
        directly_add_no_mem_embed=False,
        use_high_res_features_in_sam=False,
        multimask_output_in_sam=False,
        multimask_min_pt_num=1,
        multimask_max_pt_num=1,
        multimask_output_for_tracking=False,
        use_multimask_token_for_obj_ptr: bool = False,
        iou_prediction_use_sigmoid=False,
        memory_temporal_stride_for_eval=1,
        add_all_frames_to_correct_as_cond=False,
        non_overlap_masks_for_mem_enc=False,
        use_obj_ptrs_in_encoder=False,
        max_obj_ptrs_in_encoder=16,
        add_tpos_enc_to_obj_ptrs=True,
        proj_tpos_enc_in_obj_ptrs=False,
        only_obj_ptrs_in_the_past_for_eval=False,
        pred_obj_scores: bool = False,
        pred_obj_scores_mlp: bool = False,
        fixed_no_obj_ptr: bool = False,
        soft_no_obj_ptr: bool = False,
        use_mlp_for_obj_ptr_proj: bool = False,
        sam_mask_decoder_extra_args=None,
        compile_image_encoder: bool = False,
    ):
        """
        Initializes the SAM2Model for video object segmentation with memory-based tracking.

        Args:
            image_encoder (nn.Module): Visual encoder for extracting image features.
            memory_attention (nn.Module): Module for attending to memory features.
            memory_encoder (nn.Module): Encoder for generating memory representations.
            num_maskmem (int): Number of accessible memory frames. Default is 7 (1 input frame + 6 previous frames).
            image_size (int): Size of input images.
            backbone_stride (int): Stride of the image backbone output.
            sigmoid_scale_for_mem_enc (float): Scale factor for mask sigmoid probability.
            sigmoid_bias_for_mem_enc (float): Bias factor for mask sigmoid probability.
            binarize_mask_from_pts_for_mem_enc (bool): Whether to binarize sigmoid mask logits on interacted frames
                with clicks during evaluation.
            use_mask_input_as_output_without_sam (bool): Whether to directly output the input mask without using SAM
                prompt encoder and mask decoder on frames with mask input.
            max_cond_frames_in_attn (int): Maximum number of conditioning frames to participate in memory attention.
                -1 means no limit.
            directly_add_no_mem_embed (bool): Whether to directly add no-memory embedding to image feature on the
                first frame.
            use_high_res_features_in_sam (bool): Whether to use high-resolution feature maps in the SAM mask decoder.
            multimask_output_in_sam (bool): Whether to output multiple (3) masks for the first click on initial
                conditioning frames.
            multimask_min_pt_num (int): Minimum number of clicks to use multimask output in SAM.
            multimask_max_pt_num (int): Maximum number of clicks to use multimask output in SAM.
            multimask_output_for_tracking (bool): Whether to use multimask output for tracking.
            use_multimask_token_for_obj_ptr (bool): Whether to use multimask tokens for object pointers.
            iou_prediction_use_sigmoid (bool): Whether to use sigmoid to restrict IoU prediction to [0-1].
            memory_temporal_stride_for_eval (int): Memory bank's temporal stride during evaluation.
            add_all_frames_to_correct_as_cond (bool): Whether to append frames with correction clicks to conditioning
                frame list.
            non_overlap_masks_for_mem_enc (bool): Whether to apply non-overlapping constraints on object masks in
                memory encoder during evaluation.
            use_obj_ptrs_in_encoder (bool): Whether to cross-attend to object pointers from other frames in the encoder.
            max_obj_ptrs_in_encoder (int): Maximum number of object pointers from other frames in encoder
                cross-attention.
            add_tpos_enc_to_obj_ptrs (bool): Whether to add temporal positional encoding to object pointers in
                the encoder.
            proj_tpos_enc_in_obj_ptrs (bool): Whether to add an extra linear projection layer for temporal positional
                encoding in object pointers.
            only_obj_ptrs_in_the_past_for_eval (bool): Whether to only attend to object pointers in the past
                during evaluation.
            pred_obj_scores (bool): Whether to predict if there is an object in the frame.
            pred_obj_scores_mlp (bool): Whether to use an MLP to predict object scores.
            fixed_no_obj_ptr (bool): Whether to have a fixed no-object pointer when there is no object present.
            soft_no_obj_ptr (bool): Whether to mix in no-object pointer softly for easier recovery and error mitigation.
            use_mlp_for_obj_ptr_proj (bool): Whether to use MLP for object pointer projection.
            sam_mask_decoder_extra_args (Dict | None): Extra arguments for constructing the SAM mask decoder.
            compile_image_encoder (bool): Whether to compile the image encoder for faster inference.

        Examples:
            >>> image_encoder = ImageEncoderViT(...)
            >>> memory_attention = SAM2TwoWayTransformer(...)
            >>> memory_encoder = nn.Sequential(...)
            >>> model = SAM2Model(image_encoder, memory_attention, memory_encoder)
            >>> image_batch = torch.rand(1, 3, 512, 512)
            >>> features = model.forward_image(image_batch)
            >>> track_results = model.track_step(0, True, features, None, None, None, {})
        """
        super().__init__()

        # Part 1: the image backbone
        self.image_encoder = image_encoder
        # Use level 0, 1, 2 for the high-res setting, or just level 2 for the default setting
        self.use_high_res_features_in_sam = use_high_res_features_in_sam
        self.num_feature_levels = 3 if use_high_res_features_in_sam else 1
        self.use_obj_ptrs_in_encoder = use_obj_ptrs_in_encoder
        self.max_obj_ptrs_in_encoder = max_obj_ptrs_in_encoder
        if use_obj_ptrs_in_encoder:
            # A conv layer to downsample the mask prompt to stride 4 (the same stride as
            # low-res SAM mask logits) so that it can be fed into the SAM mask decoder
            # to generate a pointer.
            self.mask_downsample = torch.nn.Conv2d(1, 1, kernel_size=4, stride=4)
        self.add_tpos_enc_to_obj_ptrs = add_tpos_enc_to_obj_ptrs
        if proj_tpos_enc_in_obj_ptrs:
            assert add_tpos_enc_to_obj_ptrs  # these options need to be used together
        self.proj_tpos_enc_in_obj_ptrs = proj_tpos_enc_in_obj_ptrs
        self.only_obj_ptrs_in_the_past_for_eval = only_obj_ptrs_in_the_past_for_eval

        # Part 2: memory attention to condition current frame's visual features
        # with memories (and object pointers) from past frames
        self.memory_attention = memory_attention
        self.hidden_dim = memory_attention.d_model

        # Part 3: memory encoder for the previous frame's outputs
        self.memory_encoder = memory_encoder
        self.mem_dim = self.hidden_dim
        if hasattr(self.memory_encoder, "out_proj") and hasattr(self.memory_encoder.out_proj, "weight"):
            # if there is compression of memories along the channel dim
            self.mem_dim = self.memory_encoder.out_proj.weight.shape[0]
        self.num_maskmem = num_maskmem  # number of memories accessible
        # Temporal encoding of the memories
        self.maskmem_tpos_enc = torch.nn.Parameter(torch.zeros(num_maskmem, 1, 1, self.mem_dim))
        trunc_normal_(self.maskmem_tpos_enc, std=0.02)
        # a single token to indicate no memory embedding from previous frames
        self.no_mem_embed = torch.nn.Parameter(torch.zeros(1, 1, self.hidden_dim))
        self.no_mem_pos_enc = torch.nn.Parameter(torch.zeros(1, 1, self.hidden_dim))
        trunc_normal_(self.no_mem_embed, std=0.02)
        trunc_normal_(self.no_mem_pos_enc, std=0.02)
        self.directly_add_no_mem_embed = directly_add_no_mem_embed
        # Apply sigmoid to the output raw mask logits (to turn them from range (-inf, +inf)
        # to range (0, 1)) before feeding them into the memory encoder
        self.sigmoid_scale_for_mem_enc = sigmoid_scale_for_mem_enc
        self.sigmoid_bias_for_mem_enc = sigmoid_bias_for_mem_enc
        self.binarize_mask_from_pts_for_mem_enc = binarize_mask_from_pts_for_mem_enc
        self.non_overlap_masks_for_mem_enc = non_overlap_masks_for_mem_enc
        self.memory_temporal_stride_for_eval = memory_temporal_stride_for_eval
        # On frames with mask input, whether to directly output the input mask without
        # using a SAM prompt encoder + mask decoder
        self.use_mask_input_as_output_without_sam = use_mask_input_as_output_without_sam
        self.multimask_output_in_sam = multimask_output_in_sam
        self.multimask_min_pt_num = multimask_min_pt_num
        self.multimask_max_pt_num = multimask_max_pt_num
        self.multimask_output_for_tracking = multimask_output_for_tracking
        self.use_multimask_token_for_obj_ptr = use_multimask_token_for_obj_ptr
        self.iou_prediction_use_sigmoid = iou_prediction_use_sigmoid

        # Part 4: SAM-style prompt encoder (for both mask and point inputs)
        # and SAM-style mask decoder for the final mask output
        self.image_size = image_size
        self.backbone_stride = backbone_stride
        self.sam_mask_decoder_extra_args = sam_mask_decoder_extra_args
        self.pred_obj_scores = pred_obj_scores
        self.pred_obj_scores_mlp = pred_obj_scores_mlp
        self.fixed_no_obj_ptr = fixed_no_obj_ptr
        self.soft_no_obj_ptr = soft_no_obj_ptr
        if self.fixed_no_obj_ptr:
            assert self.pred_obj_scores
            assert self.use_obj_ptrs_in_encoder
        if self.pred_obj_scores and self.use_obj_ptrs_in_encoder:
            self.no_obj_ptr = torch.nn.Parameter(torch.zeros(1, self.hidden_dim))
            trunc_normal_(self.no_obj_ptr, std=0.02)
        self.use_mlp_for_obj_ptr_proj = use_mlp_for_obj_ptr_proj

        self._build_sam_heads()
        self.add_all_frames_to_correct_as_cond = add_all_frames_to_correct_as_cond
        self.max_cond_frames_in_attn = max_cond_frames_in_attn

        # Model compilation
        if compile_image_encoder:
            # Compile the forward function (not the full module) to allow loading checkpoints.
            print("Image encoder compilation is enabled. First forward pass will be slow.")
            self.image_encoder.forward = torch.compile(
                self.image_encoder.forward,
                mode="max-autotune",
                fullgraph=True,
                dynamic=False,
            )

    @property
    def device(self):
        """Returns the device on which the model's parameters are stored."""
        return next(self.parameters()).device

    def forward(self, *args, **kwargs):
        """Processes image and prompt inputs to generate object masks and scores in video sequences."""
        raise NotImplementedError(
            "Please use the corresponding methods in SAM2VideoPredictor for inference. "
            "See notebooks/video_predictor_example.ipynb for an example."
        )

    def _build_sam_heads(self):
        """Builds SAM-style prompt encoder and mask decoder for image segmentation tasks."""
        self.sam_prompt_embed_dim = self.hidden_dim
        self.sam_image_embedding_size = self.image_size // self.backbone_stride

        # Build the SAM-style prompt encoder and mask decoder
        self.sam_prompt_encoder = PromptEncoder(
            embed_dim=self.sam_prompt_embed_dim,
            image_embedding_size=(
                self.sam_image_embedding_size,
                self.sam_image_embedding_size,
            ),
            input_image_size=(self.image_size, self.image_size),
            mask_in_chans=16,
        )
        self.sam_mask_decoder = SAM2MaskDecoder(
            num_multimask_outputs=3,
            transformer=SAM2TwoWayTransformer(
                depth=2,
                embedding_dim=self.sam_prompt_embed_dim,
                mlp_dim=2048,
                num_heads=8,
            ),
            transformer_dim=self.sam_prompt_embed_dim,
            iou_head_depth=3,
            iou_head_hidden_dim=256,
            use_high_res_features=self.use_high_res_features_in_sam,
            iou_prediction_use_sigmoid=self.iou_prediction_use_sigmoid,
            pred_obj_scores=self.pred_obj_scores,
            pred_obj_scores_mlp=self.pred_obj_scores_mlp,
            use_multimask_token_for_obj_ptr=self.use_multimask_token_for_obj_ptr,
            **(self.sam_mask_decoder_extra_args or {}),
        )
        if self.use_obj_ptrs_in_encoder:
            # a linear projection on SAM output tokens to turn them into object pointers
            self.obj_ptr_proj = torch.nn.Linear(self.hidden_dim, self.hidden_dim)
            if self.use_mlp_for_obj_ptr_proj:
                self.obj_ptr_proj = MLP(self.hidden_dim, self.hidden_dim, self.hidden_dim, 3)
        else:
            self.obj_ptr_proj = torch.nn.Identity()
        if self.proj_tpos_enc_in_obj_ptrs:
            # a linear projection on temporal positional encoding in object pointers to
            # avoid potential interference with spatial positional encoding
            self.obj_ptr_tpos_proj = torch.nn.Linear(self.hidden_dim, self.mem_dim)
        else:
            self.obj_ptr_tpos_proj = torch.nn.Identity()

    def _forward_sam_heads(
        self,
        backbone_features,
        point_inputs=None,
        mask_inputs=None,
        high_res_features=None,
        multimask_output=False,
    ):
        """
        Forward pass through SAM prompt encoders and mask heads.

        This method processes image features and optional point/mask inputs to generate object masks and scores.

        Args:
            backbone_features (torch.Tensor): Image features with shape (B, C, H, W).
            point_inputs (Dict[str, torch.Tensor] | None): Dictionary containing point prompts.
                'point_coords': Tensor of shape (B, P, 2) with float32 dtype, containing absolute
                    pixel-unit coordinates in (x, y) format for P input points.
                'point_labels': Tensor of shape (B, P) with int32 dtype, where 1 means positive clicks,
                    0 means negative clicks, and -1 means padding.
            mask_inputs (torch.Tensor | None): Mask of shape (B, 1, H*16, W*16), float or bool, with the
                same spatial size as the image.
            high_res_features (List[torch.Tensor] | None): List of two feature maps with shapes
                (B, C, 4*H, 4*W) and (B, C, 2*H, 2*W) respectively, used as high-resolution feature maps
                for SAM decoder.
            multimask_output (bool): If True, output 3 candidate masks and their IoU estimates; if False,
                output only 1 mask and its IoU estimate.

        Returns:
            (Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]):
                low_res_multimasks: Tensor of shape (B, M, H*4, W*4) with SAM output mask logits.
                high_res_multimasks: Tensor of shape (B, M, H*16, W*16) with upsampled mask logits.
                ious: Tensor of shape (B, M) with estimated IoU for each output mask.
                low_res_masks: Tensor of shape (B, 1, H*4, W*4) with best low-resolution mask.
                high_res_masks: Tensor of shape (B, 1, H*16, W*16) with best high-resolution mask.
                obj_ptr: Tensor of shape (B, C) with object pointer vector for the output mask.
                object_score_logits: Tensor of shape (B,) with object score logits.

            Where M is 3 if multimask_output=True, and 1 if multimask_output=False.

        Examples:
            >>> backbone_features = torch.rand(1, 256, 32, 32)
            >>> point_inputs = {"point_coords": torch.rand(1, 2, 2), "point_labels": torch.tensor([[1, 0]])}
            >>> mask_inputs = torch.rand(1, 1, 512, 512)
            >>> results = model._forward_sam_heads(backbone_features, point_inputs, mask_inputs)
            >>> (
            ...     low_res_multimasks,
            ...     high_res_multimasks,
            ...     ious,
            ...     low_res_masks,
            ...     high_res_masks,
            ...     obj_ptr,
            ...     object_score_logits,
            ... ) = results
        """
        B = backbone_features.size(0)
        device = backbone_features.device
        assert backbone_features.size(1) == self.sam_prompt_embed_dim
        assert backbone_features.size(2) == self.sam_image_embedding_size
        assert backbone_features.size(3) == self.sam_image_embedding_size

        # a) Handle point prompts
        if point_inputs is not None:
            sam_point_coords = point_inputs["point_coords"]
            sam_point_labels = point_inputs["point_labels"]
            assert sam_point_coords.size(0) == B and sam_point_labels.size(0) == B
        else:
            # If no points are provided, pad with an empty point (with label -1)
            sam_point_coords = torch.zeros(B, 1, 2, device=device)
            sam_point_labels = -torch.ones(B, 1, dtype=torch.int32, device=device)

        # b) Handle mask prompts
        if mask_inputs is not None:
            # If mask_inputs is provided, downsize it into low-res mask input if needed
            # and feed it as a dense mask prompt into the SAM mask encoder
            assert len(mask_inputs.shape) == 4 and mask_inputs.shape[:2] == (B, 1)
            if mask_inputs.shape[-2:] != self.sam_prompt_encoder.mask_input_size:
                sam_mask_prompt = F.interpolate(
                    mask_inputs.float(),
                    size=self.sam_prompt_encoder.mask_input_size,
                    align_corners=False,
                    mode="bilinear",
                    antialias=True,  # use antialias for downsampling
                )
            else:
                sam_mask_prompt = mask_inputs
        else:
            # Otherwise, simply feed None (and SAM's prompt encoder will add
            # a learned `no_mask_embed` to indicate no mask input in this case).
            sam_mask_prompt = None

        sparse_embeddings, dense_embeddings = self.sam_prompt_encoder(
            points=(sam_point_coords, sam_point_labels), boxes=None, masks=sam_mask_prompt
        )
        low_res_multimasks, ious, sam_output_tokens, object_score_logits = self.sam_mask_decoder(
            image_embeddings=backbone_features,
            image_pe=self.sam_prompt_encoder.get_dense_pe(),
            sparse_prompt_embeddings=sparse_embeddings,
            dense_prompt_embeddings=dense_embeddings,
            multimask_output=multimask_output,
            repeat_image=False,  # the image is already batched
            high_res_features=high_res_features,
        )
        if self.pred_obj_scores:
            is_obj_appearing = object_score_logits > 0

            # Spatial memory masks are a *hard* choice between obj and no-obj,
            # consistent with the actual mask prediction
            low_res_multimasks = torch.where(is_obj_appearing[:, None, None], low_res_multimasks, NO_OBJ_SCORE)

        # convert masks from possibly bfloat16 (or float16) to float32
        # (older PyTorch versions before 2.1 don't support `interpolate` on bf16)
        low_res_multimasks = low_res_multimasks.float()
        high_res_multimasks = F.interpolate(
            low_res_multimasks,
            size=(self.image_size, self.image_size),
            mode="bilinear",
            align_corners=False,
        )

        sam_output_token = sam_output_tokens[:, 0]
        if multimask_output:
            # take the best mask prediction (with the highest IoU estimation)
            best_iou_inds = torch.argmax(ious, dim=-1)
            batch_inds = torch.arange(B, device=device)
            low_res_masks = low_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
            high_res_masks = high_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
            if sam_output_tokens.size(1) > 1:
                sam_output_token = sam_output_tokens[batch_inds, best_iou_inds]
        else:
            low_res_masks, high_res_masks = low_res_multimasks, high_res_multimasks

        # Extract object pointer from the SAM output token (with occlusion handling)
        obj_ptr = self.obj_ptr_proj(sam_output_token)
        if self.pred_obj_scores:
            # Allow *soft* no-obj pointer, unlike for masks
            if self.soft_no_obj_ptr:
                assert not self.teacher_force_obj_scores_for_mem
                lambda_is_obj_appearing = object_score_logits.sigmoid()
            else:
                lambda_is_obj_appearing = is_obj_appearing.float()

            if self.fixed_no_obj_ptr:
                obj_ptr = lambda_is_obj_appearing * obj_ptr
            obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr

        return (
            low_res_multimasks,
            high_res_multimasks,
            ious,
            low_res_masks,
            high_res_masks,
            obj_ptr,
            object_score_logits,
        )

    def _use_mask_as_output(self, backbone_features, high_res_features, mask_inputs):
        """Processes mask inputs directly as output, bypassing SAM encoder/decoder."""
        # Use -10/+10 as logits for neg/pos pixels (very close to 0/1 in prob after sigmoid).
        out_scale, out_bias = 20.0, -10.0  # sigmoid(-10.0)=4.5398e-05
        mask_inputs_float = mask_inputs.float()
        high_res_masks = mask_inputs_float * out_scale + out_bias
        low_res_masks = F.interpolate(
            high_res_masks,
            size=(high_res_masks.size(-2) // 4, high_res_masks.size(-1) // 4),
            align_corners=False,
            mode="bilinear",
            antialias=True,  # use antialias for downsampling
        )
        # a dummy IoU prediction of all 1's under mask input
        ious = mask_inputs.new_ones(mask_inputs.size(0), 1).float()
        if not self.use_obj_ptrs_in_encoder:
            # all zeros as a dummy object pointer (of shape [B, C])
            obj_ptr = torch.zeros(mask_inputs.size(0), self.hidden_dim, device=mask_inputs.device)
        else:
            # produce an object pointer using the SAM decoder from the mask input
            _, _, _, _, _, obj_ptr, _ = self._forward_sam_heads(
                backbone_features=backbone_features,
                mask_inputs=self.mask_downsample(mask_inputs_float),
                high_res_features=high_res_features,
            )
        # In this method, the mask input is treated as the output, e.g. it is used directly to create
        # spatial memory; it is also used to decide whether the object appears, instead of relying on
        # the object scores from the SAM decoder.
        is_obj_appearing = torch.any(mask_inputs.flatten(1).float() > 0.0, dim=1)
        is_obj_appearing = is_obj_appearing[..., None]
        lambda_is_obj_appearing = is_obj_appearing.float()
        object_score_logits = out_scale * lambda_is_obj_appearing + out_bias
        if self.pred_obj_scores:
            if self.fixed_no_obj_ptr:
                obj_ptr = lambda_is_obj_appearing * obj_ptr
            obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr

        return (
            low_res_masks,
            high_res_masks,
            ious,
            low_res_masks,
            high_res_masks,
            obj_ptr,
            object_score_logits,
        )

    def forward_image(self, img_batch: torch.Tensor):
        """Processes image batch through encoder to extract multi-level features for SAM model."""
        backbone_out = self.image_encoder(img_batch)
        if self.use_high_res_features_in_sam:
            # precompute projected level 0 and level 1 features in the SAM decoder
            # to avoid running them again on every SAM click
            backbone_out["backbone_fpn"][0] = self.sam_mask_decoder.conv_s0(backbone_out["backbone_fpn"][0])
            backbone_out["backbone_fpn"][1] = self.sam_mask_decoder.conv_s1(backbone_out["backbone_fpn"][1])
        return backbone_out

    def _prepare_backbone_features(self, backbone_out):
        """Prepares and flattens visual features from the image backbone output for further processing."""
        backbone_out = backbone_out.copy()
        assert len(backbone_out["backbone_fpn"]) == len(backbone_out["vision_pos_enc"])
        assert len(backbone_out["backbone_fpn"]) >= self.num_feature_levels

        feature_maps = backbone_out["backbone_fpn"][-self.num_feature_levels :]
        vision_pos_embeds = backbone_out["vision_pos_enc"][-self.num_feature_levels :]

        feat_sizes = [(x.shape[-2], x.shape[-1]) for x in vision_pos_embeds]
        # flatten NxCxHxW to HWxNxC
        vision_feats = [x.flatten(2).permute(2, 0, 1) for x in feature_maps]
        vision_pos_embeds = [x.flatten(2).permute(2, 0, 1) for x in vision_pos_embeds]

        return backbone_out, vision_feats, vision_pos_embeds, feat_sizes

    def _prepare_memory_conditioned_features(
        self,
        frame_idx,
        is_init_cond_frame,
        current_vision_feats,
        current_vision_pos_embeds,
        feat_sizes,
        output_dict,
        num_frames,
        track_in_reverse=False,
    ):
        """Prepares memory-conditioned features by fusing current frame's visual features with previous memories."""
        B = current_vision_feats[-1].size(1)  # batch size on this frame
        C = self.hidden_dim
        H, W = feat_sizes[-1]  # top-level (lowest-resolution) feature size
        device = current_vision_feats[-1].device
        # The case of `self.num_maskmem == 0` is primarily used for reproducing SAM on images;
        # in this case, we skip the fusion with any memory.
        if self.num_maskmem == 0:
            return current_vision_feats[-1].permute(1, 2, 0).view(B, C, H, W)
        num_obj_ptr_tokens = 0
        # Step 1: condition the visual features of the current frame on previous memories
        if not is_init_cond_frame:
            # Retrieve the memories encoded with the maskmem backbone
            to_cat_memory, to_cat_memory_pos_embed = [], []
            # Add conditioning frames' output first (all cond frames have t_pos=0 when getting
            # the temporal positional embedding below)
            assert len(output_dict["cond_frame_outputs"]) > 0
            # Select a maximum number of temporally closest cond frames for cross-attention
            cond_outputs = output_dict["cond_frame_outputs"]
            selected_cond_outputs, unselected_cond_outputs = select_closest_cond_frames(
                frame_idx, cond_outputs, self.max_cond_frames_in_attn
            )
            t_pos_and_prevs = [(0, out) for out in selected_cond_outputs.values()]
            # Add the last (self.num_maskmem - 1) frames before the current frame as non-conditioning memory;
            # memory frames may be taken non-consecutively (with stride r > 1 during eval).
            r = self.memory_temporal_stride_for_eval
            for t_pos in range(1, self.num_maskmem):
                t_rel = self.num_maskmem - t_pos  # how many frames before the current frame
                if t_rel == 1:
                    # for t_rel == 1, take the last frame (regardless of r)
                    prev_frame_idx = frame_idx + t_rel if track_in_reverse else frame_idx - t_rel
                elif not track_in_reverse:
                    # first find the nearest frame among every r-th frame before this frame
                    prev_frame_idx = ((frame_idx - 2) // r) * r
                    # then seek further among every r-th frame
                    prev_frame_idx = prev_frame_idx - (t_rel - 2) * r
                else:
                    # first find the nearest frame among every r-th frame after this frame
                    prev_frame_idx = -(-(frame_idx + 2) // r) * r
                    # then seek further among every r-th frame
                    prev_frame_idx = prev_frame_idx + (t_rel - 2) * r
                out = output_dict["non_cond_frame_outputs"].get(prev_frame_idx, None)
                if out is None:
                    # If an unselected conditioning frame is among the last (self.num_maskmem - 1)
                    # frames, we still attend to it as if it's a non-conditioning frame.
                    out = unselected_cond_outputs.get(prev_frame_idx, None)
                t_pos_and_prevs.append((t_pos, out))

            for t_pos, prev in t_pos_and_prevs:
                if prev is None:
                    continue  # skip padding frames
                # "maskmem_features" might have been offloaded to CPU in demo use cases,
                # so we load it back to GPU (it's a no-op if it's already on GPU).
                feats = prev["maskmem_features"].cuda(non_blocking=True)
                to_cat_memory.append(feats.flatten(2).permute(2, 0, 1))
                # Spatial positional encoding (it might have been offloaded to CPU in eval)
                maskmem_enc = prev["maskmem_pos_enc"][-1].cuda()
                maskmem_enc = maskmem_enc.flatten(2).permute(2, 0, 1)
                # Temporal positional encoding
                maskmem_enc = maskmem_enc + self.maskmem_tpos_enc[self.num_maskmem - t_pos - 1]
                to_cat_memory_pos_embed.append(maskmem_enc)

            # Construct the list of past object pointers
            if self.use_obj_ptrs_in_encoder:
                max_obj_ptrs_in_encoder = min(num_frames, self.max_obj_ptrs_in_encoder)
                # First add those object pointers from selected conditioning frames
                # (optionally, only include object pointers in the past during evaluation)
                if not self.training and self.only_obj_ptrs_in_the_past_for_eval:
                    ptr_cond_outputs = {
                        t: out
                        for t, out in selected_cond_outputs.items()
                        if (t >= frame_idx if track_in_reverse else t <= frame_idx)
                    }
                else:
                    ptr_cond_outputs = selected_cond_outputs
                pos_and_ptrs = [
                    # Temporal pos encoding contains how far away each pointer is from the current frame
                    (abs(frame_idx - t), out["obj_ptr"])
                    for t, out in ptr_cond_outputs.items()
                ]
                # Add up to (max_obj_ptrs_in_encoder - 1) non-conditioning frames before the current frame
                for t_diff in range(1, max_obj_ptrs_in_encoder):
                    t = frame_idx + t_diff if track_in_reverse else frame_idx - t_diff
                    if t < 0 or (num_frames is not None and t >= num_frames):
                        break
                    out = output_dict["non_cond_frame_outputs"].get(t, unselected_cond_outputs.get(t, None))
                    if out is not None:
                        pos_and_ptrs.append((t_diff, out["obj_ptr"]))
                # If we have at least one object pointer, add them to the cross-attention
                if pos_and_ptrs:
                    pos_list, ptrs_list = zip(*pos_and_ptrs)
                    # stack object pointers along dim=0 into [ptr_seq_len, B, C] shape
                    obj_ptrs = torch.stack(ptrs_list, dim=0)
                    # a temporal positional embedding based on how far each object pointer is from
                    # the current frame (sine embedding normalized by the max pointer num)
                    if self.add_tpos_enc_to_obj_ptrs:
                        t_diff_max = max_obj_ptrs_in_encoder - 1
                        tpos_dim = C if self.proj_tpos_enc_in_obj_ptrs else self.mem_dim
                        obj_pos = torch.tensor(pos_list, device=device)
                        obj_pos = get_1d_sine_pe(obj_pos / t_diff_max, dim=tpos_dim)
                        obj_pos = self.obj_ptr_tpos_proj(obj_pos)
                        obj_pos = obj_pos.unsqueeze(1).expand(-1, B, self.mem_dim)
                    else:
                        obj_pos = obj_ptrs.new_zeros(len(pos_list), B, self.mem_dim)
                    if self.mem_dim < C:
                        # split a pointer into (C // self.mem_dim) tokens for self.mem_dim < C
                        obj_ptrs = obj_ptrs.reshape(-1, B, C // self.mem_dim, self.mem_dim)
                        obj_ptrs = obj_ptrs.permute(0, 2, 1, 3).flatten(0, 1)
                        obj_pos = obj_pos.repeat_interleave(C // self.mem_dim, dim=0)
                    to_cat_memory.append(obj_ptrs)
                    to_cat_memory_pos_embed.append(obj_pos)
                    num_obj_ptr_tokens = obj_ptrs.shape[0]
        else:
            # for initial conditioning frames, encode them without using any previous memory
            if self.directly_add_no_mem_embed:
                # directly add the no-memory embedding (instead of using the transformer encoder)
                pix_feat_with_mem = current_vision_feats[-1] + self.no_mem_embed
                pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W)
                return pix_feat_with_mem

            # Use a dummy token on the first frame (to avoid an empty memory input to the transformer encoder)
            to_cat_memory = [self.no_mem_embed.expand(1, B, self.mem_dim)]
            to_cat_memory_pos_embed = [self.no_mem_pos_enc.expand(1, B, self.mem_dim)]

        # Step 2: concatenate the memories and forward through the transformer encoder
        memory = torch.cat(to_cat_memory, dim=0)
        memory_pos_embed = torch.cat(to_cat_memory_pos_embed, dim=0)

        pix_feat_with_mem = self.memory_attention(
            curr=current_vision_feats,
            curr_pos=current_vision_pos_embeds,
            memory=memory,
            memory_pos=memory_pos_embed,
            num_obj_ptr_tokens=num_obj_ptr_tokens,
        )
        # reshape the output (HW)BC => BCHW
        pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W)
        return pix_feat_with_mem

    def _encode_new_memory(self, current_vision_feats, feat_sizes, pred_masks_high_res, is_mask_from_pts):
        """Encodes frame features and masks into a new memory representation for video segmentation."""
        B = current_vision_feats[-1].size(1)  # batch size on this frame
        C = self.hidden_dim
        H, W = feat_sizes[-1]  # top-level (lowest-resolution) feature size
        # top-level feature, (HW)BC => BCHW
        pix_feat = current_vision_feats[-1].permute(1, 2, 0).view(B, C, H, W)
        if self.non_overlap_masks_for_mem_enc and not self.training:
            # optionally, apply non-overlapping constraints to the masks (applied in the batch
            # dimension; only used during eval where all objects come from the same video)
            pred_masks_high_res = self._apply_non_overlapping_constraints(pred_masks_high_res)
        # scale the raw mask logits with a temperature before applying the sigmoid
        binarize = self.binarize_mask_from_pts_for_mem_enc and is_mask_from_pts
        if binarize and not self.training:
            mask_for_mem = (pred_masks_high_res > 0).float()
        else:
            # apply sigmoid on the raw mask logits to turn them into range (0, 1)
            mask_for_mem = torch.sigmoid(pred_masks_high_res)
        # apply scale and bias terms to the sigmoid probabilities
        if self.sigmoid_scale_for_mem_enc != 1.0:
            mask_for_mem = mask_for_mem * self.sigmoid_scale_for_mem_enc
        if self.sigmoid_bias_for_mem_enc != 0.0:
            mask_for_mem = mask_for_mem + self.sigmoid_bias_for_mem_enc
        maskmem_out = self.memory_encoder(pix_feat, mask_for_mem, skip_mask_sigmoid=True)  # sigmoid already applied
        maskmem_features = maskmem_out["vision_features"]
        maskmem_pos_enc = maskmem_out["vision_pos_enc"]

        return maskmem_features, maskmem_pos_enc

    def track_step(
        self,
        frame_idx,
        is_init_cond_frame,
        current_vision_feats,
        current_vision_pos_embeds,
        feat_sizes,
        point_inputs,
        mask_inputs,
        output_dict,
        num_frames,
        track_in_reverse=False,  # tracking in reverse time order (for demo usage)
        # Whether to run the memory encoder on the predicted masks; it can be skipped, e.g. while a
        # user is still interacting with a frame, or when training SAM on static images.
        run_mem_encoder=True,
        # Previously predicted SAM mask logits (which can be fed together with new clicks in demo).
        prev_sam_mask_logits=None,
    ):
        """Performs a single tracking step, updating object masks and memory features based on current frame inputs."""
        current_out = {"point_inputs": point_inputs, "mask_inputs": mask_inputs}
        # High-resolution feature maps for the SAM head, reshape (HW)BC => BCHW
        if len(current_vision_feats) > 1:
            high_res_features = [
                x.permute(1, 2, 0).view(x.size(1), x.size(2), *s)
                for x, s in zip(current_vision_feats[:-1], feat_sizes[:-1])
            ]
        else:
            high_res_features = None
        if mask_inputs is not None and self.use_mask_input_as_output_without_sam:
            # When use_mask_input_as_output_without_sam=True, directly output the mask input
            # (treated as a GT mask) without using a SAM prompt encoder + mask decoder.
            pix_feat = current_vision_feats[-1].permute(1, 2, 0)
            pix_feat = pix_feat.view(-1, self.hidden_dim, *feat_sizes[-1])
            sam_outputs = self._use_mask_as_output(pix_feat, high_res_features, mask_inputs)
        else:
            # fuse the visual feature with previous memory features in the memory bank
            pix_feat_with_mem = self._prepare_memory_conditioned_features(
                frame_idx=frame_idx,
                is_init_cond_frame=is_init_cond_frame,
                current_vision_feats=current_vision_feats[-1:],
                current_vision_pos_embeds=current_vision_pos_embeds[-1:],
                feat_sizes=feat_sizes[-1:],
                output_dict=output_dict,
                num_frames=num_frames,
                track_in_reverse=track_in_reverse,
            )
            # apply the SAM-style segmentation head; previously predicted low-res SAM mask logits
            # may be fed into the SAM mask decoder as a dense prompt (e.g. in interactive demo usage)
            if prev_sam_mask_logits is not None:
                assert point_inputs is not None and mask_inputs is None
                mask_inputs = prev_sam_mask_logits
            multimask_output = self._use_multimask(is_init_cond_frame, point_inputs)
            sam_outputs = self._forward_sam_heads(
                backbone_features=pix_feat_with_mem,
                point_inputs=point_inputs,
                mask_inputs=mask_inputs,
                high_res_features=high_res_features,
                multimask_output=multimask_output,
            )
        _, _, _, low_res_masks, high_res_masks, obj_ptr, _ = sam_outputs

        current_out["pred_masks"] = low_res_masks
        current_out["pred_masks_high_res"] = high_res_masks
        current_out["obj_ptr"] = obj_ptr

        # Finally run the memory encoder on the predicted mask to encode it into
        # a new memory feature (that can be used in future frames)
        if run_mem_encoder and self.num_maskmem > 0:
            high_res_masks_for_mem_enc = high_res_masks
            maskmem_features, maskmem_pos_enc = self._encode_new_memory(
                current_vision_feats=current_vision_feats,
                feat_sizes=feat_sizes,
                pred_masks_high_res=high_res_masks_for_mem_enc,
                is_mask_from_pts=(point_inputs is not None),
            )
            current_out["maskmem_features"] = maskmem_features
            current_out["maskmem_pos_enc"] = maskmem_pos_enc
        else:
            current_out["maskmem_features"] = None
            current_out["maskmem_pos_enc"] = None

        return current_out

    def _use_multimask(self, is_init_cond_frame, point_inputs):
        """Determines whether to use multiple mask outputs in the SAM head based on configuration and inputs."""
        num_pts = 0 if point_inputs is None else point_inputs["point_labels"].size(1)
        return (
            self.multimask_output_in_sam
            and (is_init_cond_frame or self.multimask_output_for_tracking)
            and (self.multimask_min_pt_num <= num_pts <= self.multimask_max_pt_num)
        )

    def _apply_non_overlapping_constraints(self, pred_masks):
        """Applies non-overlapping constraints to masks, keeping highest scoring object per location."""
        batch_size = pred_masks.size(0)
        if batch_size == 1:
            return pred_masks

        device = pred_masks.device
        # "max_obj_inds": object index of the object with the highest score at each location
        max_obj_inds = torch.argmax(pred_masks, dim=0, keepdim=True)
        # "batch_obj_inds": object index of each object slice (along dim 0) in `pred_masks`
        batch_obj_inds = torch.arange(batch_size, device=device)[:, None, None, None]
        keep = max_obj_inds == batch_obj_inds
        # suppress overlapping regions' scores below -10.0 so that the foreground regions
        # don't overlap (here sigmoid(-10.0)=4.5398e-05)
        pred_masks = torch.where(keep, pred_masks, torch.clamp(pred_masks, max=-10.0))
        return pred_masks

    def set_imgsz(self, imgsz):
        """
        Set image size to make model compatible with different image sizes.

        Args:
            imgsz (Tuple[int, int]): The size of the input image.
        """
        self.image_size = imgsz[0]
        self.sam_prompt_encoder.input_image_size = imgsz
        self.sam_prompt_encoder.image_embedding_size = [x // 16 for x in imgsz]
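

# --- Illustrative sketch (not part of the original module) ------------------
# A minimal, self-contained example of two ideas used above: normalizing an
# image with the `pixel_mean`/`pixel_std` buffers registered in SAMModel, and
# suppressing mask logits with the NO_OBJ_SCORE placeholder as done via
# torch.where in `_forward_sam_heads`. Tensor shapes here are assumptions
# chosen purely for illustration.
if __name__ == "__main__":
    pixel_mean = torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1)
    pixel_std = torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1)
    image = torch.rand(3, 1024, 1024) * 255.0  # dummy CHW image in [0, 255]
    normalized = (image - pixel_mean) / pixel_std  # what the image encoder would consume

    # Replace logits with NO_OBJ_SCORE for batch items where no object is predicted.
    mask_logits = torch.randn(2, 1, 64, 64)
    is_obj_appearing = torch.tensor([True, False])
    suppressed = torch.where(
        is_obj_appearing[:, None, None, None], mask_logits, torch.tensor(NO_OBJ_SCORE)
    )
    print(normalized.shape, suppressed.min().item())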