import warnings

import torch
from torch.cuda import nccl
from torch._utils import (_take_tensors, _flatten_dense_tensors, _unflatten_dense_tensors,
                          _reorder_tensors_as, _get_device_index, _handle_complex)
from typing import List


def broadcast(tensor, devices=None, *, out=None):
    r"""Broadcasts a tensor to specified GPU devices.

    Args:
        tensor (Tensor): tensor to broadcast. Can be on CPU or GPU.
        devices (Iterable[torch.device, str or int], optional): an iterable of
          GPU devices, among which to broadcast.
        out (Sequence[Tensor], optional, keyword-only): the GPU tensors to
          store output results.

    .. note::
        Exactly one of :attr:`devices` and :attr:`out` must be specified.

    Returns:
        - If :attr:`devices` is specified,
            a tuple containing copies of :attr:`tensor`, placed on
            :attr:`devices`.
        - If :attr:`out` is specified,
            a tuple containing :attr:`out` tensors, each containing a copy of
            :attr:`tensor`.
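
    A minimal usage sketch, assuming a machine with at least two visible CUDA
    devices (device indices 0 and 1 below are placeholders)::

        >>> import torch
        >>> from torch.nn.parallel import comm
        >>> t = torch.arange(4.)
        >>> t0, t1 = comm.broadcast(t, devices=[0, 1])
        >>> t0.device, t1.device
        (device(type='cuda', index=0), device(type='cuda', index=1))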
    """
    tensor = _handle_complex(tensor)
    if not ((devices is None) ^ (out is None)):
        raise RuntimeError(
            "Exactly one of 'devices' and 'out' must be specified, "
            "but got devices={} and out={}".format(devices, out))
    if devices is not None:
        devices = [_get_device_index(d) for d in devices]
        return torch._C._broadcast(tensor, devices)
    else:
        return torch._C._broadcast_out(tensor, out)


def broadcast_coalesced(tensors, devices, buffer_size=10485760):
    r"""Broadcasts a sequence of tensors to the specified GPUs.
    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Args:
        tensors (sequence): tensors to broadcast. Must be on the same device,
          either CPU or GPU.
        devices (Iterable[torch.device, str or int]): an iterable of GPU
          devices, among which to broadcast.
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple containing copies of :attr:`tensors`, placed on :attr:`devices`.
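
    A minimal usage sketch, assuming a machine with at least two visible CUDA
    devices (device indices 0 and 1 below are placeholders)::

        >>> import torch
        >>> from torch.nn.parallel import comm
        >>> tensors = [torch.zeros(2), torch.ones(3)]
        >>> copies = comm.broadcast_coalesced(tensors, devices=[0, 1])
        >>> len(copies), copies[0][0].device, copies[1][1].device
        (2, device(type='cuda', index=0), device(type='cuda', index=1))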
    """
    devices = [_get_device_index(d) for d in devices]
    tensors = [_handle_complex(t) for t in tensors]
    return torch._C._broadcast_coalesced(tensors, devices, buffer_size)


def reduce_add(inputs, destination=None):
    r"""Sums tensors from multiple GPUs.

    All inputs should have matching shapes, dtype, and layout. The output tensor
    will be of the same shape, dtype, and layout.

    Args:
        inputs (Iterable[Tensor]): an iterable of tensors to add.
        destination (int, optional): a device on which the output will be
            placed (default: current device).

    Returns:
        A tensor containing an elementwise sum of all inputs, placed on the
        :attr:`destination` device.
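
    A minimal usage sketch, assuming a machine with at least two visible CUDA
    devices (``cuda:0`` and ``cuda:1`` below are placeholders)::

        >>> import torch
        >>> from torch.nn.parallel import comm
        >>> xs = [torch.ones(3, device='cuda:0'), torch.ones(3, device='cuda:1')]
        >>> comm.reduce_add(xs, destination=0)
        tensor([2., 2., 2.], device='cuda:0')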
    T)optionalr   Ncpuz+reduce_add expects all inputs to be on GPUsxc                 s   s   | ]}t |V  qd S Nstrr   r&   r   r   r   	<genexpr>T   r   zreduce_add.<locals>.<genexpr>c                 s   s   | ]}t |V  qd S r'   r(   r*   r   r   r   r+   U   r   z2input {} has invalid size: got {}, but expected {}zLreduce_add expects destination to be on the same GPU with one of the tensors   )outputrootc                    s   g | ]\}}| kr|qS r   r   )r   ir   Z
root_indexr   r   r   c   r   zreduce_add.<locals>.<listcomp>)devicenon_blocking)r   size	enumerater1   type
get_devicejoin
ValueErrorr   r   lenr   is_availabler   
empty_likereducetoadd_)inputsdestination
input_sizer/   inpgotexpectedresultZdestination_deviceZnonroototherr   r0   r   
reduce_add=   s4    
rG   c                    s   dd | D }g }g }t |  D ]x}tdd |D rXt||}|| ||d  qt ||D ] \}}	||	jr||	 n|	 qb||d d  q fdd|D }
t |
 D ]<}dd |D }t||}t||d D ]}	||	j qqtt	||S )	a]  Sums tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Args:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
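
    A minimal usage sketch, assuming a machine with at least two visible CUDA
    devices (``cuda:0`` and ``cuda:1`` below are placeholders)::

        >>> import torch
        >>> from torch.nn.parallel import comm
        >>> dev0 = [torch.ones(2, device='cuda:0'), torch.ones(3, device='cuda:0')]
        >>> dev1 = [torch.ones(2, device='cuda:1'), torch.ones(3, device='cuda:1')]
        >>> sums = comm.reduce_add_coalesced([dev0, dev1], destination=0)
        >>> [s.tolist() for s in sums]
        [[2.0, 2.0], [2.0, 2.0, 2.0]]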
    """
    # group dense tensors per input device: shape (num_gpus, num_tensors)
    dense_tensors: List[List] = [[] for _ in inputs]
    output = []
    ref_order = []
    # process sparse tensors one by one, since they may have different sizes on different GPUs
    for tensor_at_gpus in zip(*inputs):
        if all(t.is_sparse for t in tensor_at_gpus):
            result = reduce_add(tensor_at_gpus, destination)  # this will be sparse too
            output.append(result)
            ref_order.append(tensor_at_gpus[0])
        else:
            for coll, t in zip(dense_tensors, tensor_at_gpus):
                coll.append(t.to_dense() if t.is_sparse else t)
            ref_order.append(dense_tensors[0][-1])
    itrs = [_take_tensors(tensors, buffer_size) for tensors in dense_tensors]
    # now reduce the dense tensors, which have consistent sizes, in coalesced chunks
    for chunks in zip(*itrs):
        flat_tensors = [_flatten_dense_tensors(chunk) for chunk in chunks]  # (num_gpus,)
        flat_result = reduce_add(flat_tensors, destination)
        for t in _unflatten_dense_tensors(flat_result, chunks[0]):
            # the unflattened tensors do not share storage with the flat buffer,
            # so return `.data` to give them independent version counters
            output.append(t.data)
    return tuple(_reorder_tensors_as(output, ref_order))


def scatter(tensor, devices=None, chunk_sizes=None, dim=0, streams=None, *, out=None):
    r"""Scatters tensor across multiple GPUs.

    Args:
        tensor (Tensor): tensor to scatter. Can be on CPU or GPU.
        devices (Iterable[torch.device, str or int], optional): an iterable of
          GPU devices, among which to scatter.
        chunk_sizes (Iterable[int], optional): sizes of chunks to be placed on
          each device. It should match :attr:`devices` in length and sums to
          ``tensor.size(dim)``. If not specified, :attr:`tensor` will be divided
          into equal chunks.
        dim (int, optional): a dimension along which to chunk :attr:`tensor`.
          Default: ``0``.
        streams (Iterable[Stream], optional): an iterable of Streams, among
          which to execute the scatter. If not specified, the default stream will
          be utilized.
        out (Sequence[Tensor], optional, keyword-only): the GPU tensors to
          store output results. Sizes of these tensors must match that of
          :attr:`tensor`, except for :attr:`dim`, where the total size must
          sum to ``tensor.size(dim)``.

    .. note::
        Exactly one of :attr:`devices` and :attr:`out` must be specified. When
        :attr:`out` is specified, :attr:`chunk_sizes` must not be specified and
        will be inferred from sizes of :attr:`out`.

    Returns:
        - If :attr:`devices` is specified,
            a tuple containing chunks of :attr:`tensor`, placed on
            :attr:`devices`.
        - If :attr:`out` is specified,
            a tuple containing :attr:`out` tensors, each containing a chunk of
            :attr:`tensor`.
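
    A minimal usage sketch, assuming a machine with at least two visible CUDA
    devices (device indices 0 and 1 below are placeholders)::

        >>> import torch
        >>> from torch.nn.parallel import comm
        >>> t = torch.arange(6.)
        >>> chunks = comm.scatter(t, devices=[0, 1], dim=0)
        >>> [c.tolist() for c in chunks]
        [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]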
    """
    tensor = _handle_complex(tensor)
    if out is None:
        devices = [_get_device_index(d) for d in devices]
        return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams))
    else:
        if devices is not None:
            raise RuntimeError(
                "'devices' must not be specified when 'out' is specified, "
                "but got devices={}".format(devices))
        if chunk_sizes is not None:
            raise RuntimeError(
                "'chunk_sizes' must not be specified when 'out' is specified, "
                "but got chunk_sizes={}".format(chunk_sizes))
        return tuple(torch._C._scatter_out(tensor, out, dim, streams))


def gather(tensors, dim=0, destination=None, *, out=None):
    r"""Gathers tensors from multiple GPU devices.

    Args:
        tensors (Iterable[Tensor]): an iterable of tensors to gather.
          Tensor sizes in all dimensions other than :attr:`dim` have to match.
        dim (int, optional): a dimension along which the tensors will be
          concatenated. Default: ``0``.
        destination (torch.device, str, or int, optional): the output device.
          Can be CPU or CUDA. Default: the current CUDA device.
        out (Tensor, optional, keyword-only): the tensor to store the gather result.
          Its sizes must match those of :attr:`tensors`, except for :attr:`dim`,
          where the size must equal ``sum(tensor.size(dim) for tensor in tensors)``.
          Can be on CPU or CUDA.

    .. note::
        :attr:`destination` must not be specified when :attr:`out` is specified.

    Returns:
        - If :attr:`destination` is specified,
            a tensor located on :attr:`destination` device, that is a result of
            concatenating :attr:`tensors` along :attr:`dim`.
        - If :attr:`out` is specified,
            the :attr:`out` tensor, now containing results of concatenating
            :attr:`tensors` along :attr:`dim`.
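
    A minimal usage sketch, assuming a machine with at least two visible CUDA
    devices (``cuda:0`` and ``cuda:1`` below are placeholders)::

        >>> import torch
        >>> from torch.nn.parallel import comm
        >>> parts = [torch.ones(2, device='cuda:0'), torch.zeros(2, device='cuda:1')]
        >>> comm.gather(parts, dim=0, destination='cpu')
        tensor([1., 1., 0., 0.])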
    """
    tensors = [_handle_complex(t) for t in tensors]
    if out is None:
        if destination == -1:
            warnings.warn(
                'Using -1 to represent CPU tensor is deprecated. Please use a '
                'device object or string instead, e.g., "cpu".')
        destination = _get_device_index(destination, allow_cpu=True, optional=True)
        return torch._C._gather(tensors, dim, destination)
    else:
        if destination is not None:
            raise RuntimeError(
                "'destination' must not be specified when 'out' is specified, "
                "but got destination={}".format(destination))
        return torch._C._gather_out(tensors, out, dim)