a
    ==ic)                     @   s6  d dl mZ d dlmZmZmZmZmZ d dlmZ d dl	Z	d dl
Zd dlZd dlZe  edZeej dd Zdd	 Zd
d Zejdd Zdd Zedkr2e  \ZZejrdnd Zedddd edge  e!ej"rej#nej$dZ%e&e% ee W d   n1 s(0    Y  dS )    )
caffe2_pb2)	workspacecoreutilsrnn_cellmodel_helper)	recurrentNZ
lstm_benchc                 C   sz  t d|  td}|jg dd| d}|jg dd| d}t| td}||dgdg ||d	gd	g t	j
d
 g }t| D ]}	|	td| d  dkrtd|	|  |	dkr|st	j
d|d g|dd  n|}
t	j
j|
 t	j}|
d }|| }t	j
|
d | t	j}td| td	| t|  ||
d |
d   qt d |||fS )z&
    Fill a queue with input data
    z Generating T={} sequence batchesZgenerate_input_initZ
inputqueue   )Z	num_blobscapacityZ
labelqueueZgenerate_inputZscratchZ	label_scri+
  
   r   zGenerating data {}/{}NzFinished data generation)loginfoformatr   ZNetZCreateBlobsQueuer   
RunNetOnceZEnqueueBlobsnprandomseedrangemaxprintrandintZrandZastypefloat32int32FeedBlobProtoappend)TshapeZ
num_labelsfixed_shapeZgenerate_input_init_netqueuelabel_queueZgenerate_input_netentry_countstZrandom_shapeX
batch_sizeLlabels r'   m/home/droni/.local/share/virtualenvs/DPS-5Je3_V2c/lib/python3.9/site-packages/caffe2/python/lstm_benchmark.pygenerate_data   sB    


"
r)   c                 C   s^  t jdd}|jdd\}}|j|d}|j|d}g }	| jdv rd }
d| jv rz| jsfJ d	| j}
td
	|
 t
| jD ]0}|jd	|d	|\}}|	||g qtj||||	| j| jg| j d| j| jdd|
d\}}}}d| jv r~td d|j _d|j _n`| jdkrp|jdd}	|jjg ||d tj|||	| j| jd| jd\}}}nds~J d|j|d}|j||||gddg\}}| js||g |	D ]P}|j|| | j}| jdkr|| j9 }t|t j!d| j"|gt j#d q| j$rV|j j%D ](}|j&d r,t'j(|| j)| j*d! q,||fS )"NZ
LSTM_bench)nameseq_lengthstargetZ
input_datalabel)ownstaticZ
static_dagr/   z0Random input length is not static RNN compatiblezUsing static RNN of size {}zhidden_init_{}zcell_init_{}Zlstm1T)model
input_blobr+   initial_statesdim_indim_outscopememory_optimizationforward_onlyZdrop_statesZreturn_last_layer_onlyZstatic_rnn_unroll_sizeZdagzUsing DAG net type   Zcudnnhidden_init	cell_init)r   Z	cudnnlstm)r0   r1   r2   r3   r4   r5   
num_layersFzUnknown implementationweightssoftmaxlossr	   ZdtypeZRecurrentNetwork)Znum_threadsZmax_cuda_streams)+r   ZModelHelpernetZAddExternalInputsZDequeueBlobsimplementationr   
seq_lengthr   r   r   r;   extendr   ZLSTM	input_dim
hidden_dimr6   r7   r   typeZnum_workersparam_init_netZConstantFillZ
cudnn_LSTMZUniformFillZSoftmaxWithLossZFlattenZAddGradientOperatorsZCopyr   r   r   Zzerosr$   r   rnn_executorop
startswithr   Zset_rnn_executor_configZrnn_executor_num_threadsZrnn_executor_max_cuda_streams)argsr   r    Zinput_shaper0   r+   r,   r1   r&   Z
init_blobsr   ir9   r:   outputZlast_hidden_Z
last_stater<   r=   r>   Z	init_blobszrI   r'   r'   r(   create_modelC   s    



rP   c                 C   sB  | j | j }| j| j| jg}t|| j || j| j\}}}tdt	j
| jg| j t	jd t| |||\}}t|j t|j t }|| j }	d}
td t|j j | jrtd t }td|d d d  td	 t }t }td
|	| jD ]t}t| j|	| }|
|7 }
t|j j| t }td||	t	||||  ||  d d  |}qtdt	|d
d  t |  d d | jrdnd | jr6td t }td|d d d  |d |d kr6t d|d |d  t d t | S )Nr+   r?   r   z------ Warming up ------zMemory stats:zGPU memory:	{} MBZ	max_totali   z ------ Starting benchmark ------r	   z'Iter: {} / {}. Entries Per Second: {}k.d   r   z/Done. Total EPS excluding 1st iteration: {}k {}z (with RNN executor) totalz3Max usage differs from current total usage: {} > {}z.This means that costly deallocations occurred.)!	data_sizer$   rB   rD   r)   rE   r   r   r   r   arrayr   rP   r   rG   Z	CreateNetr@   timer   r   ZRunNetr   r*   gpur   ZGetGPUMemoryUsageStatsr   r   Ziters_to_reportminsumrH   warning)rK   r   Zinput_blob_shaper   r    r!   r0   rM   
start_timeZ	num_itersZtotal_itersstatsZ	last_time	iterationZ
iters_onceZnew_timer'   r'   r(   
Caffe2LSTM   sz    





$

r^   c                 C   s   t | S )N)r^   )rK   r'   r'   r(   	Benchmark   s    r_   c                  C   s  t jdd} | jdtddd | jdtdd	d | jd
tddd | jdtddd | jdtddd | jdtddd | jdddd | jdtddd | jdddd | jdddd | jd dd!d | jd"td#d$d | jd%dd&d | jd'td d(d | jd)td d*d | S )+NzLSTM benchmark.)descriptionz--hidden_dimi   zHidden dimension)rF   defaulthelpz--input_dim(   zInput dimensionz--batch_size   zThe batch size.z--seq_length   zMax sequence lengthz--data_sizei@B z!Number of data points to generatez--iters_to_reportz&Number of iteration to report progressz--gpu
store_truezRun all on GPU)actionrb   z--implementationr.   z('cudnn', 'own', 'static' or 'static_dag'z--fixed_shapezLWhether to randomize shape of input batches. Static RNN requires fixed shapez--memory_optimizationz+Whether to use memory optimized LSTM or notz--forward_onlyz Whether to run only forward passz--num_layersr	   zNNumber of LSTM layers. All output dimensions are going to beof hidden_dim sizez--rnn_executorzWhether to use RNN executorz--rnn_executor_num_threadsz*Number of threads used by CPU RNN Executorz--rnn_executor_max_cuda_streamsz:Maximum number of CUDA streams used by RNN executor on GPU)argparseArgumentParseradd_argumentintstr)parserr'   r'   r(   GetArgumentParser   s    rn   __main__r	   Zcaffe2z--caffe2_log_level=0z#--caffe2_print_blob_sizes_at_exit=0z--caffe2_rnn_executor={}z--caffe2_gpu_memory_tracking=1r8   )'Zcaffe2.protor   Zcaffe2.pythonr   r   r   r   r   r   rh   numpyr   rV   loggingbasicConfig	getLoggerr   setLevelDEBUGr)   rP   r^   debugr_   rn   __name__parse_known_argsrK   
extra_argsrH   Zrnn_executor_optZ
GlobalInitr   ZDeviceOptionrW   ZGpuDeviceTypeZCPUZdeviceZDeviceScoper'   r'   r'   r(   <module>   s@   
,`F
]
