a
    j=ict                     @  s  d Z ddlmZ ddlmZ ddlmZmZ ddlZddlm	Z	 ddl
ZddlmZmZmZ ddlmZmZ ddlZdd	lmZmZ dd
lmZ ddlmZ ddlm  m  mZ ddl m!Z! dddddZ"ddddddZ#G dd dZ$G dd dZ%G dd de!ej&Z'dS )a  
Read SAS7BDAT files

Based on code written by Jared Hobbs:
  https://bitbucket.org/jaredhobbs/sas7bdat

See also:
  https://github.com/BioStatMatt/sas7bdat

Partial documentation of the file format:
  https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf

Reference for binary data compression:
  http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
    )annotations)abc)datetime	timedeltaN)cast)CompressionOptionsFilePath
ReadBuffer)EmptyDataErrorOutOfBoundsDatetime)	DataFrameisna)
get_handle)Parser)
ReaderBasefloatstrZsas_datetimeunitc                 C  sV   t | rtjS |dkr,tdddt| d S |dkrJtdddt| d S tdd S )Ns     )secondsd)dayszunit must be 'd' or 's')r   pdZNaTr   r   
ValueErrorr    r   g/home/droni/.local/share/virtualenvs/DPS-5Je3_V2c/lib/python3.9/site-packages/pandas/io/sas/sas7bdat.py_parse_datetime2   s    r   z	pd.Series)sas_datetimesr   returnc                 C  sH   zt j| |ddW S  tyB   | jt|d}tt j|}| Y S 0 dS )a  
    Convert to Timestamp if possible, otherwise to datetime.datetime.
    SAS float64 lacks precision for more than ms resolution so the fit
    to datetime.datetime is ok.

    Parameters
    ----------
    sas_datetimes : {Series, Sequence[float]}
       Dates or datetimes in SAS
    unit : {str}
       "d" if the floats represent dates, "s" for datetimes

    Returns
    -------
    Series
       Series of datetime64 dtype or datetime.datetime.
    z
1960-01-01)r   originr   N)r   to_datetimer   applyr   r   Series)r    r   Zs_seriesr   r   r   _convert_datetimes@   s    r'   c                   @  sD   e Zd ZU ded< ded< ded< ded< dddddddd	Zd
S )_SubheaderPointerintoffsetlengthcompressionptypeNone)r*   r+   r,   r-   r!   c                 C  s   || _ || _|| _|| _d S N)r*   r+   r,   r-   )selfr*   r+   r,   r-   r   r   r   __init__`   s    z_SubheaderPointer.__init__N__name__
__module____qualname____annotations__r1   r   r   r   r   r(   Z   s
   
r(   c                   @  sX   e Zd ZU ded< ded< ded< ded< ded< ded	< ddddddd
dddZdS )_Columnr)   col_idstr | bytesnamelabelformatbytesctyper+   r.   )r8   r:   r;   r<   r>   r+   r!   c                 C  s(   || _ || _|| _|| _|| _|| _d S r/   )r8   r:   r;   r<   r>   r+   )r0   r8   r:   r;   r<   r>   r+   r   r   r   r1   o   s    
z_Column.__init__Nr2   r   r   r   r   r7   g   s   
r7   c                   @  s   e Zd ZU dZded< ded< d`d	d
d
ddd
d
ddd	ddZddddZddddZddddZddddZ	ddddZ
dddd Zddd!d"d#Zdddd$d%d&Zddd'd(d)Zddd*d+d,d-Zddd.d/Zd
dd0d1Zd2d3 Zddd4d5Zd6dd7d8d9Zddd:d;d<d=Zdd6d>d?d@Zdd:ddAdBdCZdddd+dDdEZdddd+dFdGZdddd+dHdIZdddd+dJdKZdddd+dLdMZdddd+dNdOZdddd+dPdQZdddd+dRdSZdadddTdUdVZ dWdX Z!dddYdZZ"d[d\ Z#d6d*d]d^d_Z$dS )bSAS7BDATReadera  
    Read SAS files in SAS7BDAT format.

    Parameters
    ----------
    path_or_buf : path name or buffer
        Name of SAS file or file-like object pointing to SAS file
        contents.
    index : column identifier, defaults to None
        Column to use as index.
    convert_dates : bool, defaults to True
        Attempt to convert dates to Pandas datetime values.  Note that
        some rarely used SAS date formats may be unsupported.
    blank_missing : bool, defaults to True
        Convert empty strings to missing values (SAS uses blanks to
        indicate missing character variables).
    chunksize : int, defaults to None
        Return SAS7BDATReader object for iterations, returns chunks
        with given number of lines.
    encoding : string, defaults to None
        String encoding.
    convert_text : bool, defaults to True
        If False, text variables are left as raw bytes.
    convert_header_text : bool, defaults to True
        If False, header text, including column names, are left as raw
        bytes.
    r)   _int_lengthzbytes | None_cached_pageNTinferzFilePath | ReadBuffer[bytes]boolz
int | Nonez
str | Noner   r.   )	path_or_bufconvert_datesblank_missing	chunksizeencodingconvert_textconvert_header_textr,   r!   c
           
      C  s   || _ || _|| _|| _|| _|| _|| _d| _d| _g | _	g | _
g | _g | _g | _d | _g | _g | _g | _d| _d| _d| _t|dd|	d| _| jj| _z|   |   W n ty   |    Y n0 d S )Nzlatin-1    r   rbF)Zis_textr,   )indexrE   rF   rG   rH   rI   rJ   default_encodingr,   column_names_rawcolumn_namescolumn_formatscolumns%_current_page_data_subheader_pointersrA   _column_data_lengths_column_data_offsets_column_types_current_row_in_file_indexZ_current_row_on_page_indexr   handleshandle_path_or_buf_get_properties_parse_metadata	Exceptionclose)
r0   rD   rM   rE   rF   rG   rH   rI   rJ   r,   r   r   r   r1      s>    
zSAS7BDATReader.__init__z
np.ndarray)r!   c                 C  s   t j| jt jdS )z5Return a numpy int64 array of the column data lengthsdtype)npasarrayrT   int64r0   r   r   r   column_data_lengths   s    z"SAS7BDATReader.column_data_lengthsc                 C  s   t j| jt jdS )z0Return a numpy int64 array of the column offsetsr_   )ra   rb   rU   rc   rd   r   r   r   column_data_offsets   s    z"SAS7BDATReader.column_data_offsetsc                 C  s   t j| jt ddS )zj
        Returns a numpy character array of the column types:
           s (string) or d (double)
        ZS1r_   )ra   rb   rV   r`   rd   r   r   r   column_types   s    zSAS7BDATReader.column_typesc                 C  s   | j   d S r/   )rX   r^   rd   r   r   r   r^      s    zSAS7BDATReader.closec                 C  s  | j d | j d| _| jdttj tjkr<tdd\}}| tj	tj
}|tjkrtj}d| _d| _tj| _tj| _nd| _tj| _tj| _d| _| tjtj}|tjkrtj}|| }| tjtj}|d	krd
| _nd| _| tjtjd }|tjv rtj| | _nd| d| _| tjtj }|dkrJd| _!n|dkr\d| _!nd| _!| "tj#tj$| _%| "tj&tj'| _(t)ddd}| *tj+| tj,}|t-j.|dd | _/| *tj0| tj1}|t-j.|dd | _2| 3tj4| tj5| _6| j | j6d }|  j|7  _t| j| j6kr2td| 3tj7| tj8| _9| 3tj:| tj;| _<| "tj=| tj>| _=| "tj?| tj@| _A| "tjB| tjC| _D| "tjE| tjF| _G| jGs| "tjH| tjI| _Gd S )Nr   i   z'magic number mismatch (not a SAS file?)r   r   T   F      <>zunknown (code=)   1unix   2windowsunknownr   r   r   r#   z*The SAS7BDAT file appears to be truncated.)JrZ   seekreadrA   lenconstmagicr   _read_bytesZalign_1_offsetZalign_1_lengthZu64_byte_checker_valueZalign_2_valueU64r@   Zpage_bit_offset_x64_page_bit_offsetZsubheader_pointer_length_x64_subheader_pointer_lengthZpage_bit_offset_x86Zsubheader_pointer_length_x86Zalign_2_offsetZalign_2_lengthZalign_1_checker_valueZendianness_offsetZendianness_length
byte_orderZencoding_offsetZencoding_lengthZencoding_namesfile_encodingZplatform_offsetZplatform_lengthplatform_read_and_convert_header_textZdataset_offsetZdataset_lengthr:   Zfile_type_offsetZfile_type_length	file_typer   _read_floatZdate_created_offsetZdate_created_lengthr   Zto_timedeltaZdate_createdZdate_modified_offsetZdate_modified_lengthZdate_modified	_read_intZheader_size_offsetZheader_size_lengthheader_lengthZpage_size_offsetZpage_size_length_page_lengthZpage_count_offsetZpage_count_lengthZ_page_countZsas_release_offsetZsas_release_lengthZsas_server_type_offsetZsas_server_type_lengthZserver_typeZos_version_number_offsetZos_version_number_length
os_versionZos_name_offsetZos_name_lengthos_nameZos_maker_offsetZos_maker_length)r0   Zalign1Zalign2bufZtotal_alignepochxr   r   r   r[      s    




zSAS7BDATReader._get_propertiesr   c                 C  s(   | j | jpdd}|jr$|   t|S )Nr   )nrows)ru   rG   emptyr^   StopIteration)r0   dar   r   r   __next__S  s
    zSAS7BDATReader.__next__)r*   widthc                 C  sJ   |dvr|    td| ||}|dkr0dnd}t| j| |d S )N)rj   ri   zinvalid float widthrj   fr   r   r^   r   ry   structunpackr}   )r0   r*   r   r   fdr   r   r   r   [  s    zSAS7BDATReader._read_float)r*   r   r!   c                 C  sP   |dvr|    td| ||}ddddd| }t| j| |d }|S )N)r      rj   ri   zinvalid int widthbhlqr   r   )r0   r*   r   r   itZivr   r   r   r   d  s    zSAS7BDATReader._read_int)r*   r+   c                 C  s   | j d u rX| j| | j|}t||k rT|   d|dd|dd}t||S || t| j krz|   td| j |||  S d S )NzUnable to read r   z bytes from file position .zThe cached page is too small.)rA   rZ   rt   ru   rv   r^   r   )r0   r*   r+   r   msgr   r   r   ry   m  s    
zSAS7BDATReader._read_bytesr9   )r*   r+   r!   c                 C  s   |  | ||dS )N     )_convert_header_textry   rstripr0   r*   r+   r   r   r   r   |  s    z,SAS7BDATReader._read_and_convert_header_textc                 C  sN   d}|sJ| j | j| _t| jdkr(qJt| j| jkr@td|  }qd S )NFr   z2Failed to read a meta data page from the SAS file.)rZ   ru   r   rA   rv   r   _process_page_meta)r0   doner   r   r   r\     s    zSAS7BDATReader._parse_metadatac                 C  sZ   |    tjtjtjg }| j|v r,|   | jtjk}| jtjk}t|pV|pV| j	g kS r/   )
_read_page_headerrw   page_meta_typesZpage_amd_typepage_mix_type_current_page_type_process_page_metadatapage_data_typerC   rS   )r0   ptZis_data_pageZis_mix_pager   r   r   r     s    
z!SAS7BDATReader._process_page_metac                 C  s^   | j }tj| }| |tjtj@ | _tj| }| |tj| _	tj
| }| |tj| _d S r/   )r{   rw   Zpage_type_offsetr   Zpage_type_lengthZpage_type_mask2r   Zblock_count_offsetZblock_count_lengthZ_current_page_block_countZsubheader_count_offsetZsubheader_count_length_current_page_subheaders_count)r0   
bit_offsetZtxr   r   r   r     s    


z SAS7BDATReader._read_page_headerc                 C  sp   | j }t| jD ]Z}| tj| |}|jdkr2q|jtjkr@q| 	|j
}| ||j|j}| || qd S )Nr   )r{   ranger   _process_subheader_pointersrw   Zsubheader_pointers_offsetr+   r,   Ztruncated_subheader_id_read_subheader_signaturer*   _get_subheader_indexr-   _process_subheader)r0   r   ipointersubheader_signaturesubheader_indexr   r   r   r     s    


z%SAS7BDATReader._process_page_metadatar=   )	signaturer!   c                 C  s`   t j|}|d u r\|t jkp$|dk}|t jk}| jdkrL|rL|rLt jj}n|   t	d|S )Nr   rK   zUnknown subheader signature)
rw   Zsubheader_signature_to_indexgetZcompressed_subheader_idZcompressed_subheader_typer,   SASIndexdata_subheader_indexr^   r   )r0   r   r,   r-   rM   f1f2r   r   r   r     s    

z#SAS7BDATReader._get_subheader_indexr(   )r*   subheader_pointer_indexr!   c           
      C  st   | j }|||  }| || j}|| j7 }| || j}|| j7 }| |d}|d7 }| |d}t||||}	|	S )Nr   )r|   r   r@   r(   )
r0   r*   r   Zsubheader_pointer_lengthZtotal_offsetZsubheader_offsetZsubheader_lengthZsubheader_compressionZsubheader_typer   r   r   r   r     s    

z*SAS7BDATReader._process_subheader_pointers)r*   r!   c                 C  s   |  || j}|S r/   )ry   r@   )r0   r*   r   r   r   r   r     s    z(SAS7BDATReader._read_subheader_signature)r   r   r!   c                 C  s   |j }|j}|tjjkr | j}n|tjjkr4| j}n|tjjkrH| j	}n|tjj
kr\| j}nt|tjjkrp| j}n`|tjjkr| j}nL|tjjkr| j}n8|tjjkr| j}n$|tjjkr| j| d S td||| d S )Nzunknown subheader index)r*   r+   rw   r   Zrow_size_index_process_rowsize_subheaderZcolumn_size_index_process_columnsize_subheaderZcolumn_text_index_process_columntext_subheaderZcolumn_name_index_process_columnname_subheaderZcolumn_attributes_index#_process_columnattributes_subheaderZformat_and_label_index_process_format_subheaderZcolumn_list_index_process_columnlist_subheaderZsubheader_counts_index_process_subheader_countsr   rS   appendr   )r0   r   r   r*   r+   	processorr   r   r   r     s.    z!SAS7BDATReader._process_subheaderc                 C  s   | j }|}|}| jr&|d7 }|d7 }n|d7 }|d7 }| |tj|  || _| |tj|  || _| |tj|  || _	| |tj
|  || _tj| }| || || _| |d| _| |d| _d S )Ni  i  ib  iz  r   )r@   rz   r   rw   Zrow_length_offset_multiplierZ
row_lengthZrow_count_offset_multiplier	row_countZcol_count_p1_multipliercol_count_p1Zcol_count_p2_multipliercol_count_p2Z'row_count_on_mix_page_offset_multiplierZ_mix_page_row_count_lcs_lcp)r0   r*   r+   int_lenZ
lcs_offsetZ
lcp_offsetZmxr   r   r   r     s0    

z)SAS7BDATReader._process_rowsize_subheaderc                 C  sT   | j }||7 }| ||| _| j| j | jkrPtd| j d| j d| j d d S )Nz Warning: column count mismatch (z + z != z)
)r@   r   column_countr   r   print)r0   r*   r+   r   r   r   r   r     s    
z,SAS7BDATReader._process_columnsize_subheaderc                 C  s   d S r/   r   r   r   r   r   r   %  s    z(SAS7BDATReader._process_subheader_countsc           	      C  s  || j 7 }| |tj}| ||}|d| d}| j| t| jdkrd}tj	D ]}||v r\|}q\|| _
|| j 8 }|d }| jr|d7 }| || j}|d}|dkrd| _|d }| jr|d7 }| || j}|d| j | _n|tjkr4|d	 }| jr|d7 }| || j}|d| j | _nH| jdkr|d| _|d }| jr^|d7 }| || j}|d| j | _t| d
r| | j| _d S )Nr   r   r   rK      rj           (   creator_proc)r@   r   rw   Ztext_block_size_lengthry   r   rO   r   rv   Zcompression_literalsr,   rz   r   r   r   Zrle_compressionhasattrr   )	r0   r*   r+   Ztext_block_sizer   Z	cname_rawZcompression_literalclZoffset1r   r   r   r   (  sN    



z,SAS7BDATReader._process_columntext_subheaderc                 C  s   | j }||7 }|d|  d d }t|D ]}|tj|d   tj }|tj|d   tj }|tj|d   tj }| |tj}	| |tj	}
| |tj
}| j|	 }||
|
|  }| j| | q*d S )Nr      ri   r   )r@   r   rw   Zcolumn_name_pointer_lengthZ!column_name_text_subheader_offsetZcolumn_name_offset_offsetZcolumn_name_length_offsetr   Z!column_name_text_subheader_lengthZcolumn_name_offset_lengthZcolumn_name_length_lengthrO   rP   r   r   )r0   r*   r+   r   Zcolumn_name_pointers_countr   Ztext_subheaderZcol_name_offsetZcol_name_lengthidx
col_offsetZcol_lenZname_rawcnamer   r   r   r   V  s@    
z,SAS7BDATReader._process_columnname_subheaderc           
      C  s   | j }|d|  d |d  }t|D ]}|| tj ||d   }|d|  tj ||d   }|d|  tj ||d   }| ||}	| j|	 | |tj	}	| j
|	 | |tj}	| j|	dkrdnd q&d S )Nr   r   ri   r      d   s)r@   r   rw   Zcolumn_data_offset_offsetZcolumn_data_length_offsetZcolumn_type_offsetr   rU   r   Zcolumn_data_length_lengthrT   Zcolumn_type_lengthrV   )
r0   r*   r+   r   Zcolumn_attributes_vectors_countr   Zcol_data_offsetZcol_data_lenZ	col_typesr   r   r   r   r   w  s*    
z2SAS7BDATReader._process_columnattributes_subheaderc                 C  s   d S r/   r   r   r   r   r   r     s    z,SAS7BDATReader._process_columnlist_subheaderc                 C  sx  | j }|tj d|  }|tj d|  }|tj d|  }|tj d|  }|tj d|  }|tj d|  }	| |tj	}
t
|
t| jd }| |tj}| |tj}| |tj}t
|t| jd }| |tj}| |	tj}| j| }| ||||  }| j| }| ||||  }t| j}t|| j| ||| j| | j| }| j| | j| d S )N   r   )r@   rw   Z)column_format_text_subheader_index_offsetZcolumn_format_offset_offsetZcolumn_format_length_offsetZ(column_label_text_subheader_index_offsetZcolumn_label_offset_offsetZcolumn_label_length_offsetr   Z)column_format_text_subheader_index_lengthminrv   rO   Zcolumn_format_offset_lengthZcolumn_format_length_lengthZ(column_label_text_subheader_index_lengthZcolumn_label_offset_lengthZcolumn_label_length_lengthr   rR   r7   rP   rV   rT   rQ   r   )r0   r*   r+   r   Ztext_subheader_formatZcol_format_offsetZcol_format_lenZtext_subheader_labelZcol_label_offsetZcol_label_lenr   Z
format_idxZformat_startZ
format_lenZ	label_idxZlabel_startZ	label_lenZlabel_namesZcolumn_labelZformat_namesZcolumn_formatZcurrent_column_numbercolr   r   r   r     sT    


	z(SAS7BDATReader._process_format_subheader)r   r!   c                 C  s   |d u r| j d ur| j }n|d u r(| j}t| jdkrF|   td|dkr`| j| jkr`t S | j| j }||krx|}| jd}| jd}t	j
||ftd| _t	j|d| ft	jd| _d| _t| }|| |  }| jd ur|| j}|S )Nr   zNo columns to parse from filer   r   r_   ri   )rG   r   rv   rV   r^   r
   rW   r   countra   r   object_string_chunkZzerosZuint8_byte_chunk_current_row_in_chunk_indexr   ru   _chunk_to_dataframerM   Z	set_index)r0   r   mndnsprsltr   r   r   ru     s.    

zSAS7BDATReader.readc                 C  s   g | _ | j| j| _t| jdkr(dS t| j| jkrf|   dt| jdd| jdd}t||   | j	t
jv r|   | j	t
jt
jt
jg vr|  S dS )Nr   Tz-failed to read complete page from file (read r   z of z bytes)F)rS   rZ   ru   r   rA   rv   r^   r   r   r   rw   r   r   r   r   _read_next_page)r0   r   r   r   r   r     s,    
zSAS7BDATReader._read_next_pagec                 C  s  | j }| j}t|| |}i }d\}}t| jD ]R}| j| }| j| dkr| j|d d f j| jd d}	t	j
|	tj|d||< | jr| j| tjv rt|| d||< n"| j| tjv rt|| d||< |d7 }q0| j| dkrdt	j
| j|d d f |d	||< | jr2| jd ur2| || j||< | jrZ|| j d
k}
tj|| |
< |d7 }q0|   tdt| j|  q0t|| j|dd}|S )Nrh   r   r   r_   )r`   rM   r   r   r   )rM   r   zunknown column type F)rR   rM   copy)r   rW   r   r   rP   rV   r   viewr}   r   r&   ra   float64rE   rQ   rw   Zsas_date_formatsr'   Zsas_datetime_formatsr   rI   rH   _decode_stringr   rF   rv   nanr^   r   reprr   )r0   nr   Zixr   jsZjbjr:   Zcol_arriidfr   r   r   r     s8    
 
 
z"SAS7BDATReader._chunk_to_dataframec                 C  s   | | jp| jS r/   )decoderH   rN   r0   r   r   r   r   r   )  s    zSAS7BDATReader._decode_string)r   r!   c                 C  s   | j r| |S |S d S r/   )rJ   r   r   r   r   r   r   ,  s    
z#SAS7BDATReader._convert_header_text)NTTNNTTrB   )N)%r3   r4   r5   __doc__r6   r1   re   rf   rg   r^   r[   r   r   r   ry   r   r\   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   ru   r   r   r   r   r   r   r   r   r?      sV   
         3l		
.!5"$r?   )(r   
__future__r   collectionsr   r   r   r   typingr   numpyra   Zpandas._typingr   r   r	   Zpandas.errorsr
   r   Zpandasr   r   r   Zpandas.io.commonr   Zpandas.io.sas._sasr   Zpandas.io.sas.sas_constantsiosasZsas_constantsrw   Zpandas.io.sas.sasreaderr   r   r'   r(   r7   Iteratorr?   r   r   r   r   <module>   s&   