up follow livre
This commit is contained in:
		
							parent
							
								
									b4b4398bb0
								
							
						
					
					
						commit
						3a7a3849ae
					
				
					 12242 changed files with 2564461 additions and 6914 deletions
				
			
		|  | @ -0,0 +1,3 @@ | |||
| from pandas.io.sas.sasreader import read_sas | ||||
| 
 | ||||
| __all__ = ["read_sas"] | ||||
										
											Binary file not shown.
										
									
								
							
										
											Binary file not shown.
										
									
								
							
										
											Binary file not shown.
										
									
								
							
										
											Binary file not shown.
										
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										762
									
								
								venv/lib/python3.13/site-packages/pandas/io/sas/sas7bdat.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										762
									
								
								venv/lib/python3.13/site-packages/pandas/io/sas/sas7bdat.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,762 @@ | |||
| """ | ||||
| Read SAS7BDAT files | ||||
| 
 | ||||
| Based on code written by Jared Hobbs: | ||||
|   https://bitbucket.org/jaredhobbs/sas7bdat | ||||
| 
 | ||||
| See also: | ||||
|   https://github.com/BioStatMatt/sas7bdat | ||||
| 
 | ||||
| Partial documentation of the file format: | ||||
|   https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf | ||||
| 
 | ||||
| Reference for binary data compression: | ||||
|   http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm | ||||
| """ | ||||
| from __future__ import annotations | ||||
| 
 | ||||
| from collections import abc | ||||
| from datetime import ( | ||||
|     datetime, | ||||
|     timedelta, | ||||
| ) | ||||
| import sys | ||||
| from typing import TYPE_CHECKING | ||||
| 
 | ||||
| import numpy as np | ||||
| 
 | ||||
| from pandas._config import get_option | ||||
| 
 | ||||
| from pandas._libs.byteswap import ( | ||||
|     read_double_with_byteswap, | ||||
|     read_float_with_byteswap, | ||||
|     read_uint16_with_byteswap, | ||||
|     read_uint32_with_byteswap, | ||||
|     read_uint64_with_byteswap, | ||||
| ) | ||||
| from pandas._libs.sas import ( | ||||
|     Parser, | ||||
|     get_subheader_index, | ||||
| ) | ||||
| from pandas._libs.tslibs.conversion import cast_from_unit_vectorized | ||||
| from pandas.errors import EmptyDataError | ||||
| 
 | ||||
| import pandas as pd | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Timestamp, | ||||
|     isna, | ||||
| ) | ||||
| 
 | ||||
| from pandas.io.common import get_handle | ||||
| import pandas.io.sas.sas_constants as const | ||||
| from pandas.io.sas.sasreader import ReaderBase | ||||
| 
 | ||||
| if TYPE_CHECKING: | ||||
|     from pandas._typing import ( | ||||
|         CompressionOptions, | ||||
|         FilePath, | ||||
|         ReadBuffer, | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| _unix_origin = Timestamp("1970-01-01") | ||||
| _sas_origin = Timestamp("1960-01-01") | ||||
| 
 | ||||
| 
 | ||||
| def _parse_datetime(sas_datetime: float, unit: str): | ||||
|     if isna(sas_datetime): | ||||
|         return pd.NaT | ||||
| 
 | ||||
|     if unit == "s": | ||||
|         return datetime(1960, 1, 1) + timedelta(seconds=sas_datetime) | ||||
| 
 | ||||
|     elif unit == "d": | ||||
|         return datetime(1960, 1, 1) + timedelta(days=sas_datetime) | ||||
| 
 | ||||
|     else: | ||||
|         raise ValueError("unit must be 'd' or 's'") | ||||
| 
 | ||||
| 
 | ||||
def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
    """
    Convert a Series of SAS dates or datetimes to a datetime64 Series.

    The values are re-based from the SAS origin (1960-01-01) to the Unix
    origin by adding the difference between the two origins.

    Parameters
    ----------
    sas_datetimes : Series
       Dates or datetimes in SAS representation; its index is reused for
       the result.
    unit : {'d', 's'}
       "s" for datetimes counted in seconds; any other value is handled
       as dates counted in days.

    Returns
    -------
    Series
       Millisecond-resolution datetime64 values for ``unit == "s"``,
       otherwise a Series built with dtype "M8[s]".
    """
    origin_shift = (_sas_origin - _unix_origin).as_unit("s")

    if unit != "s":
        # Day counts: cast to day resolution first, then shift the origin.
        shifted_days = np.array(sas_datetimes, dtype="M8[D]") + origin_shift
        return pd.Series(
            shifted_days, dtype="M8[s]", index=sas_datetimes.index, copy=False
        )

    # Second counts: go through milliseconds before viewing as datetime64.
    as_millis = cast_from_unit_vectorized(
        sas_datetimes._values, unit="s", out_unit="ms"
    )
    shifted_millis = as_millis.view("M8[ms]") + origin_shift
    return pd.Series(shifted_millis, index=sas_datetimes.index, copy=False)
| 
 | ||||
| 
 | ||||
| class _Column: | ||||
|     col_id: int | ||||
|     name: str | bytes | ||||
|     label: str | bytes | ||||
|     format: str | bytes | ||||
|     ctype: bytes | ||||
|     length: int | ||||
| 
 | ||||
|     def __init__( | ||||
|         self, | ||||
|         col_id: int, | ||||
|         # These can be bytes when convert_header_text is False | ||||
|         name: str | bytes, | ||||
|         label: str | bytes, | ||||
|         format: str | bytes, | ||||
|         ctype: bytes, | ||||
|         length: int, | ||||
|     ) -> None: | ||||
|         self.col_id = col_id | ||||
|         self.name = name | ||||
|         self.label = label | ||||
|         self.format = format | ||||
|         self.ctype = ctype | ||||
|         self.length = length | ||||
| 
 | ||||
| 
 | ||||
| # SAS7BDAT represents a SAS data file in SAS7BDAT format. | ||||
| class SAS7BDATReader(ReaderBase, abc.Iterator): | ||||
|     """ | ||||
|     Read SAS files in SAS7BDAT format. | ||||
| 
 | ||||
|     Parameters | ||||
|     ---------- | ||||
|     path_or_buf : path name or buffer | ||||
|         Name of SAS file or file-like object pointing to SAS file | ||||
|         contents. | ||||
|     index : column identifier, defaults to None | ||||
|         Column to use as index. | ||||
|     convert_dates : bool, defaults to True | ||||
|         Attempt to convert dates to Pandas datetime values.  Note that | ||||
|         some rarely used SAS date formats may be unsupported. | ||||
|     blank_missing : bool, defaults to True | ||||
|         Convert empty strings to missing values (SAS uses blanks to | ||||
|         indicate missing character variables). | ||||
|     chunksize : int, defaults to None | ||||
|         Return SAS7BDATReader object for iterations, returns chunks | ||||
|         with given number of lines. | ||||
|     encoding : str, 'infer', defaults to None | ||||
|         String encoding acc. to Python standard encodings, | ||||
|         encoding='infer' tries to detect the encoding from the file header, | ||||
|         encoding=None will leave the data in binary format. | ||||
|     convert_text : bool, defaults to True | ||||
|         If False, text variables are left as raw bytes. | ||||
|     convert_header_text : bool, defaults to True | ||||
|         If False, header text, including column names, are left as raw | ||||
|         bytes. | ||||
|     """ | ||||
| 
 | ||||
|     _int_length: int | ||||
|     _cached_page: bytes | None | ||||
| 
 | ||||
|     def __init__( | ||||
|         self, | ||||
|         path_or_buf: FilePath | ReadBuffer[bytes], | ||||
|         index=None, | ||||
|         convert_dates: bool = True, | ||||
|         blank_missing: bool = True, | ||||
|         chunksize: int | None = None, | ||||
|         encoding: str | None = None, | ||||
|         convert_text: bool = True, | ||||
|         convert_header_text: bool = True, | ||||
|         compression: CompressionOptions = "infer", | ||||
|     ) -> None: | ||||
|         self.index = index | ||||
|         self.convert_dates = convert_dates | ||||
|         self.blank_missing = blank_missing | ||||
|         self.chunksize = chunksize | ||||
|         self.encoding = encoding | ||||
|         self.convert_text = convert_text | ||||
|         self.convert_header_text = convert_header_text | ||||
| 
 | ||||
|         self.default_encoding = "latin-1" | ||||
|         self.compression = b"" | ||||
|         self.column_names_raw: list[bytes] = [] | ||||
|         self.column_names: list[str | bytes] = [] | ||||
|         self.column_formats: list[str | bytes] = [] | ||||
|         self.columns: list[_Column] = [] | ||||
| 
 | ||||
|         self._current_page_data_subheader_pointers: list[tuple[int, int]] = [] | ||||
|         self._cached_page = None | ||||
|         self._column_data_lengths: list[int] = [] | ||||
|         self._column_data_offsets: list[int] = [] | ||||
|         self._column_types: list[bytes] = [] | ||||
| 
 | ||||
|         self._current_row_in_file_index = 0 | ||||
|         self._current_row_on_page_index = 0 | ||||
|         self._current_row_in_file_index = 0 | ||||
| 
 | ||||
|         self.handles = get_handle( | ||||
|             path_or_buf, "rb", is_text=False, compression=compression | ||||
|         ) | ||||
| 
 | ||||
|         self._path_or_buf = self.handles.handle | ||||
| 
 | ||||
|         # Same order as const.SASIndex | ||||
|         self._subheader_processors = [ | ||||
|             self._process_rowsize_subheader, | ||||
|             self._process_columnsize_subheader, | ||||
|             self._process_subheader_counts, | ||||
|             self._process_columntext_subheader, | ||||
|             self._process_columnname_subheader, | ||||
|             self._process_columnattributes_subheader, | ||||
|             self._process_format_subheader, | ||||
|             self._process_columnlist_subheader, | ||||
|             None,  # Data | ||||
|         ] | ||||
| 
 | ||||
|         try: | ||||
|             self._get_properties() | ||||
|             self._parse_metadata() | ||||
|         except Exception: | ||||
|             self.close() | ||||
|             raise | ||||
| 
 | ||||
|     def column_data_lengths(self) -> np.ndarray: | ||||
|         """Return a numpy int64 array of the column data lengths""" | ||||
|         return np.asarray(self._column_data_lengths, dtype=np.int64) | ||||
| 
 | ||||
|     def column_data_offsets(self) -> np.ndarray: | ||||
|         """Return a numpy int64 array of the column offsets""" | ||||
|         return np.asarray(self._column_data_offsets, dtype=np.int64) | ||||
| 
 | ||||
|     def column_types(self) -> np.ndarray: | ||||
|         """ | ||||
|         Returns a numpy character array of the column types: | ||||
|            s (string) or d (double) | ||||
|         """ | ||||
|         return np.asarray(self._column_types, dtype=np.dtype("S1")) | ||||
| 
 | ||||
|     def close(self) -> None: | ||||
|         self.handles.close() | ||||
| 
 | ||||
|     def _get_properties(self) -> None: | ||||
|         # Check magic number | ||||
|         self._path_or_buf.seek(0) | ||||
|         self._cached_page = self._path_or_buf.read(288) | ||||
|         if self._cached_page[0 : len(const.magic)] != const.magic: | ||||
|             raise ValueError("magic number mismatch (not a SAS file?)") | ||||
| 
 | ||||
|         # Get alignment information | ||||
|         buf = self._read_bytes(const.align_1_offset, const.align_1_length) | ||||
|         if buf == const.u64_byte_checker_value: | ||||
|             self.U64 = True | ||||
|             self._int_length = 8 | ||||
|             self._page_bit_offset = const.page_bit_offset_x64 | ||||
|             self._subheader_pointer_length = const.subheader_pointer_length_x64 | ||||
|         else: | ||||
|             self.U64 = False | ||||
|             self._page_bit_offset = const.page_bit_offset_x86 | ||||
|             self._subheader_pointer_length = const.subheader_pointer_length_x86 | ||||
|             self._int_length = 4 | ||||
|         buf = self._read_bytes(const.align_2_offset, const.align_2_length) | ||||
|         if buf == const.align_1_checker_value: | ||||
|             align1 = const.align_2_value | ||||
|         else: | ||||
|             align1 = 0 | ||||
| 
 | ||||
|         # Get endianness information | ||||
|         buf = self._read_bytes(const.endianness_offset, const.endianness_length) | ||||
|         if buf == b"\x01": | ||||
|             self.byte_order = "<" | ||||
|             self.need_byteswap = sys.byteorder == "big" | ||||
|         else: | ||||
|             self.byte_order = ">" | ||||
|             self.need_byteswap = sys.byteorder == "little" | ||||
| 
 | ||||
|         # Get encoding information | ||||
|         buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0] | ||||
|         if buf in const.encoding_names: | ||||
|             self.inferred_encoding = const.encoding_names[buf] | ||||
|             if self.encoding == "infer": | ||||
|                 self.encoding = self.inferred_encoding | ||||
|         else: | ||||
|             self.inferred_encoding = f"unknown (code={buf})" | ||||
| 
 | ||||
|         # Timestamp is epoch 01/01/1960 | ||||
|         epoch = datetime(1960, 1, 1) | ||||
|         x = self._read_float( | ||||
|             const.date_created_offset + align1, const.date_created_length | ||||
|         ) | ||||
|         self.date_created = epoch + pd.to_timedelta(x, unit="s") | ||||
|         x = self._read_float( | ||||
|             const.date_modified_offset + align1, const.date_modified_length | ||||
|         ) | ||||
|         self.date_modified = epoch + pd.to_timedelta(x, unit="s") | ||||
| 
 | ||||
|         self.header_length = self._read_uint( | ||||
|             const.header_size_offset + align1, const.header_size_length | ||||
|         ) | ||||
| 
 | ||||
|         # Read the rest of the header into cached_page. | ||||
|         buf = self._path_or_buf.read(self.header_length - 288) | ||||
|         self._cached_page += buf | ||||
|         # error: Argument 1 to "len" has incompatible type "Optional[bytes]"; | ||||
|         #  expected "Sized" | ||||
|         if len(self._cached_page) != self.header_length:  # type: ignore[arg-type] | ||||
|             raise ValueError("The SAS7BDAT file appears to be truncated.") | ||||
| 
 | ||||
|         self._page_length = self._read_uint( | ||||
|             const.page_size_offset + align1, const.page_size_length | ||||
|         ) | ||||
| 
 | ||||
|     def __next__(self) -> DataFrame: | ||||
|         da = self.read(nrows=self.chunksize or 1) | ||||
|         if da.empty: | ||||
|             self.close() | ||||
|             raise StopIteration | ||||
|         return da | ||||
| 
 | ||||
|     # Read a single float of the given width (4 or 8). | ||||
|     def _read_float(self, offset: int, width: int): | ||||
|         assert self._cached_page is not None | ||||
|         if width == 4: | ||||
|             return read_float_with_byteswap( | ||||
|                 self._cached_page, offset, self.need_byteswap | ||||
|             ) | ||||
|         elif width == 8: | ||||
|             return read_double_with_byteswap( | ||||
|                 self._cached_page, offset, self.need_byteswap | ||||
|             ) | ||||
|         else: | ||||
|             self.close() | ||||
|             raise ValueError("invalid float width") | ||||
| 
 | ||||
|     # Read a single unsigned integer of the given width (1, 2, 4 or 8). | ||||
|     def _read_uint(self, offset: int, width: int) -> int: | ||||
|         assert self._cached_page is not None | ||||
|         if width == 1: | ||||
|             return self._read_bytes(offset, 1)[0] | ||||
|         elif width == 2: | ||||
|             return read_uint16_with_byteswap( | ||||
|                 self._cached_page, offset, self.need_byteswap | ||||
|             ) | ||||
|         elif width == 4: | ||||
|             return read_uint32_with_byteswap( | ||||
|                 self._cached_page, offset, self.need_byteswap | ||||
|             ) | ||||
|         elif width == 8: | ||||
|             return read_uint64_with_byteswap( | ||||
|                 self._cached_page, offset, self.need_byteswap | ||||
|             ) | ||||
|         else: | ||||
|             self.close() | ||||
|             raise ValueError("invalid int width") | ||||
| 
 | ||||
|     def _read_bytes(self, offset: int, length: int): | ||||
|         assert self._cached_page is not None | ||||
|         if offset + length > len(self._cached_page): | ||||
|             self.close() | ||||
|             raise ValueError("The cached page is too small.") | ||||
|         return self._cached_page[offset : offset + length] | ||||
| 
 | ||||
|     def _read_and_convert_header_text(self, offset: int, length: int) -> str | bytes: | ||||
|         return self._convert_header_text( | ||||
|             self._read_bytes(offset, length).rstrip(b"\x00 ") | ||||
|         ) | ||||
| 
 | ||||
|     def _parse_metadata(self) -> None: | ||||
|         done = False | ||||
|         while not done: | ||||
|             self._cached_page = self._path_or_buf.read(self._page_length) | ||||
|             if len(self._cached_page) <= 0: | ||||
|                 break | ||||
|             if len(self._cached_page) != self._page_length: | ||||
|                 raise ValueError("Failed to read a meta data page from the SAS file.") | ||||
|             done = self._process_page_meta() | ||||
| 
 | ||||
|     def _process_page_meta(self) -> bool: | ||||
|         self._read_page_header() | ||||
|         pt = const.page_meta_types + [const.page_amd_type, const.page_mix_type] | ||||
|         if self._current_page_type in pt: | ||||
|             self._process_page_metadata() | ||||
|         is_data_page = self._current_page_type == const.page_data_type | ||||
|         is_mix_page = self._current_page_type == const.page_mix_type | ||||
|         return bool( | ||||
|             is_data_page | ||||
|             or is_mix_page | ||||
|             or self._current_page_data_subheader_pointers != [] | ||||
|         ) | ||||
| 
 | ||||
|     def _read_page_header(self) -> None: | ||||
|         bit_offset = self._page_bit_offset | ||||
|         tx = const.page_type_offset + bit_offset | ||||
|         self._current_page_type = ( | ||||
|             self._read_uint(tx, const.page_type_length) & const.page_type_mask2 | ||||
|         ) | ||||
|         tx = const.block_count_offset + bit_offset | ||||
|         self._current_page_block_count = self._read_uint(tx, const.block_count_length) | ||||
|         tx = const.subheader_count_offset + bit_offset | ||||
|         self._current_page_subheaders_count = self._read_uint( | ||||
|             tx, const.subheader_count_length | ||||
|         ) | ||||
| 
 | ||||
|     def _process_page_metadata(self) -> None: | ||||
|         bit_offset = self._page_bit_offset | ||||
| 
 | ||||
|         for i in range(self._current_page_subheaders_count): | ||||
|             offset = const.subheader_pointers_offset + bit_offset | ||||
|             total_offset = offset + self._subheader_pointer_length * i | ||||
| 
 | ||||
|             subheader_offset = self._read_uint(total_offset, self._int_length) | ||||
|             total_offset += self._int_length | ||||
| 
 | ||||
|             subheader_length = self._read_uint(total_offset, self._int_length) | ||||
|             total_offset += self._int_length | ||||
| 
 | ||||
|             subheader_compression = self._read_uint(total_offset, 1) | ||||
|             total_offset += 1 | ||||
| 
 | ||||
|             subheader_type = self._read_uint(total_offset, 1) | ||||
| 
 | ||||
|             if ( | ||||
|                 subheader_length == 0 | ||||
|                 or subheader_compression == const.truncated_subheader_id | ||||
|             ): | ||||
|                 continue | ||||
| 
 | ||||
|             subheader_signature = self._read_bytes(subheader_offset, self._int_length) | ||||
|             subheader_index = get_subheader_index(subheader_signature) | ||||
|             subheader_processor = self._subheader_processors[subheader_index] | ||||
| 
 | ||||
|             if subheader_processor is None: | ||||
|                 f1 = subheader_compression in (const.compressed_subheader_id, 0) | ||||
|                 f2 = subheader_type == const.compressed_subheader_type | ||||
|                 if self.compression and f1 and f2: | ||||
|                     self._current_page_data_subheader_pointers.append( | ||||
|                         (subheader_offset, subheader_length) | ||||
|                     ) | ||||
|                 else: | ||||
|                     self.close() | ||||
|                     raise ValueError( | ||||
|                         f"Unknown subheader signature {subheader_signature}" | ||||
|                     ) | ||||
|             else: | ||||
|                 subheader_processor(subheader_offset, subheader_length) | ||||
| 
 | ||||
|     def _process_rowsize_subheader(self, offset: int, length: int) -> None: | ||||
|         int_len = self._int_length | ||||
|         lcs_offset = offset | ||||
|         lcp_offset = offset | ||||
|         if self.U64: | ||||
|             lcs_offset += 682 | ||||
|             lcp_offset += 706 | ||||
|         else: | ||||
|             lcs_offset += 354 | ||||
|             lcp_offset += 378 | ||||
| 
 | ||||
|         self.row_length = self._read_uint( | ||||
|             offset + const.row_length_offset_multiplier * int_len, | ||||
|             int_len, | ||||
|         ) | ||||
|         self.row_count = self._read_uint( | ||||
|             offset + const.row_count_offset_multiplier * int_len, | ||||
|             int_len, | ||||
|         ) | ||||
|         self.col_count_p1 = self._read_uint( | ||||
|             offset + const.col_count_p1_multiplier * int_len, int_len | ||||
|         ) | ||||
|         self.col_count_p2 = self._read_uint( | ||||
|             offset + const.col_count_p2_multiplier * int_len, int_len | ||||
|         ) | ||||
|         mx = const.row_count_on_mix_page_offset_multiplier * int_len | ||||
|         self._mix_page_row_count = self._read_uint(offset + mx, int_len) | ||||
|         self._lcs = self._read_uint(lcs_offset, 2) | ||||
|         self._lcp = self._read_uint(lcp_offset, 2) | ||||
| 
 | ||||
|     def _process_columnsize_subheader(self, offset: int, length: int) -> None: | ||||
|         int_len = self._int_length | ||||
|         offset += int_len | ||||
|         self.column_count = self._read_uint(offset, int_len) | ||||
|         if self.col_count_p1 + self.col_count_p2 != self.column_count: | ||||
|             print( | ||||
|                 f"Warning: column count mismatch ({self.col_count_p1} + " | ||||
|                 f"{self.col_count_p2} != {self.column_count})\n" | ||||
|             ) | ||||
| 
 | ||||
|     # Unknown purpose | ||||
|     def _process_subheader_counts(self, offset: int, length: int) -> None: | ||||
|         pass | ||||
| 
 | ||||
|     def _process_columntext_subheader(self, offset: int, length: int) -> None: | ||||
|         offset += self._int_length | ||||
|         text_block_size = self._read_uint(offset, const.text_block_size_length) | ||||
| 
 | ||||
|         buf = self._read_bytes(offset, text_block_size) | ||||
|         cname_raw = buf[0:text_block_size].rstrip(b"\x00 ") | ||||
|         self.column_names_raw.append(cname_raw) | ||||
| 
 | ||||
|         if len(self.column_names_raw) == 1: | ||||
|             compression_literal = b"" | ||||
|             for cl in const.compression_literals: | ||||
|                 if cl in cname_raw: | ||||
|                     compression_literal = cl | ||||
|             self.compression = compression_literal | ||||
|             offset -= self._int_length | ||||
| 
 | ||||
|             offset1 = offset + 16 | ||||
|             if self.U64: | ||||
|                 offset1 += 4 | ||||
| 
 | ||||
|             buf = self._read_bytes(offset1, self._lcp) | ||||
|             compression_literal = buf.rstrip(b"\x00") | ||||
|             if compression_literal == b"": | ||||
|                 self._lcs = 0 | ||||
|                 offset1 = offset + 32 | ||||
|                 if self.U64: | ||||
|                     offset1 += 4 | ||||
|                 buf = self._read_bytes(offset1, self._lcp) | ||||
|                 self.creator_proc = buf[0 : self._lcp] | ||||
|             elif compression_literal == const.rle_compression: | ||||
|                 offset1 = offset + 40 | ||||
|                 if self.U64: | ||||
|                     offset1 += 4 | ||||
|                 buf = self._read_bytes(offset1, self._lcp) | ||||
|                 self.creator_proc = buf[0 : self._lcp] | ||||
|             elif self._lcs > 0: | ||||
|                 self._lcp = 0 | ||||
|                 offset1 = offset + 16 | ||||
|                 if self.U64: | ||||
|                     offset1 += 4 | ||||
|                 buf = self._read_bytes(offset1, self._lcs) | ||||
|                 self.creator_proc = buf[0 : self._lcp] | ||||
|             if hasattr(self, "creator_proc"): | ||||
|                 self.creator_proc = self._convert_header_text(self.creator_proc) | ||||
| 
 | ||||
|     def _process_columnname_subheader(self, offset: int, length: int) -> None: | ||||
|         int_len = self._int_length | ||||
|         offset += int_len | ||||
|         column_name_pointers_count = (length - 2 * int_len - 12) // 8 | ||||
|         for i in range(column_name_pointers_count): | ||||
|             text_subheader = ( | ||||
|                 offset | ||||
|                 + const.column_name_pointer_length * (i + 1) | ||||
|                 + const.column_name_text_subheader_offset | ||||
|             ) | ||||
|             col_name_offset = ( | ||||
|                 offset | ||||
|                 + const.column_name_pointer_length * (i + 1) | ||||
|                 + const.column_name_offset_offset | ||||
|             ) | ||||
|             col_name_length = ( | ||||
|                 offset | ||||
|                 + const.column_name_pointer_length * (i + 1) | ||||
|                 + const.column_name_length_offset | ||||
|             ) | ||||
| 
 | ||||
|             idx = self._read_uint( | ||||
|                 text_subheader, const.column_name_text_subheader_length | ||||
|             ) | ||||
|             col_offset = self._read_uint( | ||||
|                 col_name_offset, const.column_name_offset_length | ||||
|             ) | ||||
|             col_len = self._read_uint(col_name_length, const.column_name_length_length) | ||||
| 
 | ||||
|             name_raw = self.column_names_raw[idx] | ||||
|             cname = name_raw[col_offset : col_offset + col_len] | ||||
|             self.column_names.append(self._convert_header_text(cname)) | ||||
| 
 | ||||
|     def _process_columnattributes_subheader(self, offset: int, length: int) -> None: | ||||
|         int_len = self._int_length | ||||
|         column_attributes_vectors_count = (length - 2 * int_len - 12) // (int_len + 8) | ||||
|         for i in range(column_attributes_vectors_count): | ||||
|             col_data_offset = ( | ||||
|                 offset + int_len + const.column_data_offset_offset + i * (int_len + 8) | ||||
|             ) | ||||
|             col_data_len = ( | ||||
|                 offset | ||||
|                 + 2 * int_len | ||||
|                 + const.column_data_length_offset | ||||
|                 + i * (int_len + 8) | ||||
|             ) | ||||
|             col_types = ( | ||||
|                 offset + 2 * int_len + const.column_type_offset + i * (int_len + 8) | ||||
|             ) | ||||
| 
 | ||||
|             x = self._read_uint(col_data_offset, int_len) | ||||
|             self._column_data_offsets.append(x) | ||||
| 
 | ||||
|             x = self._read_uint(col_data_len, const.column_data_length_length) | ||||
|             self._column_data_lengths.append(x) | ||||
| 
 | ||||
|             x = self._read_uint(col_types, const.column_type_length) | ||||
|             self._column_types.append(b"d" if x == 1 else b"s") | ||||
| 
 | ||||
|     def _process_columnlist_subheader(self, offset: int, length: int) -> None: | ||||
|         # unknown purpose | ||||
|         pass | ||||
| 
 | ||||
|     def _process_format_subheader(self, offset: int, length: int) -> None: | ||||
|         int_len = self._int_length | ||||
|         text_subheader_format = ( | ||||
|             offset + const.column_format_text_subheader_index_offset + 3 * int_len | ||||
|         ) | ||||
|         col_format_offset = offset + const.column_format_offset_offset + 3 * int_len | ||||
|         col_format_len = offset + const.column_format_length_offset + 3 * int_len | ||||
|         text_subheader_label = ( | ||||
|             offset + const.column_label_text_subheader_index_offset + 3 * int_len | ||||
|         ) | ||||
|         col_label_offset = offset + const.column_label_offset_offset + 3 * int_len | ||||
|         col_label_len = offset + const.column_label_length_offset + 3 * int_len | ||||
| 
 | ||||
|         x = self._read_uint( | ||||
|             text_subheader_format, const.column_format_text_subheader_index_length | ||||
|         ) | ||||
|         format_idx = min(x, len(self.column_names_raw) - 1) | ||||
| 
 | ||||
|         format_start = self._read_uint( | ||||
|             col_format_offset, const.column_format_offset_length | ||||
|         ) | ||||
|         format_len = self._read_uint(col_format_len, const.column_format_length_length) | ||||
| 
 | ||||
|         label_idx = self._read_uint( | ||||
|             text_subheader_label, const.column_label_text_subheader_index_length | ||||
|         ) | ||||
|         label_idx = min(label_idx, len(self.column_names_raw) - 1) | ||||
| 
 | ||||
|         label_start = self._read_uint( | ||||
|             col_label_offset, const.column_label_offset_length | ||||
|         ) | ||||
|         label_len = self._read_uint(col_label_len, const.column_label_length_length) | ||||
| 
 | ||||
|         label_names = self.column_names_raw[label_idx] | ||||
|         column_label = self._convert_header_text( | ||||
|             label_names[label_start : label_start + label_len] | ||||
|         ) | ||||
|         format_names = self.column_names_raw[format_idx] | ||||
|         column_format = self._convert_header_text( | ||||
|             format_names[format_start : format_start + format_len] | ||||
|         ) | ||||
|         current_column_number = len(self.columns) | ||||
| 
 | ||||
|         col = _Column( | ||||
|             current_column_number, | ||||
|             self.column_names[current_column_number], | ||||
|             column_label, | ||||
|             column_format, | ||||
|             self._column_types[current_column_number], | ||||
|             self._column_data_lengths[current_column_number], | ||||
|         ) | ||||
| 
 | ||||
|         self.column_formats.append(column_format) | ||||
|         self.columns.append(col) | ||||
| 
 | ||||
|     def read(self, nrows: int | None = None) -> DataFrame: | ||||
|         if (nrows is None) and (self.chunksize is not None): | ||||
|             nrows = self.chunksize | ||||
|         elif nrows is None: | ||||
|             nrows = self.row_count | ||||
| 
 | ||||
|         if len(self._column_types) == 0: | ||||
|             self.close() | ||||
|             raise EmptyDataError("No columns to parse from file") | ||||
| 
 | ||||
|         if nrows > 0 and self._current_row_in_file_index >= self.row_count: | ||||
|             return DataFrame() | ||||
| 
 | ||||
|         nrows = min(nrows, self.row_count - self._current_row_in_file_index) | ||||
| 
 | ||||
|         nd = self._column_types.count(b"d") | ||||
|         ns = self._column_types.count(b"s") | ||||
| 
 | ||||
|         self._string_chunk = np.empty((ns, nrows), dtype=object) | ||||
|         self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8) | ||||
| 
 | ||||
|         self._current_row_in_chunk_index = 0 | ||||
|         p = Parser(self) | ||||
|         p.read(nrows) | ||||
| 
 | ||||
|         rslt = self._chunk_to_dataframe() | ||||
|         if self.index is not None: | ||||
|             rslt = rslt.set_index(self.index) | ||||
| 
 | ||||
|         return rslt | ||||
| 
 | ||||
|     def _read_next_page(self): | ||||
|         self._current_page_data_subheader_pointers = [] | ||||
|         self._cached_page = self._path_or_buf.read(self._page_length) | ||||
|         if len(self._cached_page) <= 0: | ||||
|             return True | ||||
|         elif len(self._cached_page) != self._page_length: | ||||
|             self.close() | ||||
|             msg = ( | ||||
|                 "failed to read complete page from file (read " | ||||
|                 f"{len(self._cached_page):d} of {self._page_length:d} bytes)" | ||||
|             ) | ||||
|             raise ValueError(msg) | ||||
| 
 | ||||
|         self._read_page_header() | ||||
|         if self._current_page_type in const.page_meta_types: | ||||
|             self._process_page_metadata() | ||||
| 
 | ||||
|         if self._current_page_type not in const.page_meta_types + [ | ||||
|             const.page_data_type, | ||||
|             const.page_mix_type, | ||||
|         ]: | ||||
|             return self._read_next_page() | ||||
| 
 | ||||
|         return False | ||||
| 
 | ||||
|     def _chunk_to_dataframe(self) -> DataFrame: | ||||
|         n = self._current_row_in_chunk_index | ||||
|         m = self._current_row_in_file_index | ||||
|         ix = range(m - n, m) | ||||
|         rslt = {} | ||||
| 
 | ||||
|         js, jb = 0, 0 | ||||
|         infer_string = get_option("future.infer_string") | ||||
|         for j in range(self.column_count): | ||||
|             name = self.column_names[j] | ||||
| 
 | ||||
|             if self._column_types[j] == b"d": | ||||
|                 col_arr = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d") | ||||
|                 rslt[name] = pd.Series(col_arr, dtype=np.float64, index=ix, copy=False) | ||||
|                 if self.convert_dates: | ||||
|                     if self.column_formats[j] in const.sas_date_formats: | ||||
|                         rslt[name] = _convert_datetimes(rslt[name], "d") | ||||
|                     elif self.column_formats[j] in const.sas_datetime_formats: | ||||
|                         rslt[name] = _convert_datetimes(rslt[name], "s") | ||||
|                 jb += 1 | ||||
|             elif self._column_types[j] == b"s": | ||||
|                 rslt[name] = pd.Series(self._string_chunk[js, :], index=ix, copy=False) | ||||
|                 if self.convert_text and (self.encoding is not None): | ||||
|                     rslt[name] = self._decode_string(rslt[name].str) | ||||
|                     if infer_string: | ||||
|                         rslt[name] = rslt[name].astype("str") | ||||
| 
 | ||||
|                 js += 1 | ||||
|             else: | ||||
|                 self.close() | ||||
|                 raise ValueError(f"unknown column type {repr(self._column_types[j])}") | ||||
| 
 | ||||
|         df = DataFrame(rslt, columns=self.column_names, index=ix, copy=False) | ||||
|         return df | ||||
| 
 | ||||
|     def _decode_string(self, b): | ||||
|         return b.decode(self.encoding or self.default_encoding) | ||||
| 
 | ||||
|     def _convert_header_text(self, b: bytes) -> str | bytes: | ||||
|         if self.convert_header_text: | ||||
|             return self._decode_string(b) | ||||
|         else: | ||||
|             return b | ||||
							
								
								
									
										310
									
								
								venv/lib/python3.13/site-packages/pandas/io/sas/sas_constants.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										310
									
								
								venv/lib/python3.13/site-packages/pandas/io/sas/sas_constants.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,310 @@ | |||
| from __future__ import annotations | ||||
| 
 | ||||
| from typing import Final | ||||
| 
 | ||||
# NOTE(review): presumably the byte signature expected at the start of a
# SAS7BDAT file -- confirm at the point of use.
magic: Final = (
    b"\x00\x00\x00\x00\x00\x00\x00\x00"
    b"\x00\x00\x00\x00\xc2\xea\x81\x60"
    b"\xb3\x14\x11\xcf\xbd\x92\x08\x00"
    b"\x09\xc7\x31\x8c\x18\x1f\x10\x11"
)

# Alignment / word-size markers.  How the checker values are compared is
# decided by the reader and is not visible in this module.
align_1_checker_value: Final = b"3"
align_1_offset: Final = 32
align_1_length: Final = 1
align_1_value: Final = 4
u64_byte_checker_value: Final = b"3"
align_2_offset: Final = 35
align_2_length: Final = 1
align_2_value: Final = 4

# Header fields, each described by an ``*_offset`` / ``*_length`` pair.
endianness_offset: Final = 37
endianness_length: Final = 1
platform_offset: Final = 39
platform_length: Final = 1
encoding_offset: Final = 70
encoding_length: Final = 1
dataset_offset: Final = 92
dataset_length: Final = 64
file_type_offset: Final = 156
file_type_length: Final = 8
date_created_offset: Final = 164
date_created_length: Final = 8
date_modified_offset: Final = 172
date_modified_length: Final = 8
header_size_offset: Final = 196
header_size_length: Final = 4
page_size_offset: Final = 200
page_size_length: Final = 4
page_count_offset: Final = 204
page_count_length: Final = 4
sas_release_offset: Final = 216
sas_release_length: Final = 8
sas_server_type_offset: Final = 224
sas_server_type_length: Final = 16
os_version_number_offset: Final = 240
os_version_number_length: Final = 16
os_maker_offset: Final = 256
os_maker_length: Final = 16
os_name_offset: Final = 272
os_name_length: Final = 16

# Page layout values; the ``_x86`` / ``_x64`` suffixes give one value per
# variant (presumably 32-bit vs 64-bit files -- confirm in the reader).
page_bit_offset_x86: Final = 16
page_bit_offset_x64: Final = 32
subheader_pointer_length_x86: Final = 12
subheader_pointer_length_x64: Final = 24
page_type_offset: Final = 0
page_type_length: Final = 2
block_count_offset: Final = 2
block_count_length: Final = 2
subheader_count_offset: Final = 4
subheader_count_length: Final = 2

# Page type masks and codes.
page_type_mask: Final = 0x0F00
# Keep "page_comp_type" bits
page_type_mask2: Final = 0xF000 | page_type_mask
page_meta_type: Final = 0x0000
page_data_type: Final = 0x0100
page_mix_type: Final = 0x0200
page_amd_type: Final = 0x0400
page_meta2_type: Final = 0x4000
page_comp_type: Final = 0x9000
# Both codes that count as a "meta" page.
page_meta_types: Final = [page_meta_type, page_meta2_type]

# Subheader pointer location and subheader flag values.
subheader_pointers_offset: Final = 8
truncated_subheader_id: Final = 1
compressed_subheader_id: Final = 4
compressed_subheader_type: Final = 1
text_block_size_length: Final = 2

# Multipliers used to locate row/column counts.  The unit they multiply is
# chosen by the reader -- NOTE(review): confirm at the call sites.
row_length_offset_multiplier: Final = 5
row_count_offset_multiplier: Final = 6
col_count_p1_multiplier: Final = 9
col_count_p2_multiplier: Final = 10
row_count_on_mix_page_offset_multiplier: Final = 15

# Column name / data / type / format / label fields, again as
# ``*_offset`` / ``*_length`` pairs.
column_name_pointer_length: Final = 8
column_name_text_subheader_offset: Final = 0
column_name_text_subheader_length: Final = 2
column_name_offset_offset: Final = 2
column_name_offset_length: Final = 2
column_name_length_offset: Final = 4
column_name_length_length: Final = 2
column_data_offset_offset: Final = 8
column_data_length_offset: Final = 8
column_data_length_length: Final = 4
column_type_offset: Final = 14
column_type_length: Final = 1
column_format_text_subheader_index_offset: Final = 22
column_format_text_subheader_index_length: Final = 2
column_format_offset_offset: Final = 24
column_format_offset_length: Final = 2
column_format_length_offset: Final = 26
column_format_length_length: Final = 2
column_label_text_subheader_index_offset: Final = 28
column_label_text_subheader_index_length: Final = 2
column_label_offset_offset: Final = 30
column_label_offset_length: Final = 2
column_label_length_offset: Final = 32
column_label_length_length: Final = 2

# Byte signatures naming the two compression schemes (RLE and RDC, going by
# the constant names).
rle_compression: Final = b"SASYZCRL"
rdc_compression: Final = b"SASYZCR2"

compression_literals: Final = [rle_compression, rdc_compression]
| 
 | ||||
# Incomplete list of encodings, using SAS nomenclature:
# https://support.sas.com/documentation/onlinedoc/dfdmstudio/2.6/dmpdmsug/Content/dfU_Encodings_SAS.html
# corresponding to the Python documentation of standard encodings
# https://docs.python.org/3/library/codecs.html#standard-encodings
#
# Maps an integer encoding code to a Python codec name.  Codes marked
# "not found" have no entry in the mapping.
# NOTE(review): codes 39 and 51 both map to "cp874" -- confirm this is
# intentional.
encoding_names: Final = {
    20: "utf-8",
    29: "latin1",
    30: "latin2",
    31: "latin3",
    32: "latin4",
    33: "cyrillic",
    34: "arabic",
    35: "greek",
    36: "hebrew",
    37: "latin5",
    38: "latin6",
    39: "cp874",
    40: "latin9",
    41: "cp437",
    42: "cp850",
    43: "cp852",
    44: "cp857",
    45: "cp858",
    46: "cp862",
    47: "cp864",
    48: "cp865",
    49: "cp866",
    50: "cp869",
    51: "cp874",
    # 52: "",  # not found
    # 53: "",  # not found
    # 54: "",  # not found
    55: "cp720",
    56: "cp737",
    57: "cp775",
    58: "cp860",
    59: "cp863",
    60: "cp1250",
    61: "cp1251",
    62: "cp1252",
    63: "cp1253",
    64: "cp1254",
    65: "cp1255",
    66: "cp1256",
    67: "cp1257",
    68: "cp1258",
    118: "cp950",
    # 119: "",  # not found
    123: "big5",
    125: "gb2312",
    126: "cp936",
    134: "euc_jp",
    136: "cp932",
    138: "shift_jis",
    140: "euc-kr",
    141: "cp949",
    227: "latin8",
    # 228: "", # not found
    # 229: ""  # not found
}
| 
 | ||||
| 
 | ||||
class SASIndex:
    """
    Integer identifiers for the kinds of subheader.

    The values are the targets of ``subheader_signature_to_index``, which
    maps raw subheader signatures onto them.  ``data_subheader_index`` has
    no signature in that mapping.
    """

    row_size_index: Final = 0
    column_size_index: Final = 1
    subheader_counts_index: Final = 2
    column_text_index: Final = 3
    column_name_index: Final = 4
    column_attributes_index: Final = 5
    format_and_label_index: Final = 6
    column_list_index: Final = 7
    data_subheader_index: Final = 8
| 
 | ||||
| 
 | ||||
# Maps each known subheader signature (4 or 8 bytes) to its ``SASIndex``
# value.  Signatures are grouped by the index they resolve to; the resulting
# dict has the same keys, values and insertion order as a flat literal.
subheader_signature_to_index: Final = {
    signature: index
    for index, signatures in (
        (
            SASIndex.row_size_index,
            (
                b"\xF7\xF7\xF7\xF7",
                b"\x00\x00\x00\x00\xF7\xF7\xF7\xF7",
                b"\xF7\xF7\xF7\xF7\x00\x00\x00\x00",
                b"\xF7\xF7\xF7\xF7\xFF\xFF\xFB\xFE",
            ),
        ),
        (
            SASIndex.column_size_index,
            (
                b"\xF6\xF6\xF6\xF6",
                b"\x00\x00\x00\x00\xF6\xF6\xF6\xF6",
                b"\xF6\xF6\xF6\xF6\x00\x00\x00\x00",
                b"\xF6\xF6\xF6\xF6\xFF\xFF\xFB\xFE",
            ),
        ),
        (
            SASIndex.subheader_counts_index,
            (
                b"\x00\xFC\xFF\xFF",
                b"\xFF\xFF\xFC\x00",
                b"\x00\xFC\xFF\xFF\xFF\xFF\xFF\xFF",
                b"\xFF\xFF\xFF\xFF\xFF\xFF\xFC\x00",
            ),
        ),
        (
            SASIndex.column_text_index,
            (
                b"\xFD\xFF\xFF\xFF",
                b"\xFF\xFF\xFF\xFD",
                b"\xFD\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
                b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFD",
            ),
        ),
        (
            SASIndex.column_name_index,
            (
                b"\xFF\xFF\xFF\xFF",
                b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
            ),
        ),
        (
            SASIndex.column_attributes_index,
            (
                b"\xFC\xFF\xFF\xFF",
                b"\xFF\xFF\xFF\xFC",
                b"\xFC\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
                b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFC",
            ),
        ),
        (
            SASIndex.format_and_label_index,
            (
                b"\xFE\xFB\xFF\xFF",
                b"\xFF\xFF\xFB\xFE",
                b"\xFE\xFB\xFF\xFF\xFF\xFF\xFF\xFF",
                b"\xFF\xFF\xFF\xFF\xFF\xFF\xFB\xFE",
            ),
        ),
        (
            SASIndex.column_list_index,
            (
                b"\xFE\xFF\xFF\xFF",
                b"\xFF\xFF\xFF\xFE",
                b"\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
                b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE",
            ),
        ),
    )
    for signature in signatures
}
| 
 | ||||
| 
 | ||||
# List of frequently used SAS date and datetime formats
# http://support.sas.com/documentation/cdl/en/etsug/60372/HTML/default/viewer.htm#etsug_intervals_sect009.htm
# https://github.com/epam/parso/blob/master/src/main/java/com/epam/parso/impl/SasFileConstants.java
#
# Format names treated as dates: the reader converts numeric columns with
# one of these formats using the "d" unit when date conversion is enabled.
sas_date_formats: Final = (
    "DATE",
    "DAY",
    "DDMMYY",
    "DOWNAME",
    "JULDAY",
    "JULIAN",
    "MMDDYY",
    "MMYY",
    "MMYYC",
    "MMYYD",
    "MMYYP",
    "MMYYS",
    "MMYYN",
    "MONNAME",
    "MONTH",
    "MONYY",
    "QTR",
    "QTRR",
    "NENGO",
    "WEEKDATE",
    "WEEKDATX",
    "WEEKDAY",
    "WEEKV",
    "WORDDATE",
    "WORDDATX",
    "YEAR",
    "YYMM",
    "YYMMC",
    "YYMMD",
    "YYMMP",
    "YYMMS",
    "YYMMN",
    "YYMON",
    "YYMMDD",
    "YYQ",
    "YYQC",
    "YYQD",
    "YYQP",
    "YYQS",
    "YYQN",
    "YYQR",
    "YYQRC",
    "YYQRD",
    "YYQRP",
    "YYQRS",
    "YYQRN",
    "YYMMDDP",
    "YYMMDDC",
    "E8601DA",
    "YYMMDDN",
    "MMDDYYC",
    "MMDDYYS",
    "MMDDYYD",
    "YYMMDDS",
    "B8601DA",
    "DDMMYYN",
    "YYMMDDD",
    "DDMMYYB",
    "DDMMYYP",
    "MMDDYYP",
    "YYMMDDB",
    "MMDDYYN",
    "DDMMYYC",
    "DDMMYYD",
    "DDMMYYS",
    "MINGUO",
)
| 
 | ||||
# Format names treated as datetimes: the reader converts numeric columns
# with one of these formats using the "s" unit when date conversion is
# enabled.  Each name is listed exactly once ("DTWKDATX" and "DTMONYY" used
# to appear twice, which had no effect on membership tests).
sas_datetime_formats: Final = (
    "DATETIME",
    "DTWKDATX",
    "B8601DN",
    "B8601DT",
    "B8601DX",
    "B8601DZ",
    "B8601LX",
    "E8601DN",
    "E8601DT",
    "E8601DX",
    "E8601DZ",
    "E8601LX",
    "DATEAMPM",
    "DTDATE",
    "DTMONYY",
    "DTYEAR",
    "TOD",
    "MDYAMPM",
)
							
								
								
									
										508
									
								
								venv/lib/python3.13/site-packages/pandas/io/sas/sas_xport.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										508
									
								
								venv/lib/python3.13/site-packages/pandas/io/sas/sas_xport.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,508 @@ | |||
| """ | ||||
| Read a SAS XPort format file into a Pandas DataFrame. | ||||
| 
 | ||||
| Based on code from Jack Cushman (github.com/jcushman/xport). | ||||
| 
 | ||||
| The file format is defined here: | ||||
| 
 | ||||
| https://support.sas.com/content/dam/SAS/support/en/technical-papers/record-layout-of-a-sas-version-5-or-6-data-set-in-sas-transport-xport-format.pdf | ||||
| """ | ||||
| from __future__ import annotations | ||||
| 
 | ||||
| from collections import abc | ||||
| from datetime import datetime | ||||
| import struct | ||||
| from typing import TYPE_CHECKING | ||||
| import warnings | ||||
| 
 | ||||
| import numpy as np | ||||
| 
 | ||||
| from pandas.util._decorators import Appender | ||||
| from pandas.util._exceptions import find_stack_level | ||||
| 
 | ||||
| import pandas as pd | ||||
| 
 | ||||
| from pandas.io.common import get_handle | ||||
| from pandas.io.sas.sasreader import ReaderBase | ||||
| 
 | ||||
| if TYPE_CHECKING: | ||||
|     from pandas._typing import ( | ||||
|         CompressionOptions, | ||||
|         DatetimeNaTType, | ||||
|         FilePath, | ||||
|         ReadBuffer, | ||||
|     ) | ||||
# Expected text of the first 80-character record of the file; the header
# parser rejects the file when the first record differs.
_correct_line1 = (
    "HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!"
    "000000000000000000000000000000  "
)
# Expected prefix of the first member-header record.  Only a prefix is
# compared (``startswith``); the header parser takes the field-descriptor
# length from the characters that follow it.
_correct_header1 = (
    "HEADER RECORD*******MEMBER  HEADER RECORD!!!!!!!000000000000000001600000000"
)
# Expected text of the second member-header record (compared in full).
_correct_header2 = (
    "HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!"
    "000000000000000000000000000000  "
)
# Expected text of the record that precedes the observations.
_correct_obs_header = (
    "HEADER RECORD*******OBS     HEADER RECORD!!!!!!!"
    "000000000000000000000000000000  "
)
# Names paired, in order, with the values unpacked from one field
# descriptor; the "_" entry is dropped after unpacking.
_fieldkeys = [
    "ntype",
    "nhfun",
    "field_length",
    "nvar0",
    "name",
    "label",
    "nform",
    "nfl",
    "num_decimals",
    "nfj",
    "nfill",
    "niform",
    "nifl",
    "nifd",
    "npos",
    "_",
]
| 
 | ||||
| 
 | ||||
| _base_params_doc = """\ | ||||
| Parameters | ||||
| ---------- | ||||
| filepath_or_buffer : str or file-like object | ||||
|     Path to SAS file or object implementing binary read method.""" | ||||
| 
 | ||||
| _params2_doc = """\ | ||||
| index : identifier of index column | ||||
|     Identifier of column that should be used as index of the DataFrame. | ||||
| encoding : str | ||||
|     Encoding for text data. | ||||
| chunksize : int | ||||
|     Read file `chunksize` lines at a time, returns iterator.""" | ||||
| 
 | ||||
| _format_params_doc = """\ | ||||
| format : str | ||||
|     File format, only `xport` is currently supported.""" | ||||
| 
 | ||||
| _iterator_doc = """\ | ||||
| iterator : bool, default False | ||||
|     Return XportReader object for reading file incrementally.""" | ||||
| 
 | ||||
| 
 | ||||
| _read_sas_doc = f"""Read a SAS file into a DataFrame. | ||||
| 
 | ||||
| {_base_params_doc} | ||||
| {_format_params_doc} | ||||
| {_params2_doc} | ||||
| {_iterator_doc} | ||||
| 
 | ||||
| Returns | ||||
| ------- | ||||
| DataFrame or XportReader | ||||
| 
 | ||||
| Examples | ||||
| -------- | ||||
| Read a SAS Xport file: | ||||
| 
 | ||||
| >>> df = pd.read_sas('filename.XPT') | ||||
| 
 | ||||
| Read a Xport file in 10,000 line chunks: | ||||
| 
 | ||||
| >>> itr = pd.read_sas('filename.XPT', chunksize=10000) | ||||
| >>> for chunk in itr: | ||||
| >>>     do_something(chunk) | ||||
| 
 | ||||
| """ | ||||
| 
 | ||||
| _xport_reader_doc = f"""\ | ||||
| Class for reading SAS Xport files. | ||||
| 
 | ||||
| {_base_params_doc} | ||||
| {_params2_doc} | ||||
| 
 | ||||
| Attributes | ||||
| ---------- | ||||
| member_info : list | ||||
|     Contains information about the file | ||||
| fields : list | ||||
|     Contains information about the variables in the file | ||||
| """ | ||||
| 
 | ||||
| _read_method_doc = """\ | ||||
| Read observations from SAS Xport file, returning as data frame. | ||||
| 
 | ||||
| Parameters | ||||
| ---------- | ||||
| nrows : int | ||||
|     Number of rows to read from data file; if None, read whole | ||||
|     file. | ||||
| 
 | ||||
| Returns | ||||
| ------- | ||||
| A DataFrame. | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| def _parse_date(datestr: str) -> DatetimeNaTType: | ||||
|     """Given a date in xport format, return Python date.""" | ||||
|     try: | ||||
|         # e.g. "16FEB11:10:07:55" | ||||
|         return datetime.strptime(datestr, "%d%b%y:%H:%M:%S") | ||||
|     except ValueError: | ||||
|         return pd.NaT | ||||
| 
 | ||||
| 
 | ||||
| def _split_line(s: str, parts): | ||||
|     """ | ||||
|     Parameters | ||||
|     ---------- | ||||
|     s: str | ||||
|         Fixed-length string to split | ||||
|     parts: list of (name, length) pairs | ||||
|         Used to break up string, name '_' will be filtered from output. | ||||
| 
 | ||||
|     Returns | ||||
|     ------- | ||||
|     Dict of name:contents of string at given location. | ||||
|     """ | ||||
|     out = {} | ||||
|     start = 0 | ||||
|     for name, length in parts: | ||||
|         out[name] = s[start : start + length].strip() | ||||
|         start += length | ||||
|     del out["_"] | ||||
|     return out | ||||
| 
 | ||||
| 
 | ||||
| def _handle_truncated_float_vec(vec, nbytes): | ||||
|     # This feature is not well documented, but some SAS XPORT files | ||||
|     # have 2-7 byte "truncated" floats.  To read these truncated | ||||
|     # floats, pad them with zeros on the right to make 8 byte floats. | ||||
|     # | ||||
|     # References: | ||||
|     # https://github.com/jcushman/xport/pull/3 | ||||
|     # The R "foreign" library | ||||
| 
 | ||||
|     if nbytes != 8: | ||||
|         vec1 = np.zeros(len(vec), np.dtype("S8")) | ||||
|         dtype = np.dtype(f"S{nbytes},S{8 - nbytes}") | ||||
|         vec2 = vec1.view(dtype=dtype) | ||||
|         vec2["f0"] = vec | ||||
|         return vec2 | ||||
| 
 | ||||
|     return vec | ||||
| 
 | ||||
| 
 | ||||
def _parse_float_vec(vec):
    """
    Parse a vector of float values representing IBM 8 byte floats into
    native 8 byte floats.

    Parameters
    ----------
    vec : numpy array
        Items are reinterpreted as two big-endian 32-bit unsigned integers
        each, so every item must be exactly 8 bytes wide.

    Returns
    -------
    numpy array of native-byte-order 8 byte floats, one per item of ``vec``.
    """
    dtype = np.dtype(">u4,>u4")
    vec1 = vec.view(dtype=dtype)
    # High and low 32-bit halves of each IBM float.
    xport1 = vec1["f0"]
    xport2 = vec1["f1"]

    # Start by setting first half of ieee number to first half of IBM
    # number sans exponent
    ieee1 = xport1 & 0x00FFFFFF

    # The fraction bit to the left of the binary point in the ieee
    # format was set and the number was shifted 0, 1, 2, or 3
    # places. This will tell us how to adjust the ibm exponent to be a
    # power of 2 ieee exponent and how to shift the fraction bits to
    # restore the correct magnitude.
    shift = np.zeros(len(vec), dtype=np.uint8)
    # Later assignments override earlier ones, so the highest set bit wins.
    shift[np.where(xport1 & 0x00200000)] = 1
    shift[np.where(xport1 & 0x00400000)] = 2
    shift[np.where(xport1 & 0x00800000)] = 3

    # shift the ieee number down the correct number of places then
    # set the second half of the ieee number to be the second half
    # of the ibm number shifted appropriately, ored with the bits
    # from the first half that would have been shifted in if we
    # could shift a double. All we are worried about are the low
    # order 3 bits of the first half since we're only shifting by
    # 1, 2, or 3.
    ieee1 >>= shift
    ieee2 = (xport2 >> shift) | ((xport1 & 0x00000007) << (29 + (3 - shift)))

    # clear the 1 bit to the left of the binary point
    ieee1 &= 0xFFEFFFFF

    # set the exponent of the ieee number to be the actual exponent
    # plus the shift count + 1023. Or this into the first half of the
    # ieee number. The ibm exponent is excess 64 but is adjusted by 65
    # since during conversion to ibm format the exponent is
    # incremented by 1 and the fraction bits left 4 positions to the
    # right of the radix point.  (had to add >> 24 because C treats &
    # 0x7f as 0x7f000000 and Python doesn't)
    ieee1 |= ((((((xport1 >> 24) & 0x7F) - 65) << 2) + shift + 1023) << 20) | (
        xport1 & 0x80000000
    )

    # Reassemble the two halves as big-endian doubles, then convert to the
    # native float64 representation.
    ieee = np.empty((len(ieee1),), dtype=">u4,>u4")
    ieee["f0"] = ieee1
    ieee["f1"] = ieee2
    ieee = ieee.view(dtype=">f8")
    ieee = ieee.astype("f8")

    return ieee
| 
 | ||||
| 
 | ||||
| class XportReader(ReaderBase, abc.Iterator): | ||||
|     __doc__ = _xport_reader_doc | ||||
| 
 | ||||
|     def __init__( | ||||
|         self, | ||||
|         filepath_or_buffer: FilePath | ReadBuffer[bytes], | ||||
|         index=None, | ||||
|         encoding: str | None = "ISO-8859-1", | ||||
|         chunksize: int | None = None, | ||||
|         compression: CompressionOptions = "infer", | ||||
|     ) -> None: | ||||
|         self._encoding = encoding | ||||
|         self._lines_read = 0 | ||||
|         self._index = index | ||||
|         self._chunksize = chunksize | ||||
| 
 | ||||
|         self.handles = get_handle( | ||||
|             filepath_or_buffer, | ||||
|             "rb", | ||||
|             encoding=encoding, | ||||
|             is_text=False, | ||||
|             compression=compression, | ||||
|         ) | ||||
|         self.filepath_or_buffer = self.handles.handle | ||||
| 
 | ||||
|         try: | ||||
|             self._read_header() | ||||
|         except Exception: | ||||
|             self.close() | ||||
|             raise | ||||
| 
 | ||||
|     def close(self) -> None: | ||||
|         self.handles.close() | ||||
| 
 | ||||
|     def _get_row(self): | ||||
|         return self.filepath_or_buffer.read(80).decode() | ||||
| 
 | ||||
|     def _read_header(self) -> None: | ||||
|         self.filepath_or_buffer.seek(0) | ||||
| 
 | ||||
|         # read file header | ||||
|         line1 = self._get_row() | ||||
|         if line1 != _correct_line1: | ||||
|             if "**COMPRESSED**" in line1: | ||||
|                 # this was created with the PROC CPORT method and can't be read | ||||
|                 # https://documentation.sas.com/doc/en/pgmsascdc/9.4_3.5/movefile/p1bm6aqp3fw4uin1hucwh718f6kp.htm | ||||
|                 raise ValueError( | ||||
|                     "Header record indicates a CPORT file, which is not readable." | ||||
|                 ) | ||||
|             raise ValueError("Header record is not an XPORT file.") | ||||
| 
 | ||||
|         line2 = self._get_row() | ||||
|         fif = [["prefix", 24], ["version", 8], ["OS", 8], ["_", 24], ["created", 16]] | ||||
|         file_info = _split_line(line2, fif) | ||||
|         if file_info["prefix"] != "SAS     SAS     SASLIB": | ||||
|             raise ValueError("Header record has invalid prefix.") | ||||
|         file_info["created"] = _parse_date(file_info["created"]) | ||||
|         self.file_info = file_info | ||||
| 
 | ||||
|         line3 = self._get_row() | ||||
|         file_info["modified"] = _parse_date(line3[:16]) | ||||
| 
 | ||||
|         # read member header | ||||
|         header1 = self._get_row() | ||||
|         header2 = self._get_row() | ||||
|         headflag1 = header1.startswith(_correct_header1) | ||||
|         headflag2 = header2 == _correct_header2 | ||||
|         if not (headflag1 and headflag2): | ||||
|             raise ValueError("Member header not found") | ||||
|         # usually 140, could be 135 | ||||
|         fieldnamelength = int(header1[-5:-2]) | ||||
| 
 | ||||
|         # member info | ||||
|         mem = [ | ||||
|             ["prefix", 8], | ||||
|             ["set_name", 8], | ||||
|             ["sasdata", 8], | ||||
|             ["version", 8], | ||||
|             ["OS", 8], | ||||
|             ["_", 24], | ||||
|             ["created", 16], | ||||
|         ] | ||||
|         member_info = _split_line(self._get_row(), mem) | ||||
|         mem = [["modified", 16], ["_", 16], ["label", 40], ["type", 8]] | ||||
|         member_info.update(_split_line(self._get_row(), mem)) | ||||
|         member_info["modified"] = _parse_date(member_info["modified"]) | ||||
|         member_info["created"] = _parse_date(member_info["created"]) | ||||
|         self.member_info = member_info | ||||
| 
 | ||||
|         # read field names | ||||
|         types = {1: "numeric", 2: "char"} | ||||
|         fieldcount = int(self._get_row()[54:58]) | ||||
|         datalength = fieldnamelength * fieldcount | ||||
|         # round up to nearest 80 | ||||
|         if datalength % 80: | ||||
|             datalength += 80 - datalength % 80 | ||||
|         fielddata = self.filepath_or_buffer.read(datalength) | ||||
|         fields = [] | ||||
|         obs_length = 0 | ||||
|         while len(fielddata) >= fieldnamelength: | ||||
|             # pull data for one field | ||||
|             fieldbytes, fielddata = ( | ||||
|                 fielddata[:fieldnamelength], | ||||
|                 fielddata[fieldnamelength:], | ||||
|             ) | ||||
| 
 | ||||
|             # rest at end gets ignored, so if field is short, pad out | ||||
|             # to match struct pattern below | ||||
|             fieldbytes = fieldbytes.ljust(140) | ||||
| 
 | ||||
|             fieldstruct = struct.unpack(">hhhh8s40s8shhh2s8shhl52s", fieldbytes) | ||||
|             field = dict(zip(_fieldkeys, fieldstruct)) | ||||
|             del field["_"] | ||||
|             field["ntype"] = types[field["ntype"]] | ||||
|             fl = field["field_length"] | ||||
|             if field["ntype"] == "numeric" and ((fl < 2) or (fl > 8)): | ||||
|                 msg = f"Floating field width {fl} is not between 2 and 8." | ||||
|                 raise TypeError(msg) | ||||
| 
 | ||||
|             for k, v in field.items(): | ||||
|                 try: | ||||
|                     field[k] = v.strip() | ||||
|                 except AttributeError: | ||||
|                     pass | ||||
| 
 | ||||
|             obs_length += field["field_length"] | ||||
|             fields += [field] | ||||
| 
 | ||||
|         header = self._get_row() | ||||
|         if not header == _correct_obs_header: | ||||
|             raise ValueError("Observation header not found.") | ||||
| 
 | ||||
|         self.fields = fields | ||||
|         self.record_length = obs_length | ||||
|         self.record_start = self.filepath_or_buffer.tell() | ||||
| 
 | ||||
|         self.nobs = self._record_count() | ||||
|         self.columns = [x["name"].decode() for x in self.fields] | ||||
| 
 | ||||
|         # Setup the dtype. | ||||
|         dtypel = [ | ||||
|             ("s" + str(i), "S" + str(field["field_length"])) | ||||
|             for i, field in enumerate(self.fields) | ||||
|         ] | ||||
|         dtype = np.dtype(dtypel) | ||||
|         self._dtype = dtype | ||||
| 
 | ||||
|     def __next__(self) -> pd.DataFrame: | ||||
|         return self.read(nrows=self._chunksize or 1) | ||||
| 
 | ||||
|     def _record_count(self) -> int: | ||||
|         """ | ||||
|         Get number of records in file. | ||||
| 
 | ||||
|         This is maybe suboptimal because we have to seek to the end of | ||||
|         the file. | ||||
| 
 | ||||
|         Side effect: returns file position to record_start. | ||||
|         """ | ||||
|         self.filepath_or_buffer.seek(0, 2) | ||||
|         total_records_length = self.filepath_or_buffer.tell() - self.record_start | ||||
| 
 | ||||
|         if total_records_length % 80 != 0: | ||||
|             warnings.warn( | ||||
|                 "xport file may be corrupted.", | ||||
|                 stacklevel=find_stack_level(), | ||||
|             ) | ||||
| 
 | ||||
|         if self.record_length > 80: | ||||
|             self.filepath_or_buffer.seek(self.record_start) | ||||
|             return total_records_length // self.record_length | ||||
| 
 | ||||
|         self.filepath_or_buffer.seek(-80, 2) | ||||
|         last_card_bytes = self.filepath_or_buffer.read(80) | ||||
|         last_card = np.frombuffer(last_card_bytes, dtype=np.uint64) | ||||
| 
 | ||||
|         # 8 byte blank | ||||
|         ix = np.flatnonzero(last_card == 2314885530818453536) | ||||
| 
 | ||||
|         if len(ix) == 0: | ||||
|             tail_pad = 0 | ||||
|         else: | ||||
|             tail_pad = 8 * len(ix) | ||||
| 
 | ||||
|         self.filepath_or_buffer.seek(self.record_start) | ||||
| 
 | ||||
|         return (total_records_length - tail_pad) // self.record_length | ||||
| 
 | ||||
|     def get_chunk(self, size: int | None = None) -> pd.DataFrame: | ||||
|         """ | ||||
|         Reads lines from Xport file and returns as dataframe | ||||
| 
 | ||||
|         Parameters | ||||
|         ---------- | ||||
|         size : int, defaults to None | ||||
|             Number of lines to read.  If None, reads whole file. | ||||
| 
 | ||||
|         Returns | ||||
|         ------- | ||||
|         DataFrame | ||||
|         """ | ||||
|         if size is None: | ||||
|             size = self._chunksize | ||||
|         return self.read(nrows=size) | ||||
| 
 | ||||
|     def _missing_double(self, vec): | ||||
|         v = vec.view(dtype="u1,u1,u2,u4") | ||||
|         miss = (v["f1"] == 0) & (v["f2"] == 0) & (v["f3"] == 0) | ||||
|         miss1 = ( | ||||
|             ((v["f0"] >= 0x41) & (v["f0"] <= 0x5A)) | ||||
|             | (v["f0"] == 0x5F) | ||||
|             | (v["f0"] == 0x2E) | ||||
|         ) | ||||
|         miss &= miss1 | ||||
|         return miss | ||||
| 
 | ||||
|     @Appender(_read_method_doc) | ||||
|     def read(self, nrows: int | None = None) -> pd.DataFrame: | ||||
|         if nrows is None: | ||||
|             nrows = self.nobs | ||||
| 
 | ||||
|         read_lines = min(nrows, self.nobs - self._lines_read) | ||||
|         read_len = read_lines * self.record_length | ||||
|         if read_len <= 0: | ||||
|             self.close() | ||||
|             raise StopIteration | ||||
|         raw = self.filepath_or_buffer.read(read_len) | ||||
|         data = np.frombuffer(raw, dtype=self._dtype, count=read_lines) | ||||
| 
 | ||||
|         df_data = {} | ||||
|         for j, x in enumerate(self.columns): | ||||
|             vec = data["s" + str(j)] | ||||
|             ntype = self.fields[j]["ntype"] | ||||
|             if ntype == "numeric": | ||||
|                 vec = _handle_truncated_float_vec(vec, self.fields[j]["field_length"]) | ||||
|                 miss = self._missing_double(vec) | ||||
|                 v = _parse_float_vec(vec) | ||||
|                 v[miss] = np.nan | ||||
|             elif self.fields[j]["ntype"] == "char": | ||||
|                 v = [y.rstrip() for y in vec] | ||||
| 
 | ||||
|                 if self._encoding is not None: | ||||
|                     v = [y.decode(self._encoding) for y in v] | ||||
| 
 | ||||
|             df_data.update({x: v}) | ||||
|         df = pd.DataFrame(df_data) | ||||
| 
 | ||||
|         if self._index is None: | ||||
|             df.index = pd.Index(range(self._lines_read, self._lines_read + read_lines)) | ||||
|         else: | ||||
|             df = df.set_index(self._index) | ||||
| 
 | ||||
|         self._lines_read += read_lines | ||||
| 
 | ||||
|         return df | ||||
							
								
								
									
										178
									
								
								venv/lib/python3.13/site-packages/pandas/io/sas/sasreader.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										178
									
								
								venv/lib/python3.13/site-packages/pandas/io/sas/sasreader.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,178 @@ | |||
| """ | ||||
| Read SAS sas7bdat or xport files. | ||||
| """ | ||||
| from __future__ import annotations | ||||
| 
 | ||||
| from abc import ( | ||||
|     ABC, | ||||
|     abstractmethod, | ||||
| ) | ||||
| from typing import ( | ||||
|     TYPE_CHECKING, | ||||
|     overload, | ||||
| ) | ||||
| 
 | ||||
| from pandas.util._decorators import doc | ||||
| 
 | ||||
| from pandas.core.shared_docs import _shared_docs | ||||
| 
 | ||||
| from pandas.io.common import stringify_path | ||||
| 
 | ||||
| if TYPE_CHECKING: | ||||
|     from collections.abc import Hashable | ||||
|     from types import TracebackType | ||||
| 
 | ||||
|     from pandas._typing import ( | ||||
|         CompressionOptions, | ||||
|         FilePath, | ||||
|         ReadBuffer, | ||||
|         Self, | ||||
|     ) | ||||
| 
 | ||||
|     from pandas import DataFrame | ||||
| 
 | ||||
| 
 | ||||
class ReaderBase(ABC):
    """
    Protocol for XportReader and SAS7BDATReader classes.

    Subclasses implement ``read`` and ``close``.  Instances can be used as
    context managers: entering yields the reader itself and leaving the
    block always calls ``close``.
    """

    @abstractmethod
    def read(self, nrows: int | None = None) -> DataFrame:
        """Read rows from the underlying file and return them as a DataFrame."""

    @abstractmethod
    def close(self) -> None:
        """Release the resources held by the reader."""

    def __enter__(self) -> Self:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        # Returns None, so an exception raised inside the block propagates.
        self.close()
| 
 | ||||
| 
 | ||||
def _infer_sas_format(fname: str) -> str | None:
    """
    Guess the SAS format from a lower-cased file name or URL.

    The final path component is inspected first so that a directory whose
    name happens to contain ``.xpt`` does not override the extension of the
    file itself.  If that is inconclusive the whole string is searched, which
    keeps compressed names and URLs with trailing segments working.

    Returns ``"xport"``, ``"sas7bdat"`` or None when nothing matches.
    """
    basename = fname.replace("\\", "/").rsplit("/", 1)[-1]
    for candidate in (basename, fname):
        if ".xpt" in candidate:
            return "xport"
        if ".sas7bdat" in candidate:
            return "sas7bdat"
    return None


# With an integer chunksize a reader object is always returned.
@overload
def read_sas(
    filepath_or_buffer: FilePath | ReadBuffer[bytes],
    *,
    format: str | None = ...,
    index: Hashable | None = ...,
    encoding: str | None = ...,
    chunksize: int = ...,
    iterator: bool = ...,
    compression: CompressionOptions = ...,
) -> ReaderBase:
    ...


# Without a chunksize the result depends on ``iterator``.
@overload
def read_sas(
    filepath_or_buffer: FilePath | ReadBuffer[bytes],
    *,
    format: str | None = ...,
    index: Hashable | None = ...,
    encoding: str | None = ...,
    chunksize: None = ...,
    iterator: bool = ...,
    compression: CompressionOptions = ...,
) -> DataFrame | ReaderBase:
    ...


@doc(decompression_options=_shared_docs["decompression_options"] % "filepath_or_buffer")
def read_sas(
    filepath_or_buffer: FilePath | ReadBuffer[bytes],
    *,
    format: str | None = None,
    index: Hashable | None = None,
    encoding: str | None = None,
    chunksize: int | None = None,
    iterator: bool = False,
    compression: CompressionOptions = "infer",
) -> DataFrame | ReaderBase:
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.

    Parameters
    ----------
    filepath_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.sas7bdat``.
    format : str {{'xport', 'sas7bdat'}} or None
        If None, file format is inferred from file extension. If 'xport' or
        'sas7bdat', uses the corresponding format.
    index : identifier of index column, defaults to None
        Identifier of column that should be used as index of the DataFrame.
    encoding : str, default is None
        Encoding for text data.  If None, text data are stored as raw bytes.
    chunksize : int
        Read file `chunksize` lines at a time, returns iterator.
    iterator : bool, defaults to False
        If True, returns an iterator for reading the file incrementally.
    {decompression_options}

    Returns
    -------
    DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
    or XportReader

    Raises
    ------
    ValueError
        If ``format`` is None and the input is not a path-like string, if the
        format cannot be inferred from the name, or if ``format`` is not one
        of the supported values.

    Examples
    --------
    >>> df = pd.read_sas("sas_data.sas7bdat")  # doctest: +SKIP
    """
    if format is None:
        filepath_or_buffer = stringify_path(filepath_or_buffer)
        if not isinstance(filepath_or_buffer, str):
            raise ValueError(
                "If this is a buffer object rather "
                "than a string name, you must specify a format string"
            )
        fname = filepath_or_buffer.lower()
        format = _infer_sas_format(fname)
        if format is None:
            raise ValueError(
                f"unable to infer format of SAS file from filename: {repr(fname)}"
            )

    format_key = format.lower()
    reader: ReaderBase
    # Reader classes are imported lazily, only for the format actually used.
    if format_key == "xport":
        from pandas.io.sas.sas_xport import XportReader

        reader = XportReader(
            filepath_or_buffer,
            index=index,
            encoding=encoding,
            chunksize=chunksize,
            compression=compression,
        )
    elif format_key == "sas7bdat":
        from pandas.io.sas.sas7bdat import SAS7BDATReader

        reader = SAS7BDATReader(
            filepath_or_buffer,
            index=index,
            encoding=encoding,
            chunksize=chunksize,
            compression=compression,
        )
    else:
        raise ValueError("unknown SAS format")

    if iterator or chunksize:
        # Caller iterates and is responsible for closing the reader.
        return reader

    # One-shot read: the context manager closes the reader afterwards.
    with reader:
        return reader.read()
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Tykayn
						Tykayn