from __future__ import annotations

from collections import (
    abc,
    defaultdict,
)
from collections.abc import (
    Hashable,
    Iterator,
    Mapping,
    Sequence,
)
import csv
from io import StringIO
import re
from typing import (
    IO,
    TYPE_CHECKING,
    DefaultDict,
    Literal,
    cast,
)
import warnings

import numpy as np

from pandas._libs import lib
from pandas.errors import (
    EmptyDataError,
    ParserError,
    ParserWarning,
)
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import (
    is_bool_dtype,
    is_integer,
    is_numeric_dtype,
)
from pandas.core.dtypes.inference import is_dict_like

from pandas.io.common import (
    dedup_names,
    is_potential_multi_index,
)
from pandas.io.parsers.base_parser import (
    ParserBase,
    parser_defaults,
)

if TYPE_CHECKING:
    from pandas._typing import (
        ArrayLike,
        ReadCsvBuffer,
        Scalar,
    )

    from pandas import (
        Index,
        MultiIndex,
    )

# BOM character (byte order mark)
# This exists at the beginning of a file to indicate endianness
# of a file (stream). Unfortunately, this marker screws up parsing,
# so we need to remove it if we see it.
_BOM = "\ufeff"


class PythonParser(ParserBase):
    _no_thousands_columns: set[int]

    def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
        """
        Workhorse function for processing nested list into DataFrame
        """
        super().__init__(kwds)

        self.data: Iterator[str] | None = None
        self.buf: list = []
        self.pos = 0
        self.line_pos = 0

        self.skiprows = kwds["skiprows"]

        if callable(self.skiprows):
            self.skipfunc = self.skiprows
        else:
            self.skipfunc = lambda x: x in self.skiprows

        self.skipfooter = _validate_skipfooter_arg(kwds["skipfooter"])
        self.delimiter = kwds["delimiter"]

        self.quotechar = kwds["quotechar"]
        if isinstance(self.quotechar, str):
            self.quotechar = str(self.quotechar)

        self.escapechar = kwds["escapechar"]
        self.doublequote = kwds["doublequote"]
        self.skipinitialspace = kwds["skipinitialspace"]
        self.lineterminator = kwds["lineterminator"]
        self.quoting = kwds["quoting"]
        self.skip_blank_lines = kwds["skip_blank_lines"]

        self.has_index_names = False
        if "has_index_names" in kwds:
            self.has_index_names = kwds["has_index_names"]

        self.verbose = kwds["verbose"]

        self.thousands = kwds["thousands"]
        self.decimal = kwds["decimal"]

        self.comment = kwds["comment"]

        # Set self.data to something that can read lines.
        if isinstance(f, list):
            # read_excel: f is a list
            self.data = cast(Iterator[str], f)
        else:
            assert hasattr(f, "readline")
            self.data = self._make_reader(f)

        # Get columns in two steps: infer from data, then
        # infer column indices from self.usecols if it is specified.
        self._col_indices: list[int] | None = None
        columns: list[list[Scalar | None]]
        (
            columns,
            self.num_original_columns,
            self.unnamed_cols,
        ) = self._infer_columns()

        # Now self.columns has the set of columns that we will process.
        # The original set is stored in self.original_columns.
        # error: Cannot determine type of 'index_names'
        (
            self.columns,
            self.index_names,
            self.col_names,
            _,
        ) = self._extract_multi_indexer_columns(
            columns,
            self.index_names,  # type: ignore[has-type]
        )

        # get popped off for index
        self.orig_names: list[Hashable] = list(self.columns)

        # needs to be cleaned/refactored
        # multiple date column thing turning into a real spaghetti factory

        if not self._has_complex_date_col:
            (index_names, self.orig_names, self.columns) = self._get_index_name()
            self._name_processed = True
            if self.index_names is None:
                self.index_names = index_names

        if self._col_indices is None:
            self._col_indices = list(range(len(self.columns)))

        self._parse_date_cols = self._validate_parse_dates_presence(self.columns)
        self._no_thousands_columns = self._set_no_thousand_columns()

        if len(self.decimal) != 1:
            raise ValueError("Only length-1 decimal markers supported")

    @cache_readonly
    def num(self) -> re.Pattern:
        decimal = re.escape(self.decimal)
        if self.thousands is None:
            regex = rf"^[\-\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$"
        else:
            thousands = re.escape(self.thousands)
            regex = (
                rf"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
                rf"([0-9]?(E|e)\-?[0-9]+)?$"
            )
        return re.compile(regex)
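
    # For illustration: with decimal="." and thousands=",", self.num compiles to
    #     ^[\-\+]?([0-9]+,|[0-9])*(\.[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$
    # which matches strings such as "1,234.56", "-12", and "+3.1e-5" but not
    # "abc" or "1.2.3". _search_replace_num_columns consults it to decide
    # whether a field looks numeric before rewriting separators in it.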

    def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]):
        sep = self.delimiter

        if sep is None or len(sep) == 1:
            if self.lineterminator:
                raise ValueError(
                    "Custom line terminators not supported in python parser (yet)"
                )

            class MyDialect(csv.Dialect):
                delimiter = self.delimiter
                quotechar = self.quotechar
                escapechar = self.escapechar
                doublequote = self.doublequote
                skipinitialspace = self.skipinitialspace
                quoting = self.quoting
                lineterminator = "\n"

            dia = MyDialect

            if sep is not None:
                dia.delimiter = sep
            else:
                # attempt to sniff the delimiter from the first valid line,
                # i.e. no comment line and not in skiprows
                line = f.readline()
                lines = self._check_comments([[line]])[0]
                while self.skipfunc(self.pos) or not lines:
                    self.pos += 1
                    line = f.readline()
                    lines = self._check_comments([[line]])[0]
                lines_str = cast(list[str], lines)

                # since `line` was a string, lines will be a list containing
                # only a single string
                line = lines_str[0]

                self.pos += 1
                self.line_pos += 1
                sniffed = csv.Sniffer().sniff(line)
                dia.delimiter = sniffed.delimiter

                # Note: encoding is irrelevant here
                line_rdr = csv.reader(StringIO(line), dialect=dia)
                self.buf.extend(list(line_rdr))

            # Note: encoding is irrelevant here
            reader = csv.reader(f, dialect=dia, strict=True)

        else:

            def _read():
                line = f.readline()
                pat = re.compile(sep)

                yield pat.split(line.strip())

                for line in f:
                    yield pat.split(line.strip())

            reader = _read()

        return reader

    def read(
        self, rows: int | None = None
    ) -> tuple[
        Index | None, Sequence[Hashable] | MultiIndex, Mapping[Hashable, ArrayLike]
    ]:
        try:
            content = self._get_lines(rows)
        except StopIteration:
            if self._first_chunk:
                content = []
            else:
                self.close()
                raise

        # done with first read, next time raise StopIteration
        self._first_chunk = False

        columns: Sequence[Hashable] = list(self.orig_names)
        if not len(content):  # pragma: no cover
            # DataFrame with the right metadata, even though it's length 0
            # error: Cannot determine type of 'index_col'
            names = dedup_names(
                self.orig_names,
                is_potential_multi_index(
                    self.orig_names,
                    self.index_col,  # type: ignore[has-type]
                ),
            )
            index, columns, col_dict = self._get_empty_meta(
                names,
                self.dtype,
            )
            conv_columns = self._maybe_make_multi_index_columns(columns, self.col_names)
            return index, conv_columns, col_dict

        # handle new style for names in index
        count_empty_content_vals = count_empty_vals(content[0])
        indexnamerow = None
        if self.has_index_names and count_empty_content_vals == len(columns):
            indexnamerow = content[0]
            content = content[1:]

        alldata = self._rows_to_cols(content)
        data, columns = self._exclude_implicit_index(alldata)

        conv_data = self._convert_data(data)
        columns, conv_data = self._do_date_conversions(columns, conv_data)

        index, result_columns = self._make_index(
            conv_data, alldata, columns, indexnamerow
        )

        return index, result_columns, conv_data

    def _exclude_implicit_index(
        self,
        alldata: list[np.ndarray],
    ) -> tuple[Mapping[Hashable, np.ndarray], Sequence[Hashable]]:
        # error: Cannot determine type of 'index_col'
        names = dedup_names(
            self.orig_names,
            is_potential_multi_index(
                self.orig_names,
                self.index_col,  # type: ignore[has-type]
            ),
        )

        offset = 0
        if self._implicit_index:
            # error: Cannot determine type of 'index_col'
            offset = len(self.index_col)  # type: ignore[has-type]

        len_alldata = len(alldata)
        self._check_data_length(names, alldata)

        return {
            name: alldata[i + offset] for i, name in enumerate(names) if i < len_alldata
        }, names

    # legacy
    def get_chunk(
        self, size: int | None = None
    ) -> tuple[
        Index | None, Sequence[Hashable] | MultiIndex, Mapping[Hashable, ArrayLike]
    ]:
        if size is None:
            # error: "PythonParser" has no attribute "chunksize"
            size = self.chunksize  # type: ignore[attr-defined]
        return self.read(rows=size)

    def _convert_data(
        self,
        data: Mapping[Hashable, np.ndarray],
    ) -> Mapping[Hashable, ArrayLike]:
        # apply converters
        clean_conv = self._clean_mapping(self.converters)
        clean_dtypes = self._clean_mapping(self.dtype)

        # Apply NA values.
        clean_na_values = {}
        clean_na_fvalues = {}

        if isinstance(self.na_values, dict):
            for col in self.na_values:
                na_value = self.na_values[col]
                na_fvalue = self.na_fvalues[col]

                if isinstance(col, int) and col not in self.orig_names:
                    col = self.orig_names[col]

                clean_na_values[col] = na_value
                clean_na_fvalues[col] = na_fvalue
        else:
            clean_na_values = self.na_values
            clean_na_fvalues = self.na_fvalues

        return self._convert_to_ndarrays(
            data,
            clean_na_values,
            clean_na_fvalues,
            self.verbose,
            clean_conv,
            clean_dtypes,
        )
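
    # For illustration: when na_values is a dict keyed by position, e.g.
    # {0: ["n/a"]} with orig_names == ["a", "b"], _convert_data remaps the
    # integer key to the column name, giving clean_na_values == {"a": ["n/a"]}.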

    @cache_readonly
    def _have_mi_columns(self) -> bool:
        if self.header is None:
            return False

        header = self.header
        if isinstance(header, (list, tuple, np.ndarray)):
            return len(header) > 1
        else:
            return False

    def _infer_columns(
        self,
    ) -> tuple[list[list[Scalar | None]], int, set[Scalar | None]]:
        names = self.names
        num_original_columns = 0
        clear_buffer = True
        unnamed_cols: set[Scalar | None] = set()

        if self.header is not None:
            header = self.header
            have_mi_columns = self._have_mi_columns

            if isinstance(header, (list, tuple, np.ndarray)):
                # we have MultiIndex columns, so read an extra line
                if have_mi_columns:
                    header = list(header) + [header[-1] + 1]
            else:
                header = [header]

            columns: list[list[Scalar | None]] = []
            for level, hr in enumerate(header):
                try:
                    line = self._buffered_line()

                    while self.line_pos <= hr:
                        line = self._next_line()

                except StopIteration as err:
                    if 0 < self.line_pos <= hr and (
                        not have_mi_columns or hr != header[-1]
                    ):
                        # If there are no rows, we want to raise a different
                        # message; and if we have MultiIndex columns, the last
                        # line is not part of the header.
                        joi = list(map(str, header[:-1] if have_mi_columns else header))
                        msg = f"[{','.join(joi)}], len of {len(joi)}, "
                        raise ValueError(
                            f"Passed header={msg}"
                            f"but only {self.line_pos} lines in file"
                        ) from err

                    # We have an empty file, so check
                    # if columns are provided. That will
                    # serve as the 'line' for parsing
                    if have_mi_columns and hr > 0:
                        if clear_buffer:
                            self._clear_buffer()
                        columns.append([None] * len(columns[-1]))
                        return columns, num_original_columns, unnamed_cols

                    if not self.names:
                        raise EmptyDataError("No columns to parse from file") from err

                    line = self.names[:]

                this_columns: list[Scalar | None] = []
                this_unnamed_cols = []

                for i, c in enumerate(line):
                    if c == "":
                        if have_mi_columns:
                            col_name = f"Unnamed: {i}_level_{level}"
                        else:
                            col_name = f"Unnamed: {i}"

                        this_unnamed_cols.append(i)
                        this_columns.append(col_name)
                    else:
                        this_columns.append(c)

                if not have_mi_columns:
                    counts: DefaultDict = defaultdict(int)
                    # Ensure that regular columns are used before unnamed ones
                    # to keep given names and mangle unnamed columns
                    col_loop_order = [
                        i
                        for i in range(len(this_columns))
                        if i not in this_unnamed_cols
                    ] + this_unnamed_cols

                    # TODO: Use pandas.io.common.dedup_names instead (see #50371)
                    for i in col_loop_order:
                        col = this_columns[i]
                        old_col = col
                        cur_count = counts[col]

                        if cur_count > 0:
                            while cur_count > 0:
                                counts[old_col] = cur_count + 1
                                col = f"{old_col}.{cur_count}"
                                if col in this_columns:
                                    cur_count += 1
                                else:
                                    cur_count = counts[col]

                            if (
                                self.dtype is not None
                                and is_dict_like(self.dtype)
                                and self.dtype.get(old_col) is not None
                                and self.dtype.get(col) is None
                            ):
                                self.dtype.update({col: self.dtype.get(old_col)})
                        this_columns[i] = col
                        counts[col] = cur_count + 1
                elif have_mi_columns:
                    # if we have grabbed an extra line, but it's not in our
                    # format, save it in the buffer and create a blank extra
                    # line for the rest of the parsing code
                    if hr == header[-1]:
                        lc = len(this_columns)
                        # error: Cannot determine type of 'index_col'
                        sic = self.index_col  # type: ignore[has-type]
                        ic = len(sic) if sic is not None else 0
                        unnamed_count = len(this_unnamed_cols)

                        # if wrong number of blanks or no index, not our format
                        if (lc != unnamed_count and lc - ic > unnamed_count) or ic == 0:
                            clear_buffer = False
                            this_columns = [None] * lc
                            self.buf = [self.buf[-1]]

                columns.append(this_columns)
                unnamed_cols.update({this_columns[i] for i in this_unnamed_cols})

                if len(columns) == 1:
                    num_original_columns = len(this_columns)

            if clear_buffer:
                self._clear_buffer()

            first_line: list[Scalar] | None
            if names is not None:
                # Read first row after header to check if data are longer
                try:
                    first_line = self._next_line()
                except StopIteration:
                    first_line = None

                len_first_data_row = 0 if first_line is None else len(first_line)

                if len(names) > len(columns[0]) and len(names) > len_first_data_row:
                    raise ValueError(
                        "Number of passed names did not match "
                        "number of header fields in the file"
                    )
                if len(columns) > 1:
                    raise TypeError("Cannot pass names with multi-index columns")

                if self.usecols is not None:
                    # Set _use_cols. We don't store columns because they are
                    # overwritten.
                    self._handle_usecols(columns, names, num_original_columns)
                else:
                    num_original_columns = len(names)
                if self._col_indices is not None and len(names) != len(
                    self._col_indices
                ):
                    columns = [[names[i] for i in sorted(self._col_indices)]]
                else:
                    columns = [names]
            else:
                columns = self._handle_usecols(
                    columns, columns[0], num_original_columns
                )
        else:
            ncols = len(self._header_line)
            num_original_columns = ncols

            if not names:
                columns = [list(range(ncols))]
                columns = self._handle_usecols(columns, columns[0], ncols)
            elif self.usecols is None or len(names) >= ncols:
                columns = self._handle_usecols([names], names, ncols)
                num_original_columns = len(names)
            elif not callable(self.usecols) and len(names) != len(self.usecols):
                raise ValueError(
                    "Number of passed names did not match number of "
                    "header fields in the file"
                )
            else:
                # Ignore output but set used columns.
                columns = [names]
                self._handle_usecols(columns, columns[0], ncols)

        return columns, num_original_columns, unnamed_cols

    @cache_readonly
    def _header_line(self):
        # Store line for reuse in _get_index_name
        if self.header is not None:
            return None

        try:
            line = self._buffered_line()
        except StopIteration as err:
            if not self.names:
                raise EmptyDataError("No columns to parse from file") from err

            line = self.names[:]
        return line

    def _handle_usecols(
        self,
        columns: list[list[Scalar | None]],
        usecols_key: list[Scalar | None],
        num_original_columns: int,
    ) -> list[list[Scalar | None]]:
        """
        Sets self._col_indices

        usecols_key is used if there are string usecols.
        """
        col_indices: set[int] | list[int]
        if self.usecols is not None:
            if callable(self.usecols):
                col_indices = self._evaluate_usecols(self.usecols, usecols_key)
            elif any(isinstance(u, str) for u in self.usecols):
                if len(columns) > 1:
                    raise ValueError(
                        "If using multiple headers, usecols must be integers."
                    )
                col_indices = []

                for col in self.usecols:
                    if isinstance(col, str):
                        try:
                            col_indices.append(usecols_key.index(col))
                        except ValueError:
                            self._validate_usecols_names(self.usecols, usecols_key)
                    else:
                        col_indices.append(col)
            else:
                missing_usecols = [
                    col for col in self.usecols if col >= num_original_columns
                ]
                if missing_usecols:
                    raise ParserError(
                        "Defining usecols with out-of-bounds indices is not allowed. "
                        f"{missing_usecols} are out-of-bounds.",
                    )
                col_indices = self.usecols

            columns = [
                [n for i, n in enumerate(column) if i in col_indices]
                for column in columns
            ]
            self._col_indices = sorted(col_indices)
        return columns
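
    # For illustration: with usecols={"b", "c"} and an inferred header row
    # ["a", "b", "c"], the names are resolved to positions, self._col_indices
    # becomes [1, 2], and the header is trimmed to [["b", "c"]].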

    def _buffered_line(self) -> list[Scalar]:
        """
        Return a line from buffer, filling buffer if required.
        """
        if len(self.buf) > 0:
            return self.buf[0]
        else:
            return self._next_line()

    def _check_for_bom(self, first_row: list[Scalar]) -> list[Scalar]:
        """
        Checks whether the file begins with the BOM character.
        If it does, remove it. In addition, if there is quoting
        in the field subsequent to the BOM, remove it as well
        because it technically takes place at the beginning of
        the name, not the middle of it.
        """
        # first_row will be a list, so we need to check
        # that that list is not empty before proceeding.
        if not first_row:
            return first_row

        # The first element of this row is the one that could have the
        # BOM that we want to remove. Check that the first element is a
        # string before proceeding.
        if not isinstance(first_row[0], str):
            return first_row

        # Check that the string is not empty, as that would
        # obviously not have a BOM at the start of it.
        if not first_row[0]:
            return first_row

        # Since the string is non-empty, check that it does
        # in fact begin with a BOM.
        first_elt = first_row[0][0]
        if first_elt != _BOM:
            return first_row

        first_row_bom = first_row[0]
        new_row: str

        if len(first_row_bom) > 1 and first_row_bom[1] == self.quotechar:
            start = 2
            quote = first_row_bom[1]
            end = first_row_bom[2:].index(quote) + 2

            # Extract the data between the quotation marks
            new_row = first_row_bom[start:end]

            # Extract any remaining data after the second
            # quotation mark.
            if len(first_row_bom) > end + 1:
                new_row += first_row_bom[end + 1 :]

        else:
            # No quotation so just remove BOM from first element
            new_row = first_row_bom[1:]

        new_row_list: list[Scalar] = [new_row]
        return new_row_list + first_row[1:]
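
    # For illustration: _check_for_bom(["\ufeffa", "b"]) returns ["a", "b"];
    # with quotechar '"', ['\ufeff"a"', "b"] also becomes ["a", "b"] because
    # the quoting that immediately followed the BOM is stripped as well.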

    def _is_line_empty(self, line: list[Scalar]) -> bool:
        """
        Check if a line is empty or not.

        Parameters
        ----------
        line : str, array-like
            The line of data to check.

        Returns
        -------
        boolean : Whether or not the line is empty.
        """
        return not line or all(not x for x in line)
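
    # For illustration: _is_line_empty([]) and _is_line_empty(["", ""]) are
    # both True, while _is_line_empty(["a", ""]) is False.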

    def _next_line(self) -> list[Scalar]:
        if isinstance(self.data, list):
            while self.skipfunc(self.pos):
                if self.pos >= len(self.data):
                    break
                self.pos += 1

            while True:
                try:
                    line = self._check_comments([self.data[self.pos]])[0]
                    self.pos += 1
                    # either uncommented or blank to begin with
                    if not self.skip_blank_lines and (
                        self._is_line_empty(self.data[self.pos - 1]) or line
                    ):
                        break
                    if self.skip_blank_lines:
                        ret = self._remove_empty_lines([line])
                        if ret:
                            line = ret[0]
                            break
                except IndexError:
                    raise StopIteration
        else:
            while self.skipfunc(self.pos):
                self.pos += 1
                # assert for mypy, data is Iterator[str] or None, would error in next
                assert self.data is not None
                next(self.data)

            while True:
                orig_line = self._next_iter_line(row_num=self.pos + 1)
                self.pos += 1

                if orig_line is not None:
                    line = self._check_comments([orig_line])[0]

                    if self.skip_blank_lines:
                        ret = self._remove_empty_lines([line])

                        if ret:
                            line = ret[0]
                            break
                    elif self._is_line_empty(orig_line) or line:
                        break

        # This was the first line of the file,
        # which could contain the BOM at the
        # beginning of it.
        if self.pos == 1:
            line = self._check_for_bom(line)

        self.line_pos += 1
        self.buf.append(line)
        return line

    def _alert_malformed(self, msg: str, row_num: int) -> None:
        """
        Alert a user about a malformed row, depending on value of
        `self.on_bad_lines` enum.

        If `self.on_bad_lines` is ERROR, a `ParserError` is raised.
        If `self.on_bad_lines` is WARN, a `ParserWarning` is emitted.

        Parameters
        ----------
        msg: str
            The error message to display.
        row_num: int
            The row number where the parsing error occurred.
            Because this row number is displayed, we 1-index,
            even though we 0-index internally.
        """
        if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
            raise ParserError(msg)
        if self.on_bad_lines == self.BadLineHandleMethod.WARN:
            warnings.warn(
                f"Skipping line {row_num}: {msg}\n",
                ParserWarning,
                stacklevel=find_stack_level(),
            )

    def _next_iter_line(self, row_num: int) -> list[Scalar] | None:
        """
        Wrapper around iterating through `self.data` (CSV source).

        When a CSV error is raised, we check for specific
        error messages that allow us to customize the
        error message displayed to the user.

        Parameters
        ----------
        row_num: int
            The row number of the line being parsed.
        """
        try:
            # assert for mypy, data is Iterator[str] or None, would error in next
            assert self.data is not None
            line = next(self.data)
            # for mypy
            assert isinstance(line, list)
            return line
        except csv.Error as e:
            if self.on_bad_lines in (
                self.BadLineHandleMethod.ERROR,
                self.BadLineHandleMethod.WARN,
            ):
                msg = str(e)

                if "NULL byte" in msg or "line contains NUL" in msg:
                    msg = (
                        "NULL byte detected. This byte "
                        "cannot be processed in Python's "
                        "native csv library at the moment, "
                        "so please pass in engine='c' instead"
                    )

                if self.skipfooter > 0:
                    reason = (
                        "Error could possibly be due to "
                        "parsing errors in the skipped footer rows "
                        "(the skipfooter keyword is only applied "
                        "after Python's csv library has parsed "
                        "all rows)."
                    )
                    msg += ". " + reason

                self._alert_malformed(msg, row_num)
            return None

    def _check_comments(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
        if self.comment is None:
            return lines
        ret = []
        for line in lines:
            rl = []
            for x in line:
                if (
                    not isinstance(x, str)
                    or self.comment not in x
                    or x in self.na_values
                ):
                    rl.append(x)
                else:
                    x = x[: x.find(self.comment)]
                    if len(x) > 0:
                        rl.append(x)
                    break
            ret.append(rl)
        return ret
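
    # For illustration: with comment="#", _check_comments([["1", "2#skip", "3"]])
    # returns [["1", "2"]] -- the field is truncated at the comment character
    # and every field after it on that line is dropped (fields that are
    # themselves NA values are kept whole).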

    def _remove_empty_lines(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
        """
        Iterate through the lines and remove any that are
        either empty or contain only one whitespace value

        Parameters
        ----------
        lines : list of list of Scalars
            The array of lines that we are to filter.

        Returns
        -------
        filtered_lines : list of list of Scalars
            The same array of lines with the "empty" ones removed.
        """
        # Remove empty lines and lines with only one whitespace value
        ret = [
            line
            for line in lines
            if (
                len(line) > 1
                or len(line) == 1
                and (not isinstance(line[0], str) or line[0].strip())
            )
        ]
        return ret
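
    # For illustration: _remove_empty_lines([["a", "b"], [""], ["   "], [5]])
    # returns [["a", "b"], [5]]; a single-field line survives only if the field
    # is a non-blank string or not a string at all.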

    def _check_thousands(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
        if self.thousands is None:
            return lines

        return self._search_replace_num_columns(
            lines=lines, search=self.thousands, replace=""
        )

    def _search_replace_num_columns(
        self, lines: list[list[Scalar]], search: str, replace: str
    ) -> list[list[Scalar]]:
        ret = []
        for line in lines:
            rl = []
            for i, x in enumerate(line):
                if (
                    not isinstance(x, str)
                    or search not in x
                    or i in self._no_thousands_columns
                    or not self.num.search(x.strip())
                ):
                    rl.append(x)
                else:
                    rl.append(x.replace(search, replace))
            ret.append(rl)
        return ret
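
    # For illustration: with thousands=",", _check_thousands([["1,234", "a,b"]])
    # yields [["1234", "a,b"]] -- "1,234" matches self.num and has its separator
    # stripped, while "a,b" does not look numeric and is left alone.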

    def _check_decimal(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
        if self.decimal == parser_defaults["decimal"]:
            return lines

        return self._search_replace_num_columns(
            lines=lines, search=self.decimal, replace="."
        )

    def _clear_buffer(self) -> None:
        self.buf = []

    def _get_index_name(
        self,
    ) -> tuple[Sequence[Hashable] | None, list[Hashable], list[Hashable]]:
        """
        Try several cases to get lines:

        0) There are headers on row 0 and row 1 and their
        total summed lengths equals the length of the next line.
        Treat row 0 as columns and row 1 as indices
        1) Look for implicit index: there are more columns
        on row 1 than row 0. If this is true, assume that row
        1 lists index columns and row 0 lists normal columns.
        2) Get index from the columns if it was listed.
        """
        columns: Sequence[Hashable] = self.orig_names
        orig_names = list(columns)
        columns = list(columns)

        line: list[Scalar] | None
        if self._header_line is not None:
            line = self._header_line
        else:
            try:
                line = self._next_line()
            except StopIteration:
                line = None

        next_line: list[Scalar] | None
        try:
            next_line = self._next_line()
        except StopIteration:
            next_line = None

        # implicitly index_col=0 b/c 1 fewer column names
        implicit_first_cols = 0
        if line is not None:
            # leave it 0, #2442
            # Case 1
            # error: Cannot determine type of 'index_col'
            index_col = self.index_col  # type: ignore[has-type]
            if index_col is not False:
                implicit_first_cols = len(line) - self.num_original_columns

            # Case 0
            if (
                next_line is not None
                and self.header is not None
                and index_col is not False
            ):
                if len(next_line) == len(line) + self.num_original_columns:
                    # column and index names on diff rows
                    self.index_col = list(range(len(line)))
                    self.buf = self.buf[1:]

                    for c in reversed(line):
                        columns.insert(0, c)

                    # Update list of original names to include all indices.
                    orig_names = list(columns)
                    self.num_original_columns = len(columns)
                    return line, orig_names, columns

        if implicit_first_cols > 0:
            # Case 1
            self._implicit_index = True
            if self.index_col is None:
                self.index_col = list(range(implicit_first_cols))

            index_name = None

        else:
            # Case 2
            (index_name, _, self.index_col) = self._clean_index_names(
                columns, self.index_col
            )

        return index_name, orig_names, columns
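
    # For illustration of Case 1: a file whose header row has one fewer field
    # than its data rows, e.g. header "a,b" followed by data row "x,1,2",
    # yields implicit_first_cols == 1, so the first, unnamed column becomes
    # the index (index_col = [0]).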

    def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
        col_len = self.num_original_columns

        if self._implicit_index:
            col_len += len(self.index_col)

        max_len = max(len(row) for row in content)

        # Check that there are no rows with too many
        # elements in their row (rows with too few
        # elements are padded with NaN).
        # error: Non-overlapping identity check (left operand type: "List[int]",
        # right operand type: "Literal[False]")
        if (
            max_len > col_len
            and self.index_col is not False  # type: ignore[comparison-overlap]
            and self.usecols is None
        ):
            footers = self.skipfooter if self.skipfooter else 0
            bad_lines = []

            iter_content = enumerate(content)
            content_len = len(content)
            content = []

            for i, _content in iter_content:
                actual_len = len(_content)

                if actual_len > col_len:
                    if callable(self.on_bad_lines):
                        new_l = self.on_bad_lines(_content)
                        if new_l is not None:
                            content.append(new_l)
                    elif self.on_bad_lines in (
                        self.BadLineHandleMethod.ERROR,
                        self.BadLineHandleMethod.WARN,
                    ):
                        row_num = self.pos - (content_len - i + footers)
                        bad_lines.append((row_num, actual_len))

                        if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
                            break
                else:
                    content.append(_content)

            for row_num, actual_len in bad_lines:
                msg = (
                    f"Expected {col_len} fields in line {row_num + 1}, saw "
                    f"{actual_len}"
                )
                if (
                    self.delimiter
                    and len(self.delimiter) > 1
                    and self.quoting != csv.QUOTE_NONE
                ):
                    # see gh-13374
                    reason = (
                        "Error could possibly be due to quotes being "
                        "ignored when a multi-char delimiter is used."
                    )
                    msg += ". " + reason

                self._alert_malformed(msg, row_num + 1)

        # see gh-13320
        zipped_content = list(lib.to_object_array(content, min_width=col_len).T)

        if self.usecols:
            assert self._col_indices is not None
            col_indices = self._col_indices

            if self._implicit_index:
                zipped_content = [
                    a
                    for i, a in enumerate(zipped_content)
                    if (
                        i < len(self.index_col)
                        or i - len(self.index_col) in col_indices
                    )
                ]
            else:
                zipped_content = [
                    a for i, a in enumerate(zipped_content) if i in col_indices
                ]
        return zipped_content

    def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]:
        lines = self.buf
        new_rows = None

        # already fetched some number
        if rows is not None:
            # we already have the lines in the buffer
            if len(self.buf) >= rows:
                new_rows, self.buf = self.buf[:rows], self.buf[rows:]

            # need some lines
            else:
                rows -= len(self.buf)

        if new_rows is None:
            if isinstance(self.data, list):
                if self.pos > len(self.data):
                    raise StopIteration
                if rows is None:
                    new_rows = self.data[self.pos :]
                    new_pos = len(self.data)
                else:
                    new_rows = self.data[self.pos : self.pos + rows]
                    new_pos = self.pos + rows

                new_rows = self._remove_skipped_rows(new_rows)
                lines.extend(new_rows)
                self.pos = new_pos

            else:
                new_rows = []
                try:
                    if rows is not None:
                        row_index = 0
                        row_ct = 0
                        offset = self.pos if self.pos is not None else 0
                        while row_ct < rows:
                            # assert for mypy, data is Iterator[str] or None, would
                            # error in next
                            assert self.data is not None
                            new_row = next(self.data)
                            if not self.skipfunc(offset + row_index):
                                row_ct += 1
                            row_index += 1
                            new_rows.append(new_row)

                        len_new_rows = len(new_rows)
                        new_rows = self._remove_skipped_rows(new_rows)
                        lines.extend(new_rows)
                    else:
                        rows = 0

                        while True:
                            next_row = self._next_iter_line(row_num=self.pos + rows + 1)
                            rows += 1

                            if next_row is not None:
                                new_rows.append(next_row)
                        len_new_rows = len(new_rows)

                except StopIteration:
                    len_new_rows = len(new_rows)
                    new_rows = self._remove_skipped_rows(new_rows)
                    lines.extend(new_rows)
                    if len(lines) == 0:
                        raise
                self.pos += len_new_rows

            self.buf = []
        else:
            lines = new_rows

        if self.skipfooter:
            lines = lines[: -self.skipfooter]

        lines = self._check_comments(lines)
        if self.skip_blank_lines:
            lines = self._remove_empty_lines(lines)
        lines = self._check_thousands(lines)
        return self._check_decimal(lines)

    def _remove_skipped_rows(self, new_rows: list[list[Scalar]]) -> list[list[Scalar]]:
        if self.skiprows:
            return [
                row for i, row in enumerate(new_rows) if not self.skipfunc(i + self.pos)
            ]
        return new_rows

    def _set_no_thousand_columns(self) -> set[int]:
        no_thousands_columns: set[int] = set()
        if self.columns and self.parse_dates:
            assert self._col_indices is not None
            no_thousands_columns = self._set_noconvert_dtype_columns(
                self._col_indices, self.columns
            )
        if self.columns and self.dtype:
            assert self._col_indices is not None
            for i, col in zip(self._col_indices, self.columns):
                if not isinstance(self.dtype, dict) and not is_numeric_dtype(
                    self.dtype
                ):
                    no_thousands_columns.add(i)
                if (
                    isinstance(self.dtype, dict)
                    and col in self.dtype
                    and (
                        not is_numeric_dtype(self.dtype[col])
                        or is_bool_dtype(self.dtype[col])
                    )
                ):
                    no_thousands_columns.add(i)
        return no_thousands_columns


class FixedWidthReader(abc.Iterator):
    """
    A reader of fixed-width lines.
    """

    def __init__(
        self,
        f: IO[str] | ReadCsvBuffer[str],
        colspecs: list[tuple[int, int]] | Literal["infer"],
        delimiter: str | None,
        comment: str | None,
        skiprows: set[int] | None = None,
        infer_nrows: int = 100,
    ) -> None:
        self.f = f
        self.buffer: Iterator | None = None
        self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t "
        self.comment = comment
        if colspecs == "infer":
            self.colspecs = self.detect_colspecs(
                infer_nrows=infer_nrows, skiprows=skiprows
            )
        else:
            self.colspecs = colspecs

        if not isinstance(self.colspecs, (tuple, list)):
            raise TypeError(
                "column specifications must be a list or tuple, "
                f"input was a {type(colspecs).__name__}"
            )

        for colspec in self.colspecs:
            if not (
                isinstance(colspec, (tuple, list))
                and len(colspec) == 2
                and isinstance(colspec[0], (int, np.integer, type(None)))
                and isinstance(colspec[1], (int, np.integer, type(None)))
            ):
                raise TypeError(
                    "Each column specification must be "
                    "2 element tuple or list of integers"
                )

    def get_rows(self, infer_nrows: int, skiprows: set[int] | None = None) -> list[str]:
        """
        Read rows from self.f, skipping as specified.

        We distinguish buffer_rows (the first <= infer_nrows
        lines) from the rows returned to detect_colspecs
        because it's simpler to leave the other locations
        with skiprows logic alone than to modify them to
        deal with the fact we skipped some rows here as
        well.

        Parameters
        ----------
        infer_nrows : int
            Number of rows to read from self.f, not counting
            rows that are skipped.
        skiprows: set, optional
            Indices of rows to skip.

        Returns
        -------
        detect_rows : list of str
            A list containing the rows to read.

        """
        if skiprows is None:
            skiprows = set()
        buffer_rows = []
        detect_rows = []
        for i, row in enumerate(self.f):
            if i not in skiprows:
                detect_rows.append(row)
            buffer_rows.append(row)
            if len(detect_rows) >= infer_nrows:
                break
        self.buffer = iter(buffer_rows)
        return detect_rows

    def detect_colspecs(
        self, infer_nrows: int = 100, skiprows: set[int] | None = None
    ) -> list[tuple[int, int]]:
        # Regex escape the delimiters
        delimiters = "".join([rf"\{x}" for x in self.delimiter])
        pattern = re.compile(f"([^{delimiters}]+)")
        rows = self.get_rows(infer_nrows, skiprows)
        if not rows:
            raise EmptyDataError("No rows from which to infer column width")
        max_len = max(map(len, rows))
        mask = np.zeros(max_len + 1, dtype=int)
        if self.comment is not None:
            rows = [row.partition(self.comment)[0] for row in rows]
        for row in rows:
            for m in pattern.finditer(row):
                mask[m.start() : m.end()] = 1
        shifted = np.roll(mask, 1)
        shifted[0] = 0
        edges = np.where((mask ^ shifted) == 1)[0]
        edge_pairs = list(zip(edges[::2], edges[1::2]))
        return edge_pairs
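
    # For illustration: for the single row "  123  abc", the non-delimiter runs
    # cover positions 2-4 and 7-9, so the 0/1 mask transitions at 2, 5, 7 and
    # 10, and detect_colspecs returns [(2, 5), (7, 10)] -- half-open intervals
    # that __next__ below uses to slice each line into fields.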

    def __next__(self) -> list[str]:
        # Argument 1 to "next" has incompatible type "Union[IO[str],
        # ReadCsvBuffer[str]]"; expected "SupportsNext[str]"
        if self.buffer is not None:
            try:
                line = next(self.buffer)
            except StopIteration:
                self.buffer = None
                line = next(self.f)  # type: ignore[arg-type]
        else:
            line = next(self.f)  # type: ignore[arg-type]
        # Note: 'colspecs' is a sequence of half-open intervals.
        return [line[from_:to].strip(self.delimiter) for (from_, to) in self.colspecs]


class FixedWidthFieldParser(PythonParser):
    """
    Specialization that converts fixed-width fields into DataFrames.
    See PythonParser for details.
    """

    def __init__(self, f: ReadCsvBuffer[str], **kwds) -> None:
        # Support iterators, convert to a list.
        self.colspecs = kwds.pop("colspecs")
        self.infer_nrows = kwds.pop("infer_nrows")
        PythonParser.__init__(self, f, **kwds)

    def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> FixedWidthReader:
        return FixedWidthReader(
            f,
            self.colspecs,
            self.delimiter,
            self.comment,
            self.skiprows,
            self.infer_nrows,
        )

    def _remove_empty_lines(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
        """
        Returns the list of lines without the empty ones. With fixed-width
        fields, empty lines become arrays of empty strings.

        See PythonParser._remove_empty_lines.
        """
        return [
            line
            for line in lines
            if any(not isinstance(e, str) or e.strip() for e in line)
        ]


def count_empty_vals(vals) -> int:
    return sum(1 for v in vals if v == "" or v is None)
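
# For illustration: count_empty_vals(["a", "", None, "b"]) == 2.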

def _validate_skipfooter_arg(skipfooter: int) -> int:
    """
    Validate the 'skipfooter' parameter.

    Checks whether 'skipfooter' is a non-negative integer.
    Raises a ValueError if that is not the case.

    Parameters
    ----------
    skipfooter : non-negative integer
        The number of rows to skip at the end of the file.

    Returns
    -------
    validated_skipfooter : non-negative integer
        The original input if the validation succeeds.

    Raises
    ------
    ValueError : 'skipfooter' was not a non-negative integer.
    """
    if not is_integer(skipfooter):
        raise ValueError("skipfooter must be an integer")

    if skipfooter < 0:
        raise ValueError("skipfooter cannot be negative")

    # Incompatible return value type (got "Union[int, integer[Any]]", expected "int")
    return skipfooter  # type: ignore[return-value]
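
# For illustration: _validate_skipfooter_arg(2) returns 2, while
# _validate_skipfooter_arg(-1) and _validate_skipfooter_arg("2") both
# raise ValueError.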