# -*- coding: utf-8 -*-
"""
This tokenizer has been copied from the ``tokenize.py`` standard library
tokenizer. The reason is simple: the standard library tokenizer fails
when the indentation is not right. To make error recovery possible, the
tokenizer needed to be rewritten.

Basically this is a stripped-down version of the standard library module,
so you can read the documentation there. Additionally, some speed and
memory optimizations are included here.
"""
from __future__ import absolute_import

import sys
import re
import itertools as _itertools
from codecs import BOM_UTF8
from typing import NamedTuple, Tuple, Iterator, Iterable, List, Dict, \
    Optional, Pattern, Set

from parso.python.token import PythonTokenTypes
from parso.utils import split_lines, PythonVersionInfo, parse_version_string


# Maximum code point of Unicode 6.0: 0x10ffff (1,114,111)
MAX_UNICODE = '\U0010ffff'

STRING = PythonTokenTypes.STRING
NAME = PythonTokenTypes.NAME
NUMBER = PythonTokenTypes.NUMBER
OP = PythonTokenTypes.OP
NEWLINE = PythonTokenTypes.NEWLINE
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ENDMARKER = PythonTokenTypes.ENDMARKER
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
FSTRING_START = PythonTokenTypes.FSTRING_START
FSTRING_STRING = PythonTokenTypes.FSTRING_STRING
FSTRING_END = PythonTokenTypes.FSTRING_END


class TokenCollection(NamedTuple):
    pseudo_token: Pattern
    single_quoted: Set[str]
    triple_quoted: Set[str]
    endpats: Dict[str, Pattern]
    whitespace: Pattern
    fstring_pattern_map: Dict[str, str]
    # Annotation fixed: _create_token_collection actually passes a set here.
    always_break_tokens: Set[str]


BOM_UTF8_STRING = BOM_UTF8.decode('utf-8')

_token_collection_cache: Dict[PythonVersionInfo, TokenCollection] = {}


def group(*choices, capture=False, **kwargs):
    assert not kwargs

    start = '('
    if not capture:
        start += '?:'
    return start + '|'.join(choices) + ')'


def maybe(*choices):
    return group(*choices) + '?'


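# A quick sketch of what the two regex helpers above produce (an
# illustrative comment, not part of the original module):
#
#   group('a', 'b')                -> '(?:a|b)'
#   group('a', 'b', capture=True)  -> '(a|b)'
#   maybe('x')                     -> '(?:x)?'

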
# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes(*, include_fstring=False, only_fstring=False):
    def different_case_versions(prefix):
        for s in _itertools.product(*[(c, c.upper()) for c in prefix]):
            yield ''.join(s)
    # The valid string prefixes. Only contain the lower case versions,
    #  and don't contain any permutations (include 'fr', but not
    #  'rf'). The various permutations will be generated.
    valid_string_prefixes = ['b', 'r', 'u', 'br']

    result = {''}
    if include_fstring:
        f = ['f', 'fr']
        if only_fstring:
            valid_string_prefixes = f
            result = set()
        else:
            valid_string_prefixes += f
    elif only_fstring:
        return set()

    # if we add binary f-strings, add: ['fb', 'fbr']
    for prefix in valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            #  character
            result.update(different_case_versions(t))
    return result


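# For example (an illustrative note, not from the original module):
#
#   _all_string_prefixes()
#   # -> {'', 'b', 'B', 'r', 'R', 'u', 'U', 'br', 'bR', 'Br', 'BR',
#   #     'rb', 'rB', 'Rb', 'RB'}
#   _all_string_prefixes(include_fstring=True, only_fstring=True)
#   # -> {'f', 'F', 'fr', 'fR', 'Fr', 'FR', 'rf', 'rF', 'Rf', 'RF'}

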
def _compile(expr):
    return re.compile(expr, re.UNICODE)


def _get_token_collection(version_info):
    try:
        return _token_collection_cache[tuple(version_info)]
    except KeyError:
        _token_collection_cache[tuple(version_info)] = result = \
            _create_token_collection(version_info)
        return result


unicode_character_name = r'[A-Za-z0-9\-]+(?: [A-Za-z0-9\-]+)*'
fstring_string_single_line = _compile(
    r'(?:\{\{|\}\}|\\N\{' + unicode_character_name
    + r'\}|\\(?:\r\n?|\n)|\\[^\r\nN]|[^{}\r\n\\])+'
)
fstring_string_multi_line = _compile(
    r'(?:\{\{|\}\}|\\N\{' + unicode_character_name + r'\}|\\[^N]|[^{}\\])+'
)
fstring_format_spec_single_line = _compile(r'(?:\\(?:\r\n?|\n)|[^{}\r\n])+')
fstring_format_spec_multi_line = _compile(r'[^{}]+')


def _create_token_collection(version_info):
    # Note: we use unicode matching for names ("\w") but ascii matching for
    # number literals.
    Whitespace = r'[ \f\t]*'
    whitespace = _compile(Whitespace)
    Comment = r'#[^\r\n]*'
    Name = '([A-Za-z_0-9\u0080-' + MAX_UNICODE + ']+)'

    Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
    Binnumber = r'0[bB](?:_?[01])+'
    Octnumber = r'0[oO](?:_?[0-7])+'
    Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
    Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
    Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
    Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                       r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
    Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
    Floatnumber = group(Pointfloat, Expfloat)
    Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
    Number = group(Imagnumber, Floatnumber, Intnumber)

    # Note that since _all_string_prefixes includes the empty string,
    #  StringPrefix can be the empty string (making it optional).
    possible_prefixes = _all_string_prefixes()
    StringPrefix = group(*possible_prefixes)
    StringPrefixWithF = group(*_all_string_prefixes(include_fstring=True))
    fstring_prefixes = _all_string_prefixes(include_fstring=True, only_fstring=True)
    FStringStart = group(*fstring_prefixes)

    # Tail end of ' string.
    Single = r"(?:\\.|[^'\\])*'"
    # Tail end of " string.
    Double = r'(?:\\.|[^"\\])*"'
    # Tail end of ''' string.
    Single3 = r"(?:\\.|'(?!'')|[^'\\])*'''"
    # Tail end of """ string.
    Double3 = r'(?:\\.|"(?!"")|[^"\\])*"""'
    Triple = group(StringPrefixWithF + "'''", StringPrefixWithF + '"""')

    # Because of leftmost-then-longest match semantics, be sure to put the
    # longest operators first (e.g., if = came before ==, == would get
    # recognized as two instances of =).
    Operator = group(r"\*\*=?", r">>=?", r"<<=?",
                     r"//=?", r"->",
                     r"[+\-*/%&@`|^!=<>]=?",
                     r"~")

    Bracket = '[][(){}]'

    special_args = [r'\.\.\.', r'\r\n?', r'\n', r'[;.,@]']
    if version_info >= (3, 8):
        special_args.insert(0, ":=?")
    else:
        special_args.insert(0, ":")
    Special = group(*special_args)

    Funny = group(Operator, Bracket, Special)

    # First (or only) line of ' or " string.
    ContStr = group(StringPrefix + r"'[^\r\n'\\]*(?:\\.[^\r\n'\\]*)*"
                    + group("'", r'\\(?:\r\n?|\n)'),
                    StringPrefix + r'"[^\r\n"\\]*(?:\\.[^\r\n"\\]*)*'
                    + group('"', r'\\(?:\r\n?|\n)'))
    pseudo_extra_pool = [Comment, Triple]
    all_quotes = '"', "'", '"""', "'''"
    if fstring_prefixes:
        pseudo_extra_pool.append(FStringStart + group(*all_quotes))

    PseudoExtras = group(r'\\(?:\r\n?|\n)|\Z', *pseudo_extra_pool)
    PseudoToken = group(Whitespace, capture=True) + \
        group(PseudoExtras, Number, Funny, ContStr, Name, capture=True)

    # For a given string prefix plus quotes, endpats maps it to a regex
    #  to match the remainder of that string. _prefix can be empty, for
    #  a normal single or triple quoted string (with no prefix).
    endpats = {}
    for _prefix in possible_prefixes:
        endpats[_prefix + "'"] = _compile(Single)
        endpats[_prefix + '"'] = _compile(Double)
        endpats[_prefix + "'''"] = _compile(Single3)
        endpats[_prefix + '"""'] = _compile(Double3)

    # A set of all of the single and triple quoted string prefixes,
    #  including the opening quotes.
    single_quoted = set()
    triple_quoted = set()
    fstring_pattern_map = {}
    for t in possible_prefixes:
        for quote in '"', "'":
            single_quoted.add(t + quote)

        for quote in '"""', "'''":
            triple_quoted.add(t + quote)

    for t in fstring_prefixes:
        for quote in all_quotes:
            fstring_pattern_map[t + quote] = quote

    ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except',
                           'finally', 'while', 'with', 'return', 'continue',
                           'break', 'del', 'pass', 'global', 'assert', 'nonlocal')
    pseudo_token_compiled = _compile(PseudoToken)
    return TokenCollection(
        pseudo_token_compiled, single_quoted, triple_quoted, endpats,
        whitespace, fstring_pattern_map, set(ALWAYS_BREAK_TOKENS)
    )


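# A sketch of how the compiled pseudo-token regex is grouped (an illustrative
# note, not part of the original module): group(1) captures the leading
# whitespace that becomes the token's prefix, group(2) the token itself, and
# group(3) is the inner capture inside ``Name``, which is why the main loop
# below can test ``pseudomatch.group(3)`` to recognize names.
#
#   collection = _get_token_collection(parse_version_string('3.8'))
#   m = collection.pseudo_token.match('    foo')
#   m.group(1)  # -> '    '
#   m.group(2)  # -> 'foo'
#   m.group(3)  # -> 'foo' (None for non-name tokens)

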
class Token(NamedTuple):
    type: PythonTokenTypes
    string: str
    start_pos: Tuple[int, int]
    prefix: str

    @property
    def end_pos(self) -> Tuple[int, int]:
        lines = split_lines(self.string)
        if len(lines) > 1:
            return self.start_pos[0] + len(lines) - 1, 0
        else:
            return self.start_pos[0], self.start_pos[1] + len(self.string)


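# For example (illustrative): a single-line token ends on its own line, while
# a token whose string spans lines ends at column 0 of the last line:
#
#   Token(NAME, 'foo', (1, 4), '').end_pos    # -> (1, 7)
#   Token(NEWLINE, '\n', (1, 7), '').end_pos  # -> (2, 0)

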
class PythonToken(Token):
    def __repr__(self):
        return ('TokenInfo(type=%s, string=%r, start_pos=%r, prefix=%r)' %
                self._replace(type=self.type.name))


class FStringNode:
    def __init__(self, quote):
        self.quote = quote
        self.parentheses_count = 0
        self.previous_lines = ''
        self.last_string_start_pos = None
        # In the syntax there can be multiple format_spec's nested:
        # {x:{y:3}}
        self.format_spec_count = 0

    def open_parentheses(self, character):
        self.parentheses_count += 1

    def close_parentheses(self, character):
        self.parentheses_count -= 1
        if self.parentheses_count == 0:
            # No parentheses means that the format spec is also finished.
            self.format_spec_count = 0

    def allow_multiline(self):
        return len(self.quote) == 3

    def is_in_expr(self):
        return self.parentheses_count > self.format_spec_count

    def is_in_format_spec(self):
        return not self.is_in_expr() and self.format_spec_count


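# To illustrate the two counters with the nested case ``f"{x:{y:3}}"`` (a
# sketch, not from the original module): after ``{x`` we have
# parentheses_count == 1 and format_spec_count == 0, so is_in_expr() is True;
# the first ``:`` bumps format_spec_count to 1, so is_in_format_spec() holds;
# the inner ``{y`` raises parentheses_count to 2, switching back to
# expression mode until the inner ``:`` raises format_spec_count to 2.

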
def _close_fstring_if_necessary(fstring_stack, string, line_nr, column, additional_prefix):
    for fstring_stack_index, node in enumerate(fstring_stack):
        lstripped_string = string.lstrip()
        len_lstrip = len(string) - len(lstripped_string)
        if lstripped_string.startswith(node.quote):
            token = PythonToken(
                FSTRING_END,
                node.quote,
                (line_nr, column + len_lstrip),
                prefix=additional_prefix+string[:len_lstrip],
            )
            additional_prefix = ''
            assert not node.previous_lines
            del fstring_stack[fstring_stack_index:]
            return token, '', len(node.quote) + len_lstrip
    return None, additional_prefix, 0


def _find_fstring_string(endpats, fstring_stack, line, lnum, pos):
    tos = fstring_stack[-1]
    allow_multiline = tos.allow_multiline()
    if tos.is_in_format_spec():
        if allow_multiline:
            regex = fstring_format_spec_multi_line
        else:
            regex = fstring_format_spec_single_line
    else:
        if allow_multiline:
            regex = fstring_string_multi_line
        else:
            regex = fstring_string_single_line

    match = regex.match(line, pos)
    if match is None:
        return tos.previous_lines, pos

    if not tos.previous_lines:
        tos.last_string_start_pos = (lnum, pos)

    string = match.group(0)
    for fstring_stack_node in fstring_stack:
        end_match = endpats[fstring_stack_node.quote].match(string)
        if end_match is not None:
            string = end_match.group(0)[:-len(fstring_stack_node.quote)]

    new_pos = pos
    new_pos += len(string)
    # Even if allow_multiline is False, we still need to check for trailing
    # newlines, because a single-line f-string can contain line continuations.
    if string.endswith('\n') or string.endswith('\r'):
        tos.previous_lines += string
        string = ''
    else:
        string = tos.previous_lines + string

    return string, new_pos


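# Illustrative example: tokenizing ``f'ab{x}cd'`` yields FSTRING_START "f'",
# FSTRING_STRING 'ab', OP '{', NAME 'x', OP '}', FSTRING_STRING 'cd' and
# FSTRING_END "'". _find_fstring_string matches the 'ab' and 'cd' chunks,
# while _close_fstring_if_necessary recognizes the closing quote.

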
def tokenize(
    code: str, *, version_info: PythonVersionInfo, start_pos: Tuple[int, int] = (1, 0)
) -> Iterator[PythonToken]:
    """Generate tokens from the source code (string)."""
    lines = split_lines(code, keepends=True)
    return tokenize_lines(lines, version_info=version_info, start_pos=start_pos)


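# Usage sketch (illustrative; the exact stream is easy to verify by running
# this module on a file, see the ``__main__`` block at the bottom):
#
#   tokens = list(tokenize('x = 1\n',
#                          version_info=parse_version_string('3.8')))
#   # TokenInfo(type=NAME, string='x', start_pos=(1, 0), prefix='')
#   # TokenInfo(type=OP, string='=', start_pos=(1, 2), prefix=' ')
#   # TokenInfo(type=NUMBER, string='1', start_pos=(1, 4), prefix=' ')
#   # TokenInfo(type=NEWLINE, string='\n', start_pos=(1, 5), prefix='')
#   # TokenInfo(type=ENDMARKER, string='', start_pos=(2, 0), prefix='')

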
def _print_tokens(func):
    """
    A small helper to debug the ``tokenize_lines`` function.
    """
    def wrapper(*args, **kwargs):
        for token in func(*args, **kwargs):
            print(token)  # This print is intentional for debugging!
            yield token

    return wrapper


# @_print_tokens
def tokenize_lines(
    lines: Iterable[str],
    *,
    version_info: PythonVersionInfo,
    indents: Optional[List[int]] = None,
    start_pos: Tuple[int, int] = (1, 0),
    is_first_token=True,
) -> Iterator[PythonToken]:
    """
    A heavily modified Python standard library tokenizer.

    In addition to the default information, this also yields the prefix of
    each token. This idea comes from lib2to3. The prefix contains all the
    information that is irrelevant to the parser, like newlines in
    parentheses or comments.
    """
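    # For example (illustrative): in ``x  = 1  # c\n`` the ``=`` token has
    # prefix '  ' and the NEWLINE token has prefix '  # c', so the original
    # source can be reconstructed exactly by concatenating ``prefix +
    # string`` over all tokens.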
    def dedent_if_necessary(start):
        while start < indents[-1]:
            if start > indents[-2]:
                yield PythonToken(ERROR_DEDENT, '', (lnum, start), '')
                indents[-1] = start
                break
            indents.pop()
            yield PythonToken(DEDENT, '', spos, '')

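    # Error recovery sketch (illustrative): with indents == [0, 4], a line
    # starting at column 2 matches no known indentation level, so
    # dedent_if_necessary yields an ERROR_DEDENT token and adjusts the level
    # to 2, while a line at column 0 simply yields a DEDENT. This is the
    # recovery behavior the module docstring refers to.
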
    pseudo_token, single_quoted, triple_quoted, endpats, whitespace, \
        fstring_pattern_map, always_break_tokens, = \
        _get_token_collection(version_info)
    paren_level = 0  # count parentheses
    if indents is None:
        indents = [0]
    max_ = 0
    numchars = '0123456789'
    contstr = ''
    contline: str
    contstr_start: Tuple[int, int]
    endprog: Pattern
    # We start with a newline. This makes indent at the first position
    # possible. It's not valid Python, but still better than an INDENT in the
    # second line (and not in the first). This makes quite a few things in
    # Jedi's fast parser possible.
    new_line = True
    prefix = ''  # Should never be required, but here for safety
    additional_prefix = ''
    lnum = start_pos[0] - 1
    fstring_stack: List[FStringNode] = []
    for line in lines:  # loop over lines in stream
        lnum += 1
        pos = 0
        max_ = len(line)
        if is_first_token:
            if line.startswith(BOM_UTF8_STRING):
                additional_prefix = BOM_UTF8_STRING
                line = line[1:]
                max_ = len(line)

            # Fake that the part before was already parsed.
            line = '^' * start_pos[1] + line
            pos = start_pos[1]
            max_ += start_pos[1]

            is_first_token = False

        if contstr:                                         # continued string
            endmatch = endprog.match(line)  # noqa: F821
            if endmatch:
                pos = endmatch.end(0)
                yield PythonToken(
                    STRING, contstr + line[:pos],
                    contstr_start, prefix)  # noqa: F821
                contstr = ''
                contline = ''
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        while pos < max_:
            if fstring_stack:
                tos = fstring_stack[-1]
                if not tos.is_in_expr():
                    string, pos = _find_fstring_string(endpats, fstring_stack, line, lnum, pos)
                    if string:
                        yield PythonToken(
                            FSTRING_STRING, string,
                            tos.last_string_start_pos,
                            # Never has a prefix because it can start anywhere and
                            # include whitespace.
                            prefix=''
                        )
                        tos.previous_lines = ''
                        continue
                    if pos == max_:
                        break

                rest = line[pos:]
                fstring_end_token, additional_prefix, quote_length = _close_fstring_if_necessary(
                    fstring_stack,
                    rest,
                    lnum,
                    pos,
                    additional_prefix,
                )
                pos += quote_length
                if fstring_end_token is not None:
                    yield fstring_end_token
                    continue

            # In an f-string, match until the end of the string.
            if fstring_stack:
                string_line = line
                for fstring_stack_node in fstring_stack:
                    quote = fstring_stack_node.quote
                    end_match = endpats[quote].match(line, pos)
                    if end_match is not None:
                        end_match_string = end_match.group(0)
                        if len(end_match_string) - len(quote) + pos < len(string_line):
                            string_line = line[:pos] + end_match_string[:-len(quote)]
                pseudomatch = pseudo_token.match(string_line, pos)
            else:
                pseudomatch = pseudo_token.match(line, pos)

            if pseudomatch:
                prefix = additional_prefix + pseudomatch.group(1)
                additional_prefix = ''
                start, pos = pseudomatch.span(2)
                spos = (lnum, start)
                token = pseudomatch.group(2)
                if token == '':
                    assert prefix
                    additional_prefix = prefix
                    # This means that we have a line with whitespace/comments at
                    # the end, which just results in an endmarker.
                    break
                initial = token[0]
            else:
                match = whitespace.match(line, pos)
                initial = line[match.end()]
                start = match.end()
                spos = (lnum, start)

            if new_line and initial not in '\r\n#' and (initial != '\\' or pseudomatch is None):
                new_line = False
                if paren_level == 0 and not fstring_stack:
                    indent_start = start
                    if indent_start > indents[-1]:
                        yield PythonToken(INDENT, '', spos, '')
                        indents.append(indent_start)
                    yield from dedent_if_necessary(indent_start)

            if not pseudomatch:  # scan for tokens
                match = whitespace.match(line, pos)
                if new_line and paren_level == 0 and not fstring_stack:
                    yield from dedent_if_necessary(match.end())
                pos = match.end()
                new_line = False
                yield PythonToken(
                    ERRORTOKEN, line[pos], (lnum, pos),
                    additional_prefix + match.group(0)
                )
                additional_prefix = ''
                pos += 1
                continue

            if (initial in numchars                      # ordinary number
                    or (initial == '.' and token != '.' and token != '...')):
                yield PythonToken(NUMBER, token, spos, prefix)
            elif pseudomatch.group(3) is not None:            # ordinary name
                if token in always_break_tokens and (fstring_stack or paren_level):
                    fstring_stack[:] = []
                    paren_level = 0
                    # We only want to dedent if the token is on a new line.
                    m = re.match(r'[ \f\t]*$', line[:start])
                    if m is not None:
                        yield from dedent_if_necessary(m.end())
                if token.isidentifier():
                    yield PythonToken(NAME, token, spos, prefix)
                else:
                    yield from _split_illegal_unicode_name(token, spos, prefix)
            elif initial in '\r\n':
                if any(not f.allow_multiline() for f in fstring_stack):
                    fstring_stack.clear()

                if not new_line and paren_level == 0 and not fstring_stack:
                    yield PythonToken(NEWLINE, token, spos, prefix)
                else:
                    additional_prefix = prefix + token
                new_line = True
            elif initial == '#':  # Comments
                assert not token.endswith("\n") and not token.endswith("\r")
                if fstring_stack and fstring_stack[-1].is_in_expr():
                    # `#` is not allowed in f-string expressions
                    yield PythonToken(ERRORTOKEN, initial, spos, prefix)
                    pos = start + 1
                else:
                    additional_prefix = prefix + token
            elif token in triple_quoted:
                endprog = endpats[token]
                endmatch = endprog.match(line, pos)
                if endmatch:                                # all on one line
                    pos = endmatch.end(0)
                    token = line[start:pos]
                    yield PythonToken(STRING, token, spos, prefix)
                else:
                    contstr_start = spos                    # multiple lines
                    contstr = line[start:]
                    contline = line
                    break

            # Check up to the first 3 chars of the token to see if
            #  they're in the single_quoted set. If so, they start
            #  a string.
            # We're using the first 3, because we're looking for
            #  "rb'" (for example) at the start of the token. If
            #  we switch to longer prefixes, this needs to be
            #  adjusted.
            # Note that initial == token[:1].
            # Also note that single quote checking must come after
            #  triple quote checking (above).
            elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                if token[-1] in '\r\n':                       # continued string
                    # This means that a single quoted string ends with a
                    # backslash and is continued.
                    contstr_start = lnum, start
                    endprog = (endpats.get(initial) or endpats.get(token[1])
                               or endpats.get(token[2]))
                    contstr = line[start:]
                    contline = line
                    break
                else:                                       # ordinary string
                    yield PythonToken(STRING, token, spos, prefix)
            elif token in fstring_pattern_map:  # The start of an fstring.
                fstring_stack.append(FStringNode(fstring_pattern_map[token]))
                yield PythonToken(FSTRING_START, token, spos, prefix)
            elif initial == '\\' and line[start:] in ('\\\n', '\\\r\n', '\\\r'):  # continued stmt
                additional_prefix += prefix + line[start:]
                break
            else:
                if token in '([{':
                    if fstring_stack:
                        fstring_stack[-1].open_parentheses(token)
                    else:
                        paren_level += 1
                elif token in ')]}':
                    if fstring_stack:
                        fstring_stack[-1].close_parentheses(token)
                    else:
                        if paren_level:
                            paren_level -= 1
                elif token.startswith(':') and fstring_stack \
                        and fstring_stack[-1].parentheses_count \
                        - fstring_stack[-1].format_spec_count == 1:
                    # `:` and `:=` both count
                    fstring_stack[-1].format_spec_count += 1
                    token = ':'
                    pos = start + 1

                yield PythonToken(OP, token, spos, prefix)

    if contstr:
        yield PythonToken(ERRORTOKEN, contstr, contstr_start, prefix)
        if contstr.endswith('\n') or contstr.endswith('\r'):
            new_line = True

    if fstring_stack:
        tos = fstring_stack[-1]
        if tos.previous_lines:
            yield PythonToken(
                FSTRING_STRING, tos.previous_lines,
                tos.last_string_start_pos,
                # Never has a prefix because it can start anywhere and
                # include whitespace.
                prefix=''
            )

    end_pos = lnum, max_
    # As the last position we just take the maximum possible position (the
    # end of the last line).
    for indent in indents[1:]:
        indents.pop()
        yield PythonToken(DEDENT, '', end_pos, '')
    yield PythonToken(ENDMARKER, '', end_pos, additional_prefix)


def _split_illegal_unicode_name(token, start_pos, prefix):
    def create_token():
        return PythonToken(ERRORTOKEN if is_illegal else NAME, found, pos, prefix)

    found = ''
    is_illegal = False
    pos = start_pos
    for i, char in enumerate(token):
        if is_illegal:
            if char.isidentifier():
                yield create_token()
                found = char
                is_illegal = False
                prefix = ''
                pos = start_pos[0], start_pos[1] + i
            else:
                found += char
        else:
            new_found = found + char
            if new_found.isidentifier():
                found = new_found
            else:
                if found:
                    yield create_token()
                    prefix = ''
                    pos = start_pos[0], start_pos[1] + i
                found = char
                is_illegal = True

    if found:
        yield create_token()


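# Illustrative sketch: the Name regex accepts any character >= U+0080, so a
# token like 'abc€def' reaches this function even though '€' is not a valid
# identifier character; the splitter then yields NAME 'abc', ERRORTOKEN '€'
# and NAME 'def' with consecutive start positions.

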
if __name__ == "__main__":
    path = sys.argv[1]
    with open(path) as f:
        code = f.read()

    for token in tokenize(code, version_info=parse_version_string('3.10')):
        print(token)