"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens. It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF). It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators. Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
from builtins import open as _builtin_open
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
import itertools as _itertools
import re
import sys
from token import *
from token import EXACT_TOKEN_TYPES

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

import token
__all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
                           "untokenize", "TokenInfo"]
del token

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type
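
# Illustrative sketch of exact_type (hypothetical helper, not part of the
# module API): an operator token carries the generic OP type, while
# exact_type resolves it through EXACT_TOKEN_TYPES.
def _example_exact_type():
    tok = TokenInfo(OP, '(', (1, 0), (1, 1), '(1,)\n')
    assert tok.type == OP
    assert tok.exact_type == LPAR   # '(' maps to LPAR in EXACT_TOKEN_TYPES
    return tok.exact_type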
57
58def group(*choices): return '(' + '|'.join(choices) + ')'
59def any(*choices): return group(*choices) + '*'
60def maybe(*choices): return group(*choices) + '?'
61
62# Note: we use unicode matching for names ("\w") but ascii matching for
63# number literals.
64Whitespace = r'[ \f\t]*'
65Comment = r'#[^\r\n]*'
66Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
67Name = r'\w+'
68
69Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
70Binnumber = r'0[bB](?:_?[01])+'
71Octnumber = r'0[oO](?:_?[0-7])+'
72Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
73Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
74Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
75Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
76 r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
77Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
78Floatnumber = group(Pointfloat, Expfloat)
79Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
80Number = group(Imagnumber, Floatnumber, Intnumber)

# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
    # The valid string prefixes. Only contain the lower case versions,
    # and don't contain any permutations (include 'fr', but not
    # 'rf'). The various permutations will be generated.
    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
    # if we add binary f-strings, add: ['fb', 'fbr']
    result = {''}
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            # character
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result

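# Illustrative check (hypothetical helper, not part of the module API): the
# single lower-case entry 'br' above expands into every case/order
# combination of its characters, and the empty prefix is always included.
def _example_prefix_expansion():
    prefixes = _all_string_prefixes()
    assert '' in prefixes
    assert {'br', 'bR', 'Br', 'BR', 'rb', 'rB', 'Rb', 'RB'} <= prefixes
    return sorted(prefixes)
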
def _compile(expr):
    return re.compile(expr, re.UNICODE)

# Note that since _all_string_prefixes includes the empty string,
# StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Sorting in reverse order puts the long operators before their prefixes.
# Otherwise if = came before ==, == would get recognized as two instances
# of =.
Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
Funny = group(r'\r?\n', Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# For a given string prefix plus quotes, endpats maps it to a regex
# to match the remainder of that string. _prefix can be empty, for
# a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3

# A set of all of the single and triple quoted string prefixes,
# including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)
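
# Illustrative sketch (hypothetical helper, not part of the module API) of
# how the tables built above are keyed: endpats maps "prefix + opening
# quote" to the regex for the rest of that string, while single_quoted and
# triple_quoted record the valid string openers.
def _example_string_tables():
    assert endpats["'"] is Single and endpats['"""'] is Double3
    assert "rb'" in single_quoted and 'f"""' in triple_quoted
    return endpats["rb'"]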

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in _itertools.chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out
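
# A round-trip sketch of the "limited input" invariant documented above
# (hypothetical helper, not part of the module API): tokenize, keep only
# (type, string) pairs, untokenize, and tokenize the result again.
def _example_roundtrip(source_bytes=b"if x:\n    y = 1\n"):
    from io import BytesIO
    t1 = [tok[:2] for tok in tokenize(BytesIO(source_bytes).readline)]
    newcode = untokenize(t1)        # bytes, encoded per the ENCODING token
    t2 = [tok[:2] for tok in tokenize(BytesIO(newcode).readline)]
    assert t1 == t2
    return newcode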


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP-0263. If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, raise a SyntaxError. Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                                                             encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
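
# Illustrative sketch (hypothetical helper, not part of the module API):
# detecting an encoding cookie on the first line of an in-memory buffer.
def _example_detect_encoding():
    from io import BytesIO
    buf = BytesIO(b'# -*- coding: latin-1 -*-\nprint("hi")\n')
    encoding, lines = detect_encoding(buf.readline)
    assert encoding == 'iso-8859-1'     # 'latin-1' is normalized
    assert lines == [b'# -*- coding: latin-1 -*-\n']
    return encoding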


def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = _builtin_open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(buffer.readline)
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except:
        buffer.close()
        raise


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as bytes. Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    physical line.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    encoding, consumed = detect_encoding(readline)
    empty = _itertools.repeat(b"")
    rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
    return _tokenize(rl_gen.__next__, encoding)


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    last_line = b''
    line = b''
    while True:                                # loop over lines in stream
        try:
            # We capture the value of the line variable here because
            # readline uses the empty string '' to signal end of input,
            # hence `line` itself will always be overwritten at the end
            # of this loop.
            last_line = line
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    yield TokenInfo(COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    pos += len(comment_token)

                yield TokenInfo(NL, line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or                 # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    if parenlev > 0:
                        yield TokenInfo(NL, token, spos, epos, line)
                    else:
                        yield TokenInfo(NEWLINE, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)

                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break

                # Check up to the first 3 chars of the token to see if
                # they're in the single_quoted set. If so, they start
                # a string.
                # We're using the first 3, because we're looking for
                # "rb'" (for example) at the start of the token. If
                # we switch to longer prefixes, this needs to be
                # adjusted.
                # Note that initial == token[:1].
                # Also note that single quote checking must come after
                # triple quote checking (above).
                elif (initial in single_quoted or
                      token[:2] in single_quoted or
                      token[:3] in single_quoted):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Again, using the first 3 chars of the
                        # token. This is looking for the matching end
                        # regex for the correct type of quote
                        # character. So it's really looking for
                        # endpats["'"] or endpats['"'], by trying to
                        # skip string prefix characters, if any.
                        endprog = _compile(endpats.get(initial) or
                                           endpats.get(token[1]) or
                                           endpats.get(token[2]))
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)

                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    # Add an implicit NEWLINE if the input doesn't end in one
    if last_line and last_line[-1] not in '\r\n':
        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


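# Illustrative sketch (hypothetical helper, not part of the module API): when
# the input does not end in a newline, _tokenize appends an empty NEWLINE,
# any pending DEDENTs, and an ENDMARKER, so every stream ends consistently.
def _example_trailing_tokens():
    from io import BytesIO
    toks = list(tokenize(BytesIO(b"x = 1").readline))
    assert [t.type for t in toks[-2:]] == [NEWLINE, ENDMARKER]
    assert toks[-2].string == ''        # the synthesized NEWLINE is empty
    return toks

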
def generate_tokens(readline):
    """Tokenize a source reading Python code as unicode strings.

    This has the same API as tokenize(), except that it expects the *readline*
    callable to return str objects instead of bytes.
    """
    return _tokenize(readline, None)

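# Illustrative sketch (hypothetical helper, not part of the module API):
# generate_tokens() consumes str lines, so a StringIO readline works and no
# leading ENCODING token is produced.
def _example_generate_tokens():
    from io import StringIO
    toks = list(generate_tokens(StringIO("a + b\n").readline))
    assert toks[0].type == NAME         # no ENCODING token here
    return [(tok_name[t.type], t.string) for t in toks]
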
def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        sys.stderr.write(message)
        sys.stderr.write('\n')

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with _builtin_open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()