[Python-checkins] Add tests for the C tokenizer and expose it as a private module (GH-27924)

pablogsal webhook-mailer at python.org
Tue Aug 24 12:50:10 EDT 2021


https://github.com/python/cpython/commit/a24676bedcd332dd7e6fa5521d0449206391d190
commit: a24676bedcd332dd7e6fa5521d0449206391d190
branch: main
author: Pablo Galindo Salgado <Pablogsal at gmail.com>
committer: pablogsal <Pablogsal at gmail.com>
date: 2021-08-24T17:50:05+01:00
summary:

Add tests for the C tokenizer and expose it as a private module (GH-27924)

files:
A Python/Python-tokenize.c
A Python/clinic/Python-tokenize.c.h
M Lib/test/test_tokenize.py
M Lib/tokenize.py
M Makefile.pre.in
M Modules/config.c.in
M PC/config.c
M PCbuild/pythoncore.vcxproj
M Python/stdlib_module_names.h
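
For reference, a minimal usage sketch of the private helper introduced below in
Lib/tokenize.py (names are taken from the diff; the module is private and the
interface may change):

    from tokenize import _generate_tokens_from_c_tokenizer

    # Each item is a tokenize.TokenInfo namedtuple (type, string, start, end, line),
    # built in Lib/tokenize.py from the tuples yielded by _tokenize.TokenizerIter.
    for tok in _generate_tokens_from_c_tokenizer("x = 3.14159\n"):
        print(tok)

The new CTokenizeTest class in Lib/test/test_tokenize.py exercises this helper
through its check_tokenize() method.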

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 4bce1ca9c76f7c..f8b16e52976451 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -3,7 +3,7 @@
 from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open, Untokenizer, generate_tokens,
-                     NEWLINE)
+                     NEWLINE, _generate_tokens_from_c_tokenizer)
 from io import BytesIO, StringIO
 import unittest
 from unittest import TestCase, mock
@@ -12,7 +12,6 @@
 import os
 import token
 
-
 # Converts a source string into a list of textual representation
 # of the tokens such as:
 # `    NAME       'if'          (1, 0) (1, 2)`
@@ -1654,5 +1653,865 @@ def test_indentation_semantics_retained(self):
         self.check_roundtrip(code)
 
 
+class CTokenizeTest(TestCase):
+    def check_tokenize(self, s, expected):
+        # Format the tokens in s in a table format.
+        # The ENDMARKER and final NEWLINE are omitted.
+        with self.subTest(source=s):
+            result = stringify_tokens_from_source(
+                _generate_tokens_from_c_tokenizer(s), s
+            )
+            self.assertEqual(result, expected.rstrip().splitlines())
+
+    def test_int(self):
+
+        self.check_tokenize('0xff <= 255', """\
+    NUMBER     '0xff'        (1, 0) (1, 4)
+    LESSEQUAL  '<='          (1, 5) (1, 7)
+    NUMBER     '255'         (1, 8) (1, 11)
+    """)
+
+        self.check_tokenize('0b10 <= 255', """\
+    NUMBER     '0b10'        (1, 0) (1, 4)
+    LESSEQUAL  '<='          (1, 5) (1, 7)
+    NUMBER     '255'         (1, 8) (1, 11)
+    """)
+
+        self.check_tokenize('0o123 <= 0O123', """\
+    NUMBER     '0o123'       (1, 0) (1, 5)
+    LESSEQUAL  '<='          (1, 6) (1, 8)
+    NUMBER     '0O123'       (1, 9) (1, 14)
+    """)
+
+        self.check_tokenize('1234567 > ~0x15', """\
+    NUMBER     '1234567'     (1, 0) (1, 7)
+    GREATER    '>'           (1, 8) (1, 9)
+    TILDE      '~'           (1, 10) (1, 11)
+    NUMBER     '0x15'        (1, 11) (1, 15)
+    """)
+
+        self.check_tokenize('2134568 != 1231515', """\
+    NUMBER     '2134568'     (1, 0) (1, 7)
+    NOTEQUAL   '!='          (1, 8) (1, 10)
+    NUMBER     '1231515'     (1, 11) (1, 18)
+    """)
+
+        self.check_tokenize('(-124561-1) & 200000000', """\
+    LPAR       '('           (1, 0) (1, 1)
+    MINUS      '-'           (1, 1) (1, 2)
+    NUMBER     '124561'      (1, 2) (1, 8)
+    MINUS      '-'           (1, 8) (1, 9)
+    NUMBER     '1'           (1, 9) (1, 10)
+    RPAR       ')'           (1, 10) (1, 11)
+    AMPER      '&'           (1, 12) (1, 13)
+    NUMBER     '200000000'   (1, 14) (1, 23)
+    """)
+
+        self.check_tokenize('0xdeadbeef != -1', """\
+    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
+    NOTEQUAL   '!='          (1, 11) (1, 13)
+    MINUS      '-'           (1, 14) (1, 15)
+    NUMBER     '1'           (1, 15) (1, 16)
+    """)
+
+        self.check_tokenize('0xdeadc0de & 12345', """\
+    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
+    AMPER      '&'           (1, 11) (1, 12)
+    NUMBER     '12345'       (1, 13) (1, 18)
+    """)
+
+        self.check_tokenize('0xFF & 0x15 | 1234', """\
+    NUMBER     '0xFF'        (1, 0) (1, 4)
+    AMPER      '&'           (1, 5) (1, 6)
+    NUMBER     '0x15'        (1, 7) (1, 11)
+    VBAR       '|'           (1, 12) (1, 13)
+    NUMBER     '1234'        (1, 14) (1, 18)
+    """)
+
+    def test_float(self):
+
+        self.check_tokenize('x = 3.14159', """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    NUMBER     '3.14159'     (1, 4) (1, 11)
+    """)
+
+        self.check_tokenize('x = 314159.', """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    NUMBER     '314159.'     (1, 4) (1, 11)
+    """)
+
+        self.check_tokenize('x = .314159', """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    NUMBER     '.314159'     (1, 4) (1, 11)
+    """)
+
+        self.check_tokenize('x = 3e14159', """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    NUMBER     '3e14159'     (1, 4) (1, 11)
+    """)
+
+        self.check_tokenize('x = 3E123', """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    NUMBER     '3E123'       (1, 4) (1, 9)
+    """)
+
+        self.check_tokenize('x+y = 3e-1230', """\
+    NAME       'x'           (1, 0) (1, 1)
+    PLUS       '+'           (1, 1) (1, 2)
+    NAME       'y'           (1, 2) (1, 3)
+    EQUAL      '='           (1, 4) (1, 5)
+    NUMBER     '3e-1230'     (1, 6) (1, 13)
+    """)
+
+        self.check_tokenize('x = 3.14e159', """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    NUMBER     '3.14e159'    (1, 4) (1, 12)
+    """)
+
+    def test_string(self):
+
+        self.check_tokenize('x = \'\'; y = ""', """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    STRING     "''"          (1, 4) (1, 6)
+    SEMI       ';'           (1, 6) (1, 7)
+    NAME       'y'           (1, 8) (1, 9)
+    EQUAL      '='           (1, 10) (1, 11)
+    STRING     '""'          (1, 12) (1, 14)
+    """)
+
+        self.check_tokenize('x = \'"\'; y = "\'"', """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    STRING     '\\'"\\''       (1, 4) (1, 7)
+    SEMI       ';'           (1, 7) (1, 8)
+    NAME       'y'           (1, 9) (1, 10)
+    EQUAL      '='           (1, 11) (1, 12)
+    STRING     '"\\'"'        (1, 13) (1, 16)
+    """)
+
+        self.check_tokenize('x = "doesn\'t "shrink", does it"', """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    STRING     '"doesn\\'t "' (1, 4) (1, 14)
+    NAME       'shrink'      (1, 14) (1, 20)
+    STRING     '", does it"' (1, 20) (1, 31)
+    """)
+
+        self.check_tokenize("x = 'abc' + 'ABC'", """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    STRING     "'abc'"       (1, 4) (1, 9)
+    PLUS       '+'           (1, 10) (1, 11)
+    STRING     "'ABC'"       (1, 12) (1, 17)
+    """)
+
+        self.check_tokenize('y = "ABC" + "ABC"', """\
+    NAME       'y'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    STRING     '"ABC"'       (1, 4) (1, 9)
+    PLUS       '+'           (1, 10) (1, 11)
+    STRING     '"ABC"'       (1, 12) (1, 17)
+    """)
+
+        self.check_tokenize("x = r'abc' + r'ABC' + R'ABC' + R'ABC'", """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    STRING     "r'abc'"      (1, 4) (1, 10)
+    PLUS       '+'           (1, 11) (1, 12)
+    STRING     "r'ABC'"      (1, 13) (1, 19)
+    PLUS       '+'           (1, 20) (1, 21)
+    STRING     "R'ABC'"      (1, 22) (1, 28)
+    PLUS       '+'           (1, 29) (1, 30)
+    STRING     "R'ABC'"      (1, 31) (1, 37)
+    """)
+
+        self.check_tokenize('y = r"abc" + r"ABC" + R"ABC" + R"ABC"', """\
+    NAME       'y'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    STRING     'r"abc"'      (1, 4) (1, 10)
+    PLUS       '+'           (1, 11) (1, 12)
+    STRING     'r"ABC"'      (1, 13) (1, 19)
+    PLUS       '+'           (1, 20) (1, 21)
+    STRING     'R"ABC"'      (1, 22) (1, 28)
+    PLUS       '+'           (1, 29) (1, 30)
+    STRING     'R"ABC"'      (1, 31) (1, 37)
+    """)
+
+        self.check_tokenize("u'abc' + U'abc'", """\
+    STRING     "u'abc'"      (1, 0) (1, 6)
+    PLUS       '+'           (1, 7) (1, 8)
+    STRING     "U'abc'"      (1, 9) (1, 15)
+    """)
+
+        self.check_tokenize('u"abc" + U"abc"', """\
+    STRING     'u"abc"'      (1, 0) (1, 6)
+    PLUS       '+'           (1, 7) (1, 8)
+    STRING     'U"abc"'      (1, 9) (1, 15)
+    """)
+
+        self.check_tokenize("b'abc' + B'abc'", """\
+    STRING     "b'abc'"      (1, 0) (1, 6)
+    PLUS       '+'           (1, 7) (1, 8)
+    STRING     "B'abc'"      (1, 9) (1, 15)
+    """)
+
+        self.check_tokenize('b"abc" + B"abc"', """\
+    STRING     'b"abc"'      (1, 0) (1, 6)
+    PLUS       '+'           (1, 7) (1, 8)
+    STRING     'B"abc"'      (1, 9) (1, 15)
+    """)
+
+        self.check_tokenize("br'abc' + bR'abc' + Br'abc' + BR'abc'", """\
+    STRING     "br'abc'"     (1, 0) (1, 7)
+    PLUS       '+'           (1, 8) (1, 9)
+    STRING     "bR'abc'"     (1, 10) (1, 17)
+    PLUS       '+'           (1, 18) (1, 19)
+    STRING     "Br'abc'"     (1, 20) (1, 27)
+    PLUS       '+'           (1, 28) (1, 29)
+    STRING     "BR'abc'"     (1, 30) (1, 37)
+    """)
+
+        self.check_tokenize('br"abc" + bR"abc" + Br"abc" + BR"abc"', """\
+    STRING     'br"abc"'     (1, 0) (1, 7)
+    PLUS       '+'           (1, 8) (1, 9)
+    STRING     'bR"abc"'     (1, 10) (1, 17)
+    PLUS       '+'           (1, 18) (1, 19)
+    STRING     'Br"abc"'     (1, 20) (1, 27)
+    PLUS       '+'           (1, 28) (1, 29)
+    STRING     'BR"abc"'     (1, 30) (1, 37)
+    """)
+
+        self.check_tokenize("rb'abc' + rB'abc' + Rb'abc' + RB'abc'", """\
+    STRING     "rb'abc'"     (1, 0) (1, 7)
+    PLUS       '+'           (1, 8) (1, 9)
+    STRING     "rB'abc'"     (1, 10) (1, 17)
+    PLUS       '+'           (1, 18) (1, 19)
+    STRING     "Rb'abc'"     (1, 20) (1, 27)
+    PLUS       '+'           (1, 28) (1, 29)
+    STRING     "RB'abc'"     (1, 30) (1, 37)
+    """)
+
+        self.check_tokenize('rb"abc" + rB"abc" + Rb"abc" + RB"abc"', """\
+    STRING     'rb"abc"'     (1, 0) (1, 7)
+    PLUS       '+'           (1, 8) (1, 9)
+    STRING     'rB"abc"'     (1, 10) (1, 17)
+    PLUS       '+'           (1, 18) (1, 19)
+    STRING     'Rb"abc"'     (1, 20) (1, 27)
+    PLUS       '+'           (1, 28) (1, 29)
+    STRING     'RB"abc"'     (1, 30) (1, 37)
+    """)
+
+        self.check_tokenize('"a\\\nde\\\nfg"', """\
+    STRING     '"a\\\\\\nde\\\\\\nfg"\' (1, 0) (3, 3)
+    """)
+
+        self.check_tokenize('u"a\\\nde"', """\
+    STRING     'u"a\\\\\\nde"\'  (1, 0) (2, 3)
+    """)
+
+        self.check_tokenize('rb"a\\\nd"', """\
+    STRING     'rb"a\\\\\\nd"\'  (1, 0) (2, 2)
+    """)
+
+        self.check_tokenize(r'"""a\
+b"""', """\
+    STRING     '\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
+    """)
+        self.check_tokenize(r'u"""a\
+b"""', """\
+    STRING     'u\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
+    """)
+        self.check_tokenize(r'rb"""a\
+b\
+c"""', """\
+    STRING     'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
+    """)
+ 
+        self.check_tokenize('f"abc"', """\
+    STRING     'f"abc"'      (1, 0) (1, 6)
+    """)
+
+        self.check_tokenize('fR"a{b}c"', """\
+    STRING     'fR"a{b}c"'   (1, 0) (1, 9)
+    """)
+
+        self.check_tokenize('f"""abc"""', """\
+    STRING     'f\"\"\"abc\"\"\"'  (1, 0) (1, 10)
+    """)
+
+        self.check_tokenize(r'f"abc\
+def"', """\
+    STRING     'f"abc\\\\\\ndef"' (1, 0) (2, 4)
+    """)
+
+        self.check_tokenize(r'Rf"abc\
+def"', """\
+    STRING     'Rf"abc\\\\\\ndef"' (1, 0) (2, 4)
+    """)
+
+    def test_function(self):
+
+        self.check_tokenize('def d22(a, b, c=2, d=2, *k): pass', """\
+    NAME       'def'         (1, 0) (1, 3)
+    NAME       'd22'         (1, 4) (1, 7)
+    LPAR       '('           (1, 7) (1, 8)
+    NAME       'a'           (1, 8) (1, 9)
+    COMMA      ','           (1, 9) (1, 10)
+    NAME       'b'           (1, 11) (1, 12)
+    COMMA      ','           (1, 12) (1, 13)
+    NAME       'c'           (1, 14) (1, 15)
+    EQUAL      '='           (1, 15) (1, 16)
+    NUMBER     '2'           (1, 16) (1, 17)
+    COMMA      ','           (1, 17) (1, 18)
+    NAME       'd'           (1, 19) (1, 20)
+    EQUAL      '='           (1, 20) (1, 21)
+    NUMBER     '2'           (1, 21) (1, 22)
+    COMMA      ','           (1, 22) (1, 23)
+    STAR       '*'           (1, 24) (1, 25)
+    NAME       'k'           (1, 25) (1, 26)
+    RPAR       ')'           (1, 26) (1, 27)
+    COLON      ':'           (1, 27) (1, 28)
+    NAME       'pass'        (1, 29) (1, 33)
+    """)
+
+        self.check_tokenize('def d01v_(a=1, *k, **w): pass', """\
+    NAME       'def'         (1, 0) (1, 3)
+    NAME       'd01v_'       (1, 4) (1, 9)
+    LPAR       '('           (1, 9) (1, 10)
+    NAME       'a'           (1, 10) (1, 11)
+    EQUAL      '='           (1, 11) (1, 12)
+    NUMBER     '1'           (1, 12) (1, 13)
+    COMMA      ','           (1, 13) (1, 14)
+    STAR       '*'           (1, 15) (1, 16)
+    NAME       'k'           (1, 16) (1, 17)
+    COMMA      ','           (1, 17) (1, 18)
+    DOUBLESTAR '**'          (1, 19) (1, 21)
+    NAME       'w'           (1, 21) (1, 22)
+    RPAR       ')'           (1, 22) (1, 23)
+    COLON      ':'           (1, 23) (1, 24)
+    NAME       'pass'        (1, 25) (1, 29)
+    """)
+
+        self.check_tokenize('def d23(a: str, b: int=3) -> int: pass', """\
+    NAME       'def'         (1, 0) (1, 3)
+    NAME       'd23'         (1, 4) (1, 7)
+    LPAR       '('           (1, 7) (1, 8)
+    NAME       'a'           (1, 8) (1, 9)
+    COLON      ':'           (1, 9) (1, 10)
+    NAME       'str'         (1, 11) (1, 14)
+    COMMA      ','           (1, 14) (1, 15)
+    NAME       'b'           (1, 16) (1, 17)
+    COLON      ':'           (1, 17) (1, 18)
+    NAME       'int'         (1, 19) (1, 22)
+    EQUAL      '='           (1, 22) (1, 23)
+    NUMBER     '3'           (1, 23) (1, 24)
+    RPAR       ')'           (1, 24) (1, 25)
+    RARROW     '->'          (1, 26) (1, 28)
+    NAME       'int'         (1, 29) (1, 32)
+    COLON      ':'           (1, 32) (1, 33)
+    NAME       'pass'        (1, 34) (1, 38)
+    """)
+
+    def test_comparison(self):
+
+        self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != "
+                            "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass", """\
+    NAME       'if'          (1, 0) (1, 2)
+    NUMBER     '1'           (1, 3) (1, 4)
+    LESS       '<'           (1, 5) (1, 6)
+    NUMBER     '1'           (1, 7) (1, 8)
+    GREATER    '>'           (1, 9) (1, 10)
+    NUMBER     '1'           (1, 11) (1, 12)
+    EQEQUAL    '=='          (1, 13) (1, 15)
+    NUMBER     '1'           (1, 16) (1, 17)
+    GREATEREQUAL '>='          (1, 18) (1, 20)
+    NUMBER     '5'           (1, 21) (1, 22)
+    LESSEQUAL  '<='          (1, 23) (1, 25)
+    NUMBER     '0x15'        (1, 26) (1, 30)
+    LESSEQUAL  '<='          (1, 31) (1, 33)
+    NUMBER     '0x12'        (1, 34) (1, 38)
+    NOTEQUAL   '!='          (1, 39) (1, 41)
+    NUMBER     '1'           (1, 42) (1, 43)
+    NAME       'and'         (1, 44) (1, 47)
+    NUMBER     '5'           (1, 48) (1, 49)
+    NAME       'in'          (1, 50) (1, 52)
+    NUMBER     '1'           (1, 53) (1, 54)
+    NAME       'not'         (1, 55) (1, 58)
+    NAME       'in'          (1, 59) (1, 61)
+    NUMBER     '1'           (1, 62) (1, 63)
+    NAME       'is'          (1, 64) (1, 66)
+    NUMBER     '1'           (1, 67) (1, 68)
+    NAME       'or'          (1, 69) (1, 71)
+    NUMBER     '5'           (1, 72) (1, 73)
+    NAME       'is'          (1, 74) (1, 76)
+    NAME       'not'         (1, 77) (1, 80)
+    NUMBER     '1'           (1, 81) (1, 82)
+    COLON      ':'           (1, 82) (1, 83)
+    NAME       'pass'        (1, 84) (1, 88)
+    """)
+
+    def test_additive(self):
+
+        self.check_tokenize('x = 1 - y + 15 - 1 + 0x124 + z + a[5]', """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    NUMBER     '1'           (1, 4) (1, 5)
+    MINUS      '-'           (1, 6) (1, 7)
+    NAME       'y'           (1, 8) (1, 9)
+    PLUS       '+'           (1, 10) (1, 11)
+    NUMBER     '15'          (1, 12) (1, 14)
+    MINUS      '-'           (1, 15) (1, 16)
+    NUMBER     '1'           (1, 17) (1, 18)
+    PLUS       '+'           (1, 19) (1, 20)
+    NUMBER     '0x124'       (1, 21) (1, 26)
+    PLUS       '+'           (1, 27) (1, 28)
+    NAME       'z'           (1, 29) (1, 30)
+    PLUS       '+'           (1, 31) (1, 32)
+    NAME       'a'           (1, 33) (1, 34)
+    LSQB       '['           (1, 34) (1, 35)
+    NUMBER     '5'           (1, 35) (1, 36)
+    RSQB       ']'           (1, 36) (1, 37)
+    """)
+
+    def test_multiplicative(self):
+
+        self.check_tokenize('x = 1//1*1/5*12%0x12@42', """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    NUMBER     '1'           (1, 4) (1, 5)
+    DOUBLESLASH '//'          (1, 5) (1, 7)
+    NUMBER     '1'           (1, 7) (1, 8)
+    STAR       '*'           (1, 8) (1, 9)
+    NUMBER     '1'           (1, 9) (1, 10)
+    SLASH      '/'           (1, 10) (1, 11)
+    NUMBER     '5'           (1, 11) (1, 12)
+    STAR       '*'           (1, 12) (1, 13)
+    NUMBER     '12'          (1, 13) (1, 15)
+    PERCENT    '%'           (1, 15) (1, 16)
+    NUMBER     '0x12'        (1, 16) (1, 20)
+    AT         '@'           (1, 20) (1, 21)
+    NUMBER     '42'          (1, 21) (1, 23)
+    """)
+
+    def test_unary(self):
+
+        self.check_tokenize('~1 ^ 1 & 1 |1 ^ -1', """\
+    TILDE      '~'           (1, 0) (1, 1)
+    NUMBER     '1'           (1, 1) (1, 2)
+    CIRCUMFLEX '^'           (1, 3) (1, 4)
+    NUMBER     '1'           (1, 5) (1, 6)
+    AMPER      '&'           (1, 7) (1, 8)
+    NUMBER     '1'           (1, 9) (1, 10)
+    VBAR       '|'           (1, 11) (1, 12)
+    NUMBER     '1'           (1, 12) (1, 13)
+    CIRCUMFLEX '^'           (1, 14) (1, 15)
+    MINUS      '-'           (1, 16) (1, 17)
+    NUMBER     '1'           (1, 17) (1, 18)
+    """)
+
+        self.check_tokenize('-1*1/1+1*1//1 - ---1**1', """\
+    MINUS      '-'           (1, 0) (1, 1)
+    NUMBER     '1'           (1, 1) (1, 2)
+    STAR       '*'           (1, 2) (1, 3)
+    NUMBER     '1'           (1, 3) (1, 4)
+    SLASH      '/'           (1, 4) (1, 5)
+    NUMBER     '1'           (1, 5) (1, 6)
+    PLUS       '+'           (1, 6) (1, 7)
+    NUMBER     '1'           (1, 7) (1, 8)
+    STAR       '*'           (1, 8) (1, 9)
+    NUMBER     '1'           (1, 9) (1, 10)
+    DOUBLESLASH '//'          (1, 10) (1, 12)
+    NUMBER     '1'           (1, 12) (1, 13)
+    MINUS      '-'           (1, 14) (1, 15)
+    MINUS      '-'           (1, 16) (1, 17)
+    MINUS      '-'           (1, 17) (1, 18)
+    MINUS      '-'           (1, 18) (1, 19)
+    NUMBER     '1'           (1, 19) (1, 20)
+    DOUBLESTAR '**'          (1, 20) (1, 22)
+    NUMBER     '1'           (1, 22) (1, 23)
+    """)
+
+    def test_selector(self):
+
+        self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
+    NAME       'import'      (1, 0) (1, 6)
+    NAME       'sys'         (1, 7) (1, 10)
+    COMMA      ','           (1, 10) (1, 11)
+    NAME       'time'        (1, 12) (1, 16)
+    NEWLINE    ''            (1, 16) (1, 16)
+    NAME       'x'           (2, 0) (2, 1)
+    EQUAL      '='           (2, 2) (2, 3)
+    NAME       'sys'         (2, 4) (2, 7)
+    DOT        '.'           (2, 7) (2, 8)
+    NAME       'modules'     (2, 8) (2, 15)
+    LSQB       '['           (2, 15) (2, 16)
+    STRING     "'time'"      (2, 16) (2, 22)
+    RSQB       ']'           (2, 22) (2, 23)
+    DOT        '.'           (2, 23) (2, 24)
+    NAME       'time'        (2, 24) (2, 28)
+    LPAR       '('           (2, 28) (2, 29)
+    RPAR       ')'           (2, 29) (2, 30)
+    """)
+
+    def test_method(self):
+
+        self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\
+    AT         '@'           (1, 0) (1, 1)
+    NAME       'staticmethod' (1, 1) (1, 13)
+    NEWLINE    ''            (1, 13) (1, 13)
+    NAME       'def'         (2, 0) (2, 3)
+    NAME       'foo'         (2, 4) (2, 7)
+    LPAR       '('           (2, 7) (2, 8)
+    NAME       'x'           (2, 8) (2, 9)
+    COMMA      ','           (2, 9) (2, 10)
+    NAME       'y'           (2, 10) (2, 11)
+    RPAR       ')'           (2, 11) (2, 12)
+    COLON      ':'           (2, 12) (2, 13)
+    NAME       'pass'        (2, 14) (2, 18)
+    """)
+
+    def test_tabs(self):
+
+        self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\
+    AT         '@'           (1, 0) (1, 1)
+    NAME       'staticmethod' (1, 1) (1, 13)
+    NEWLINE    ''            (1, 13) (1, 13)
+    NAME       'def'         (2, 0) (2, 3)
+    NAME       'foo'         (2, 4) (2, 7)
+    LPAR       '('           (2, 7) (2, 8)
+    NAME       'x'           (2, 8) (2, 9)
+    COMMA      ','           (2, 9) (2, 10)
+    NAME       'y'           (2, 10) (2, 11)
+    RPAR       ')'           (2, 11) (2, 12)
+    COLON      ':'           (2, 12) (2, 13)
+    NAME       'pass'        (2, 14) (2, 18)
+    """)
+
+    def test_async(self):
+
+        self.check_tokenize('async = 1', """\
+    ASYNC      'async'       (1, 0) (1, 5)
+    EQUAL      '='           (1, 6) (1, 7)
+    NUMBER     '1'           (1, 8) (1, 9)
+    """)
+
+        self.check_tokenize('a = (async = 1)', """\
+    NAME       'a'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    LPAR       '('           (1, 4) (1, 5)
+    ASYNC      'async'       (1, 5) (1, 10)
+    EQUAL      '='           (1, 11) (1, 12)
+    NUMBER     '1'           (1, 13) (1, 14)
+    RPAR       ')'           (1, 14) (1, 15)
+    """)
+
+        self.check_tokenize('async()', """\
+    ASYNC      'async'       (1, 0) (1, 5)
+    LPAR       '('           (1, 5) (1, 6)
+    RPAR       ')'           (1, 6) (1, 7)
+    """)
+
+        self.check_tokenize('class async(Bar):pass', """\
+    NAME       'class'       (1, 0) (1, 5)
+    ASYNC      'async'       (1, 6) (1, 11)
+    LPAR       '('           (1, 11) (1, 12)
+    NAME       'Bar'         (1, 12) (1, 15)
+    RPAR       ')'           (1, 15) (1, 16)
+    COLON      ':'           (1, 16) (1, 17)
+    NAME       'pass'        (1, 17) (1, 21)
+    """)
+
+        self.check_tokenize('class async:pass', """\
+    NAME       'class'       (1, 0) (1, 5)
+    ASYNC      'async'       (1, 6) (1, 11)
+    COLON      ':'           (1, 11) (1, 12)
+    NAME       'pass'        (1, 12) (1, 16)
+    """)
+
+        self.check_tokenize('await = 1', """\
+    AWAIT      'await'       (1, 0) (1, 5)
+    EQUAL      '='           (1, 6) (1, 7)
+    NUMBER     '1'           (1, 8) (1, 9)
+    """)
+
+        self.check_tokenize('foo.async', """\
+    NAME       'foo'         (1, 0) (1, 3)
+    DOT        '.'           (1, 3) (1, 4)
+    ASYNC      'async'       (1, 4) (1, 9)
+    """)
+
+        self.check_tokenize('async for a in b: pass', """\
+    ASYNC      'async'       (1, 0) (1, 5)
+    NAME       'for'         (1, 6) (1, 9)
+    NAME       'a'           (1, 10) (1, 11)
+    NAME       'in'          (1, 12) (1, 14)
+    NAME       'b'           (1, 15) (1, 16)
+    COLON      ':'           (1, 16) (1, 17)
+    NAME       'pass'        (1, 18) (1, 22)
+    """)
+
+        self.check_tokenize('async with a as b: pass', """\
+    ASYNC      'async'       (1, 0) (1, 5)
+    NAME       'with'        (1, 6) (1, 10)
+    NAME       'a'           (1, 11) (1, 12)
+    NAME       'as'          (1, 13) (1, 15)
+    NAME       'b'           (1, 16) (1, 17)
+    COLON      ':'           (1, 17) (1, 18)
+    NAME       'pass'        (1, 19) (1, 23)
+    """)
+
+        self.check_tokenize('async.foo', """\
+    ASYNC      'async'       (1, 0) (1, 5)
+    DOT        '.'           (1, 5) (1, 6)
+    NAME       'foo'         (1, 6) (1, 9)
+    """)
+
+        self.check_tokenize('async', """\
+    ASYNC      'async'       (1, 0) (1, 5)
+    """)
+
+        self.check_tokenize('async\n#comment\nawait', """\
+    ASYNC      'async'       (1, 0) (1, 5)
+    NEWLINE    ''            (1, 5) (1, 5)
+    AWAIT      'await'       (3, 0) (3, 5)
+    """)
+
+        self.check_tokenize('async\n...\nawait', """\
+    ASYNC      'async'       (1, 0) (1, 5)
+    NEWLINE    ''            (1, 5) (1, 5)
+    ELLIPSIS   '...'         (2, 0) (2, 3)
+    NEWLINE    ''            (2, 3) (2, 3)
+    AWAIT      'await'       (3, 0) (3, 5)
+    """)
+
+        self.check_tokenize('async\nawait', """\
+    ASYNC      'async'       (1, 0) (1, 5)
+    NEWLINE    ''            (1, 5) (1, 5)
+    AWAIT      'await'       (2, 0) (2, 5)
+    """)
+
+        self.check_tokenize('foo.async + 1', """\
+    NAME       'foo'         (1, 0) (1, 3)
+    DOT        '.'           (1, 3) (1, 4)
+    ASYNC      'async'       (1, 4) (1, 9)
+    PLUS       '+'           (1, 10) (1, 11)
+    NUMBER     '1'           (1, 12) (1, 13)
+    """)
+
+        self.check_tokenize('async def foo(): pass', """\
+    ASYNC      'async'       (1, 0) (1, 5)
+    NAME       'def'         (1, 6) (1, 9)
+    NAME       'foo'         (1, 10) (1, 13)
+    LPAR       '('           (1, 13) (1, 14)
+    RPAR       ')'           (1, 14) (1, 15)
+    COLON      ':'           (1, 15) (1, 16)
+    NAME       'pass'        (1, 17) (1, 21)
+    """)
+
+        self.check_tokenize('''\
+async def foo():
+  def foo(await):
+    await = 1
+  if 1:
+    await
+async += 1
+''', """\
+    ASYNC      'async'       (1, 0) (1, 5)
+    NAME       'def'         (1, 6) (1, 9)
+    NAME       'foo'         (1, 10) (1, 13)
+    LPAR       '('           (1, 13) (1, 14)
+    RPAR       ')'           (1, 14) (1, 15)
+    COLON      ':'           (1, 15) (1, 16)
+    NEWLINE    ''            (1, 16) (1, 16)
+    INDENT     ''            (2, -1) (2, -1)
+    NAME       'def'         (2, 2) (2, 5)
+    NAME       'foo'         (2, 6) (2, 9)
+    LPAR       '('           (2, 9) (2, 10)
+    AWAIT      'await'       (2, 10) (2, 15)
+    RPAR       ')'           (2, 15) (2, 16)
+    COLON      ':'           (2, 16) (2, 17)
+    NEWLINE    ''            (2, 17) (2, 17)
+    INDENT     ''            (3, -1) (3, -1)
+    AWAIT      'await'       (3, 4) (3, 9)
+    EQUAL      '='           (3, 10) (3, 11)
+    NUMBER     '1'           (3, 12) (3, 13)
+    NEWLINE    ''            (3, 13) (3, 13)
+    DEDENT     ''            (4, -1) (4, -1)
+    NAME       'if'          (4, 2) (4, 4)
+    NUMBER     '1'           (4, 5) (4, 6)
+    COLON      ':'           (4, 6) (4, 7)
+    NEWLINE    ''            (4, 7) (4, 7)
+    INDENT     ''            (5, -1) (5, -1)
+    AWAIT      'await'       (5, 4) (5, 9)
+    NEWLINE    ''            (5, 9) (5, 9)
+    DEDENT     ''            (6, -1) (6, -1)
+    DEDENT     ''            (6, -1) (6, -1)
+    ASYNC      'async'       (6, 0) (6, 5)
+    PLUSEQUAL  '+='          (6, 6) (6, 8)
+    NUMBER     '1'           (6, 9) (6, 10)
+    NEWLINE    ''            (6, 10) (6, 10)
+    """)
+
+        self.check_tokenize('async def foo():\n  async for i in 1: pass', """\
+    ASYNC      'async'       (1, 0) (1, 5)
+    NAME       'def'         (1, 6) (1, 9)
+    NAME       'foo'         (1, 10) (1, 13)
+    LPAR       '('           (1, 13) (1, 14)
+    RPAR       ')'           (1, 14) (1, 15)
+    COLON      ':'           (1, 15) (1, 16)
+    NEWLINE    ''            (1, 16) (1, 16)
+    INDENT     ''            (2, -1) (2, -1)
+    ASYNC      'async'       (2, 2) (2, 7)
+    NAME       'for'         (2, 8) (2, 11)
+    NAME       'i'           (2, 12) (2, 13)
+    NAME       'in'          (2, 14) (2, 16)
+    NUMBER     '1'           (2, 17) (2, 18)
+    COLON      ':'           (2, 18) (2, 19)
+    NAME       'pass'        (2, 20) (2, 24)
+    DEDENT     ''            (2, -1) (2, -1)
+    """)
+
+        self.check_tokenize('async def foo(async): await', """\
+    ASYNC      'async'       (1, 0) (1, 5)
+    NAME       'def'         (1, 6) (1, 9)
+    NAME       'foo'         (1, 10) (1, 13)
+    LPAR       '('           (1, 13) (1, 14)
+    ASYNC      'async'       (1, 14) (1, 19)
+    RPAR       ')'           (1, 19) (1, 20)
+    COLON      ':'           (1, 20) (1, 21)
+    AWAIT      'await'       (1, 22) (1, 27)
+    """)
+
+        self.check_tokenize('''\
+def f():
+
+  def baz(): pass
+  async def bar(): pass
+
+  await = 2''', """\
+    NAME       'def'         (1, 0) (1, 3)
+    NAME       'f'           (1, 4) (1, 5)
+    LPAR       '('           (1, 5) (1, 6)
+    RPAR       ')'           (1, 6) (1, 7)
+    COLON      ':'           (1, 7) (1, 8)
+    NEWLINE    ''            (1, 8) (1, 8)
+    INDENT     ''            (3, -1) (3, -1)
+    NAME       'def'         (3, 2) (3, 5)
+    NAME       'baz'         (3, 6) (3, 9)
+    LPAR       '('           (3, 9) (3, 10)
+    RPAR       ')'           (3, 10) (3, 11)
+    COLON      ':'           (3, 11) (3, 12)
+    NAME       'pass'        (3, 13) (3, 17)
+    NEWLINE    ''            (3, 17) (3, 17)
+    ASYNC      'async'       (4, 2) (4, 7)
+    NAME       'def'         (4, 8) (4, 11)
+    NAME       'bar'         (4, 12) (4, 15)
+    LPAR       '('           (4, 15) (4, 16)
+    RPAR       ')'           (4, 16) (4, 17)
+    COLON      ':'           (4, 17) (4, 18)
+    NAME       'pass'        (4, 19) (4, 23)
+    NEWLINE    ''            (4, 23) (4, 23)
+    AWAIT      'await'       (6, 2) (6, 7)
+    EQUAL      '='           (6, 8) (6, 9)
+    NUMBER     '2'           (6, 10) (6, 11)
+    DEDENT     ''            (6, -1) (6, -1)
+    """)
+
+        self.check_tokenize('''\
+async def f():
+
+  def baz(): pass
+  async def bar(): pass
+
+  await = 2''', """\
+    ASYNC      'async'       (1, 0) (1, 5)
+    NAME       'def'         (1, 6) (1, 9)
+    NAME       'f'           (1, 10) (1, 11)
+    LPAR       '('           (1, 11) (1, 12)
+    RPAR       ')'           (1, 12) (1, 13)
+    COLON      ':'           (1, 13) (1, 14)
+    NEWLINE    ''            (1, 14) (1, 14)
+    INDENT     ''            (3, -1) (3, -1)
+    NAME       'def'         (3, 2) (3, 5)
+    NAME       'baz'         (3, 6) (3, 9)
+    LPAR       '('           (3, 9) (3, 10)
+    RPAR       ')'           (3, 10) (3, 11)
+    COLON      ':'           (3, 11) (3, 12)
+    NAME       'pass'        (3, 13) (3, 17)
+    NEWLINE    ''            (3, 17) (3, 17)
+    ASYNC      'async'       (4, 2) (4, 7)
+    NAME       'def'         (4, 8) (4, 11)
+    NAME       'bar'         (4, 12) (4, 15)
+    LPAR       '('           (4, 15) (4, 16)
+    RPAR       ')'           (4, 16) (4, 17)
+    COLON      ':'           (4, 17) (4, 18)
+    NAME       'pass'        (4, 19) (4, 23)
+    NEWLINE    ''            (4, 23) (4, 23)
+    AWAIT      'await'       (6, 2) (6, 7)
+    EQUAL      '='           (6, 8) (6, 9)
+    NUMBER     '2'           (6, 10) (6, 11)
+    DEDENT     ''            (6, -1) (6, -1)
+    """)
+
+    def test_unicode(self):
+
+        self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
+    NAME       'Örter'       (1, 0) (1, 6)
+    EQUAL      '='           (1, 7) (1, 8)
+    STRING     "u'places'"   (1, 9) (1, 18)
+    NEWLINE    ''            (1, 18) (1, 18)
+    NAME       'grün'        (2, 0) (2, 5)
+    EQUAL      '='           (2, 6) (2, 7)
+    STRING     "U'green'"    (2, 8) (2, 16)
+    """)
+
+    def test_invalid_syntax(self):
+        def get_tokens(string):
+            return list(_generate_tokens_from_c_tokenizer(string))
+
+        self.assertRaises(SyntaxError, get_tokens, "(1+2]")
+        self.assertRaises(SyntaxError, get_tokens, "(1+2}")
+        self.assertRaises(SyntaxError, get_tokens, "{1+2]")
+
+        self.assertRaises(SyntaxError, get_tokens, "1_")
+        self.assertRaises(SyntaxError, get_tokens, "1.2_")
+        self.assertRaises(SyntaxError, get_tokens, "1e2_")
+        self.assertRaises(SyntaxError, get_tokens, "1e+")
+
+        self.assertRaises(SyntaxError, get_tokens, "\xa0")
+        self.assertRaises(SyntaxError, get_tokens, "€")
+
+        self.assertRaises(SyntaxError, get_tokens, "0b12")
+        self.assertRaises(SyntaxError, get_tokens, "0b1_2")
+        self.assertRaises(SyntaxError, get_tokens, "0b2")
+        self.assertRaises(SyntaxError, get_tokens, "0b1_")
+        self.assertRaises(SyntaxError, get_tokens, "0b")
+        self.assertRaises(SyntaxError, get_tokens, "0o18")
+        self.assertRaises(SyntaxError, get_tokens, "0o1_8")
+        self.assertRaises(SyntaxError, get_tokens, "0o8")
+        self.assertRaises(SyntaxError, get_tokens, "0o1_")
+        self.assertRaises(SyntaxError, get_tokens, "0o")
+        self.assertRaises(SyntaxError, get_tokens, "0x1_")
+        self.assertRaises(SyntaxError, get_tokens, "0x")
+        self.assertRaises(SyntaxError, get_tokens, "1_")
+        self.assertRaises(SyntaxError, get_tokens, "012")
+        self.assertRaises(SyntaxError, get_tokens, "1.2_")
+        self.assertRaises(SyntaxError, get_tokens, "1e2_")
+        self.assertRaises(SyntaxError, get_tokens, "1e+")
+
+        self.assertRaises(SyntaxError, get_tokens, "'sdfsdf")
+        self.assertRaises(SyntaxError, get_tokens, "'''sdfsdf''")
+
+        self.assertRaises(SyntaxError, get_tokens, "("*1000+"a"+")"*1000)
+        self.assertRaises(SyntaxError, get_tokens, "]")
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 7d7736fe985981..0b9e238310049c 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -680,5 +680,13 @@ def error(message, filename=None, location=None):
         perror("unexpected error: %s" % err)
         raise
 
+def _generate_tokens_from_c_tokenizer(source):
+    """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
+    import _tokenize as c_tokenizer
+    for info in c_tokenizer.TokenizerIter(source):
+        tok, type, lineno, end_lineno, col_off, end_col_off, line = info
+        yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line)
+
+
 if __name__ == "__main__":
     main()
diff --git a/Makefile.pre.in b/Makefile.pre.in
index f503ac4d876726..1007f440759b1a 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -339,6 +339,7 @@ PARSER_HEADERS= \
 PYTHON_OBJS=	\
 		Python/_warnings.o \
 		Python/Python-ast.o \
+		Python/Python-tokenize.o \
 		Python/asdl.o \
 		Python/ast.o \
 		Python/ast_opt.o \
diff --git a/Modules/config.c.in b/Modules/config.c.in
index d69e8e88b0ca45..6081f95759538f 100644
--- a/Modules/config.c.in
+++ b/Modules/config.c.in
@@ -28,6 +28,7 @@ extern PyObject* PyMarshal_Init(void);
 extern PyObject* PyInit__imp(void);
 extern PyObject* PyInit_gc(void);
 extern PyObject* PyInit__ast(void);
+extern PyObject* PyInit__tokenize(void);
 extern PyObject* _PyWarnings_Init(void);
 extern PyObject* PyInit__string(void);
 
@@ -44,6 +45,9 @@ struct _inittab _PyImport_Inittab[] = {
     /* This lives in Python/Python-ast.c */
     {"_ast", PyInit__ast},
 
+    /* This lives in Python/Python-tokenize.c */
+    {"_tokenize", PyInit__tokenize},
+
     /* These entries are here for sys.builtin_module_names */
     {"builtins", NULL},
     {"sys", NULL},
diff --git a/PC/config.c b/PC/config.c
index 11743ea45a969d..9d900c78e40d00 100644
--- a/PC/config.c
+++ b/PC/config.c
@@ -72,9 +72,8 @@ extern PyObject* _PyWarnings_Init(void);
 extern PyObject* PyInit__string(void);
 extern PyObject* PyInit__stat(void);
 extern PyObject* PyInit__opcode(void);
-
 extern PyObject* PyInit__contextvars(void);
-
+extern PyObject* PyInit__tokenize(void);
 
 /* tools/freeze/makeconfig.py marker for additional "extern" */
 /* -- ADDMODULE MARKER 1 -- */
@@ -83,7 +82,6 @@ extern PyObject* PyMarshal_Init(void);
 extern PyObject* PyInit__imp(void);
 
 struct _inittab _PyImport_Inittab[] = {
-
     {"_abc", PyInit__abc},
     {"array", PyInit_array},
     {"_ast", PyInit__ast},
@@ -105,6 +103,7 @@ struct _inittab _PyImport_Inittab[] = {
     {"_blake2", PyInit__blake2},
     {"time", PyInit_time},
     {"_thread", PyInit__thread},
+    {"_tokenize", PyInit__tokenize},
     {"_typing", PyInit__typing},
     {"_statistics", PyInit__statistics},
 #ifdef WIN32
diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj
index ebc0f2879f9595..b8cadf469355f4 100644
--- a/PCbuild/pythoncore.vcxproj
+++ b/PCbuild/pythoncore.vcxproj
@@ -488,6 +488,7 @@
     <ClCompile Include="..\Python\pystrtod.c" />
     <ClCompile Include="..\Python\dtoa.c" />
     <ClCompile Include="..\Python\Python-ast.c" />
+    <ClCompile Include="..\Python\Python-tokenize.c" />
     <ClCompile Include="..\Python\pythonrun.c" />
     <ClCompile Include="..\Python\specialize.c" />
     <ClCompile Include="..\Python\suggestions.c" />
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
new file mode 100644
index 00000000000000..b9fb1693ce117e
--- /dev/null
+++ b/Python/Python-tokenize.c
@@ -0,0 +1,195 @@
+#include "Python.h"
+#include "../Parser/tokenizer.h"
+
+static struct PyModuleDef _tokenizemodule;
+
+typedef struct {
+    PyTypeObject* TokenizerIter;
+} tokenize_state;
+
+static tokenize_state*
+get_tokenize_state(PyObject* module)
+{
+    return (tokenize_state*)PyModule_GetState(module);
+}
+
+#define _tokenize_get_state_by_type(type)                                                               \
+    get_tokenize_state(_PyType_GetModuleByDef(type, &_tokenizemodule))
+
+#include "clinic/Python-tokenize.c.h"
+
+/*[clinic input]
+module _tokenizer
+class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
+[clinic start generated code]*/
+/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/
+
+typedef struct {
+    PyObject_HEAD
+    struct tok_state* tok;
+} tokenizeriterobject;
+
+/*[clinic input]
+@classmethod
+_tokenizer.tokenizeriter.__new__ as tokenizeriter_new
+
+    source: str
+[clinic start generated code]*/
+
+static PyObject *
+tokenizeriter_new_impl(PyTypeObject *type, const char *source)
+/*[clinic end generated code: output=7fd9f46cf9263cbb input=4384b368407375c6]*/
+{
+    tokenizeriterobject* self = (tokenizeriterobject*)type->tp_alloc(type, 0);
+    if (self == NULL) {
+        return NULL;
+    }
+    PyObject* filename = PyUnicode_FromString("<string>");
+    if (filename == NULL) {
+        return NULL;
+    }
+    self->tok = PyTokenizer_FromUTF8(source, 1);
+    if (self->tok == NULL) {
+        return NULL;
+    }
+    self->tok->filename = filename;
+    return (PyObject*)self;
+}
+
+static PyObject*
+tokenizeriter_next(tokenizeriterobject* it)
+{
+    const char* start;
+    const char* end;
+    int type = PyTokenizer_Get(it->tok, &start, &end);
+    if (type == ERRORTOKEN && PyErr_Occurred()) {
+        return NULL;
+    }
+    if (type == ERRORTOKEN || type == ENDMARKER) {
+        PyErr_SetString(PyExc_StopIteration, "EOF");
+        return NULL;
+    }
+    PyObject* str = NULL;
+    if (start == NULL || end == NULL) {
+        str = PyUnicode_FromString("");
+    } else {
+        str = PyUnicode_FromStringAndSize(start, end - start);
+    }
+    if (str == NULL) {
+        return NULL;
+    }
+
+    Py_ssize_t size = it->tok->inp - it->tok->buf;
+    PyObject* line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
+    if (line == NULL) {
+        Py_DECREF(str);
+        return NULL;
+    }
+    const char* line_start = type == STRING ? it->tok->multi_line_start : it->tok->line_start;
+    int lineno = type == STRING ? it->tok->first_lineno : it->tok->lineno;
+    int end_lineno = it->tok->lineno;
+    int col_offset = -1;
+    int end_col_offset = -1;
+    if (start != NULL && start >= line_start) {
+        col_offset = (int)(start - line_start);
+    }
+    if (end != NULL && end >= it->tok->line_start) {
+        end_col_offset = (int)(end - it->tok->line_start);
+    }
+
+    return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
+}
+
+static void
+tokenizeriter_dealloc(tokenizeriterobject* it)
+{
+    PyTypeObject* tp = Py_TYPE(it);
+    PyTokenizer_Free(it->tok);
+    tp->tp_free(it);
+    Py_DECREF(tp);
+}
+
+static PyType_Slot tokenizeriter_slots[] = {
+        {Py_tp_new, tokenizeriter_new},
+        {Py_tp_dealloc, tokenizeriter_dealloc},
+        {Py_tp_getattro, PyObject_GenericGetAttr},
+        {Py_tp_iter, PyObject_SelfIter},
+        {Py_tp_iternext, tokenizeriter_next},
+        {0, NULL},
+};
+
+static PyType_Spec tokenizeriter_spec = {
+        .name = "_tokenize.TokenizerIter",
+        .basicsize = sizeof(tokenizeriterobject),
+        .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
+        .slots = tokenizeriter_slots,
+};
+
+
+static int
+tokenizemodule_exec(PyObject* m)
+{
+    tokenize_state* state = get_tokenize_state(m);
+    if (state == NULL) {
+        return -1;
+    }
+
+    state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(
+        m, &tokenizeriter_spec, NULL);
+    if (state->TokenizerIter == NULL) {
+        return -1;
+    }
+    if (PyModule_AddType(m, state->TokenizerIter) < 0) {
+        return -1;
+    }
+
+    return 0;
+}
+
+static PyMethodDef tokenize_methods[] = {
+        {NULL, NULL, 0, NULL} /* Sentinel */
+};
+
+static PyModuleDef_Slot tokenizemodule_slots[] = {
+    {Py_mod_exec, tokenizemodule_exec},
+    {0, NULL}
+};
+
+static int
+tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
+{
+    tokenize_state *state = get_tokenize_state(m);
+    Py_VISIT(state->TokenizerIter);
+    return 0;
+}
+
+static int
+tokenizemodule_clear(PyObject *m)
+{
+    tokenize_state *state = get_tokenize_state(m);
+    Py_CLEAR(state->TokenizerIter);
+    return 0;
+}
+
+static void
+tokenizemodule_free(void *m)
+{
+    tokenizemodule_clear((PyObject *)m);
+}
+
+static struct PyModuleDef _tokenizemodule = {
+        PyModuleDef_HEAD_INIT,
+        .m_name = "_tokenize",
+        .m_size = sizeof(tokenize_state),
+        .m_slots = tokenizemodule_slots,
+        .m_methods = tokenize_methods,
+        .m_traverse = tokenizemodule_traverse,
+        .m_clear = tokenizemodule_clear,
+        .m_free = tokenizemodule_free,
+};
+
+PyMODINIT_FUNC
+PyInit__tokenize(void)
+{
+    return PyModuleDef_Init(&_tokenizemodule);
+}
diff --git a/Python/clinic/Python-tokenize.c.h b/Python/clinic/Python-tokenize.c.h
new file mode 100644
index 00000000000000..050b4d49448c36
--- /dev/null
+++ b/Python/clinic/Python-tokenize.c.h
@@ -0,0 +1,41 @@
+/*[clinic input]
+preserve
+[clinic start generated code]*/
+
+static PyObject *
+tokenizeriter_new_impl(PyTypeObject *type, const char *source);
+
+static PyObject *
+tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
+{
+    PyObject *return_value = NULL;
+    static const char * const _keywords[] = {"source", NULL};
+    static _PyArg_Parser _parser = {NULL, _keywords, "tokenizeriter", 0};
+    PyObject *argsbuf[1];
+    PyObject * const *fastargs;
+    Py_ssize_t nargs = PyTuple_GET_SIZE(args);
+    const char *source;
+
+    fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, 1, 1, 0, argsbuf);
+    if (!fastargs) {
+        goto exit;
+    }
+    if (!PyUnicode_Check(fastargs[0])) {
+        _PyArg_BadArgument("tokenizeriter", "argument 'source'", "str", fastargs[0]);
+        goto exit;
+    }
+    Py_ssize_t source_length;
+    source = PyUnicode_AsUTF8AndSize(fastargs[0], &source_length);
+    if (source == NULL) {
+        goto exit;
+    }
+    if (strlen(source) != (size_t)source_length) {
+        PyErr_SetString(PyExc_ValueError, "embedded null character");
+        goto exit;
+    }
+    return_value = tokenizeriter_new_impl(type, source);
+
+exit:
+    return return_value;
+}
+/*[clinic end generated code: output=dfcd64774e01bfe6 input=a9049054013a1b77]*/
diff --git a/Python/stdlib_module_names.h b/Python/stdlib_module_names.h
index 3c5f1768305cb0..2f75c2e54cd5e9 100644
--- a/Python/stdlib_module_names.h
+++ b/Python/stdlib_module_names.h
@@ -80,6 +80,7 @@ static const char* _Py_stdlib_module_names[] = {
 "_thread",
 "_threading_local",
 "_tkinter",
+"_tokenize",
 "_tracemalloc",
 "_typing",
 "_uuid",


