[Python-checkins] gh-102856: Tokenize performance improvement (#104731)
pablogsal
webhook-mailer at python.org
Sun May 21 20:29:11 EDT 2023
https://github.com/python/cpython/commit/8817886ae571f5b5ce4e2e6cfd2458622d0efac1
commit: 8817886ae571f5b5ce4e2e6cfd2458622d0efac1
branch: main
author: Marta Gómez Macías <mgmacias at google.com>
committer: pablogsal <Pablogsal at gmail.com>
date: 2023-05-22T00:29:04Z
summary:
gh-102856: Tokenize performance improvement (#104731)
files:
M Lib/tokenize.py
M Python/Python-tokenize.c
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index cef2773feac2..911f0f12f9bb 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -449,16 +449,6 @@ def _tokenize(rl_gen, encoding):
source = b"".join(rl_gen).decode(encoding)
token = None
for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
- # TODO: Marta -> limpiar esto
- if 6 < token.type <= 54:
- token = token._replace(type=OP)
- if token.type in {ASYNC, AWAIT}:
- token = token._replace(type=NAME)
- if token.type == NEWLINE:
- l_start, c_start = token.start
- l_end, c_end = token.end
- token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1))
-
yield token
if token is not None:
last_line, _ = token.start
@@ -550,8 +540,7 @@ def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
"""Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
import _tokenize as c_tokenizer
for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
- tok, type, lineno, end_lineno, col_off, end_col_off, line = info
- yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line)
+ yield TokenInfo._make(info)
if __name__ == "__main__":
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
index ece238672e34..43b44be94583 100644
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -207,7 +207,22 @@ tokenizeriter_next(tokenizeriterobject *it)
end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start);
}
- result = Py_BuildValue("(NinnnnN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
+ if (it->tok->tok_extra_tokens) {
+ // Necessary adjustments to match the original Python tokenize
+ // implementation
+ if (type > DEDENT && type < OP) {
+ type = OP;
+ }
+ else if (type == ASYNC || type == AWAIT) {
+ type = NAME;
+ }
+ else if (type == NEWLINE) {
+ str = PyUnicode_FromString("\n");
+ end_col_offset++;
+ }
+ }
+
+ result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
exit:
_PyToken_Free(&token);
return result;
More information about the Python-checkins mailing list