gh-41872: Fix quick extraction of module docstrings from a file in pydoc (GH-127520)
https://github.com/python/cpython/commit/474e419792484d1c16e7d9c99b7bf144136... commit: 474e419792484d1c16e7d9c99b7bf144136b9307 branch: main author: Srinivas Reddy Thatiparthy (తాటిపర్తి శ్రీనివాస్ రెడ్డి) <thatiparthysreenivas@gmail.com> committer: serhiy-storchaka <storchaka@gmail.com> date: 2025-01-08T10:32:07Z summary: gh-41872: Fix quick extraction of module docstrings from a file in pydoc (GH-127520) It now supports docstrings with single quotes, escape sequences, raw string literals, and other Python syntax. Co-authored-by: Éric <merwok@netwok.org> Co-authored-by: Serhiy Storchaka <storchaka@gmail.com> files: A Misc/NEWS.d/next/Library/2024-12-17-15-23-40.gh-issue-41872.31LjKY.rst M Lib/pydoc.py M Lib/test/test_pydoc/test_pydoc.py diff --git a/Lib/pydoc.py b/Lib/pydoc.py index c863794ea14ef9..9e84292aaf825f 100644 --- a/Lib/pydoc.py +++ b/Lib/pydoc.py @@ -53,6 +53,7 @@ class or function within a module or module in a package. If the # the current directory is changed with os.chdir(), an incorrect # path will be displayed. +import ast import __future__ import builtins import importlib._bootstrap @@ -384,21 +385,29 @@ def ispackage(path): return False def source_synopsis(file): - line = file.readline() - while line[:1] == '#' or not line.strip(): - line = file.readline() - if not line: break - line = line.strip() - if line[:4] == 'r"""': line = line[1:] - if line[:3] == '"""': - line = line[3:] - if line[-1:] == '\\': line = line[:-1] - while not line.strip(): - line = file.readline() - if not line: break - result = line.split('"""')[0].strip() - else: result = None - return result + """Return the one-line summary of a file object, if present""" + + string = '' + try: + tokens = tokenize.generate_tokens(file.readline) + for tok_type, tok_string, _, _, _ in tokens: + if tok_type == tokenize.STRING: + string += tok_string + elif tok_type == tokenize.NEWLINE: + with warnings.catch_warnings(): + # Ignore the "invalid escape sequence" warning. + warnings.simplefilter("ignore", SyntaxWarning) + docstring = ast.literal_eval(string) + if not isinstance(docstring, str): + return None + return docstring.strip().split('\n')[0].strip() + elif tok_type == tokenize.OP and tok_string in ('(', ')'): + string += tok_string + elif tok_type not in (tokenize.COMMENT, tokenize.NL, tokenize.ENCODING): + return None + except (tokenize.TokenError, UnicodeDecodeError, SyntaxError): + return None + return None def synopsis(filename, cache={}): """Get the one-line summary out of a module file.""" diff --git a/Lib/test/test_pydoc/test_pydoc.py b/Lib/test/test_pydoc/test_pydoc.py index c798b11f5aa56e..cec18aa9440c9e 100644 --- a/Lib/test/test_pydoc/test_pydoc.py +++ b/Lib/test/test_pydoc/test_pydoc.py @@ -4,6 +4,7 @@ import contextlib import importlib.util import inspect +import io import pydoc import py_compile import keyword @@ -899,6 +900,82 @@ def test_synopsis(self): synopsis = pydoc.synopsis(TESTFN, {}) self.assertEqual(synopsis, 'line 1: h\xe9') + def test_source_synopsis(self): + def check(source, expected, encoding=None): + if isinstance(source, str): + source_file = StringIO(source) + else: + source_file = io.TextIOWrapper(io.BytesIO(source), encoding=encoding) + with source_file: + result = pydoc.source_synopsis(source_file) + self.assertEqual(result, expected) + + check('"""Single line docstring."""', + 'Single line docstring.') + check('"""First line of docstring.\nSecond line.\nThird line."""', + 'First line of docstring.') + check('"""First line of docstring.\\nSecond line.\\nThird line."""', + 'First line of docstring.') + check('""" Whitespace around docstring. """', + 'Whitespace around docstring.') + check('import sys\n"""No docstring"""', + None) + check(' \n"""Docstring after empty line."""', + 'Docstring after empty line.') + check('# Comment\n"""Docstring after comment."""', + 'Docstring after comment.') + check(' # Indented comment\n"""Docstring after comment."""', + 'Docstring after comment.') + check('""""""', # Empty docstring + '') + check('', # Empty file + None) + check('"""Embedded\0null byte"""', + None) + check('"""Embedded null byte"""\0', + None) + check('"""Café and résumé."""', + 'Café and résumé.') + check("'''Triple single quotes'''", + 'Triple single quotes') + check('"Single double quotes"', + 'Single double quotes') + check("'Single single quotes'", + 'Single single quotes') + check('"""split\\\nline"""', + 'splitline') + check('"""Unrecognized escape \\sequence"""', + 'Unrecognized escape \\sequence') + check('"""Invalid escape seq\\uence"""', + None) + check('r"""Raw \\stri\\ng"""', + 'Raw \\stri\\ng') + check('b"""Bytes literal"""', + None) + check('f"""f-string"""', + None) + check('"""Concatenated""" \\\n"string" \'literals\'', + 'Concatenatedstringliterals') + check('"""String""" + """expression"""', + None) + check('("""In parentheses""")', + 'In parentheses') + check('("""Multiple lines """\n"""in parentheses""")', + 'Multiple lines in parentheses') + check('()', # tuple + None) + check(b'# coding: iso-8859-15\n"""\xa4uro sign"""', + '€uro sign', encoding='iso-8859-15') + check(b'"""\xa4"""', # Decoding error + None, encoding='utf-8') + + with tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8') as temp_file: + temp_file.write('"""Real file test."""\n') + temp_file.flush() + temp_file.seek(0) + result = pydoc.source_synopsis(temp_file) + self.assertEqual(result, "Real file test.") + @requires_docstrings def test_synopsis_sourceless(self): os = import_helper.import_fresh_module('os') diff --git a/Misc/NEWS.d/next/Library/2024-12-17-15-23-40.gh-issue-41872.31LjKY.rst b/Misc/NEWS.d/next/Library/2024-12-17-15-23-40.gh-issue-41872.31LjKY.rst new file mode 100644 index 00000000000000..b807dcb284c248 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-12-17-15-23-40.gh-issue-41872.31LjKY.rst @@ -0,0 +1,3 @@ +Fix quick extraction of module docstrings from a file in :mod:`pydoc`. +It now supports docstrings with single quotes, escape sequences, +raw string literals, and other Python syntax.
participants (1)
-
serhiy-storchaka