[Python-checkins] bpo-38115: Deal with invalid bytecode offsets in lnotab (GH-16079)

Sat Sep 28 10:49:19 EDT 2019

https://github.com/python/cpython/commit/c8165036f374cd2ee64d4314eeb2514f7acb5026
commit: c8165036f374cd2ee64d4314eeb2514f7acb5026
branch: master
author: T. Wouters <thomas at python.org>
committer: Gregory P. Smith <greg at krypto.org>
date: 2019-09-28T07:49:15-07:00
summary:

bpo-38115: Deal with invalid bytecode offsets in lnotab (GH-16079)

Document that lnotab can contain invalid bytecode offsets (because of
terrible reasons that are difficult to fix). Make dis.findlinestarts()
ignore invalid offsets in lnotab. All other uses of lnotab in CPython
(various reimplementations of addr2line or line2addr in Python, C and gdb)
already ignore such offsets, because they search lnotab for a given
address instead of yielding every entry.

Add tests for the result of dis.findlinestarts() on wacky constructs in
test_peepholer.py, because it's the easiest place to add them.

files:
A Misc/NEWS.d/next/Library/2019-09-13-09-24-58.bpo-38115.BOO-Y1.rst
M Lib/dis.py
M Lib/test/test_peepholer.py
M Objects/lnotab_notes.txt

diff --git a/Lib/dis.py b/Lib/dis.py
index a25fb2b41764..10e5f7fb08ab 100644
--- a/Lib/dis.py
+++ b/Lib/dis.py
@@ -454,6 +454,7 @@ def findlinestarts(code):
     """
     byte_increments = code.co_lnotab[0::2]
     line_increments = code.co_lnotab[1::2]
+    bytecode_len = len(code.co_code)
 
     lastlineno = None
     lineno = code.co_firstlineno
@@ -464,6 +465,10 @@ def findlinestarts(code):
                 yield (addr, lineno)
                 lastlineno = lineno
             addr += byte_incr
+            if addr >= bytecode_len:
+                # The rest of the lnotab byte offsets are past the end of
+                # the bytecode, so the lines were optimized away.
+                return
         if line_incr >= 0x80:
             # line_increments is an array of 8-bit signed integers
             line_incr -= 0x100
diff --git a/Lib/test/test_peepholer.py b/Lib/test/test_peepholer.py
index 47dee33076c5..23cc36c60537 100644
--- a/Lib/test/test_peepholer.py
+++ b/Lib/test/test_peepholer.py
@@ -40,6 +40,20 @@ def check_jump_targets(self, code):
                 self.fail(f'{instr.opname} at {instr.offset} '
                           f'jumps to {tgt.opname} at {tgt.offset}')
 
+    def check_lnotab(self, code):
+        "Check that the lnotab byte offsets are sensible."
+        code = dis._get_code_object(code)
+        lnotab = list(dis.findlinestarts(code))
+        # Don't bother checking if the line info is sensible, because
+        # most of the line info we can get at comes from lnotab.
+        min_bytecode = min(t[0] for t in lnotab)
+        max_bytecode = max(t[0] for t in lnotab)
+        self.assertGreaterEqual(min_bytecode, 0)
+        self.assertLess(max_bytecode, len(code.co_code))
+        # This could conceivably test more (and probably should, as there
+        # aren't very many tests of lnotab), if peepholer wasn't scheduled
+        # to be replaced anyway.
+
     def test_unot(self):
         # UNARY_NOT POP_JUMP_IF_FALSE  -->  POP_JUMP_IF_TRUE'
         def unot(x):
@@ -48,6 +62,7 @@ def unot(x):
         self.assertNotInBytecode(unot, 'UNARY_NOT')
         self.assertNotInBytecode(unot, 'POP_JUMP_IF_FALSE')
         self.assertInBytecode(unot, 'POP_JUMP_IF_TRUE')
+        self.check_lnotab(unot)
 
     def test_elim_inversion_of_is_or_in(self):
         for line, cmp_op in (
@@ -58,6 +73,7 @@ def test_elim_inversion_of_is_or_in(self):
             ):
             code = compile(line, '', 'single')
             self.assertInBytecode(code, 'COMPARE_OP', cmp_op)
+            self.check_lnotab(code)
 
     def test_global_as_constant(self):
         # LOAD_GLOBAL None/True/False  -->  LOAD_CONST None/True/False
@@ -75,6 +91,7 @@ def h():
         for func, elem in ((f, None), (g, True), (h, False)):
             self.assertNotInBytecode(func, 'LOAD_GLOBAL')
             self.assertInBytecode(func, 'LOAD_CONST', elem)
+            self.check_lnotab(func)
 
         def f():
             'Adding a docstring made this test fail in Py2.5.0'
@@ -82,6 +99,7 @@ def f():
 
         self.assertNotInBytecode(f, 'LOAD_GLOBAL')
         self.assertInBytecode(f, 'LOAD_CONST', None)
+        self.check_lnotab(f)
 
     def test_while_one(self):
         # Skip over:  LOAD_CONST trueconst  POP_JUMP_IF_FALSE xx
@@ -93,6 +111,7 @@ def f():
             self.assertNotInBytecode(f, elem)
         for elem in ('JUMP_ABSOLUTE',):
             self.assertInBytecode(f, elem)
+        self.check_lnotab(f)
 
     def test_pack_unpack(self):
         for line, elem in (
@@ -104,6 +123,7 @@ def test_pack_unpack(self):
             self.assertInBytecode(code, elem)
             self.assertNotInBytecode(code, 'BUILD_TUPLE')
             self.assertNotInBytecode(code, 'UNPACK_TUPLE')
+            self.check_lnotab(code)
 
     def test_folding_of_tuples_of_constants(self):
         for line, elem in (
@@ -116,6 +136,7 @@ def test_folding_of_tuples_of_constants(self):
             code = compile(line,'','single')
             self.assertInBytecode(code, 'LOAD_CONST', elem)
             self.assertNotInBytecode(code, 'BUILD_TUPLE')
+            self.check_lnotab(code)
 
         # Long tuples should be folded too.
         code = compile(repr(tuple(range(10000))),'','single')
@@ -124,6 +145,7 @@ def test_folding_of_tuples_of_constants(self):
         load_consts = [instr for instr in dis.get_instructions(code)
                               if instr.opname == 'LOAD_CONST']
         self.assertEqual(len(load_consts), 2)
+        self.check_lnotab(code)
 
         # Bug 1053819:  Tuple of constants misidentified when presented with:
         # . . . opcode_with_arg 100   unary_opcode   BUILD_TUPLE 1  . . .
@@ -141,6 +163,7 @@ def crater():
                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
             ],)
+        self.check_lnotab(crater)
 
     def test_folding_of_lists_of_constants(self):
         for line, elem in (
@@ -153,6 +176,7 @@ def test_folding_of_lists_of_constants(self):
             code = compile(line, '', 'single')
             self.assertInBytecode(code, 'LOAD_CONST', elem)
             self.assertNotInBytecode(code, 'BUILD_LIST')
+            self.check_lnotab(code)
 
     def test_folding_of_sets_of_constants(self):
         for line, elem in (
@@ -166,6 +190,7 @@ def test_folding_of_sets_of_constants(self):
             code = compile(line, '', 'single')
             self.assertNotInBytecode(code, 'BUILD_SET')
             self.assertInBytecode(code, 'LOAD_CONST', elem)
+            self.check_lnotab(code)
 
         # Ensure that the resulting code actually works:
         def f(a):
@@ -176,9 +201,11 @@ def g(a):
 
         self.assertTrue(f(3))
         self.assertTrue(not f(4))
+        self.check_lnotab(f)
 
         self.assertTrue(not g(3))
         self.assertTrue(g(4))
+        self.check_lnotab(g)
 
 
     def test_folding_of_binops_on_constants(self):
@@ -203,41 +230,50 @@ def test_folding_of_binops_on_constants(self):
             self.assertInBytecode(code, 'LOAD_CONST', elem)
             for instr in dis.get_instructions(code):
                 self.assertFalse(instr.opname.startswith('BINARY_'))
+            self.check_lnotab(code)
 
         # Verify that unfoldables are skipped
         code = compile('a=2+"b"', '', 'single')
         self.assertInBytecode(code, 'LOAD_CONST', 2)
         self.assertInBytecode(code, 'LOAD_CONST', 'b')
+        self.check_lnotab(code)
 
         # Verify that large sequences do not result from folding
         code = compile('a="x"*10000', '', 'single')
         self.assertInBytecode(code, 'LOAD_CONST', 10000)
         self.assertNotIn("x"*10000, code.co_consts)
+        self.check_lnotab(code)
         code = compile('a=1<<1000', '', 'single')
         self.assertInBytecode(code, 'LOAD_CONST', 1000)
         self.assertNotIn(1<<1000, code.co_consts)
+        self.check_lnotab(code)
         code = compile('a=2**1000', '', 'single')
         self.assertInBytecode(code, 'LOAD_CONST', 1000)
         self.assertNotIn(2**1000, code.co_consts)
+        self.check_lnotab(code)
 
     def test_binary_subscr_on_unicode(self):
         # valid code get optimized
         code = compile('"foo"[0]', '', 'single')
         self.assertInBytecode(code, 'LOAD_CONST', 'f')
         self.assertNotInBytecode(code, 'BINARY_SUBSCR')
+        self.check_lnotab(code)
         code = compile('"\u0061\uffff"[1]', '', 'single')
         self.assertInBytecode(code, 'LOAD_CONST', '\uffff')
         self.assertNotInBytecode(code,'BINARY_SUBSCR')
+        self.check_lnotab(code)
 
         # With PEP 393, non-BMP char get optimized
         code = compile('"\U00012345"[0]', '', 'single')
         self.assertInBytecode(code, 'LOAD_CONST', '\U00012345')
         self.assertNotInBytecode(code, 'BINARY_SUBSCR')
+        self.check_lnotab(code)
 
         # invalid code doesn't get optimized
         # out of range
         code = compile('"fuu"[10]', '', 'single')
         self.assertInBytecode(code, 'BINARY_SUBSCR')
+        self.check_lnotab(code)
 
     def test_folding_of_unaryops_on_constants(self):
         for line, elem in (
@@ -252,13 +288,15 @@ def test_folding_of_unaryops_on_constants(self):
             self.assertInBytecode(code, 'LOAD_CONST', elem)
             for instr in dis.get_instructions(code):
                 self.assertFalse(instr.opname.startswith('UNARY_'))
+            self.check_lnotab(code)
 
         # Check that -0.0 works after marshaling
         def negzero():
             return -(1.0-1.0)
 
-        for instr in dis.get_instructions(code):
+        for instr in dis.get_instructions(negzero):
             self.assertFalse(instr.opname.startswith('UNARY_'))
+        self.check_lnotab(negzero)
 
         # Verify that unfoldables are skipped
         for line, elem, opname in (
@@ -268,6 +306,7 @@ def negzero():
             code = compile(line, '', 'single')
             self.assertInBytecode(code, 'LOAD_CONST', elem)
             self.assertInBytecode(code, opname)
+            self.check_lnotab(code)
 
     def test_elim_extra_return(self):
         # RETURN LOAD_CONST None RETURN  -->  RETURN
@@ -277,6 +316,7 @@ def f(x):
         returns = [instr for instr in dis.get_instructions(f)
                           if instr.opname == 'RETURN_VALUE']
         self.assertEqual(len(returns), 1)
+        self.check_lnotab(f)
 
     def test_elim_jump_to_return(self):
         # JUMP_FORWARD to RETURN -->  RETURN
@@ -290,6 +330,7 @@ def f(cond, true_value, false_value):
         returns = [instr for instr in dis.get_instructions(f)
                           if instr.opname == 'RETURN_VALUE']
         self.assertEqual(len(returns), 2)
+        self.check_lnotab(f)
 
     def test_elim_jump_to_uncond_jump(self):
         # POP_JUMP_IF_FALSE to JUMP_FORWARD --> POP_JUMP_IF_FALSE to non-jump
@@ -302,6 +343,7 @@ def f():
             else:
                 baz()
         self.check_jump_targets(f)
+        self.check_lnotab(f)
 
     def test_elim_jump_to_uncond_jump2(self):
         # POP_JUMP_IF_FALSE to JUMP_ABSOLUTE --> POP_JUMP_IF_FALSE to non-jump
@@ -312,6 +354,7 @@ def f():
                     or d):
                     a = foo()
         self.check_jump_targets(f)
+        self.check_lnotab(f)
 
     def test_elim_jump_to_uncond_jump3(self):
         # Intentionally use two-line expressions to test issue37213.
@@ -320,18 +363,21 @@ def f(a, b, c):
             return ((a and b)
                     and c)
         self.check_jump_targets(f)
+        self.check_lnotab(f)
         self.assertEqual(count_instr_recursively(f, 'JUMP_IF_FALSE_OR_POP'), 2)
         # JUMP_IF_TRUE_OR_POP to JUMP_IF_TRUE_OR_POP --> JUMP_IF_TRUE_OR_POP to non-jump
         def f(a, b, c):
             return ((a or b)
                     or c)
         self.check_jump_targets(f)
+        self.check_lnotab(f)
         self.assertEqual(count_instr_recursively(f, 'JUMP_IF_TRUE_OR_POP'), 2)
         # JUMP_IF_FALSE_OR_POP to JUMP_IF_TRUE_OR_POP --> POP_JUMP_IF_FALSE to non-jump
         def f(a, b, c):
             return ((a and b)
                     or c)
         self.check_jump_targets(f)
+        self.check_lnotab(f)
         self.assertNotInBytecode(f, 'JUMP_IF_FALSE_OR_POP')
         self.assertInBytecode(f, 'JUMP_IF_TRUE_OR_POP')
         self.assertInBytecode(f, 'POP_JUMP_IF_FALSE')
@@ -340,6 +386,7 @@ def f(a, b, c):
             return ((a or b)
                     and c)
         self.check_jump_targets(f)
+        self.check_lnotab(f)
         self.assertNotInBytecode(f, 'JUMP_IF_TRUE_OR_POP')
         self.assertInBytecode(f, 'JUMP_IF_FALSE_OR_POP')
         self.assertInBytecode(f, 'POP_JUMP_IF_TRUE')
@@ -360,6 +407,7 @@ def f(cond1, cond2):
         returns = [instr for instr in dis.get_instructions(f)
                           if instr.opname == 'RETURN_VALUE']
         self.assertLessEqual(len(returns), 6)
+        self.check_lnotab(f)
 
     def test_elim_jump_after_return2(self):
         # Eliminate dead code: jumps immediately after returns can't be reached
@@ -374,6 +422,7 @@ def f(cond1, cond2):
         returns = [instr for instr in dis.get_instructions(f)
                           if instr.opname == 'RETURN_VALUE']
         self.assertLessEqual(len(returns), 2)
+        self.check_lnotab(f)
 
     def test_make_function_doesnt_bail(self):
         def f():
@@ -381,6 +430,7 @@ def g()->1+1:
                 pass
             return g
         self.assertNotInBytecode(f, 'BINARY_ADD')
+        self.check_lnotab(f)
 
     def test_constant_folding(self):
         # Issue #11244: aggressive constant folding.
@@ -401,17 +451,20 @@ def test_constant_folding(self):
                 self.assertFalse(instr.opname.startswith('UNARY_'))
                 self.assertFalse(instr.opname.startswith('BINARY_'))
                 self.assertFalse(instr.opname.startswith('BUILD_'))
+            self.check_lnotab(code)
 
     def test_in_literal_list(self):
         def containtest():
             return x in [a, b]
         self.assertEqual(count_instr_recursively(containtest, 'BUILD_LIST'), 0)
+        self.check_lnotab(containtest)
 
     def test_iterate_literal_list(self):
         def forloop():
             for x in [a, b]:
                 pass
         self.assertEqual(count_instr_recursively(forloop, 'BUILD_LIST'), 0)
+        self.check_lnotab(forloop)
 
     def test_condition_with_binop_with_bools(self):
         def f():
@@ -419,6 +472,7 @@ def f():
                 return 1
             return 0
         self.assertEqual(f(), 1)
+        self.check_lnotab(f)
 
     def test_if_with_if_expression(self):
         # Check bpo-37289
@@ -427,6 +481,19 @@ def f(x):
                 return True
             return False
         self.assertTrue(f(True))
+        self.check_lnotab(f)
+
+    def test_trailing_nops(self):
+        # Check the lnotab of a function that even after trivial
+        # optimization has trailing nops, which the lnotab adjustment has to
+        # handle properly (bpo-38115).
+        def f(x):
+            while 1:
+                return 3
+            while 1:
+                return 5
+            return 6
+        self.check_lnotab(f)
 
 
 class TestBuglets(unittest.TestCase):
diff --git a/Misc/NEWS.d/next/Library/2019-09-13-09-24-58.bpo-38115.BOO-Y1.rst b/Misc/NEWS.d/next/Library/2019-09-13-09-24-58.bpo-38115.BOO-Y1.rst
new file mode 100644
index 000000000000..5119c0546e36
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-09-13-09-24-58.bpo-38115.BOO-Y1.rst
@@ -0,0 +1 @@
+Fix a bug in dis.findlinestarts() where it would return invalid bytecode offsets. Document that a code object's co_lnotab can contain invalid bytecode offsets.
\ No newline at end of file
diff --git a/Objects/lnotab_notes.txt b/Objects/lnotab_notes.txt
index 3dab2b986616..71a297971828 100644
--- a/Objects/lnotab_notes.txt
+++ b/Objects/lnotab_notes.txt
@@ -3,7 +3,9 @@ All about co_lnotab, the line number table.
 Code objects store a field named co_lnotab.  This is an array of unsigned bytes
 disguised as a Python bytes object.  It is used to map bytecode offsets to
 source code line #s for tracebacks and to identify line number boundaries for
-line tracing.
+line tracing. Because of internals of the peephole optimizer, it's possible
+for lnotab to contain bytecode offsets that are no longer valid (for example
+if the optimizer removed the last line in a function).
 
 The array is conceptually a compressed list of
     (bytecode offset increment, line number increment)