[Python-checkins] r87276 - in python/branches/py3k: Doc/library/difflib.rst Lib/difflib.py Lib/test/test_difflib.py

terry.reedy python-checkins at python.org
Wed Dec 15 21:18:10 CET 2010


Author: terry.reedy
Date: Wed Dec 15 21:18:10 2010
New Revision: 87276

Log:
Issue 10516, difflib: tweak the docs; test the new SequenceMatcher instance attributes (bjunk and bpopular); avoid building unneeded lists of the b2j keys and items in SequenceMatcher.__chain_b. Do not backport.


Modified:
   python/branches/py3k/Doc/library/difflib.rst
   python/branches/py3k/Lib/difflib.py
   python/branches/py3k/Lib/test/test_difflib.py

Modified: python/branches/py3k/Doc/library/difflib.rst
==============================================================================
--- python/branches/py3k/Doc/library/difflib.rst	(original)
+++ python/branches/py3k/Doc/library/difflib.rst	Wed Dec 15 21:18:10 2010
@@ -359,11 +359,11 @@
       The *autojunk* parameter.
 
    SequenceMatcher objects get three data attributes: *bjunk* is the
-   set of elements of b for which *isjunk* is True; *bpopular* is the set of non-
-   junk elements considered popular by the heuristic (if it is not disabled);
-   *b2j* is a dict mapping the remaining elements of b to a list of positions where
-   they occur. All three are reset whenever *b* is reset with :meth:`set_seqs`
-   or :meth:`set_seq2`.
+   set of elements of *b* for which *isjunk* is True; *bpopular* is the set of
+   non-junk elements considered popular by the heuristic (if it is not
+   disabled); *b2j* is a dict mapping the remaining elements of *b* to a list
+   of positions where they occur. All three are reset whenever *b* is reset
+   with :meth:`set_seqs` or :meth:`set_seq2`.
 
    .. versionadded:: 3.2
       The *bjunk* and *bpopular* attributes.

Modified: python/branches/py3k/Lib/difflib.py
==============================================================================
--- python/branches/py3k/Lib/difflib.py	(original)
+++ python/branches/py3k/Lib/difflib.py	Wed Dec 15 21:18:10 2010
@@ -320,20 +320,22 @@
         self.bjunk = junk = set()
         isjunk = self.isjunk
         if isjunk:
-            for elt in list(b2j.keys()):  # using list() since b2j is modified
+            for elt in b2j.keys():
                 if isjunk(elt):
                     junk.add(elt)
-                    del b2j[elt]
+            for elt in junk: # separate loop avoids separate list of keys
+                del b2j[elt]
 
         # Purge popular elements that are not junk
         self.bpopular = popular = set()
         n = len(b)
         if self.autojunk and n >= 200:
             ntest = n // 100 + 1
-            for elt, idxs in list(b2j.items()):
+            for elt, idxs in b2j.items():
                 if len(idxs) > ntest:
                     popular.add(elt)
-                    del b2j[elt]
+            for elt in popular: # ditto; as fast for 1% deletion
+                del b2j[elt]
 
     def isbjunk(self, item):
         "Deprecated; use 'item in SequenceMatcher().bjunk'."

Modified: python/branches/py3k/Lib/test/test_difflib.py
==============================================================================
--- python/branches/py3k/Lib/test/test_difflib.py	(original)
+++ python/branches/py3k/Lib/test/test_difflib.py	Wed Dec 15 21:18:10 2010
@@ -12,12 +12,14 @@
         self.assertEqual(list(sm.get_opcodes()),
             [   ('insert', 0, 0, 0, 1),
                 ('equal', 0, 100, 1, 101)])
+        self.assertEqual(sm.bpopular, set())
         sm = difflib.SequenceMatcher(None, 'b' * 100, 'b' * 50 + 'a' + 'b' * 50)
         self.assertAlmostEqual(sm.ratio(), 0.995, places=3)
         self.assertEqual(list(sm.get_opcodes()),
             [   ('equal', 0, 50, 0, 50),
                 ('insert', 50, 50, 50, 51),
                 ('equal', 50, 100, 51, 101)])
+        self.assertEqual(sm.bpopular, set())
 
     def test_one_delete(self):
         sm = difflib.SequenceMatcher(None, 'a' * 40 + 'c' + 'b' * 40, 'a' * 40 + 'b' * 40)
@@ -27,6 +29,19 @@
                 ('delete', 40, 41, 40, 40),
                 ('equal', 41, 81, 40, 80)])
 
+    def test_bjunk(self):
+        sm = difflib.SequenceMatcher(isjunk=lambda x: x == ' ',
+                a='a' * 40 + 'b' * 40, b='a' * 44 + 'b' * 40)
+        self.assertEqual(sm.bjunk, set())
+
+        sm = difflib.SequenceMatcher(isjunk=lambda x: x == ' ',
+                a='a' * 40 + 'b' * 40, b='a' * 44 + 'b' * 40 + ' ' * 20)
+        self.assertEqual(sm.bjunk, {' '})
+
+        sm = difflib.SequenceMatcher(isjunk=lambda x: x in [' ', 'b'],
+                a='a' * 40 + 'b' * 40, b='a' * 44 + 'b' * 40 + ' ' * 20)
+        self.assertEqual(sm.bjunk, {' ', 'b'})
+
 
 class TestAutojunk(unittest.TestCase):
     """Tests for the autojunk parameter added in 2.7"""
@@ -38,10 +53,12 @@
 
         sm = difflib.SequenceMatcher(None, seq1, seq2)
         self.assertAlmostEqual(sm.ratio(), 0, places=3)
+        self.assertEqual(sm.bpopular, {'b'})
 
         # Now turn the heuristic off
         sm = difflib.SequenceMatcher(None, seq1, seq2, autojunk=False)
         self.assertAlmostEqual(sm.ratio(), 0.9975, places=3)
+        self.assertEqual(sm.bpopular, set())
 
 
 class TestSFbugs(unittest.TestCase):


More information about the Python-checkins mailing list