[Python-checkins] r83173 - in python/branches/py3k: Lib/re.py Lib/test/test_re.py Misc/NEWS

gregory.p.smith python-checkins at python.org
Tue Jul 27 07:31:29 CEST 2010


Author: gregory.p.smith
Date: Tue Jul 27 07:31:29 2010
New Revision: 83173

Log:
The default size of the re module's compiled regular expression cache has
been increased from 100 to 500, and the cache replacement policy has changed
from clearing the entire cache on overflow to randomly discarding 20% of
the existing cached compiled regular expressions.  This is a performance
win for applications that use many regular expressions, and it limits the
impact of the performance hit whenever the cache limit is exceeded.
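
For a rough sense of the new behaviour, the snippet below fills the cache to
its limit and then overflows it by one entry.  It is not part of the patch and
pokes at private internals shown in the diff below (re._cache, re._MAXCACHE,
re._compile), so treat it as an illustrative sketch rather than a supported
API:

    import re

    re._cache.clear()
    # Fill the compiled-pattern cache up to its new 500-entry limit.
    for i in range(re._MAXCACHE):
        re._compile('pattern%d' % i, 0)
    assert len(re._cache) == re._MAXCACHE

    # One more compilation overflows the cache.  Previously the entire cache
    # was cleared at this point; now roughly 20% of the entries are evicted
    # at random, so most of the cache survives the overflow.
    re._compile('one more pattern', 0)
    print(len(re._cache))   # about 401 instead of 1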


Modified:
   python/branches/py3k/Lib/re.py
   python/branches/py3k/Lib/test/test_re.py
   python/branches/py3k/Misc/NEWS

Modified: python/branches/py3k/Lib/re.py
==============================================================================
--- python/branches/py3k/Lib/re.py	(original)
+++ python/branches/py3k/Lib/re.py	Tue Jul 27 07:31:29 2010
@@ -254,7 +254,40 @@
 
 _pattern_type = type(sre_compile.compile("", 0))
 
-_MAXCACHE = 100
+_MAXCACHE = 500
+
+def _shrink_cache(cache_dict, max_length, divisor=5):
+    """Make room in the given cache.
+
+    Args:
+        cache_dict: The cache dictionary to modify.
+        max_length: Maximum # of entries in cache_dict before it is shrunk.
+        divisor: Cache will shrink to max_length - 1/divisor*max_length items.
+    """
+    # Toss out a fraction of the entries at random to make room for new ones.
+    # A random algorithm was chosen as opposed to simply cache_dict.popitem()
+    # as popitem could penalize the same regular expression repeatedly based
+    # on its internal hash value.  Being random should spread the cache miss
+    # love around.
+    cache_keys = tuple(cache_dict.keys())
+    overage = len(cache_keys) - max_length
+    if overage < 0:
+        # Cache is already within limits.  Normally this should not happen
+        # but it could due to multithreading.
+        return
+    number_to_toss = max_length // divisor + overage
+    # The import is done here to avoid a circular dependency.
+    import random
+    if not hasattr(random, 'sample'):
+        # Do nothing while resolving the circular dependency:
+        #  re->random->warnings->tokenize->string->re
+        return
+    for doomed_key in random.sample(cache_keys, number_to_toss):
+        try:
+            del cache_dict[doomed_key]
+        except KeyError:
+            # Ignore problems if the cache changed from another thread.
+            pass
 
 def _compile(*key):
     # internal: compile pattern
@@ -272,7 +305,7 @@
         raise TypeError("first argument must be string or compiled pattern")
     p = sre_compile.compile(pattern, flags)
     if len(_cache) >= _MAXCACHE:
-        _cache.clear()
+        _shrink_cache(_cache, _MAXCACHE)
     _cache[cachekey] = p
     return p
 
@@ -284,7 +317,7 @@
     repl, pattern = key
     p = sre_parse.parse_template(repl, pattern)
     if len(_cache_repl) >= _MAXCACHE:
-        _cache_repl.clear()
+        _shrink_cache(_cache_repl, _MAXCACHE)
     _cache_repl[key] = p
     return p
 

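As a quick sanity check of the shrink arithmetic described in the
_shrink_cache docstring above (this mirrors the tests added to test_re.py
below and is not part of the patch): with the default divisor of 5, a cache
that is exactly at its limit loses max_length // 5 entries, i.e. 20%.

    import re

    cache = {i: i for i in range(500)}   # toy cache exactly at the 500 limit
    re._shrink_cache(cache, 500)         # default divisor=5 tosses 500 // 5 keys
    print(len(cache))                    # 400 -- 100 entries evicted at random
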
Modified: python/branches/py3k/Lib/test/test_re.py
==============================================================================
--- python/branches/py3k/Lib/test/test_re.py	(original)
+++ python/branches/py3k/Lib/test/test_re.py	Tue Jul 27 07:31:29 2010
@@ -874,8 +874,71 @@
                 if result is None:
                     print('=== Fails on unicode-sensitive match', t)
 
+
+class ReCacheTests(unittest.TestCase):
+    """These tests are specific to the re._shrink_cache implementation."""
+
+    def setUp(self):
+        self._orig_maxcache = re._MAXCACHE
+
+    def tearDown(self):
+        re._MAXCACHE = self._orig_maxcache
+
+    def test_compile_cache_overflow(self):
+        # NOTE: If a profiler or debugger is tracing code and compiling
+        # regular expressions while tracing through this test... expect
+        # the test to fail.  This test is not concurrency safe.
+
+        # Explicitly fill the caches.
+        re._MAXCACHE = 20
+        max_cache = re._MAXCACHE
+        unique_chars = tuple(chr(char_num) for char_num in
+                             range(b'a'[0], b'a'[0]+max_cache))
+        re._cache.clear()
+        for char in unique_chars:
+            re._compile(char, 0)
+        self.assertEqual(max_cache, len(re._cache))
+        re._cache_repl.clear()
+        for char in unique_chars:
+            re._compile_repl(char*2, char)
+        self.assertEqual(max_cache, len(re._cache_repl))
+
+        # Overflow both caches and make sure they have extra room left
+        # afterwards as well as having more than a single entry.
+        re._compile('A', 0)
+        self.assertLess(len(re._cache), max_cache)
+        self.assertGreater(len(re._cache), 1)
+        re._compile_repl('A', 'A')
+        self.assertLess(len(re._cache_repl), max_cache)
+        self.assertGreater(len(re._cache_repl), 1)
+
+    def test_shrink_cache_at_limit(self):
+        cache = dict(zip(range(6), range(6)))
+        re._shrink_cache(cache, 6, divisor=3)
+        self.assertEqual(4, len(cache))
+
+    def test_shrink_cache_empty(self):
+        cache = {}
+        re._shrink_cache(cache, 6, divisor=3)
+        # Cache was empty; make sure we didn't raise an exception.
+        self.assertEqual(0, len(cache))
+
+    def test_shrink_cache_overflowing(self):
+        cache = dict(zip(range(6), range(6)))
+        re._shrink_cache(cache, 4, divisor=2)
+        # Cache was larger than the maximum; make sure it shrank below that.
+        self.assertEqual(2, len(cache))
+
+    def test_shrink_cache_underflow(self):
+        cache = dict(zip(range(6), range(6)))
+        # No shrinking to do.
+        re._shrink_cache(cache, 9, divisor=3)
+        self.assertEqual(6, len(cache))
+
+
 def test_main():
     run_unittest(ReTests)
+    run_unittest(ReCacheTests)
     run_re_tests()
 
 if __name__ == "__main__":

Modified: python/branches/py3k/Misc/NEWS
==============================================================================
--- python/branches/py3k/Misc/NEWS	(original)
+++ python/branches/py3k/Misc/NEWS	Tue Jul 27 07:31:29 2010
@@ -473,6 +473,13 @@
 Library
 -------
 
+- The default size of the re module's compiled regular expression cache has
+  been increased from 100 to 500, and the cache replacement policy has changed
+  from clearing the entire cache on overflow to randomly discarding 20% of
+  the existing cached compiled regular expressions.  This is a performance
+  win for applications that use many regular expressions, and it limits the
+  impact of the performance hit whenever the cache limit is exceeded.
+
 - Issue #7113: Speed up loading in configparser. Patch by Łukasz Langa.
 
 - Issue #9032: XML-RPC client retries the request on EPIPE error. The EPIPE

