[Python-checkins] r46413 - sandbox/trunk/stringbench/stringbench.py

Fri May 26 22:17:10 CEST 2006

Author: andrew.dalke
Date: Fri May 26 22:17:09 2006
New Revision: 46413

Modified:
   sandbox/trunk/stringbench/stringbench.py
Log:
Added more tests to stress various string operations.

Added command-line options:
  -8 or --8-bit    only time 8-bit strings
  -u or --unicode  only time unicode strings
  -R or --skip-re  skip doing the regular expression benchmarks



Modified: sandbox/trunk/stringbench/stringbench.py
==============================================================================

--- sandbox/trunk/stringbench/stringbench.py	(original)
+++ sandbox/trunk/stringbench/stringbench.py	Fri May 26 22:17:09 2006
@@ -7,18 +7,33 @@
 import re
 import sys
 import datetime
+import optparse
 
 print sys.version
 print datetime.datetime.now()
 
 REPEAT = 1
 REPEAT = 3
+#REPEAT = 7
 
 if __name__ != "__main__":
     raise SystemExit("Must run as main program")
 
+parser = optparse.OptionParser()
+parser.add_option("-R", "--skip-re", dest="skip_re",
+                  action="store_true",
+                  help="skip regular expression tests")
+parser.add_option("-8", "--8-bit", dest="str_only",
+                  action="store_true",
+                  help="only do 8-bit string benchmarks")
+parser.add_option("-u", "--unicode", dest="unicode_only",
+                  action="store_true",
+                  help="only do Unicode string benchmarks")
+
+
 _RANGE_1000 = range(1000)
 _RANGE_100 = range(100)
+_RANGE_10 = range(10)
 
 def bench(s, group, repeat_count):
     def blah(f):
@@ -29,6 +44,9 @@
         return f
     return blah
 
+def uses_re(f):
+    f.uses_re = True
+
 ####### 'in' comparisons
 
 @bench('"A" in "A"*1000', "early match, single character", 1000)
@@ -77,6 +95,7 @@
         s2 in s1
 
 # Try with regex
+@uses_re
 @bench('s="ABC"*33; re.compile(s+"D").search((s+"D")*500+s+"E")',
        "late match, 100 characters", 100)
 def re_test_slow_match_100_characters(STR):
@@ -91,7 +110,7 @@
 
 #### same tests as 'in' but use 'find'
 
-# Add rfind
+# XXX: TODO: Add rfind
 
 
 
@@ -472,21 +491,40 @@
 ## split text on "--" characters
 @bench(
     '"this--is--a--test--of--the--emergency--broadcast--system".split("--")',
-    "split on multicharacter seperator", 1000)
-def split_multichar_sep(STR):
+    "split on multicharacter separator (small)", 1000)
+def split_multichar_sep_small(STR):
     s = STR("this--is--a--test--of--the--emergency--broadcast--system")
     s_split = s.split
     for x in _RANGE_1000:
         s_split("--")
 @bench(
     '"this--is--a--test--of--the--emergency--broadcast--system".rsplit("--")',
-    "split on multicharacter seperator", 1000)
-def rsplit_multichar_sep(STR):
+    "split on multicharacter separator (small)", 1000)
+def rsplit_multichar_sep_small(STR):
     s = STR("this--is--a--test--of--the--emergency--broadcast--system")
     s_rsplit = s.rsplit
     for x in _RANGE_1000:
         s_rsplit("--")
 
+## split dna text on "ACTAT" characters
+@bench('dna.split("ACTAT")',
+       "split on multicharacter separator (dna)", 100)
+def split_multichar_sep_dna(STR):
+    s = _get_dna(STR)
+    s_split = s.split
+    for x in _RANGE_100:
+        s_split("ACTAT")
+        
+@bench('dna.rsplit("ACTAT")',
+       "split on multicharacter separator (dna)", 100)
+def rsplit_multichar_sep_dna(STR):
+    s = _get_dna(STR)
+    s_rsplit = s.rsplit
+    for x in _RANGE_100:
+        s_rsplit("ACTAT")
+
+
+
 ## split with limits
 
 GFF3_example = "\t".join([
@@ -739,6 +777,7 @@
     for x in _RANGE_1000:
         s_replace(from_str, to_str)
 
+@uses_re
 @bench('re.sub(" ", "\\t", "This is a test"', 'replace single character',
        1000)
 def replace_single_character_re(STR):
@@ -759,6 +798,7 @@
     for x in _RANGE_100:
         s_replace(from_str, to_str)
 
+@uses_re
 @bench('re.sub("\\n", " ", "...text.with.2000.lines...")',
        'replace single character, big string', 100)
 def replace_single_character_big_re(STR):
@@ -815,6 +855,38 @@
         s_replace(from_str, to_str)
 
 
+big_s = "A" + ("Z"*1024*1024)
+big_s_unicode = unicode(big_s)
+def _get_big_s(STR):
+    if STR is unicode: return big_s_unicode
+    if STR is str: return big_s
+    raise AssertionError
+
+# The older replace implementation counted all matches in
+# the string even when it only needed to make one replacement.
+@bench('("A" + ("Z"*1024*1024)).replace("A", "BB", 1)',
+       'quick replace single character match', 10)
+def quick_replace_single_match(STR):
+    s = _get_big_s(STR)
+    from_str = STR("A")
+    to_str = STR("BB")
+    s_replace = s.replace
+    for x in _RANGE_10:
+        s_replace(from_str, to_str, 1)
+
+@bench('("A" + ("Z"*1024*1024)).replace("AZZ", "BBZZ", 1)',
+       'quick replace multiple character match', 10)
+def quick_replace_multiple_match(STR):
+    s = _get_big_s(STR)
+    from_str = STR("AZZ")
+    to_str = STR("BBZZ")
+    s_replace = s.replace
+    for x in _RANGE_10:
+        s_replace(from_str, to_str, 1)
+
+
+####
+
 # CCP does a lot of this, for internationalisation of ingame messages.
 _format = "The %(thing)s is %(place)s the %(location)s."
 _format_dict = { "thing":"THING", "place":"PLACE", "location":"LOCATION", }
@@ -902,7 +974,9 @@
         return min(times) / number
 
 def main():
-    test_names = sys.argv[1:]
+    (options, test_names) = parser.parse_args()
+    if options.str_only and options.unicode_only:
+        raise SystemExit("Only one of --8-bit and --unicode are allowed")
     
     bench_functions = []
     for (k,v) in globals().items():
@@ -914,6 +988,9 @@
                 else:
                     # Not selected, ignore
                     continue
+            if options.skip_re and hasattr(v, "uses_re"):
+                continue
+            
             bench_functions.append( (v.group, k, v) )
     bench_functions.sort()
 
@@ -927,12 +1004,22 @@
         print "="*10, title
         for (_, k, v) in group:
             if hasattr(v, "is_bench"):
-                str_time = BenchTimer("__main__.%s(str)" % (k,),
-                                      "import __main__").best(REPEAT)
-                uni_time = BenchTimer("__main__.%s(unicode)" % (k,),
-                                      "import __main__").best(REPEAT)
+                if not options.unicode_only:
+                    str_time = BenchTimer("__main__.%s(str)" % (k,),
+                                          "import __main__").best(REPEAT)
+                else:
+                    str_time = 0.0
+                if not options.str_only:
+                    uni_time = BenchTimer("__main__.%s(unicode)" % (k,),
+                                          "import __main__").best(REPEAT)
+                else:
+                    uni_time = 0.0
+                try:
+                    average = str_time/uni_time
+                except ZeroDivisionError:
+                    average = 0.0
                 print "%.2f\t%.2f\t%.1f\t%s (*%d)" % (
-                    1000*str_time, 1000*uni_time, 100.*str_time/uni_time,
+                    1000*str_time, 1000*uni_time, 100.*average,
                     v.comment, v.repeat_count)
 
                 str_total += str_time
@@ -941,8 +1028,12 @@
     if str_total == uni_total == 0.0:
         print "That was zippy!"
     else:
+        try:
+            average = str_time/uni_time
+        except ZeroDivisionError:
+            average = 0.0
         print "%.2f\t%.2f\t%.1f\t%s" % (
-            1000*str_total, 1000*uni_total, 100.*str_total/uni_total,
+            1000*str_total, 1000*uni_total, 100.*average,
             "TOTAL")
 
 if __name__ == "__main__":