[Python-checkins] r79423 - sandbox/trunk/stringbench/stringbench.py

Thu Mar 25 19:05:25 CET 2010

Author: antoine.pitrou
Date: Thu Mar 25 19:05:24 2010
New Revision: 79423

Log:
Make stringbench compatible with both Python 2.x and 3.x.
Also, make the total running time much smaller by shortening iterations
(the timing calibration threshold in best() drops from 0.2 to 0.02 seconds).



Modified:
   sandbox/trunk/stringbench/stringbench.py

Modified: sandbox/trunk/stringbench/stringbench.py
==============================================================================

--- sandbox/trunk/stringbench/stringbench.py	(original)
+++ sandbox/trunk/stringbench/stringbench.py	Thu Mar 25 19:05:24 2010
@@ -1,5 +1,6 @@
 
 # Various microbenchmarks comparing unicode and byte string performance
+# Please keep this file both 2.x and 3.x compatible!
 
 import timeit
 import itertools
@@ -11,9 +12,23 @@
 
 VERSION = '2.0'
 
-print 'stringbench v%s' % VERSION
-print sys.version
-print datetime.datetime.now()
+def p(*args):
+    sys.stdout.write(' '.join(str(s) for s in args) + '\n')
+
+if sys.version_info >= (3,):
+    BYTES = bytes_from_str = lambda x: x.encode('ascii')
+    UNICODE = unicode_from_str = lambda x: x
+else:
+    BYTES = bytes_from_str = lambda x: x
+    UNICODE = unicode_from_str = lambda x: x.decode('ascii')
+
+class UnsupportedType(TypeError):
+    pass
+
+
+p('stringbench v%s' % VERSION)
+p(sys.version)
+p(datetime.datetime.now())
 
 REPEAT = 1
 REPEAT = 3
@@ -26,7 +41,7 @@
 parser.add_option("-R", "--skip-re", dest="skip_re",
                   action="store_true",
                   help="skip regular expression tests")
-parser.add_option("-8", "--8-bit", dest="str_only",
+parser.add_option("-8", "--8-bit", dest="bytes_only",
                   action="store_true",
                   help="only do 8-bit string benchmarks")
 parser.add_option("-u", "--unicode", dest="unicode_only",
@@ -34,9 +49,9 @@
                   help="only do Unicode string benchmarks")
 
 
-_RANGE_1000 = range(1000)
-_RANGE_100 = range(100)
-_RANGE_10 = range(10)
+_RANGE_1000 = list(range(1000))
+_RANGE_100 = list(range(100))
+_RANGE_10 = list(range(10))
 
 dups = {}
 def bench(s, group, repeat_count):
@@ -97,8 +112,10 @@
        "late match, 100 characters", 100)
 def in_test_slow_match_100_characters(STR):
     m = STR("ABC"*33)
-    s1 = (m+"D")*300 + m+"E"
-    s2 = m+"E"
+    d = STR("D")
+    e = STR("E")
+    s1 = (m+d)*300 + m+e
+    s2 = m+e
     for x in _RANGE_100:
         s2 in s1
 
@@ -108,8 +125,10 @@
        "late match, 100 characters", 100)
 def re_test_slow_match_100_characters(STR):
     m = STR("ABC"*33)
-    s1 = (m+"D")*300 + m+"E"
-    s2 = m+"E"
+    d = STR("D")
+    e = STR("E")
+    s1 = (m+d)*300 + m+e
+    s2 = m+e
     pat = re.compile(s2)
     search = pat.search
     for x in _RANGE_100:
@@ -179,8 +198,10 @@
        "late match, 100 characters", 100)
 def find_test_slow_match_100_characters(STR):
     m = STR("ABC"*33)
-    s1 = (m+"D")*500 + m+"E"
-    s2 = m+"E"
+    d = STR("D")
+    e = STR("E")
+    s1 = (m+d)*500 + m+e
+    s2 = m+e
     s1_find = s1.find
     for x in _RANGE_100:
         s1_find(s2)
@@ -189,8 +210,10 @@
        "late match, 100 characters", 100)
 def find_test_slow_match_100_characters_bis(STR):
     m = STR("ABC"*33)
-    s1 = (m+"D")*500 + "E"+m
-    s2 = "E"+m
+    d = STR("D")
+    e = STR("E")
+    s1 = (m+d)*500 + e+m
+    s2 = e+m
     s1_find = s1.find
     for x in _RANGE_100:
         s1_find(s2)
@@ -259,8 +282,10 @@
        "late match, 100 characters", 100)
 def rfind_test_slow_match_100_characters(STR):
     m = STR("ABC"*33)
-    s1 = "E"+m + ("D"+m)*500
-    s2 = "E"+m
+    d = STR("D")
+    e = STR("E")
+    s1 = e+m + (d+m)*500
+    s2 = e+m
     s1_rfind = s1.rfind
     for x in _RANGE_100:
         s1_rfind(s2)
@@ -269,8 +294,10 @@
        "late match, 100 characters", 100)
 def rfind_test_slow_match_100_characters_bis(STR):
     m = STR("ABC"*33)
-    s1 = m+"E" + ("D"+m)*500
-    s2 = m+"E"
+    d = STR("D")
+    e = STR("E")
+    s1 = m+e + (d+m)*500
+    s2 = m+e
     s1_rfind = s1.rfind
     for x in _RANGE_100:
         s1_rfind(s2)
@@ -307,8 +334,10 @@
        "late match, 100 characters", 100)
 def index_test_slow_match_100_characters(STR):
     m = STR("ABC"*33)
-    s1 = (m+"D")*500 + m+"E"
-    s2 = m+"E"
+    d = STR("D")
+    e = STR("E")
+    s1 = (m+d)*500 + m+e
+    s2 = m+e
     s1_index = s1.index
     for x in _RANGE_100:
         s1_index(s2)
@@ -344,8 +373,10 @@
        "late match, 100 characters", 100)
 def rindex_test_slow_match_100_characters(STR):
     m = STR("ABC"*33)
-    s1 = "E" + m + ("D"+m)*500
-    s2 = "E" + m
+    d = STR("D")
+    e = STR("E")
+    s1 = e + m + (d+m)*500
+    s2 = e + m
     s1_rindex = s1.rindex
     for x in _RANGE_100:
         s1_rindex(s2)
@@ -398,8 +429,10 @@
        "late match, 100 characters", 100)
 def partition_test_slow_match_100_characters(STR):
     m = STR("ABC"*33)
-    s1 = (m+"D")*500 + m+"E"
-    s2 = m+"E"
+    d = STR("D")
+    e = STR("E")
+    s1 = (m+d)*500 + m+e
+    s2 = m+e
     s1_partition = s1.partition
     for x in _RANGE_100:
         s1_partition(s2)
@@ -452,8 +485,10 @@
        "late match, 100 characters", 100)
 def rpartition_test_slow_match_100_characters(STR):
     m = STR("ABC"*33)
-    s1 = "E" + m + ("D"+m)*500
-    s2 = "E" + m
+    d = STR("D")
+    e = STR("E")
+    s1 = e + m + (d+m)*500
+    s2 = e + m
     s1_rpartition = s1.rpartition
     for x in _RANGE_100:
         s1_rpartition(s2)
@@ -506,8 +541,10 @@
        "late match, 100 characters", 100)
 def split_test_slow_match_100_characters(STR):
     m = STR("ABC"*33)
-    s1 = (m+"D")*500 + m+"E"
-    s2 = m+"E"
+    d = STR("D")
+    e = STR("E")
+    s1 = (m+d)*500 + m+e
+    s2 = m+e
     s1_split = s1.split
     for x in _RANGE_100:
         s1_split(s2, 1)
@@ -560,8 +597,10 @@
        "late match, 100 characters", 100)
 def rsplit_test_slow_match_100_characters(STR):
     m = STR("ABC"*33)
-    s1 = "E" + m + ("D"+m)*500
-    s2 = "E" + m
+    d = STR("D")
+    e = STR("E")
+    s1 = e + m + (d+m)*500
+    s2 = e + m
     s1_rsplit = s1.rsplit
     for x in _RANGE_100:
         s1_rsplit(s2, 1)
@@ -632,11 +671,16 @@
 
 #### Benchmark join
 
+def get_bytes_yielding_seq(STR, arg):
+    if STR is BYTES and sys.version_info >= (3,):
+        raise UnsupportedType
+    return STR(arg)
+
 @bench('"A".join("")',
        "join empty string, with 1 character sep", 100)
 def join_empty_single(STR):
     sep = STR("A")
-    s2 = STR("")
+    s2 = get_bytes_yielding_seq(STR, "")
     sep_join = sep.join
     for x in _RANGE_100:
         sep_join(s2)
@@ -645,7 +689,7 @@
        "join empty string, with 5 character sep", 100)
 def join_empty_5(STR):
     sep = STR("ABCDE")
-    s2 = STR("")
+    s2 = get_bytes_yielding_seq(STR, "")
     sep_join = sep.join
     for x in _RANGE_100:
         sep_join(s2)
@@ -654,7 +698,7 @@
        "join string with 26 characters, with 1 character sep", 1000)
 def join_alphabet_single(STR):
     sep = STR("A")
-    s2 = STR("ABCDEFGHIJKLMnOPQRSTUVWXYZ")
+    s2 = get_bytes_yielding_seq(STR, "ABCDEFGHIJKLMnOPQRSTUVWXYZ")
     sep_join = sep.join
     for x in _RANGE_1000:
         sep_join(s2)
@@ -663,7 +707,7 @@
        "join string with 26 characters, with 5 character sep", 1000)
 def join_alphabet_5(STR):
     sep = STR("ABCDE")
-    s2 = STR("ABCDEFGHIJKLMnOPQRSTUVWXYZ")
+    s2 = get_bytes_yielding_seq(STR, "ABCDEFGHIJKLMnOPQRSTUVWXYZ")
     sep_join = sep.join
     for x in _RANGE_1000:
         sep_join(s2)
@@ -672,7 +716,7 @@
        "join list of 26 characters, with 1 character sep", 1000)
 def join_alphabet_list_single(STR):
     sep = STR("A")
-    s2 = list(STR("ABCDEFGHIJKLMnOPQRSTUVWXYZ"))
+    s2 = [STR(x) for x in "ABCDEFGHIJKLMnOPQRSTUVWXYZ"]
     sep_join = sep.join
     for x in _RANGE_1000:
         sep_join(s2)
@@ -681,7 +725,7 @@
        "join list of 26 characters, with 5 character sep", 1000)
 def join_alphabet_list_five(STR):
     sep = STR("ABCDE")
-    s2 = list(STR("ABCDEFGHIJKLMnOPQRSTUVWXYZ"))
+    s2 = [STR(x) for x in "ABCDEFGHIJKLMnOPQRSTUVWXYZ"]
     sep_join = sep.join
     for x in _RANGE_1000:
         sep_join(s2)
@@ -771,12 +815,13 @@
 Python is distributed under an OSI-approved open source license that
 makes it free to use, even for commercial products.
 """*25
-human_text_unicode = unicode(human_text)
+human_text_bytes = bytes_from_str(human_text)
+human_text_unicode = unicode_from_str(human_text)
 def _get_human_text(STR):
-    if STR is unicode:
+    if STR is UNICODE:
         return human_text_unicode
-    if STR is str:
-        return human_text
+    if STR is BYTES:
+        return human_text_bytes
     raise AssertionError
 
 @bench('human_text.split()', "split whitespace (huge)", 10)
@@ -799,16 +844,18 @@
 def newlines_split(STR):
     s = STR("this\nis\na\ntest\n")
     s_split = s.split
+    nl = STR("\n")
     for x in _RANGE_1000:
-        s_split("\n")
+        s_split(nl)
 
 
 @bench('"this\\nis\\na\\ntest\\n".rsplit("\\n")', "split newlines", 1000)
 def newlines_rsplit(STR):
     s = STR("this\nis\na\ntest\n")
     s_rsplit = s.rsplit
+    nl = STR("\n")
     for x in _RANGE_1000:
-        s_rsplit("\n")
+        s_rsplit(nl)
 
 @bench('"this\\nis\\na\\ntest\\n".splitlines()', "split newlines", 1000)
 def newlines_splitlines(STR):
@@ -822,7 +869,7 @@
 def _make_2000_lines():
     import random
     r = random.Random(100)
-    chars = map(chr, range(32, 128))
+    chars = list(map(chr, range(32, 128)))
     i = 0
     while i < len(chars):
         chars[i] = " "
@@ -837,12 +884,13 @@
     return "\n".join(words)+"\n"
 
 _text_with_2000_lines = _make_2000_lines()
-_text_with_2000_lines_unicode = unicode(_text_with_2000_lines)
+_text_with_2000_lines_bytes = bytes_from_str(_text_with_2000_lines)
+_text_with_2000_lines_unicode = unicode_from_str(_text_with_2000_lines)
 def _get_2000_lines(STR):
-    if STR is unicode:
+    if STR is UNICODE:
         return _text_with_2000_lines_unicode
-    if STR is str:
-        return _text_with_2000_lines
+    if STR is BYTES:
+        return _text_with_2000_lines_bytes
     raise AssertionError
 
 
@@ -850,15 +898,17 @@
 def newlines_split_2000(STR):
     s = _get_2000_lines(STR)
     s_split = s.split
+    nl = STR("\n")
     for x in _RANGE_10:
-        s_split("\n")
+        s_split(nl)
 
 @bench('"...text...".rsplit("\\n")', "split 2000 newlines", 10)
 def newlines_rsplit_2000(STR):
     s = _get_2000_lines(STR)
     s_rsplit = s.rsplit
+    nl = STR("\n")
     for x in _RANGE_10:
-        s_rsplit("\n")
+        s_rsplit(nl)
 
 @bench('"...text...".splitlines()', "split 2000 newlines", 10)
 def newlines_splitlines_2000(STR):
@@ -875,16 +925,18 @@
 def split_multichar_sep_small(STR):
     s = STR("this--is--a--test--of--the--emergency--broadcast--system")
     s_split = s.split
+    pat = STR("--")
     for x in _RANGE_1000:
-        s_split("--")
+        s_split(pat)
 @bench(
     '"this--is--a--test--of--the--emergency--broadcast--system".rsplit("--")',
     "split on multicharacter separator (small)", 1000)
 def rsplit_multichar_sep_small(STR):
     s = STR("this--is--a--test--of--the--emergency--broadcast--system")
     s_rsplit = s.rsplit
+    pat = STR("--")
     for x in _RANGE_1000:
-        s_rsplit("--")
+        s_rsplit(pat)
 
 ## split dna text on "ACTAT" characters
 @bench('dna.split("ACTAT")',
@@ -892,16 +944,18 @@
 def split_multichar_sep_dna(STR):
     s = _get_dna(STR)
     s_split = s.split
+    pat = STR("ACTAT")
     for x in _RANGE_10:
-        s_split("ACTAT")
+        s_split(pat)
 
 @bench('dna.rsplit("ACTAT")',
        "split on multicharacter separator (dna)", 10)
 def rsplit_multichar_sep_dna(STR):
     s = _get_dna(STR)
     s_rsplit = s.rsplit
+    pat = STR("ACTAT")
     for x in _RANGE_10:
-        s_rsplit("ACTAT")
+        s_rsplit(pat)
 
 
 
@@ -950,8 +1004,9 @@
 def count_newlines(STR):
     s = _get_2000_lines(STR)
     s_count = s.count
+    nl = STR("\n")
     for x in _RANGE_10:
-        s_count("\n")
+        s_count(nl)
 
 # Orchid sequences concatenated, from Biopython
 _dna = """
@@ -1008,21 +1063,23 @@
 """
 _dna = "".join(_dna.splitlines())
 _dna = _dna * 25
-_dna_unicode = unicode(_dna)
+_dna_bytes = bytes_from_str(_dna)
+_dna_unicode = unicode_from_str(_dna)
 
 def _get_dna(STR):
-    if STR is unicode:
+    if STR is UNICODE:
         return _dna_unicode
-    if STR is str:
-        return _dna
+    if STR is BYTES:
+        return _dna_bytes
     raise AssertionError
 
 @bench('dna.count("AACT")', "count AACT substrings in DNA example", 10)
 def count_aact(STR):
     seq = _get_dna(STR)
     seq_count = seq.count
+    needle = STR("AACT")
     for x in _RANGE_10:
-        seq_count("AACT")
+        seq_count(needle)
 
 ##### startswith and endswith
 
@@ -1229,10 +1286,11 @@
 
 
 big_s = "A" + ("Z"*128*1024)
-big_s_unicode = unicode(big_s)
+big_s_bytes = bytes_from_str(big_s)
+big_s_unicode = unicode_from_str(big_s)
 def _get_big_s(STR):
-    if STR is unicode: return big_s_unicode
-    if STR is str: return big_s
+    if STR is UNICODE: return big_s_unicode
+    if STR is BYTES: return big_s_bytes
     raise AssertionError
 
 # The older replace implementation counted all matches in
@@ -1263,21 +1321,27 @@
 # CCP does a lot of this, for internationalisation of ingame messages.
 _format = "The %(thing)s is %(place)s the %(location)s."
 _format_dict = { "thing":"THING", "place":"PLACE", "location":"LOCATION", }
-_format_unicode = unicode(_format)
-_format_dict_unicode = dict([ (unicode(k), unicode(v)) for (k,v) in _format_dict.iteritems() ])
+_format_bytes = bytes_from_str(_format)
+_format_unicode = unicode_from_str(_format)
+_format_dict_bytes = dict((bytes_from_str(k), bytes_from_str(v)) for (k,v) in _format_dict.items())
+_format_dict_unicode = dict((unicode_from_str(k), unicode_from_str(v)) for (k,v) in _format_dict.items())
 
 def _get_format(STR):
-    if STR is unicode:
+    if STR is UNICODE:
         return _format_unicode
-    if STR is str:
-        return _format
+    if STR is BYTES:
+        if sys.version_info >= (3,):
+            raise UnsupportedType
+        return _format_bytes
     raise AssertionError
 
 def _get_format_dict(STR):
-    if STR is unicode:
+    if STR is UNICODE:
         return _format_dict_unicode
-    if STR is str:
-        return _format_dict
+    if STR is BYTES:
+        if sys.version_info >= (3,):
+            raise UnsupportedType
+        return _format_dict_bytes
     raise AssertionError
 
 # Formatting.
@@ -1334,12 +1398,8 @@
     def best(self, repeat=1):
         for i in range(1, 10):
             number = 10**i
-            try:
-                x = self.timeit(number)
-            except:
-                self.print_exc()
-                raise
-            if x > 0.2:
+            x = self.timeit(number)
+            if x > 0.02:
                 break
         times = [x]
         for i in range(1, repeat):
@@ -1348,7 +1408,7 @@
 
 def main():
     (options, test_names) = parser.parse_args()
-    if options.str_only and options.unicode_only:
+    if options.bytes_only and options.unicode_only:
         raise SystemExit("Only one of --8-bit and --unicode are allowed")
 
     bench_functions = []
@@ -1367,49 +1427,56 @@
             bench_functions.append( (v.group, k, v) )
     bench_functions.sort()
 
-    print "string\tunicode"
-    print "(in ms)\t(in ms)\t%\tcomment"
+    p("bytes\tunicode")
+    p("(in ms)\t(in ms)\t%\tcomment")
 
-    str_total = uni_total = 0.0
+    bytes_total = uni_total = 0.0
 
     for title, group in itertools.groupby(bench_functions,
                                       operator.itemgetter(0)):
         # Flush buffer before each group
         sys.stdout.flush()
-        print "="*10, title
+        p("="*10, title)
         for (_, k, v) in group:
             if hasattr(v, "is_bench"):
+                bytes_time = 0.0
+                bytes_time_s = " - "
                 if not options.unicode_only:
-                    str_time = BenchTimer("__main__.%s(str)" % (k,),
-                                          "import __main__").best(REPEAT)
-                else:
-                    str_time = 0.0
-                if not options.str_only:
-                    uni_time = BenchTimer("__main__.%s(unicode)" % (k,),
-                                          "import __main__").best(REPEAT)
-                else:
-                    uni_time = 0.0
+                    try:
+                        bytes_time = BenchTimer("__main__.%s(__main__.BYTES)" % (k,),
+                                                "import __main__").best(REPEAT)
+                        bytes_time_s = "%.2f" % (1000 * bytes_time)
+                        bytes_total += bytes_time
+                    except UnsupportedType:
+                        bytes_time_s = "N/A"
+                uni_time = 0.0
+                uni_time_s = " - "
+                if not options.bytes_only:
+                    try:
+                        uni_time = BenchTimer("__main__.%s(__main__.UNICODE)" % (k,),
+                                              "import __main__").best(REPEAT)
+                        uni_time_s = "%.2f" % (1000 * uni_time)
+                        uni_total += uni_time
+                    except UnsupportedType:
+                        uni_time_s = "N/A"
                 try:
-                    average = str_time/uni_time
-                except ZeroDivisionError:
+                    average = bytes_time/uni_time
+                except (TypeError, ZeroDivisionError):
                     average = 0.0
-                print "%.2f\t%.2f\t%.1f\t%s (*%d)" % (
-                    1000*str_time, 1000*uni_time, 100.*average,
-                    v.comment, v.repeat_count)
-
-                str_total += str_time
-                uni_total += uni_time
+                p("%s\t%s\t%.1f\t%s (*%d)" % (
+                    bytes_time_s, uni_time_s, 100.*average,
+                    v.comment, v.repeat_count))
 
-    if str_total == uni_total == 0.0:
-        print "That was zippy!"
+    if bytes_total == uni_total == 0.0:
+        p("That was zippy!")
     else:
         try:
-            ratio = str_total/uni_total
+            ratio = bytes_total/uni_total
         except ZeroDivisionError:
             ratio = 0.0
-        print "%.2f\t%.2f\t%.1f\t%s" % (
-            1000*str_total, 1000*uni_total, 100.*ratio,
-            "TOTAL")
+        p("%.2f\t%.2f\t%.1f\t%s" % (
+            1000*bytes_total, 1000*uni_total, 100.*ratio,
+            "TOTAL"))
 
 if __name__ == "__main__":
     main()