[Python-checkins] r79423 - sandbox/trunk/stringbench/stringbench.py
antoine.pitrou
python-checkins at python.org
Thu Mar 25 19:05:25 CET 2010
Author: antoine.pitrou
Date: Thu Mar 25 19:05:24 2010
New Revision: 79423
Log:
Make stringbench compatible with both Python 2.x and 3.x.
Also, make the total running time much smaller by lowering the timing calibration threshold, so that fewer iterations are run.
Modified:
sandbox/trunk/stringbench/stringbench.py
Modified: sandbox/trunk/stringbench/stringbench.py
==============================================================================
--- sandbox/trunk/stringbench/stringbench.py (original)
+++ sandbox/trunk/stringbench/stringbench.py Thu Mar 25 19:05:24 2010
@@ -1,5 +1,6 @@
# Various microbenchmarks comparing unicode and byte string performance
+# Please keep this file both 2.x and 3.x compatible!
import timeit
import itertools
@@ -11,9 +12,23 @@
VERSION = '2.0'
-print 'stringbench v%s' % VERSION
-print sys.version
-print datetime.datetime.now()
+def p(*args):
+ sys.stdout.write(' '.join(str(s) for s in args) + '\n')
+
+if sys.version_info >= (3,):
+ BYTES = bytes_from_str = lambda x: x.encode('ascii')
+ UNICODE = unicode_from_str = lambda x: x
+else:
+ BYTES = bytes_from_str = lambda x: x
+ UNICODE = unicode_from_str = lambda x: x.decode('ascii')
+
+class UnsupportedType(TypeError):
+ pass
+
+
+p('stringbench v%s' % VERSION)
+p(sys.version)
+p(datetime.datetime.now())
REPEAT = 1
REPEAT = 3
@@ -26,7 +41,7 @@
parser.add_option("-R", "--skip-re", dest="skip_re",
action="store_true",
help="skip regular expression tests")
-parser.add_option("-8", "--8-bit", dest="str_only",
+parser.add_option("-8", "--8-bit", dest="bytes_only",
action="store_true",
help="only do 8-bit string benchmarks")
parser.add_option("-u", "--unicode", dest="unicode_only",
@@ -34,9 +49,9 @@
help="only do Unicode string benchmarks")
-_RANGE_1000 = range(1000)
-_RANGE_100 = range(100)
-_RANGE_10 = range(10)
+_RANGE_1000 = list(range(1000))
+_RANGE_100 = list(range(100))
+_RANGE_10 = list(range(10))
dups = {}
def bench(s, group, repeat_count):
@@ -97,8 +112,10 @@
"late match, 100 characters", 100)
def in_test_slow_match_100_characters(STR):
m = STR("ABC"*33)
- s1 = (m+"D")*300 + m+"E"
- s2 = m+"E"
+ d = STR("D")
+ e = STR("E")
+ s1 = (m+d)*300 + m+e
+ s2 = m+e
for x in _RANGE_100:
s2 in s1
@@ -108,8 +125,10 @@
"late match, 100 characters", 100)
def re_test_slow_match_100_characters(STR):
m = STR("ABC"*33)
- s1 = (m+"D")*300 + m+"E"
- s2 = m+"E"
+ d = STR("D")
+ e = STR("E")
+ s1 = (m+d)*300 + m+e
+ s2 = m+e
pat = re.compile(s2)
search = pat.search
for x in _RANGE_100:
@@ -179,8 +198,10 @@
"late match, 100 characters", 100)
def find_test_slow_match_100_characters(STR):
m = STR("ABC"*33)
- s1 = (m+"D")*500 + m+"E"
- s2 = m+"E"
+ d = STR("D")
+ e = STR("E")
+ s1 = (m+d)*500 + m+e
+ s2 = m+e
s1_find = s1.find
for x in _RANGE_100:
s1_find(s2)
@@ -189,8 +210,10 @@
"late match, 100 characters", 100)
def find_test_slow_match_100_characters_bis(STR):
m = STR("ABC"*33)
- s1 = (m+"D")*500 + "E"+m
- s2 = "E"+m
+ d = STR("D")
+ e = STR("E")
+ s1 = (m+d)*500 + e+m
+ s2 = e+m
s1_find = s1.find
for x in _RANGE_100:
s1_find(s2)
@@ -259,8 +282,10 @@
"late match, 100 characters", 100)
def rfind_test_slow_match_100_characters(STR):
m = STR("ABC"*33)
- s1 = "E"+m + ("D"+m)*500
- s2 = "E"+m
+ d = STR("D")
+ e = STR("E")
+ s1 = e+m + (d+m)*500
+ s2 = e+m
s1_rfind = s1.rfind
for x in _RANGE_100:
s1_rfind(s2)
@@ -269,8 +294,10 @@
"late match, 100 characters", 100)
def rfind_test_slow_match_100_characters_bis(STR):
m = STR("ABC"*33)
- s1 = m+"E" + ("D"+m)*500
- s2 = m+"E"
+ d = STR("D")
+ e = STR("E")
+ s1 = m+e + (d+m)*500
+ s2 = m+e
s1_rfind = s1.rfind
for x in _RANGE_100:
s1_rfind(s2)
@@ -307,8 +334,10 @@
"late match, 100 characters", 100)
def index_test_slow_match_100_characters(STR):
m = STR("ABC"*33)
- s1 = (m+"D")*500 + m+"E"
- s2 = m+"E"
+ d = STR("D")
+ e = STR("E")
+ s1 = (m+d)*500 + m+e
+ s2 = m+e
s1_index = s1.index
for x in _RANGE_100:
s1_index(s2)
@@ -344,8 +373,10 @@
"late match, 100 characters", 100)
def rindex_test_slow_match_100_characters(STR):
m = STR("ABC"*33)
- s1 = "E" + m + ("D"+m)*500
- s2 = "E" + m
+ d = STR("D")
+ e = STR("E")
+ s1 = e + m + (d+m)*500
+ s2 = e + m
s1_rindex = s1.rindex
for x in _RANGE_100:
s1_rindex(s2)
@@ -398,8 +429,10 @@
"late match, 100 characters", 100)
def partition_test_slow_match_100_characters(STR):
m = STR("ABC"*33)
- s1 = (m+"D")*500 + m+"E"
- s2 = m+"E"
+ d = STR("D")
+ e = STR("E")
+ s1 = (m+d)*500 + m+e
+ s2 = m+e
s1_partition = s1.partition
for x in _RANGE_100:
s1_partition(s2)
@@ -452,8 +485,10 @@
"late match, 100 characters", 100)
def rpartition_test_slow_match_100_characters(STR):
m = STR("ABC"*33)
- s1 = "E" + m + ("D"+m)*500
- s2 = "E" + m
+ d = STR("D")
+ e = STR("E")
+ s1 = e + m + (d+m)*500
+ s2 = e + m
s1_rpartition = s1.rpartition
for x in _RANGE_100:
s1_rpartition(s2)
@@ -506,8 +541,10 @@
"late match, 100 characters", 100)
def split_test_slow_match_100_characters(STR):
m = STR("ABC"*33)
- s1 = (m+"D")*500 + m+"E"
- s2 = m+"E"
+ d = STR("D")
+ e = STR("E")
+ s1 = (m+d)*500 + m+e
+ s2 = m+e
s1_split = s1.split
for x in _RANGE_100:
s1_split(s2, 1)
@@ -560,8 +597,10 @@
"late match, 100 characters", 100)
def rsplit_test_slow_match_100_characters(STR):
m = STR("ABC"*33)
- s1 = "E" + m + ("D"+m)*500
- s2 = "E" + m
+ d = STR("D")
+ e = STR("E")
+ s1 = e + m + (d+m)*500
+ s2 = e + m
s1_rsplit = s1.rsplit
for x in _RANGE_100:
s1_rsplit(s2, 1)
@@ -632,11 +671,16 @@
#### Benchmark join
+def get_bytes_yielding_seq(STR, arg):
+ if STR is BYTES and sys.version_info >= (3,):
+ raise UnsupportedType
+ return STR(arg)
+
@bench('"A".join("")',
"join empty string, with 1 character sep", 100)
def join_empty_single(STR):
sep = STR("A")
- s2 = STR("")
+ s2 = get_bytes_yielding_seq(STR, "")
sep_join = sep.join
for x in _RANGE_100:
sep_join(s2)
@@ -645,7 +689,7 @@
"join empty string, with 5 character sep", 100)
def join_empty_5(STR):
sep = STR("ABCDE")
- s2 = STR("")
+ s2 = get_bytes_yielding_seq(STR, "")
sep_join = sep.join
for x in _RANGE_100:
sep_join(s2)
@@ -654,7 +698,7 @@
"join string with 26 characters, with 1 character sep", 1000)
def join_alphabet_single(STR):
sep = STR("A")
- s2 = STR("ABCDEFGHIJKLMnOPQRSTUVWXYZ")
+ s2 = get_bytes_yielding_seq(STR, "ABCDEFGHIJKLMnOPQRSTUVWXYZ")
sep_join = sep.join
for x in _RANGE_1000:
sep_join(s2)
@@ -663,7 +707,7 @@
"join string with 26 characters, with 5 character sep", 1000)
def join_alphabet_5(STR):
sep = STR("ABCDE")
- s2 = STR("ABCDEFGHIJKLMnOPQRSTUVWXYZ")
+ s2 = get_bytes_yielding_seq(STR, "ABCDEFGHIJKLMnOPQRSTUVWXYZ")
sep_join = sep.join
for x in _RANGE_1000:
sep_join(s2)
@@ -672,7 +716,7 @@
"join list of 26 characters, with 1 character sep", 1000)
def join_alphabet_list_single(STR):
sep = STR("A")
- s2 = list(STR("ABCDEFGHIJKLMnOPQRSTUVWXYZ"))
+ s2 = [STR(x) for x in "ABCDEFGHIJKLMnOPQRSTUVWXYZ"]
sep_join = sep.join
for x in _RANGE_1000:
sep_join(s2)
@@ -681,7 +725,7 @@
"join list of 26 characters, with 5 character sep", 1000)
def join_alphabet_list_five(STR):
sep = STR("ABCDE")
- s2 = list(STR("ABCDEFGHIJKLMnOPQRSTUVWXYZ"))
+ s2 = [STR(x) for x in "ABCDEFGHIJKLMnOPQRSTUVWXYZ"]
sep_join = sep.join
for x in _RANGE_1000:
sep_join(s2)
@@ -771,12 +815,13 @@
Python is distributed under an OSI-approved open source license that
makes it free to use, even for commercial products.
"""*25
-human_text_unicode = unicode(human_text)
+human_text_bytes = bytes_from_str(human_text)
+human_text_unicode = unicode_from_str(human_text)
def _get_human_text(STR):
- if STR is unicode:
+ if STR is UNICODE:
return human_text_unicode
- if STR is str:
- return human_text
+ if STR is BYTES:
+ return human_text_bytes
raise AssertionError
@bench('human_text.split()', "split whitespace (huge)", 10)
@@ -799,16 +844,18 @@
def newlines_split(STR):
s = STR("this\nis\na\ntest\n")
s_split = s.split
+ nl = STR("\n")
for x in _RANGE_1000:
- s_split("\n")
+ s_split(nl)
@bench('"this\\nis\\na\\ntest\\n".rsplit("\\n")', "split newlines", 1000)
def newlines_rsplit(STR):
s = STR("this\nis\na\ntest\n")
s_rsplit = s.rsplit
+ nl = STR("\n")
for x in _RANGE_1000:
- s_rsplit("\n")
+ s_rsplit(nl)
@bench('"this\\nis\\na\\ntest\\n".splitlines()', "split newlines", 1000)
def newlines_splitlines(STR):
@@ -822,7 +869,7 @@
def _make_2000_lines():
import random
r = random.Random(100)
- chars = map(chr, range(32, 128))
+ chars = list(map(chr, range(32, 128)))
i = 0
while i < len(chars):
chars[i] = " "
@@ -837,12 +884,13 @@
return "\n".join(words)+"\n"
_text_with_2000_lines = _make_2000_lines()
-_text_with_2000_lines_unicode = unicode(_text_with_2000_lines)
+_text_with_2000_lines_bytes = bytes_from_str(_text_with_2000_lines)
+_text_with_2000_lines_unicode = unicode_from_str(_text_with_2000_lines)
def _get_2000_lines(STR):
- if STR is unicode:
+ if STR is UNICODE:
return _text_with_2000_lines_unicode
- if STR is str:
- return _text_with_2000_lines
+ if STR is BYTES:
+ return _text_with_2000_lines_bytes
raise AssertionError
@@ -850,15 +898,17 @@
def newlines_split_2000(STR):
s = _get_2000_lines(STR)
s_split = s.split
+ nl = STR("\n")
for x in _RANGE_10:
- s_split("\n")
+ s_split(nl)
@bench('"...text...".rsplit("\\n")', "split 2000 newlines", 10)
def newlines_rsplit_2000(STR):
s = _get_2000_lines(STR)
s_rsplit = s.rsplit
+ nl = STR("\n")
for x in _RANGE_10:
- s_rsplit("\n")
+ s_rsplit(nl)
@bench('"...text...".splitlines()', "split 2000 newlines", 10)
def newlines_splitlines_2000(STR):
@@ -875,16 +925,18 @@
def split_multichar_sep_small(STR):
s = STR("this--is--a--test--of--the--emergency--broadcast--system")
s_split = s.split
+ pat = STR("--")
for x in _RANGE_1000:
- s_split("--")
+ s_split(pat)
@bench(
'"this--is--a--test--of--the--emergency--broadcast--system".rsplit("--")',
"split on multicharacter separator (small)", 1000)
def rsplit_multichar_sep_small(STR):
s = STR("this--is--a--test--of--the--emergency--broadcast--system")
s_rsplit = s.rsplit
+ pat = STR("--")
for x in _RANGE_1000:
- s_rsplit("--")
+ s_rsplit(pat)
## split dna text on "ACTAT" characters
@bench('dna.split("ACTAT")',
@@ -892,16 +944,18 @@
def split_multichar_sep_dna(STR):
s = _get_dna(STR)
s_split = s.split
+ pat = STR("ACTAT")
for x in _RANGE_10:
- s_split("ACTAT")
+ s_split(pat)
@bench('dna.rsplit("ACTAT")',
"split on multicharacter separator (dna)", 10)
def rsplit_multichar_sep_dna(STR):
s = _get_dna(STR)
s_rsplit = s.rsplit
+ pat = STR("ACTAT")
for x in _RANGE_10:
- s_rsplit("ACTAT")
+ s_rsplit(pat)
@@ -950,8 +1004,9 @@
def count_newlines(STR):
s = _get_2000_lines(STR)
s_count = s.count
+ nl = STR("\n")
for x in _RANGE_10:
- s_count("\n")
+ s_count(nl)
# Orchid sequences concatenated, from Biopython
_dna = """
@@ -1008,21 +1063,23 @@
"""
_dna = "".join(_dna.splitlines())
_dna = _dna * 25
-_dna_unicode = unicode(_dna)
+_dna_bytes = bytes_from_str(_dna)
+_dna_unicode = unicode_from_str(_dna)
def _get_dna(STR):
- if STR is unicode:
+ if STR is UNICODE:
return _dna_unicode
- if STR is str:
- return _dna
+ if STR is BYTES:
+ return _dna_bytes
raise AssertionError
@bench('dna.count("AACT")', "count AACT substrings in DNA example", 10)
def count_aact(STR):
seq = _get_dna(STR)
seq_count = seq.count
+ needle = STR("AACT")
for x in _RANGE_10:
- seq_count("AACT")
+ seq_count(needle)
##### startswith and endswith
@@ -1229,10 +1286,11 @@
big_s = "A" + ("Z"*128*1024)
-big_s_unicode = unicode(big_s)
+big_s_bytes = bytes_from_str(big_s)
+big_s_unicode = unicode_from_str(big_s)
def _get_big_s(STR):
- if STR is unicode: return big_s_unicode
- if STR is str: return big_s
+ if STR is UNICODE: return big_s_unicode
+ if STR is BYTES: return big_s_bytes
raise AssertionError
# The older replace implementation counted all matches in
@@ -1263,21 +1321,27 @@
# CCP does a lot of this, for internationalisation of ingame messages.
_format = "The %(thing)s is %(place)s the %(location)s."
_format_dict = { "thing":"THING", "place":"PLACE", "location":"LOCATION", }
-_format_unicode = unicode(_format)
-_format_dict_unicode = dict([ (unicode(k), unicode(v)) for (k,v) in _format_dict.iteritems() ])
+_format_bytes = bytes_from_str(_format)
+_format_unicode = unicode_from_str(_format)
+_format_dict_bytes = dict((bytes_from_str(k), bytes_from_str(v)) for (k,v) in _format_dict.items())
+_format_dict_unicode = dict((unicode_from_str(k), unicode_from_str(v)) for (k,v) in _format_dict.items())
def _get_format(STR):
- if STR is unicode:
+ if STR is UNICODE:
return _format_unicode
- if STR is str:
- return _format
+ if STR is BYTES:
+ if sys.version_info >= (3,):
+ raise UnsupportedType
+ return _format_bytes
raise AssertionError
def _get_format_dict(STR):
- if STR is unicode:
+ if STR is UNICODE:
return _format_dict_unicode
- if STR is str:
- return _format_dict
+ if STR is BYTES:
+ if sys.version_info >= (3,):
+ raise UnsupportedType
+ return _format_dict_bytes
raise AssertionError
# Formatting.
@@ -1334,12 +1398,8 @@
def best(self, repeat=1):
for i in range(1, 10):
number = 10**i
- try:
- x = self.timeit(number)
- except:
- self.print_exc()
- raise
- if x > 0.2:
+ x = self.timeit(number)
+ if x > 0.02:
break
times = [x]
for i in range(1, repeat):
@@ -1348,7 +1408,7 @@
def main():
(options, test_names) = parser.parse_args()
- if options.str_only and options.unicode_only:
+ if options.bytes_only and options.unicode_only:
raise SystemExit("Only one of --8-bit and --unicode are allowed")
bench_functions = []
@@ -1367,49 +1427,56 @@
bench_functions.append( (v.group, k, v) )
bench_functions.sort()
- print "string\tunicode"
- print "(in ms)\t(in ms)\t%\tcomment"
+ p("bytes\tunicode")
+ p("(in ms)\t(in ms)\t%\tcomment")
- str_total = uni_total = 0.0
+ bytes_total = uni_total = 0.0
for title, group in itertools.groupby(bench_functions,
operator.itemgetter(0)):
# Flush buffer before each group
sys.stdout.flush()
- print "="*10, title
+ p("="*10, title)
for (_, k, v) in group:
if hasattr(v, "is_bench"):
+ bytes_time = 0.0
+ bytes_time_s = " - "
if not options.unicode_only:
- str_time = BenchTimer("__main__.%s(str)" % (k,),
- "import __main__").best(REPEAT)
- else:
- str_time = 0.0
- if not options.str_only:
- uni_time = BenchTimer("__main__.%s(unicode)" % (k,),
- "import __main__").best(REPEAT)
- else:
- uni_time = 0.0
+ try:
+ bytes_time = BenchTimer("__main__.%s(__main__.BYTES)" % (k,),
+ "import __main__").best(REPEAT)
+ bytes_time_s = "%.2f" % (1000 * bytes_time)
+ bytes_total += bytes_time
+ except UnsupportedType:
+ bytes_time_s = "N/A"
+ uni_time = 0.0
+ uni_time_s = " - "
+ if not options.bytes_only:
+ try:
+ uni_time = BenchTimer("__main__.%s(__main__.UNICODE)" % (k,),
+ "import __main__").best(REPEAT)
+ uni_time_s = "%.2f" % (1000 * uni_time)
+ uni_total += uni_time
+ except UnsupportedType:
+ uni_time_s = "N/A"
try:
- average = str_time/uni_time
- except ZeroDivisionError:
+ average = bytes_time/uni_time
+ except (TypeError, ZeroDivisionError):
average = 0.0
- print "%.2f\t%.2f\t%.1f\t%s (*%d)" % (
- 1000*str_time, 1000*uni_time, 100.*average,
- v.comment, v.repeat_count)
-
- str_total += str_time
- uni_total += uni_time
+ p("%s\t%s\t%.1f\t%s (*%d)" % (
+ bytes_time_s, uni_time_s, 100.*average,
+ v.comment, v.repeat_count))
- if str_total == uni_total == 0.0:
- print "That was zippy!"
+ if bytes_total == uni_total == 0.0:
+ p("That was zippy!")
else:
try:
- ratio = str_total/uni_total
+ ratio = bytes_total/uni_total
except ZeroDivisionError:
ratio = 0.0
- print "%.2f\t%.2f\t%.1f\t%s" % (
- 1000*str_total, 1000*uni_total, 100.*ratio,
- "TOTAL")
+ p("%.2f\t%.2f\t%.1f\t%s" % (
+ 1000*bytes_total, 1000*uni_total, 100.*ratio,
+ "TOTAL"))
if __name__ == "__main__":
main()
More information about the Python-checkins
mailing list