Binary search tree
Scott SA
pydev at rscorp.ab.ca
Mon Nov 12 14:21:36 EST 2007
On 11/12/07, Scott SA (pydev at rscorp.ab.ca) wrote:
Uhm sorry, there is a slightly cleaner way of running the second option I presented (sorry for the second post).
>If you would find an index and count useful, you could do something like this:
>
> for idx in range(len(urls)):
> unique_urls.setdefault(urls[idx],[]).append(idx)
for idx,url in enumerate(urls):
unique_urls.setdefault(url,[]).append(idx)
I decided to test the speeds of the four methods:
set_example
s = set()
for url in urls:
if not url in s:
s.add(url)
dict_example
d = {}
for url in urls:
if url in d:
d[url] = 1
setdefault_idx_example
unique_urls = {}
for idx in range(len(urls)):
unique_urls.setdefault(urls[idx],[]).append(idx)
setdefault_enum_example
unique_urls = {}
for idx,url in enumerate(urls):
unique_urls.setdefault(url,[]).append(idx)
Here are the results:
Starting tests with 500000 simulated URLs run 3 times.
------------------------------------------------------------
'set' example
run time: 0.5505 seconds
'dict' example
run time: 0.2521 seconds
'setdefault_idx' example
run time: 3.6476 seconds
'setdefault_enum' example
run time: 2.5606 seconds
------------------------------------------------------------
For the simple quest of "Find the unique URLs in a list", the 'dict' example is by far the quickest.
Buuut, if you need the indexes and possibly a 'hit count', then the enumerated example seems about the best (of the possibilities tested)
Scott
PS. For those interesed in my test suite, I submit the following:
import time
# ________ convenience methods ________
def build_list(seed_list,base_count):
"""
Make a bogus list of urls with duplicates
"""
return_list = []
for x in range(base_count):
temp_list = []
for i in range(1000):
for url in seed_list: # add a unique set
temp_list.append(('www.%d_%s' % (i,url)))
return_list += temp_list
return return_list
def run_testscript(example_name, urls):
"""
run the given script name
"""
start_time = time.time()
exec('%s_example(urls)' % example_name)
run_time = time.time() - start_time
return run_time
def run_tests(example_list, urls, iterations):
print "Starting tests with %d simulated URLs run %d times.\n%s" % \
(len(urls), iterations, 60 * '-')
for example_name in example_list:
time_value = 0.0
for i in range(iterations):
time_value += run_testscript(example_name, urls)
print '\'%s\' example\n%srun time: %0.4f seconds' % \
(example_name, 20 * ' ', time_value/iterations)
print 60*'-'
# ________ the various tests/examples ________
def setdefault_idx_example(urls):
unique_urls = {}
for idx in range(len(urls)):
unique_urls.setdefault(urls[idx],[]).append(idx)
def setdefault_enum_example(urls):
unique_urls = {}
for idx,url in enumerate(urls):
unique_urls.setdefault(url,[]).append(idx)
def set_example(urls):
s = set()
for url in urls:
if not url in s:
s.add(url)
def dict_example(urls):
d = {}
for url in urls:
if url in d:
d[url] = 1
# ________ run 'em all ________
url_seedlist = ['somewhere.com','nowhere.net','elsewhere.gov','here.ca','there.org']
dummy_urls = build_list(url_seedlist,100) # number of duplicates
testscript_list = ['set','dict', 'setdefault_idx','setdefault_enum']
run_tests(testscript_list, dummy_urls, 3)
More information about the Python-list
mailing list