Binary search tree

Mon Nov 12 14:21:36 EST 2007

On 11/12/07, Scott SA (pydev at rscorp.ab.ca) wrote:

Uhm sorry, there is a slightly cleaner way of running the second option I presented (sorry for the second post).

>If you would find an index and count useful, you could do something like this:
>    
>    for idx in range(len(urls)):
>        unique_urls.setdefault(urls[idx],[]).append(idx)

   for idx,url in enumerate(urls):
       unique_urls.setdefault(url,[]).append(idx)

I decided to test the speeds of the four methods:

    set_example
        s = set()
        for url in urls:
            if not url in s:
                s.add(url)

    dict_example
        d = {}
        for url in urls:
            if url in d:
                d[url] = 1

    setdefault_idx_example
        unique_urls   = {}
        for idx in range(len(urls)):
            unique_urls.setdefault(urls[idx],[]).append(idx)

    setdefault_enum_example
        unique_urls   = {}
        for idx,url in enumerate(urls):
           unique_urls.setdefault(url,[]).append(idx)

Here are the results:

    Starting tests with 500000 simulated URLs run 3 times.
    ------------------------------------------------------------
    'set' example
                        run time: 0.5505 seconds
    'dict' example
                        run time: 0.2521 seconds
    'setdefault_idx' example
                        run time: 3.6476 seconds
    'setdefault_enum' example
                        run time: 2.5606 seconds
    ------------------------------------------------------------

For the simple quest of "Find the unique URLs in a list", the 'dict' example is by far the quickest.

Buuut, if you need the indexes and possibly a 'hit count', then the enumerated example seems about the best (of the possibilities tested)

Scott

PS. For those interesed in my test suite, I submit the following:

import time

# ________ convenience methods ________

def build_list(seed_list,base_count):
    """
    Make a bogus list of urls with duplicates 
    """
    return_list = []
    for x in range(base_count):
        temp_list = []
        for i in range(1000):
            for url in seed_list:   # add a unique set
                temp_list.append(('www.%d_%s' % (i,url)))
        return_list += temp_list

    return return_list

def run_testscript(example_name, urls):
    """
    run the given script name
    """
    start_time      = time.time()
    exec('%s_example(urls)' % example_name)
    run_time = time.time() - start_time

    return run_time

def run_tests(example_list, urls, iterations):

    print "Starting tests with %d simulated URLs run %d times.\n%s" % \
        (len(urls), iterations, 60 * '-')

    for example_name in example_list:
        time_value  = 0.0
        for i in range(iterations):
            time_value  += run_testscript(example_name, urls)

        print '\'%s\' example\n%srun time: %0.4f seconds' % \
            (example_name, 20 * ' ', time_value/iterations)
    print 60*'-'

# ________ the various tests/examples ________

def setdefault_idx_example(urls):
    unique_urls   = {}
    for idx in range(len(urls)):
        unique_urls.setdefault(urls[idx],[]).append(idx)

def setdefault_enum_example(urls):
    unique_urls   = {}
    for idx,url in enumerate(urls):
       unique_urls.setdefault(url,[]).append(idx)

def set_example(urls):
    s = set()
    for url in urls:
        if not url in s:
            s.add(url)

def dict_example(urls):
    d = {}
    for url in urls:
        if url in d:
            d[url] = 1

# ________ run 'em all ________

url_seedlist    = ['somewhere.com','nowhere.net','elsewhere.gov','here.ca','there.org']
dummy_urls      = build_list(url_seedlist,100) # number of duplicates
testscript_list = ['set','dict', 'setdefault_idx','setdefault_enum']

run_tests(testscript_list, dummy_urls, 3)