[Tutor] help with comparing list of tuples in python 2

Sat Jun 18 04:00:44 EDT 2016

Lulu J wrote:

> Hi there,
> 
> My apologies if this is a trivial question but I am sort of new to python.
> Here is my problem:
> I have a list of dictionaries. Each dictionary has a word and its position
> in the text; the positions are in the form of a tuple.
> Here is an example:
> [
> {'position': (5, 4), 'term': u'happy',},
>  {'position': (5, 5), 'term': u'something'}
> ]

How did you get your data? Perhaps it is possible to store it in a different 
data structure in the first place, one that is a better fit for your 
problem.

> For the positions, the first element is the paragraph number and the second
> is the word number in that paragraph (sequence from 1...n).
> 
> What I would like to do is find which words are next to each other. Meaning,
> they will be in the same paragraph and the difference of their word
> numbers is 1.

What is the actual question your script has to answer? Is it

(1) What word precedes/follows the third word in the sixth paragraph?

or

(2) What words precede/follow the word u"happy"?

or something else I didn't think of?

For (2) here is a not-so-short self-contained example:

$ cat neighbors.py
import pprint
import sys

# Make the name raw_input usable on both Python 2 and Python 3: if it is
# not defined, fall back to the built-in input().
try:
    raw_input
except NameError:
    raw_input = input  # Python 3

# Demo text used by main(); it is split with splitlines(), so every line
# is handed to gen_terms() as one paragraph.
SAMPLE = u"""\
The sky is blue
Press the blue button
This is pie in the sky
"""

def gen_terms(paragraphs):
    """Yield one term dict per whitespace-separated word.

    Each paragraph is lower-cased and split on whitespace; every resulting
    word is reported as ``{"position": (para_index, word_index),
    "term": word}`` with both indices counted from zero.  This is a small
    stand-in for whatever produces the real term list.
    """
    for row, paragraph in enumerate(paragraphs):
        tokens = paragraph.lower().split()
        for column, token in enumerate(tokens):
            location = (row, column)
            yield {"position": location, "term": token}

def find_neighbors(word, position_to_word, word_to_position):
    """Return ``(before, after)``: the sets of words adjacent to *word*.

    Every occurrence of *word* listed in *word_to_position* is inspected;
    the word one slot to the left (if any) goes into the first set and the
    word one slot to the right (if any) into the second.  Raises KeyError
    when *word* is not a key of *word_to_position*.
    """
    # Sets only record which neighbours occur; switch to
    # collections.Counter() if frequencies are of interest.
    preceding, following = set(), set()
    directions = ((-1, preceding), (+1, following))
    for paragraph, index in word_to_position[word]:
        for offset, bucket in directions:
            try:
                neighbor = position_to_word[paragraph, index + offset]
            except KeyError:
                # No such slot: the occurrence is the first (offset -1)
                # or the last (offset +1) word of its paragraph.
                continue
            bucket.add(neighbor)
    return preceding, following

def format_words(words):
    """Return *words* sorted and comma-separated, or "<none>" if that is empty."""
    joined = ", ".join(sorted(words))
    if joined:
        return joined
    return "<none>"

def build_lookup_tables(terms):
    """Index *terms* in both directions.

    *terms* is an iterable of dicts with "position" and "term" keys.
    Returns ``(word_to_position, position_to_word)`` where the first dict
    maps each word to the list of all its positions (in input order) and
    the second maps each position to the word found there.
    """
    positions_by_word = {}
    word_by_position = {}
    for entry in terms:
        where, what = entry["position"], entry["term"]
        try:
            positions_by_word[what].append(where)
        except KeyError:
            # first time this word is seen
            positions_by_word[what] = [where]
        word_by_position[where] = what
    return positions_by_word, word_by_position

def normalize(word):
    """Return *word* as lower-cased text without surrounding whitespace.

    A byte string (what ``raw_input()`` returns on Python 2) is decoded to
    unicode first so that it can be compared with unicode terms; text
    (Python 3 ``str``, Python 2 ``unicode``) is used as is.
    """
    if isinstance(word, bytes):
        # Decode with the terminal's encoding instead of the implicit ASCII
        # default of a bare .decode(), which raises UnicodeDecodeError as
        # soon as the user types a non-ASCII character on Python 2.
        encoding = getattr(sys.stdin, "encoding", None) or "utf-8"
        word = word.decode(encoding, "replace")
    # Terms produced by str.split() never contain whitespace, so blanks
    # typed around the word could only ever prevent a match.
    return word.strip().lower()

def main():
    """Run the interactive neighbour lookup on SAMPLE.

    Builds the term list and both lookup tables from SAMPLE, then keeps
    prompting for a word and prints the words found directly before and
    after it.  An empty line or end of input (Ctrl-D, exhausted pipe) ends
    the session.  With ``--verbose`` on the command line the original text
    and all intermediate data structures are printed first.
    """
    verbose = "--verbose" in sys.argv
    if verbose:
        print("Original text:")
        print(SAMPLE)

    # One paragraph per line of the sample text.
    terms = list(gen_terms(SAMPLE.lower().splitlines()))
    if verbose:
        print("list of terms:")
        pprint.pprint(terms)
        print("")

    word_to_position, position_to_word = build_lookup_tables(terms)
    if verbose:
        print("word --> position")
        pprint.pprint(dict(word_to_position))
        print("position --> word")
        pprint.pprint(position_to_word)
        print("")

    while True:
        try:
            answer = raw_input("enter a word: ")
        except EOFError:
            # End of input would otherwise abort with a traceback; finish
            # the prompt line and leave as if an empty word was entered.
            print("")
            answer = ""
        word = normalize(answer)
        if not word:
            print("Bye!")
            break
        elif word not in word_to_position:
            print("Unknown word, enter one of these:")
            print(format_words(word_to_position))
            print("")
        else:
            before, after = find_neighbors(
                word,
                position_to_word=position_to_word,
                word_to_position=word_to_position
            )
            print(u"These words occur before '{}'".format(word))
            print(format_words(before))
            print(u"These words occur after '{}'".format(word))
            print(format_words(after))
            print("")

# Start the interactive session only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()

Let's run it in verbose mode so that you can see the data structures used:

$ python neighbors.py --verbose
Original text:
The sky is blue
Press the blue button
This is pie in the sky

list of terms:
[{'position': (0, 0), 'term': u'the'},
 {'position': (0, 1), 'term': u'sky'},
 {'position': (0, 2), 'term': u'is'},
 {'position': (0, 3), 'term': u'blue'},
 {'position': (1, 0), 'term': u'press'},
 {'position': (1, 1), 'term': u'the'},
 {'position': (1, 2), 'term': u'blue'},
 {'position': (1, 3), 'term': u'button'},
 {'position': (2, 0), 'term': u'this'},
 {'position': (2, 1), 'term': u'is'},
 {'position': (2, 2), 'term': u'pie'},
 {'position': (2, 3), 'term': u'in'},
 {'position': (2, 4), 'term': u'the'},
 {'position': (2, 5), 'term': u'sky'}]

word --> position
{u'blue': [(0, 3), (1, 2)],
 u'button': [(1, 3)],
 u'in': [(2, 3)],
 u'is': [(0, 2), (2, 1)],
 u'pie': [(2, 2)],
 u'press': [(1, 0)],
 u'sky': [(0, 1), (2, 5)],
 u'the': [(0, 0), (1, 1), (2, 4)],
 u'this': [(2, 0)]}
position --> word
{(0, 0): u'the',
 (0, 1): u'sky',
 (0, 2): u'is',
 (0, 3): u'blue',
 (1, 0): u'press',
 (1, 1): u'the',
 (1, 2): u'blue',
 (1, 3): u'button',
 (2, 0): u'this',
 (2, 1): u'is',
 (2, 2): u'pie',
 (2, 3): u'in',
 (2, 4): u'the',
 (2, 5): u'sky'}

enter a word: foo
Unknown word, enter one of these:
blue, button, in, is, pie, press, sky, the, this

enter a word: is 
These words occur before 'is'
sky, this
These words occur after 'is'
blue, pie

enter a word: press
These words occur before 'press'
<none>
These words occur after 'press'
the

enter a word: button
These words occur before 'button'
blue
These words occur after 'button'
<none>

enter a word: 
Bye!
$