Parsing HTML

Richie Hindle richie at
Thu Sep 23 13:44:57 CEST 2004

> I want to extract some info from some specific HTML pages, Microsoft's
> International Word List (the example URL was stripped from the original
> message). I want to take all the words, both English and the other
> language, and create a dictionary, so that I can look up About and get
> Om as the answer.

BeautifulSoup (the download URL was stripped from the original message)
is perfect for this job:

import urllib2, pprint
from BeautifulSoup import BeautifulSoup

def cellToWord(cell):
   """Given a table cell, return the word in that cell.

   Bold cells contain just the word; plain cells prefix the word with a
   number and a dot (e.g. "1. About"), which is stripped off.
   """
   # Some words are in bold.
   if cell('b'):
      return cell.first('b').string.strip()   # Return the bold piece.
   # Fix: this line used to sit directly after the return above, making it
   # unreachable; non-bold cells silently returned None. It must run only
   # when the cell has no bold element.
   return cell.string.split('.')[1].strip()   # Remove the number.

def parse(url):
   """Parse the given URL and return a dictionary mapping US words to
   foreign words."""
   # Read the URL and pass it to BeautifulSoup.
   html = urllib2.urlopen(url).read()
   # Fix: the fetched markup was never passed to the constructor, so the
   # soup was empty and soup.first('table') found nothing.
   soup = BeautifulSoup(html)
   # Read the main table, extracting the words from the table cells.
   USToForeign = {}
   mainTable = soup.first('table')
   rows = mainTable('tr')
   for row in rows[1:]:        # Exclude the first (headings) row.
      cells = row('td')
      if len(cells) == 3:      # Some rows have a single colspan="3" cell.
         US = cellToWord(cells[0])
         foreign = cellToWord(cells[1])
         USToForeign[US] = foreign
   return USToForeign

if __name__ == '__main__':
   url = ''
   USToForeign = parse(url)
   # Sort pairs case-insensitively by the US word (the web page's order).
   # A key= function replaces the old cmp-style comparator, which is
   # removed in Python 3 and slower even on Python 2.
   pairs = sorted(USToForeign.items(), key=lambda pair: pair[0].lower())

Richie Hindle
richie at

More information about the Python-list mailing list