Newbie ? -- SGML metadata extraction

Adonis adonisv at DELETETHISTEXTearthlink.net
Tue Jan 17 00:36:23 EST 2006


ProvoWallis wrote:
> Thanks. One more question, though.
> 
> I'm not sure how to limit the scope of my search so that I'm just
> extracting the id attribute from the sections that I want. I.e., I want
> the id attributes from the forms in sections 1 and 3 but not from 2.
> 
> Maybe I'm missing something.
> 

If the data has closing tags, this is easily achieved using a DOM or SAX
parser, but here is a slightly modified version: very ugly, but simple.

Hope this helps.

Adonis

---

from HTMLParser import HTMLParser

# Sample SGML-like input for the demo below: three <main-section> start tags
# (no="1", "2", "3"), each followed by one or more <form id="..."> start tags.
# None of the tags are closed, so a section is delimited only by the next
# <main-section> tag.
data = """<main-section no="1">

<form id="graphic_1.tif">
<form id="graphic_2.tif">

<main-section no="2">

<form id="graphic_3.tif">

<main-section no="3">

<form id="graphic_4.tif">
<form id="graphic_5.tif">
<form id="graphic_6.tif">
"""

class ParseForms(HTMLParser):
    """Collect the ``id`` of every <form> tag, grouped by section.

    A <main-section no="..."> start tag opens a section; each <form> start
    tag seen afterwards is filed under that section's ``no`` value until the
    next <main-section> tag appears.  Only start tags are inspected, so the
    input does not need closing tags.
    """

    # Class-level default: no section has been opened yet.
    _section = None

    def __init__(self, *args, **kwargs):
        # Call the base initialiser directly instead of via super() so this
        # also works where HTMLParser is an old-style class (Python 2).
        HTMLParser.__init__(self, *args, **kwargs)
        self._section = None
        # Per-instance mapping of section number -> list of form ids.  This
        # used to be a class attribute, which made every parser instance
        # share (and pollute) the same dictionary.
        self._secDict = {}

    def getSection(self, key):
        """Return the list of form ids recorded for section *key*.

        *key* is converted with ``str()`` before the lookup, so ``1`` and
        ``"1"`` refer to the same section.  Returns ``None`` when no form
        was recorded under that section.
        """
        return self._secDict.get(str(key))

    def handle_starttag(self, tag, attrs):
        """Record form ids and track the current section.

        *attrs* is the list of ``(name, value)`` pairs supplied by
        HTMLParser.  A <form> without an ``id`` attribute is recorded as
        ``None``.
        """
        attributes = dict(attrs)
        if tag == "form":
            # Forms seen before any <main-section> are filed under the key
            # None, which getSection() cannot reach because it stringifies
            # its argument.
            self._secDict.setdefault(self._section, []).append(
                attributes.get('id'))
        elif tag == "main-section":
            self._section = attributes.get('no')

if __name__ == "__main__":
    # Demo: parse the sample data and show the form ids found in sections
    # 1 and 3 only (section 2 is parsed but deliberately not printed).
    parser = ParseForms()
    parser.feed(data)
    # A parenthesised single-argument print produces the same output as the
    # Python 2 print statement and is also valid Python 3 syntax.
    print(parser.getSection(1))
    print(parser.getSection(3))




More information about the Python-list mailing list