Using XML w/ Python...

Steve Holden steve at
Sun Dec 11 12:17:23 CET 2005

Jay wrote:
> Yes i know, i did check out a couple but i could never understand it.
> They were confusing for me and i wasnt hoping for a full typed
> tutorial, just like some help with excactly wat im trying to do, not
> the whole module... but watever, Thx alot for the feedbak.
Well I don't want to hold this up as an example of best practice (it was 
a quick hack to get some book graphics for my web site), but this 
example shows you how you can extract stuff from XML, in this case 
returned from Amazon's web services module.

Sorry about any wrapping that mangles the code.


# download book details from Amazon's web services
# hwBuild: database-driven web content management system
# Copyright (C) 2005 Steve Holden - steve at
# This program is free software; you can redistribute it
# and/or modify it under the terms of the GNU General
# Public License as published by the Free Software
# Foundation; either version 2 of the License, or (at
# your option) any later version.
# This program is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
# PURPOSE. See the GNU General Public License for more details.
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA
import urllib
import urlparse
import os
import re
from xml.parsers import expat
from config import Config
# Pattern for a run of decimal digits; used with .match(), so it only
# succeeds when the digits are at the very start of the string.
filpat = re.compile(r"\d+")

# Working directory for this script, located under the configured data
# directory.
picindir = os.path.join(Config['datadir'], "pybooks")

# Clear out whatever an earlier run left behind.  NOTE: this runs at import
# time and raises if the directory is missing or contains a subdirectory.
for f in os.listdir(picindir):
    os.remove(os.path.join(picindir, f))

class myParser:
    """Expat-based scanner that downloads every MediumImage/URL it meets.

    The handlers implement a three-state machine held in ``self.processing``:

    * 0 -- outside any ``<MediumImage>`` element
    * 1 -- inside ``<MediumImage>`` but not inside its ``<URL>`` child
    * 2 -- inside the ``<URL>`` child; character data is being collected

    When a ``<URL>`` element closes, the collected text is treated as an
    image address, fetched, and stored in the module-level ``picindir``
    directory.  ``self.count`` holds the number of images stored.

    NOTE(review): the archived copy of this class had lost the right-hand
    sides of several statements and the ParseFile() call; they have been
    reconstructed from the surrounding variable names -- confirm against the
    author's original if it is available.
    """

    def __init__(self):
        self.parser = expat.ParserCreate()
        self.parser.StartElementHandler = self.start_element
        self.parser.EndElementHandler = self.end_element
        self.parser.CharacterDataHandler = self.character_data
        self.processing = 0
        self.count = 0
        # Text of the <URL> element currently being collected.
        self.imgname = ""

    def parse(self, f):
        """Parse the XML in the open file object *f*.

        Returns the number of images stored by this parser instance.
        Raises ``expat.ExpatError`` if the document is not well formed.
        """
        # Without this call none of the handlers ever run and the count
        # stays at zero.
        self.parser.ParseFile(f)
        return self.count

    def start_element(self, name, attrs):
        """Track entry into <MediumImage> and its <URL> child."""
        if name == "MediumImage":
            self.processing = 1
            self.imgname = ""
        if self.processing == 1 and name == "URL":
            self.processing = 2

    def end_element(self, name):
        """On </URL> inside <MediumImage>, fetch the image; track exits."""
        if self.processing == 2 and name == "URL":
            self.processing = 1
            self._save_image(self.imgname)
        if self.processing == 1 and name == "MediumImage":
            self.processing = 0

    def character_data(self, data):
        """Accumulate text while inside the <URL> element.

        Expat may deliver one text node in several pieces, hence the
        concatenation rather than a plain assignment.
        """
        if self.processing == 2:
            self.imgname += data

    def _save_image(self, url):
        """Download *url* into ``picindir`` as ``<digits>.jpg``.

        The file name is the run of digits that starts the last component
        of the URL path.  URLs without such a prefix are skipped, because
        there is nothing to name the file after.  ``self.count`` is bumped
        only after the file has been written.
        """
        # Parenthesised single-argument form prints the same text as the
        # original print statement under Python 2.
        print("Getting: %s" % url)
        scheme, loc, path, params, query, fragment = urlparse.urlparse(url)
        itemno = filpat.match(os.path.basename(path))
        if itemno is None:
            print("Skipping (no leading digits in file name): %s" % url)
            return
        fnam = itemno.group()
        u = urllib.urlopen(url)
        try:
            img = u.read()
        finally:
            u.close()
        # try/finally rather than "with" keeps this usable on the old
        # Python 2 releases the rest of the script targets.
        outfile = open(os.path.join(picindir, "%s.jpg" % fnam), "wb")
        try:
            outfile.write(img)
        finally:
            outfile.close()
        self.count += 1

# Fetch result pages for a search and save the images they reference.
#
# search -- search text; when it is None (or otherwise false) the value of
#           Config['book-search'] is used instead.
# Returns the sum of the counts reported by the per-page parsers.
#
# NOTE(review): this copy of the function is incomplete -- three statements
# were truncated when the message was archived (marked below).  It will not
# compile until they are restored from the author's original.
def main(search=None):
     print "Search:", search
     count = 0
     # Pages 1 through 4 are requested.
     for pageNum in range(1,5):
         # TRUNCATED: the right-hand side is missing.  The surviving "%"
         # operands on the next line show it was a format string taking the
         # URL-quoted search text and the page number, inside a call
         # (presumably urllib.urlopen on a web-service URL -- TODO confirm).
         f = 
% (urllib.quote(search or Config['book-search']), pageNum))
         # The response is spooled to a scratch file in picindir and then
         # re-read, so the parser works from a local file.
         fnam = os.path.join(picindir, "bookdata.txt")
         # TRUNCATED: the argument and closing parenthesis are missing
         # (presumably f.read() -- TODO confirm).
         file(fnam, "w").write(
         f = file(fnam, "r")
         # A fresh parser per page, so n is this page's image count.
         p = myParser()
         n = p.parse(f)
         # TRUNCATED: the body of this "if" is missing (presumably a break
         # that stops paging once a page yields no images -- TODO confirm).
         if n == 0:
         count += n
     return count

if __name__ == "__main__":
     import sys
     search = None
     if len(sys.argv) > 1:
         search = sys.argv[1]
     n = main(search)
     print "Pictures found:", n
Steve Holden       +44 150 684 7255  +1 800 494 3119
Holden Web LLC           
PyCon TX 2006        

More information about the Python-list mailing list