[Python-Dev] Patch Manager screen scraper

Jeremy Hylton jeremy@beopen.com
Fri, 14 Jul 2000 16:48:45 -0400 (EDT)

Content-Type: text/plain; charset=us-ascii
Content-Description: message body text
Content-Transfer-Encoding: 7bit

I wrote a small script to extract patch meta data from the SourceForge
patch manager and install them in a local Postgres database.  This
code might be useful for other folks, although the second half (the
Postgres half) depends on having PyGreSQL installed.

I'm posting the current source here in case anyone else wants to
fiddle.  It's definitely rough.  Attached is the code and the DB


Content-Type: text/plain
Content-Description: python script
Content-Disposition: inline;
Content-Transfer-Encoding: 7bit

"""Screen scraper for Patch Manager interface

The patch form URL is http://www.sourceforge.net/patch/index.php.
GET method
If I'm lucky, it can be used without authentication.

the input fields are:  (* means hidden field)

This script produces the following HTML for each entry:
    <TD><A HREF='?func=detailpatch&patch_id=100518&group_id=5470'>100518</A>
    <TD>fix bltinmodule.c for 64-bit platforms</TD>
    <TD>2000-Jun-07 03:21</TD>

If there are more than 50 patches, the following HTML is produced:
		<TR><TD COLSPAN='2'>&nbsp;</TD><TD>&nbsp;</TD><TD COLSPAN='2'><A HREF='?func=browse&group_id=5470&set=custom&_assigned_to=100&_status=100&offset=50'><B>Next 50 --></B></A></TD></TR></TABLE>	<!-- end content -->
Future plans:

support authentication
command-line interface for modifying patches

import cgi
import re
import types
from urllib import urlencode
from urlparse import urljoin
from urllib2 import urlopen

import pg

DATABASE = 'jeremy'

class PatchListParser:
    """Minimal re-based parsed that grabs relevant URLs from summary"""

    rx_href = re.compile('HREF="([?/=&_A-Za-z0-9]+)"')

    def parse_hrefs(self, buf):
        hrefs = []
        offset = 0
        while 1:
            mo = self.rx_href.search(buf, offset)
            if mo is None:
            offset = mo.end(1)
        return hrefs

    def get_query_hrefs(self, buf):
        queries = []
        for href in self.parse_hrefs(buf):
            if href[0] == '?':
        return queries

class PatchParser:
    """Minimal re-based parser that pulls key-values from patch page"""

    rx_entry = re.compile('<TD[^>]*><B>(.+):</B><BR>(.+)</TD>')

    def parse(self, buf):
        entries = {}
        offset = 0
        while 1:
            mo = self.rx_entry.search(buf, offset)
            if mo is None:
            offset = mo.end(2)
            k, v = mo.group(1, 2)
            entries[k] = v
        return entries

class SQLMapping:
    """Decode a simple mapping from an SQL table

    Assumes that the keys and values are disjoint, so that a single
    interface can resolve in either direction.

    def __init__(self, db, table, fields="*"):
        self.dict1 = {}
        self.dict2 = {}
        r = db.query("SELECT %s FROM %s" % (fields, table)).getresult()
        for key, val in r:
            assert None not in (key, val)
            self.dict1[key] = val
            self.dict2[val] = key

    def lookup(self, kv):
        r = self.dict1.get(kv)
        if r is None:
            r = self.dict2.get(kv)
        return r

def urldecode(query):
    d = cgi.parse_qs(query)
    for k, v in d.items():
        if len(v) != 1:
            raise ValueError, "unexpected duplicate entry"
        d[k] = v[0]
    return d

class PatchManager:
    url = "http://www.sourceforge.net/patch/index.php"
    group_id = 5470
    list_parser = PatchListParser()
    patch_parser = PatchParser()

    # XXX to get the right numeric values for assigned_to and status,
    # would need to scrape them out of the form...

    def get_patches(self, assigned_to='100', status='100'):
        assert type(assigned_to) == types.StringType
        assert type(status) == types.StringType

        url = self._get_initial_query(assigned_to, status)
        patch_list = self._load_patch_summary(url)
        patches = {}
        for patch_id, p in patch_list:
            patches[patch_id] = self._load_patch_detail(p)
        return patches

    def _get_initial_query(self, assigned_to, status):
        dict = {'group_id': self.group_id,
                'set': 'custom',
                'SUBMIT': 'Browse',
                '_assigned_to': assigned_to,
                '_status': status,
        query = urlencode(dict)
        return "%s?%s" % (self.url, query)

    def _load_patch_summary(self, url):
        todo = [(url, 0)]
        patches = []
        offset = 0
        while todo:
            url, offset = todo[0]
            del todo[0]
            buf = urlopen(url).read()
            for href in self.list_parser.get_query_hrefs(buf):
                d = urldecode(href[1:])
                if d['func'] == 'detailpatch':
                                    urljoin(self.url, href)))
                elif d['func'] == 'browse':
                    new_offset = int(d['offset'])
                    if new_offset > offset:
                        todo.append((urljoin(self.url, href),
        return patches

    def _load_patch_detail(self, url):
        buf = urlopen(url).read()
        return self.patch_parser.parse(buf)

class PatchDBInterface:
    """Interface between the PatchManager and the SQL database

    Scheme for the patches table is:
    CREATE TABLE patches_t (
       patch_id int PRIMARY KEY,
       summary text,
       status int REFERENCES status_t,
       category text,
       date text,
       submitted_by int REFERENCES users_t,
       assigned_to int REFERENCES users_t,
       summary_url text

    def __init__(self, db):
        self.db = db
        self.users = SQLMapping(db, 'users_t')
        self.status = SQLMapping(db, 'status_t')
        self.num_deletes = 0

    def update(self, patch_id, attrs):
        # resolve REFERENCES
        status = self.status.lookup(attrs['Status'])
        submitted_by = self.users.lookup(attrs['Submitted By'])
        if submitted_by is None:
            submitted_by = 0
        assigned_to = self.users.lookup(attrs['Assigned To'])
        if assigned_to is None:
            assigned_to = 100

        # delete old version if necessary
        if self.has_patch(patch_id):
            q = "DELETE FROM patches_t WHERE patch_id = %(patch_id)d"
            self.db.query(q % locals())
            self.num_deletes = self.num_deletes + 1

        d = locals()
        del d['attrs'] # just to make debugging prints clearer
        for k, v in attrs.items():
            d[k] = pg._quote(v, 0)
        q = "INSERT INTO patches_t VALUES (%(patch_id)d," \
            " %(Summary)s, %(status)d, %(Category)s, %(Date)s," \
            " %(submitted_by)d, %(assigned_to)d)"
        self.db.query(q % d)

    def has_patch(self, patch_id):
        r = self.db.query("SELECT * FROM patches_t" \
                          " WHERE patch_id = %d" % patch_id).getresult()
        if r:
            return 1
            return 0

if __name__ == "__main__":
    import sys
    import getopt

    opts, args = getopt.getopt(sys.argv[1:], 'vd:')
    assert len(args) == 0
    for k, v in opts:
        if k == '-v':
            VERBOSE = 1
        elif k == '-d':
            DATABASE = v
    pmgr = PatchManager()
    if VERBOSE:
        print "Loading patches"
    p = pmgr.get_patches()
    if VERBOSE:
        print "Retrieved %d patches" % len(p)

    if VERBOSE:
        print "Inserting into local database"
    db = pg.connect(DATABASE)
    pdbi = PatchDBInterface(db)
    for p_id, attrs in p.items():
        pdbi.update(p_id, attrs)
    if VERBOSE:
        new = len(p) - pdbi.num_deletes
        print "Found %d new patches" % new
        print "Updated %d existing patches" % pdbi.num_deletes

Content-Type: text/plain
Content-Description: database schema
Content-Disposition: inline;
Content-Transfer-Encoding: 7bit

CREATE TABLE users_t (
       user_id int PRIMARY KEY,
       username text NOT NULL

CREATE TABLE status_t (
       status_id int PRIMARY KEY,
       name text NOT NULL

CREATE TABLE patches_t (
       patch_id int PRIMARY KEY,
       summary text,
       status int REFERENCES status_t,
       category text,
       date text,
       submitted_by int REFERENCES users_t,
       assigned_to int REFERENCES users_t,
       summary_url text