automatic cookie handling with urllib?

Doug Fort dougfort at
Thu Mar 22 04:14:40 EST 2001

This is our cookie module.  I've been meaning to rewrite it as a Python
library Cookie object.

Doug Fort (dougfort at
Senior Meat Manager
Downright Software LLC

#!/usr/bin/env python

"""
This object stores and retrieves cookies IAW RFC 2109 & RFC 2068

$Id:,v 1.7 2001/02/16 00:27:46 dougfort Exp $
Downright Software LLC
Copyright (c) 2000 Downright Software LLC. All Rights Reserved.

Distributed and Licensed under the provisions of the WebNudge
Open Source License (Version 1.0) which is included by reference.

The WebNudge Open Source License can be found in the file WOSLV10.TXT
in the source distribution kit.
"""
__version__="$Revision: 1.7 $"[11:-2]

import re
import time
import urlparse
import string

import webnudge.util.misc
def CookieContainerException(Exception):
    def __init__(self, message):
        self._message = message
    def __str__(self):
        return self._message

class CookieContainer:
    This object stores and retrieves cookies IAW RFC 2109 & RFC 2068 

    def __init__(self, listelement=None):
        self._cookiedict = {}

        # Match dates  of this form:
        # Monday, 05-Feb-2001 08:00:00 GMT
        self._DatePattern = re.compile(r"""
            (?P<weekday>               # Start of group 'weekday'
            [A-za-z]+                  # Any word of at least one letter
            )                          # End of group 'weekday'
            \s*\,\s*                   # a literal comma after weekday
            (?P<day>                   # Start of group 'day'
            \d\d                       # two digits
            )                          # End of group 'day'
            -                          # literal hyphen 
            (?P<month>                 # Start of group 'month'
            [A-za-z]+                  # three letters
            )                          # End of group 'month'
            -                          # literal hyphen 
            (?P<year>                  # Start of group 'year'
            \d+                        # some digits
            )                          # End of group 'year'
            \s+                        # a space or more
            (?P<hour>                  # Start of group 'hour'
            \d\d                       # two digits
            )                          # End of group 'hour'
            :                          # a colon
            (?P<minute>                # Start of group 'minute'
            \d\d                       # two digits
            )                          # End of group 'minute'
            :                          # a colon
            (?P<second>                # Start of group 'second'
            \d\d                       # two digits
            )                          # End of group 'second'
            \s+                        # some whitespace
            GMT                        # literal 'GMT
            """, re.VERBOSE | re.IGNORECASE)

        self._DateFormat = "%4d-%2s-%2s %2s:%2s:%2s" # yyyy-mm-dd hh:mm:ss
        self._MonthDict = {
            "jan" : "01",
            "feb" : "02",
            "mar" : "03",
            "apr" : "04",
            "may" : "05",
            "jun" : "06",
            "jul" : "07",
            "aug" : "08",
            "sep" : "09",
            "oct" : "10",
            "nov" : "11",
            "dec" : "12"

    def __str__(self):
        Report ourself as a string
        return str(self._cookiedict)
    def isempty(self):
        Report presence of cookies
        return not self._cookiedict

    def clear(self):
        Empty out the cookies

    def loadFromHeaders(self, defaultdomain, headers):
        Extract 'set-cookie' from headers from RawHTMLPage
        Return a count of the new cookies added
        count = 0
        cookieheaderlist = headers.getallmatchingheaders("set-cookie")
        for cookieheader in cookieheaderlist:
            cookie = {
                "domain" : defaultdomain,
                "path" : "/",
                "secure" : "no"
            # split on ';' after dropping 'set-cookie:'
            tokenlist = string.split(cookieheader[12:],";")
            # assume name is the  first token
            token =  string.strip(tokenlist[0])
            index = string.find(token, "=")
            if index <= 0:
            cookie["name"] = token[:index]
            cookie["value"] = token[index+1:]
            for token in tokenlist[1:]:
                # split on the first '=', except for secure
                token = string.strip(token)
                if token == "secure":
                    cookie["secure"] = "yes"
                index = string.find(token, "=")
                if index <= 0:
                key = string.lower(token[:index])
                value = token[index+1:]
                if key == "expires":
                    cookie[key] = self._convertExpirationDate(value)
                    cookie[key] = value
            self._cookiedict[cookie["name"]] = cookie
            count = count + 1

        return count

    def returnCookieList(self, url):
        Return a list of name:value tuples for cookies that
        fit the url
        scheme,netloc,path,parameters,query,fragment = urlparse.urlparse(
        returnlist = []
        for cookie in self._cookiedict.values():
            if len(netloc) < len(cookie["domain"]):
            # The url must be in the domain the cookie specifies
            if netloc[len(netloc)-len(cookie["domain"]):] != cookie["domain"]:
            # the path must include the path the domain specifies
            if path and string.find(path, cookie["path"]) != 0:
            # if we have an expiration date, check for it
            expirationdate = cookie.get("expires", None)
            if expirationdate and expirationdate <= time.time():

        return returnlist

    def _convertExpirationDate(self, value):
        convert a date string of the form
        'Dayofweek, dd-mmm-yyyy hh:mm:ss GMT'
        into a python date.
        # look for a date we can understand
        match =
        if not match:
            return None

        # allow for a two digit year
        # redhat is one of the offenders
        if len("year")) == 4:
            year = int("year"))
            year = 2000 + int("year"))
        # kludge alert!  I can't find a slick way to convert
        # this date, so I'm going to convert it to our database
        # format and use code that I know works
        datestr = self._DateFormat % (
        return webnudge.util.misc.strtime(datestr)
if __name__ == "__main__":
    Code for commandline testing
    import sys
    if len(sys.argv) != 2:
        print "Usage: <url>"

    cookiecontainer = CookieContainer()

    import webnudge.util.rawhtmlpage
    page = webnudge.util.rawhtmlpage.RawHTMLPage()
    page.load(sys.argv[1], "GET", [], cookiecontainer, debuglevel=1)
    if not page:
        print "*** Error *** %s" % (page._message)

    print "*" * 30
    print page._data
    print "*" * 30

    print "cookie dict"
    for key, value in cookiecontainer._cookiedict.items():
        sys.stdout.write("%s = %s\n" % (key, value))

    print "cookies returned"
    for item in page._cookiesreturned:
        print item

