automatic cookie handling with urllib?

Doug Fort dougfort at downright.com
Thu Mar 22 04:14:40 EST 2001


This is our cookie module.  I've been meaning to rewrite it as a Python
library Cookie object.

--
Doug Fort (dougfort at downright.com)
Senior Meat Manager
Downright Software LLC
http://www.dougfort.net


-------------- next part --------------
#!/usr/bin/env python
"""
CookieContainer

This object stores and retrieves cookies IAW RFC 2109 & RFC 2068

$Id: cookiecontainer.py,v 1.7 2001/02/16 00:27:46 dougfort Exp $
"""
__author__="""
Downright Software LLC
http://www.downright.com
"""
__copyright__="""
Copyright (c) 2000 Downright Software LLC. All Rights Reserved.

Distributed and Licensed under the provisions of the WebNudge
Open Source License (Version 1.0) which is included by reference.

The WebNudge Open Source License can be found in the file WOSLV10.TXT
in the source distribution kit.
"""
# Pull the bare revision number ("1.7") out of the CVS keyword string by
# dropping the leading "$Revision: " (11 characters) and the trailing " $".
__version__="$Revision: 1.7 $"[11:-2]

import re
import time
import urlparse
import string

import webnudge.util.misc
        
class CookieContainerException(Exception):
    """
    Error raised for cookie container failures.

    message -- human readable description; returned by str()
    """
    def __init__(self, message):
        # Initialise the base class too, so the message also shows up in
        # the exception's args and in tracebacks.
        Exception.__init__(self, message)
        self._message = message

    def __str__(self):
        return self._message

###########################################################
class CookieContainer:
###########################################################
    """
    This object stores and retrieves cookies IAW RFC 2109 & RFC 2068

    Cookies are kept in self._cookiedict, keyed by cookie name.  Each
    cookie is a dictionary that always has the keys 'name', 'value',
    'domain', 'path' and 'secure' ("yes" or "no"), plus any other
    attribute found in the header (keys lower-cased).  'expires' holds
    whatever _convertExpirationDate returned.

    NOTE: because the key is the bare cookie name, a new cookie replaces
    any earlier cookie of the same name, whatever its domain or path.
    """

    # Match dates of these forms (the weekday may be abbreviated):
    #   Monday, 05-Feb-2001 08:00:00 GMT
    #   Mon, 05 Feb 2001 08:00:00 GMT
    _DatePattern = re.compile(r"""
        (?P<weekday>               # Start of group 'weekday'
        [A-Za-z]+                  # Any word of at least one letter
        )                          # End of group 'weekday'
        \s*\,\s*                   # a literal comma after weekday
        (?P<day>                   # Start of group 'day'
        \d\d                       # two digits
        )                          # End of group 'day'
        [-\s]+                     # hyphen or whitespace
        (?P<month>                 # Start of group 'month'
        [A-Za-z]+                  # month name, abbreviated or spelled out
        )                          # End of group 'month'
        [-\s]+                     # hyphen or whitespace
        (?P<year>                  # Start of group 'year'
        \d+                        # some digits
        )                          # End of group 'year'
        \s+                        # a space or more
        (?P<hour>                  # Start of group 'hour'
        \d\d                       # two digits
        )                          # End of group 'hour'
        :                          # a colon
        (?P<minute>                # Start of group 'minute'
        \d\d                       # two digits
        )                          # End of group 'minute'
        :                          # a colon
        (?P<second>                # Start of group 'second'
        \d\d                       # two digits
        )                          # End of group 'second'
        \s+                        # some whitespace
        GMT                        # literal 'GMT'
        """, re.VERBOSE | re.IGNORECASE)

    _DateFormat = "%4d-%2s-%2s %2s:%2s:%2s" # yyyy-mm-dd hh:mm:ss

    # Keyed by the first three letters of the month name, lower-cased
    _MonthDict = {
        "jan" : "01",
        "feb" : "02",
        "mar" : "03",
        "apr" : "04",
        "may" : "05",
        "jun" : "06",
        "jul" : "07",
        "aug" : "08",
        "sep" : "09",
        "oct" : "10",
        "nov" : "11",
        "dec" : "12"
        }

    #----------------------------------------------------------
    def __init__(self, listelement=None):
    #----------------------------------------------------------
        """
        Constructor: start with no cookies.

        listelement -- not used by this class
        """
        self._cookiedict = {}

    #----------------------------------------------------------
    def __str__(self):
    #----------------------------------------------------------
        """
        Report ourself as a string (the string form of the cookie
        dictionary)
        """
        return str(self._cookiedict)

    #----------------------------------------------------------
    def isempty(self):
    #----------------------------------------------------------
        """
        Return true when no cookies are stored
        """
        return not self._cookiedict

    #----------------------------------------------------------
    def clear(self):
    #----------------------------------------------------------
        """
        Empty out the cookies
        """
        self._cookiedict.clear()

    #----------------------------------------------------------
    def loadFromHeaders(self, defaultdomain, headers):
    #----------------------------------------------------------
        """
        Extract the 'set-cookie' headers and store one cookie for each.

        defaultdomain -- domain recorded for a cookie whose header has
                         no 'domain' attribute
        headers       -- object with a getallmatchingheaders(name)
                         method returning the raw header lines, e.g.
                         'Set-Cookie: a=b; path=/'

        Return a count of the cookies stored by this call.  Headers
        without a 'name=value' first token are skipped and not counted.
        """
        count = 0

        rawlines = headers.getallmatchingheaders("set-cookie")
        for cookieheader in self._foldHeaderLines(rawlines):
            # Drop the header name by splitting on the first ':' rather
            # than slicing a fixed width, so 'Set-Cookie:a=b' (no space
            # after the colon) does not lose the first character of the
            # cookie name.
            colon = cookieheader.find(":")
            if colon < 0:
                continue
            tokenlist = cookieheader[colon+1:].split(";")

            # assume name=value is the first token
            token = tokenlist[0].strip()
            index = token.find("=")
            if index <= 0:
                continue
            cookie = {
                "domain" : defaultdomain,
                "path" : "/",
                "secure" : "no",
                "name" : token[:index],
                "value" : token[index+1:]
                }

            for token in tokenlist[1:]:
                # split on the first '=', except for secure
                token = token.strip()
                if token.lower() == "secure":
                    cookie["secure"] = "yes"
                    continue
                index = token.find("=")
                if index <= 0:
                    continue
                key = token[:index].strip().lower()
                value = token[index+1:].strip()
                if key in ("name", "value"):
                    # an attribute must not clobber the cookie itself
                    continue
                if key == "expires":
                    value = self._convertExpirationDate(value)
                cookie[key] = value

            self._cookiedict[cookie["name"]] = cookie
            count = count + 1

        return count

    #----------------------------------------------------------
    def returnCookieList(self, url):
    #----------------------------------------------------------
        """
        Return a list of (name, value) tuples for the cookies that
        fit the url.

        A cookie fits when the url's host is in the cookie's domain,
        the url's path starts with the cookie's path (a url with an
        empty path is not filtered by path), a cookie marked secure is
        only being sent over https, and the cookie has not expired.
        """
        scheme,netloc,path,parameters,query,fragment = urlparse.urlparse(
            url
            )
        host = netloc.lower()
        issecure = scheme.lower() == "https"
        now = time.time()

        returnlist = []
        for cookie in self._cookiedict.values():
            # The url must be in the domain the cookie specifies
            if not self._domainMatches(host, cookie["domain"]):
                continue
            # the url path must start with the path the cookie specifies
            if path and not path.startswith(cookie["path"]):
                continue
            # never send a secure cookie over an insecure scheme
            if cookie.get("secure") == "yes" and not issecure:
                continue
            # if we have an expiration date, check for it
            expirationdate = cookie.get("expires", None)
            if expirationdate and expirationdate <= now:
                continue
            returnlist.append((cookie["name"],cookie["value"]))

        return returnlist

    #----------------------------------------------------------
    def _foldHeaderLines(self, rawlines):
    #----------------------------------------------------------
        """
        Join continuation lines (lines starting with a space or tab)
        onto the header line they continue, so that each item of the
        returned list is one complete header.
        """
        folded = []
        for line in rawlines:
            if folded and line[:1] in (" ", "\t"):
                folded[-1] = folded[-1].rstrip() + " " + line.strip()
            else:
                folded.append(line)
        return folded

    #----------------------------------------------------------
    def _domainMatches(self, host, domain):
    #----------------------------------------------------------
        """
        Return true if the (lower-cased) host belongs to the cookie
        domain: either the two are equal, or the host ends with the
        domain at a '.' boundary.  A plain suffix test is not enough:
        'evilexample.com' must not receive cookies for 'example.com'.
        An empty domain matches nothing.
        """
        domain = domain.lower()
        if not domain:
            return 0
        if host == domain:
            return 1
        if domain[:1] != ".":
            domain = "." + domain
        return host.endswith(domain)

    #----------------------------------------------------------
    def _normalizeExpirationDate(self, value):
    #----------------------------------------------------------
        """
        Convert a date string of the form
        'Dayofweek, dd-mmm-yyyy hh:mm:ss GMT'
        (hyphens may be spaces, the month may be spelled out) into
        'yyyy-mm-dd hh:mm:ss'.

        Return None if no such date is found in value or the month
        name is not recognised.
        """
        # look for a date we can understand
        match = self._DatePattern.search(value)
        if not match:
            return None

        # only the first three letters identify the month, so that
        # 'February' works as well as 'Feb'
        month = self._MonthDict.get(match.group("month").lower()[:3])
        if month is None:
            return None

        # allow for a two digit year
        # redhat is one of the offenders
        yeartext = match.group("year")
        year = int(yeartext)
        if len(yeartext) != 4:
            # 69-99 are taken as 19xx (the usual two digit year
            # convention); servers use such past dates to delete a
            # cookie, so they must not be read as 20xx
            if 69 <= year <= 99:
                year = 1900 + year
            else:
                year = 2000 + year

        return self._DateFormat % (
            year,
            month,
            match.group("day"),
            match.group("hour"),
            match.group("minute"),
            match.group("second"),
            )

    #----------------------------------------------------------
    def _convertExpirationDate(self, value):
    #----------------------------------------------------------
        """
        convert a date string of the form
        'Dayofweek, dd-mmm-yyyy hh:mm:ss GMT'
        into a python date.

        Return None if value holds no date we can understand.

        NOTE(review): returnCookieList compares the result with
        time.time(), so webnudge.util.misc.strtime presumably returns
        seconds since the epoch -- confirm, and confirm that it reads
        the string as GMT rather than local time.
        """
        # kludge alert!  I can't find a slick way to convert
        # this date, so I'm going to convert it to our database
        # format and use code that I know works
        datestr = self._normalizeExpirationDate(value)
        if datestr is None:
            return None

        return webnudge.util.misc.strtime(datestr)
        
#----------------------------------------------------------
if __name__ == "__main__":
#----------------------------------------------------------
    """
    Code for commandline testing
    """
    import sys
    if len(sys.argv) != 2:
        print "Usage: cookiecontainer.py <url>"
        sys.exit(-1)

    cookiecontainer = CookieContainer()

    import webnudge.util.rawhtmlpage
    page = webnudge.util.rawhtmlpage.RawHTMLPage()
    page.load(sys.argv[1], "GET", [], cookiecontainer, debuglevel=1)
    if not page:
        print "*** Error *** %s" % (page._message)
        sys.exit(-1)

    print "*" * 30
    print page._data
    print "*" * 30

    print "cookie dict"
    for key, value in cookiecontainer._cookiedict.items():
        sys.stdout.write("%s = %s\n" % (key, value))

    print "cookies returned"
    for item in page._cookiesreturned:
        print item


More information about the Python-list mailing list