[Tutor] Regex for a variable within a string

Wed Apr 8 12:28:09 CEST 2009

Hi, I'm new to python and have decided to develop a web crawler / file
downloader as my first application. I am at the stage where the script
requests a page and parses the page for URLs, then prints them out. However,
I'd like to change my current regex that greps for 'http' to one that will
grep for the url variable that is used in the connect string.

I was hoping I could use something like p=re.compile((url).*?'"') but this
is clearly not the right syntax. Apologies for such a newbie question! My
current code is below:

Thanks in advance

#! /usr/bin/python

import urllib2
import sys
import re
import getopt
import HTMLParser
import httplib

#====== GET OPTIONS
def main(argv):
 url = "nothing"
 filetype = "nofile"
  try:
  opts, args = getopt.getopt(argv, "u:f:",["url=","filetype="])
 except getopt.GetoptError:
  usage()
  sys.exit(2)

 for opt, arg in opts:
  if opt in ("-u", "--url"):
  url = arg
  elif opt in ("-f","--filetype"):
  filetype = arg

 print "Using base url: " + url + "\n" "Using filetype: " + filetype
 request(url,filetype)

#===== PRINT USAGE
def usage():
 print '''
 Bulk file downloader

-u = base url
-f = filetype

example: bulkdld.py -u http://www.google.com -f .wmv
'''
 sys.exit(1)

#========REQUEST PAGE
def request(url,filetype):

 txdata = None
 txheaders= {
  'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
  'Accept-Language': 'en-us',
  'Keep-Alive': '300',
  'Connection': 'keep-alive',
  'Cache-Control': 'max-age=0',
  }
 req = urllib2.Request(url, txdata, txheaders)
 u = urllib2.urlopen(req)
 headers = u.info()
 data = u.read()
 #print data

 p=re.compile('http://.*?"') ########## THIS IS THE REGEX IN QUESTION
 m=p.findall(data)

 print m

if __name__ == "__main__":
 main(sys.argv[1:])
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.python.org/pipermail/tutor/attachments/20090408/f0f22376/attachment.htm>