[Tutor] Regex for a variable within a string
David Cash
cashbang at googlemail.com
Wed Apr 8 12:28:09 CEST 2009
Hi, I'm new to Python and have decided to develop a web crawler / file
downloader as my first application. I am at the stage where the script
requests a page and parses the page for URLs, then prints them out. However,
I'd like to change my current regex that greps for 'http' to one that will
grep for the url variable that is used in the connect string.
I was hoping I could use something like p=re.compile((url).*?'"') but this
is clearly not the right syntax. Apologies for such a newbie question! My
current code is below:
Thanks in advance
#! /usr/bin/python
import urllib2
import sys
import re
import getopt
import HTMLParser
import httplib
#====== GET OPTIONS
def main(argv):
    """Parse the command-line options and start the page request.

    argv -- the argument list without the program name (e.g. sys.argv[1:]).
            Recognised options are -u/--url (base url) and
            -f/--filetype (file extension).

    Shows the usage text and exits when the options cannot be parsed or
    when no base url was supplied.
    """
    url = None
    filetype = "nofile"
    try:
        opts, _positional = getopt.getopt(argv, "u:f:", ["url=", "filetype="])
    except getopt.GetoptError:
        usage()
        # Safeguard: guarantees a bad command line never falls through to
        # the request below, even if usage() were to return.
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-u", "--url"):
            url = arg
        elif opt in ("-f", "--filetype"):
            filetype = arg
    if not url:
        # Without a base url there is nothing to fetch, so explain the
        # options instead of issuing a request with a placeholder value.
        usage()
        sys.exit(2)
    print("Using base url: " + url + "\n" "Using filetype: " + filetype)
    request(url, filetype)
#===== PRINT USAGE
def usage():
    """Print the command-line help text, then exit with status 1."""
    # The empty first and last entries reproduce the blank lines that frame
    # the help text.
    help_lines = (
        "",
        "Bulk file downloader",
        "-u = base url",
        "-f = filetype",
        "example: bulkdld.py -u http://www.google.com -f .wmv",
        "",
    )
    print("\n".join(help_lines))
    sys.exit(1)
#========REQUEST PAGE
def request(url, filetype):
    """Download *url* and print every http:// link found in the page.

    url      -- address of the page to fetch.
    filetype -- file extension chosen on the command line; accepted so the
                caller's signature keeps working, but not used here yet.

    The matches are printed as a list; each entry still ends with the
    double quote that terminated the match.
    """
    txdata = None  # no request body is sent
    txheaders = {
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
        'Accept-Language': 'en-us',
        'Keep-Alive': '300',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
    }
    req = urllib2.Request(url, txdata, txheaders)
    u = urllib2.urlopen(req)
    try:
        data = u.read()
    finally:
        # Release the connection even if reading the body fails.
        u.close()
    # Non-greedy match from "http://" up to the next double quote.
    # To match only links that start with the requested url, build the
    # pattern from the variable: re.compile(re.escape(url) + '.*?"')
    p = re.compile('http://.*?"')
    m = p.findall(data)
    print(m)
# Script entry point: hand the command-line arguments (without the program
# name) to main() only when this file is executed directly.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.python.org/pipermail/tutor/attachments/20090408/f0f22376/attachment.htm>
More information about the Tutor
mailing list