optomizations
Rodrick Brown
rodrick.brown at gmail.com
Mon Apr 22 21:19:23 EDT 2013
I would like some feedback on possible solutions to make this script run
faster.
The system is pegged at 100% CPU and it takes a long time to complete.
#!/usr/bin/env python
import gzip
import re
import os
import sys
from datetime import datetime
import argparse
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-f', dest='inputfile', type=str, help='data file
to parse')
parser.add_argument('-o', dest='outputdir', type=str,
default=os.getcwd(), help='Output directory')
args = parser.parse_args()
if len(sys.argv[1:]) < 1:
parser.print_usage()
sys.exit(-1)
print(args)
if args.inputfile and os.path.exists(args.inputfile):
try:
with gzip.open(args.inputfile) as datafile:
for line in datafile:
line = line.replace('mediacdn.xxx.com', 'media.xxx.com')
line = line.replace('staticcdn.xxx.co.uk', '
static.xxx.co.uk')
line = line.replace('cdn.xxx', 'www.xxx')
line = line.replace('cdn.xxx', 'www.xxx')
line = line.replace('cdn.xx', 'www.xx')
siteurl = line.split()[6].split('/')[2]
line = re.sub(r'\bhttps?://%s\b' % siteurl, "", line, 1)
(day, month, year, hour, minute, second) =
(line.split()[3]).replace('[','').replace(':','/').split('/')
datelog = '{} {} {}'.format(month, day, year)
dateobj = datetime.strptime(datelog, '%b %d %Y')
outfile = '{}{}{}_combined.log'.format(dateobj.year,
dateobj.month, dateobj.day)
outdir = (args.outputdir + os.sep + siteurl)
if not os.path.exists(outdir):
os.makedirs(outdir)
with open(outdir + os.sep + outfile, 'w+') as outf:
outf.write(line)
except IOError, err:
sys.stderr.write("Error unable to read or extract inputfile: {}
{}\n".format(args.inputfile, err))
sys.exit(-1)
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.python.org/pipermail/python-list/attachments/20130422/0726db1d/attachment.html>
More information about the Python-list
mailing list