[Pypi-checkins] r855 - in trunk/appengine: . templates
martin.von.loewis
python-checkins at python.org
Mon Aug 16 10:25:00 CEST 2010
Author: martin.von.loewis
Date: Mon Aug 16 10:25:00 2010
New Revision: 855
Added:
trunk/appengine/templates/stats.html (contents, props changed)
Modified:
trunk/appengine/app.yaml
trunk/appengine/cron.yaml
trunk/appengine/fetch.py
trunk/appengine/handlers.py
trunk/appengine/mirror.py
trunk/appengine/model.py
trunk/appengine/stats.py
Log:
Integrate download stats.
Modified: trunk/appengine/app.yaml
==============================================================================
--- trunk/appengine/app.yaml (original)
+++ trunk/appengine/app.yaml Mon Aug 16 10:25:00 2010
@@ -12,6 +12,10 @@
script: mirror.py
login: admin
+- url: /daily
+ script: mirror.py
+ login: admin
+
- url: .*
script: mirror.py
Modified: trunk/appengine/cron.yaml
==============================================================================
--- trunk/appengine/cron.yaml (original)
+++ trunk/appengine/cron.yaml Mon Aug 16 10:25:00 2010
@@ -1,4 +1,7 @@
cron:
-- description: daily summary job
+- description: check for updates
url: /cron
schedule: every 5 minutes
+- description: daily summary job
+ url: /daily
+ schedule: every 24 hours
Modified: trunk/appengine/fetch.py
==============================================================================
--- trunk/appengine/fetch.py (original)
+++ trunk/appengine/fetch.py Mon Aug 16 10:25:00 2010
@@ -217,11 +217,18 @@
todo.append(('package', ''))
todo.append(('last_modified', now))
+############## Statistics ##########################
+
+def integrate_stats(m):
+ import stats
+ stats.integrate()
+
############## Queuing #############################
actions = {'package':package,
'file':copy_file,
'last_modified': last_modified,
+ 'integrate_stats': integrate_stats,
}
def queue_step():
Modified: trunk/appengine/handlers.py
==============================================================================
--- trunk/appengine/handlers.py (original)
+++ trunk/appengine/handlers.py Mon Aug 16 10:25:00 2010
@@ -135,12 +135,26 @@
class Stats(webapp.RequestHandler):
def get(self, path):
- self.response.headers['content-type'] = 'text/plain'
- self.response.out.write('not implemented yet')
+ path = path.rstrip('/')
+ if not path:
+ days = [d.day for d in model.Stats.all().fetch(1000) if d.data]
+ days.sort()
+ self.response.out.write(template.render(tpl_path('stats.html'), {'days':days}))
+ return
+ if not path.endswith('.bz2'):
+ return self.error(404)
+ path = path[:-4]
+ s = model.Stats.all().filter("day = ", path).fetch(1)
+ if not s or not s[0].data:
+ return self.error(404)
+ self.response.headers['content-type'] = 'application/octet-stream'
+ self.response.out.write(s[0].data)
class Step(webapp.RequestHandler):
- def get(self):
+ def get(self, path):
self.response.headers['content-type'] = 'text/plain'
+ if path == '/integrate':
+ return self.response.out.write(stats.integrate())
self.response.out.write(fetch.step())
post = get
@@ -148,3 +162,8 @@
def get(self):
self.response.headers['content-type'] = 'text/plain'
self.response.out.write(fetch.cron())
+
+class Daily(webapp.RequestHandler):
+ def get(self):
+ self.response.headers['content-type'] = 'text/plain'
+ self.response.out.write(stats.integrate())
Modified: trunk/appengine/mirror.py
==============================================================================
--- trunk/appengine/mirror.py (original)
+++ trunk/appengine/mirror.py Mon Aug 16 10:25:00 2010
@@ -10,7 +10,8 @@
('/serversig/(.*)', Serversig),
('/local-stats/days/(.*)', Stats),
('/cron', Cron),
- ('/step', Step),
+ ('/daily', Daily),
+ ('/step(/.*)?', Step),
('/mkupload/(.*)', MkUpload),
#('/mkupload2', MkUpload2),
('/upload', Upload),
Modified: trunk/appengine/model.py
==============================================================================
--- trunk/appengine/model.py (original)
+++ trunk/appengine/model.py Mon Aug 16 10:25:00 2010
@@ -64,3 +64,8 @@
name = db.StringProperty()
project = db.StringProperty()
agent = db.StringProperty()
+
+class Stats(db.Model):
+ day = db.StringProperty()
+ data = db.BlobProperty()
+ partial = db.BlobProperty()
Modified: trunk/appengine/stats.py
==============================================================================
--- trunk/appengine/stats.py (original)
+++ trunk/appengine/stats.py Mon Aug 16 10:25:00 2010
@@ -1,5 +1,104 @@
-import datetime
+# stdlib
+import datetime, bz2, csv, re, cStringIO, cPickle
+from collections import defaultdict
+# GAE
+from google.appengine.api.labs import taskqueue
+# PyPI
+import model
+
+# list of recognized user agents
+SETUPTOOLS_UA = (re.compile((r'^.* setuptools/(?P<version>[0-9]\..*)$')), 'setuptools/%s')
+URLLIB_UA = (re.compile(r'^Python-urllib/(?P<version>[23]\.[0-9])$'), 'Python-urllib/%s')
+SAFARI_UA = (re.compile(r'^Mozilla.* .* Version/(?P<version>.*) Safari/.*$'), 'Safari/%s')
+GOOGLEBOT = (re.compile(r'Googlebot-Mobile/(?P<version>.*);'), 'Googlebot-Mobile/%s')
+MSNBOT = (re.compile(r'^msnbot/(?P<version>.*) '), 'msnbot/%s')
+FIREFOX_UA = (re.compile(r'^Mozilla.*? Firefox/(?P<version>[23])\..*$'), 'Firefox/%s')
+PLAIN_MOZILLA = (re.compile(r'^Mozilla/(?P<version>.*?) '), 'Mozilla/%s')
+
+def get_simplified_ua(user_agent):
+ """returns a simplified version of the user agent"""
+ while user_agent.endswith(',gzip(gfe)'):
+ user_agent = user_agent[:-len(',gzip(gfe)')]
+ for expr, repl in (URLLIB_UA, SETUPTOOLS_UA, SAFARI_UA, GOOGLEBOT,
+ MSNBOT, FIREFOX_UA, PLAIN_MOZILLA):
+ res = expr.search(user_agent)
+ if res is not None:
+ return repl % res.group('version')
+ return user_agent
def today():
now = datetime.datetime.utcnow()
return "%s-%.2d-%.2d" % (now.year, now.month, now.day)
+
+def mkbz2(entries):
+ downloads = entries.items()
+ downloads.sort()
+ output = cStringIO.StringIO()
+ writer = csv.writer(output)
+ for (p,n,a),c in downloads:
+ writer.writerow((p,n,a,c))
+ data = bz2.compress(output.getvalue())
+ return data
+
+def unpack(data):
+ res = {}
+ reader = csv.reader(cStringIO.StringIO(bz2.decompress(data)))
+ for project, agent, name, count in reader:
+ res[project, agent, name] = int(count)
+ return res
+
+def integrate_one_day(day):
+ entries = defaultdict(lambda:0)
+ old = model.Stats.all().filter('day = ', day).fetch(1)
+ if old:
+ old = old[0]
+ if len(old.partial) > 500000:
+ # argh. need to make multiple files to fit into Google blob limits
+ partno = 1
+ while model.Stats.all().filter('day = ', '%s.part%d' % (day, partno)).fetch(1):
+ partno += 1
+ old.day = '%s.part%d' % (day, partno)
+ old.data = old.partial
+ old.partial = None
+ old.put()
+ # enough for now
+ return
+ entries.update(unpack(old.partial))
+ deletable = []
+ todo = model.Download.all().filter('day = ', day).fetch(100)
+ for download in todo:
+ agent = get_simplified_ua(download.agent)
+ key = download.project,download.name,agent
+ entries[key] += 1
+ deletable.append(download)
+ if len(todo) == 100:
+ # Partial results. Save them
+ data = mkbz2(entries)
+ if old:
+ old.partial = data
+ else:
+ old = model.Stats(day=day, partial=data)
+ old.put()
+ for d in deletable:
+ d.delete()
+ return
+ # complete data
+ data = mkbz2(entries)
+ if old:
+ old.data = data
+ old.partial = None
+ else:
+ old = model.Stats(day=day, data=data)
+ old.put()
+ for d in deletable:
+ d.delete()
+
+def integrate():
+ 'Integrate all downloads except for those from today'
+ # find a day that isn't integrated yet
+ d = model.Download.all().filter('day != ', today()).fetch(1)
+ if not d:
+ return "Done"
+ integrate_one_day(d[0].day)
+ taskqueue.add(url='/step/integrate')
+ return "queued next integration"
Added: trunk/appengine/templates/stats.html
==============================================================================
--- (empty file)
+++ trunk/appengine/templates/stats.html Mon Aug 16 10:25:00 2010
@@ -0,0 +1,10 @@
+<html>
+<head>
+<title>Index of /local-stats/days</title>
+</head>
+<body>
+{% for day in days %}
+<a href="/local-stats/days/{{day}}.bz2">{{day}}.bz2</a><br/>
+{% endfor %}
+</body>
+</html>
\ No newline at end of file
More information about the Pypi-checkins mailing list