[Pypi-checkins] r855 - in trunk/appengine: . templates

martin.von.loewis python-checkins at python.org
Mon Aug 16 10:25:00 CEST 2010


Author: martin.von.loewis
Date: Mon Aug 16 10:25:00 2010
New Revision: 855

Added:
   trunk/appengine/templates/stats.html   (contents, props changed)
Modified:
   trunk/appengine/app.yaml
   trunk/appengine/cron.yaml
   trunk/appengine/fetch.py
   trunk/appengine/handlers.py
   trunk/appengine/mirror.py
   trunk/appengine/model.py
   trunk/appengine/stats.py
Log:
Integrate download stats.


Modified: trunk/appengine/app.yaml
==============================================================================
--- trunk/appengine/app.yaml	(original)
+++ trunk/appengine/app.yaml	Mon Aug 16 10:25:00 2010
@@ -12,6 +12,10 @@
   script: mirror.py
   login: admin
 
+- url: /daily
+  script: mirror.py
+  login: admin
+
 - url: .*
   script: mirror.py
 

Modified: trunk/appengine/cron.yaml
==============================================================================
--- trunk/appengine/cron.yaml	(original)
+++ trunk/appengine/cron.yaml	Mon Aug 16 10:25:00 2010
@@ -1,4 +1,7 @@
 cron:
-- description: daily summary job
+- description: check for updates
   url: /cron
   schedule: every 5 minutes
+- description: daily summary job
+  url: /daily
+  schedule: every 24 hours

Modified: trunk/appengine/fetch.py
==============================================================================
--- trunk/appengine/fetch.py	(original)
+++ trunk/appengine/fetch.py	Mon Aug 16 10:25:00 2010
@@ -217,11 +217,18 @@
         todo.append(('package', ''))
     todo.append(('last_modified', now))
 
+############## Statistics ##########################
+
+def integrate_stats(m):
+    """Run the download-statistics integration.
+
+    Registered in `actions` under the 'integrate_stats' key.  The argument
+    `m` is accepted but not used by this function.
+    """
+    # stats is imported at call time rather than at module level.
+    import stats
+    stats.integrate()
+
 ############## Queuing #############################
 
 actions = {'package':package,
            'file':copy_file,
            'last_modified': last_modified,
+           'integrate_stats': integrate_stats,
            }
 
 def queue_step():

Modified: trunk/appengine/handlers.py
==============================================================================
--- trunk/appengine/handlers.py	(original)
+++ trunk/appengine/handlers.py	Mon Aug 16 10:25:00 2010
@@ -135,12 +135,26 @@
 
 class Stats(webapp.RequestHandler):
+    # Serves download statistics: an index page when the path is empty,
+    # otherwise the stored data blob for a path of the form "<day>.bz2".
     def get(self, path):
-        self.response.headers['content-type'] = 'text/plain'
-        self.response.out.write('not implemented yet')
+        # Trailing slashes are ignored.
+        path = path.rstrip('/')
+        if not path:
+            # Index: the day keys of the (at most 1000) fetched Stats
+            # entities that have data, sorted, rendered with stats.html.
+            days = [d.day for d in model.Stats.all().fetch(1000) if d.data]
+            days.sort()
+            self.response.out.write(template.render(tpl_path('stats.html'), {'days':days}))
+            return
+        # Only names ending in '.bz2' are served.
+        if not path.endswith('.bz2'):
+            return self.error(404)
+        # Drop the '.bz2' suffix to obtain the day key.
+        path = path[:-4]
+        s = model.Stats.all().filter("day = ", path).fetch(1)
+        # 404 for an unknown day key or an entity without data.
+        if not s or not s[0].data:
+            return self.error(404)
+        self.response.headers['content-type'] = 'application/octet-stream'
+        self.response.out.write(s[0].data)
 
 class Step(webapp.RequestHandler):
+    # Writes the result of fetch.step() as plain text, or the result of
+    # stats.integrate() when the path is '/integrate'.
-    def get(self):
+    def get(self, path):
         self.response.headers['content-type'] = 'text/plain'
+        # '/integrate' runs the statistics integration instead of a fetch step.
+        if path == '/integrate':
+            return self.response.out.write(stats.integrate())
         self.response.out.write(fetch.step())
+    # POST requests are handled exactly like GET requests.
     post = get
 
@@ -148,3 +162,8 @@
     def get(self):
         self.response.headers['content-type'] = 'text/plain'
         self.response.out.write(fetch.cron())        
+
+class Daily(webapp.RequestHandler):
+    def get(self):
+        self.response.headers['content-type'] = 'text/plain'
+        self.response.out.write(stats.integrate())

Modified: trunk/appengine/mirror.py
==============================================================================
--- trunk/appengine/mirror.py	(original)
+++ trunk/appengine/mirror.py	Mon Aug 16 10:25:00 2010
@@ -10,7 +10,8 @@
      ('/serversig/(.*)', Serversig),
      ('/local-stats/days/(.*)', Stats),
      ('/cron', Cron),
-     ('/step', Step),
+     ('/daily', Daily),
+     ('/step(/.*)?', Step),
      ('/mkupload/(.*)', MkUpload),
      #('/mkupload2', MkUpload2),
      ('/upload', Upload),

Modified: trunk/appengine/model.py
==============================================================================
--- trunk/appengine/model.py	(original)
+++ trunk/appengine/model.py	Mon Aug 16 10:25:00 2010
@@ -64,3 +64,8 @@
     name = db.StringProperty()
     project = db.StringProperty()
     agent = db.StringProperty()
+
+class Stats(db.Model):
+    # Datastore entity holding the download statistics stored under one
+    # day key.
+    # Day key as a string; stats.py stores either a date string or that
+    # string with a '.part<N>' suffix.
+    day = db.StringProperty()
+    # Finished statistics blob; the /local-stats/days handler serves it as-is.
+    data = db.BlobProperty()
+    # Intermediate blob written by stats.integrate_one_day between batches.
+    partial = db.BlobProperty()

Modified: trunk/appengine/stats.py
==============================================================================
--- trunk/appengine/stats.py	(original)
+++ trunk/appengine/stats.py	Mon Aug 16 10:25:00 2010
@@ -1,5 +1,104 @@
-import datetime
+# stdlib
+import datetime, bz2, csv, re, cStringIO, cPickle
+from collections import defaultdict
+# GAE
+from google.appengine.api.labs import taskqueue
+# PyPI
+import model
+
+# list of recognized user agents
+SETUPTOOLS_UA = (re.compile((r'^.* setuptools/(?P<version>[0-9]\..*)$')), 'setuptools/%s')
+URLLIB_UA = (re.compile(r'^Python-urllib/(?P<version>[23]\.[0-9])$'), 'Python-urllib/%s')
+SAFARI_UA = (re.compile(r'^Mozilla.* .* Version/(?P<version>.*) Safari/.*$'), 'Safari/%s')
+GOOGLEBOT = (re.compile(r'Googlebot-Mobile/(?P<version>.*);'), 'Googlebot-Mobile/%s')
+MSNBOT = (re.compile(r'^msnbot/(?P<version>.*) '), 'msnbot/%s')
+FIREFOX_UA = (re.compile(r'^Mozilla.*? Firefox/(?P<version>[23])\..*$'), 'Firefox/%s')
+PLAIN_MOZILLA = (re.compile(r'^Mozilla/(?P<version>.*?) '), 'Mozilla/%s')
+
+def get_simplified_ua(user_agent):
+    """returns a simplified version of the user agent"""
+    while user_agent.endswith(',gzip(gfe)'):
+        user_agent = user_agent[:-len(',gzip(gfe)')]
+    for expr, repl in (URLLIB_UA, SETUPTOOLS_UA, SAFARI_UA, GOOGLEBOT, 
+                       MSNBOT, FIREFOX_UA, PLAIN_MOZILLA):
+        res = expr.search(user_agent)
+        if res is not None:
+            return repl % res.group('version')
+        return user_agent
 
 def today():
+    """Return the current UTC date as a 'year-month-day' string.
+
+    Month and day are zero-padded to two digits.
+    """
     now = datetime.datetime.utcnow()
     return "%s-%.2d-%.2d" % (now.year, now.month, now.day)
+
+def mkbz2(entries):
+    downloads = entries.items()
+    downloads.sort()
+    output = cStringIO.StringIO()
+    writer = csv.writer(output)
+    for (p,n,a),c in downloads:
+        writer.writerow((p,n,a,c))
+    data = bz2.compress(output.getvalue())
+    return data
+
+def unpack(data):
+    res = {}
+    reader = csv.reader(cStringIO.StringIO(bz2.decompress(data)))
+    for project, agent, name, count in reader:
+        res[project, agent, name] = int(count)
+    return res
+
+def integrate_one_day(day):
+    entries = defaultdict(lambda:0)
+    old = model.Stats.all().filter('day = ', day).fetch(1)
+    if old:
+        old = old[0]
+        if len(old.partial) > 500000:
+            # argh. need to make multiple files to fit into Google blob limits
+            partno = 1
+            while model.Stats.all().filter('day = ', '%s.part%d' % (day, partno)).fetch(1):
+                partno += 1
+            old.day = '%s.part%d' % (day, partno)
+            old.data = old.partial
+            old.partial = None
+            old.put()
+            # enough for now
+            return
+        entries.update(unpack(old.partial))
+    deletable = []
+    todo = model.Download.all().filter('day = ', day).fetch(100)
+    for download in todo:
+        agent = get_simplified_ua(download.agent)
+        key = download.project,download.name,agent
+        entries[key] += 1
+        deletable.append(download)
+    if len(todo) == 100:
+        # Partial results. Save them
+        data = mkbz2(entries)
+        if old:
+            old.partial = data
+        else:
+            old = model.Stats(day=day, partial=data)
+        old.put()
+        for d in deletable:
+            d.delete()
+        return
+    # complete data
+    data = mkbz2(entries)
+    if old:
+        old.data = data
+        old.partial = None
+    else:
+        old = model.Stats(day=day, data=data)
+    old.put()
+    for d in deletable:
+        d.delete()
+
+def integrate():
+    'Integrate all downloads except for those from today'
+    # find a day that isn't integrated yet
+    d = model.Download.all().filter('day != ', today()).fetch(1)
+    if not d:
+        return "Done"
+    integrate_one_day(d[0].day)
+    taskqueue.add(url='/step/integrate')
+    return "queued next integration"

Added: trunk/appengine/templates/stats.html
==============================================================================
--- (empty file)
+++ trunk/appengine/templates/stats.html	Mon Aug 16 10:25:00 2010
@@ -0,0 +1,10 @@
+<html>
+<head>
+<title>Index of /local-stats/days</title>
+</head>
+<body>
+{% for day in days %}
+<a href="/local-stats/days/{{day}}.bz2">{{day}}.bz2</a><br/>
+{% endfor %}
+</body>
+</html>
\ No newline at end of file


More information about the Pypi-checkins mailing list