[Pypi-checkins] r767 - trunk/pypi/tools

martin.von.loewis python-checkins at python.org
Fri Jul 9 01:24:58 CEST 2010


Author: martin.von.loewis
Date: Fri Jul  9 01:24:57 2010
New Revision: 767

Added:
   trunk/pypi/tools/integratestats   (contents, props changed)
Modified:
   trunk/pypi/tools/apache_stats.py
Log:
Add integratestats tool.


Modified: trunk/pypi/tools/apache_stats.py
==============================================================================
--- trunk/pypi/tools/apache_stats.py	(original)
+++ trunk/pypi/tools/apache_stats.py	Fri Jul  9 01:24:57 2010
@@ -37,13 +37,6 @@
         current line. if the callable returns True, 
         the line is not included
         """
-        if isinstance(fileobj, str):
-            fileobj = self._get_file_obj(fileobj, 'w', compression)
-            file_created = True
-        else:
-            file_created = False
-
-        writer = csv.writer(fileobj)
         downloads = {}
         for log in self._get_logs(logfile, files_url):
             if filter is not None:
@@ -58,6 +51,16 @@
                 downloads[key] += count
             else:
                 downloads[key] = count
+        self._write_stats(fileobj, downloads)
+
+    def _write_stats(self, fileobj, downloads, compression=None):
+        if isinstance(fileobj, str):
+            fileobj = self._get_file_obj(fileobj, 'w', compression)
+            file_created = True
+        else:
+            file_created = False
+
+        writer = csv.writer(fileobj)
         filenames = downloads.keys()
         filenames.sort()
         for key in filenames:
@@ -107,9 +110,17 @@
             yield {'packagename': line[0],
                    'filename': line[1],
                    'useragent': line[2],
-                   'count': line[3]}
+                   'count': int(line[3])}
         #reader.close()
 
+    def read_stats_dict(self, stats_file):
+        res = {}
+        for r in self.read_stats(stats_file):
+            key = (r['packagename'], r['filename'], r['useragent'])
+            value = r['count']
+            res[key] = value
+        return res
+
     def build_local_stats(self, year, month, day, logfile, directory=None):
         """builds local stats with default values"""
         filename = '%d-%.2d-%.2d.bz2' % (year, month, day)
@@ -119,6 +130,27 @@
         self.build_daily_stats(year, month, day, logfile, filename, 
                                compression='bz2')
 
+    def integrate_stats(self, targetdir, year, month, day, fd):
+        new = self.read_stats_dict(fd)
+        oldpath = "%s/days/%s-%.2s-%.2s.bz2" % (targetdir, year, month, day)
+        if os.path.exists(oldpath):
+            old = self.read_stats_dict(oldpath)
+            for k, v in new.items():
+                old[k] = old.get(k, 0) + v
+        else:
+            old = new
+        self._write_stats(oldpath, old, 'bz2')
+        monthpath = "%s/months/%s-%.2s.bz2" % (targetdir, year, month)
+        if os.path.exists(monthpath):
+            old = self.read_stats_dict(monthpath)
+            for k, v in new.items():
+                old[k] = old.get(k, 0) + v
+        else:
+            old = new
+        self._write_stats(monthpath, old, 'bz2')
+        return new
+        
+
 class ApacheLocalStats(LocalStats):
     """concrete class that uses the ApacheLogReader"""
     def _get_logs(self, logfile, files_url):

Added: trunk/pypi/tools/integratestats
==============================================================================
--- (empty file)
+++ trunk/pypi/tools/integratestats	Fri Jul  9 01:24:57 2010
@@ -0,0 +1,65 @@
+#!/usr/bin/python
+import sys, os, socket, psycopg2, urllib, re, bz2, cStringIO, ConfigParser
+sys.path.append(os.path.dirname(__file__)+"/..")
+import apache_stats
+
+statsdir = '/data/pypi/stats/'
+
+def integrate(config, data):
+    # Setup database connection
+    c = ConfigParser.ConfigParser({'user':'', 'password':''})
+    c.read(config)
+    dbname = c.get('database', 'name')
+    dbuser = c.get('database', 'user')
+    dbpass = c.get('database', 'password')
+    dbconn = psycopg2.connect(database=dbname, user=dbuser, password=dbpass)
+    cursor = dbconn.cursor()
+    for (package, filename, browser), count in data.items():
+        cursor.execute('update release_files set downloads=downloads+%s where filename=%s',
+                       (count, filename))
+    dbconn.commit()
+    dbconn.close()
+
+def integrate_remote(config, host, dbupdate=True):
+    index = urllib.urlopen('http://%s.pypi.python.org/local-stats/days' % host).read()
+    files = set(re.findall('href=.(20..-..-..).bz2', index))
+    try:
+        integrated = open('/data/pypi/stats/integrated/'+host).readlines()
+        integrated = set([x.strip() for x in integrated])
+    except IOError:
+        integrated = set()
+    missing = files-integrated
+    stats = apache_stats.LocalStats()
+    for m in missing:
+        data = urllib.urlopen('http://%s.pypi.python.org/local-stats/days/%s.bz2' % (host, m)).read()
+        data = bz2.decompress(data)
+        data = cStringIO.StringIO(data)
+        year, month, day = m.split('-')
+        # index integration
+        delta = stats.integrate_stats(statsdir, year, month, day, data)
+        if dbupdate:
+            # database integration
+            integrate(config, delta)
+        integrated.add(m)
+        open('/data/pypi/stats/integrated/'+host, 'w').write('\n'.join(sorted(integrated)))
+
+def main():
+    lasts = socket.gethostbyname_ex('last.pypi.python.org')
+    # look for name X.pypi.python.org
+    lasts = [lasts[0]] + lasts[1]
+    for last in lasts:
+        if last[1:] == '.pypi.python.org':
+            break
+    else:
+        raise ValueError, "Could not properly resolve last mirror name"
+    last = last.split('.')[0]
+    integrate_remote(None, 'a', False)
+    host = 'b'
+    while True:
+        integrate_remote(sys.argv[1], host)
+        host = chr(ord(host)+1)
+        if host == last:
+            break
+
+main()
+        


More information about the Pypi-checkins mailing list