[Spambayes] table.py patch to produce averages at end of line.

Anthony Baxter anthony@interlink.com.au
Mon Oct 28 08:12:38 2002


The following simple patch adds a final column to table.py's output
containing the average of each measure across all files. This is useful
if you're doing tests with very small amounts of data and want to run the
test multiple times with different seeds to check that your results
are actually meaningful. For instance (ignore the actual results; they
won't make sense outside the context of the testing I'm doing):

filename:  002a_100        002c_100       
                   002b_100        002d_100
ham:spam:  400:1000        400:1000       
                   400:1000        400:1000
fp total:      127     195     104     245         167
fp %:        31.75   48.75   26.00   61.25       41.94
fn total:        0       0       0       0           0
fn %:         0.00    0.00    0.00    0.00        0.00
unsure t:      282     162     287      86         204
unsure %:    20.14   11.57   20.50    6.14       14.59
real cost:$1326.40$1982.40$1097.40$2467.20    $1718.35
best cost: $231.00 $244.20 $228.00 $249.80     $238.25
h mean:      81.07   78.72   77.23   79.29       79.08
h sdev:      20.09   30.80   24.59   34.25       27.43
s mean:      99.94   99.94   99.93   99.99       99.95
s sdev:       0.71    0.90    1.01    0.09        0.68
mean diff:   18.87   21.22   22.70   20.70       20.87
k:            0.91    0.67    0.89    0.60        0.77

Not sure if this is generally useful enough to anyone else for it to
be checked in - any opinions?

Anthony

Index: table.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/table.py,v
retrieving revision 1.4
diff -u -r1.4 table.py
--- table.py	26 Oct 2002 15:30:23 -0000	1.4
+++ table.py	28 Oct 2002 08:06:07 -0000
@@ -122,6 +122,9 @@
 meand = "mean diff:"
 kval  = "k:        "
 
+tfptot = tfpper = tfntot = tfnper = tuntot = tunper = trcost = tbcost = \
+thmean = thsdev = tsmean = tssdev = tmeand = tkval =  0
+
 for filename in sys.argv[1:]:
     filename = windowsfy(filename)
     (htest, stest, fp, fn, un, fpp, fnp, unp, cost, bestcost,
@@ -147,20 +150,51 @@
         rat2  = rat2[0:(len(ratio) + 8)]
         ratio += " %7s" % ("%d:%d" % (htest, stest))
     fptot += "%8d"   % fp
+    tfptot += fp 
     fpper += "%8.2f" % fpp
+    tfpper += fpp
     fntot += "%8d"   % fn
+    tfntot += fn
     fnper += "%8.2f" % fnp
+    tfnper += fnp
     untot += "%8d"   % un
+    tuntot += un
     unper += "%8.2f" % unp
+    tunper += unp
     rcost += "%8s"   % ("$%.2f" % cost)
+    trcost += cost
     bcost += "%8s"   % ("$%.2f" % bestcost)
+    tbcost += bestcost
     hmean += "%8.2f" % hamdevall[0]
+    thmean += hamdevall[0]
     hsdev += "%8.2f" % hamdevall[1]
+    thsdev += hamdevall[1]
     smean += "%8.2f" % spamdevall[0]
+    tsmean += spamdevall[0]
     ssdev += "%8.2f" % spamdevall[1]
+    tssdev += spamdevall[1]
     meand += "%8.2f" % (spamdevall[0] - hamdevall[0])
+    tmeand += (spamdevall[0] - hamdevall[0])
     k = (spamdevall[0] - hamdevall[0]) / (spamdevall[1] + hamdevall[1])
     kval  += "%8.2f" % k
+    tkval  += k
+
+nfiles = len(sys.argv[1:])
+if nfiles:
+    fptot += "%12d"   % (tfptot/nfiles)
+    fpper += "%12.2f" % (tfpper/nfiles)
+    fntot += "%12d"   % (tfntot/nfiles)
+    fnper += "%12.2f" % (tfnper/nfiles)
+    untot += "%12d"   % (tuntot/nfiles)
+    unper += "%12.2f" % (tunper/nfiles)
+    rcost += "%12s"   % ("$%.2f" % (trcost/nfiles))
+    bcost += "%12s"   % ("$%.2f" % (tbcost/nfiles))
+    hmean += "%12.2f" % (thmean/nfiles)
+    hsdev += "%12.2f" % (thsdev/nfiles)
+    smean += "%12.2f" % (tsmean/nfiles)
+    ssdev += "%12.2f" % (tssdev/nfiles)
+    meand += "%12.2f" % (tmeand/nfiles)
+    kval  += "%12.2f" % (tkval/nfiles)
 
 print fname
 if len(fnam2.strip()) > 0: