Hi all,
I have three questions related to standard deviations and variances in
scipy.
First, can someone explain the behaviour of array.std() without any
arguments?
>>> a = arange(30).reshape(3,10)
>>> a
array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
[10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
[20, 21, 22, 23, 24, 25, 26, 27, 28, 29]])
>>> a.std()
array([ 2.99856287, 2.85723522, 2.74647109, 2.67007684, 2.63104804,
2.63104804, 2.67007684, 2.74647109, 2.85723522, 2.99856287])
I don't understand what these numbers represent. The correct standard
deviations of the column vectors are given by:
>>> a.std(0)
array([ 10., 10., 10., 10., 10., 10., 10., 10., 10., 10.])
and the standard deviations of the row vectors are:
>>> a.std(1)
array([ 3.02765035, 3.02765035, 3.02765035])
I would have expected a.std() to give the same output as
>>> a.ravel().std()
8.8034084308295046
which is what a.mean() does.
Second, I'd like to point out that some of the functions in Lib/stats/
have a different convention to scipy core about whether operations are
performed row-wise or column-wise, and whether anyone would object to my
changing the stats functions to operate column-wise. At the moment we
get this:
>>> average(a)
array([ 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.])
which is column-wise, but
>>> std(a)
array([ 3.02765035, 3.02765035, 3.02765035])
which is row-wise. I presume the default behaviour of std() and friends
is just a historical relic. If so we'd be wise to get this straight
well before a 1.0 release.
Third, I'd like to request that we add an array.var() method to scipy
core to compute an array's sample variance.
At the moment it seems that there is no way to compute the sample
variance of an array of numbers without installing the full scipy.
Users needing to do this will either have to roll their own function in
Python, like this:
def var(A):
m = len(A)
return average((a-means)**2) * (m/(m-1.))
or square the output of std(). Both are less efficient than a native
array.var() would be, requiring extra memory copying and, in the second
case, squaring the result of a square root operation, which also
introduces numerical imprecision.
The extra code required is minimal. There's an example patch below,
which works fine except that it inherits the weirdness of std().
Comments? :)
-- Ed
---------------------
Index: base/src/multiarraymodule.c
===================================================================
--- base/src/multiarraymodule.c (revision 1534)
+++ base/src/multiarraymodule.c (working copy)
@@ -460,7 +460,61 @@
return obj1;
}
+static PyObject *
+PyArray_Var(PyArrayObject *self, int axis, int rtype)
+{
+ PyObject *obj1=NULL, *obj2=NULL, *new=NULL;
+ PyObject *ret=NULL, *newshape=NULL;
+ int i, n;
+ intp val;
+ if ((new = _check_axis(self, &axis, 0))==NULL) return NULL;
+
+ /* Compute and reshape mean */
+ obj1 = PyArray_EnsureArray(PyArray_Mean((PyAO *)new, axis, rtype));
+ if (obj1 == NULL) {Py_DECREF(new); return NULL;}
+ n = PyArray_NDIM(new);
+ newshape = PyTuple_New(n);
+ if (newshape == NULL) {Py_DECREF(obj1); Py_DECREF(new); return
NULL;}
+ for (i=0; i<n; i++) {
+ if (i==axis) val = 1;
+ else val = PyArray_DIM(new,i);
+ PyTuple_SET_ITEM(newshape, i, PyInt_FromLong((long)val));
+ }
+ obj2 = PyArray_Reshape((PyAO *)obj1, newshape);
+ Py_DECREF(obj1);
+ Py_DECREF(newshape);
+ if (obj2 == NULL) {Py_DECREF(new); return NULL;}
+
+ /* Compute x = x - mx */
+ obj1 = PyNumber_Subtract((PyObject *)self, obj2);
+ Py_DECREF(obj2);
+ if (obj1 == NULL) {Py_DECREF(new); return NULL;}
+
+ /* Compute x * x */
+ obj2 = PyArray_EnsureArray(PyNumber_Multiply(obj1, obj1));
+ Py_DECREF(obj1);
+ if (obj2 == NULL) {Py_DECREF(new); return NULL;}
+
+ /* Compute add.reduce(x*x,axis) */
+ obj1 = PyArray_GenericReduceFunction((PyAO *)obj2, n_ops.add,
+ axis, rtype);
+ Py_DECREF(obj2);
+ if (obj1 == NULL) {Py_DECREF(new); return NULL;}
+
+ n = PyArray_DIM(new,axis)-1;
+ Py_DECREF(new);
+ if (n<=0) n=1;
+ obj2 = PyFloat_FromDouble(1.0/((double )n));
+ if (obj2 == NULL) {Py_DECREF(obj1); return NULL;}
+ ret = PyArray_EnsureArray(PyNumber_Multiply(obj1, obj2));
+ Py_DECREF(obj1);
+ Py_DECREF(obj2);
+
+ return ret;
+}
+
+
static PyObject *
PyArray_Sum(PyArrayObject *self, int axis, int rtype)
{
Index: base/src/arraymethods.c
===================================================================
--- base/src/arraymethods.c (revision 1534)
+++ base/src/arraymethods.c (working copy)
@@ -1225,6 +1225,23 @@
return PyArray_Std(self, axis, rtype.type_num);
}
+static char doc_var[] = "a.var(axis=None, rtype=None)";
+
+static PyObject *
+array_var(PyArrayObject *self, PyObject *args, PyObject *kwds)
+{
+ int axis=MAX_DIMS;
+ PyArray_Typecode rtype = {PyArray_NOTYPE, 0, 0};
+ static char *kwlist[] = {"axis", "rtype", NULL};
+
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&", kwlist,
+ PyArray_AxisConverter,
+ &axis, PyArray_TypecodeConverter,
+ &rtype)) return NULL;
+
+ return PyArray_Var(self, axis, rtype.type_num);
+}
+
static char doc_compress[] = "a.compress(condition=, axis=None)";
static PyObject *
@@ -1501,6 +1518,8 @@
METH_VARARGS, doc_nonzero},
{"std", (PyCFunction)array_stddev,
METH_VARARGS|METH_KEYWORDS, doc_stddev},
+ {"var", (PyCFunction)array_var,
+ METH_VARARGS|METH_KEYWORDS, doc_var},
{"sum", (PyCFunction)array_sum,
METH_VARARGS|METH_KEYWORDS, doc_sum},
{"cumsum", (PyCFunction)array_cumsum,
Index: base/src/scalartypes.inc.src
===================================================================
--- base/src/scalartypes.inc.src (revision 1534)
+++ base/src/scalartypes.inc.src (working copy)
@@ -972,7 +972,7 @@
/**begin repeat
-#name=getfield, take, put, putmask, repeat, tofile, mean, trace,
diagonal, clip, std, sum, cumsum, prod, cumprod, compress#
+#name=getfield, take, put, putmask, repeat, tofile, mean, trace,
diagonal, clip, std, var, sum, cumsum, prod, cumprod, compress#
*/
static PyObject *
@@ -1144,6 +1144,8 @@
METH_VARARGS, NULL},
{"std", (PyCFunction)gentype_std,
METH_VARARGS|METH_KEYWORDS, NULL},
+ {"var", (PyCFunction)gentype_var,
+ METH_VARARGS|METH_KEYWORDS, NULL},
{"sum", (PyCFunction)gentype_sum,
METH_VARARGS|METH_KEYWORDS, NULL},
{"cumsum", (PyCFunction)gentype_cumsum,
Index: base/code_generators/generate_array_api.py
===================================================================
--- base/code_generators/generate_array_api.py (revision 1534)
+++ base/code_generators/generate_array_api.py (working copy)
@@ -367,6 +367,10 @@
""",
'Std','PyArrayObject *, int, int','PyObject *'),
+ (r"""Var
+ """,
+ 'Var','PyArrayObject *, int, int','PyObject *'),
+
(r"""Sum
""",
'Sum','PyArrayObject *, int, int','PyObject *'),