[Python-checkins] python/nondist/sandbox/statistics statistics.py, 1.7, 1.8 test_statistics.py, 1.2, 1.3

Sun Feb 29 04:12:30 EST 2004

Update of /cvsroot/python/python/nondist/sandbox/statistics
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv18374

Modified Files:
	statistics.py test_statistics.py 
Log Message:
* Adopt Skips suggestion to factor out variance() from standard deviation.
  While the variance is not often used, converting back from the standard
  deviation loses precision.

* Alter select() for a tri-partite partition instead of a two-way 
  partition.  This increases the number of comparisons by 50% but
  prevents the algorithm from losing its O(n) behavior when the
  dataset contains many equal elements.

Index: statistics.py
===================================================================
RCS file: /cvsroot/python/python/nondist/sandbox/statistics/statistics.py,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -d -r1.7 -r1.8
*** statistics.py	17 Feb 2004 17:39:26 -0000	1.7
--- statistics.py	29 Feb 2004 09:12:28 -0000	1.8
***************
*** 46,55 ****
          raise ValueError('data must have at least one element')

! def stddev(data, sample=True):
!     """Computes the standard deviation of the dataset.

!     If sample is True, computes the standard deviation with an n-1 divisor
!     for an unbiased estimator for a data sample.  If sample is False, computes
!     with a divisor of n, giving the standard deviation for a complete population.
      """

--- 46,55 ----
          raise ValueError('data must have at least one element')

! def variance(data, sample=True):
!     """Computes the variance of the dataset.

!     If sample is True, computes the variance with an n-1 divisor to reflect
!     the additional degree of freedom in sampled data.  If sample is False,
!     computes with a divisor of n, giving the variance for a complete population.
      """

***************
*** 81,115 ****
      if sample:
          try:
!             return (s / (k-1)) ** 0.5   # sample standard deviation
          except ZeroDivisionError:
!             raise ValueError('sample deviation requires at least two elements')
      else:
!         return (s / k) ** 0.5           # population standard deviation

  def select(data, n):
      """Find the nth rank ordered element (the least value has rank 0).
      """
      try:
          while True:
!             it = iter(data)
!             pivot = it.next()
!             under, notunder = [], []
!             ua, nua = under.append, notunder.append
!             for elem in it:
                  if elem < pivot:
!                     ua(elem)
!                 else:
!                     nua(elem)
              if n < len(under):
                  data = under
!             elif n == len(under):
                  return pivot
              else:
!                 data = notunder
!                 n -= len(under) + 1
!             if len(data) > 3:
!                 swap = random.randrange(len(data))
!                 data[0], data[swap] = data[swap], data[0]
!     except StopIteration:
          raise ValueError('not enough elements for the given rank')

--- 81,124 ----
      if sample:
          try:
!             return (s / (k-1))          # sample variance
          except ZeroDivisionError:
!             raise ValueError('sample variance requires at least two elements')
      else:
!         return (s / k)                  # population variance
! 
! def stddev(data, sample=True):
!     """Computes the standard deviation of the dataset.
! 
!     If sample is True, computes the standard deviation with an n-1 divisor
!     for an unbiased estimator for a data sample.  If sample is False, computes
!     with a divisor of n, giving the standard deviation for a complete population.
!     """
!     return variance(data, sample) ** 0.5

  def select(data, n):
      """Find the nth rank ordered element (the least value has rank 0).
+ 
+     Equivalent to sorted(data)[n]
      """
+     data = list(data)
      try:
          while True:
!             pivot = random.choice(data)
!             pcount = data.count(pivot)
!             over, under = [], []
!             uappend, oappend = under.append, over.append
!             for elem in data:
                  if elem < pivot:
!                     uappend(elem)
!                 elif elem > pivot:
!                     oappend(elem)
              if n < len(under):
                  data = under
!             elif n < len(under) + pcount:
                  return pivot
              else:
!                 data = over
!                 n -= len(under) + pcount
!     except IndexError:
          raise ValueError('not enough elements for the given rank')

Index: test_statistics.py
===================================================================
RCS file: /cvsroot/python/python/nondist/sandbox/statistics/test_statistics.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** test_statistics.py	17 Feb 2004 13:00:16 -0000	1.2
--- test_statistics.py	29 Feb 2004 09:12:28 -0000	1.3
***************
*** 51,73 ****

      def test_select(self):
!         n = 200
!         a = range(n)
          random.shuffle(a)
          for i in xrange(100):
              nth = random.randrange(n)
!             self.assertEqual(select(a, nth), nth)

          # Try with sorted data
!         a = range(n)
          for i in xrange(100):
              nth = random.randrange(n)
!             self.assertEqual(select(a, nth), nth)

          # Try with reverse sorted data
!         a = range(n)
          a.reverse()
          for i in xrange(100):
              nth = random.randrange(n)
!             self.assertEqual(select(a, nth), nth)

      def test_nlargest(self):
--- 51,77 ----

      def test_select(self):
!         testdata = range(2000) + [20] * 500 + [85] * 500
!         sorteddata = sorted(testdata)
!         n = len(testdata)
!         a = testdata[:]
          random.shuffle(a)
          for i in xrange(100):
              nth = random.randrange(n)
!             self.assertEqual(select(a, nth), sorteddata[nth])

          # Try with sorted data
!         a = sorteddata[:]
          for i in xrange(100):
              nth = random.randrange(n)
!             self.assertEqual(select(a, nth), sorteddata[nth])

          # Try with reverse sorted data
!         a = sorteddata[:]
          a.reverse()
          for i in xrange(100):
              nth = random.randrange(n)
!             self.assertEqual(select(a, nth), sorteddata[nth])
! 
!         self.assertRaises(ValueError, select, a, n+1)

      def test_nlargest(self):