nonuniform sampling with replacement
Alf P. Steinbach
alfps at start.no
Sun Mar 21 08:03:00 EDT 2010
* Jah_Alarm:
> I've got a vector length n of integers (some of them are repeating),
> and I got a selection probability vector of the same length. How will
> I sample with replacement k (<=n) values with the probabilty vector.
> In Matlab this function is randsample. I couldn't find anything to
> this extent in Scipy or Numpy.
<code>
#Py3
import operator # itemgetter
import random
from collections import defaultdict
def normalized_to_sum( s, v ):
current_s = sum( v )
c = s/current_s
return [c*x for x in v]
class ValueSampler:
def __init__( self, values, probabilities ):
assert len( values ) == len( probabilities )
get2nd = operator.itemgetter( 1 )
v_p = sorted( zip( values, probabilities ), key = get2nd, reverse = True )
v_ap = []; sum = 0;
for (v, p) in v_p:
v_ap.append( (v, p + sum) );
sum += p
self._data = v_ap
def __call__( self, p ):
return self.choice( p )
def choice( self, p ):
data = self._data; i_v = 0; i_p = 1;
assert 0 <= p <= 1
assert len( data ) > 0, "Sampler: Sampling from empty value set"
low = 0; high = len( data ) - 1;
if p > data[high][i_p]: return data[high][i_p] # Float values workaround.
while low != high:
mid = (low + high)//2
if p > data[mid][i_p]:
low = mid + 1
else:
high = mid
return data[low][i_v]
def main():
v = [3, 1, 4, 1, 5, 9, 2, 6, 5, 4];
p = normalized_to_sum( 1, [2, 7, 1, 8, 2, 8, 1, 8, 2, 8] )
sampler = ValueSampler( v, p )
probabilities = defaultdict( lambda: 0.0 )
for (i, value) in enumerate( v ):
probabilities[value] += p[i]
print( probabilities )
print()
frequencies = defaultdict( lambda: 0.0 )
n = 100000
for i in range( n ):
value = sampler( random.random() )
frequencies[value] += 1/n
print( frequencies )
main()
</code>
Cheers & hth.,
- Alf
Disclaimer: I just cooked it up and just cooked up binary searches usually have
bugs. They usually need to be exercised and fixed. But I think you get the idea.
Note also that division differs in Py3 and Py2. This is coded for Py3.
More information about the Python-list
mailing list