# getting a submatrix of all true

John Hunter jdhunter at ace.bsd.uchicago.edu
Tue Jul 8 07:11:39 CEST 2003

```>>>>> "John" == John Hunter <jdhunter at ace.bsd.uchicago.edu> writes:
John>Using this approach, I was able to find a 893x67 submatrix
John>with a score of 59831 versus the 462x81 submatrix with a
John>score of 37422.

Onwards and upwards....  Since this is beginning to look like a
combinatorial optimization problem, I followed the lead of simulated
annealing algorithms and added a little noise to the step where the
decision to delete the row or the column is made:

from MLab import randn
def trim_missing(a, thresh=(0.05,0.05), sigma=0.005 ):
    ...snip...
    if fracRow-vthresh+sigma*randn()<fracCol-othresh:
        # delete the column
    else:
        # delete the row

where sigma is like the temperature parameter of a simulated annealing
algorithm.

Repeatedly stepping over a variety of sigmas in search of a minimum
has yielded minor improvement for the dataset of interest. Best
results:

Deterministic: 59831 893 67
Random:        59928 908 66 for sigma=0.0017

For random arrays the results are more striking.  For random matrices
500x500, comparing the scores of the deterministic (sigma=0) versus
random (best sigma in [0,0.01]) searches:

score        score
% ones   mean determ  mean random    mean(ratio)  std(ratio)
1%     41753.2       42864.6         1.03         0.0093
5%      5525.8        6174.2         1.12         0.06
10%      1968.3        2193.9         1.12         0.05

For a matrix of the same number of total elements, but unbalanced in
the dimensions, eg, 1000x250, the results are more pronounced, in
favor of the random search algorithm

score        score
% ones   mean determ  mean random    mean(ratio)  std(ratio)
1%     48724.0       51451.8         1.06         0.018
5%      6340.5        7542.3         1.19         0.038
10%      2092.5        2458.6         1.180        0.091

Below is a script that performs a random search of a random matrix for
a variety of sigmas.

JDH

from __future__ import division
import sys, time
from urllib import urlopen
from Numeric import *
from MLab import randn, rand, mean, std

def trim_missing(a, thresh=(0.05, 0.05), sigma=0.005):
    """Find a large all-kept submatrix by greedily deleting rows/columns.

    a is a numObservations by numVariables boolean matrix

    thresh = (othresh, vthresh) are the observation and variable
    thresholds that trigger deletion

    remove any rows where percent missing vars > vthresh
    remove any cols where percent missing obs > othresh

    This is done iteratively, rechecking the percent missing after each
    row or column removal, with the largest offender removed on each
    iteration.  sigma adds Gaussian noise to the delete-row-vs-column
    decision (simulated-annealing-style); sigma=0 is deterministic.

    return value is score, rowInd, colInd where the two ind arrays
    are the indices of the rows and cols that were kept
    """
    othresh, vthresh = thresh  # thresholds for deletion
    dr = zeros(a.shape[0])     # 1 marks a deleted row...
    dc = zeros(a.shape[1])     # ... or a deleted column
    sizeleft = list(a.shape)   # remaining [rows, cols]
    sr = sum(a, 1)             # row scores: ones per row
    sc = sum(a, 0)             # column scores: ones per column

    while 1:                   # keep deleting till criteria met
        mr = argmax(sr)        # index of worst row
        mc = argmax(sc)        # index of worst column
        fracRow = sr[mr] / sizeleft[1]   # worst row's fraction of ones
        fracCol = sc[mc] / sizeleft[0]   # worst col's fraction of ones

        if fracRow <= vthresh and fracCol <= othresh:
            break              # stop when missing criteria are satisfied

        # noisy comparison decides whether the row or the column goes
        if fracRow - vthresh + sigma * randn() < fracCol - othresh:
            # delete the column
            sr = sr - a[:, mc]   # row scores reduced by column contents
            sc[mc] = 0           # sum of column now zero
            dc[mc] = 1           # tag column as deleted
            sizeleft[1] -= 1
        else:
            # delete the row
            sc = sc - a[mr, :]   # column scores reduced by row contents
            sr[mr] = 0           # sum of row now zero
            dr[mr] = 1           # tag row as deleted
            sizeleft[0] -= 1

    # score = area (rows kept * cols kept) of the surviving submatrix
    score = (a.shape[0] - sum(dr)) * (a.shape[1] - sum(dc))
    dr = compress(dr == 0, range(a.shape[0]))  # indices of kept rows
    dc = compress(dc == 0, range(a.shape[1]))  # indices of kept cols

    return score, dr, dc

# Compare the deterministic search (sigma=0) against a random search
# over a range of sigmas, on random ~10%-ones 1000x250 matrices.
results = []
for trial in range(10):
    # random boolean matrix: entries are 1 with probability ~0.10
    a = rand(1000, 250)
    a = where(a > 0.90, 1, 0)

    # deterministic baseline
    score0, goodRows, goodCols = trim_missing(a, thresh=(0, 0), sigma=0)
    maxScore = 0, score0, len(goodRows), len(goodCols)
    print('Deterministic: %s %s %s' % (score0, len(goodRows), len(goodCols)))

    # random search: 10 restarts at each noise level
    for sigma in arange(0.0001, 0.01, 0.001):
        for i in range(10):
            score, goodRows, goodCols = trim_missing(
                a, thresh=(0, 0), sigma=sigma)
            if score > maxScore[1]:
                maxScore = sigma, score, len(goodRows), len(goodCols)
    print('\t%s' % (maxScore,))

    # record (deterministic score, best random score, improvement ratio)
    results.append((score0, maxScore[1], maxScore[1] / score0))

results = array(results)
print(mean(results))
print(std(results))

```