OT: doubts on my sexual identity

John Hunter jdhunter at ace.bsd.uchicago.edu
Fri Dec 13 18:23:37 EST 2002


>>>>> "Dave" == Dave Brueck <dave at pythonapocrypha.com> writes:

    Dave> On 13 Dec 2002, Michele Simionato wrote:
    >> I wonder is somebody thought to make a statistics of women in
    >> c.l.p. :

    Dave> Hey, maybe we could use _that_ as our next "Python
    Dave> popularity statistic".  ;-)

Certainly pythoneers seem to talk more about women than people on
other groups.  Here is another Google Groups metric that counts uses
of 'woman words' and divides the counts by some (hopefully) language
neutral baseline.  It appears that people on c.l.python talk about
women 5-10 times as much as people on other groups:

# total of "language neutral" baseline words
python baseline total: 94620
c++ baseline total: 539300
perl baseline total: 274800

# Counts by language, also normed by baseline
Word: woman
	python: 2370 0.0250
	c++: 2020 0.0037
	perl: 1640 0.0060

Word: female
	python: 1280 0.0135
	c++: 2090 0.0039
	perl: 387 0.0014

Word: women
	python: 2660 0.0281
	c++: 2250 0.0042
	perl: 781 0.0028



from __future__ import division
import urllib, re, sys

# Extracts the total hit count from a results page, i.e. the "N" in
# "...</b> of about <b>N</b>...".  N may contain thousands separators, so
# the group captures digits and commas.
# Written as a raw string so that "\d" reaches the regex engine verbatim
# instead of relying on Python passing an unknown string escape through.
# NOTE: with DOTALL the greedy leading ".*" spans newlines, so when this is
# applied with .match() the LAST such marker in the text is the one captured.
rgx = re.compile(r'.*</b> of about <b>([\d,]+)</b>.*', re.MULTILINE | re.DOTALL)

# gotta do some tricks to get around google's user agent policy
class AppURLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        self.version = "MSIE/5.0"
        apply(urllib.FancyURLopener.__init__, (self,) + args)


def get_counts(words, groups):
    """Return the reported hit count for every word in every group.

    One HTTP request is made per (word, group) pair, and the count is read
    from the response body with the module-level ``rgx`` pattern.

    Args:
        words: sequence of search terms (iterated once per group).
        groups: iterable of newsgroup name fragments; the query URL wraps
            each one in ``*...*`` wildcards.

    Returns:
        A dict mapping word -> {group: count}.  A (word, group) pair whose
        response does not match ``rgx`` is left out, so callers must not
        assume every combination is present.

    Side effects:
        Replaces ``urllib._urlopener`` with a fresh ``AppURLopener``.
    """
    urllib._urlopener = AppURLopener()
    fmt = 'http://groups.google.com/groups?as_q=%s&safe=images&ie=UTF-8&oe=UTF-8&as_ugroup=*%s*&lr=&hl=en'
    summary = {}
    for group in groups:
        for word in words:
            # Escape both terms before substitution: in a query string an
            # unescaped '+' (as in 'c++') is decoded as a space, and '&' or
            # '=' would corrupt the parameter list.
            url = fmt % (urllib.quote_plus(word), urllib.quote_plus(group))
            response = urllib._urlopener.open(url)
            try:
                page = response.read()
            finally:
                # Release the connection even if the read fails.
                response.close()
            m = rgx.match(page)
            if m is None:
                # No count on this page; skip the pair (best effort).
                continue
            count = int(m.group(1).replace(',', ''))
            summary.setdefault(word, {})[group] = count
    return summary

baseline = ('loop', 'program', 'help')
words = ('women', 'woman', 'female')
groups = ('python', 'perl', 'c++')

# Hit counts for the baseline words in each group.
baseSumm = get_counts(baseline, groups)

# Approximate each group's total volume by summing its baseline counts.
totals = {}
for grpDict in baseSumm.values():
    for grp, cnt in grpDict.items():
        totals[grp] = totals.get(grp, 0) + cnt

# print(...) with a single argument produces the same output whether print
# is a statement or a function.
for grp, cnt in totals.items():
    print('%s baseline total: %d' % (grp, cnt))

print('')
wordSumm = get_counts(words, groups)

for word, grpDict in wordSumm.items():
    print('Word: %s' % word)
    for grp, cnt in grpDict.items():
        # get_counts() omits lookups it could not parse, so a group may have
        # no baseline total (or a zero one); report the raw count without a
        # normalized value instead of failing on the lookup or division.
        total = totals.get(grp)
        if total:
            # True division comes from the file's
            # "from __future__ import division".
            print('\t%s: %d %1.4f' % (grp, cnt, cnt / total))
        else:
            print('\t%s: %d n/a' % (grp, cnt))
    print('')
    


Go figure,
John Hunter





More information about the Python-list mailing list