Filtering computer.lang.python
Wanderer
wanderer at dialup4less.com
Tue Apr 10 11:28:58 EDT 2018
On Tuesday, April 10, 2018 at 3:28:05 AM UTC-4, Thomas Jollans wrote:
> On 2018-04-10 07:06, T Berger wrote:
> > This is the first time I've joined a google group and I don't understand the setup. Why are most of the posts in this group unrelated to python, and how do I filter this junk (sorry) out?
> >
>
> Welcome to python-list/comp.lang.python!
>
> This isn't originally a Google group. Google just mirrors the old USENET
> group, which is awash with spam.
>
> There is also a mailing list version of this group (posts are mirrored
> both ways) at https://mail.python.org/mailman/listinfo/python-list
>
> The mailing list has proper spam filtering and some moderation. None (or
> barely any) of the regulars use Google Groups. Some people use USENET
> directly and maintain their own extensive filtering regime to make it
> readable. Probably most of us use the mailing list, because it's just so
> much nicer!
>
> -- Thomas
Here's my Python code for filtering Google Groups again. You need to bookmark the compiled .pyc files to run them from the bookmarks in Firefox. You also need to create the bannedAuthors.txt and bannedSubjects.txt files.
# remove banned author and authors with mostly caps
# to compile to pyc
#>>>import py_compile
#>>>py_compile.compile("file.py")
import urllib2
import webbrowser
import os
from bs4 import BeautifulSoup
import argparse
class Usage(Exception):
    """Exception carrying a usage-error message.

    msg -- human-readable description of the usage problem
    """
    def __init__(self, msg):
        # Pass msg to the Exception base class so str(e) and e.args
        # show the message (the original left them empty); keep the
        # .msg attribute for backward compatibility.
        Exception.__init__(self, msg)
        self.msg = msg
# User-Agent strings: PALEMOON/WATERFOX are complete examples;
# USERAGENTBASE is completed at runtime by getUserAgentVersion().
PALEMOON = 'Mozilla/5.0 (Windows NT 6.1; WOW64) KHTML/4.11 Gecko/20130308 Firefox/33.0 (PaleMoon/25.2)'
WATERFOX = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:40.0) Gecko/20100101 Firefox/51.1.0 Waterfox/51.1.0'
USERAGENTBASE = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:40.0) Gecko/20100101 '
# Local browser executable and the HTML files the filtered pages are written to.
BROWSERPATH = 'C:\\"Program Files"\\Waterfox\\waterfox.exe'
FILENAME = 'C:\\Pystuff\\pygroup.htm'
SEDFILENAME = 'C:\\Pystuff\\SED.htm'
# Pieces of the Google Groups "escaped fragment" URL; %5B/%5D are the
# URL-encoded '[' and ']' that bracket the post-number range.
WEBPAGE_START = "https://groups.google.com/forum/?_escaped_fragment_=forum/"
PYGROUP_WEBPAGE = "comp.lang.python%5B"
SED_WEBPAGE = "sci.electronics.design%5B"
WEBPAGE_END = "%5D"
# Text files with one banned author / banned subject string per line.
BANNED_AUTHORS_FILE = 'C:\\Pystuff\\bannedAuthors.txt'
BANNED_SUBJECTS_FILE = 'C:\\Pystuff\\bannedSubjects.txt'
def getUserAgentVersion():
    """Build the browser-version part of the User-Agent string.

    Returns a string in the format 'Firefox/51.1.0 Waterfox/51.1.0',
    derived from the output of running the browser with "-v".
    """
    # The "-v" output is split into words; word 1 is the browser name
    # and word 2 its version number.
    words = os.popen(BROWSERPATH + " -v").read().split()
    return 'Firefox/%s %s/%s' % (words[2], words[1], words[2])
def getwebpage(url):
    """Fetch the source of a web page.

    url -- the url of the page to fetch
    returns the raw page source as a string
    """
    # Send a real browser User-Agent; Google Groups rejects the
    # default urllib2 agent string.
    headers = {'User-Agent': USERAGENTBASE + getUserAgentVersion()}
    request = urllib2.Request(url, None, headers)
    return urllib2.urlopen(request).read()
def getBannedAuthors():
    """ Convert the banned authors text file into a list
    returns
    bannedAuthors -- list of banned author strings
    """
    # "with" guarantees the file is closed even if read() raises,
    # unlike the previous explicit open()/close() pair.
    with open(BANNED_AUTHORS_FILE, 'r') as f:
        return f.read().split('\n')
def getBannedSubjects():
    """ Convert the banned subjects text file into a list
    returns
    bannedSubjects -- list of banned subject strings
    """
    # "with" guarantees the file is closed even if read() raises,
    # unlike the previous explicit open()/close() pair.
    # (Docstring also fixed: it previously described banned authors.)
    with open(BANNED_SUBJECTS_FILE, 'r') as f:
        return f.read().split('\n')
def removeBadAuthors(html_doc, filecode):
    """ Remove posts from google group by authors that are mostly caps or on the Banned List
    html_doc -- an html document
    filecode -- mode used to open FILENAME ('w' to overwrite, 'a' to append)
    returns the number of posts that survived filtering
    """
    bannedAuthors = getBannedAuthors()
    bannedSubjects = getBannedSubjects()
    #print bannedAuthors
    soup = BeautifulSoup(html_doc)
    #print soup.prettify()
    # Each post in the Google Groups topic listing is a <tr> row.
    post = soup.find("tr")
    postcount = 0        # rows kept after filtering
    banNoneCount = 0     # rows missing an author or subject cell
    banNameCount = 0     # rows removed via the banned-authors list
    banBigCount = 0      # rows removed for mostly-caps / very long names
    banSubjectCount = 0  # rows removed via the banned-subjects list
    while post is not None:
        postcount += 1
        author = post.find("td", "author")
        subject = post.find("td", "subject")
        if author is None or subject is None:
            print "Author is None"
            # Advance to the next row BEFORE destroying the current one,
            # otherwise find_next_sibling has nothing to walk from.
            oldpost = post
            post = oldpost.find_next_sibling('tr')
            oldpost.decompose()
            postcount = postcount - 1
            banNoneCount += 1
        else:
            aname = author.get_text()
            print aname.encode("ascii", "ignore")
            # Normalise the subject for matching: lower-case, spaces stripped.
            asubject = ((subject.get_text()).lower()).replace(" ", "")
            bannedsubject = False
            for badsubject in bannedSubjects:
                print "BAD SUBJECT", badsubject
                # Substring match; patterns of 3 chars or fewer are ignored
                # to avoid accidental matches (and skips blank lines).
                if badsubject in asubject and len(badsubject) > 3:
                    print "ASUBJECT", asubject.encode("ascii", "ignore")
                    bannedsubject = True
                    break
            if bannedsubject:
                print "Subject is Banned"
                oldpost = post
                post = oldpost.find_next_sibling('tr')
                oldpost.decompose()
                postcount = postcount - 1
                banSubjectCount += 1
            elif aname in bannedAuthors or \
                'smtb' in aname:
                print "Author is Banned"
                oldpost = post
                post = oldpost.find_next_sibling('tr')
                oldpost.decompose()
                postcount = postcount - 1
                banNameCount += 1
            else:
                print author
                # Fraction of upper-case characters in the author name.
                numCaps = 1.0 * sum(1 for c in aname if c.isupper())
                ratio = numCaps/(1.0*len(aname))
                print ratio
                oldpost = post
                post = oldpost.find_next_sibling('tr')
                # Drop names that are more than 70% caps or over 35 chars.
                if ratio > 0.7 or len(aname) > 35:
                    oldpost.decompose()
                    postcount = postcount - 1
                    banBigCount += 1
                    print "BIG"
    if post is None: print "Post is NONE"
    # Write the filtered page plus a summary of the ban statistics.
    f = open(FILENAME, filecode)
    f.write(soup.prettify().encode('ascii', 'ignore') + '<br>\n\r')
    f.write('<a> Banned No Name: ' + str(banNoneCount) + '</a>, ')
    f.write('<a> Banned Name: ' + str(banNameCount) + '</a>, ')
    f.write('<a> All Uppercase Name: ' + str(banBigCount) + '</a>, ')
    f.write('<a> Banned Subject: ' + str(banSubjectCount) + '</a>, ')
    f.write('<a> Total Banned: ' + str(banNoneCount +banNameCount + banBigCount + banSubjectCount) + '</a><br>\n\r')
    f.close()
    return postcount
def main(sed = None):
if sed is None:
parser = argparse.ArgumentParser()
parser.add_argument('-s', '--sed' , help="load sci.electronics.design group", action="store_true")
args = parser.parse_args()
if args.sed:
webgroup = SED_WEBPAGE
else:
webgroup = PYGROUP_WEBPAGE
else:
if sed:
webgroup = SED_WEBPAGE
else:
webgroup = PYGROUP_WEBPAGE
postcount = 0
numberOposts = 0
filecode = 'w'
while postcount < 10:
webpage = WEBPAGE_START + webgroup + str(numberOposts + 1) + '-' + str(numberOposts + 50) + WEBPAGE_END
print webpage
html_doc = getwebpage(webpage)
postcount += removeBadAuthors(html_doc, filecode)
if postcount < 10:
numberOposts += 50
filecode = 'a'
print "postcount less than 10", postcount
print "number of posts", numberOposts
webbrowser.open(FILENAME)
print 'done'
if __name__ == "__main__":
main()
More information about the Python-list
mailing list