parsing incoming emails
Michiel Overtoom
motoom at xs4all.nl
Thu Jul 10 16:13:53 EDT 2008
Ahmed wrote...
> I am working on a project where I need to parse incoming emails
> (Microsoft outlook)
I'm not sure if you are able to bypass Outlook (and have Python fetch the
mail itself using poplib), but if you are, the following code might be
useful. I use this to pry apart emails which might contain multiple MIME parts.
import logging
import poplib
import smtplib
from email.Parser import Parser
from rfc822 import parseaddr
# POP3 account settings. These are placeholder values; replace them with the
# real host and credentials before running the script.
popserver = "pop.site.com"
# The archived copy read "user at site.com", which looks like the list
# archive's address obfuscation; a login name needs the "@" form.
popuser = "user@site.com"
# NOTE(review): a password hard-coded in source is only acceptable for a demo.
poppassword = "secret"
# split a message into a header part and a body part
def separate(msg):
    """Split a message into its header lines and its body lines.

    msg is either the whole message as one string or a list of lines that
    has already been split.  The first empty line marks the end of the
    headers; that separator line is returned in neither part.

    A string is split on '\\n' and a trailing '\\r' is dropped from every
    line, so CRLF-terminated text is handled as well.  A list is used as
    given.

    Returns a (headers, body) tuple of lists.  When the message contains no
    empty line, every line is treated as a header and the body is empty.
    """
    if isinstance(msg, str):
        lines = [ln[:-1] if ln.endswith('\r') else ln for ln in msg.split('\n')]
    else:
        lines = msg
    try:
        boundary = lines.index('')
    except ValueError:
        # No blank line at all: a header-only message.
        return list(lines), []
    return lines[:boundary], lines[boundary + 1:]
# return a certain headerline from the headers
def headerline(header, tag="From: "):
    """Return the value of the first header whose field name matches tag.

    header is an iterable of header lines.  tag is the field name; a
    trailing colon and surrounding whitespace are optional, so "From",
    "From:" and "From: " all select the same header.  The field name is
    compared case-insensitively.

    Returns the text after the first colon with surrounding whitespace
    removed, or "" when no line matches.  Continuation lines of a folded
    header are not appended to the value.
    """
    name = tag.strip().rstrip(':').rstrip().lower()
    for line in header:
        # A line starting with whitespace continues the previous header.
        if line[:1] in (' ', '\t'):
            continue
        field, colon, value = line.partition(':')
        if colon and field.strip().lower() == name:
            return value.strip()
    return ""
# enumerate recursively the contents of a MIME message
# remember the first text/plain and text/html part(s) that is found
# also remember if any other parts were found (like attachments)
#
def _parttext(part):
    """Return the body of a leaf MIME part with its transfer encoding undone."""
    payload = part.get_payload(decode=True)
    if payload is None:
        return ""
    if isinstance(payload, str):
        # Python 2: the decoded payload is already a plain string.
        return payload
    # Python 3: the decoded payload is bytes; turn it into text.
    charset = part.get_content_charset() or "ascii"
    try:
        return payload.decode(charset, "replace")
    except LookupError:
        # Charset name not known to this Python; fall back to ASCII.
        return payload.decode("ascii", "replace")


def enummimeparts(msg, extract, level=1, verbose=False):
    """Walk a MIME message recursively and collect its text parts in extract.

    msg is the message source as a string; an already parsed message object
    is accepted too, which is how the recursion hands over sub-parts without
    serialising and re-parsing them.

    extract is a dict that is updated in place:
      extract["text/plain"]  body of the first text/plain part found
      extract["text/html"]   body of the first text/html part found
      extract["others"]      True if any further part was seen (a second
                             text part, an attachment, ...)
    Bodies are decoded according to the part's Content-Transfer-Encoding.

    level is the nesting depth, used only to indent the verbose output.
    verbose prints the content type of every part visited.
    """
    m = msg if hasattr(msg, "is_multipart") else Parser().parsestr(msg)
    if m.is_multipart():
        if verbose:
            print("%s %s" % ('\t' * level, 'multipart'))
        for part in m.get_payload():
            enummimeparts(part, extract, level + 1, verbose)
        return
    t = m.get_content_type()
    if verbose:
        print("%s %s" % ('\t' * level, t))
    if t in ("text/plain", "text/html") and t not in extract:
        extract[t] = _parttext(m)
    else:
        # Either a non-text part or a repeated text part.
        extract["others"] = True
# extract the first 'text/plain' and 'text/html' mime-parts from a message
def extracttext(msg):
    """Return the first text/plain and text/html bodies of a message.

    msg is passed to enummimeparts, which fills in the result dict.

    Returns a (plain, html, others) tuple: plain and html are the body
    strings or None when no such part was found; others is True when
    enummimeparts flagged additional parts, otherwise False.
    """
    extract = {}
    enummimeparts(msg, extract)
    return (
        extract.get("text/plain", None),
        extract.get("text/html", None),
        extract.get("others", False),
    )
def processmessage(msgnr):
    """Fetch one message from the POP server, process its text, delete it.

    msgnr is the message number passed to pop.retr and pop.dele, where pop
    is the module-level POP3 connection.

    The sender and subject are logged, the text is taken from the first
    text/plain part (or the first text/html part if there is no plain
    text) and handed to processtext.  The message is marked for deletion
    only after processtext has returned.

    NOTE(review): processtext is not defined in this file and has to be
    supplied by whoever uses this script.
    """
    # get a message from the POP server, extract the parts
    lines = pop.retr(msgnr)[1]
    msg = '\n'.join(lines)
    headers = separate(lines)[0]
    fromaddress = parseaddr(headerline(headers, "From:"))[1]
    subject = headerline(headers, "Subject:")
    # %-style arguments: still logs if the address could not be parsed
    # (presumably parseaddr may then yield None -- TODO confirm).
    logging.info("%s (%s)", subject, fromaddress)
    plain, html, _others = extracttext(msg)
    # prefer flat text; if not present in the message, fall back to HTML
    # content (if any)
    texttoprocess = plain or html or ""
    # now do something useful with the text
    processtext(texttoprocess)
    # delete message from pop server after processing
    pop.dele(msgnr)
# connect to the pop server and process all messages
# Log in, process every message in the mailbox, and always end the session.
logging.info("Checking pop server '%s', user '%s'", popserver, popuser)
pop = poplib.POP3(popserver)
try:
    pop.user(popuser)
    pop.pass_(poppassword)
    stat = pop.stat()
    # stat[0] is the number of messages; they are numbered from 1.
    for n in range(1, stat[0] + 1):
        processmessage(n)
finally:
    # Send QUIT even when a message fails, so the session is closed cleanly.
    pop.quit()
--
"The ability of the OSS process to collect and harness
the collective IQ of thousands of individuals across
the Internet is simply amazing." - Vinod Valloppillil
http://www.catb.org/~esr/halloween/halloween4.html
More information about the Python-list
mailing list