Forcing output as unicode for web

Hasan Diwan diwanh at cs.rpi.edu
Sat Mar 1 02:56:07 CET 2003


i have a webpage generated from scraping through various rss feeds
using python, i'd like to display non-ascii characters on the page
(asian characters, pretty much) when the script hits a non-ascii rss
feed, it crashes, how do i solve this? The script is attached below:
#!/sw/bin/python
from xml.dom import minidom
import string
import urllib
import time
import sys
import re
import zlib
user='buzz'
passwd='baz'
site = 'foo.bar.com'
class news:
        def load(self, url):
         p=None
         try:
          p=minidom.parse(urllib.urlopen(url))
         except:
          print 'unreachable URL '+url,
         return p
        DEFAULT_NAMESPACES = \
          (None, # RSS 0.91, 0.92, 0.93, 0.94, 2.0
           'http://purl.org/rss/1.0/', # RSS 1.0
           'http://my.netscape.com/rdf/simple/0.9/' # RSS 0.90
           )

        def getElementsByTagName(self, node, tagName,
possibleNamespaces=DEFAULT_NAMESPACES):
           for namespace in possibleNamespaces:
              children = node.getElementsByTagName(tagName)
              if len(children): return children
              return []
        def first(self,node, tagName,
possibleNamespaces=DEFAULT_NAMESPACES):
          children = node.getElementsByTagName(tagName)
          return len(children) and children[0] or None

        def textOf(self,node):
          return node and "".join([child.data for child in
node.childNodes]) or ""
DUBLIN_CORE = ('http://purl.org/dc/elements/1.1/',)

if __name__ == '__main__':
          import sys
          n = news()
          o = open('/tmp/news.html', 'w')
          o.write('<?xml version="1.0" encoding="utf8"?>\n')
          o.write('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0
Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n')
          o.write('<html xmlns="http://www.w3.org/1999/xhtml"
lang="en-US" xml:lang="en-US"><head><title>News Summary</title>
</head><body link="#0000c0" alink="#0000f0" vlink="#0000a0"
bgcolor="#ffffff" text="#0000cc"><table width="100%" cellpading="0"
cellspacing="0" border="0"><tr valign="top"><td width="33%">')
          o.write(time.asctime(time.gmtime()))
          o.write(''''</td> <td width="33%"><center>News
Summary</center></td> <td width="33%"><div align="right"><a
href="index.html">Top</a></div></td></tr></table><div align="left"><a
href="http://www.syndic8.com">Some RSS/RDF feeds</a> -- Contact me if
you'd like to see these on the feed.''')
          f = open('.rssfeeds')
          for x in f.readlines():
           rssDocument = n.load(x)
           if re.compile(r"gz$").match(x): # Compressed file,
decompress
                   rssDocument = zlib.decompress(rssDocument)
           if rssDocument == None: pass
           print x
           o.write('<hr /><br /><center>Headlines from
'+x.encode('utf8')+'</center><br />')
           for item in n.getElementsByTagName(rssDocument, 'item'):
             url = n.textOf(n.first(item, 'link'))
             o.write( '<a href='+url+'>')
             o.write( n.textOf(n.first(item, 'title'))+'</a>:
'+n.textOf(n.first(item, 'description'))+ '<br />')
          o.write( '<hr /><address>Hasan Diwan &lt;<a
href="mailto:hdiwan at mac.com">blah at diespamdie.com</a>&gt;<br><font
size="-1">No, that is not my real email address</font></body></html>'
)
          o.close()
          o1 = open('/tmp/news.html')
          import ftplib
          ftp=ftplib.FTP(site)
          ftp.login(user,passwd)
          ftp.cwd('public_html')
          ftp.storlines('STOR news.html',o1)
          ftp.quit()
          o1.close()
          import posix
          posix.unlink('/tmp/news.html')
---
An example of a feed which will crash the generator is
http://www.newsisfree.com/HPE/xml/feeds/20/820.xml
(Le Monde's International RSS feed). My plans include adding an
Chinese and maybe a Cyrillic feed as well. Thanks in advance for the
help.
--
Hasan Diwan [http://ibn.com/~hdiwan]




More information about the Python-list mailing list