Spanish Accents

Thu Dec 22 11:22:52 EST 2011

On Thu, Dec 22, 2011 at 11:30 AM, Rami Chowdhury
<rami.chowdhury at gmail.com> wrote:

> Could you try using the 'open' function from the 'codecs' module?
>

I believe this is what you meant:

file = codecs.open(p + "2.txt", "r", "utf-8")
for line in file:
  print line

but I got this error:

 141 file = codecs.open(p + "2.txt", "r", "utf-8")
   142 for line in file:
   143   print line
   144
 *line* = '\r\n', *file* = <open file 'index2.txt', mode 'rb'>
/usr/lib64/python2.4/codecs.py in *next*(self=<open file 'index2.txt', mode
'rb'>)   492
   493         """ Return the next decoded line from the input stream."""
   494         return self.reader.next()
   495
   496     def __iter__(self):
 *self* = <open file 'index2.txt', mode 'rb'>, self.*reader* = <open file
'index2.txt', mode 'rb'>, self.reader.*next* = <bound method
StreamReader.next of <open file 'index2.txt', mode 'rb'>>
/usr/lib64/python2.4/codecs.py in *next*(self=<open file 'index2.txt', mode
'rb'>)   429
   430         """ Return the next decoded line from the input stream."""
   431         line = self.readline()
   432         if line:
   433             return line
 line *undefined*, *self* = <open file 'index2.txt', mode 'rb'>, self.*
readline* = <bound method StreamReader.readline of <open file 'index2.txt',
mode 'rb'>>  /usr/lib64/python2.4/codecs.py in *readline*(self=<open file
'index2.txt', mode 'rb'>, size=None, keepends=True)   344
         # If size is given, we call read() only once
   345         while True:
   346             data = self.read(readsize, firstline=True)
   347             if data:
   348
                 # If we're at a "\r" read one extra character (which might
 data *undefined*, *self* = <open file 'index2.txt', mode 'rb'>, self.*read* =
<bound method StreamReader.read of <open file 'index2.txt', mode 'rb'>>, *
readsize* = 72, firstline *undefined*, *builtin* *True* = True
/usr/lib64/python2.4/codecs.py in *read*(self=<open file 'index2.txt', mode
'rb'>, size=72, chars=-1, firstline=True)   291
             data = self.bytebuffer + newdata
   292             try:
   293
                 newchars, decodedbytes = self.decode(data, self.errors)
   294             except UnicodeDecodeError, exc:
   295                 if firstline:
 *newchars* = u'', *decodedbytes* = 0, *self* = <open file 'index2.txt',
mode 'rb'>, self.*decode* = <built-in function utf_8_decode>, *data* =
'\xe1 intentado para ellos bastante sabios para discernir lo obvio.
Tales perso',
self.*errors* = 'strict'

*UnicodeDecodeError*: 'utf8' codec can't decode bytes in position 0-2:
invalid data
      args = ('utf8', '\xe1 intentado para ellos bastante sabios para
discernir lo obvio. Tales perso', 0, 3, 'invalid data')
      encoding = 'utf8'
      end = 3
      object = '\xe1 intentado para ellos bastante sabios para discernir lo
obvio. Tales perso'
      reason = 'invalid data'
      start = 0

The byte it could not decode, '\xe1', is the letter á (a with an accent).
So I then tried with utf-16 instead and got this error:

 141 file = codecs.open(p + "2.txt", "r", "utf-16")
   142 for line in file:
   143   print line
   144
 *line* = '\r\n', *file* = <open file 'index2.txt', mode 'rb'>
/usr/lib64/python2.4/codecs.py in *next*(self=<open file 'index2.txt', mode
'rb'>)   492
   493         """ Return the next decoded line from the input stream."""
   494         return self.reader.next()
   495
   496     def __iter__(self):
 *self* = <open file 'index2.txt', mode 'rb'>, self.*reader* = <open file
'index2.txt', mode 'rb'>, self.reader.*next* = <bound method
StreamReader.next of <open file 'index2.txt', mode 'rb'>>
/usr/lib64/python2.4/codecs.py in *next*(self=<open file 'index2.txt', mode
'rb'>)   429
   430         """ Return the next decoded line from the input stream."""
   431         line = self.readline()
   432         if line:
   433             return line
 line *undefined*, *self* = <open file 'index2.txt', mode 'rb'>, self.*
readline* = <bound method StreamReader.readline of <open file 'index2.txt',
mode 'rb'>>  /usr/lib64/python2.4/codecs.py in *readline*(self=<open file
'index2.txt', mode 'rb'>, size=None, keepends=True)   344
         # If size is given, we call read() only once
   345         while True:
   346             data = self.read(readsize, firstline=True)
   347             if data:
   348
                 # If we're at a "\r" read one extra character (which might
 data *undefined*, *self* = <open file 'index2.txt', mode 'rb'>, self.*read* =
<bound method StreamReader.read of <open file 'index2.txt', mode 'rb'>>, *
readsize* = 72, firstline *undefined*, *builtin* *True* = True
/usr/lib64/python2.4/codecs.py in *read*(self=<open file 'index2.txt', mode
'rb'>, size=72, chars=-1, firstline=True)   291
             data = self.bytebuffer + newdata
   292             try:
   293
                 newchars, decodedbytes = self.decode(data, self.errors)
   294             except UnicodeDecodeError, exc:
   295                 if firstline:
 newchars *undefined*, decodedbytes *undefined*, *self* = <open file
'index2.txt', mode 'rb'>, self.*decode* = <bound method StreamReader.decode
of <open file 'index2.txt', mode 'rb'>>, *data* = '<span
class="text">\r\n<i>Noticia:
Este sitio web entre este portal est\xe1 i', self.*errors* = 'strict'
/usr/lib64/python2.4/encodings/utf_16.py in *decode*(self=<open file
'index2.txt', mode 'rb'>, input='<span class="text">\r\n<i>Noticia: Este
sitio web entre este portal est\xe1 i', errors='strict')    47
             self.decode = codecs.utf_16_be_decode
    48         elif consumed>=2:
    49
             raise UnicodeError,"UTF-16 stream does not start with BOM"
    50         return (object, consumed)
    51
 *builtin* *UnicodeError* = <class exceptions.UnicodeError>

*UnicodeError*: UTF-16 stream does not start with BOM
      args = ('UTF-16 stream does not start with BOM',)
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.python.org/pipermail/python-list/attachments/20111222/a60b05e5/attachment.html>