[Python-checkins] r83200 - in python/branches/release26-maint: Lib/io.py Lib/test/test_io.py Misc/NEWS
victor.stinner
python-checkins at python.org
Wed Jul 28 03:58:41 CEST 2010
Author: victor.stinner
Date: Wed Jul 28 03:58:41 2010
New Revision: 83200
Log:
Issue #5006: Better handling of Unicode byte-order marks (BOM) in the io
library. This means, for example, that opening a UTF-16 text file in append
mode doesn't add a BOM at the end of the file if the file isn't empty.
Modified:
python/branches/release26-maint/Lib/io.py
python/branches/release26-maint/Lib/test/test_io.py
python/branches/release26-maint/Misc/NEWS
Modified: python/branches/release26-maint/Lib/io.py
==============================================================================
--- python/branches/release26-maint/Lib/io.py (original)
+++ python/branches/release26-maint/Lib/io.py Wed Jul 28 03:58:41 2010
@@ -1440,6 +1440,15 @@
self._snapshot = None # info for reconstructing decoder state
self._seekable = self._telling = self.buffer.seekable()
+ if self._seekable and self.writable():
+ position = self.buffer.tell()
+ if position != 0:
+ try:
+ self._get_encoder().setstate(0)
+ except LookupError:
+ # Sometimes the encoder doesn't exist
+ pass
+
# self._snapshot is either None, or a tuple (dec_flags, next_input)
# where dec_flags is the second (integer) item of the decoder state
# and next_input is the chunk of input bytes that comes next after the
@@ -1726,6 +1735,17 @@
raise IOError("can't restore logical file position")
self._decoded_chars_used = chars_to_skip
+ # Finally, reset the encoder (merely useful for proper BOM handling)
+ try:
+ encoder = self._encoder or self._get_encoder()
+ except LookupError:
+ # Sometimes the encoder doesn't exist
+ pass
+ else:
+ if cookie != 0:
+ encoder.setstate(0)
+ else:
+ encoder.reset()
return cookie
def read(self, n=None):
Modified: python/branches/release26-maint/Lib/test/test_io.py
==============================================================================
--- python/branches/release26-maint/Lib/test/test_io.py (original)
+++ python/branches/release26-maint/Lib/test/test_io.py Wed Jul 28 03:58:41 2010
@@ -799,6 +799,37 @@
self.assertEquals(d.decode(b'oiabcd'), '')
self.assertEquals(d.decode(b'', 1), 'abcd.')
+ def test_append_bom(self):
+ # The BOM is not written again when appending to a non-empty file
+ filename = test_support.TESTFN
+ for charset in ('utf-8-sig', 'utf-16', 'utf-32'):
+ with io.open(filename, 'w', encoding=charset) as f:
+ f.write('aaa')
+ pos = f.tell()
+ with io.open(filename, 'rb') as f:
+ self.assertEquals(f.read(), 'aaa'.encode(charset))
+
+ with io.open(filename, 'a', encoding=charset) as f:
+ f.write('xxx')
+ with io.open(filename, 'rb') as f:
+ self.assertEquals(f.read(), 'aaaxxx'.encode(charset))
+
+ def test_seek_bom(self):
+ # Same test, but when seeking manually
+ filename = test_support.TESTFN
+ for charset in ('utf-8-sig', 'utf-16', 'utf-32'):
+ with io.open(filename, 'w', encoding=charset) as f:
+ f.write('aaa')
+ pos = f.tell()
+ with io.open(filename, 'r+', encoding=charset) as f:
+ f.seek(pos)
+ f.write('zzz')
+ f.seek(0)
+ f.write('bbb')
+ with io.open(filename, 'rb') as f:
+ self.assertEquals(f.read(), 'bbbzzz'.encode(charset))
+
+
class TextIOWrapperTest(unittest.TestCase):
def setUp(self):
Modified: python/branches/release26-maint/Misc/NEWS
==============================================================================
--- python/branches/release26-maint/Misc/NEWS (original)
+++ python/branches/release26-maint/Misc/NEWS Wed Jul 28 03:58:41 2010
@@ -84,6 +84,10 @@
Library
-------
+- Issue #5006: Better handling of Unicode byte-order marks (BOM) in the io
+ library. This means, for example, that opening a UTF-16 text file in append
+ mode doesn't add a BOM at the end of the file if the file isn't empty.
+
- Issue #3704: cookielib was not properly handling URLs with a / in the
parameters.
More information about the Python-checkins
mailing list