[Python-checkins] python/dist/src/Lib/test test_unicode.py,1.47.6.5,1.47.6.6

lemburg@users.sourceforge.net lemburg@users.sourceforge.net
Tue, 24 Sep 2002 07:06:58 -0700


Update of /cvsroot/python/python/dist/src/Lib/test
In directory usw-pr-cvs1:/tmp/cvs-serv25610/Lib/test

Modified Files:
      Tag: release22-maint
	test_unicode.py 
Log Message:
Backport the UTF-8 codec from 2.3 and add a work-around that lets the
UTF-8 decoder accept the broken UTF-8 sequences that encode lone
high surrogates (versions before 2.2.2 forgot to generate the
UTF-8 prefix byte \xed for these).

Fixes SF bug #610783: Lone surrogates cause bad .pyc files.



Index: test_unicode.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/test/test_unicode.py,v
retrieving revision 1.47.6.5
retrieving revision 1.47.6.6
diff -C2 -d -r1.47.6.5 -r1.47.6.6
*** test_unicode.py	23 Sep 2002 20:49:43 -0000	1.47.6.5
--- test_unicode.py	24 Sep 2002 14:06:55 -0000	1.47.6.6
***************
*** 554,570 ****
  
  # UTF-8 specific encoding tests:
! verify(u'\u20ac'.encode('utf-8') == \
!        ''.join((chr(0xe2), chr(0x82), chr(0xac))) )
! verify(u'\ud800\udc02'.encode('utf-8') == \
!        ''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))) )
! verify(u'\ud84d\udc56'.encode('utf-8') == \
!        ''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))) )
  # UTF-8 specific decoding tests
! verify(unicode(''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))),
!                'utf-8') == u'\U00023456' )
! verify(unicode(''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))),
!                'utf-8') == u'\U00010002' )
! verify(unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))),
!                'utf-8') == u'\u20ac' )
  
  # Other possible utf-8 test cases:
--- 554,591 ----
  
  # UTF-8 specific encoding tests:
! verify(u''.encode('utf-8') == '')
! verify(u'\u20ac'.encode('utf-8') == '\xe2\x82\xac')
! verify(u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82')
! verify(u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96')
! verify(u'\ud800'.encode('utf-8') == '\xed\xa0\x80')
! verify(u'\udc00'.encode('utf-8') == '\xed\xb0\x80')
! verify((u'\ud800\udc02'*1000).encode('utf-8') ==
!        '\xf0\x90\x80\x82'*1000)
! verify(u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
!        u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
!        u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
!        u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
!        u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
!        u' Nunstuck git und'.encode('utf-8') ==
!        '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
!        '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
!        '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
!        '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
!        '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
!        '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
!        '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
!        '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
!        '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
!        '\xe3\x80\x8cWenn ist das Nunstuck git und')
! 
  # UTF-8 specific decoding tests
! verify(unicode('\xf0\xa3\x91\x96', 'utf-8') == u'\U00023456' )
! verify(unicode('\xf0\x90\x80\x82', 'utf-8') == u'\U00010002' )
! verify(unicode('\xe2\x82\xac', 'utf-8') == u'\u20ac' )
! # test UTF-8 2.2.1 bug work-around
! verify(unicode('\xa0\x80', 'utf-8') == u'\ud800' )
! verify(unicode('\xaf\xbf', 'utf-8') == u'\udbff' )
! verify(unicode('\xed\xb0\x80', 'utf-8') == u'\udc00' )
! verify(unicode('\xed\xbf\xbf', 'utf-8') == u'\udfff' )
  
  # Other possible utf-8 test cases: