[Python-checkins] cpython (2.7): Issue #15054: Fix incorrect tokenization of 'b' and 'br' string literals.

meador.inge <python-checkins@python.org>
Sun Jun 17 04:50:57 CEST 2012


http://hg.python.org/cpython/rev/35d3a8ed7997
changeset:   77474:35d3a8ed7997
branch:      2.7
parent:      77443:0add70dd3c43
user:        Meador Inge <meadori@gmail.com>
date:        Sat Jun 16 21:05:50 2012 -0500
summary:
  Issue #15054: Fix incorrect tokenization of 'b' and 'br' string literals.

Patch by Serhiy Storchaka.
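
For context, here is a minimal sketch (Python 2.7; not part of the patch)
of how the bug could be observed. Before this change the tokenizer's
string regexes only accepted the 'u'/'U' and 'r'/'R' prefixes, so the
leading 'b' of a byte-string literal was matched separately as a NAME
token:

    import tokenize
    from StringIO import StringIO

    # Tokenize a byte-string literal and print each token's type and text.
    for tok in tokenize.generate_tokens(StringIO("b'abc'\n").readline):
        print tokenize.tok_name[tok[0]], repr(tok[1])

    # Before the fix (prefix split off):    After the fix (one literal):
    #   NAME    'b'                           STRING  "b'abc'"
    #   STRING  "'abc'"                       ...
    #   ...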

files:
  Lib/test/test_tokenize.py |  25 +++++++++++++++++++++++++
  Lib/tokenize.py           |  10 +++++-----
  Misc/NEWS                 |   4 ++++
  3 files changed, 34 insertions(+), 5 deletions(-)


diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -278,6 +278,31 @@
     OP         '+'           (1, 32) (1, 33)
     STRING     'UR"ABC"'     (1, 34) (1, 41)
 
+    >>> dump_tokens("b'abc' + B'abc'")
+    STRING     "b'abc'"      (1, 0) (1, 6)
+    OP         '+'           (1, 7) (1, 8)
+    STRING     "B'abc'"      (1, 9) (1, 15)
+    >>> dump_tokens('b"abc" + B"abc"')
+    STRING     'b"abc"'      (1, 0) (1, 6)
+    OP         '+'           (1, 7) (1, 8)
+    STRING     'B"abc"'      (1, 9) (1, 15)
+    >>> dump_tokens("br'abc' + bR'abc' + Br'abc' + BR'abc'")
+    STRING     "br'abc'"     (1, 0) (1, 7)
+    OP         '+'           (1, 8) (1, 9)
+    STRING     "bR'abc'"     (1, 10) (1, 17)
+    OP         '+'           (1, 18) (1, 19)
+    STRING     "Br'abc'"     (1, 20) (1, 27)
+    OP         '+'           (1, 28) (1, 29)
+    STRING     "BR'abc'"     (1, 30) (1, 37)
+    >>> dump_tokens('br"abc" + bR"abc" + Br"abc" + BR"abc"')
+    STRING     'br"abc"'     (1, 0) (1, 7)
+    OP         '+'           (1, 8) (1, 9)
+    STRING     'bR"abc"'     (1, 10) (1, 17)
+    OP         '+'           (1, 18) (1, 19)
+    STRING     'Br"abc"'     (1, 20) (1, 27)
+    OP         '+'           (1, 28) (1, 29)
+    STRING     'BR"abc"'     (1, 30) (1, 37)
+
 Operators
 
     >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -70,10 +70,10 @@
 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
 # Tail end of """ string.
 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
-Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
+Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
 # Single-line ' or " string.
-String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
-               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
+String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
+               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
 
 # Because of leftmost-then-longest match semantics, be sure to put the
 # longest operators first (e.g., if = came before ==, == would get
@@ -91,9 +91,9 @@
 Token = Ignore + PlainToken
 
 # First (or only) line of ' or " string.
-ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
+ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                 group("'", r'\\\r?\n'),
-                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
+                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                 group('"', r'\\\r?\n'))
 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -67,6 +67,10 @@
 Library
 -------
 
+- Issue #15054: A bug in tokenize.tokenize that caused string literals
+  with 'b' and 'br' prefixes to be incorrectly tokenized has been fixed.
+  Patch by Serhiy Storchaka.
+
 - Issue #15036: Allow removing or changing multiple items in
   single-file mailboxes (mbox, MMDF, Babyl) flushing the mailbox
   between the changes.
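
The heart of the fix is widening the optional prefix class in these
regexes from [uU] to [uUbB]. As a quick standalone check (again not part
of the patch), the new single-line pattern accepts every affected
spelling of the prefix:

    import re

    # The widened single-quote String pattern from the patch: an optional
    # u/U/b/B, then an optional r/R, before the opening quote.
    String = re.compile(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'")

    for lit in ["'abc'", "u'abc'", "b'abc'", "br'abc'", "bR'abc'", "BR'abc'"]:
        print lit, bool(String.match(lit))   # True for all of these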

-- 
Repository URL: http://hg.python.org/cpython

