[Python-checkins] r74240 - in python/trunk: Doc/library/re.rst Lib/test/test_re.py Misc/NEWS Modules/_sre.c

Tue Jul 28 22:35:04 CEST 2009

Author: mark.dickinson
Date: Tue Jul 28 22:35:03 2009
New Revision: 74240

Log:
Issue #6561: '\d' regular expression should not match characters of
category [No]; only those of category [Nd].  (Backport of r74237
from py3k.)


Modified:
   python/trunk/Doc/library/re.rst
   python/trunk/Lib/test/test_re.py
   python/trunk/Misc/NEWS
   python/trunk/Modules/_sre.c

Modified: python/trunk/Doc/library/re.rst
==============================================================================

--- python/trunk/Doc/library/re.rst	(original)
+++ python/trunk/Doc/library/re.rst	Tue Jul 28 22:35:03 2009
@@ -332,7 +332,8 @@
 ``\d``
    When the :const:`UNICODE` flag is not specified, matches any decimal digit; this
    is equivalent to the set ``[0-9]``.  With :const:`UNICODE`, it will match
-   whatever is classified as a digit in the Unicode character properties database.
+   whatever is classified as a decimal digit in the Unicode character properties
+   database.
 
 ``\D``
    When the :const:`UNICODE` flag is not specified, matches any non-digit

Modified: python/trunk/Lib/test/test_re.py
==============================================================================
--- python/trunk/Lib/test/test_re.py	(original)
+++ python/trunk/Lib/test/test_re.py	Tue Jul 28 22:35:03 2009
@@ -636,6 +636,27 @@
         self.assertEqual(iter.next().span(), (4, 4))
         self.assertRaises(StopIteration, iter.next)
 
+    def test_bug_6561(self):
+        # With re.UNICODE, '\d' should match characters in Unicode
+        # category 'Nd' (Number, Decimal Digit), but not those in 'Nl'
+        # (Number, Letter) or 'No' (Number, Other).
+        decimal_digits = [
+            u'\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
+            u'\u0e58', # '\N{THAI DIGIT EIGHT}', category 'Nd'
+            u'\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
+            ]
+        for x in decimal_digits:
+            self.assertEqual(re.match('^\d$', x, re.UNICODE).group(0), x)
+
+        not_decimal_digits = [
+            u'\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
+            u'\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
+            u'\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
+            u'\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
+            ]
+        for x in not_decimal_digits:
+            self.assertIsNone(re.match('^\d$', x, re.UNICODE))
+
     def test_empty_array(self):
         # SF buf 1647541
         import array

Modified: python/trunk/Misc/NEWS
==============================================================================
--- python/trunk/Misc/NEWS	(original)
+++ python/trunk/Misc/NEWS	Tue Jul 28 22:35:03 2009
@@ -1205,6 +1205,10 @@
 Extension Modules
 -----------------
 
+- Issue #6561: '\d' in a regex now matches only characters with
+  Unicode category 'Nd' (Number, Decimal Digit).  Previously it also
+  matched characters with category 'No'.
+
 - Issue #1523: Remove deprecated overflow wrapping for struct.pack
   with an integer format code ('bBhHiIlLqQ').  Packing an out-of-range
   integer now consistently raises struct.error.

Modified: python/trunk/Modules/_sre.c
==============================================================================
--- python/trunk/Modules/_sre.c	(original)
+++ python/trunk/Modules/_sre.c	Tue Jul 28 22:35:03 2009
@@ -172,7 +172,7 @@
 
 #if defined(HAVE_UNICODE)
 
-#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
+#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))