http://hg.python.org/cpython/rev/fc89e09ca2fc changeset: 75339:fc89e09ca2fc branch: 2.7 parent: 75336:eb88cc90cc56 user: Ezio Melotti <ezio.melotti@gmail.com> date: Wed Feb 29 11:40:00 2012 +0200 summary: #10713: Improve documentation for \b and \B and add a few tests. Initial patch and tests by Martin Pool. files: Doc/library/re.rst | 15 ++++++++++----- Lib/test/test_re.py | 26 ++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/Doc/library/re.rst b/Doc/library/re.rst --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -325,14 +325,19 @@ Matches the empty string, but only at the beginning or end of a word. A word is defined as a sequence of alphanumeric or underscore characters, so the end of a word is indicated by whitespace or a non-alphanumeric, non-underscore character. - Note that ``\b`` is defined as the boundary between ``\w`` and ``\W``, so the - precise set of characters deemed to be alphanumeric depends on the values of the - ``UNICODE`` and ``LOCALE`` flags. Inside a character range, ``\b`` represents - the backspace character, for compatibility with Python's string literals. + Note that formally, ``\b`` is defined as the boundary between a ``\w`` and + a ``\W`` character (or vice versa), or between ``\w`` and the beginning/end + of the string, so the precise set of characters deemed to be alphanumeric + depends on the values of the ``UNICODE`` and ``LOCALE`` flags. + For example, ``r'\bfoo\b'`` matches ``'foo'``, ``'foo.'``, ``'(foo)'``, + ``'bar foo baz'`` but not ``'foobar'`` or ``'foo3'``. + Inside a character range, ``\b`` represents the backspace character, for compatibility with Python's string literals. ``\B`` Matches the empty string, but only when it is *not* at the beginning or end of a - word. This is just the opposite of ``\b``, so is also subject to the settings + word. This means that ``r'py\B'`` matches ``'python'``, ``'py3'``, ``'py2'``, + but not ``'py'``, ``'py.'``, or ``'py!'``. 
+ ``\B`` is just the opposite of ``\b``, so is also subject to the settings of ``LOCALE`` and ``UNICODE``. ``\d`` diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -373,6 +373,32 @@ self.assertEqual(re.search(r"\d\D\w\W\s\S", "1aa! a", re.UNICODE).group(0), "1aa! a") + def test_string_boundaries(self): + # See http://bugs.python.org/issue10713 + self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1), + "abc") + # There's a word boundary at the start of a string. + self.assertTrue(re.match(r"\b", "abc")) + # A non-empty string includes a non-boundary zero-length match. + self.assertTrue(re.search(r"\B", "abc")) + # There is no non-boundary match at the start of a string. + self.assertFalse(re.match(r"\B", "abc")) + # However, an empty string contains no word boundaries, and also no + # non-boundaries. + self.assertEqual(re.search(r"\B", ""), None) + # This one is questionable and different from the perlre behaviour, + # but describes current behavior. + self.assertEqual(re.search(r"\b", ""), None) + # A single word-character string has two boundaries, but no + # non-boundary gaps. + self.assertEqual(len(re.findall(r"\b", "a")), 2) + self.assertEqual(len(re.findall(r"\B", "a")), 0) + # If there are no words, there are no boundaries + self.assertEqual(len(re.findall(r"\b", " ")), 0) + self.assertEqual(len(re.findall(r"\b", "   ")), 0) + # Can match around the whitespace. + self.assertEqual(len(re.findall(r"\B", " ")), 2) + def test_bigcharset(self): self.assertEqual(re.match(u"([\u2222\u2223])", u"\u2222").group(1), u"\u2222") -- Repository URL: http://hg.python.org/cpython