[Tutor] sentence case module for comments and possible cookbook submission

Fri Sep 17 01:40:38 CEST 2004

This is a bit late but I have some suggestions inline below.

Kent

At 05:05 AM 9/10/2004 -0400, Brian van den Broek wrote:
>#! /usr/bin/env python
># sentence_caser.py
># Version 0.1
># Brian van den Broek
># bvande at po-box.mcgill.ca
># This module is released under the Python License. (See www.python.org.)
>
>punctuation_indexes = {}
>punctuation = ['!', '?']
>
>def punctuation_stripper(data_string):
>     '''punctuation_stripper(data_string) -> data_string
>
>     Stores the indexes of each type of punctuation (other than '.') in the
>     punctuation_indexes dict and replaces them with '.'s. (This makes 
> splitting
>     the string easier, and, thanks to the dict, is reversible.)'''
>
>     for mark in punctuation:
>         punctuation_indexes[mark] = []
>         offset = 0
>         while True:
>             try:
>                 i = data_string.index(mark, offset)
>                 punctuation_indexes[mark].append(i)
>                 offset = i + 1
>             except ValueError:
>                 break
>         data_string = data_string.replace(mark, '.')
>     return data_string

I think you can do without this entirely with a better use of split; see below

>def change_to_sentence_case(sentence_list):
>     '''change_to_sentence_case(sentence_list) -> cap_sentences_list
>
>     Takes a list of sentence strings and transforms it so that the first and
>     only the first) letter is capitalized. It is a bit more complicated than
>     just calling the capitalize string method as the strings in the sentence
>     list may well start with ' ', '(', '[', etc. The while loop travels the
>     string, looking for the first letter and calling capitalize on the
>     substring it commences. restore_Is() is also called, in an attempt to 
> undo
>     lower-casing of the pronoun "I".'''
>
>     cap_sentences_list = []
>     for s in sentence_list:
>         offset = 0
>         while offset < len(s):

This would be more idiomatic:
for offset in range(len(s)):

It's tempting to use re.search here but that's probably overkill. Or you 
could use re.sub with a class instance as the replacement argument, one 
that keeps enough state to capitalize just the first alpha and lower-case 
the rest :-)

>             if s[offset].isalpha():
>                 s = s[:offset] + s[offset:].capitalize()
>                 break
>             offset += 1
>         s += '.'

You don't need this extra period, see below

>         s = restore_Is(s)
>         cap_sentences_list.append(s)
>     return cap_sentences_list
>
>def restore_Is(sentence):
>     '''restore_Is(sentence) -> sentence
>
>     Takes a sentence string and tries to restore any "I"s incorrectly changed
>     to "i"s by change_to_sentence_case()'s use of .capitalize().'''
>
>     sentence = sentence.replace(' i ', ' I ')
>     sentence = sentence.replace(' i,', ' I,')
>     sentence = sentence.replace(' i.', ' I.')
>     return sentence

You could do this very nicely with a regular expression and catch i? i! i; 
etc as well:
 >>> sentence = "i'll i will i?"
 >>> import re
 >>> sentence = re.sub(r'\bi\b', 'I', sentence)
 >>> sentence
"I'll I will I?"

The regular expression says to match an 'i' that is a word by itself.

>def restore_punctuation(data_sentences):
>     '''restore_punctuation(data_sentences) -> data_sentences
>
>     Consulting the punctuation_indexes dict, restore_punctuation() reverts
>     non '.' punctuation that was changed to '.' to facilitate splitting the
>     string.'''
>     for mark in punctuation:
>         for i in punctuation_indexes[mark]:
>             data_sentences = data_sentences[:i] + mark + data_sentences[i 
> + 1:]
>     return data_sentences

You don't need this either, see below

>def sentence_caser(data_string):
>     '''sentence_caser(data_string) -> data_string
>
>     Takes a string and returns it into sentence case (it is hoped). To do so,
>     it runs it through various helper functions. sentence_caser() does almost
>     no work on its own; consult the functions punctuation_stripper(),
>     change_to_sentence_case(), and restore_punctuation() for details of the
>     processing.'''
>
>     working_data = punctuation_stripper(data_string)
>     data_sentences_list = working_data.split('.')
>     data_sentences_list = change_to_sentence_case(data_sentences_list)
>     data_sentences = ''.join(data_sentences_list)
>     data_sentences = restore_punctuation(data_sentences)
>
>     data_sentences = data_sentences[:len(data_string)]
>     # To remove possibly spurious trailing '.' added when original string 
> ended
>     # with non-'.' character (as in data below).
>
>     return data_sentences

If you use
data_sentences_list = re.split(r'([.!?])', data_string)
you will split on all the punctuation you care about. Because the pattern 
is wrapped in a capturing group, re.split also returns each punctuation 
mark as its own entry in the sentence list, so when you rejoin the 
sentences you will get the correct punctuation back. Then 
punctuation_stripper, restore_punctuation and the hack with the extra 
period are not needed. This whole block of code reduces to:

     data_sentences_list = re.split(r'([.!?])', data_string)
     data_sentences_list = change_to_sentence_case(data_sentences_list)
     data_sentences = ''.join(data_sentences_list)
     return data_sentences

>if __name__ == '__main__':
>     data = '''STRINGS IN ALL CAPS ARE HARD TO READ! SOME PEOPLE THINK 
> THEY ARE
>LIKE SHOUTING. DO YOU THINK SO? I ONLY WRITE THEM WHEN I HAVE A CAPS-LOCK
>ACCIDENT. (OR WHEN CREATING TEST DATA.) THEY ARE NO FUN. (OK, ENOUGH NOW.)'''
>     print data
>     print
>     print sentence_caser(data)

Here's the whole rewrite:

#! /usr/bin/env python
# sentence_caser.py
# Version 0.1
# Brian van den Broek
# bvande at po-box.mcgill.ca
# This module is released under the Python License. (See www.python.org.)

import re

def change_to_sentence_case(sentence_list):
    '''change_to_sentence_case(sentence_list) -> cap_sentences_list

    Return a new list in which each string of sentence_list has been put
    into sentence case.

    Calling str.capitalize() on the whole string is not enough, because a
    piece may well start with ' ', '(', '[', etc. Instead, each string is
    scanned for its first alphabetic character and capitalize() is applied
    to the substring starting there: that character is upper-cased and
    everything after it is lower-cased. A string that contains no letters
    (for example a lone '.') is not altered by this step.

    Every string is finally passed through restore_Is(), in an attempt to
    undo the lower-casing of the pronoun "I".'''

    cap_sentences_list = []
    for s in sentence_list:
        # enumerate() iterates over the original string object; s is only
        # rebound immediately before leaving the loop, so this is safe.
        for offset, char in enumerate(s):
            if char.isalpha():
                s = s[:offset] + s[offset:].capitalize()
                break
        cap_sentences_list.append(restore_Is(s))
    return cap_sentences_list

def restore_Is(sentence):
    '''restore_Is(sentence) -> sentence

    Return sentence with every stand-alone lower-case "i" replaced by "I".

    "Stand-alone" means an "i" with a word boundary on both sides, so
    "i'll i will i?" becomes "I'll I will I?", while the "i" inside words
    such as "is" or "it" is left untouched. This repairs pronouns that
    change_to_sentence_case() lower-cased through its use of .capitalize().'''

    lone_i = re.compile(r'\bi\b')
    return lone_i.sub('I', sentence)

def sentence_caser(data_string):
    '''sentence_caser(data_string) -> data_string

    Return data_string converted to sentence case (it is hoped).

    The text is split on '.', '!' and '?'. The capturing group in the
    pattern makes re.split() keep each punctuation mark as its own list
    entry. The pieces are handed to change_to_sentence_case() and the
    results are concatenated again, which puts the original punctuation
    back in place. sentence_caser() does almost no work on its own; see
    change_to_sentence_case() for the details of the processing.'''

    pieces = re.split(r'([.!?])', data_string)
    return ''.join(change_to_sentence_case(pieces))

if __name__ == '__main__':
    # Demo: print an all-caps sample, a blank line, and then the
    # sentence-cased version produced by sentence_caser().
    data = '''STRINGS IN ALL CAPS ARE HARD TO READ! SOME PEOPLE THINK THEY ARE
LIKE SHOUTING. DO YOU THINK SO? I ONLY WRITE THEM WHEN I HAVE A CAPS-LOCK
ACCIDENT. (OR WHEN CREATING TEST DATA.) THEY ARE NO FUN. (OK, ENOUGH NOW.)'''
    # Single-argument print(...) calls produce the same output as the old
    # print statement on Python 2 and are valid function calls on Python 3.
    # print('') is used for the blank line because a bare print() would
    # output "()" on Python 2.
    print(data)
    print('')
    print(sentence_caser(data))