[Tutor] Processing Gutenburg texts

Simon Brunning SBrunning@trisystems.co.uk
Thu, 10 Aug 2000 12:09:48 +0100


I've put together a simple module for working with Gutenberg text files (see
below), and I'd appreciate some advice. (I'm new to Python, so be gentle!)
The problem is that Gutenberg texts are not all formatted the same
way, so what started out as a fairly simple and clean process has turned
into a monster, and I'd like to simplify things.

The ugliest piece of code has to be the bit which looks for the title and
author, but *any* suggestions as to improvements would be welcome. Thanks.

# pyGut.py - a class for converting Gutenberg formatted text files
# to more user-friendly HTML.
#
# Requires HTMLgen - thanks, Robin!
# A huge number of Gutenberg texts are available from http://promo.net/pg/
# Simon Brunning - sbrunning@bigfoot.com
# BTW, this tool totally mangles poetry, and I can't think of anything to
# do about it.

import sys, os, string
from HTMLgen import HTMLgen

class GutDocument:
    '''Gutenberg to HTML conversion.

    Use importText to load a Gutenberg formatted text file, and
    exportHTML to create the HTML file.

    Public state:
      title, author        - taken from the front matter, or the
                             'No title' / 'No author' defaults.
      textBlocks           - list of paragraphs, each one string.
      tocLines             - dict mapping an index into textBlocks to the
                             text of a paragraph that starts with one of
                             tocTriggers.
      textBlock            - lines of the paragraph being collected.
      finishedFrontMatter  - 1 once a line containing '*END*' was seen.
      titleFound           - 1 once the title line was parsed.
      tocTriggers          - lower-case prefixes that make a paragraph a
                             table-of-contents entry.
      lineBreaks           - fragments treated as a blank line.
    '''

    # Text that precedes the title (and optional author) in the front matter.
    _titleMarker = 'Project Gutenberg Etext of'
    # Text marking the last line of the front matter.
    _frontMatterEnd = '*END*'
    # Separator between title and author on the title line.
    _authorSeparator = ' by '

    def __init__(self):
        '''Create an empty document with the default ToC triggers.'''
        self.tocTriggers = ['chapter', 'act', 'stave', 'verse',
                            'preface', 'introduction', 'appendix',
                            'book', 'dedication', 'canto',
                            'dedicatory']
        self.lineBreaks = ['\n', '\r\n', '\r']
        self.clear()

    def importText(self, gutFile):
        '''Read the file named gutFile and store its paragraphs.

        Every line is passed to storeFragment.  A paragraph still open at
        end of file is stored as well.  If no paragraph was stored at all
        (for example because the end of the front matter was never found),
        the single block 'No text found.' is stored instead.
        '''
        importFile = open(gutFile, 'r')
        try:
            for inLine in importFile:
                self.storeFragment(inLine)
        finally:
            # Release the handle even if a line could not be processed.
            importFile.close()
        # The file need not end with a blank line; keep the last paragraph.
        self._endParagraph()
        if not self.textBlocks: # Empty, end of front-matter not found.
            self.textBlocks.append('No text found.')

    def storeFragment(self, fragment):
        '''Process one line of input.

        Until a line containing '*END*' has been seen, lines are only
        searched for the title/author.  After that, non-blank lines are
        collected into the current paragraph, and a line found in
        self.lineBreaks ends the current paragraph.
        '''
        if not self.finishedFrontMatter: # Looking for title & author
            if not self.titleFound:
                self._parseTitleLine(fragment)
            if self._frontMatterEnd in fragment:
                self.finishedFrontMatter = 1
        elif fragment not in self.lineBreaks: # Continue paragraph
            stripped = fragment.strip()
            if stripped: # Whitespace-only lines are ignored
                self.textBlock.append(stripped)
        else: # End of paragraph
            self._endParagraph()

    def _parseTitleLine(self, fragment):
        '''Set title (and author, if given) from a front matter line.

        Does nothing unless fragment contains the title marker followed
        by some text.  Only the text after the marker is used, so words
        inside the title itself (such as a leading 'The') are kept.
        '''
        start = fragment.find(self._titleMarker)
        if start == -1:
            return
        heading = fragment[start + len(self._titleMarker):]
        heading = heading.replace('*', '').strip()
        if not heading:
            return
        # Split on the LAST ' by ' so a title that itself contains ' by '
        # cannot break the title/author split.
        split = heading.rfind(self._authorSeparator)
        if split != -1:
            author = heading[split + len(self._authorSeparator):].strip()
            heading = heading[:split].strip()
            if author:
                self.author = author
        if heading.endswith(','): # 'Title, by Author' leaves a comma behind
            heading = heading[:-1].rstrip()
        if heading:
            self.title = heading
        self.titleFound = 1

    def _endParagraph(self):
        '''Store the paragraph collected so far, if any, and start anew.

        The paragraph is also recorded in tocLines when its lower-cased
        text starts with one of self.tocTriggers.
        '''
        if not self.textBlock:
            return
        paragraph = ' '.join(self.textBlock)
        self.textBlocks.append(paragraph)
        lowered = paragraph.lower()
        for tocTrigger in self.tocTriggers: # Table of contents
            if lowered.startswith(tocTrigger):
                self.tocLines[len(self.textBlocks) - 1] = paragraph
                break
        self.textBlock = []

    def exportHTML(self, htmlFileUrl):
        '''Write the document to htmlFileUrl using HTMLgen.

        The output is a title heading, an author heading, a table of
        contents linking to every entry of tocLines, and then every text
        block.  Blocks listed in tocLines become named anchors followed
        by 'Top', 'Previous' and 'Next' links where applicable.
        '''
        exportFile = HTMLgen.BasicDocument(title=self.title)
        exportFile.append(HTMLgen.Heading(1, self.title)) # Heading
        exportFile.append(HTMLgen.Heading(2, self.author)) # Author
        exportFile.append(HTMLgen.Name('toc', 'Table of Contents:'))
        exportFile.append(HTMLgen.BR())
        sortedTocLines = sorted(self.tocLines)
        for tocLine in sortedTocLines: # Table of contents
            entry = HTMLgen.Href('#' + str(tocLine), self.tocLines[tocLine])
            exportFile.append(entry)
            exportFile.append(HTMLgen.BR())
        exportFile.append(HTMLgen.HR())
        lastTocElement = len(sortedTocLines) - 1
        # Every block is written, including the last one; otherwise the
        # 'No text found.' placeholder would never reach the output and a
        # ToC link to the final block would have no anchor.
        for textBlock in range(len(self.textBlocks)):
            text = self.textBlocks[textBlock]
            if textBlock not in self.tocLines: # Ordinary text
                exportFile.append(HTMLgen.Text(text))
            else: # toced line
                # Add the anchor & text
                exportFile.append(HTMLgen.Name(str(textBlock), text))
                exportFile.append(HTMLgen.BR())
                exportFile.append(HTMLgen.Href('#toc', 'Top')) # Link to top
                tocElement = sortedTocLines.index(textBlock)
                if tocElement > 0: # Link to previous
                    url = '#' + str(sortedTocLines[tocElement - 1])
                    exportFile.append(HTMLgen.Href(url, 'Previous'))
                if tocElement < lastTocElement: # Link to next
                    url = '#' + str(sortedTocLines[tocElement + 1])
                    exportFile.append(HTMLgen.Href(url, 'Next'))
            exportFile.append(HTMLgen.BR(2))
        exportFile.write(htmlFileUrl)

    def clear(self):
        '''Forget the imported text so the instance can be reused.

        tocTriggers and lineBreaks are left untouched.
        '''
        self.title = 'No title'
        self.author = 'No author'
        self.textBlocks = []
        self.tocLines = {}
        self.finishedFrontMatter = 0
        self.titleFound = 0
        self.textBlock = []

    def __str__(self):
        '''Return the ToC dict and the text block list, one per line.'''
        return os.linesep.join(['ToC:',
                                str(self.tocLines),
                                'Text Blocks:',
                                str(self.textBlocks)])

    def __len__(self):
        '''Return the number of stored text blocks.'''
        return len(self.textBlocks)

    def pprint(self):
        '''Pretty-print the ToC dict and the text block list to stdout.'''
        import pprint
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print('ToC:')
        pprint.pprint(self.tocLines)
        print('Text Blocks:')
        pprint.pprint(self.textBlocks)

def batchConversion(directory):
    '''Convert every *.txt file in directory to an .html file beside it.

    Each file is loaded with GutDocument.importText and written with
    GutDocument.exportHTML; the output name is the input name with its
    extension replaced by '.html'.
    '''
    import glob
    doc = GutDocument()
    # os.path.join uses the platform's separator instead of a literal '\'.
    for textFile in glob.glob(os.path.join(directory, '*.txt')):
        doc.importText(textFile)
        # Swap only the extension; '.txt' elsewhere in the path is kept.
        htmlFile = os.path.splitext(textFile)[0] + '.html'
        doc.exportHTML(htmlFile)
        doc.clear()

if __name__ == '__main__': # Running as a script
    # Expect: pyGut.py <gutenberg text file> <html file to write>
    if len(sys.argv) < 3:
        # Explain the usage instead of failing with an IndexError.
        sys.stderr.write('Usage: %s <gutenberg.txt> <output.html>\n'
                         % sys.argv[0])
        sys.exit(2)
    gutDocument = GutDocument()
    gutDocument.importText(sys.argv[1])
    gutDocument.exportHTML(sys.argv[2])
else: # Imported
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print('Module pyGut imported.')

Cheers,
Simon Brunning
TriSystems Ltd.
sbrunning@trisystems.co.uk





-----------------------------------------------------------------------
The information in this email is confidential and may be legally privileged.
It is intended solely for the addressee. Access to this email by anyone else
is unauthorised. If you are not the intended recipient, any disclosure,
copying, distribution, or any action taken or omitted to be taken in
reliance on it, is prohibited and may be unlawful. TriSystems Ltd. cannot
accept liability for statements made which are clearly the senders own.