[Tutor] Processing Gutenberg texts
Simon Brunning
SBrunning@trisystems.co.uk
Thu, 10 Aug 2000 12:09:48 +0100
I've put together a simple module for working with Gutenberg text files (see
below), and I'd appreciate some advice. (I'm new to Python, so be gentle!)
The problem is that Gutenberg texts are not actually all formatted the same
way, so what started out as a fairly simple and clean process has turned
into a monster, and I'd like to simplify things.
The ugliest piece of code has to be the bit which looks for the title and
author, but *any* suggestions as to improvements would be welcome. Thanks.
# pyGut.py - a class for converting Gutenberg formatted text files
# to more user-friendly HTML.
#
# Requires HTMLgen - thanks, Robin!
# A huge number of Gutenberg texts are available from http://promo.net/pg/
# Simon Brunning - sbrunning@bigfoot.com
# BTW, this tool totally mangles poetry, and I can't think of anything to
# do about it.
import sys, os, string
from HTMLgen import HTMLgen
class GutDocument:
    '''Gutenberg to HTML conversion.

    Use importText to load a Gutenberg formatted text file and
    exportHTML to create the HTML file.  Call clear before loading
    another text into the same instance.

    Attributes:
        title, author -- taken from the 'Project Gutenberg Etext of'
                         line; 'No title' / 'No author' until found.
        textBlocks    -- list of paragraphs, one string per paragraph.
        tocLines      -- maps an index into textBlocks to the text of a
                         paragraph that starts with a tocTriggers word.
        tocTriggers   -- lower-case words marking a table of contents
                         entry when a paragraph starts with one of them.
        lineBreaks    -- lines that always terminate a paragraph.
    '''

    # Text that introduces the title (and usually the author).
    _titleMarker = 'Project Gutenberg Etext of'
    # Text found on the line that closes the front matter.
    _frontMatterEnd = '*END*'
    # Separator between title and author on the title line.
    _authorSeparator = ' by '

    def __init__(self):
        self.tocTriggers = ['chapter', 'act', 'stave', 'verse',
                            'preface', 'introduction', 'appendix',
                            'book', 'dedication', 'canto',
                            'dedicatory']
        self.lineBreaks = ['\n', '\r\n', '\r']
        # All per-document state lives in clear() so that a fresh and a
        # re-used instance start out identical.
        self.clear()

    def importText(self, gutFile):
        '''Read the text file named gutFile and store its paragraphs.

        Lines are fed to storeFragment one at a time.  If no paragraph
        was stored (for instance because the end of the front matter
        was never found) a single 'No text found.' block is stored.
        Errors from open() and from reading propagate to the caller.
        '''
        importFile = open(gutFile, 'r')
        try:
            for inLine in importFile:
                self.storeFragment(inLine)
        finally:
            # Release the handle even when a line cannot be processed.
            importFile.close()
        # A file that does not end with a blank line still has its last
        # paragraph pending.
        self._endParagraph()
        if not self.textBlocks:
            self.textBlocks.append('No text found.')

    def storeFragment(self, fragment):
        '''Process one line of the text.

        Until the front matter is finished the line is only inspected
        for the title/author and the end-of-front-matter marker.
        Afterwards non-blank lines extend the current paragraph, and a
        blank line (empty, whitespace only, or listed in lineBreaks)
        finishes it.
        '''
        if not self.finishedFrontMatter:
            self._scanFrontMatter(fragment)
            return
        stripped = fragment.strip()
        if stripped and fragment not in self.lineBreaks:
            self.textBlock.append(stripped)
        else:
            self._endParagraph()

    def _scanFrontMatter(self, fragment):
        '''Look for the title line and the end of the front matter.'''
        if not self.titleFound \
           and fragment.find(self._titleMarker) != -1:
            self._parseTitleLine(fragment)
            self.titleFound = 1
        if fragment.find(self._frontMatterEnd) != -1:
            self.finishedFrontMatter = 1

    def _parseTitleLine(self, line):
        '''Extract title and author from the line holding the marker.

        Everything after the marker is read as '<title>[,] by <author>'.
        The split is made at the last ' by ' so that a title which
        itself contains ' by ' stays intact; this is a heuristic.
        Empty results leave the current title/author untouched.
        '''
        text = line.replace('*', '')
        start = text.find(self._titleMarker) + len(self._titleMarker)
        text = text[start:].strip()
        title = text
        separatorAt = text.rfind(self._authorSeparator)
        if separatorAt != -1:
            title = text[:separatorAt]
            author = text[separatorAt + len(self._authorSeparator):].strip()
            if author:
                self.author = author
        title = title.strip()
        if title.endswith(','):  # 'Title, by Author' leaves the comma
            title = title[:-1].rstrip()
        if title:
            self.title = title

    def _endParagraph(self):
        '''Store the pending paragraph, if any, and note ToC entries.'''
        if not self.textBlock:
            return
        paragraph = ' '.join(self.textBlock)
        self.textBlock = []
        self.textBlocks.append(paragraph)
        lowered = paragraph.lower()
        for tocTrigger in self.tocTriggers:
            if lowered.startswith(tocTrigger):
                self.tocLines[len(self.textBlocks) - 1] = paragraph
                break

    def exportHTML(self, htmlFileUrl):
        '''Build an HTMLgen document from the stored text and write it.

        The page holds the title and author headings, a table of
        contents linking to every entry of tocLines, and then every
        text block.  Blocks listed in tocLines become named anchors
        followed by 'Top', 'Previous' and 'Next' links.  The document
        is written with its write method, passing htmlFileUrl.
        '''
        exportFile = HTMLgen.BasicDocument(title=self.title)
        exportFile.append(HTMLgen.Heading(1, self.title))   # Heading
        exportFile.append(HTMLgen.Heading(2, self.author))  # Author
        exportFile.append(HTMLgen.Name('toc', 'Table of Contents:'))
        exportFile.append(HTMLgen.BR())

        sortedTocLines = sorted(self.tocLines)
        # Position of each ToC block within sortedTocLines, used to find
        # its neighbours without searching the list for every block.
        tocPosition = {}
        for position, tocLine in enumerate(sortedTocLines):
            tocPosition[tocLine] = position
            entry = HTMLgen.Href('#' + str(tocLine), self.tocLines[tocLine])
            exportFile.append(entry)
            exportFile.append(HTMLgen.BR())
        exportFile.append(HTMLgen.HR())

        # Every block is written, including the last one; otherwise the
        # 'No text found.' placeholder could never appear in the output.
        for blockIndex, text in enumerate(self.textBlocks):
            if blockIndex not in self.tocLines:  # Ordinary text
                exportFile.append(HTMLgen.Text(text))
            else:                                # ToC'ed line
                # Add the anchor & text
                exportFile.append(HTMLgen.Name(str(blockIndex), text))
                exportFile.append(HTMLgen.BR())
                exportFile.append(HTMLgen.Href('#toc', 'Top'))
                tocElement = tocPosition[blockIndex]
                if tocElement > 0:                         # Previous
                    url = '#' + str(sortedTocLines[tocElement - 1])
                    exportFile.append(HTMLgen.Href(url, 'Previous'))
                if tocElement < len(sortedTocLines) - 1:   # Next
                    url = '#' + str(sortedTocLines[tocElement + 1])
                    exportFile.append(HTMLgen.Href(url, 'Next'))
            exportFile.append(HTMLgen.BR(2))
        exportFile.write(htmlFileUrl)

    def clear(self):
        '''Forget the current text so the instance can be re-used.

        tocTriggers and lineBreaks are configuration and are kept.
        '''
        self.title = 'No title'
        self.author = 'No author'
        self.textBlocks = []
        self.tocLines = {}
        self.finishedFrontMatter = 0
        self.titleFound = 0
        self.textBlock = []   # Lines of the paragraph being collected

    def __str__(self):
        '''Return the ToC and the text blocks, one item per line.'''
        return os.linesep.join(['ToC:',
                                str(self.tocLines),
                                'Text Blocks:',
                                str(self.textBlocks)])

    def __len__(self):
        '''Return the number of stored text blocks.'''
        return len(self.textBlocks)

    def pprint(self):
        '''Pretty-print the ToC and the text blocks to standard output.'''
        import pprint
        print('ToC:')
        pprint.pprint(self.tocLines)
        print('Text Blocks:')
        pprint.pprint(self.textBlocks)
def batchConversion(directory):
    '''Convert every '*.txt' file in directory to an HTML file.

    Each output file sits next to its source and has the same name
    with the extension replaced by '.html'.  One GutDocument is
    re-used for all files and cleared after each conversion.
    '''
    import glob
    # os.path.join supplies the separator of the running platform.
    pattern = os.path.join(directory, '*.txt')
    doc = GutDocument()
    for textFile in glob.glob(pattern):
        # Swap only the extension: a '.txt' elsewhere in the path is
        # left alone, and an upper-case '.TXT' still yields a distinct
        # output name instead of overwriting the source.
        htmlFile = os.path.splitext(textFile)[0] + '.html'
        doc.importText(textFile)
        doc.exportHTML(htmlFile)
        doc.clear()
if __name__ == '__main__':  # Running as a script
    # Expect the Gutenberg text file and the HTML file to create.
    # Extra arguments are ignored.
    if len(sys.argv) < 3:
        sys.stderr.write('Usage: %s <gutenberg text file> <html file>\n'
                         % sys.argv[0])
        sys.exit(2)
    gutDocument = GutDocument()
    gutDocument.importText(sys.argv[1])
    gutDocument.exportHTML(sys.argv[2])
else:  # Imported
    # Parenthesised so the line parses as a print statement and as a
    # print call; the output is the same either way.
    print('Module pyGut imported.')
Cheers,
Simon Brunning
TriSystems Ltd.
sbrunning@trisystems.co.uk
-----------------------------------------------------------------------
The information in this email is confidential and may be legally privileged.
It is intended solely for the addressee. Access to this email by anyone else
is unauthorised. If you are not the intended recipient, any disclosure,
copying, distribution, or any action taken or omitted to be taken in
reliance on it, is prohibited and may be unlawful. TriSystems Ltd. cannot
accept liability for statements made which are clearly the sender's own.