[Python-checkins] r46987 - sandbox/trunk/Doc/func-example.py

andrew.kuchling python-checkins at python.org
Fri Jun 16 02:42:16 CEST 2006


Author: andrew.kuchling
Date: Fri Jun 16 02:42:15 2006
New Revision: 46987

Added:
   sandbox/trunk/Doc/func-example.py   (contents, props changed)
Log:
Add larger example for functional HOWTO; I'll dissect it in a section to be written

Added: sandbox/trunk/Doc/func-example.py
==============================================================================
--- (empty file)
+++ sandbox/trunk/Doc/func-example.py	Fri Jun 16 02:42:15 2006
@@ -0,0 +1,199 @@
+#!/Users/andrewk/source/p/python/python.exe
+
+# Example: indexer that records info about all the files in a directory tree.
+
+import os, sys
+import itertools, functools
+import cPickle
+
+#
+# Management of the list of indexing functions.
+#
+
+# Maps a filename extension, including its leading dot (e.g. '.txt'), to
+# the indexing function registered for it.  Filled in by register().
+_indexers = {}
+
+def register(ext, func):
+    """Registers the function 'func'
+
+    >>> is_indexable_filename('foo.jpg')
+    False
+    >>> register('jpg', None)
+    >>> is_indexable_filename('foo.jpg')
+    True
+    >>> _indexers.clear()
+    """
+    _indexers['.' + ext] = func
+
+def is_indexable_filename (fn):
+    """Returns true if there's an indexer available for the given filename.
+
+    >>> register('txt', None)
+    >>> is_indexable_filename('foo.txt')
+    True
+    >>> is_indexable_filename('foo.jpg')
+    False
+    """
+    base, ext = os.path.splitext(fn)
+    return _indexers.has_key(ext)
+
+def is_ignorable_directory (dirname):
+    """Return true if the directory with the given name shouldn't be scanned.
+
+    >>> is_ignorable_directory('.svn')
+    True
+    >>> is_ignorable_directory('text')
+    False
+    """
+    return (dirname in ('.svn', 'CVS'))
+
+def remove_punctuation (word):
+    """Removes leading and trailing punctuation characters from a word.
+    May return the empty string.
+
+    >>> remove_punctuation('test')
+    'test'
+    >>> remove_punctuation('comma,')
+    'comma'
+    >>> remove_punctuation('()')
+    ''
+    """
+    word = word.strip(',.?!"\'()[]#*\\')
+    return word
+
+#
+# Functions for indexing directories and files
+#
+
+def index (*args):
+    """Index the directory trees rooted at the specified paths.
+    Can take any number of arguments.
+    Returns the index data structure.
+    """
+    idx = load_index()
+    for path in args:
+        index_tree(idx, path)
+    save_index(idx)
+    return idx
+
+def index_tree (idx, path):
+    """Index the contents of the files in the directory tree rooted at 'path'.
+    """
+    for dirpath, dirnames, filenames in os.walk(path):
+        # Remove ignorable directories
+        for d in list(dirnames):
+            if is_ignorable_directory(d):
+                dirnames.remove(d)
+
+        # Discard uninteresting filenames
+        filenames = [fn for fn in filenames
+                     if is_indexable_filename(fn)]
+
+        # Index files        
+        for fn in filenames:
+            full_path = os.path.join(dirpath, fn)
+            index_file(idx, full_path)
+
+def index_file (idx, path):
+    """Index the contents of a single file.  It's assumed that
+    an indexing function will be found for the file's type.
+    """
+    assert is_indexable_filename(path)
+    base, ext = os.path.splitext(path)
+    
+    indexer = _indexers[ext]
+    record_func = functools.partial(record, idx)
+    indexer(path, record_func)
+
+
+#
+# Index data structure
+#
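+# The index is a big dictionary of dictionaries:
+#   { word => { (filename, line number): 1 } }
+# Only the keys of the inner dictionaries matter: they act as a set of
+# (filename, line number) tuples, so each location is stored once.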
+# 
+
+def lookup (idx, word):
+    """Return an iterator over the files and lines containing the requested
+    word.
+    """
+    for file, line in idx.get(word, []):
+        yield (file, line)
+    
+def record (idx, word, path, line=None):
+    """Add an index entry for the given word, using the specified path
+    and line number.  The line number can be None.
+
+    >>> record({}, 'word', '/path', None)
+    {'word': {('/path', None): 1}}
+    >>> record({}, 'word', '/path', 42)
+    {'word': {('/path', 42): 1}}
+    """
+    d = idx.setdefault(word, {})
+    key = (path, line)
+    if key not in d:
+        d[key] = 1
+    return idx
+
+def load_index ():
+    """Read index from disk.
+    """
+    index_filename = '/tmp/index'
+    if os.path.exists(index_filename):
+        input = open(index_filename, 'rb')
+        idx = cPickle.load(input)
+        input.close()
+    else:
+        idx = {}
+    
+    return idx
+
+def save_index (idx):
+    """Write index to disk.
+    """
+    output = open('/tmp/index', 'wb')
+    cPickle.dump(idx, output, -1)
+    output.close()
+    
+    import pprint
+    print len(idx), 'words in index'
+    #print idx
+    ##pprint.pprint(idx)
+
+#
+# File analysis functions
+#
+
+def text_inspector (input_file, record_func):
+    line_num = 1
+    for line in open(input_file, 'r'):
+        for word in line.split():
+            word = remove_punctuation(word.lower())
+            if word != '':
+                record_func(word, input_file, line_num)
+        line_num += 1
+
+
+if __name__ == '__main__':
+    if '-t' in sys.argv[1:]:
+        import doctest
+        doctest.testmod()
+        raise SystemExit
+
+    register('txt', text_inspector)
+    #register('jpg', jpg_inspector)
+    #register('gif', gif_inspector)
+
+    idx = index(*sys.argv[1:])
+
+    # Look up a word
+    for filename, line in lookup(idx, 'the'):
+        print filename, line
+    
+
+# Exercises:
+# * Matching lines are output in random order.  Output them in sorted order.
+#   [5] (One-line change)
+# * Use itertools.groupby() for better output, i.e. file.txt: 1 3 4 5
+#   [10]
+# * Remove file entries before adding new ones. [15]
+


More information about the Python-checkins mailing list