[Scipy-svn] r5586 - in trunk/scipy/io/matlab: . tests

scipy-svn at scipy.org
Sun Feb 22 03:57:55 EST 2009


Author: matthew.brett at gmail.com
Date: 2009-02-22 02:57:41 -0600 (Sun, 22 Feb 2009)
New Revision: 5586

Added:
   trunk/scipy/io/matlab/tests/afunc.m
Modified:
   trunk/scipy/io/matlab/mio5.py
   trunk/scipy/io/matlab/miobase.py
   trunk/scipy/io/matlab/zlibstreams.py
Log:
Much faster gzipstreams are back in for the moment; also a draft of binary reading of unreadable data, as yet unused
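
(Background sketch, not part of the commit: the mio5.py change below stops decompressing each compressed variable with a single zlib.decompress call and instead streams it through a zlib decompress object, roughly as in this minimal illustration. io.BytesIO here stands in for the StringIO objects the module itself uses.)

    import zlib
    from io import BytesIO  # stand-in for StringIO, for illustration only

    payload = zlib.compress(b'x' * 100000)

    # One-shot: pull in all compressed bytes, decompress in a single call.
    data_oneshot = zlib.decompress(BytesIO(payload).read())

    # Streamed: feed fixed-size chunks through a decompress object, so only
    # one chunk of compressed input needs to be held at a time.
    fobj = BytesIO(payload)
    unzipper = zlib.decompressobj()
    chunks = []
    while True:
        raw = fobj.read(4096)
        if not raw:
            chunks.append(unzipper.flush())
            break
        chunks.append(unzipper.decompress(raw))
    assert b''.join(chunks) == data_oneshot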

Modified: trunk/scipy/io/matlab/mio5.py
===================================================================
--- trunk/scipy/io/matlab/mio5.py	2009-02-22 04:05:03 UTC (rev 5585)
+++ trunk/scipy/io/matlab/mio5.py	2009-02-22 08:57:41 UTC (rev 5586)
@@ -14,7 +14,7 @@
 import time
 import sys
 import zlib
-from zlibstreams import TwoShotZlibInputStream
+from zlibstreams import StubbyZlibInputStream
 from StringIO import StringIO
 from copy import copy as pycopy
 import warnings
@@ -23,6 +23,7 @@
 
 import scipy.sparse
 
+import byteordercodes
 from miobase import MatFileReader, MatArrayReader, MatMatrixGetter, \
      MatFileWriter, MatStreamWriter, docfiller, matdims, \
      MatReadError
@@ -230,6 +231,13 @@
         obj = np.asarray(input_array).view(cls)
 
 
+class MatlabBinaryBlock(object):
+    ''' Class to contain matlab unreadable blocks '''
+    def __init__(self, binaryblock, endian):
+        self.binaryblock = binaryblock
+        self.endian = endian
+
+
 class Mat5ArrayReader(MatArrayReader):
     ''' Class to get Mat5 arrays
 
@@ -326,6 +334,20 @@
         header['is_global'] = flags_class >> 10 & 1
         header['is_complex'] = flags_class >> 11 & 1
         header['nzmax'] = af['nzmax']
+        ''' Here I am playing with a binary block read of
+        untranslatable data. I am not using this at the moment because
+        reading it has the side effect of making opposite-endian mat
+        files unwritable on the round trip.
+        
+        if mc == mxFUNCTION_CLASS:
+            # we can't read these, and want to keep track of the byte
+            # count - so we need to avoid the following unpredictable
+            # length element reads
+            return Mat5BinaryBlockGetter(self,
+                                         header,
+                                         af,
+                                         byte_count)
+        '''
         header['dims'] = self.read_element()
         header['name'] = self.read_element().tostring()
         # maybe a dictionary mapping here as a dispatch table
@@ -354,9 +376,9 @@
 
     '''
     def __init__(self, array_reader, byte_count):
-        instr = array_reader.mat_stream.read(byte_count)
         super(Mat5ZArrayReader, self).__init__(
-            StringIO(zlib.decompress(instr)),
+            StubbyZlibInputStream(array_reader.mat_stream,
+                            byte_count),
             array_reader.dtypes,
             array_reader.processor_func,
             array_reader.codecs,
@@ -387,7 +409,6 @@
     def __init__(self, array_reader):
         self.array_reader = array_reader
         self.mat_stream = array_reader.mat_stream
-        self.data_position = self.mat_stream.tell()
         self.header = {}
         self.name = ''
         self.is_global = False
@@ -526,10 +547,50 @@
 
 
 class Mat5FunctionGetter(Mat5ObjectMatrixGetter):
-    def get_raw_array(self):
-        raise MatReadError('Cannot read matlab functions')
+    ''' Class to provide warning and message string for unreadable
+    matlab function data
+    '''
+    
+    def get_raw_array(self): raise MatReadError('Cannot read matlab functions')
 
 
+class Mat5BinaryBlockGetter(object):
+    ''' Class to read in unreadable binary blocks
+
+    This class could be used to read in matlab functions
+    '''
+
+    def __init__(self,
+                 array_reader,
+                 header,
+                 array_flags,
+                 byte_count):
+        self.array_reader = array_reader
+        self.header = header
+        self.array_flags = array_flags
+        arr_str = array_flags.tostring()
+        self.binaryblock = array_reader.mat_stream.read(
+            byte_count-len(array_flags.tostring()))
+        stream = StringIO(self.binaryblock)
+        reader = Mat5ArrayReader(
+            stream,
+            array_reader.dtypes,
+            lambda x : None,
+            array_reader.codecs,
+            array_reader.class_dtypes,
+            False)
+        self.header['dims'] = reader.read_element()
+        self.header['name'] = reader.read_element().tostring()
+        self.name = self.header['name']
+        self.is_global = header['is_global']
+
+    def get_array(self):
+        dt = self.array_reader.dtypes[miINT32]
+        endian = byteordercodes.to_numpy_code(dt.byteorder)
+        data = self.array_flags.tostring() + self.binaryblock
+        return MatlabBinaryBlock(data, endian)
+
+               
 class MatFile5Reader(MatFileReader):
     ''' Reader for Mat 5 mat files
     Adds the following attribute to base class
@@ -840,13 +901,13 @@
         self.update_matrix_tag()
 
 
-class Mat5FunctionWriter(Mat5CellWriter):
-    ''' class to write matlab functions
+class Mat5BinaryBlockWriter(Mat5MatrixWriter):
+    ''' class to write untranslatable binary blocks '''
+    def write(self):
+        # check endian
+        # write binary block as is
+        pass
 
-    Only differs from cell writing in mx class in header '''
-    default_mclass = mxFUNCTION_CLASS
-
-
 class Mat5StructWriter(Mat5CellWriter):
     ''' class to write matlab structs
 
@@ -1022,8 +1083,8 @@
                 self.unicode_strings,
                 self.long_field_names,
                 self.oned_as)
-        if isinstance(narr, MatlabFunction):
-            return Mat5FunctionWriter(*args)
+        if isinstance(narr, MatlabBinaryBlock):
+            return Mat5BinaryBlockWriter(*args)
         if isinstance(narr, MatlabObject):
             return Mat5ObjectWriter(*args)
         if narr.dtype.hasobject: # cell or struct array
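
(A rough usage sketch for the draft classes above, not exercised by the commit since the binary block read is still commented out; the import path is the in-tree one at this revision and the byte values are made up. MatlabBinaryBlock just carries the raw bytes of a variable the reader cannot translate, plus the byte order they were read with, so the Mat5BinaryBlockWriter stub could later emit them unchanged.)

    from scipy.io.matlab.mio5 import MatlabBinaryBlock

    # Hypothetical values: 16 bytes of raw variable data, little-endian.
    blk = MatlabBinaryBlock('\x00' * 16, '<')
    assert blk.endian == '<'
    assert len(blk.binaryblock) == 16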

Modified: trunk/scipy/io/matlab/miobase.py
===================================================================
--- trunk/scipy/io/matlab/miobase.py	2009-02-22 04:05:03 UTC (rev 5585)
+++ trunk/scipy/io/matlab/miobase.py	2009-02-22 08:57:41 UTC (rev 5586)
@@ -3,6 +3,7 @@
 """
 Base classes for matlab (TM) file stream reading
 """
+import warnings
 
 import numpy as np
 
@@ -447,6 +448,10 @@
             try:
                 res = getter.get_array()
             except MatReadError, err:
+                warnings.warn(
+                    'Unreadable variable "%s", because "%s"' % \
+                    (name, err),
+                    Warning, stacklevel=2)
                 res = "Read error: %s" % err
                 getter.to_next()
             mdict[name] = res
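
(Sketch of the user-visible effect, with a hypothetical file and variable name: loading a mat file that holds, say, a function handle now issues a warning naming the unreadable variable; as before, that variable comes back as a placeholder string rather than aborting the whole load.)

    from scipy.io import loadmat

    # 'contains_a_function.mat' and 'some_function' are made-up names.
    mdict = loadmat('contains_a_function.mat')
    # A warning like:  Unreadable variable "some_function", because
    # "Cannot read matlab functions"  is emitted, and
    # mdict['some_function'] would be the string
    # 'Read error: Cannot read matlab functions'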

Added: trunk/scipy/io/matlab/tests/afunc.m
===================================================================
--- trunk/scipy/io/matlab/tests/afunc.m	2009-02-22 04:05:03 UTC (rev 5585)
+++ trunk/scipy/io/matlab/tests/afunc.m	2009-02-22 08:57:41 UTC (rev 5586)
@@ -0,0 +1,4 @@
+function [a, b] = afunc(c, d)
+% A function
+a = c + 1;
+b = d + 10;

Modified: trunk/scipy/io/matlab/zlibstreams.py
===================================================================
--- trunk/scipy/io/matlab/zlibstreams.py	2009-02-22 04:05:03 UTC (rev 5585)
+++ trunk/scipy/io/matlab/zlibstreams.py	2009-02-22 08:57:41 UTC (rev 5586)
@@ -30,6 +30,7 @@
 
 '''
 
+from StringIO import StringIO
 from zlib import decompressobj
 
 
@@ -38,21 +39,53 @@
 
     >>> from StringIO import StringIO
     >>> from zlib import compress
-    >>> S = 'A handy module for reading compressed streams'
-    >>> F = StringIO(compress(S))
-    >>> ZF = ZlibInputStream(F)
-    >>> ZF.read()
+    >>> s = 'A handy module for reading compressed streams'
+    >>> cs = compress(s)
+    >>> fobj = StringIO(cs)
+    >>> zf = ZlibInputStream(fobj)
+    >>> zf.read()
     'A handy module for reading compressed streams'
-    >>> ZF.tell() == len(S)
+    >>> zf.tell() == len(s)
     True
-    >>> F = StringIO(compress(S))
-    >>> ZF = ZlibInputStream(F)
-    >>> ZF.tell()
+    >>> fobj = StringIO(cs)
+    >>> zf = ZlibInputStream(fobj)
+    >>> zf.tell()
     0
-    >>> ZF.read(6)
+    >>> zf.read(6)
     'A hand'
-    >>> ZF.tell()
+    >>> zf.tell()
     6
+
+    You can change the blocksize to conserve memory.  Here it is
+    ridiculously small for testing.
+    
+    >>> fobj = StringIO(cs)
+    >>> zf = ZlibInputStream(fobj)
+    >>> zf.default_blocksize = 3
+    >>> zf.read()
+    'A handy module for reading compressed streams'
+
+    You can set the known length of the zipped stream.  This is
+    normally used when the stream is embedded in another stream, so
+    there is no end-of-file signal when the zlib stream is finished.
+
+    >>> fobj = StringIO(cs + 'padding')
+    >>> zf = ZlibInputStream(fobj, len(cs))
+    >>> zf.default_blocksize = 3
+    >>> zf.read()
+    'A handy module for reading compressed streams'
+
+    >>> fobj = StringIO(cs + 'padding')
+    >>> zf = ZlibInputStream(fobj, len(cs))
+    >>> zf.default_blocksize = 3
+    >>> zf.read(7)
+    'A handy'
+    >>> zf.tell()
+    7
+    >>> zf.read(7)
+    ' module'
+    >>> zf.tell()
+    14
     '''
 
     default_blocksize = 16384 # 16K
@@ -71,7 +104,7 @@
         self.zipped_length=zipped_length
         self.exhausted = False
         self.unzipped_pos = 0
-        self.data = ""
+        self.data = StringIO()
         self._unzipper = decompressobj()
         # number of zlib compressed bytes read
         self._z_bytes_read = 0 
@@ -116,20 +149,23 @@
         ''' Fill self.data with at least *bytes* number of bytes
         If bytes == -1, continue until the end of the stream
 
-        Returns ``None``
+        Parameters
+        ----------
+        bytes : integer
+            Number of bytes to read from zlib stream
+            If ``bytes==-1``, read the remaining bytes in stream
+
+        Returns
+        -------
+        None
         '''
         if self.exhausted:
             return
         # read until we have enough bytes in the buffer
         read_to_end = bytes == -1
-        
-        bytes_to_fill = bytes - len(self.data)
-        if not (bytes_to_fill or read_to_end):
-            return
-        # store data chunks in a list until the end so that we avoid the
-        # quadratic behavior of continuously extending a string
-        data_chunks = [self.data]
-        while bytes_to_fill > 0 or read_to_end:
+        s_data = StringIO(self.data.read())
+        s_data.seek(0, 2) # seek to end
+        while read_to_end or (bytes - s_data.pos) > 0:
             z_n_to_fetch = self._blocksize_iterator.next()
             if z_n_to_fetch == 0:
                 self.exhausted = True
@@ -137,15 +173,14 @@
             raw = self.fileobj.read(z_n_to_fetch)
             self._z_bytes_read += len(raw)
             if raw:
-                decompressed = self._unzipper.decompress(raw)
-                data_chunks.append(decompressed)
-                bytes_to_fill -= len(decompressed)
+                s_data.write(self._unzipper.decompress(raw))
             if len(raw) < z_n_to_fetch: # hit end of file
-                data_chunks.append(self._unzipper.flush())
+                s_data.write(self._unzipper.flush())
                 self.exhausted = True
                 break
-        self.data = ''.join(data_chunks)
-
+        s_data.seek(0)
+        self.data = s_data
+        
     def seek(self, offset, whence=0):
         ''' Set position in uncompressed stream
 
@@ -199,18 +234,13 @@
             string containing read data
 
         '''
-        if bytes == -1:
+        if (bytes == -1 or
+            (self.data.len-self.data.pos) < bytes):
             self.__fill(bytes)
-            data = self.data
-            self.data = ""
-        else:
-            if len(self.data) < bytes:
-                self.__fill(bytes)
-            data = self.data[:bytes]
-            self.data = self.data[bytes:]
+        data = self.data.read(bytes)
         self.unzipped_pos += len(data)
         return data
-
+    
     def readline(self):
         ''' Read text line from data
 
@@ -220,19 +250,39 @@
         >>> from zlib import compress
         >>> S = 'A handy module\\nfor reading\\ncompressed streams'
         >>> F = StringIO(compress(S))
-        >>> ZF = ZlibInputStream(F)
-        >>> ZF.readline()
+        >>> zf = ZlibInputStream(F)
+        >>> zf.readline()
         'A handy module\\n'
-        >>> ZF.readline()
+        >>> zf.readline()
         'for reading\\n'
+
+        You can also set the block size
+        (here very small for testing)
+        
+        >>> F = StringIO(compress(S))
+        >>> zf = ZlibInputStream(F)
+        >>> zf.default_blocksize = 5
+        >>> zf.readline()
+        'A handy module\\n'
+        >>> zf.readline()
+        'for reading\\n'
         '''
         # make sure we have an entire line
-        while not self.exhausted and "\n" not in self.data:
-            self.__fill(len(self.data) + 512)
-        i = self.data.find("\n") + 1
-        if i <= 0:
-            return self.read()
-        return self.read(i)
+        data = self.data.read()
+        blocks = [data]
+        while not self.exhausted and "\n" not in data:
+            # fill results in fresh data starting at 0
+            data = self.read(512)
+            blocks.append(data)
+        data = ''.join(blocks)
+        i = data.find("\n") + 1
+        if i <= 0: # no newline; at end of stream
+            self.unzipped_pos += len(data)
+            return data
+        # new line not at end
+        self.unzipped_pos += i
+        self.data = StringIO(data[i:])
+        return data[:i]
 
     def readlines(self):
         ''' Read all data broken up into list of text lines
@@ -240,8 +290,8 @@
         >>> from zlib import compress
         >>> S = 'A handy module\\nfor reading\\ncompressed streams'
         >>> F = StringIO(compress(S))
-        >>> ZF = ZlibInputStream(F)
-        >>> ZF.readlines()
+        >>> zf = ZlibInputStream(F)
+        >>> zf.readlines()
         ['A handy module\\n', 'for reading\\n', 'compressed streams']
         >>>
         '''
@@ -273,4 +323,35 @@
                 yield self.default_blocksize
 
 
+class OneShotZlibInputStream(ZlibInputStream):
+    ''' One shot read, for testing '''
+    
+    def _block_size_generator(self):
+        ''' Generator to give block sizes for reading
+        '''
+        if self.zipped_length:
+            yield self.zipped_length
+            yield 0
+        else:
+            while True:
+                yield self.default_blocksize
 	
+
+class StubbyZlibInputStream(ZlibInputStream):
+    ''' One short read, then fairly long reads '''
+
+    default_blocksize = 128 * 1024 # 128K
+    first_blocksize = 512 # 512 bytes
+    
+    def _block_size_generator(self):
+        if self.zipped_length:
+            # do not read beyond specified length
+            yield min(self.zipped_length, self.first_blocksize)
+            while True:
+                yield min(
+                    self.zipped_length - self._z_bytes_read,
+                    self.default_blocksize)
+        else:
+            yield self.first_blocksize
+            while True:
+                yield self.default_blocksize
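
(A hypothetical further subclass, not in this commit, to make the generator protocol explicit: _block_size_generator yields how many compressed bytes __fill should fetch next, and yielding 0 signals that the known compressed length has been consumed. With zipped_length set, such a reader never asks the underlying file object for bytes past the embedded compressed block, which is what lets mio5.py hand its mat_stream straight to StubbyZlibInputStream.)

    from scipy.io.matlab.zlibstreams import ZlibInputStream  # in-tree path

    class EvenChunkZlibInputStream(ZlibInputStream):
        ''' Illustration only: even-sized reads capped at zipped_length '''
        default_blocksize = 4096

        def _block_size_generator(self):
            if self.zipped_length:
                remaining = self.zipped_length
                while remaining > 0:
                    n = min(remaining, self.default_blocksize)
                    remaining -= n
                    yield n
                yield 0  # tells __fill the compressed stream is done
            else:
                while True:
                    yield self.default_blocksize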



