finding/replacing a long binary pattern in a .bin file

Bengt Richter bokr at oz.net
Fri Jan 14 06:06:41 EST 2005


On Thu, 13 Jan 2005 11:40:52 -0800, Jeff Shannon <jeff at ccvcorp.com> wrote:

>Bengt Richter wrote:
>
>> BTW, I'm sure you could write a generator that would take a file name
>> and oldbinstring and newbinstring as arguments, and read and yield nice
>> os-file-system-friendly disk-sector-multiple chunks, so you could write
>> 
>>     fout = open('mynewbinfile', 'wb')
>>     for buf in updated_file_stream('myoldbinfile','rb', oldbinstring, newbinstring):
>>         fout.write(buf)
>>     fout.close()
>
>What happens when the bytes to be replaced are broken across a block 
>boundary?  ISTM that neither half would be recognized....
>
>I believe that this requires either reading the entire file into 
>memory, to scan all at once, or else conditionally matching an 
>arbitrary fragment of the end of a block against the beginning of the 
>oldbinstring...  Given that the file in question is only a few tens of 
>kbytes, I'd think that doing it in one gulp is simpler.  (For a large 
>file, chunking it might be necessary, though...)
>
Might as well post this, in case you're interested... warning, not very tested.
You want to write a proper test? ;-)

----< sreplace.py >-------------------------------------------------
def sreplace(sseq, old, new, retsize=4096):
    """
    Replace every occurrence of old with new in a chunked stream.

    sseq is an iterable of string chunks (a list, a generator, or an
    iterator such as iter(lambda: f.read(512), '')).  The chunks are
    treated as one continuous stream, so an occurrence of old that
    straddles a chunk boundary is still found.  Matches are taken left
    to right and do not overlap, like str.replace.  An empty old
    replaces nothing; the stream is only re-chunked.

    old, new and every chunk must be the same string type (all str or
    all bytes).

    Generates strings of exactly retsize characters, except that the
    last may be shorter depending on available input.  Nothing is
    generated when the resulting stream is empty.

    Raises ValueError if retsize is less than 1.  Because this is a
    generator, the error surfaces on the first iteration.
    """
    if retsize < 1:
        # A non-positive size would make the emit loops below spin forever.
        raise ValueError('retsize must be at least 1, got %r' % (retsize,))
    empty = old[:0]             # '' or b'', matching the caller's string type
    lenold = len(old)
    lennew = len(new)
    # A match that is still incomplete can only begin within the last
    # lenold-1 characters of the buffer, so that is all we hold back.
    keep = max(lenold - 1, 0)
    inbuf = empty               # unscanned tail carried over between chunks
    out = []                    # output pieces not yet emitted
    outlen = 0                  # total length of the pieces in out
    for chunk in sseq:
        inbuf += chunk
        done = 0                # everything before this index is already in out
        if old:
            # Compare against 0 explicitly: a match at index 0 is a real match.
            hit = inbuf.find(old, done)
            while hit >= 0:
                out.append(inbuf[done:hit])
                out.append(new)
                outlen += hit - done + lennew
                done = hit + lenold
                hit = inbuf.find(old, done)
        # Flush the part of the buffer that can no longer start a match
        # and drop it, so memory use stays bounded by chunk size.
        safe = max(done, len(inbuf) - keep)
        if safe > done:
            out.append(inbuf[done:safe])
            outlen += safe - done
        inbuf = inbuf[safe:]
        if outlen >= retsize:
            joined = empty.join(out)
            cut = outlen - outlen % retsize     # largest multiple of retsize
            for pos in range(0, cut, retsize):
                yield joined[pos:pos + retsize]
            out = [joined[cut:]]
            outlen -= cut
    # Input exhausted: whatever was held back cannot complete a match.
    out.append(inbuf)
    joined = empty.join(out)
    for pos in range(0, len(joined), retsize):
        yield joined[pos:pos + retsize]

if __name__ == '__main__':
    # Command-line test driver: sreplace.py old new retsize [chunk ...]
    # Each remaining argument is fed to sreplace as one input chunk, and
    # each returned buffer is printed on its own line.
    import sys
    usage = """
        Test usage: [python] sreplace.py old new retsize [rest of args is string chunks for test] 
            where old is old string to find in chunked stream and new is replacement
            and retsize is returned buffer size, except that last may be shorter"""
    # old, new and retsize are all required; the chunk list may be empty.
    if len(sys.argv) < 4:
        raise SystemExit(usage)
    try:
        old, new = sys.argv[1:3]
        retsize = int(sys.argv[3])
        chunks = iter(sys.argv[4:])
        result = '\n'.join(sreplace(chunks, old, new, retsize))
        print('%r\n-----------\n%s\n------------' % (sys.argv[1:], result))
    except Exception:
        # Top-level boundary: report the error, then exit with the usage text.
        # sys.exc_info() keeps this runnable on both Python 2 and Python 3.
        err = sys.exc_info()[1]
        print('%s: %s' % (err.__class__.__name__, err))
        raise SystemExit(usage)
--------------------------------------------------------------------

As mentioned, not tested very much beyond what you see:

[ 2:43] C:\pywk\ut>py24 sreplace.py x _XX_  20 This is x and abcxdef 012x345 zzxx zzz x
['x', '_XX_', '20', 'This', 'is', 'x', 'and', 'abcxdef', '012x345', 'zzxx', 'zzz', 'x']
-----------
Thisis_XX_andabc_XX_
def012_XX_345zz_XX__
XX_zzz_XX_
------------

[ 2:43] C:\pywk\ut>py24 sreplace.py x _XX_  80 This is x and abcxdef 012x345 zzxx zzz x
['x', '_XX_', '80', 'This', 'is', 'x', 'and', 'abcxdef', '012x345', 'zzxx', 'zzz', 'x']
-----------
Thisis_XX_andabc_XX_def012_XX_345zz_XX__XX_zzz_XX_
------------

[ 2:43] C:\pywk\ut>py24 sreplace.py x _XX_  4  This is x and abcxdef 012x345 zzxx zzz x
['x', '_XX_', '4', 'This', 'is', 'x', 'and', 'abcxdef', '012x345', 'zzxx', 'zzz', 'x']
-----------
This
is_X
X_an
dabc
_XX_
def0
12_X
X_34
5zz_
XX__
XX_z
zz_X
X_
------------

[ 2:44] C:\pywk\ut>py24 sreplace.py def DEF 80 This is x and abcxdef 012x345 zzxx zzz x
['def', 'DEF', '80', 'This', 'is', 'x', 'and', 'abcxdef', '012x345', 'zzxx', 'zzz', 'x']
-----------
ThisisxandabcxDEF012x345zzxxzzzx
------------

If you wanted to change a binary file, you'd use it something like the session below.
You'd probably leave the buffer size at its default of 4096, though, not 20, which is
pretty silly other than for demoing. At least the input chunks are 512 ;-)

 >>> from sreplace import sreplace
 >>> fw = open('sreplace.py.txt','wb')
 >>> for buf in sreplace(iter(lambda f=open('sreplace.py','rb'):f.read(512), ''),'out','OUT',20):
 ...     fw.write(buf)
 ...
 >>> fw.close()
 >>> ^Z


[ 3:00] C:\pywk\ut>diff -u sreplace.py sreplace.py.txt
--- sreplace.py Fri Jan 14 02:39:52 2005
+++ sreplace.py.txt     Fri Jan 14 03:00:01 2005
@@ -7,7 +7,7 @@
     """
     inbuf = ''
     endsseq = False
-    out = []
+    OUT = []
     start = 0
     lenold = len(old)
     lennew = len(new)
@@ -17,21 +17,21 @@
             start = endprev  # restore find start pos
             for chunk in sseq: inbuf+= chunk; break
             else:
-                out.append(inbuf[start:])
+                OUT.append(inbuf[start:])
                 endsseq = True
         else:
-            out.append(inbuf[endprev:start])
+            OUT.append(inbuf[endprev:start])
             start += lenold
-            out.append(new)
-        if endsseq or sum(map(len, out))>=retsize:
-            s = ''.join(out)
+            OUT.append(new)
+        if endsseq or sum(map(len, OUT))>=retsize:
+            s = ''.join(OUT)
             while len(s)>= retsize:
                 yield s[:retsize]
                 s = s[retsize:]
             if endsseq:
                 if s: yield s
             else:
-                out = [s]
+                OUT = [s]

 if __name__ == '__main__':
     import sys


Regards,
Bengt Richter



More information about the Python-list mailing list