Finding a text in raw data(size nearly 10GB) and Printing its memory address using python
MRAB
python at mrabarnett.plus.com
Mon Apr 23 18:43:01 EDT 2018
On 2018-04-23 22:11, Hac4u wrote:
> On Tuesday, April 24, 2018 at 12:54:43 AM UTC+5:30, MRAB wrote:
>> On 2018-04-23 18:24, Hac4u wrote:
>> > I have a raw data of size nearly 10GB. I would like to find a text string and print the memory address at which it is stored.
>> >
>> > This is my code
>> >
>> > import os
>> > import re
>> > filename="filename.dmp"
>> > read_data=2**24
>> > searchtext="bd:mongo:"
>> > he=searchtext.encode('hex')
>> > with open(filename, 'rb') as f:
>> > while True:
>> > data= f.read(read_data)
>> > if not data:
>> > break
>> > elif searchtext in data:
>> > print "Found"
>> > try:
>> > offset=hex(data.index(searchtext))
>> > print offset
>> > except ValueError:
>> > print 'Not Found'
>> > else:
>> > continue
>> >
>> >
>> > The address I am getting is
>> > #0x2c0900
>> > #0xb62300
>> >
>> > But the actual positioning is
>> > # 652c0900
>> > # 652c0950
>> >
>> Here's a version that handles overlaps.
>>
>> Try to keep in mind the distinction between bytestrings and text
>> strings. It doesn't matter as much in Python 2, but it does in Python 3.
>>
>>
>> filename = "filename.dmp"
>> chunk_size = 2**24
>> search_text = b"bd:mongo:"
>> chunk_start = 0
>> offset = 0
>> search_length = len(search_text)
>> overlap_length = search_length - 1
>> data = b''
>>
>> with open(filename, 'rb') as f:
>> while True:
>> # Read in more data.
>> data += f.read(chunk_size)
>> if not data:
>> break
>>
>> # Search this chunk.
>> while True:
>> offset = data.find(search_text, offset)
>> if offset < 0:
>> break
>>
>> print "Found at", hex(chunk_start + offset)
>> offset += search_length
>>
>> # We've searched this chunk. Discard all but a portion of overlap.
>> chunk_start += len(data) - overlap_length
>>
>> if overlap_length > 0:
>> data = data[-overlap_length : ]
>> else:
>> data = b''
>>
>> offset = 0
>
>
>
> Thanks alot for the code.
>
> I have two questions
>
> 1. Why did u use overlap. And, In what condition it can be counted on?
Suppose you're searching for b"bd:mongo:".
What happens if a chunk ends with b"b" and the next chunk starts with
b"d:mongo:"? Or b"bd:m" and b"ongo:"? Or b"bd:mongo" and b":"?
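Without the overlap, it wouldn't find a match that's split across chunks. Keeping the last len(search_text) - 1 bytes of each chunk and prepending them to the next read means such a match is always seen in one piece.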
> 2. Your code does not end. It keep on looking for sth ..Though it worked well.
>
> So, Thanks alot for the code.
>
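Here's my code with a bug fix (the earlier version never terminated, because the retained overlap meant that data was never empty at the end of the file):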
# Default settings for running this file as a script.
filename = "filename.dmp"
chunk_size = 2**24
search_text = b"bd:mongo:"


def find_offsets(path, pattern, chunk_size=2**24):
    """Yield the absolute file offset of each occurrence of *pattern*.

    The file is read in pieces of *chunk_size* bytes, so files far larger
    than memory can be searched.  Matches are non-overlapping (the search
    resumes right after the end of each match), and the result does not
    depend on where the chunk boundaries happen to fall.

    Args:
        path: Name of the file to search; it is opened in binary mode.
        pattern: Non-empty bytestring to look for.
        chunk_size: Number of bytes to request per read; must be >= 1.

    Yields:
        Integer offsets, counted from the start of the file, in
        increasing order.

    Raises:
        ValueError: If *pattern* is empty or *chunk_size* is less than 1.
    """
    pattern_length = len(pattern)
    if pattern_length == 0:
        # An empty pattern "matches" everywhere and would never advance.
        raise ValueError("pattern must not be empty")
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    # A match split across two reads has at most pattern_length - 1 bytes
    # in the earlier one, so that many trailing bytes are carried over.
    overlap_length = pattern_length - 1
    buf = b""
    buf_start = 0  # Absolute file offset of buf[0].

    with open(path, "rb") as f:
        while True:
            block = f.read(chunk_size)
            if not block:
                # End of file: what is left in buf was already searched.
                break
            buf += block

            # Search the buffer, resuming after each match.
            resume = 0
            while True:
                hit = buf.find(pattern, resume)
                if hit < 0:
                    break
                yield buf_start + hit
                resume = hit + pattern_length

            # Keep only the overlap tail, but never bytes that belong to a
            # match already reported; otherwise a self-overlapping pattern
            # could be reported again at a chunk boundary.
            keep_from = max(resume, len(buf) - overlap_length)
            buf = buf[keep_from:]
            buf_start += keep_from


if __name__ == "__main__":
    for match_offset in find_offsets(filename, search_text, chunk_size):
        # A single formatted argument prints the same on Python 2 and 3.
        print("Found at 0x%x" % match_offset)
More information about the Python-list
mailing list