Images from PDF
Darrell
news at dorb.com
Thu Apr 12 23:47:31 EDT 2001
I was playing with pulling images out of PDF files tonight.
Thought I'd share.
--Darrell
import os
import re
import sys
import zlib

try:
    from PIL import Image  # Pillow exposes Image inside the PIL package
except ImportError:
    import Image  # legacy PIL installed as a top-level module
def _as_text(raw):
    """Return *raw* as a native string, decoding byte strings as Latin-1."""
    if isinstance(raw, str):
        return raw
    return raw.decode("latin-1")


def stripImages(fn):
    """Extract the embedded images of the PDF file *fn* into JPEG files.

    The raw file is scanned with regular expressions for image XObjects
    (``/XObject /Subtype /Image ... stream ... endstream``); the PDF is not
    actually parsed.  The number of candidates is printed, then for each
    one either a ``Found:`` line with its attributes or a ``Skip:`` line
    when ``/Name``, ``/Width``, ``/Height``, ``/Filter`` or ``/ColorSpace``
    cannot be read from its dictionary.

    Every extracted image is written as
    ``<fn without its extension>_<image name>.jpg``:

    * ``DCTDecode`` streams are already JPEG data and are copied verbatim.
    * ``FlateDecode`` streams are inflated and handed to PIL as raw pixels
      (``DeviceGray`` as mode ``L``, anything else as ``RGB``).  This
      assumes 8 bits per component and no predictor.
    * Streams with any other filter are reported but not written.

    Args:
        fn: Path of the PDF file to read.

    Raises:
        IOError: if *fn* cannot be read or an output file cannot be written.
        zlib.error: if a ``FlateDecode`` stream cannot be inflated.
    """
    with open(fn, 'rb') as pdf:
        buf = pdf.read()

    # splitext drops only the final extension, so dots elsewhere in the path
    # ("./scan.pdf", "report.v2.pdf") do not truncate the output prefix.
    prefix = os.path.splitext(fn)[0]

    # Byte patterns: the file is read in binary mode, and on Python 3 a
    # text pattern cannot be applied to bytes.
    found = re.findall(
        br"(?s)/XObject\s+/Subtype\s+/Image(.*?)stream\s*\012(.*?)endstream",
        buf)
    print(len(found))

    # Compiled once, in the order the values are unpacked below.
    field_patterns = [re.compile(pattern) for pattern in (
        br"(?i)/name\s+/(\w+)",
        br"(?i)/Width\s+(\d+)",
        br"(?i)/Height\s+(\d+)",
        br"(?i)/filter\s+/(\w+)",
        br"(?i)/ColorSpace\s+/(\w+)",
    )]

    for header, data in found:
        try:
            name, width, height, filter_name, color_space = [
                _as_text(rx.findall(header)[0]) for rx in field_patterns]
        except IndexError:
            # At least one attribute is missing from the image dictionary.
            print("Skip: %s" % _as_text(header))
            continue
        print("Found: %s %s %s %s %s"
              % (name, width, height, filter_name, color_space))

        out_path = "%s_%s.jpg" % (prefix, name)
        if filter_name == "FlateDecode":
            pixels = zlib.decompress(data)
            # Gray images carry one byte per pixel, not three.
            mode = "L" if color_space == "DeviceGray" else "RGB"
            # Pillow renamed fromstring() to frombytes(); legacy PIL only
            # provides the former.
            from_raw = getattr(Image, "frombytes", None) or Image.fromstring
            im = from_raw(mode, (int(width), int(height)), pixels)
            im.save(out_path)
        elif filter_name == "DCTDecode":
            # The stream already is a JPEG file; store it unchanged.
            with open(out_path, 'wb') as out:
                out.write(data)
if __name__ == "__main__":
    # Run the extraction only when executed as a script, so that importing
    # this module for stripImages() does not trigger it.
    if len(sys.argv) < 2:
        sys.exit("usage: %s file.pdf" % sys.argv[0])
    stripImages(sys.argv[1])
More information about the Python-list mailing list