ANN: Python Decrypt PDF script -- builds on pdftools

Hi,
I wanted to extract the meta-data from an encrypted/protected PDF file and could not find any Python scripts to do this. So, I decided to write something myself, the result follows.
This demonstration utility requires the `pdftools` files from http://www.boddie.org.uk/david/Projects/Python/pdftools/ but the decryption functions themselves should be usable with other Python PDF libraries.
Documentation is marginal and all I can say is that it worked on the three PDF files I tested it on... :-)
--Phil.
P.S. The usual Usenet-mangling warning applies--yeah, I know--I should put it up on a web site somewhere... :-)
#!/usr/bin/python
#
# Decrypt PDF Info
#
# Decrypts PDF files and displays meta-data associated with them. (If the
# file isn't encrypted the information is displayed as is.)
#
# The results are similar to xpdf's `pdfinfo` utility.
#
# It should be possible to decrypt all of the objects contained
# in the PDF, but this only reads the Document Information Dictionary.
#
# (Note: All the PDF handling is provided by `pdftools`, this just adds
# the ability to deal with encrypted PDF files.)
#
# Requires:
#   + pdftools
#     http://www.boddie.org.uk/david/Projects/Python/pdftools/
#
# Based on:
#   + `pdfdecrypt.pl`
#     http://www-2.cs.cmu.edu/~dst/Adobe/Gallery/pdfdecrypt.pl [PDFPL]
#
# Incorporates:
#   + RC4 from CipherSaber implementation by Ka-Ping Yee ping@lfw.org
#     http://www.xs4all.nl/~cg/ciphersaber/comp/python.txt
#
# References:
#   + http://www-2.cs.cmu.edu/~dst/Adobe/Gallery/anon21jul01-pdf-encryption.txt [PDFE]
#
# Author:
#   follower@myrealbox.com (Standing on *many* shoulders...)
#

import hashlib
import md5
import struct
import sys

from pdftools import PDFdocument
def arcfour(input, key):
    """
    Perform the ARCFOUR (RC4) algorithm on a given input list of bytes with
    a key given as a list of bytes, and return the output as a list of bytes.

    `input` and `key` are sequences of integers in the range 0-255. The
    cipher is symmetric: applying it twice with the same key returns the
    original input.

    Raises ValueError if `key` is empty.

    (From CipherSaber implementation by Ka-Ping Yee ping@lfw.org
    http://www.xs4all.nl/~cg/ciphersaber/comp/python.txt)
    """
    keyLength = len(key)
    if keyLength == 0:
        # Without this check the key schedule below fails with an opaque
        # ZeroDivisionError from the modulo operation.
        raise ValueError("ARCFOUR key must contain at least one byte")

    # Key scheduling: permute the state table under control of the key.
    # list() gives a mutable table whether range() returns a list or not.
    state = list(range(256))
    j = 0
    for i in range(256):
        j = (j + state[i] + key[i % keyLength]) % 256
        state[i], state[j] = state[j], state[i]

    # Keystream generation: XOR each input byte with the next keystream byte.
    i, j, output = 0, 0, []
    for byte in input:
        i = (i + 1) % 256
        j = (j + state[i]) % 256
        state[i], state[j] = state[j], state[i]
        n = (state[i] + state[j]) % 256
        output.append(byte ^ state[n])
    return output
# The fixed 32 byte values appended to a user password before it is
# truncated to 32 characters (see calculateFileKey).
_passwordPad = [
    0x28, 0xbf, 0x4e, 0x5e, 0x4e, 0x75, 0x8a, 0x41,
    0x64, 0x00, 0x4e, 0x56, 0xff, 0xfa, 0x01, 0x08,
    0x2e, 0x2e, 0x00, 0xb6, 0xd0, 0x68, 0x3e, 0x80,
    0x2f, 0x0c, 0xa9, 0xfe, 0x64, 0x53, 0x69, 0x7a,
]

# The same padding as a string, one character per byte value.
passwordPad = "".join(map(chr, _passwordPad))
def calculateFileKey(fileId, ownerHash, userHash, permissions,
                     userPassword = ""):
    """
    Calculates the file key for the document as described in
    references (see [PDFE] and [PDFPL]).

    The key is the first 5 bytes of the MD5 digest of, in order: the user
    password padded/truncated to 32 characters, `ownerHash`, `permissions`
    packed as four bytes (least significant byte first) and `fileId`.

    `userHash` is accepted for symmetry with the encryption dictionary
    but does not take part in the calculation.

    `permissions` may be given either as a signed or as an unsigned 32-bit
    value; both produce the same four bytes.

    Returns the 5 byte file key as a string.
    """
    md = hashlib.md5()
    md.update((userPassword + passwordPad)[:32])
    md.update(ownerHash)
    # The permissions entry is stored as a signed integer and is typically
    # negative; mask it to 32 bits so packing as unsigned cannot fail.
    md.update(struct.pack("<L", permissions & 0xFFFFFFFF))
    md.update(fileId)

    return md.digest()[:5]
def calculateObjectKey(fileKey, objectNumber, generationNumber):
    """
    Calculates the key for the object as described in
    references (see [PDFE] and [PDFPL]).

    The file key is extended with the low three bytes of `objectNumber`
    and the low two bytes of `generationNumber` (both least significant
    byte first) and hashed with MD5. The object key is the first
    (len(fileKey) + 5) bytes of the digest, capped at 16, which is 10 bytes
    for the 5 byte file keys produced by `calculateFileKey`.

    Returns the object key as a string.
    """
    md = hashlib.md5()
    md.update(fileKey)
    md.update(struct.pack("<L", objectNumber)[:3])
    md.update(struct.pack("<L", generationNumber)[:2])

    # An MD5 digest is only 16 bytes long, hence the cap.
    keyLength = min(len(fileKey) + 5, 16)
    return md.digest()[:keyLength]
class NotEncryptedException(Exception):
    """
    Raised to signal that the supplied PDF document is not encrypted.
    """
    pass
# Names of the encryption dictionary entries read by getFileKey().
KEY_OWNER_HASH = 'O'    # owner password hash
KEY_USER_HASH = 'U'     # user password hash
KEY_PERMISSIONS = 'P'   # permissions value
def getFileKey(doc, userPassword = ""):
    """
    Extracts the information required to calculate the file key from
    the supplied PDF document.

    In most cases `userPassword` can be left empty.

    Returns the file key calculated by `calculateFileKey`.

    Raises NotEncryptedException if the document trailer has no 'Encrypt'
    entry, and ValueError if the calculated key does not decrypt the
    stored user hash back to the password padding (i.e. `userPassword`
    is wrong or the encryption scheme is not the one handled here).
    """
    try:
        encryptDict = doc.dereference(doc.trailer_dict['Encrypt'])
    except KeyError:
        raise NotEncryptedException

    # TODO: Check encryption version is ok. (filter/standard/v/1/r/2)

    # The ID is only looked up once the document is known to be encrypted,
    # so that a document with neither 'Encrypt' nor 'ID' in its trailer is
    # reported as not encrypted rather than failing with a KeyError.
    fileId = doc.trailer_dict['ID'][0] # Is the ID always repeated?

    ownerHash = encryptDict[KEY_OWNER_HASH]
    userHash = encryptDict[KEY_USER_HASH]
    # `permissions` should be "four-byte integer, LSB first."
    permissions = encryptDict[KEY_PERMISSIONS]

    fileKey = calculateFileKey(fileId, ownerHash, userHash, permissions,
                               userPassword)

    # Sanity check user password. This is an explicit raise rather than an
    # assert so the check still runs when Python is started with -O.
    if decrypt(userHash, fileKey) != passwordPad:
        raise ValueError("Incorrect user password for encrypted PDF document")

    return fileKey
def decrypt(text, key):
    """
    Runs the supplied object (given as a string) through ARCFOUR using
    the supplied key (also a string).

    Returns the "plain text" form of the object as a string.
    """
    # arcfour() works on lists of byte values, so convert in and out.
    cipherBytes = map(ord, text)
    keyBytes = map(ord, key)
    plainBytes = arcfour(cipherBytes, keyBytes)
    return "".join(map(chr, plainBytes))
def showDocumentInfo(doc, fileKey):
    """
    Prints every entry of the Document Information Dictionary of the
    supplied PDF document as a "field: value" line.

    When `fileKey` is non-empty each value is decrypted with the object
    key derived for the dictionary; when it is empty the values are
    printed exactly as stored.
    """
    infoRef = doc.trailer_dict['Info']
    objNum, genNum = infoRef.obj, infoRef.gen

    info = doc.dereference(infoRef)

    # The object key depends on the object and generation numbers of the
    # Info dictionary itself.
    objectKey = calculateObjectKey(fileKey, objNum, genNum)

    for field, storedValue in info.iteritems():
        value = storedValue
        if fileKey:
            value = decrypt(storedValue, objectKey)
        print("%s: %s" % (field, value))
if __name__ == "__main__":
    # Exactly one argument is needed: the PDF file to inspect.
    if len(sys.argv) < 2:
        raise SystemExit("Usage %s <filename.pdf>" % sys.argv[0])
    filename = sys.argv[1]

    doc = PDFdocument(filename)

    # An empty file key tells showDocumentInfo() to print values as stored.
    try:
        fileKey = getFileKey(doc)
    except NotEncryptedException:
        fileKey = ""

    showDocumentInfo(doc, fileKey)
participants (1)
-
follower@gmail.com