ANN: Python Decrypt PDF script -- builds on pdftools
Herb Schilling
herbcle at
Thu Aug 19 21:43:43 EDT 2004
>I wanted to extract the meta-data from an encrypted/protected PDF file
>and could not find any Python scripts to do this. So, I decided to
>write something myself, the result follows.
>This demonstration utility requires the `pdftools` files from
><> but the
>decryption functions themselves should be usable with other Python PDF
>libraries.
>Documentation is marginal and all I can say is that it worked on the
>three PDF files I tested it on... :-)
>P.S. The usual Usenet-mangling warning applies--yeah, I know--I should
>put it up on a web site somewhere... :-)
># Decrypt PDF Info
># Decrypts PDF files and displays meta-data associated with them. (If
># file isn't encrypted the information is displayed as is.)
># The results are similar to xpdf's `pdfinfo` utility.
># It should be possible to decrypt all of the objects contained
># in the PDF, but this only reads the Document Information Dictionary.
># (Note: All the PDF handling is provided by `pdftools`; this just adds
># the ability to deal with encrypted PDF files.)
># Requires:
># + pdftools
># <>
># Based on:
># + ``
># <>
># Incorporates:
># + RC4 from CipherSaber implementation by Ka-Ping Yee
><ping at>
># <>
># References:
># +
># Author:
># follower at (Standing on *many* shoulders...)
import hashlib
import md5
import struct
import sys

from pdftools import PDFdocument
def arcfour(input, key):
    """
    Perform the ARCFOUR (RC4) algorithm on a given input list of bytes and
    a key given as a list of bytes, and return the output as a list of
    bytes.

    Both `input` and `key` are sequences of integers in the range 0-255.
    The same call both encrypts and decrypts, because the keystream is
    simply XORed with the input.

    Raises ValueError if `key` is empty.

    (From the CipherSaber implementation by Ka-Ping Yee.)
    """
    if not key:
        # Without this the key schedule dies on `i % 0` with a bare
        # ZeroDivisionError that says nothing about the real problem.
        raise ValueError("arcfour key must not be empty")
    keyLength = len(key)

    # Key scheduling.  The permutation is swapped in place, so it has to
    # be a real list (a Python 3 `range` object is immutable).
    state = list(range(256))
    j = 0
    for i in range(256):
        j = (j + state[i] + key[i % keyLength]) % 256
        state[i], state[j] = state[j], state[i]

    # Keystream generation, XORed byte by byte with the input.
    i, j, output = 0, 0, []
    for byte in input:
        i = (i + 1) % 256
        j = (j + state[i]) % 256
        state[i], state[j] = state[j], state[i]
        n = (state[i] + state[j]) % 256
        output.append(byte ^ state[n])
    return output
# 32 byte values used to pad the user password out to 32 bytes when the
# file key is calculated; the same string is the expected result of
# decrypting the user hash with a correct file key.
_passwordPad = [
    0x28, 0xbf, 0x4e, 0x5e, 0x4e, 0x75, 0x8a, 0x41,
    0x64, 0x00, 0x4e, 0x56, 0xff, 0xfa, 0x01, 0x08,
    0x2e, 0x2e, 0x00, 0xb6, 0xd0, 0x68, 0x3e, 0x80,
    0x2f, 0x0c, 0xa9, 0xfe, 0x64, 0x53, 0x69, 0x7a]

# The same padding as a string, one character per byte value.
passwordPad = "".join([chr(b) for b in _passwordPad])
def calculateFileKey(fileId, ownerHash, userHash, permissions,
                     userPassword=""):
    """
    Calculates the file key for the document as described in references
    (see [PDFE] and [PDFPL]).

    The key is the first 5 bytes of the MD5 digest of, in order:
      + `userPassword` padded/truncated to 32 bytes with `passwordPad`
      + `ownerHash`
      + `permissions` packed as a 4-byte little-endian integer
      + `fileId`

    `userHash` is accepted for symmetry with the encryption dictionary
    but does not take part in the calculation.

    Returns the 5-byte file key as a string.
    """
    md = hashlib.md5()
    md.update((userPassword + passwordPad)[:32])
    md.update(ownerHash)
    # The permissions value may arrive as a negative (signed) integer;
    # mask it to its unsigned 32-bit bit pattern so that packing as "<L"
    # cannot overflow.
    md.update(struct.pack("<L", permissions & 0xFFFFFFFF))
    md.update(fileId)
    return md.digest()[:5]
def calculateObjectKey(fileKey, objectNumber, generationNumber):
    """
    Calculates the key for the object as described in references
    (see [PDFE] and [PDFPL]).

    The key is taken from the MD5 digest of `fileKey` followed by the
    low 3 bytes of `objectNumber` and the low 2 bytes of
    `generationNumber` (both little-endian).  The digest is truncated to
    len(fileKey) + 5 bytes, capped at the 16 bytes MD5 produces, so a
    5-byte file key yields a 10-byte object key.

    Returns the object key as a string.
    """
    md = hashlib.md5()
    md.update(fileKey)
    # Packing as 4 little-endian bytes and slicing keeps only the low
    # 3 (object number) and 2 (generation number) bytes.
    md.update(struct.pack("<L", objectNumber)[:3])
    md.update(struct.pack("<L", generationNumber)[:2])
    keyLength = min(len(fileKey) + 5, 16)
    return md.digest()[:keyLength]
class NotEncryptedException(Exception):
    """
    The supplied PDF document is not encrypted.
    """
def getFileKey(doc, userPassword=""):
    """
    Extracts the information required to calculate the file key
    from the supplied PDF document.

    In most cases `userPassword` can be left empty.

    Returns the file key as calculated by `calculateFileKey`.

    Raises NotEncryptedException if the trailer has no 'Encrypt' entry,
    and ValueError if the calculated key does not decrypt the user hash
    back to the standard password padding (i.e. wrong password).
    """
    # Check for encryption before touching anything else, so that an
    # unencrypted file is always reported as such even when its trailer
    # lacks other entries such as 'ID'.
    try:
        encryptDict = doc.dereference(doc.trailer_dict['Encrypt'])
    except KeyError:
        raise NotEncryptedException

    fileId = doc.trailer_dict['ID'][0] # Is the ID always repeated?

    # TODO: Check encryption version is ok. (filter/standard/v/1/r/2)

    # NOTE(review): the encryption dictionary entries are looked up as
    # 'O', 'U' and 'P', matching the bare-name keys used for the trailer
    # ('ID', 'Encrypt', 'Info') -- confirm against pdftools.
    ownerHash = encryptDict['O']
    userHash = encryptDict['U']
    # `permissions` should be "four-byte integer, LSB first."
    permissions = encryptDict['P']

    fileKey = calculateFileKey(fileId, ownerHash, userHash, permissions,
                               userPassword)

    # Sanity check user password: with the right key the user hash
    # decrypts to the password padding.  Raised explicitly (not via
    # `assert`) so the check survives `python -O`.
    if decrypt(userHash, fileKey) != passwordPad:
        raise ValueError("Incorrect user password for encrypted PDF")
    return fileKey
def decrypt(text, key):
    """
    Decrypts the supplied object (as a string) with the supplied key.

    Both `text` and `key` are strings holding one character per byte.

    Returns "plain text" form of object as a string.
    """
    # arcfour works on lists of byte values; it indexes the key and takes
    # its length, so both arguments must be real lists.
    textBytes = [ord(c) for c in text]
    keyBytes = [ord(c) for c in key]
    return "".join([chr(b) for b in arcfour(textBytes, keyBytes)])
def showDocumentInfo(doc, fileKey):
    """
    Displays the content of the (optionally encrypted) Document
    Information Dictionary for the supplied PDF document.

    `fileKey` is the key returned by `getFileKey`, or an empty string
    for an unencrypted document, in which case the values are printed
    as they are stored.

    Prints one "field: value" line per dictionary entry.
    """
    infoDictRef = doc.trailer_dict['Info']
    infoDict = doc.dereference(infoDictRef)

    # The object key depends on the object and generation number of the
    # Info dictionary itself; it is only needed when the file is
    # encrypted.
    objectKey = None
    if fileKey:
        objectKey = calculateObjectKey(fileKey, infoDictRef.obj,
                                       infoDictRef.gen)

    for field, storedValue in infoDict.items():
        if fileKey:
            value = decrypt(storedValue, objectKey)
        else:
            value = storedValue
        # Parenthesised single argument: prints identically as a
        # Python 2 statement and as a Python 3 function call.
        print("%s: %s" % (field, value))
if __name__ == "__main__":
    # Usage: <script> <filename.pdf>
    if len(sys.argv) < 2:
        raise SystemExit("Usage %s <filename.pdf>" % sys.argv[0])
    filename = sys.argv[1]

    doc = PDFdocument(filename)

    try:
        fileKey = getFileKey(doc)
    except NotEncryptedException:
        # An empty key tells showDocumentInfo to print the values as-is.
        fileKey = ""

    showDocumentInfo(doc, fileKey)
