ANN: Python Decrypt PDF script -- builds on pdftools

Follower follower at
Tue Aug 17 18:13:55 CEST 2004


I wanted to extract the meta-data from an encrypted/protected PDF file
and could not find any Python scripts to do this. So, I decided to
write something myself, the result follows.

This demonstration utility requires the `pdftools` files from
<> but the
decryption functions themselves should be usable with other Python PDF
libraries.

Documentation is marginal and all I can say is that it worked on the
three PDF files I tested it on... :-)


P.S. The usual Usenet-mangling warning applies--yeah, I know--I should
put it up on a web site somewhere... :-)

# Decrypt PDF Info
# Decrypts PDF files and displays meta-data associated with them. (If
# file isn't encrypted the information is displayed as is.)
# The results are similar to xpdf's `pdfinfo` utility.
# It should be possible to decrypt all of the objects contained
# in the PDF, but this only reads the Document Information Dictionary.
# (Note: All the PDF handling is provided by `pdftools`, this just adds
#        the ability to deal with encrypted PDF files.)
# Requires:
#   + pdftools
#     <>
# Based on:
#   + ``
#     <>
# Incorporates:
#   + RC4 from CipherSaber implementation by Ka-Ping Yee <ping at>
#     <>
# References:
#   + <>
# Author:
#   follower at (Standing on *many* shoulders...)
import sys
import md5
import struct

from pdftools import PDFdocument

def arcfour(input, key):
    """Perform the ARCFOUR (RC4) algorithm on a given input list of bytes
    with a key given as a list of bytes, and return the output as a list
    of bytes.

    The cipher is symmetric: running the output through `arcfour` again
    with the same key yields the original input.

    `input` may be any iterable of integers in the range 0-255; `key`
    may be any non-empty iterable of such integers.

    Raises ValueError if `key` is empty.

    (From CipherSaber implementation by Ka-Ping Yee <ping at>)
    """
    # Materialise the key so that iterators (e.g. the result of `map` on
    # Python 3) can be measured and indexed.
    key = list(key)
    if not key:
        raise ValueError("ARCFOUR key must not be empty")
    keyLength = len(key)

    # Key scheduling: permute the identity table under control of the key.
    # `list(...)` is needed because a Python 3 `range` cannot be mutated.
    state = list(range(256))
    j = 0
    for i in range(256):
        j = (j + state[i] + key[i % keyLength]) % 256
        state[i], state[j] = state[j], state[i]

    # Keystream generation: one keystream byte is XORed with each input
    # byte while the table keeps being shuffled.
    i, j, output = 0, 0, []
    for byte in input:
        i = (i + 1) % 256
        j = (j + state[i]) % 256
        state[i], state[j] = state[j], state[i]
        n = (state[i] + state[j]) % 256
        output.append(byte ^ state[n])
    return output

# The fixed 32-byte padding string that the PDF standard security handler
# appends to (or substitutes for) the user password.
_passwordPad = [
  0x28, 0xbf, 0x4e, 0x5e, 0x4e, 0x75, 0x8a, 0x41,
  0x64, 0x00, 0x4e, 0x56, 0xff, 0xfa, 0x01, 0x08,
  0x2e, 0x2e, 0x00, 0xb6, 0xd0, 0x68, 0x3e, 0x80,
  0x2f, 0x0c, 0xa9, 0xfe, 0x64, 0x53, 0x69, 0x7a]

# Same padding as a string, one character per byte value.
passwordPad = "".join([chr(b) for b in _passwordPad])


def _toBytes(value):
    """Return `value` as a byte string.

    Byte strings pass through unchanged; text strings are encoded as
    latin-1 so that each character maps to the byte of the same value.
    """
    if isinstance(value, bytes):
        return value
    return value.encode("latin-1")


def calculateFileKey(fileId, ownerHash, userHash, permissions,
                     userPassword = ""):
    """Calculates the file key for the document as described in
    references (see [PDFE] and [PDFPL]).

    The key is the first five bytes (40 bits) of the MD5 digest of, in
    order: the user password padded/truncated to 32 bytes, the owner
    hash (`/O` entry), the permissions (`/P` entry) as a four-byte
    little-endian integer, and the file identifier.

    `userHash` is accepted for interface symmetry but does not take part
    in the derivation; callers use it to verify the resulting key.

    Returns the five-byte file key as a byte string.
    """
    # `hashlib` replaces the `md5` module, which only exists on Python 2.
    import hashlib

    md = hashlib.md5()
    md.update((_toBytes(userPassword) + _toBytes(passwordPad))[:32])
    md.update(_toBytes(ownerHash))
    # `/P` is frequently stored as a negative (signed 32-bit) number;
    # mask it to its unsigned two's-complement form before packing.
    md.update(struct.pack("<L", permissions & 0xFFFFFFFF))
    md.update(_toBytes(fileId))

    fileKey = md.digest()[:5]

    return fileKey

def calculateObjectKey(fileKey, objectNumber, generationNumber):
    """Calculates the key for the object as described in references
    (see [PDFE] and [PDFPL]).

    The key is derived from the MD5 digest of the file key followed by
    the low three bytes of the object number and the low two bytes of
    the generation number (both least-significant byte first).  The
    first len(fileKey) + 5 bytes of the digest are used, capped at 16,
    which is 10 bytes for the usual five-byte file key.

    Returns the object key as a byte string.
    """
    # `hashlib` replaces the `md5` module, which only exists on Python 2.
    import hashlib

    # Accept a text string (e.g. the empty "" used for unencrypted files)
    # as well as a byte string.
    if not isinstance(fileKey, bytes):
        fileKey = fileKey.encode("latin-1")

    md = hashlib.md5(fileKey)
    md.update(struct.pack("<L", objectNumber)[:3])
    md.update(struct.pack("<L", generationNumber)[:2])
    objectKey = md.digest()[:min(len(fileKey) + 5, 16)]

    return objectKey

class NotEncryptedException(Exception):
    """The supplied PDF document is not encrypted."""


# Names of the encryption dictionary entries read below.
# NOTE(review): these definitions were absent from this copy of the
# script; the values are the PDF standard security handler entry names
# (/O, /U, /P), written without the slash like the 'ID', 'Encrypt' and
# 'Info' keys used elsewhere in this file -- confirm against pdftools.
KEY_OWNER_HASH = 'O'
KEY_USER_HASH = 'U'
KEY_PERMISSIONS = 'P'


def getFileKey(doc, userPassword = ""):
    """Extracts the information required to calculate the file key
    from the supplied PDF document.

    In most cases `userPassword` can be left empty.

    `doc` must provide a `trailer_dict` mapping and a `dereference`
    method (as a `pdftools` PDFdocument does).

    Returns the file key computed by `calculateFileKey`.

    Raises NotEncryptedException if the trailer has no 'Encrypt' entry,
    and ValueError if the key derived from `userPassword` does not
    decrypt the stored user hash to the standard password padding.
    """
    # Look for the encryption dictionary first: an unencrypted file need
    # not carry an 'ID' entry at all, and must be reported as
    # "not encrypted" rather than fail with a KeyError.
    try:
        encryptRef = doc.trailer_dict['Encrypt']
    except KeyError:
        raise NotEncryptedException
    encryptDict = doc.dereference(encryptRef)
    # TODO: Check encryption version is ok. (filter/standard/v/1/r/2)

    fileId = doc.trailer_dict['ID'][0] # Is the ID always repeated?

    ownerHash = encryptDict[KEY_OWNER_HASH]
    userHash = encryptDict[KEY_USER_HASH]
    # `permissions` should be "four-byte integer, LSB first."
    permissions = encryptDict[KEY_PERMISSIONS]
    fileKey = calculateFileKey(fileId, ownerHash, userHash,
                               permissions, userPassword)

    # Sanity check user password.  An explicit raise is used instead of
    # `assert` so the check still runs under `python -O`.
    if decrypt(userHash, fileKey) != passwordPad:
        raise ValueError("Incorrect user password")

    return fileKey

def decrypt(text, key):
    """Decrypts the supplied object (as a string) with the supplied key.

    Both `text` and `key` may be byte strings or text strings whose
    characters stand for byte values.

    Returns "plain text" form of object as a string.
    """
    def byteValues(data):
        # Byte strings yield their integer values directly; for text
        # strings each character's ordinal is used.  A real list is
        # returned because `arcfour` indexes and measures its key.
        if isinstance(data, bytes):
            return list(bytearray(data))
        return [ord(c) for c in data]

    return "".join([chr(b)
                    for b in arcfour(byteValues(text), byteValues(key))])

def showDocumentInfo(doc, fileKey):
    """Displays the content of the (optionally encrypted) Document
    Information Dictionary for the supplied PDF document.

    `doc` must provide a `trailer_dict` mapping and a `dereference`
    method; the 'Info' trailer entry must be a reference carrying `obj`
    and `gen` attributes.

    If `fileKey` is empty the values are printed as stored, otherwise
    each value is decrypted with the key derived for the Info object.
    One "field: value" line is printed per entry.
    """
    infoDictRef = doc.trailer_dict['Info']
    objectNumber = infoDictRef.obj
    generationNumber = infoDictRef.gen

    infoDict = doc.dereference(infoDictRef)

    # An object key is only meaningful for an encrypted document.
    if fileKey:
        objectKey = calculateObjectKey(fileKey, objectNumber,
                                       generationNumber)
    else:
        objectKey = None

    for field, storedValue in infoDict.items():
        if objectKey is not None:
            value = decrypt(storedValue, objectKey)
        else:
            value = storedValue
        # Single-argument call form prints identically on Python 2 and 3.
        print("%s: %s" % (field, value))

if __name__ == "__main__":
    # The PDF to inspect is the single required command-line argument.
    try:
        filename = sys.argv[1]
    except IndexError:
        raise SystemExit("Usage %s <filename.pdf>" % sys.argv[0])
    doc = PDFdocument(filename)

    # An empty key tells showDocumentInfo to print the values as stored.
    try:
        fileKey = getFileKey(doc)
    except NotEncryptedException:
        fileKey = ""

    showDocumentInfo(doc, fileKey)

More information about the Python-announce-list mailing list