A routine to calc Shannon's info entropy. I'm open to improvements/suggestions.

Steve D'Aprano steve+python at pearwood.info
Fri Dec 16 00:38:20 EST 2016

On Fri, 16 Dec 2016 09:21 am, DFS wrote:

> Code is based on the formula and results here:
> http://www.shannonentropy.netmark.pl
> Seems to work fine.
> Any suggestions to improve the code?

- Separate the calculation logic from the display logic as much
  as practical.

- Modular programming: use functions to make it easier to test and easier
  to modify the program in the future, e.g. to add command line options.

- Use standard tools where possible.

- Better, more descriptive names.

- Avoid global variables unless really needed.

- More error checking.

- Errors should exit with a non-zero return code, and should print to
  stderr rather than stdout.

You end up with about twice as much code, but hopefully it is easier to
understand, and it should certainly be easier to debug and maintain if you
decide to change it.

# --- cut ---

# only needed in Python 2
from __future__ import division

import sys
import math
import string

from collections import Counter

def fatal_error(errmsg):
    """Report a fatal error on stderr and terminate the program.

    errmsg is the text to print. This function does not return: it
    raises SystemExit with status 1 so the caller's shell sees a failure.
    """
    # Python 3 syntax
    print(errmsg, file=sys.stderr)
    ## Python 2 syntax
    ## print >>sys.stderr, errmsg
    # A fatal error must stop the program with a non-zero return code.
    sys.exit(1)

def display_header(msg):
    """Print a summary line for msg followed by the table column headings."""
    summary = "%s characters in '%s'" % (len(msg), msg)
    heading_lines = (
        summary,
        " #   Char  Freq   Dist    D*log2(D)  H(X) sum",
        "---  ----  ----  ------  ----------  ----------",
    )
    for line in heading_lines:
        print(line)

def display_row(row_number, c, freq, dist, entropy, running_total):
    """Print one formatted table row for the symbol c.

    The columns are, in order: row number, the symbol, its count, its
    relative frequency, its entropy term, and the running entropy total.
    """
    print(
        "%3d  %-4c  %4d  %6.3f  %10.3f  %10.5f"
        % (row_number, c, freq, dist, entropy, running_total)
    )

def entropy(c, freqs, num_symbols):
    """Return the entropy of character c from frequency table freqs.

    freqs maps symbols to their counts and num_symbols is the total
    number of symbols in the message. The result is -f*log2(f), where
    f is the relative frequency of c.

    A symbol with a count of zero (for example, one looked up in a
    Counter that never saw it) contributes 0.0, following the usual
    convention that 0*log(0) is 0, instead of raising ValueError.
    """
    count = freqs[c]
    if count == 0:
        # math.log(0) is undefined; the limit of f*log(f) as f -> 0 is 0.
        return 0.0
    f = count/num_symbols
    return -f*math.log(f, 2)

def display_results(freqs, Hs, num_symbols):
    """Display results including entropy of each symbol.

    freqs maps each symbol to its count, Hs maps each symbol to its
    entropy term, and num_symbols is the total number of symbols in the
    message. One table row is printed per symbol, followed by the
    Shannon entropy and the metric entropy (total / num_symbols).

    Returns the total entropy of the message.
    """
    # Display rows with uppercase first, then lower, then digits.
    upper = sorted(filter(str.isupper, freqs))
    lower = sorted(filter(str.islower, freqs))
    digits = sorted(filter(str.isdigit, freqs))
    # Some alphanumeric symbols are none of the above (e.g. caseless
    # letters); show them last rather than failing or dropping them.
    shown = set(upper + lower + digits)
    others = sorted(c for c in freqs if c not in shown)
    count = 1
    running_total = 0.0
    for chars in (upper, lower, digits, others):
        for c in chars:
            f = freqs[c]
            H = Hs[c]
            running_total += H
            display_row(count, c, f, f/num_symbols, H, running_total)
            count += 1
    total = running_total
    print("The Shannon entropy of your message is %.5f" % total)
    print("The metric entropy of your message is %.5f"% (total/num_symbols))
    return total

def main(args=None):
    """Compute and display the Shannon entropy of a single message.

    args is the list of command-line arguments (excluding the program
    name); when None, sys.argv[1:] is used. Exactly one non-empty,
    alphanumeric argument is expected. On invalid input an error is
    reported via fatal_error() and the program exits with status 1.
    """
    if args is None:
        args = sys.argv[1:]
    if len(args) != 1:
        fatal_error("too many or too few arguments")
        # Never continue past a fatal error, whether or not the
        # reporting helper itself terminates the program.
        sys.exit(1)
    msg = args[0]
    if not msg.isalnum():
        fatal_error("only alphanumeric symbols supported")
        sys.exit(1)
    frequencies = Counter(msg)
    num_symbols = len(msg)
    # Calculate the entropy of each symbol.
    entropies = {}
    for c in frequencies:
        entropies[c] = entropy(c, frequencies, num_symbols)
    # Print the column headings before the per-symbol rows.
    display_header(msg)
    display_results(frequencies, entropies, num_symbols)

if __name__ == "__main__":
    # Only run when module is being used as a script.
    main()

# --- cut ---

“Cheer up,” they said, “things could be worse.” So I cheered up, and sure
enough, things got worse.

More information about the Python-list mailing list