A routine to calc Shannon's info entropy. I'm open to improvements/suggestions.
Steve D'Aprano
steve+python at pearwood.info
Fri Dec 16 00:38:20 EST 2016
On Fri, 16 Dec 2016 09:21 am, DFS wrote:
> Code is based on the formula and results here:
> http://www.shannonentropy.netmark.pl
[...]
> Seems to work fine.
>
> Any suggestions to improve the code?
- Separate the calculation logic from the display logic as much
as practical.
- Modular programming: use functions to make it easier to test and easier
to modify the program in the future, e.g. to add command line options.
- Use standard tools where possible.
- Better, more descriptive names.
- Avoid global variables unless really needed.
- More error checking.
- Errors should exit with a non-zero return code, and should print to
stderr rather than stdout.
You end up with about twice as much code, but hopefully it is easier to
understand, and it should certainly be easier to debug and maintain if you
decide to change it.
# --- cut ---
# only needed in Python 2
from __future__ import division
import sys
import math
import string
from collections import Counter
def fatal_error(errmsg):
    """Report errmsg on standard error, then exit with status 1."""
    # Errors go to stderr so they stay separate from the normal output.
    sys.stderr.write("%s\n" % (errmsg,))
    raise SystemExit(1)
def display_header(msg):
    """Print the message summary and the column headings of the table.

    The output is a blank line, a line giving the length of msg and msg
    itself, another blank line, and then the two heading lines.
    """
    print()
    summary = "%s characters in '%s'" % (len(msg), msg)
    # The extra newline produces the blank line that follows the summary.
    print(summary + "\n")
    table_headings = (
        " # Char Freq Dist D*log2(D) H(X) sum",
        "--- ---- ---- ------ ---------- ----------",
    )
    for heading in table_headings:
        print(heading)
def display_row(row_number, c, freq, dist, entropy, running_total):
    """Print one table row for the character c.

    Each value is formatted to a fixed width and the columns are separated
    by single spaces: row number, character, frequency, distribution,
    entropy of the character, and the running entropy total.
    """
    columns = (
        "%3d" % row_number,
        "%-4c" % c,
        "%4d" % freq,
        "%6.3f" % dist,
        "%10.3f" % entropy,
        "%10.5f" % running_total,
    )
    print(" ".join(columns))
def entropy(c, freqs, num_symbols):
    """Return the entropy of character c from frequency table freqs.

    Parameters:
        c: the symbol to look up.
        freqs: mapping from symbol to its number of occurrences, for
            example a collections.Counter of the message.
        num_symbols: total number of symbols in the message.

    Returns:
        -p * log2(p) as a float, where p = freqs[c] / num_symbols.
        The result is 0.0 when c never occurs or when c makes up the
        whole message.
    """
    count = freqs[c]
    # A Counter reports 0 for a symbol it has never seen; log(0) would raise
    # ValueError, and by convention such a symbol contributes no entropy.
    # When c is the only symbol, p == 1 and the formula yields -0.0, which
    # would be displayed as "-0.000"; return a plain zero instead.
    if count == 0 or count == num_symbols:
        return 0.0
    proportion = count / num_symbols
    return -proportion * math.log(proportion, 2)
def display_results(freqs, Hs, num_symbols):
    """Display results including entropy of each symbol.

    Parameters:
        freqs: mapping from each symbol to its number of occurrences.
        Hs: mapping from each symbol to its entropy.
        num_symbols: total number of symbols in the message.

    Prints one row per symbol via display_row, followed by the total
    entropy and the total divided by num_symbols.

    Returns the total entropy of the message.
    """
    # Display rows with uppercase first, then lower, then digits.
    upper = sorted(c for c in freqs if c.isupper())
    lower = sorted(c for c in freqs if c.islower())
    digits = sorted(c for c in freqs if c.isdigit())
    # Some alphanumeric characters are neither cased nor digits (caseless
    # letters, vulgar fractions, ...).  Show them last rather than failing
    # an assertion, or dropping them from the total when run with -O.
    classified = set(upper + lower + digits)
    others = sorted(c for c in freqs if c not in classified)

    running_total = 0.0
    ordered = upper + lower + digits + others
    for row_number, c in enumerate(ordered, 1):
        f = freqs[c]
        H = Hs[c]
        running_total += H
        display_row(row_number, c, f, f/num_symbols, H, running_total)
    total = running_total
    print()
    print("The Shannon entropy of your message is %.5f" % total)
    print("The metric entropy of your message is %.5f"% (total/num_symbols))
    return total
def main(args=None):
    """Run the entropy report for a single message.

    args is the list of command line arguments; when it is None the
    arguments are taken from sys.argv, excluding the program name.
    Exactly one alphanumeric argument is expected; anything else is
    reported through fatal_error.
    """
    argv = sys.argv[1:] if args is None else args
    if len(argv) != 1:
        fatal_error("too many or too few arguments")
    msg = argv[0]
    if not msg.isalnum():
        fatal_error("only alphanumeric symbols supported")

    display_header(msg)
    frequencies = Counter(msg)
    num_symbols = len(msg)
    # Entropy of every distinct symbol, keyed by the symbol itself.
    entropies = {
        symbol: entropy(symbol, frequencies, num_symbols)
        for symbol in frequencies
    }
    display_results(frequencies, entropies, num_symbols)
# Script entry point: importing this module must not start a run.
if __name__ == "__main__":
    main()
# --- cut ---
--
Steve
“Cheer up,” they said, “things could be worse.” So I cheered up, and sure
enough, things got worse.
More information about the Python-list
mailing list