Teaching difflib.Differ new tricks
Costas Malamas
costasm at hotmail.com
Fri Aug 16 03:39:04 EDT 2002
In the past I have needed to create diff reports in HTML, so I hacked
Tools/scripts/ndiff.py and later difflib.py but I never liked the
result. So, I sat down and overwrote much of difflib.Differ with a
new API to allow more programmatic control of what happens when
insertions/deletions/changes are detected.
The result is below: BaseDiffer() provides the new base API,
RetroDiffer() re-implements dlfflib.Differ() in this new API for
backwards testing and HTMLDiffer() creates HTML diffs (you still need
to add HTML headers and footers for things like CSS, but that's an
exercise left to the reader).
I'd be delighted to see difflib.Differ re-factored in a similar vein,
as I had to copy/paste a good bit of Tim's code to get this to work
properly. A diff-engine is a very handy thing to have; imagine RTF
diff reports, or XML reports or programmatic behavior based only on
text deltas (my next application will be parsing only additions to an
HTML page).
Hope someone else finds it useful,
Costas
---8<--------------------------------------------------------------------------
from difflib import Differ, IS_CHARACTER_JUNK, IS_LINE_JUNK,
_count_leading, SequenceMatcher
TRACE = 0
class BaseDiffer(Differ):
def _dump(self, tag, lines, lo, hi):
if tag == "-":
self.on_delete(lines, lo, hi)
elif tag == "+":
self.on_insert(lines, lo, hi)
elif tag == " ":
self.on_equal(lines, lo, hi)
else:
raise ValueError, 'unknown tag ' + `tag`
def _qformat(self, aline, bline, atags, btags):
#
# Shouldn't be called by BaseDiffer
#
pass
def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
r"""
When replacing one block of lines with another, search the
blocks
for *similar* lines; the best-matching pair (if any) is used
as a
synch point, and intraline difference marking is done on the
similar pair. Lots of work, but often worth it.
Example:
>>> d = Differ()
>>> d._fancy_replace(['abcDefghiJkl\n'], 0, 1,
['abcdefGhijkl\n'], 0, 1)
>>> print ''.join(d.results),
- abcDefghiJkl
? ^ ^ ^
+ abcdefGhijkl
? ^ ^ ^
"""
##
## Same as the Differ's with the exception of the inner loop
below
##
if TRACE:
self.results.append('*** _fancy_replace %s %s %s %s\n'
% (alo, ahi, blo, bhi))
self._dump('>', a, alo, ahi)
self._dump('<', b, blo, bhi)
# don't synch up unless the lines have a similarity score of
at
# least cutoff; best_ratio tracks the best score seen so far
best_ratio, cutoff = 0.74, 0.75
cruncher = SequenceMatcher(self.charjunk)
eqi, eqj = None, None # 1st indices of equal lines (if any)
# search for the pair that matches best without being
identical
# (identical lines must be junk lines, & we don't want to
synch up
# on junk -- unless we have to)
for j in xrange(blo, bhi):
bj = b[j]
cruncher.set_seq2(bj)
for i in xrange(alo, ahi):
ai = a[i]
if ai == bj:
if eqi is None:
eqi, eqj = i, j
continue
cruncher.set_seq1(ai)
# computing similarity is expensive, so use the quick
# upper bounds first -- have seen this speed up messy
# compares by a factor of 3.
# note that ratio() is only expensive to compute the
first
# time it's called on a sequence pair; the expensive
part
# of the computation is cached by cruncher
if cruncher.real_quick_ratio() > best_ratio and \
cruncher.quick_ratio() > best_ratio and \
cruncher.ratio() > best_ratio:
best_ratio, best_i, best_j = cruncher.ratio(), i,
j
if best_ratio < cutoff:
# no non-identical "pretty close" pair
if eqi is None:
# no identical pair either -- treat it as a straight
replace
self._plain_replace(a, alo, ahi, b, blo, bhi)
return
# no close pair, but an identical pair -- synch up on that
best_i, best_j, best_ratio = eqi, eqj, 1.0
else:
# there's a close pair, so forget the identical pair (if
any)
eqi = None
# a[best_i] very similar to b[best_j]; eqi is None iff they're
not
# identical
if TRACE:
self.results.append('*** best_ratio %s %s %s %s\n'
% (best_ratio, best_i, best_j))
self._dump('>', a, best_i, best_i+1)
self._dump('<', b, best_j, best_j+1)
# pump out diffs from before the synch point
self._fancy_helper(a, alo, best_i, b, blo, best_j)
# do intraline marking on the synch pair
aelt, belt = a[best_i], b[best_j]
if eqi is None:
# pump out a '-', '?', '+', '?' quad for the synched lines
atags = btags = ""
cruncher.set_seqs(aelt, belt)
for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
la, lb = ai2 - ai1, bj2 - bj1
if tag == 'replace':
self.on_ch_change(aelt, ai1, ai2, belt, bj1, bj2)
#atags += '^' * la
#btags += '^' * lb
elif tag == 'delete':
self.on_ch_delete(aelt, ai1, ai2)
#atags += '-' * la
elif tag == 'insert':
self.on_ch_insert(belt, bj1, bj2)
#btags += '+' * lb
elif tag == 'equal':
self.on_ch_equal(aelt, ai1, ai2, belt, bj1, bj2)
#atags += ' ' * la
#btags += ' ' * lb
else:
raise ValueError, 'unknown tag ' + `tag`
#self._qformat(aelt, belt, atags, btags)
self.on_change(aelt, belt)
else:
# the synch pair is identical
#self.results.append(' ' + aelt)
self.on_equal([aelt], 0, 1)
# pump out diffs from after the synch point
self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)
def on_delete(self, lines, lo, hi):
pass
def on_insert(self, lines, lo, hi):
pass
def on_equal(self, lines, lo, hi):
pass
def on_change(self, aline, bline):
pass
def on_ch_change(self, aline, alo, ahi, bline, blo, bhi):
pass
def on_ch_insert(self, line, lo, hi):
pass
def on_ch_delete(self, line, lo, hi):
pass
def on_ch_equal(self, aline, alo, ahi, bline, blo, bhi):
pass
class RetroDiffer(BaseDiffer):
"""
This class behaves just like the base difflib.Differ, using the
BaseDiffer API.
"""
def __init__(self, linejunk=None, charjunk=None):
BaseDiffer.__init__(self, linejunk, charjunk)
self._intraline1tags = ''
self._intraline2tags = ''
_qformat = Differ._qformat
def on_delete(self, lines, lo, hi):
for i in range(lo, hi):
self.results.append('%s %s' % ("-", lines[i]))
def on_insert(self, lines, lo, hi):
for i in range(lo, hi):
self.results.append('%s %s' % ("+", lines[i]))
def on_equal(self, lines, lo, hi):
for i in range(lo, hi):
self.results.append('%s %s' % (" ", lines[i]))
def on_change(self, aline, bline):
Differ._qformat(self, aline, bline, self._intraline1tags,
self._intraline2tags)
self._intraline1tags = ''
self._intraline2tags = ''
def on_ch_change(self, aline, alo, ahi, bline, blo, bhi):
self._intraline1tags += '^' * (ahi-alo)
self._intraline2tags += '^' * (bhi-blo)
def on_ch_insert(self, line, lo, hi):
self._intraline2tags += '+' * (hi-lo)
def on_ch_delete(self, line, lo, hi):
self._intraline1tags += '-' * (hi-lo)
def on_ch_equal(self, aline, alo, ahi, bline, blo, bhi):
self._intraline1tags += ' ' * (ahi-alo)
self._intraline2tags += ' ' * (bhi-blo)
class HTMLDiffer(BaseDiffer):
def __init__(self, *params, **opts):
BaseDiffer.__init__(self, *params, **opts)
self._intraline = ''
def on_delete(self, lines, lo, hi):
for i in range(lo, hi):
self.results.append("<b class='deleted'>%s</b>" %
lines[i])
def on_insert(self, lines, lo, hi):
for i in range(lo, hi):
self.results.append("<b class='inserted'>%s</b>" %
lines[i])
def on_equal(self, lines, lo, hi):
self.results += lines[lo:hi]
def on_ch_change(self, aline, alo, ahi, bline, blo, bhi):
self._intraline += "<b class='changed'>%s</b>" %
bline[blo:bhi]
def on_ch_insert(self, line, lo, hi):
self._intraline += "<b class='inserted'>%s</b>" % line[lo:hi]
def on_ch_delete(self, line, lo, hi):
self._intraline += "<b class='deleted'>%s</b>" % line[lo:hi]
def on_ch_equal(self, aline, alo, ahi, bline, blo, bhi):
self._intraline += bline[blo:bhi]
def on_change(self, aline, bline):
self.results.append(self._intraline[:])
self._intraline = ''
def ndiff(a, b, linejunk=IS_LINE_JUNK, charjunk=IS_CHARACTER_JUNK,
html=0):
if html:
return HTMLDiffer(linejunk, charjunk).compare(a, b)
else:
return RetroDiffer(linejunk, charjunk).compare(a, b)
More information about the Python-list
mailing list