Teaching difflib.Differ new tricks

Fri Aug 16 03:39:04 EDT 2002

In the past I have needed to create diff reports in HTML, so I hacked
Tools/scripts/ndiff.py and later difflib.py but I never liked the
result.  So, I sat down and overwrote much of difflib.Differ with a
new API to allow more programmatic control of what happens when
insertions/deletions/changes are detected.

The result is below: BaseDiffer() provides the new base API,
RetroDiffer() re-implements dlfflib.Differ() in this new API for
backwards testing and HTMLDiffer() creates HTML diffs (you still need
to add HTML headers and footers for things like CSS, but that's an
exercise left to the reader).

I'd be delighted to see difflib.Differ re-factored in a similar vein,
as I had to copy/paste a good bit of Tim's code to get this to work
properly.  A diff-engine is a very handy thing to have; imagine RTF
diff reports, or XML reports or programmatic behavior based only on
text deltas (my next application will be parsing only additions to an
HTML page).

Hope someone else finds it useful,

Costas

---8<--------------------------------------------------------------------------
from difflib import Differ, IS_CHARACTER_JUNK, IS_LINE_JUNK,
_count_leading, SequenceMatcher

TRACE = 0

class BaseDiffer(Differ):
    def _dump(self, tag, lines, lo, hi):
        if tag == "-":
            self.on_delete(lines, lo, hi)
        elif tag == "+":
            self.on_insert(lines, lo, hi)
        elif tag == " ":
            self.on_equal(lines, lo, hi)
        else:
            raise ValueError, 'unknown tag ' + `tag`

    def _qformat(self, aline, bline, atags, btags):
        #
        # Shouldn't be called by BaseDiffer
        #
        pass

    def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
        r"""
        When replacing one block of lines with another, search the
blocks
        for *similar* lines; the best-matching pair (if any) is used
as a
        synch point, and intraline difference marking is done on the
        similar pair. Lots of work, but often worth it.

        Example:

        >>> d = Differ()
        >>> d._fancy_replace(['abcDefghiJkl\n'], 0, 1,
['abcdefGhijkl\n'], 0, 1)
        >>> print ''.join(d.results),
        - abcDefghiJkl
        ?    ^  ^  ^
        + abcdefGhijkl
        ?    ^  ^  ^
        """
        ##
        ## Same as the Differ's with the exception of the inner loop
below
        ## 
        if TRACE:
            self.results.append('*** _fancy_replace %s %s %s %s\n'
                                % (alo, ahi, blo, bhi))
            self._dump('>', a, alo, ahi)
            self._dump('<', b, blo, bhi)

        # don't synch up unless the lines have a similarity score of
at
        # least cutoff; best_ratio tracks the best score seen so far
        best_ratio, cutoff = 0.74, 0.75
        cruncher = SequenceMatcher(self.charjunk)
        eqi, eqj = None, None   # 1st indices of equal lines (if any)

        # search for the pair that matches best without being
identical
        # (identical lines must be junk lines, & we don't want to
synch up
        # on junk -- unless we have to)
        for j in xrange(blo, bhi):
            bj = b[j]
            cruncher.set_seq2(bj)
            for i in xrange(alo, ahi):
                ai = a[i]
                if ai == bj:
                    if eqi is None:
                        eqi, eqj = i, j
                    continue
                cruncher.set_seq1(ai)
                # computing similarity is expensive, so use the quick
                # upper bounds first -- have seen this speed up messy
                # compares by a factor of 3.
                # note that ratio() is only expensive to compute the
first
                # time it's called on a sequence pair; the expensive
part
                # of the computation is cached by cruncher
                if cruncher.real_quick_ratio() > best_ratio and \
                      cruncher.quick_ratio() > best_ratio and \
                      cruncher.ratio() > best_ratio:
                    best_ratio, best_i, best_j = cruncher.ratio(), i,
j
        if best_ratio < cutoff:
            # no non-identical "pretty close" pair
            if eqi is None:
                # no identical pair either -- treat it as a straight
replace
                self._plain_replace(a, alo, ahi, b, blo, bhi)
                return
            # no close pair, but an identical pair -- synch up on that
            best_i, best_j, best_ratio = eqi, eqj, 1.0
        else:
            # there's a close pair, so forget the identical pair (if
any)
            eqi = None

        # a[best_i] very similar to b[best_j]; eqi is None iff they're
not
        # identical
        if TRACE:
            self.results.append('*** best_ratio %s %s %s %s\n'
                                % (best_ratio, best_i, best_j))
            self._dump('>', a, best_i, best_i+1)
            self._dump('<', b, best_j, best_j+1)

        # pump out diffs from before the synch point
        self._fancy_helper(a, alo, best_i, b, blo, best_j)

        # do intraline marking on the synch pair
        aelt, belt = a[best_i], b[best_j]
        if eqi is None:
            # pump out a '-', '?', '+', '?' quad for the synched lines
            atags = btags = ""
            cruncher.set_seqs(aelt, belt)
            for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
                la, lb = ai2 - ai1, bj2 - bj1
                if tag == 'replace':
                    self.on_ch_change(aelt, ai1, ai2, belt, bj1, bj2)
                    #atags += '^' * la
                    #btags += '^' * lb
                elif tag == 'delete':
                    self.on_ch_delete(aelt, ai1, ai2)
                    #atags += '-' * la
                elif tag == 'insert':
                    self.on_ch_insert(belt, bj1, bj2)
                    #btags += '+' * lb
                elif tag == 'equal':
                    self.on_ch_equal(aelt, ai1, ai2, belt, bj1, bj2)
                    #atags += ' ' * la
                    #btags += ' ' * lb
                else:
                    raise ValueError, 'unknown tag ' + `tag`
            #self._qformat(aelt, belt, atags, btags)
            self.on_change(aelt, belt)
        else:
            # the synch pair is identical
            #self.results.append('  ' + aelt)
            self.on_equal([aelt], 0, 1)

        # pump out diffs from after the synch point
        self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)

    def on_delete(self, lines, lo, hi):
        pass

    def on_insert(self, lines, lo, hi):
        pass

    def on_equal(self, lines, lo, hi):
        pass

    def on_change(self, aline, bline):
        pass

    def on_ch_change(self, aline, alo, ahi, bline, blo, bhi):
        pass

    def on_ch_insert(self, line, lo, hi):
        pass

    def on_ch_delete(self, line, lo, hi):
        pass

    def on_ch_equal(self, aline, alo, ahi, bline, blo, bhi):
        pass

class RetroDiffer(BaseDiffer):
    """
    This class behaves just like the base difflib.Differ, using the
BaseDiffer API.
    """
    def __init__(self, linejunk=None, charjunk=None):
        BaseDiffer.__init__(self, linejunk, charjunk)
        self._intraline1tags = ''
        self._intraline2tags = ''

    _qformat = Differ._qformat

    def on_delete(self, lines, lo, hi):
        for i in range(lo, hi):
            self.results.append('%s %s' % ("-", lines[i]))

    def on_insert(self, lines, lo, hi):
        for i in range(lo, hi):
            self.results.append('%s %s' % ("+", lines[i]))

    def on_equal(self, lines, lo, hi):
        for i in range(lo, hi):
            self.results.append('%s %s' % (" ", lines[i]))

    def on_change(self, aline, bline):
        Differ._qformat(self, aline, bline, self._intraline1tags,
self._intraline2tags)
        self._intraline1tags = ''
        self._intraline2tags = ''

    def on_ch_change(self, aline, alo, ahi, bline, blo, bhi):
        self._intraline1tags += '^' * (ahi-alo)
        self._intraline2tags += '^' * (bhi-blo)

    def on_ch_insert(self, line, lo, hi):
        self._intraline2tags += '+' * (hi-lo)

    def on_ch_delete(self, line, lo, hi):
        self._intraline1tags += '-' * (hi-lo)

    def on_ch_equal(self, aline, alo, ahi, bline, blo, bhi):
        self._intraline1tags += ' ' * (ahi-alo)
        self._intraline2tags += ' ' * (bhi-blo)

class HTMLDiffer(BaseDiffer):
    def __init__(self, *params, **opts):
        BaseDiffer.__init__(self, *params, **opts)
        self._intraline = ''

    def on_delete(self, lines, lo, hi):
        for i in range(lo, hi):
            self.results.append("<b class='deleted'>%s</b>" %
lines[i])

    def on_insert(self, lines, lo, hi):
        for i in range(lo, hi):
            self.results.append("<b class='inserted'>%s</b>" %
lines[i])

    def on_equal(self, lines, lo, hi):
        self.results += lines[lo:hi]

    def on_ch_change(self, aline, alo, ahi, bline, blo, bhi):
        self._intraline += "<b class='changed'>%s</b>" %
bline[blo:bhi]

    def on_ch_insert(self, line, lo, hi):
        self._intraline += "<b class='inserted'>%s</b>" % line[lo:hi]

    def on_ch_delete(self, line, lo, hi):
        self._intraline += "<b class='deleted'>%s</b>" % line[lo:hi]

    def on_ch_equal(self, aline, alo, ahi, bline, blo, bhi):        
        self._intraline += bline[blo:bhi]

    def on_change(self, aline, bline):
        self.results.append(self._intraline[:])
        self._intraline = ''

def ndiff(a, b, linejunk=IS_LINE_JUNK, charjunk=IS_CHARACTER_JUNK,
html=0):
    if html:
        return HTMLDiffer(linejunk, charjunk).compare(a, b)
    else:    
        return RetroDiffer(linejunk, charjunk).compare(a, b)