appy.shared.diff: more work.

2011-11-19 23:53:38 +01:00 · 2011-11-19 23:53:38 +01:00 · 10398e770a
commit 10398e770a
parent 8c6301b901
1 changed files with 124 additions and 106 deletions
--- a/shared/diff.py
+++ b/shared/diff.py
@ -86,15 +86,18 @@ class Merger:
        while oldText:
            # Get the overlapping (new) diff.
            newDiff, newDiffStart, isPrevious = self.getNextDiff()
-            if not newDiff:
+            if not newDiff or (newDiffStart >= (self.i + len(oldText))):
-                # No more new diff. So normally, we should find what remains in
+                # No more new diff, or a new diff but far away, not within
-                # oldText at self.lineB[self.i:]
+                # oldText. So insert new the rest of p_oldText.
-                if not self.lineB[self.i:].startswith(oldText):
+                # Invariant: at this point, we should find what remains in
-                    raise 'Error!!!!'
+                # oldText at self.lineB[self.i:].
                res += self.differ.getModifiedChunk(oldText, 'insert', '',
                                                    msg=oldDiff.group(2))
                self.i += len(oldText)
                oldText = ''
                # If we have "popped" a new diff, dump it anyway.
                if newDiff:
                    res = self.dumpDiff(res, newDiff, newDiffStart, isPrevious)
                break
            # Dump the part of the old text that has been untouched by the new
            # diff.
@ -119,47 +122,54 @@ class Merger:
                self.deltaPrevious -= len(newDiff.group(3))
        return res
    def dumpDiff(self, res, diff, diffStart, isPrevious):
        '''Dumps the next p_diff (starting at p_diffStart) to insert into p_res
           and return p_res. If p_isPrevious is True, the diff is an old one
           (from self.lineA); else, it is a new one (from self.lineB).'''
        # Dump the part of lineB between self.i and diffStart
        res += self.lineB[self.i:diffStart]
        self.i = diffStart
        if isPrevious:
            # Dump the old diff (from self.lineA)
            if diff.group(1) == 'insert':
                # Check if the inserted text is still present in lineB
                if self.lineB[self.i:].startswith(diff.group(3)):
                    # Yes. Dump the diff and go ahead within lineB
                    res += diff.group(0)
                    self.i += len(diff.group(3))
                else:
                    # The inserted text can't be found as is in lineB.
                    # Must have been (partly) re-edited or removed.
                    overlap = self.manageOverlap(diff)
                    res += overlap
            elif diff.group(1) == 'delete':
                res += diff.group(0)
        else:
            # Dump the new diff (from self.lineB)
            res += diff.group(0)
            # Move forward within self.lineB
            self.i += len(diff.group(0))
            # Because of this new diff, all indexes computed on lineA are
            # now wrong because we express them relative to lineB. So:
            # update self.deltaPrevious to take this into account.
            self.deltaPrevious += len(diff.group(0))
            if diff.group(1) == 'delete':
                # The indexes in self.lineA do not take the deleted text into
                # account, because it wasn't deleted at this time. So remove
                # from self.deltaPrevious the length of removed text.
                self.deltaPrevious -= len(diff.group(3))
        return res
    def merge(self):
        '''Merges self.previousDiffs into self.lineB.'''
        res = ''
        diff, diffStart, isPrevious = self.getNextDiff()
        while diff:
-            # Dump the part of lineB between self.i and diffStart
+            res = self.dumpDiff(res, diff, diffStart, isPrevious)
-            res += self.lineB[self.i:diffStart]
+            # Load the next diff, if any
            self.i = diffStart
            if isPrevious:
                if diff.group(1) == 'insert':
                    # Check if the inserted text is still present in lineB
                    if self.lineB[self.i:].startswith(diff.group(3)):
                        # Yes. Dump the diff and go ahead within lineB
                        res += diff.group(0)
                        self.i += len(diff.group(3))
                    else:
                        # The inserted text can't be found as is in lineB.
                        # Must have been (partly) re-edited or removed.
                        overlap = self.manageOverlap(diff)
                        res += overlap
                elif diff.group(1) == 'delete':
                    res += diff.group(0)
            else:
                # Dump the diff and update self.i
                res += diff.group(0)
                self.i += len(diff.group(0))
                # Because of this new diff, all indexes computed on lineA are
                # now wrong because we express them relative to lineB. So:
                # update self.deltaPrevious to take this into account.
                self.deltaPrevious += len(diff.group(0))
                if diff.group(1) == 'delete':
                    # The indexes in lineA do not take the deleted text into
                    # account, because it wasn't deleted at this time. So remove
                    # from self.deltaPrevious the length of removed text.
                    self.deltaPrevious -= len(diff.group(3))
            # Load next diff
            diff, diffStart, isPrevious = self.getNextDiff()
        # Dump the end of self.lineB if not completely consumed
-        if self.i < len(self.lineB):
+        if self.i < len(self.lineB): res += self.lineB[self.i:]
            res += self.lineB[self.i:]
        return res
 # ------------------------------------------------------------------------------
@ -405,12 +415,14 @@ class HtmlDiff:
        return res
    garbage = ('', '\r')
-    def removeGarbage(self, l):
+    def removeGarbage(self, l, sep):
        '''Removes from list p_l elements that have no interest, like blank
-           strings or considered as is.'''
+           strings or considered as is. Also: strip lines (ie, if sep is a
           carriage return.'''
        i = len(l)-1
        while i >= 0:
            if l[i] in self.garbage: del l[i]
            if sep == '\n': l[i] = l[i].strip()
            i -= 1
        return l
@ -444,33 +456,64 @@ class HtmlDiff:
            if old[jo] != new[jn]: diffFound=True
        return i, jo+1, jn+1
-    def getReplacement(self, sep, lineA, lineB, previousDiffsA, outerTagA):
+    def getDumpPrefix(self, res, add, previousAdd, sep):
-        '''p_lineA has been replaced with p_lineB. Here, we will investigate
+        '''In most cases, when concatenating the next diff (p_add) to the
-           further here and explore differences at the *word* level between
+           global result (p_res), I must prefix it with p_sep (excepted if p_res
-           p_lineA and p_lineB.
+           is still empty). But when p_sep is a space, no space must be inserted
           between 2 adjacent updates (p_add and p_previousAdd), because such a
           space was not in the original version. This method computes the
           prefix, that can thus be empty if this latter case is met.'''
        prefix = ''
        if not res: return prefix
        if (sep == ' ') and previousAdd and \
           previousAdd.endswith('</span>') and add.startswith('<span'):
            pass
        else:
            prefix = sep
        return prefix
-           p_previousDiffsA may contain a series of updates (inserts, deletions)
+    def getReplacement(self, chunkA, chunkB, sep):
-           that have already been performed on p_lineA.
+        '''p_chunkA has been replaced with p_chunkB. Compute this update and
           return it.'''
        res = ''
        # We know that some lines have been replaced from chunkA to chunkB. By
        # identifying similarities between those lines, consider some as having
        # been deleted, modified or inserted.
        previousAdd = None
        for action, line in self.getSeqDiff(chunkA, chunkB, sep):
            add = None
            if action in ('insert', 'delete'):
                add = self.getModifiedChunk(line, action, sep)
            elif action == 'equal':
                add = line
            elif action == 'replace':
                lineA, lineB, previousDiffsA, outerTagA = line
                # lineA has been replaced with lineB. Here, we will investigate
                # further here and explore differences at the *word* level
                # between lineA and lineB. previousDiffsA may contain a series
                # of updates (inserts, deletions) that have already been
                # performed on lineA. If lineA was a previously inserted line,
                # lineA comes without his outer tag, that lies in outerTagA
                # (as a re.MatchObject instance computed from regex htmlTag).
                # In that case, we will wrap the result with that tag.
-           If p_lineA was a previously inserted line, p_lineA comes without his
+                # As a preamble, and in order to restrict annoyances due to the
-           outer tag, that lies in p_outerTagA (as a re.MatchObject instance
+                # presence of XHTML tags, we will remove start and end tags
-           computed from regex htmlTag). In that case, we will wrap the result
+                # from lineA and lineB if present.
-           with that tag.'''
+                i, ja, jb = self.getStringDiff(lineA, lineB)
-        # As a preamble, and in order to restrict annoyances due to the presence
+                diff = self.getHtmlDiff(lineA[i:ja], lineB[i:jb], ' ')
-        # of XHTML tags, we will remove start and end tags from p_lineA and
+                add = lineB[:i] + diff + lineB[jb:]
-        # p_lineB if present.
+                # Merge potential previous inner diff tags that were found (but
-        i, ja, jb = self.getStringDiff(lineA, lineB)
+                # extracted from) lineA.
-        diff = self.getHtmlDiff(lineA[i:ja], lineB[i:jb], ' ')
+                if previousDiffsA:
-        res = lineB[:i] + diff + lineB[jb:]
+                    merger = Merger(lineA, add, previousDiffsA, self)
-        # Merge potential previous inner diff tags that were found (but
+                    add = merger.merge()
-        # extracted from) lineA.
+                # Rewrap line into outerTagA if lineA was a line tagged as
-        if previousDiffsA:
+                # previously inserted.
-            merger = Merger(lineA, res, previousDiffsA, self)
+                if outerTagA:
-            res = merger.merge()
+                    add = self.computeTag(outerTagA, add)
-        # Rewrap line into outerTagA if lineA was a line tagged as previously
+            if add: res += self.getDumpPrefix(res, add, previousAdd, sep) + add
-        # inserted.
+            previousAdd = add
        if outerTagA:
            res = self.computeTag(outerTagA, res)
        return res
    def getHtmlDiff(self, old, new, sep):
@ -482,65 +525,40 @@ class HtmlDiff:
           word-by-word comparison within 2 lines that have been detected as
           similar in a previous call to m_getHtmlDiff with sep=carriage
           return.'''
-        res = []
+        res = ''
        a = self.split(old, sep)
        b = self.split(new, sep)
        matcher = difflib.SequenceMatcher()
        matcher.set_seqs(a, b)
        previousAdd = None
        for action, i1, i2, j1, j2 in matcher.get_opcodes():
            add = None
            # When sep is a space, we need to remember if we are dealing with
            # the last diff within the line or not.
-            chunkA = self.removeGarbage(a[i1:i2])
+            chunkA = self.removeGarbage(a[i1:i2], sep)
-            chunkB = self.removeGarbage(b[j1:j2])
+            chunkB = self.removeGarbage(b[j1:j2], sep)
            toAdd = None
            if action == 'equal':
-                if chunkA: toAdd = sep.join(chunkA)
+                if chunkA: add = sep.join(chunkA)
            elif action == 'insert':
                if chunkB:
-                    toAdd = self.getModifiedChunk(chunkB, action, sep)
+                    add = self.getModifiedChunk(chunkB, action, sep)
            elif action == 'delete':
                if chunkA:
-                    toAdd = self.getModifiedChunk(chunkA, action, sep)
+                    add = self.getModifiedChunk(chunkA, action, sep)
            elif action == 'replace':
                if not chunkA and not chunkB:
-                    toAdd = ''
+                    pass
                elif not chunkA:
                    # Was an addition, not a replacement
-                    toAdd = self.getModifiedChunk(chunkB, 'insert', sep)
+                    add = self.getModifiedChunk(chunkB, 'insert', sep)
                elif not chunkB:
                    # Was a deletion, not a replacement
-                    toAdd = self.getModifiedChunk(chunkA, 'delete', sep)
+                    add = self.getModifiedChunk(chunkA, 'delete', sep)
                else: # At least, a true replacement
-                    toAdd = ''
+                    add = self.getReplacement(chunkA, chunkB, sep)
-                    # We know that some lines have been replaced from a to b.
+            if add: res += self.getDumpPrefix(res, add, previousAdd, sep) + add
-                    # By identifying similarities between those lines, consider
+            previousAdd = add
-                    # some as having been deleted, modified or inserted.
+        return res
                    previousAdd = None
                    for sAction, line in self.getSeqDiff(chunkA, chunkB, sep):
                        if sAction in ('insert', 'delete'):
                            add = self.getModifiedChunk(line, sAction, sep)
                        elif sAction == 'equal':
                            add = line
                        elif sAction == 'replace':
                            add = self.getReplacement(sep, *line)
                        # In most cases, I must prefix "add" with "sep" before
                        # concatenating it to "toAdd" (excepted if toAdd is
                        # still empty). But when "sep" is a space, no space
                        # must be inserted between 2 adjacent updates, because
                        # such a space was not in the original version.
                        prefix = ''
                        if toAdd:
                            if (sep == ' ') and previousAdd and \
                               previousAdd.endswith('</span>') and \
                               add.startswith('<span'):
                                pass
                            else:
                                prefix = sep
                        toAdd += prefix + add
                        if sep == ' ':
                        previousAdd = add
            if toAdd: res.append(toAdd)
        return sep.join(res)
    def get(self):
        '''Produces the result.'''