From 8c6301b9012b7160e35eb331f0573065dcbde3d3 Mon Sep 17 00:00:00 2001
From: Gaetan Delannay <gaetan.delannay@gmail.com>
Date: Sat, 19 Nov 2011 11:48:03 +0100
Subject: [PATCH] appy.shared.diff: more work.

---
 shared/diff.py | 116 +++++++++++++++++++++++++++++--------------------
 1 file changed, 68 insertions(+), 48 deletions(-)

diff --git a/shared/diff.py b/shared/diff.py
index 74f76ea..4a04eea 100644
--- a/shared/diff.py
+++ b/shared/diff.py
@@ -90,13 +90,7 @@ class Merger:
                 # No more new diff. So normally, we should find what remains in
                 # oldText at self.lineB[self.i:]
                 if not self.lineB[self.i:].startswith(oldText):
-                    # Anormal additional char. Probably a space? Indeed,
-                    # word-level comparisons imply split(' ') which can be
-                    # error-prone.
-                    res += self.lineB[self.i]
-                    self.i += 1
-                    if not self.lineB[self.i:].startswith(oldText):
-                        raise 'Error!!!!'
+                    raise 'Error!!!!'
                 res += self.differ.getModifiedChunk(oldText, 'insert', '',
                                                     msg=oldDiff.group(2))
                 self.i += len(oldText)
@@ -128,9 +122,6 @@ class Merger:
     def merge(self):
         '''Merges self.previousDiffs into self.lineB.'''
         res = ''
-        print 'MERGE'
-        print 'Line A', self.lineA
-        print 'Line B', self.lineB
         diff, diffStart, isPrevious = self.getNextDiff()
         while diff:
             # Dump the part of lineB between self.i and diffStart
@@ -218,7 +209,9 @@ class HtmlDiff:
            string) is a chunk that was either inserted (p_type='insert') or
            deleted (p_type='delete'). This method will surround this part with
            a div or span tag that will get some CSS class allowing to highlight
-           the update. If p_msg is given, it will be used instead of the default
+           the update.
+
+           If p_msg is given, it will be used instead of the default
            p_type-related message stored on p_self.'''
         # Will the surrouding tag be a div or a span?
         if sep == '\n': tag = 'div'
@@ -233,7 +226,7 @@ class HtmlDiff:
         else:
             exec 'style = self.%sStyle' % type
             style = 'style="%s"' % style
-        # the 'name' attribute of the tag indicates the type of the update.
+        # The 'name' attribute of the tag indicates the type of the update.
         exec 'tagName = self.%sName' % type
         # The idea is: if there are several lines, every line must be surrounded
         # by a tag. This way, we know that a surrounding tag can't span several
@@ -421,7 +414,36 @@ class HtmlDiff:
             i -= 1
         return l
 
-    nextSeps = {'\n': ' ', ' ': ''}
+    def getStringDiff(self, old, new):
+        '''Identifies the differences between strings p_old and p_new by
+           computing:
+           * i = the end index of the potential common starting part (if no
+                 common part is found, i=0);
+           * jo = the start index in p_old of the potential common ending part;
+           * jn = the start index in p_new of the potential common ending part.
+        '''
+        # Compute i
+        i = -1
+        diffFound = False
+        while not diffFound:
+            i += 1
+            if (i == len(old)) or (i == len(new)): break
+            if old[i] != new[i]: diffFound = True
+        # Compute jo and jn
+        jo = len(old)
+        jn = len(new)
+        diffFound = False
+        while not diffFound:
+            if (jo == i) or (jn == i):
+                # We have reached the end of substring old[i:] or new[i:]
+                jo -=1
+                jn -= 1
+                break
+            jo -= 1
+            jn -= 1
+            if old[jo] != new[jn]: diffFound=True
+        return i, jo+1, jn+1
+
     def getReplacement(self, sep, lineA, lineB, previousDiffsA, outerTagA):
         '''p_lineA has been replaced with p_lineB. Here, we will investigate
            further here and explore differences at the *word* level between
@@ -437,18 +459,11 @@ class HtmlDiff:
         # As a preamble, and in order to restrict annoyances due to the presence
         # of XHTML tags, we will remove start and end tags from p_lineA and
         # p_lineB if present.
-        matchA = htmlTag.match(lineA)
-        contentA = matchA and matchA.group(3) or lineA
-        matchB = htmlTag.match(lineB)
-        contentB = matchB and matchB.group(3) or lineB
-        # Perform the diff at the level of words
-        diff = self.getHtmlDiff(contentA, contentB, self.nextSeps[sep])
-        if matchB:
-            res = self.computeTag(matchB, diff)
-        else:
-            res = diff
-        # Merge potential previous inner diff tags that
-        # were found (but extracted from) lineA.
+        i, ja, jb = self.getStringDiff(lineA, lineB)
+        diff = self.getHtmlDiff(lineA[i:ja], lineB[i:jb], ' ')
+        res = lineB[:i] + diff + lineB[jb:]
+        # Merge potential previous inner diff tags that were found (but
+        # extracted from) lineA.
         if previousDiffsA:
             merger = Merger(lineA, res, previousDiffsA, self)
             res = merger.merge()
@@ -468,15 +483,13 @@ class HtmlDiff:
            similar in a previous call to m_getHtmlDiff with sep=carriage
            return.'''
         res = []
-        if sep:
-            a = self.split(old, sep)
-            b = self.split(new, sep)
-        else:
-            a = old
-            b = new
+        a = self.split(old, sep)
+        b = self.split(new, sep)
         matcher = difflib.SequenceMatcher()
         matcher.set_seqs(a, b)
         for action, i1, i2, j1, j2 in matcher.get_opcodes():
+            # When sep is a space, we need to remember if we are dealing with
+            # the last diff within the line or not.
             chunkA = self.removeGarbage(a[i1:i2])
             chunkB = self.removeGarbage(b[j1:j2])
             toAdd = None
@@ -498,27 +511,34 @@ class HtmlDiff:
                     # Was a deletion, not a replacement
                     toAdd = self.getModifiedChunk(chunkA, 'delete', sep)
                 else: # At least, a true replacement
-                    toAdd = []
-                    # We know that some lines/words have been replaced from a to
-                    # b. By identifying similarities between those lines/words,
-                    # consider some as having been deleted, modified or
-                    # inserted.
+                    toAdd = ''
+                    # We know that some lines have been replaced from a to b.
+                    # By identifying similarities between those lines, consider
+                    # some as having been deleted, modified or inserted.
+                    previousAdd = None
                     for sAction, line in self.getSeqDiff(chunkA, chunkB, sep):
                         if sAction in ('insert', 'delete'):
-                            mChunk = self.getModifiedChunk(line, sAction, sep)
-                            toAdd.append(mChunk)
+                            add = self.getModifiedChunk(line, sAction, sep)
                         elif sAction == 'equal':
-                            toAdd.append(line)
+                            add = line
                         elif sAction == 'replace':
-                            toAdd.append(self.getReplacement(sep, *line))
-                    # The following line, when sep is the space (=when workin
-                    # on diffs at the word level), leads to additional spaces
-                    # being dumped into the result (ie, a space between a delete
-                    # and an insert, which was not in the initial text). We
-                    # could not find a way to avoid inserting those spaces. So
-                    # when merging diffs (see Merger.merge), we know that a
-                    # 'space' error can occur and we take it into account then.
-                    toAdd = sep.join(toAdd)
+                            add = self.getReplacement(sep, *line)
+                        # In most cases, I must prefix "add" with "sep" before
+                        # concatenating it to "toAdd" (excepted if toAdd is
+                        # still empty). But when "sep" is a space, no space
+                        # must be inserted between 2 adjacent updates, because
+                        # such a space was not in the original version.
+                        prefix = ''
+                        if toAdd:
+                            if (sep == ' ') and previousAdd and \
+                               previousAdd.endswith('</span>') and \
+                               add.startswith('<span'):
+                                pass
+                            else:
+                                prefix = sep
+                        toAdd += prefix + add
+                        if sep == ' ':
+                        previousAdd = add
             if toAdd: res.append(toAdd)
         return sep.join(res)