appy.shared: added new class appy.shared.diff.HtmlDiff for computing differences between 2 chunks of HTML code.

2011-10-10 10:02:16 +02:00 · 2011-10-10 10:02:16 +02:00 · 38260cc2a5
commit 38260cc2a5
parent f13b7ea307
1 changed files with 179 additions and 0 deletions
--- a/shared/diff.py
+++ b/shared/diff.py
@@ -0,0 +1,179 @@
+# ------------------------------------------------------------------------------
+import difflib
+
+# ------------------------------------------------------------------------------
+class HtmlDiff:
+    '''This class allows to compute differences between two versions of some
+       HTML chunk.'''
+    insertStyle = 'color: blue; cursor: help'
+    deleteStyle = 'color: red; text-decoration: line-through; cursor: help'
+
+    def __init__(self, old, new,
+                 insertMsg='Inserted text', deleteMsg='Deleted text',
+                 insertCss=None, deleteCss=None, diffRatio=0.7):
+        # p_old and p_new are strings containing chunks of HTML.
+        self.old = old.strip()
+        self.new = new.strip()
+        # Every time an "insert" or "delete" difference will be detected from
+        # p_old to p_new, the impacted chunk will be surrounded by a tag that
+        # will get, respectively, a 'title' attribute filled with p_insertMsg
+        # or p_deleteMsg. The message will give an explanation about the
+        # change (who made it and at what time, for example).
+        self.insertMsg = insertMsg
+        self.deleteMsg = deleteMsg
+        # This tag will get a CSS class p_insertCss or p_deleteCss for
+        # highlighting the change. If no class is provided, default styles will
+        # be used (see HtmlDiff.insertStyle and HtmlDiff.deleteStyle).
+        self.insertCss = insertCss
+        self.deleteCss = deleteCss
+        # The diff algorithm of this class will need to identify similarities
+        # between strings. Similarity ratios will be computed by using method
+        # difflib.SequenceMatcher.ratio (see m_isSimilar below). Strings whose
+        # comparison will produce a ratio above p_diffRatio will be considered
+        # as similar.
+        self.diffRatio = diffRatio
+
+    def getModifiedChunk(self, seq, type, sep):
+        '''p_sep.join(p_seq) is a chunk that was either inserted
+           (p_type='insert') or deleted (p_type='delete'). This method will
+           surround this part with a div or span tag that will get some CSS
+           class allowing to highlight the difference. The message is dumped
+           as is in the "title" attribute: it is not escaped here.'''
+        if sep == '\n': tag = 'div'
+        else: tag = 'span'
+        # Get, from p_type, the message and CSS class to use. We use getattr
+        # instead of building and executing statements from strings.
+        msg = getattr(self, '%sMsg' % type)
+        cssClass = getattr(self, '%sCss' % type)
+        if cssClass:
+            style = 'class="%s"' % cssClass
+        else:
+            style = 'style="%s"' % getattr(self, '%sStyle' % type)
+        return '<%s %s title="%s">%s</%s>' % (tag,style,msg,sep.join(seq),tag)
+
+    def getStringDiff(self, old, new):
+        '''Identifies the differences between strings p_old and p_new by
+           computing:
+           * i = the end index of the potential common starting part (if no
+                 common part is found, i=0);
+           * jo = the start index in p_old of the potential common ending part;
+           * jn = the start index in p_new of the potential common ending part.
+           The ending part never overlaps the starting part: i <= jo and
+           i <= jn. If no common ending part is found, jo=len(p_old) and
+           jn=len(p_new).
+        '''
+        # Compute i. Stop at the end of the shortest string: one string may be
+        # a prefix of the other one (or both may be equal).
+        i = 0
+        limit = min(len(old), len(new))
+        while (i < limit) and (old[i] == new[i]): i += 1
+        # Compute jo and jn, without going back into the common starting part
+        jo = len(old)
+        jn = len(new)
+        while (jo > i) and (jn > i) and (old[jo-1] == new[jn-1]):
+            jo -= 1
+            jn -= 1
+        return i, jo, jn
+
+    def isSimilar(self, s1, s2):
+        '''Returns True if strings p_s1 and p_s2 can be considered as similar
+           (their case-insensitive ratio is strictly above self.diffRatio).'''
+        ratio = difflib.SequenceMatcher(a=s1.lower(), b=s2.lower()).ratio()
+        return ratio > self.diffRatio
+
+    def getSeqDiff(self, seqA, seqB):
+        '''p_seqA and p_seqB are lists of strings. Here we will try to identify
+           similarities between strings from p_seqA and p_seqB, and return a
+           list of differences between p_seqA and p_seqB, where each element
+           is a tuple (action, data).
+           * If action is "delete", data is a sublist of p_seqA with lines
+             considered as not included anymore in p_seqB;
+           * If action is "replace", data is a tuple (lineA, lineB) containing
+             one line from p_seqA and one from p_seqB considered as similar;
+           * If action is "insert", data is a sublist of p_seqB with lines
+             considered as not included in p_seqA.
+        '''
+        res = []
+        k = 0 # Index, in p_seqB, of the first string not "consumed" yet
+        deleted = [] # Strings from p_seqA having no similar string in p_seqB
+        # Scan every string from p_seqA and try to find a similar one in p_seqB.
+        for i, lineA in enumerate(seqA):
+            if k == len(seqB):
+                # We have already "consumed" every string from p_seqB. Remaining
+                # strings from p_seqA must now be considered as having been
+                # deleted.
+                deleted.extend(seqA[i:])
+                break
+            for j in range(k, len(seqB)):
+                if not self.isSimilar(lineA, seqB[j]): continue
+                if deleted:
+                    # Dump first the strings flagged as deleted.
+                    res.append( ('delete', deleted) )
+                    deleted = []
+                # Strings between indices k and j in p_seqB must be considered
+                # as inserted, because no similar line exists in p_seqA.
+                if k < j:
+                    res.append( ('insert', seqB[k:j]) )
+                # Similar strings are appended in a 'replace' entry
+                res.append(('replace', (lineA, seqB[j])))
+                k = j+1
+                break
+            else:
+                # No similar string was found: add to list of deleted lines.
+                deleted.append(lineA)
+        # Dump the remaining deleted lines, and consider any "unconsumed" line
+        # from p_seqB as being inserted.
+        if deleted: res.append( ('delete', deleted) )
+        if k < len(seqB): res.append( ('insert', seqB[k:]) )
+        return res
+
+    def getHtmlDiff(self, old, new, sep):
+        '''Returns the differences between p_old and p_new. Result is a string
+           containing the comparison in HTML format. p_sep is used for turning
+           p_old and p_new into sequences: a newline for comparing lines, a
+           space for comparing the words of 2 similar lines.'''
+        res = []
+        a = old.split(sep)
+        b = new.split(sep)
+        matcher = difflib.SequenceMatcher()
+        matcher.set_seqs(a,b)
+        for action, i1, i2, j1, j2 in matcher.get_opcodes():
+            if action == 'equal':
+                toAdd = sep.join(a[i1:i2])
+            elif action == 'insert':
+                toAdd = self.getModifiedChunk(b[j1:j2], action, sep)
+            elif action == 'delete':
+                toAdd = self.getModifiedChunk(a[i1:i2], action, sep)
+            elif action == 'replace':
+                if sep == '\n':
+                    # We know that some lines have been replaced from a to b. By
+                    # identifying similarities between those lines, consider
+                    # some as having been deleted, modified or inserted. Chunks
+                    # are joined with p_sep, so lines remain separated.
+                    chunks = []
+                    for sAction, data in self.getSeqDiff(a[i1:i2], b[j1:j2]):
+                        if sAction in ('insert', 'delete'):
+                            chunks.append(
+                                self.getModifiedChunk(data, sAction, sep))
+                        elif sAction == 'replace':
+                            lineA, lineB = data
+                            # Explore differences at the *word* level between
+                            # lineA and lineB. First, to restrict annoyances
+                            # due to XHTML tags, compute the start and end
+                            # parts shared by lineA and lineB: they may
+                            # correspond to opening and closing XHTML tags.
+                            i, ja, jb = self.getStringDiff(lineA, lineB)
+                            diff = self.getHtmlDiff(lineA[i:ja],lineB[i:jb],' ')
+                            chunks.append(lineB[:i] + diff + lineB[jb:])
+                    toAdd = sep.join(chunks)
+                else:
+                    # Dump old words as deleted, then new words as inserted.
+                    toAdd = self.getModifiedChunk(a[i1:i2],'delete', sep)
+                    toAdd += self.getModifiedChunk(b[j1:j2],'insert', sep)
+            res.append(toAdd)
+        return sep.join(res)
+
+    def get(self):
+        '''Produces the result.'''
+        return self.getHtmlDiff(self.old, self.new, '\n')
+# ------------------------------------------------------------------------------