From 38260cc2a5897a616d4fe61a7e14f365bee7264a Mon Sep 17 00:00:00 2001 From: Gaetan Delannay Date: Mon, 10 Oct 2011 10:02:16 +0200 Subject: [PATCH] appy.shared: added new class appy.shared.diff.HtmlDiff for computing differences between 2 chunks of HTML code. --- shared/diff.py | 179 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 shared/diff.py diff --git a/shared/diff.py b/shared/diff.py new file mode 100644 index 0000000..7304ec9 --- /dev/null +++ b/shared/diff.py @@ -0,0 +1,179 @@ +# ------------------------------------------------------------------------------ +import difflib + +# ------------------------------------------------------------------------------ +class HtmlDiff: + '''This class allows to compute differences between two versions of some + HTML chunk.''' + insertStyle = 'color: blue; cursor: help' + deleteStyle = 'color: red; text-decoration: line-through; cursor: help' + + def __init__(self, old, new, + insertMsg='Inserted text', deleteMsg='Deleted text', + insertCss=None, deleteCss=None, diffRatio=0.7): + # p_old and p_new are strings containing chunks of HTML. + self.old = old.strip() + self.new = new.strip() + # Every time an "insert" or "delete" difference will be detected from + # p_old to p_new, the impacted chunk will be surrounded by a tag that + # will get, respectively, a 'title' attribute filled p_insertMsg or + # p_deleteMsg. The message will give an explanation about the change + # (who made it and at what time, for example). + self.insertMsg = insertMsg + self.deleteMsg = deleteMsg + # This tag will get a CSS class p_insertCss or p_deleteCss for + # highlighting the change. If no class is provided, default styles will + # be used (see HtmlDiff.insertStyle and HtmlDiff.deleteStyle). + self.insertCss = insertCss + self.deleteCss = deleteCss + # The diff algorithm of this class will need to identify similarities + # between strings. Similarity ratios will be computed by using method + # difflib.SequenceMatcher.ratio (see m_isSimilar below). Strings whose + # comparison will produce a ratio above p_diffRatio will be considered + # as similar. + self.diffRatio = diffRatio + + def getModifiedChunk(self, seq, type, sep): + '''p_sep.join(p_seq) is a chunk that was either inserted + (p_type='insert') or deleted (p_type='delete'). This method will + surround this part with a div or span tag that will get some CSS + class allowing to highlight the difference.''' + if sep == '\n': tag = 'div' + else: tag = 'span' + exec 'msg = self.%sMsg' % type + exec 'cssClass = self.%sCss' % type + if cssClass: + style = 'class="%s"' % cssClass + else: + exec 'style = self.%sStyle' % type + style = 'style="%s"' % style + return '<%s %s title="%s">%s' % (tag,style,msg,sep.join(seq),tag) + + def getStringDiff(self, old, new): + '''Identifies the differences between strings p_old and p_new by + computing: + * i = the end index of the potential common starting part (if no + common part is found, i=0); + * jo = the start index in p_old of the potential common ending part; + * jn = the start index in p_new of the potential common ending part. + ''' + # Compute i + i = -1 + diffFound = False + while not diffFound: + i += 1 + if old[i] != new[i]: diffFound = True + # Compute jo and jn + jo = len(old) + jn = len(new) + diffFound = False + while not diffFound: + jo -= 1 + jn -= 1 + if old[jo] != new[jn]: diffFound=True + return i, jo+1, jn+1 + + def isSimilar(self, s1, s2): + '''Returns True if strings p_s1 and p_s2 can be considered as + similar.''' + ratio = difflib.SequenceMatcher(a=s1.lower(), b=s2.lower()).ratio() + return ratio > self.diffRatio + + def getSeqDiff(self, seqA, seqB): + '''p_seqA and p_seqB are lists of strings. Here we will try to identify + similarities between strings from p_seqA and p_seqB, and return a + list of differences between p_seqA and p_seqB, where each element + is a tuple (action, data). + * If p_action is "delete", data is a sublist of p_seqA with lines + considered as not included anymore in p_seqB; + * If p_action is "replace", data is a tuple (lineA, lineB) containing + one line from p_seqA and one from p_seqB considered as similar; + * If p_action is "insert", data is a sublist of p_seqB with lines + considered as not included in p_seqA. + ''' + res = [] + i = j = k = 0 + deleted = [] + # Scan every string from p_seqA and try to find a similar string in + # p_seqB. + while i < len(seqA): + if k == len(seqB): + # We have already "consumed" every string from p_seqB. Remaining + # strings from p_seqA must now be considered has having been + # deleted. + if deleted: res.append( ('delete', deleted) ) + res.append( ('delete', seqA[i:]) ) + break + similarFound = False + for j in range(k, len(seqB)): + if self.isSimilar(seqA[i], seqB[j]): + similarFound = True + if deleted: + # Dump first the strings flagged as deleted. + res.append( ('delete', deleted) ) + deleted = [] + # Strings between indices k and j in p_seqB must be + # considered as inserted, because no similar line exists + # in p_seqA. + if k < j: + res.append( ('insert', seqB[k:j]) ) + # Similar strings are appended in a 'replace' entry + res.append(('replace', (seqA[i], seqB[j]))) + k = j+1 + break + if not similarFound: + # Add to list of deleted lines. + deleted.append(seqA[i]) + i += 1 + # Consider any "unconsumed" line from p_seqB as being inserted. + if deleted: res.append( ('delete', deleted) ) + if k < len(seqB): res.append( ('insert', seqB[k:]) ) + return res + + def getHtmlDiff(self, old, new, sep): + '''Returns the differences between p_old and p_new. Result is a string + containing the comparison in HTML format. p_sep is used for turning + p_old and p_new into sequences.''' + res = [] + a = old.split(sep) + b = new.split(sep) + matcher = difflib.SequenceMatcher() + matcher.set_seqs(a,b) + for action, i1, i2, j1, j2 in matcher.get_opcodes(): + if action == 'equal': + toAdd = sep.join(a[i1:i2]) + elif action == 'insert': + toAdd = self.getModifiedChunk(b[j1:j2], action, sep) + elif action == 'delete': + toAdd = self.getModifiedChunk(a[i1:i2], action, sep) + elif action == 'replace': + if sep == '\n': + # We know that some lines have been replaced from a to b. By + # identifying similarities between those lines, consider + # some as having been deleted, modified or inserted. + toAdd = '' + for sAction, data in self.getSeqDiff(a[i1:i2], b[j1:j2]): + if sAction in ('insert', 'delete'): + toAdd += self.getModifiedChunk(data, sAction, sep) + elif sAction == 'replace': + lineA, lineB = data + # Investigate further here and explore differences + # at the *word* level between lineA and lineB. As a + # preamble, and in order to restrict annoyances due + # to the presence of XHTML tags, we will compute + # start and end parts wich are similar between lineA + # and lineB: they may correspond to opening and + # closing XHTML tags. + i, ja, jb = self.getStringDiff(lineA, lineB) + diff = self.getHtmlDiff(lineA[i:ja],lineB[i:jb],' ') + toAdd += lineB[:i] + diff + lineB[jb:] + else: + toAdd = self.getModifiedChunk(a[i1:i2],'delete', sep) + toAdd += self.getModifiedChunk(b[j1:j2],'insert', sep) + res.append(toAdd) + return sep.join(res) + + def get(self): + '''Produces the result.''' + return self.getHtmlDiff(self.old, self.new, '\n') +# ------------------------------------------------------------------------------