appy.shared.diff: more flesh on the XhtmlDiff class.

This commit is contained in:
Gaetan Delannay 2011-10-22 19:41:50 +02:00
parent c11378c747
commit 1ebcbb7b34

View file

@ -1,5 +1,104 @@
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
import difflib import re, difflib
# ------------------------------------------------------------------------------
# Matches an inner "insert" or "delete" span produced by a previous diff.
# Group 1 is the diff type ("insert" or "delete"), group 2 is the tag content.
innerDiff = re.compile('<span name="(insert|delete)".*?>(.*?)</span>')
# ------------------------------------------------------------------------------
class Merger:
    '''This class allows to merge 2 lines of text, each containing inserts and
       deletions.'''

    def __init__(self, lineA, lineB, previousDiffs):
        '''p_lineA is the "naked" old line, p_lineB is the new line including
           the freshly computed inner diff tags, and p_previousDiffs is the
           list of match objects (as produced by "innerDiff") representing
           the diffs that were previously found on, and extracted from,
           p_lineA. p_previousDiffs may be None or empty.'''
        # lineA comes "naked": any diff previously found on it was removed from
        # it (ie, deleted text has been completely removed, while inserted text
        # has been included, but without its surrounding tag). Info about
        # previous diffs is kept in a separate variable "previousDiffs".
        self.lineA = lineA
        # m_getNextDiff consumes this list element by element: work on a
        # private copy so that the list given by the caller is left untouched.
        self.previousDiffs = list(previousDiffs or ())
        # Differences between lineA and lineB have just been computed and are
        # included (within inner tags) in lineB. We will compute their position
        # in self.newDiffs (see below).
        self.lineB = lineB
        self.newDiffs = self.computeNewDiffs()
        # We choose to walk within self.lineB. We will keep in self.i our
        # current position within self.lineB.
        self.i = 0
        # The delta index that must be applied on previous diffs
        self.deltaPrevious = 0

    def computeNewDiffs(self):
        '''lineB may include inner "insert" and/or "delete" tags. This
           function detects them and returns the corresponding match objects,
           in their order of appearance within self.lineB.'''
        return list(innerDiff.finditer(self.lineB))

    def getNextDiff(self):
        '''During the merging process on self.lineB, what next diff to
           "consume"? An old one? A new one? Returns a tuple
           (diff, startIndex, isPrevious), or (None, None, None) if there is
           no diff anymore. The returned diff is removed from its list.'''
        # No more diff ?
        if not self.previousDiffs and not self.newDiffs:
            return None, None, None
        # No more new diff ?
        if not self.newDiffs:
            diff = self.previousDiffs.pop(0)
            return diff, diff.start() + self.deltaPrevious, True
        # No more previous diff ?
        if not self.previousDiffs:
            diff = self.newDiffs.pop(0)
            return diff, diff.start(), False
        # At least one more new and previous diff. Which one to consume?
        # Indexes of previous diffs were computed on lineA: shift them by
        # self.deltaPrevious before comparing them with indexes in lineB.
        previousDiffIndex = self.previousDiffs[0].start() + self.deltaPrevious
        newDiffIndex = self.newDiffs[0].start()
        if previousDiffIndex <= newDiffIndex:
            # Previous wins (also when both start at the same index)
            return self.previousDiffs.pop(0), previousDiffIndex, True
        # New wins
        return self.newDiffs.pop(0), newDiffIndex, False

    def merge(self):
        '''Merges self.previousDiffs into self.lineB and returns the resulting
           string.'''
        parts = []
        diff, diffStart, isPrevious = self.getNextDiff()
        while diff is not None:
            # Dump the part of lineB between self.i and diffStart
            parts.append(self.lineB[self.i:diffStart])
            self.i = diffStart
            # Dump the diff
            parts.append(diff.group(0))
            if isPrevious:
                # The content of a previous insert is present, without its
                # tag, in lineB: skip it because it was just dumped with its
                # tag. The content of a previous delete is not in lineB:
                # nothing must be skipped.
                if diff.group(1) == 'insert':
                    self.i += len(diff.group(2))
            else:
                tagLength = len(diff.group(0))
                # Update self.i
                self.i += tagLength
                # Because of this new diff, all indexes computed on lineA are
                # now wrong because we express them relative to lineB. So:
                # update self.deltaPrevious to take this into account.
                self.deltaPrevious += tagLength
                if diff.group(1) == 'delete':
                    # The indexes in lineA do not take the deleted text into
                    # account, because it wasn't deleted at this time. So
                    # remove from self.deltaPrevious the length of removed
                    # text.
                    self.deltaPrevious -= len(diff.group(2))
            # Load next diff
            diff, diffStart, isPrevious = self.getNextDiff()
        # Dump the end of self.lineB if not completely consumed
        if self.i < len(self.lineB):
            parts.append(self.lineB[self.i:])
        return ''.join(parts)
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
class HtmlDiff: class HtmlDiff:
@ -10,7 +109,8 @@ class HtmlDiff:
def __init__(self, old, new, def __init__(self, old, new,
insertMsg='Inserted text', deleteMsg='Deleted text', insertMsg='Inserted text', deleteMsg='Deleted text',
insertCss=None, deleteCss=None, diffRatio=0.7): insertCss=None, deleteCss=None, insertName='insert',
deleteName='delete', diffRatio=0.7):
# p_old and p_new are strings containing chunks of HTML. # p_old and p_new are strings containing chunks of HTML.
self.old = old.strip() self.old = old.strip()
self.new = new.strip() self.new = new.strip()
@ -26,18 +126,30 @@ class HtmlDiff:
# be used (see HtmlDiff.insertStyle and HtmlDiff.deleteStyle). # be used (see HtmlDiff.insertStyle and HtmlDiff.deleteStyle).
self.insertCss = insertCss self.insertCss = insertCss
self.deleteCss = deleteCss self.deleteCss = deleteCss
# This tag will get a "name" attribute whose content will be
# p_insertName or p_deleteName
self.insertName = insertName
self.deleteName = deleteName
# The diff algorithm of this class will need to identify similarities # The diff algorithm of this class will need to identify similarities
# between strings. Similarity ratios will be computed by using method # between strings. Similarity ratios will be computed by using method
# difflib.SequenceMatcher.ratio (see m_isSimilar below). Strings whose # difflib.SequenceMatcher.ratio (see m_isSimilar below). Strings whose
# comparison will produce a ratio above p_diffRatio will be considered # comparison will produce a ratio above p_diffRatio will be considered
# as similar. # as similar.
self.diffRatio = diffRatio self.diffRatio = diffRatio
# Some computed values
for tag in ('div', 'span'):
setattr(self, '%sInsertPrefix' % tag,
'<%s name="%s"' % (tag, self.insertName))
setattr(self, '%sDeletePrefix' % tag,
'<%s name="%s"' % (tag, self.deleteName))
def getModifiedChunk(self, seq, type, sep): def getModifiedChunk(self, seq, type, sep):
'''p_sep.join(p_seq) is a chunk that was either inserted '''p_sep.join(p_seq) (if p_seq is a list) or p_seq (if p_seq is a
(p_type='insert') or deleted (p_type='delete'). This method will string) is a chunk that was either inserted (p_type='insert') or
surround this part with a div or span tag that will get some CSS deleted (p_type='delete'). This method will surround this part with
class allowing to highlight the difference.''' a div or span tag that will get some CSS class allowing to highlight
the difference.'''
# Prepare parts of the surrounding tag.
if sep == '\n': tag = 'div' if sep == '\n': tag = 'div'
else: tag = 'span' else: tag = 'span'
exec 'msg = self.%sMsg' % type exec 'msg = self.%sMsg' % type
@ -47,7 +159,22 @@ class HtmlDiff:
else: else:
exec 'style = self.%sStyle' % type exec 'style = self.%sStyle' % type
style = 'style="%s"' % style style = 'style="%s"' % style
return '<%s %s title="%s">%s</%s>' % (tag,style,msg,sep.join(seq),tag) exec 'tagName = self.%sName' % type
# The idea is: if there are several lines, every line must be surrounded
# by a tag. this way, we know that a surrounding tag can't span several
# lines, which is a prerequisite for managing cumulative diffs.
if sep == ' ':
seq = sep.join(seq)
sep = ''
if isinstance(seq, basestring):
return '%s<%s name="%s" %s title="%s">%s</%s>%s' % \
(sep, tag, tagName, style, msg, seq, tag, sep)
else:
res = ''
for line in seq:
res += '%s<%s name="%s" %s title="%s">%s</%s>%s' % \
(sep, tag, tagName, style, msg, line, tag, sep)
return res
def getStringDiff(self, old, new): def getStringDiff(self, old, new):
'''Identifies the differences between strings p_old and p_new by '''Identifies the differences between strings p_old and p_new by
@ -62,6 +189,7 @@ class HtmlDiff:
diffFound = False diffFound = False
while not diffFound: while not diffFound:
i += 1 i += 1
if (i == len(old)) or (i == len(new)): break
if old[i] != new[i]: diffFound = True if old[i] != new[i]: diffFound = True
# Compute jo and jn # Compute jo and jn
jo = len(old) jo = len(old)
@ -84,110 +212,206 @@ class HtmlDiff:
ratio = difflib.SequenceMatcher(a=s1.lower(), b=s2.lower()).ratio() ratio = difflib.SequenceMatcher(a=s1.lower(), b=s2.lower()).ratio()
return ratio > self.diffRatio return ratio > self.diffRatio
def isEmpty(self, l):
    '''Returns True if list p_l must be considered as empty: it is None or
       has no element, or its single element is an empty string or a lone
       "\\r" character.'''
    if not l:
        return True
    # A list with a single "blank" element is considered empty, too.
    return (len(l) == 1) and (l[0] in ('', '\r'))
def getTagContent(self, line):
    '''p_line is a XHTML tag with content. This method returns what lies
       between the first ">" and the last "<" found in p_line, ie, the
       content of the tag without its start and end tags.'''
    contentStart = line.find('>') + 1
    contentEnd = line.rfind('<')
    return line[contentStart:contentEnd]
def getLineAndType(self, line):
    '''p_line is a string that may already be surrounded by an "insert" or
       "delete" div tag, or may contain inner "insert" or "delete" span tags
       coming from a previous diff. This method returns a tuple
       (type, line, innerDiffs).

       "type" is:
       * "delete" if p_line starts with the "delete" div prefix;
       * "insert" if p_line starts with the "insert" div prefix;
       * None else.

       "line" is:
       * p_line, unchanged, if type is "delete";
       * the content of the surrounding tag if type is "insert";
       * p_line where every inner "insert" tag was replaced by its content
         and every inner "delete" tag was removed, content included, if
         type is None.

       "innerDiffs" is None if type is "insert" or "delete". Else, it is the
       list of re match objects representing the inner tags that were found.
       Every match was computed on the line as it was after the previously
       found tags had already been stripped from it.
    '''
    if line.startswith(self.divDeletePrefix):
        return ('delete', line, None)
    if line.startswith(self.divInsertPrefix):
        # Return the line without the surrounding tag.
        return ('insert', self.getTagContent(line), None)
    # Strip inner tags one by one, remembering every match.
    found = []
    match = innerDiff.search(line)
    while match:
        found.append(match)
        # Keep content only for "insert" tags.
        if match.group(1) == 'insert':
            kept = match.group(2)
        else:
            kept = ''
        line = ''.join((line[:match.start()], kept, line[match.end():]))
        match = innerDiff.search(line)
    return (None, line, found)
def getSeqDiff(self, seqA, seqB):
    '''p_seqA and p_seqB are lists of strings. Here we will try to identify
       similarities between strings from p_seqA and p_seqB, and return a
       list of differences between p_seqA and p_seqB, where each element
       is a tuple (action, line).
       * If action is "delete", "line" is a line of p_seqA considered as
         not included anymore in p_seqB;
       * If action is "insert", "line" is a line of p_seqB considered as
         not included in p_seqA;
       * If action is "equal", "line" is a line of p_seqA that must be kept
         as is;
       * If action is "replace", "line" is a tuple
         (lineA, lineB, previousDiffsA) containing one line from p_seqA and
         one from p_seqB considered as similar. "previousDiffsA" contains
         potential previous inner diffs that were found (but extracted
         from, for comparison purposes) lineA.
    '''
    res = []
    # k is the index of the first line of p_seqB that was not consumed yet.
    k = 0
    totalB = len(seqB)
    # Scan every string from p_seqA and try to find a similar string in
    # p_seqB.
    for rawLineA in seqA:
        pastAction, lineSeqA, innerDiffs = self.getLineAndType(rawLineA)
        if pastAction == 'delete':
            # We will consider this line as "equal" because it already has
            # been noted as deleted in a previous diff.
            res.append(('equal', rawLineA))
            continue
        if k == totalB:
            # We have already "consumed" every string from p_seqB. Remaining
            # strings from p_seqA must be considered as deleted (or
            # sometimes equal, see above).
            if pastAction:
                # 'insert': should not happen. The inserted line should also
                # be found in seqB.
                res.append(('equal', rawLineA))
            else:
                res.append(('delete', rawLineA))
            continue
        # Try to find a line in seqB which is similar to lineSeqA.
        for j in range(k, totalB):
            candidate = seqB[j]
            if not self.isSimilar(lineSeqA, candidate):
                continue
            # Strings between indices k and j in p_seqB must be considered
            # as inserted, because no similar line exists in p_seqA.
            res.extend([('insert', line) for line in seqB[k:j]])
            # Similar strings are appended in a 'replace' entry, excepted if
            # lineSeqA is already an insert from a previous diff: in this
            # case, we keep the "old" version: the new one is the same, but
            # for which we don't remember who updated it.
            if (pastAction == 'insert') and (lineSeqA == candidate):
                res.append(('equal', rawLineA))
                # TODO: manage lineSeqA != seqB[j]
            else:
                res.append(('replace', (lineSeqA, candidate, innerDiffs)))
            k = j + 1
            break
        else:
            # No similar line was found in p_seqB.
            res.append(('delete', rawLineA))
    # Consider any "unconsumed" line from p_seqB as being inserted.
    res.extend([('insert', line) for line in seqB[k:]])
    return res
def split(self, s, sep):
    '''Splits string p_s with p_sep. If p_sep is not a newline, the split
       can't happen for leading or trailing separators, which must be
       considered as being part of the first or last word. In any case,
       p_sep.join(result) gives p_s back.'''
    # Manage sep == \n
    if sep == '\n': return s.split(sep)
    core = s.strip(sep)
    if not core:
        # p_s is empty or only made of separators: it is a single "word".
        # Splitting it would count the same separator both as leading and
        # as trailing.
        return [s]
    # Keep *all* leading and trailing separators, not only one of each, so
    # that no character from p_s is lost.
    lead = s[:len(s) - len(s.lstrip(sep))]
    trail = s[len(s.rstrip(sep)):]
    res = core.split(sep)
    res[0] = lead + res[0]
    res[-1] = res[-1] + trail
    return res
def getHtmlDiff(self, old, new, sep):
    '''Returns the differences between p_old and p_new. Result is a string
       containing the comparison in HTML format. p_sep is used for turning
       p_old and p_new into sequences. If p_sep is a newline, this method is
       used for performing a whole diff between 2 strings splitted into
       sequences of lines; if p_sep is a space, the diff is a word-by-word
       comparison within 2 lines that have been detected as similar in a
       previous call to m_getHtmlDiff with p_sep being a newline.'''
    res = []
    a = self.split(old, sep)
    b = self.split(new, sep)
    matcher = difflib.SequenceMatcher()
    matcher.set_seqs(a, b)
    for action, i1, i2, j1, j2 in matcher.get_opcodes():
        chunkA = a[i1:i2]
        chunkB = b[j1:j2]
        aIsEmpty = self.isEmpty(chunkA)
        bIsEmpty = self.isEmpty(chunkB)
        # toAdd stays None (or empty) when there is nothing to dump for
        # this opcode.
        toAdd = None
        if action == 'equal':
            if not aIsEmpty: toAdd = sep.join(chunkA)
        elif action == 'insert':
            if not bIsEmpty:
                toAdd = self.getModifiedChunk(chunkB, action, sep)
        elif action == 'delete':
            if not aIsEmpty:
                toAdd = self.getModifiedChunk(chunkA, action, sep)
        elif action == 'replace':
            if aIsEmpty and bIsEmpty:
                toAdd = ''
            elif aIsEmpty:
                # Was an addition, not a replacement
                toAdd = self.getModifiedChunk(chunkB, 'insert', sep)
            elif bIsEmpty:
                # Was a deletion, not a replacement
                toAdd = self.getModifiedChunk(chunkA, 'delete', sep)
            elif sep == '\n':
                # A true replacement (grr difflib), at the line level. We
                # know that some lines have been replaced from a to b. By
                # identifying similarities between those lines, consider
                # some as having been deleted, modified or inserted.
                parts = []
                for sAction, line in self.getSeqDiff(chunkA, chunkB):
                    if sAction in ('insert', 'delete'):
                        parts.append(self.getModifiedChunk(line, sAction,
                                                           sep))
                    elif sAction == 'equal':
                        parts.append(line)
                    elif sAction == 'replace':
                        lineA, lineB, previousDiffsA = line
                        # Investigate further here and explore differences
                        # at the *word* level between lineA and lineB. As a
                        # preamble, and in order to restrict annoyances due
                        # to the presence of XHTML tags, we will compute
                        # start and end parts which are similar between
                        # lineA and lineB: they may correspond to opening
                        # and closing XHTML tags.
                        i, ja, jb = self.getStringDiff(lineA, lineB)
                        diff = self.getHtmlDiff(lineA[i:ja], lineB[i:jb],
                                                ' ')
                        add = lineB[:i] + diff + lineB[jb:]
                        # Merge potential previous inner diff tags that
                        # were found (but extracted from) lineA. The merge
                        # must be performed on the current line only:
                        # indexes of previous diffs are relative to lineA,
                        # not to the lines already dumped for this chunk.
                        if previousDiffsA:
                            add = Merger(lineA, add, previousDiffsA).merge()
                        parts.append(add)
                toAdd = ''.join(parts)
            else:
                # A true replacement, at the word level.
                toAdd = self.getModifiedChunk(chunkA, 'delete', sep)
                toAdd += self.getModifiedChunk(chunkB, 'insert', sep)
        if toAdd: res.append(toAdd)
    return sep.join(res)
def get(self):
    '''Computes and returns the line-by-line HTML diff between self.old and
       self.new.'''
    lineSep = '\n'
    return self.getHtmlDiff(self.old, self.new, lineSep)
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------