appy.shared.diff: more work.

This commit is contained in:
Gaetan Delannay 2011-11-19 23:53:38 +01:00
parent 8c6301b901
commit 10398e770a

View file

@ -86,15 +86,18 @@ class Merger:
while oldText: while oldText:
# Get the overlapping (new) diff. # Get the overlapping (new) diff.
newDiff, newDiffStart, isPrevious = self.getNextDiff() newDiff, newDiffStart, isPrevious = self.getNextDiff()
if not newDiff: if not newDiff or (newDiffStart >= (self.i + len(oldText))):
# No more new diff. So normally, we should find what remains in # No more new diff, or a new diff but far away, not within
# oldText at self.lineB[self.i:] # oldText. So insert new the rest of p_oldText.
if not self.lineB[self.i:].startswith(oldText): # Invariant: at this point, we should find what remains in
raise 'Error!!!!' # oldText at self.lineB[self.i:].
res += self.differ.getModifiedChunk(oldText, 'insert', '', res += self.differ.getModifiedChunk(oldText, 'insert', '',
msg=oldDiff.group(2)) msg=oldDiff.group(2))
self.i += len(oldText) self.i += len(oldText)
oldText = '' oldText = ''
# If we have "popped" a new diff, dump it anyway.
if newDiff:
res = self.dumpDiff(res, newDiff, newDiffStart, isPrevious)
break break
# Dump the part of the old text that has been untouched by the new # Dump the part of the old text that has been untouched by the new
# diff. # diff.
@ -119,47 +122,54 @@ class Merger:
self.deltaPrevious -= len(newDiff.group(3)) self.deltaPrevious -= len(newDiff.group(3))
return res return res
def dumpDiff(self, res, diff, diffStart, isPrevious):
    '''Dumps the next p_diff (starting at p_diffStart) into p_res and
       returns the updated p_res. If p_isPrevious is True, the diff is an
       old one (from self.lineA); else, it is a new one (from self.lineB).

       p_diff is a regex match object: group(0) is the complete diff tag,
       group(1) its type ('insert' or 'delete') and group(3) the text it
       wraps. As a side effect, self.i is moved forward within self.lineB
       and, for a new diff, self.deltaPrevious is updated.'''
    # Dump the part of lineB between self.i and diffStart
    res += self.lineB[self.i:diffStart]
    self.i = diffStart
    diffType = diff.group(1)
    if isPrevious:
        # Dump the old diff (from self.lineA)
        if diffType == 'insert':
            inserted = diff.group(3)
            # Check if the inserted text is still present in lineB
            if self.lineB[self.i:].startswith(inserted):
                # Yes. Dump the diff and go ahead within lineB: only the
                # inserted text itself (not the tag) is present in lineB.
                res += diff.group(0)
                self.i += len(inserted)
            else:
                # The inserted text can't be found as is in lineB.
                # Must have been (partly) re-edited or removed.
                res += self.manageOverlap(diff)
        elif diffType == 'delete':
            # A deleted text is not in lineB: self.i does not move.
            res += diff.group(0)
    else:
        # Dump the new diff (from self.lineB)
        wholeDiff = diff.group(0)
        res += wholeDiff
        # Move forward within self.lineB
        self.i += len(wholeDiff)
        # Because of this new diff, all indexes computed on lineA are
        # now wrong because we express them relative to lineB. So:
        # update self.deltaPrevious to take this into account.
        self.deltaPrevious += len(wholeDiff)
        if diffType == 'delete':
            # The indexes in self.lineA do not take the deleted text into
            # account, because it wasn't deleted at this time. So remove
            # from self.deltaPrevious the length of removed text.
            self.deltaPrevious -= len(diff.group(3))
    return res
def merge(self):
    '''Merges self.previousDiffs into self.lineB and returns the resulting
       string.'''
    merged = ''
    # Dump every diff, one after the other, in the order m_getNextDiff
    # delivers them, until it has no more diff to give.
    while True:
        diff, diffStart, isPrevious = self.getNextDiff()
        if not diff: break
        merged = self.dumpDiff(merged, diff, diffStart, isPrevious)
    # Dump the end of self.lineB if not completely consumed
    if self.i < len(self.lineB):
        merged += self.lineB[self.i:]
    return merged
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
@ -405,12 +415,14 @@ class HtmlDiff:
return res return res
garbage = ('', '\r') garbage = ('', '\r')
def removeGarbage(self, l, sep):
    '''Removes from list p_l elements that have no interest, like blank
       strings or considered as is (see self.garbage). Also: strips the
       remaining elements when they are lines (ie, if p_sep is a carriage
       return). p_l is modified in place and returned.'''
    stripLines = (sep == '\n')
    # Walk backwards so that deleting an element does not shift the
    # indexes of the elements that remain to be visited.
    i = len(l) - 1
    while i >= 0:
        if l[i] in self.garbage:
            del l[i]
        elif stripLines:
            # Only strip an element that was kept: once l[i] has been
            # deleted, index i designates another element, or nothing at
            # all if the deleted element was the last one.
            l[i] = l[i].strip()
        i -= 1
    return l
@ -444,33 +456,64 @@ class HtmlDiff:
if old[jo] != new[jn]: diffFound=True if old[jo] != new[jn]: diffFound=True
return i, jo+1, jn+1 return i, jo+1, jn+1
def getDumpPrefix(self, res, add, previousAdd, sep):
    '''In most cases, when concatenating the next diff (p_add) to the
       global result (p_res), it must be prefixed with p_sep (excepted if
       p_res is still empty). But when p_sep is a space, no space must be
       inserted between 2 adjacent updates (p_add and p_previousAdd),
       because such a space was not in the original version. This method
       computes and returns the prefix, that can thus be empty.'''
    # Nothing has been dumped yet: the first element gets no separator.
    if not res: return ''
    # 2 updates are adjacent when the previous element ends with a closing
    # span tag and the next one starts with an opening span tag.
    if (sep == ' ') and previousAdd and \
       previousAdd.endswith('</span>') and add.startswith('<span'):
        return ''
    return sep
def getReplacement(self, chunkA, chunkB, sep):
    '''p_chunkA has been replaced with p_chunkB. Computes this update and
       returns it as a single string.'''
    res = ''
    # We know that some lines have been replaced from chunkA to chunkB. By
    # identifying similarities between those lines, m_getSeqDiff considers
    # some of them as having been deleted, modified or inserted.
    previousAdd = None
    for action, line in self.getSeqDiff(chunkA, chunkB, sep):
        if action in ('insert', 'delete'):
            add = self.getModifiedChunk(line, action, sep)
        elif action == 'equal':
            add = line
        elif action == 'replace':
            # lineA has been replaced with lineB: explore the differences
            # at the *word* level between them. previousDiffsA may contain
            # a series of updates (inserts, deletions) that have already
            # been performed on lineA. If lineA was a previously inserted
            # line, it comes without its outer tag, that lies in outerTagA
            # (as a re.MatchObject instance computed from regex htmlTag).
            lineA, lineB, previousDiffsA, outerTagA = line
            # Restrict the word-level comparison to the parts delimited by
            # m_getStringDiff; what lies before index i and after index jb
            # in lineB is kept as is (presumably the start and end XHTML
            # tags shared by both lines - confirm in m_getStringDiff).
            i, ja, jb = self.getStringDiff(lineA, lineB)
            wordDiff = self.getHtmlDiff(lineA[i:ja], lineB[i:jb], ' ')
            add = lineB[:i] + wordDiff + lineB[jb:]
            # Merge potential previous inner diff tags that were found
            # (but extracted from) lineA.
            if previousDiffsA:
                add = Merger(lineA, add, previousDiffsA, self).merge()
            # Rewrap the line into outerTagA if lineA was a line tagged as
            # previously inserted.
            if outerTagA:
                add = self.computeTag(outerTagA, add)
        else:
            # Any other action dumps nothing
            add = None
        if add:
            res += self.getDumpPrefix(res, add, previousAdd, sep) + add
        previousAdd = add
    return res
def getHtmlDiff(self, old, new, sep): def getHtmlDiff(self, old, new, sep):
@ -482,65 +525,40 @@ class HtmlDiff:
word-by-word comparison within 2 lines that have been detected as word-by-word comparison within 2 lines that have been detected as
similar in a previous call to m_getHtmlDiff with sep=carriage similar in a previous call to m_getHtmlDiff with sep=carriage
return.''' return.'''
res = [] res = ''
a = self.split(old, sep) a = self.split(old, sep)
b = self.split(new, sep) b = self.split(new, sep)
matcher = difflib.SequenceMatcher() matcher = difflib.SequenceMatcher()
matcher.set_seqs(a, b) matcher.set_seqs(a, b)
previousAdd = None
for action, i1, i2, j1, j2 in matcher.get_opcodes(): for action, i1, i2, j1, j2 in matcher.get_opcodes():
add = None
# When sep is a space, we need to remember if we are dealing with # When sep is a space, we need to remember if we are dealing with
# the last diff within the line or not. # the last diff within the line or not.
chunkA = self.removeGarbage(a[i1:i2]) chunkA = self.removeGarbage(a[i1:i2], sep)
chunkB = self.removeGarbage(b[j1:j2]) chunkB = self.removeGarbage(b[j1:j2], sep)
toAdd = None
if action == 'equal': if action == 'equal':
if chunkA: toAdd = sep.join(chunkA) if chunkA: add = sep.join(chunkA)
elif action == 'insert': elif action == 'insert':
if chunkB: if chunkB:
toAdd = self.getModifiedChunk(chunkB, action, sep) add = self.getModifiedChunk(chunkB, action, sep)
elif action == 'delete': elif action == 'delete':
if chunkA: if chunkA:
toAdd = self.getModifiedChunk(chunkA, action, sep) add = self.getModifiedChunk(chunkA, action, sep)
elif action == 'replace': elif action == 'replace':
if not chunkA and not chunkB: if not chunkA and not chunkB:
toAdd = '' pass
elif not chunkA: elif not chunkA:
# Was an addition, not a replacement # Was an addition, not a replacement
toAdd = self.getModifiedChunk(chunkB, 'insert', sep) add = self.getModifiedChunk(chunkB, 'insert', sep)
elif not chunkB: elif not chunkB:
# Was a deletion, not a replacement # Was a deletion, not a replacement
toAdd = self.getModifiedChunk(chunkA, 'delete', sep) add = self.getModifiedChunk(chunkA, 'delete', sep)
else: # At least, a true replacement else: # At least, a true replacement
toAdd = '' add = self.getReplacement(chunkA, chunkB, sep)
# We know that some lines have been replaced from a to b. if add: res += self.getDumpPrefix(res, add, previousAdd, sep) + add
# By identifying similarities between those lines, consider previousAdd = add
# some as having been deleted, modified or inserted. return res
previousAdd = None
for sAction, line in self.getSeqDiff(chunkA, chunkB, sep):
if sAction in ('insert', 'delete'):
add = self.getModifiedChunk(line, sAction, sep)
elif sAction == 'equal':
add = line
elif sAction == 'replace':
add = self.getReplacement(sep, *line)
# In most cases, I must prefix "add" with "sep" before
# concatenating it to "toAdd" (excepted if toAdd is
# still empty). But when "sep" is a space, no space
# must be inserted between 2 adjacent updates, because
# such a space was not in the original version.
prefix = ''
if toAdd:
if (sep == ' ') and previousAdd and \
previousAdd.endswith('</span>') and \
add.startswith('<span'):
pass
else:
prefix = sep
toAdd += prefix + add
if sep == ' ':
previousAdd = add
if toAdd: res.append(toAdd)
return sep.join(res)
def get(self): def get(self):
'''Produces the result.''' '''Produces the result.'''