appy.shared.diff: more work.

This commit is contained in:
Gaetan Delannay 2011-11-19 11:48:03 +01:00
parent cf992843ff
commit 8c6301b901

View file

@ -89,12 +89,6 @@ class Merger:
if not newDiff: if not newDiff:
# No more new diff. So normally, we should find what remains in # No more new diff. So normally, we should find what remains in
# oldText at self.lineB[self.i:] # oldText at self.lineB[self.i:]
if not self.lineB[self.i:].startswith(oldText):
# Anormal additional char. Probably a space? Indeed,
# word-level comparisons imply split(' ') which can be
# error-prone.
res += self.lineB[self.i]
self.i += 1
if not self.lineB[self.i:].startswith(oldText): if not self.lineB[self.i:].startswith(oldText):
raise 'Error!!!!' raise 'Error!!!!'
res += self.differ.getModifiedChunk(oldText, 'insert', '', res += self.differ.getModifiedChunk(oldText, 'insert', '',
@ -128,9 +122,6 @@ class Merger:
def merge(self): def merge(self):
'''Merges self.previousDiffs into self.lineB.''' '''Merges self.previousDiffs into self.lineB.'''
res = '' res = ''
print 'MERGE'
print 'Line A', self.lineA
print 'Line B', self.lineB
diff, diffStart, isPrevious = self.getNextDiff() diff, diffStart, isPrevious = self.getNextDiff()
while diff: while diff:
# Dump the part of lineB between self.i and diffStart # Dump the part of lineB between self.i and diffStart
@ -218,7 +209,9 @@ class HtmlDiff:
string) is a chunk that was either inserted (p_type='insert') or string) is a chunk that was either inserted (p_type='insert') or
deleted (p_type='delete'). This method will surround this part with deleted (p_type='delete'). This method will surround this part with
a div or span tag that will get some CSS class allowing to highlight a div or span tag that will get some CSS class allowing to highlight
the update. If p_msg is given, it will be used instead of the default the update.
If p_msg is given, it will be used instead of the default
p_type-related message stored on p_self.''' p_type-related message stored on p_self.'''
# Will the surrouding tag be a div or a span? # Will the surrouding tag be a div or a span?
if sep == '\n': tag = 'div' if sep == '\n': tag = 'div'
@ -233,7 +226,7 @@ class HtmlDiff:
else: else:
exec 'style = self.%sStyle' % type exec 'style = self.%sStyle' % type
style = 'style="%s"' % style style = 'style="%s"' % style
# the 'name' attribute of the tag indicates the type of the update. # The 'name' attribute of the tag indicates the type of the update.
exec 'tagName = self.%sName' % type exec 'tagName = self.%sName' % type
# The idea is: if there are several lines, every line must be surrounded # The idea is: if there are several lines, every line must be surrounded
# by a tag. This way, we know that a surrounding tag can't span several # by a tag. This way, we know that a surrounding tag can't span several
@ -421,7 +414,36 @@ class HtmlDiff:
i -= 1 i -= 1
return l return l
nextSeps = {'\n': ' ', ' ': ''} def getStringDiff(self, old, new):
'''Identifies the differences between strings p_old and p_new by
computing:
* i = the end index of the potential common starting part (if no
common part is found, i=0);
* jo = the start index in p_old of the potential common ending part;
* jn = the start index in p_new of the potential common ending part.
'''
# Compute i
i = -1
diffFound = False
while not diffFound:
i += 1
if (i == len(old)) or (i == len(new)): break
if old[i] != new[i]: diffFound = True
# Compute jo and jn
jo = len(old)
jn = len(new)
diffFound = False
while not diffFound:
if (jo == i) or (jn == i):
# We have reached the end of substring old[i:] or new[i:]
jo -=1
jn -= 1
break
jo -= 1
jn -= 1
if old[jo] != new[jn]: diffFound=True
return i, jo+1, jn+1
def getReplacement(self, sep, lineA, lineB, previousDiffsA, outerTagA): def getReplacement(self, sep, lineA, lineB, previousDiffsA, outerTagA):
'''p_lineA has been replaced with p_lineB. Here, we will investigate '''p_lineA has been replaced with p_lineB. Here, we will investigate
further here and explore differences at the *word* level between further here and explore differences at the *word* level between
@ -437,18 +459,11 @@ class HtmlDiff:
# As a preamble, and in order to restrict annoyances due to the presence # As a preamble, and in order to restrict annoyances due to the presence
# of XHTML tags, we will remove start and end tags from p_lineA and # of XHTML tags, we will remove start and end tags from p_lineA and
# p_lineB if present. # p_lineB if present.
matchA = htmlTag.match(lineA) i, ja, jb = self.getStringDiff(lineA, lineB)
contentA = matchA and matchA.group(3) or lineA diff = self.getHtmlDiff(lineA[i:ja], lineB[i:jb], ' ')
matchB = htmlTag.match(lineB) res = lineB[:i] + diff + lineB[jb:]
contentB = matchB and matchB.group(3) or lineB # Merge potential previous inner diff tags that were found (but
# Perform the diff at the level of words # extracted from) lineA.
diff = self.getHtmlDiff(contentA, contentB, self.nextSeps[sep])
if matchB:
res = self.computeTag(matchB, diff)
else:
res = diff
# Merge potential previous inner diff tags that
# were found (but extracted from) lineA.
if previousDiffsA: if previousDiffsA:
merger = Merger(lineA, res, previousDiffsA, self) merger = Merger(lineA, res, previousDiffsA, self)
res = merger.merge() res = merger.merge()
@ -468,15 +483,13 @@ class HtmlDiff:
similar in a previous call to m_getHtmlDiff with sep=carriage similar in a previous call to m_getHtmlDiff with sep=carriage
return.''' return.'''
res = [] res = []
if sep:
a = self.split(old, sep) a = self.split(old, sep)
b = self.split(new, sep) b = self.split(new, sep)
else:
a = old
b = new
matcher = difflib.SequenceMatcher() matcher = difflib.SequenceMatcher()
matcher.set_seqs(a, b) matcher.set_seqs(a, b)
for action, i1, i2, j1, j2 in matcher.get_opcodes(): for action, i1, i2, j1, j2 in matcher.get_opcodes():
# When sep is a space, we need to remember if we are dealing with
# the last diff within the line or not.
chunkA = self.removeGarbage(a[i1:i2]) chunkA = self.removeGarbage(a[i1:i2])
chunkB = self.removeGarbage(b[j1:j2]) chunkB = self.removeGarbage(b[j1:j2])
toAdd = None toAdd = None
@ -498,27 +511,34 @@ class HtmlDiff:
# Was a deletion, not a replacement # Was a deletion, not a replacement
toAdd = self.getModifiedChunk(chunkA, 'delete', sep) toAdd = self.getModifiedChunk(chunkA, 'delete', sep)
else: # At least, a true replacement else: # At least, a true replacement
toAdd = [] toAdd = ''
# We know that some lines/words have been replaced from a to # We know that some lines have been replaced from a to b.
# b. By identifying similarities between those lines/words, # By identifying similarities between those lines, consider
# consider some as having been deleted, modified or # some as having been deleted, modified or inserted.
# inserted. previousAdd = None
for sAction, line in self.getSeqDiff(chunkA, chunkB, sep): for sAction, line in self.getSeqDiff(chunkA, chunkB, sep):
if sAction in ('insert', 'delete'): if sAction in ('insert', 'delete'):
mChunk = self.getModifiedChunk(line, sAction, sep) add = self.getModifiedChunk(line, sAction, sep)
toAdd.append(mChunk)
elif sAction == 'equal': elif sAction == 'equal':
toAdd.append(line) add = line
elif sAction == 'replace': elif sAction == 'replace':
toAdd.append(self.getReplacement(sep, *line)) add = self.getReplacement(sep, *line)
# The following line, when sep is the space (=when workin # In most cases, I must prefix "add" with "sep" before
# on diffs at the word level), leads to additional spaces # concatenating it to "toAdd" (excepted if toAdd is
# being dumped into the result (ie, a space between a delete # still empty). But when "sep" is a space, no space
# and an insert, which was not in the initial text). We # must be inserted between 2 adjacent updates, because
# could not find a way to avoid inserting those spaces. So # such a space was not in the original version.
# when merging diffs (see Merger.merge), we know that a prefix = ''
# 'space' error can occur and we take it into account then. if toAdd:
toAdd = sep.join(toAdd) if (sep == ' ') and previousAdd and \
previousAdd.endswith('</span>') and \
add.startswith('<span'):
pass
else:
prefix = sep
toAdd += prefix + add
if sep == ' ':
previousAdd = add
if toAdd: res.append(toAdd) if toAdd: res.append(toAdd)
return sep.join(res) return sep.join(res)