appy.diff: bugfix (avoid infinite loop between m_getHtmlDiff and m_getReplacement) and better recovery when the Merger fails to manage overlaps (the whole line is then considered as having been deleted and replaced by something completely different, which is not really the case, but at least it shows a simplified diff instead of crashing). appy.shared: bugfix in the XhtmlCleaner, which now returns its result as a str and not a unicode.

This commit is contained in:
Gaetan Delannay 2012-05-15 23:13:30 +02:00
parent 028040351c
commit 36257b1b3a
2 changed files with 25 additions and 5 deletions

View file

@ -10,6 +10,10 @@ htmlTag = re.compile('<(?P<tag>\w+)( .*?)?>(.*)</(?P=tag)>')
class Merger: class Merger:
'''This class allows to merge 2 lines of text, each containing inserts and '''This class allows to merge 2 lines of text, each containing inserts and
deletions.''' deletions.'''
# Exception that may be raised by this class if the merge fails.
class MergeError(Exception): pass
def __init__(self, lineA, lineB, previousDiffs, differ): def __init__(self, lineA, lineB, previousDiffs, differ):
# lineA comes "naked": any diff previously found on it was removed from # lineA comes "naked": any diff previously found on it was removed from
# it (ie, deleted text has been completely removed, while inserted text # it (ie, deleted text has been completely removed, while inserted text
@ -148,7 +152,8 @@ class Merger:
# Invariant: at this point, we should find what remains in # Invariant: at this point, we should find what remains in
# oldText at self.lineB[self.i:]. # oldText at self.lineB[self.i:].
if not self.lineB[self.i:].startswith(oldText): if not self.lineB[self.i:].startswith(oldText):
raise 'Error!!!!' raise self.MergeError('An error occurred while computing ' \
'overlapping diffs.')
res += self.differ.getModifiedChunk(oldText, 'insert', '', res += self.differ.getModifiedChunk(oldText, 'insert', '',
msg=oldDiff.group(2)) msg=oldDiff.group(2))
self.i += len(oldText) self.i += len(oldText)
@ -584,8 +589,15 @@ class HtmlDiff:
# Merge potential previous inner diff tags that were found (but # Merge potential previous inner diff tags that were found (but
# extracted from) lineA. # extracted from) lineA.
if previousDiffsA: if previousDiffsA:
merger = Merger(lineA, add, previousDiffsA, self) try:
add = merger.merge() merger = Merger(lineA, add, previousDiffsA, self)
add = merger.merge()
except Merger.MergeError, e:
# The merge algorithm has failed. Simplify and
# consider lineA as having been completely deleted and
# lineB as having been completely inserted.
add = self.getModifiedChunk(lineA, 'delete', sep) + \
self.getModifiedChunk(lineB, 'insert', sep)
# Rewrap line into outerTagA if lineA was a line tagged as # Rewrap line into outerTagA if lineA was a line tagged as
# previously inserted. # previously inserted.
if outerTagA: if outerTagA:
@ -633,7 +645,15 @@ class HtmlDiff:
# Was a deletion, not a replacement # Was a deletion, not a replacement
add = self.getModifiedChunk(chunkA, 'delete', sep) add = self.getModifiedChunk(chunkA, 'delete', sep)
else: # At least, a true replacement else: # At least, a true replacement
add = self.getReplacement(chunkA, chunkB, sep) if (sep == ' ') and (sep not in chunkA) and \
(sep not in chunkB):
# By going here, we avoid infinite loops that may occur
# between m_getHtmlDiff and m_getReplacement
# (called below).
add = self.getModifiedChunk(chunkA, 'delete', sep) + \
self.getModifiedChunk(chunkB, 'insert', sep)
else:
add = self.getReplacement(chunkA, chunkB, sep)
if add: res += self.getDumpPrefix(res, add, previousAdd, sep) + add if add: res += self.getDumpPrefix(res, add, previousAdd, sep) + add
previousAdd = add previousAdd = add
return res return res

View file

@ -934,7 +934,7 @@ class XhtmlCleaner(XmlParser):
# 'ignoreContent' is True if, within the currently ignored tag, we must # 'ignoreContent' is True if, within the currently ignored tag, we must
# also ignore its content. # also ignore its content.
self.env.ignoreContent = False self.env.ignoreContent = False
return self.parse('<x>%s</x>' % s) return self.parse('<x>%s</x>' % s).encode('utf-8')
def startDocument(self): def startDocument(self):
# The result will be cleaned XHTML, joined from self.res. # The result will be cleaned XHTML, joined from self.res.