appy.pod: bugfix while generating tracebacks within odt results (unicode-related); appy.shared.diff: first draft of a fully functional version; appy.shared.utils: improved functions normalizeString and formatNumber.

This commit is contained in:
Gaetan Delannay 2011-11-17 12:41:06 +01:00
parent 8e1760842e
commit 2ec05939fe
3 changed files with 137 additions and 124 deletions

View file

@ -45,7 +45,10 @@ class PodError(Exception):
i += 1 i += 1
if i > linesToRemove: if i > linesToRemove:
buffer.write('<%s:p>' % textNs) buffer.write('<%s:p>' % textNs)
try:
buffer.dumpContent(tLine) buffer.dumpContent(tLine)
except UnicodeDecodeError, ude:
buffer.dumpContent(tLine.decode('utf-8'))
buffer.write('</%s:p>' % textNs) buffer.write('</%s:p>' % textNs)
dumpTraceback = staticmethod(dumpTraceback) dumpTraceback = staticmethod(dumpTraceback)
def dump(buffer, message, withinElement=None, removeFirstLine=False, dumpTb=True): def dump(buffer, message, withinElement=None, removeFirstLine=False, dumpTb=True):

View file

@ -4,6 +4,7 @@ import re, difflib
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
innerDiff = re.compile('<span name="(insert|delete)".*? title="(.*?)">' \ innerDiff = re.compile('<span name="(insert|delete)".*? title="(.*?)">' \
'(.*?)</span>') '(.*?)</span>')
# Matches a single XHTML element: group "tag" (1) is the tag name, group 2 the
# attributes (leading space included, None if absent) and group 3 the content.
# A raw string is used so that "\w" reaches the regex engine untouched.
htmlTag = re.compile(r'<(?P<tag>\w+)( .*?)?>(.*)</(?P=tag)>')
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
class Merger: class Merger:
@ -28,9 +29,6 @@ class Merger:
self.deltaPrevious = 0 self.deltaPrevious = 0
# A link to the caller HtmlDiff class. # A link to the caller HtmlDiff class.
self.differ = differ self.differ = differ
# While "consuming" diffs (see m_getNextDiff), keep here every message
# from every diff.
self.messages = [self.differ.complexMsg]
def computeNewDiffs(self): def computeNewDiffs(self):
'''lineB may include inner "insert" and/or tags. This function '''lineB may include inner "insert" and/or tags. This function
@ -74,49 +72,73 @@ class Merger:
del self.newDiffs[0] del self.newDiffs[0]
return newDiff, newDiffIndex, False return newDiff, newDiffIndex, False
def manageOverlaps(self): def manageOverlap(self, oldDiff):
'''We have detected that changes between lineA and lineB include '''p_oldDiff is a previously inserted text from self.lineA. This text
overlapping inserts and deletions. Our solution: to remember names is not found anymore at the start of self.lineB[self.i:]: it means
of editors and return the whole line in a distinct colour, where that an overlapping diff exists among new diffs. We will manage this
we (unfortunately) can't distinguish editors's specific updates.''' by identifying several, cutted, "insert" and/or "edit" zones.'''
# First, get a "naked" version of self.lineB, without the latest # The idea here is to "consume" the old inserted text until we have
# updates. # found, within the new diff, all updates that have been performed on
res = self.lineB # this old text. Then, we will have found the complete "zone" that was
for diff in self.newDiffs: # impacted by both old and new diffs.
res = self.differ.applyDiff(res, diff) oldText = oldDiff.group(3)
# Construct the message explaining the series of updates. res = ''
# self.messages already contains messages from the "consumed" diffs while oldText:
# (see m_getNextDiff). # Get the overlapping (new) diff.
for type in ('previous', 'new'): newDiff, newDiffStart, isPrevious = self.getNextDiff()
exec 'diffs = self.%sDiffs' % type if not newDiff:
for diff in diffs: res += self.differ.getModifiedChunk(oldText, 'insert', '',
self.messages.append(diff.group(2)) msg=oldDiff.group(2))
msg = ' -=- '.join(self.messages) self.i += len(oldText)
return self.differ.getModifiedChunk(res, 'complex', '\n', msg=msg) oldText = ''
break
# Dump the part of the old text that has been untouched by the new
# diff.
if self.i < newDiffStart:
untouched = self.lineB[self.i:newDiffStart]
res += self.differ.getModifiedChunk(untouched, 'insert', '',
msg=oldDiff.group(2))
self.i = newDiffStart
oldText = oldText[len(untouched):]
# Manage the new diff
res += newDiff.group(0)
self.i += len(newDiff.group(0))
self.deltaPrevious += len(newDiff.group(0))
if newDiff.group(1) == 'delete':
# Consume oldText, that was deleted, at least partly, by
# this diff.
if len(newDiff.group(3)) >= len(oldText):
# We have consumed oldText in its entirety
oldText = ''
else:
oldText = oldText[len(newDiff.group(3)):]
self.deltaPrevious -= len(newDiff.group(3))
return res
def merge(self): def merge(self):
'''Merges self.previousDiffs into self.lineB.''' '''Merges self.previousDiffs into self.lineB.'''
res = '' res = ''
diff, diffStart, isPrevious = self.getNextDiff() diff, diffStart, isPrevious = self.getNextDiff()
if diff: self.messages.append(diff.group(2))
while diff: while diff:
# Dump the part of lineB between self.i and diffStart # Dump the part of lineB between self.i and diffStart
res += self.lineB[self.i:diffStart] res += self.lineB[self.i:diffStart]
self.i = diffStart self.i = diffStart
# Dump the diff
res += diff.group(0)
if isPrevious: if isPrevious:
if diff.group(1) == 'insert': if diff.group(1) == 'insert':
# Check if the inserted text is still present in lineB # Check if the inserted text is still present in lineB
if self.lineB[self.i:].startswith(diff.group(3)): if self.lineB[self.i:].startswith(diff.group(3)):
# Yes. Go ahead within lineB # Yes. Dump the diff and go ahead within lineB
res += diff.group(0)
self.i += len(diff.group(3)) self.i += len(diff.group(3))
else: else:
# The inserted text can't be found as is in lineB. # The inserted text can't be found as is in lineB.
# Must have been (partly) re-edited or removed. # Must have been (partly) re-edited or removed.
return self.manageOverlaps()
overlap = self.manageOverlap(diff)
res += overlap
else: else:
# Update self.i # Dump the diff and update self.i
res += diff.group(0)
self.i += len(diff.group(0)) self.i += len(diff.group(0))
# Because of this new diff, all indexes computed on lineA are # Because of this new diff, all indexes computed on lineA are
# now wrong because we express them relative to lineB. So: # now wrong because we express them relative to lineB. So:
@ -129,7 +151,6 @@ class Merger:
self.deltaPrevious -= len(diff.group(3)) self.deltaPrevious -= len(diff.group(3))
# Load next diff # Load next diff
diff, diffStart, isPrevious = self.getNextDiff() diff, diffStart, isPrevious = self.getNextDiff()
if diff: self.messages.append(diff.group(2))
# Dump the end of self.lineB if not completely consumed # Dump the end of self.lineB if not completely consumed
if self.i < len(self.lineB): if self.i < len(self.lineB):
res += self.lineB[self.i:] res += self.lineB[self.i:]
@ -141,14 +162,11 @@ class HtmlDiff:
HTML chunk.''' HTML chunk.'''
insertStyle = 'color: blue; cursor: help' insertStyle = 'color: blue; cursor: help'
deleteStyle = 'color: red; text-decoration: line-through; cursor: help' deleteStyle = 'color: red; text-decoration: line-through; cursor: help'
complexStyle = 'color: purple; cursor: help'
def __init__(self, old, new, def __init__(self, old, new,
insertMsg='Inserted text', deleteMsg='Deleted text', insertMsg='Inserted text', deleteMsg='Deleted text',
complexMsg='Multiple inserts and/or deletions', insertCss=None, deleteCss=None, insertName='insert',
insertCss=None, deleteCss=None, complexCss=None, deleteName='delete', diffRatio=0.7):
insertName='insert', deleteName='delete',
complexName='complex', diffRatio=0.7):
# p_old and p_new are strings containing chunks of HTML. # p_old and p_new are strings containing chunks of HTML.
self.old = old.strip() self.old = old.strip()
self.new = new.strip() self.new = new.strip()
@ -159,18 +177,15 @@ class HtmlDiff:
# (who made it and at what time, for example). # (who made it and at what time, for example).
self.insertMsg = insertMsg self.insertMsg = insertMsg
self.deleteMsg = deleteMsg self.deleteMsg = deleteMsg
self.complexMsg = complexMsg
# This tag will get a CSS class p_insertCss or p_deleteCss for # This tag will get a CSS class p_insertCss or p_deleteCss for
# highlighting the change. If no class is provided, default styles will # highlighting the change. If no class is provided, default styles will
# be used (see HtmlDiff.insertStyle and HtmlDiff.deleteStyle). # be used (see HtmlDiff.insertStyle and HtmlDiff.deleteStyle).
self.insertCss = insertCss self.insertCss = insertCss
self.deleteCss = deleteCss self.deleteCss = deleteCss
self.complexCss = complexCss
# This tag will get a "name" attribute whose content will be # This tag will get a "name" attribute whose content will be
# p_insertName or p_deleteName # p_insertName or p_deleteName
self.insertName = insertName self.insertName = insertName
self.deleteName = deleteName self.deleteName = deleteName
self.complexName = complexName
# The diff algorithm of this class will need to identify similarities # The diff algorithm of this class will need to identify similarities
# between strings. Similarity ratios will be computed by using method # between strings. Similarity ratios will be computed by using method
# difflib.SequenceMatcher.ratio (see m_isSimilar below). Strings whose # difflib.SequenceMatcher.ratio (see m_isSimilar below). Strings whose
@ -179,19 +194,17 @@ class HtmlDiff:
self.diffRatio = diffRatio self.diffRatio = diffRatio
# Some computed values # Some computed values
for tag in ('div', 'span'): for tag in ('div', 'span'):
for type in ('insert', 'delete', 'complex'): for type in ('insert', 'delete'):
setattr(self, '%s%sPrefix' % (tag, type.capitalize()), setattr(self, '%s%sPrefix' % (tag, type.capitalize()),
'<%s name="%s"' % (tag, getattr(self, '%sName' % type))) '<%s name="%s"' % (tag, getattr(self, '%sName' % type)))
def getModifiedChunk(self, seq, type, sep, msg=None): def getModifiedChunk(self, seq, type, sep, msg=None):
'''p_sep.join(p_seq) (if p_seq is a list) or p_seq (if p_seq is a '''p_sep.join(p_seq) (if p_seq is a list) or p_seq (if p_seq is a
string) is a chunk that was either inserted (p_type='insert') or string) is a chunk that was either inserted (p_type='insert') or
deleted (p_type='delete'). It can also be a complex, partially deleted (p_type='delete'). This method will surround this part with
managed combination of inserts/deletions (p_type='insert'). a div or span tag that will get some CSS class allowing to highlight
This method will surround this part with a div or span tag that will the update. If p_msg is given, it will be used instead of the default
get some CSS class allowing to highlight the update. If p_msg is p_type-related message stored on p_self.'''
given, it will be used instead of the default p_type-related message
stored on p_self.'''
# Will the surrouding tag be a div or a span? # Will the surrouding tag be a div or a span?
if sep == '\n': tag = 'div' if sep == '\n': tag = 'div'
else: tag = 'span' else: tag = 'span'
@ -224,58 +237,21 @@ class HtmlDiff:
return res return res
def applyDiff(self, line, diff): def applyDiff(self, line, diff):
'''p_diff is a regex containing an insert or delete that was found within '''p_diff is a regex containing an insert or delete that was found
line. This function applies the diff, removing or inserting the diff within line. This function applies the diff, removing or inserting
into p_line.''' the diff into p_line.'''
# Keep content only for "insert" tags. # Keep content only for "insert" tags.
content = '' content = ''
if diff.group(1) == 'insert': if diff.group(1) == 'insert':
content = diff.group(3) content = diff.group(3)
return line[:diff.start()] + content + line[diff.end():] return line[:diff.start()] + content + line[diff.end():]
def getStringDiff(self, old, new):
    '''Identifies the differences between strings p_old and p_new by
       computing a tuple (i, jo, jn):
       * i = the end index of the potential common starting part (if no
         common part is found, i=0);
       * jo = the start index in p_old of the potential common ending part;
       * jn = the start index in p_new of the potential common ending part.
       The common ending part never overlaps the common starting part: jo
       and jn are always >= i.'''
    # Compute i: walk forward while both strings agree.
    shortest = min(len(old), len(new))
    i = 0
    while (i < shortest) and (old[i] == new[i]):
        i += 1
    # Compute jo and jn: walk backward from both ends while the strings
    # agree, without going back into the common starting part.
    jo = len(old)
    jn = len(new)
    while (jo > i) and (jn > i) and (old[jo-1] == new[jn-1]):
        jo -= 1
        jn -= 1
    return i, jo, jn
def isSimilar(self, s1, s2): def isSimilar(self, s1, s2):
'''Returns True if strings p_s1 and p_s2 can be considered as '''Returns True if strings p_s1 and p_s2 can be considered as
similar.''' similar.'''
ratio = difflib.SequenceMatcher(a=s1.lower(), b=s2.lower()).ratio() ratio = difflib.SequenceMatcher(a=s1.lower(), b=s2.lower()).ratio()
return ratio > self.diffRatio return ratio > self.diffRatio
def splitTagAndContent(self, line):
    '''p_line is a XHTML tag with content. This method returns a tuple
       (startTag, content), where startTag is the isolated start tag and
       content is the tag content. If p_line holds no tag at all, the tuple
       ('', p_line) is returned.'''
    startTagEnd = line.find('>') + 1
    endTagStart = line.rfind('<')
    if (startTagEnd == 0) or (endTagStart == -1):
        # No start tag or no end tag: slicing with those "not found" indexes
        # would silently drop the last char of p_line.
        return '', line
    return line[:startTagEnd], line[startTagEnd:endTagStart]
def getLineAndType(self, line): def getLineAndType(self, line):
'''p_line is a string that can already have been surrounded by an '''p_line is a string that can already have been surrounded by an
"insert" or "delete" tag. This is what we try to determine here. "insert" or "delete" tag. This is what we try to determine here.
@ -286,14 +262,14 @@ class HtmlDiff:
* None else; * None else;
"line" holds the original parameter p_line, excepted: "line" holds the original parameter p_line, excepted:
* if type="insert". In that case, the surrounding insert tag has been * if type="insert". In that case, the surrounding insert tag has been
removed and placed into "outerTag" (the outer start tag to be more removed and placed into "outerTag" (a re.MatchObject from regex
precise); innerHtml, see above);
* if inner diff tags (insert or delete) are found. In that case, * if inner diff tags (insert or delete) are found. In that case,
- if inner "insert" tags are found, they are removed but their - if inner "insert" tags are found, they are removed but their
content is kept; content is kept;
- if inner "delete" tags are found, they are removed, content - if inner "delete" tags are found, they are removed, content
included; included;
- "innerDiffs" holds the list of re.MatchObjects instances - "innerDiffs" holds the list of re.MatchObject instances
representing the found inner tags. representing the found inner tags.
''' '''
if line.startswith(self.divDeletePrefix): if line.startswith(self.divDeletePrefix):
@ -301,7 +277,8 @@ class HtmlDiff:
if line.startswith(self.divInsertPrefix): if line.startswith(self.divInsertPrefix):
# Return the line without the surrounding tag. # Return the line without the surrounding tag.
action = 'insert' action = 'insert'
outerTag, line = self.splitTagAndContent(line) outerTag = htmlTag.match(line)
line = outerTag.group(3)
else: else:
action = None action = None
outerTag = None outerTag = None
@ -315,6 +292,21 @@ class HtmlDiff:
line = self.applyDiff(line, match) line = self.applyDiff(line, match)
return (action, line, innerDiffs, outerTag) return (action, line, innerDiffs, outerTag)
def computeTag(self, regexTag, content):
    '''p_regexTag is a re.MatchObject from regex htmlTag. p_content is a
       new content to put within this tag. This method returns the tag,
       rebuilt as a string and filled with p_content.'''
    tagName = regexTag.group(1)
    # Group 2 holds the tag attributes when some were found; it is falsy
    # otherwise, and nothing is then added to the start tag.
    attributes = regexTag.group(2) or ''
    # Wrap content into the reified start and end tags
    return '<' + tagName + attributes + '>' + content + '</' + tagName + '>'
def getSeqDiff(self, seqA, seqB): def getSeqDiff(self, seqA, seqB):
'''p_seqA and p_seqB are lists of strings. Here we will try to identify '''p_seqA and p_seqB are lists of strings. Here we will try to identify
similarities between strings from p_seqA and p_seqB, and return a similarities between strings from p_seqA and p_seqB, and return a
@ -403,6 +395,42 @@ class HtmlDiff:
i -= 1 i -= 1
return l return l
def getLineReplacement(self, lineA, lineB, previousDiffsA, outerTagA):
    '''p_lineA has been replaced with p_lineB. Here, we investigate further
       and explore differences at the *word* level between p_lineA and
       p_lineB. The resulting HTML chunk is returned.

       p_previousDiffsA may contain a series of updates (inserts, deletions)
       that have already been performed on p_lineA.

       If p_lineA was a previously inserted line, p_lineA comes without its
       outer tag, that lies in p_outerTagA (as a re.MatchObject instance
       computed from regex htmlTag). In that case, we will wrap the result
       with that tag.'''
    # As a preamble, and in order to restrict annoyances due to the presence
    # of XHTML tags, we will remove start and end tags from p_lineA and
    # p_lineB if present. The match object is tested explicitly: the tag
    # content may be the empty string (ie, "<p></p>"), and must then be kept
    # as is instead of falling back to the complete line.
    matchA = htmlTag.match(lineA)
    if matchA:
        contentA = matchA.group(3)
    else:
        contentA = lineA
    matchB = htmlTag.match(lineB)
    if matchB:
        contentB = matchB.group(3)
    else:
        contentB = lineB
    # Perform the diff at the level of words
    diff = self.getHtmlDiff(contentA, contentB, ' ')
    # Put the diff back into the tag that was found around lineB, if any
    if matchB:
        res = self.computeTag(matchB, diff)
    else:
        res = diff
    # Merge potential previous inner diff tags that were found (but
    # extracted from) lineA.
    if previousDiffsA:
        merger = Merger(lineA, res, previousDiffsA, self)
        res = merger.merge()
    # Rewrap line into outerTagA if lineA was a line tagged as previously
    # inserted.
    if outerTagA:
        res = self.computeTag(outerTagA, res)
    return res
def getHtmlDiff(self, old, new, sep): def getHtmlDiff(self, old, new, sep):
'''Returns the differences between p_old and p_new. Result is a string '''Returns the differences between p_old and p_new. Result is a string
containing the comparison in HTML format. p_sep is used for turning containing the comparison in HTML format. p_sep is used for turning
@ -440,40 +468,20 @@ class HtmlDiff:
toAdd = self.getModifiedChunk(chunkA, 'delete', sep) toAdd = self.getModifiedChunk(chunkA, 'delete', sep)
else: # At least, a true replacement else: # At least, a true replacement
if sep == '\n': if sep == '\n':
toAdd = []
# We know that some lines have been replaced from a to # We know that some lines have been replaced from a to
# b. By identifying similarities between those lines, # b. By identifying similarities between those lines,
# consider some as having been deleted, modified or # consider some as having been deleted, modified or
# inserted. # inserted.
toAdd = ''
for sAction, line in self.getSeqDiff(chunkA, chunkB): for sAction, line in self.getSeqDiff(chunkA, chunkB):
if sAction in ('insert', 'delete'): if sAction in ('insert', 'delete'):
toAdd += self.getModifiedChunk(line,sAction,sep) mChunk = self.getModifiedChunk(line,sAction,sep)
toAdd.append(mChunk)
elif sAction == 'equal': elif sAction == 'equal':
toAdd += line toAdd.append(line)
elif sAction == 'replace': elif sAction == 'replace':
lineA, lineB, previousDiffsA, outerTag = line toAdd.append(self.getLineReplacement(*line))
# Investigate further here and explore toAdd = sep.join(toAdd)
# differences at the *word* level between lineA
# and lineB. As a preamble, and in order to
# restrict annoyances due to the presence of
# XHTML tags, we will compute start and end
# parts wich are similar between lineA and
# lineB: they may correspond to opening and
# closing XHTML tags.
i, ja, jb = self.getStringDiff(lineA, lineB)
diff = self.getHtmlDiff(lineA[i:ja],
lineB[i:jb], ' ')
toAdd += lineB[:i] + diff + lineB[jb:]
# Merge potential previous inner diff tags that
# were found (but extracted from) lineA.
if previousDiffsA:
merger = Merger(lineA, toAdd,
previousDiffsA, self)
toAdd = merger.merge()
# Rewrap line into outerTag if lineA was a line
# tagged as previously inserted.
if outerTag:
toAdd = outerTag + toAdd + '</div>'
else: else:
toAdd = self.getModifiedChunk(chunkA, 'delete', sep) toAdd = self.getModifiedChunk(chunkA, 'delete', sep)
toAdd += self.getModifiedChunk(chunkB, 'insert', sep) toAdd += self.getModifiedChunk(chunkB, 'insert', sep)

View file

@ -164,7 +164,7 @@ def executeCommand(cmd):
return res return res
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
unwantedChars = ('\\', '/', ':', '*', '?', '"', '<', '>', '|', ' ') unwantedChars = ('\\', '/', ':', '*', '?', '"', '<', '>', '|', ' ', '\t')
alphaRex = re.compile('[a-zA-Z]') alphaRex = re.compile('[a-zA-Z]')
alphanumRex = re.compile('[a-zA-Z0-9]') alphanumRex = re.compile('[a-zA-Z0-9]')
def normalizeString(s, usage='fileName'): def normalizeString(s, usage='fileName'):
@ -212,6 +212,8 @@ def formatNumber(n, sep=',', precision=2, tsep=' '):
# Insert p_tsep every 3 chars in the integer part of the number # Insert p_tsep every 3 chars in the integer part of the number
splitted = res.split(sep) splitted = res.split(sep)
res = '' res = ''
if len(splitted[0]) < 4: res = splitted[0]
else:
i = len(splitted[0])-1 i = len(splitted[0])-1
j = 0 j = 0
while i >= 0: while i >= 0: