diff --git a/pod/__init__.py b/pod/__init__.py index d8a31d4..8d00957 100644 --- a/pod/__init__.py +++ b/pod/__init__.py @@ -45,7 +45,10 @@ class PodError(Exception): i += 1 if i > linesToRemove: buffer.write('<%s:p>' % textNs) - buffer.dumpContent(tLine) + try: + buffer.dumpContent(tLine) + except UnicodeDecodeError, ude: + buffer.dumpContent(tLine.decode('utf-8')) buffer.write('' % textNs) dumpTraceback = staticmethod(dumpTraceback) def dump(buffer, message, withinElement=None, removeFirstLine=False, dumpTb=True): diff --git a/shared/diff.py b/shared/diff.py index b98c1cc..7fc3ef7 100644 --- a/shared/diff.py +++ b/shared/diff.py @@ -4,6 +4,7 @@ import re, difflib # ------------------------------------------------------------------------------ innerDiff = re.compile('' \ '(.*?)') +htmlTag = re.compile('<(?P\w+)( .*?)?>(.*)') # ------------------------------------------------------------------------------ class Merger: @@ -28,9 +29,6 @@ class Merger: self.deltaPrevious = 0 # A link to the caller HtmlDiff class. self.differ = differ - # While "consuming" diffs (see m_getNextDiff), keep here every message - # from every diff. - self.messages = [self.differ.complexMsg] def computeNewDiffs(self): '''lineB may include inner "insert" and/or tags. This function @@ -74,49 +72,73 @@ class Merger: del self.newDiffs[0] return newDiff, newDiffIndex, False - def manageOverlaps(self): - '''We have detected that changes between lineA and lineB include - overlapping inserts and deletions. Our solution: to remember names - of editors and return the whole line in a distinct colour, where - we (unfortunately) can't distinguish editors's specific updates.''' - # First, get a "naked" version of self.lineB, without the latest - # updates. - res = self.lineB - for diff in self.newDiffs: - res = self.differ.applyDiff(res, diff) - # Construct the message explaining the series of updates. - # self.messages already contains messages from the "consumed" diffs - # (see m_getNextDiff). - for type in ('previous', 'new'): - exec 'diffs = self.%sDiffs' % type - for diff in diffs: - self.messages.append(diff.group(2)) - msg = ' -=- '.join(self.messages) - return self.differ.getModifiedChunk(res, 'complex', '\n', msg=msg) + def manageOverlap(self, oldDiff): + '''p_oldDiff is a previously inserted text from self.lineA. This text + is not found anymore at the start of self.lineB[self.i:]: it means + that an overlapping diff exists among new diffs. We will manage this + by identifying several, cutted, "insert" and/or "edit" zones.''' + # The idea here is to "consume" the old inserted text until we have + # found, within the new diff, all updates that have been performed on + # this old text. Then, we will have found the complete "zone" that was + # impacted by both old and new diffs. + oldText = oldDiff.group(3) + res = '' + while oldText: + # Get the overlapping (new) diff. + newDiff, newDiffStart, isPrevious = self.getNextDiff() + if not newDiff: + res += self.differ.getModifiedChunk(oldText, 'insert', '', + msg=oldDiff.group(2)) + self.i += len(oldText) + oldText = '' + break + # Dump the part of the old text that has been untouched by the new + # diff. + if self.i < newDiffStart: + untouched = self.lineB[self.i:newDiffStart] + res += self.differ.getModifiedChunk(untouched, 'insert', '', + msg=oldDiff.group(2)) + self.i = newDiffStart + oldText = oldText[len(untouched):] + # Manage the new diff + res += newDiff.group(0) + self.i += len(newDiff.group(0)) + self.deltaPrevious += len(newDiff.group(0)) + if newDiff.group(1) == 'delete': + # Consume oldText, that was deleted, at least partly, by + # this diff. + if len(newDiff.group(3)) >= len(oldText): + # We have consumed oldText in its entirety + oldText = '' + else: + oldText = oldText[len(newDiff.group(3)):] + self.deltaPrevious -= len(newDiff.group(3)) + return res def merge(self): '''Merges self.previousDiffs into self.lineB.''' res = '' diff, diffStart, isPrevious = self.getNextDiff() - if diff: self.messages.append(diff.group(2)) while diff: # Dump the part of lineB between self.i and diffStart res += self.lineB[self.i:diffStart] self.i = diffStart - # Dump the diff - res += diff.group(0) if isPrevious: if diff.group(1) == 'insert': # Check if the inserted text is still present in lineB if self.lineB[self.i:].startswith(diff.group(3)): - # Yes. Go ahead within lineB + # Yes. Dump the diff and go ahead within lineB + res += diff.group(0) self.i += len(diff.group(3)) else: # The inserted text can't be found as is in lineB. # Must have been (partly) re-edited or removed. - return self.manageOverlaps() + + overlap = self.manageOverlap(diff) + res += overlap else: - # Update self.i + # Dump the diff and update self.i + res += diff.group(0) self.i += len(diff.group(0)) # Because of this new diff, all indexes computed on lineA are # now wrong because we express them relative to lineB. So: @@ -129,7 +151,6 @@ class Merger: self.deltaPrevious -= len(diff.group(3)) # Load next diff diff, diffStart, isPrevious = self.getNextDiff() - if diff: self.messages.append(diff.group(2)) # Dump the end of self.lineB if not completely consumed if self.i < len(self.lineB): res += self.lineB[self.i:] @@ -141,14 +162,11 @@ class HtmlDiff: HTML chunk.''' insertStyle = 'color: blue; cursor: help' deleteStyle = 'color: red; text-decoration: line-through; cursor: help' - complexStyle = 'color: purple; cursor: help' def __init__(self, old, new, insertMsg='Inserted text', deleteMsg='Deleted text', - complexMsg='Multiple inserts and/or deletions', - insertCss=None, deleteCss=None, complexCss=None, - insertName='insert', deleteName='delete', - complexName='complex', diffRatio=0.7): + insertCss=None, deleteCss=None, insertName='insert', + deleteName='delete', diffRatio=0.7): # p_old and p_new are strings containing chunks of HTML. self.old = old.strip() self.new = new.strip() @@ -159,18 +177,15 @@ class HtmlDiff: # (who made it and at what time, for example). self.insertMsg = insertMsg self.deleteMsg = deleteMsg - self.complexMsg = complexMsg # This tag will get a CSS class p_insertCss or p_deleteCss for # highlighting the change. If no class is provided, default styles will # be used (see HtmlDiff.insertStyle and HtmlDiff.deleteStyle). self.insertCss = insertCss self.deleteCss = deleteCss - self.complexCss = complexCss # This tag will get a "name" attribute whose content will be # p_insertName or p_deleteName self.insertName = insertName self.deleteName = deleteName - self.complexName = complexName # The diff algorithm of this class will need to identify similarities # between strings. Similarity ratios will be computed by using method # difflib.SequenceMatcher.ratio (see m_isSimilar below). Strings whose @@ -179,19 +194,17 @@ class HtmlDiff: self.diffRatio = diffRatio # Some computed values for tag in ('div', 'span'): - for type in ('insert', 'delete', 'complex'): + for type in ('insert', 'delete'): setattr(self, '%s%sPrefix' % (tag, type.capitalize()), '<%s name="%s"' % (tag, getattr(self, '%sName' % type))) def getModifiedChunk(self, seq, type, sep, msg=None): '''p_sep.join(p_seq) (if p_seq is a list) or p_seq (if p_seq is a string) is a chunk that was either inserted (p_type='insert') or - deleted (p_type='delete'). It can also be a complex, partially - managed combination of inserts/deletions (p_type='insert'). - This method will surround this part with a div or span tag that will - get some CSS class allowing to highlight the update. If p_msg is - given, it will be used instead of the default p_type-related message - stored on p_self.''' + deleted (p_type='delete'). This method will surround this part with + a div or span tag that will get some CSS class allowing to highlight + the update. If p_msg is given, it will be used instead of the default + p_type-related message stored on p_self.''' # Will the surrouding tag be a div or a span? if sep == '\n': tag = 'div' else: tag = 'span' @@ -224,58 +237,21 @@ class HtmlDiff: return res def applyDiff(self, line, diff): - '''p_diff is a regex containing an insert or delete that was found within - line. This function applies the diff, removing or inserting the diff - into p_line.''' + '''p_diff is a regex containing an insert or delete that was found + within line. This function applies the diff, removing or inserting + the diff into p_line.''' # Keep content only for "insert" tags. content = '' if diff.group(1) == 'insert': content = diff.group(3) return line[:diff.start()] + content + line[diff.end():] - def getStringDiff(self, old, new): - '''Identifies the differences between strings p_old and p_new by - computing: - * i = the end index of the potential common starting part (if no - common part is found, i=0); - * jo = the start index in p_old of the potential common ending part; - * jn = the start index in p_new of the potential common ending part. - ''' - # Compute i - i = -1 - diffFound = False - while not diffFound: - i += 1 - if (i == len(old)) or (i == len(new)): break - if old[i] != new[i]: diffFound = True - # Compute jo and jn - jo = len(old) - jn = len(new) - diffFound = False - while not diffFound: - if (jo == i) or (jn == i): - # We have reached the end of substring old[i:] or new[i:] - jo -=1 - jn -= 1 - break - jo -= 1 - jn -= 1 - if old[jo] != new[jn]: diffFound=True - return i, jo+1, jn+1 - def isSimilar(self, s1, s2): '''Returns True if strings p_s1 and p_s2 can be considered as similar.''' ratio = difflib.SequenceMatcher(a=s1.lower(), b=s2.lower()).ratio() return ratio > self.diffRatio - def splitTagAndContent(self, line): - '''p_line is a XHTML tag with content. This method returns a tuple - (startTag, content), where p_startTag is the isolated start tag and - content is the tag content.''' - i = line.find('>')+1 - return line[0:i], line[i:line.rfind('<')] - def getLineAndType(self, line): '''p_line is a string that can already have been surrounded by an "insert" or "delete" tag. This is what we try to determine here. @@ -286,14 +262,14 @@ class HtmlDiff: * None else; "line" holds the original parameter p_line, excepted: * if type="insert". In that case, the surrounding insert tag has been - removed and placed into "outerTag" (the outer start tag to be more - precise); + removed and placed into "outerTag" (a re.MatchObject from regex + innerHtml, see above); * if inner diff tags (insert or delete) are found. In that case, - if inner "insert" tags are found, they are removed but their content is kept; - if inner "delete" tags are found, they are removed, content included; - - "innerDiffs" holds the list of re.MatchObjects instances + - "innerDiffs" holds the list of re.MatchObject instances representing the found inner tags. ''' if line.startswith(self.divDeletePrefix): @@ -301,7 +277,8 @@ class HtmlDiff: if line.startswith(self.divInsertPrefix): # Return the line without the surrounding tag. action = 'insert' - outerTag, line = self.splitTagAndContent(line) + outerTag = htmlTag.match(line) + line = outerTag.group(3) else: action = None outerTag = None @@ -315,6 +292,21 @@ class HtmlDiff: line = self.applyDiff(line, match) return (action, line, innerDiffs, outerTag) + def computeTag(self, regexTag, content): + '''p_regexTag is a re.MatchObject from regex htmlTag. p_content is a + new content to put within this tag. This method produces the new + string tag filled with p_content.''' + # Recompute start tag from p_regexTag + startTag = '<%s' % regexTag.group(1) + # Add tag attributes if found + if regexTag.group(2): + startTag += regexTag.group(2) + startTag += '>' + # Recompute end tag + endTag = '' % regexTag.group(1) + # Wrap content info reified tag + return startTag + content + endTag + def getSeqDiff(self, seqA, seqB): '''p_seqA and p_seqB are lists of strings. Here we will try to identify similarities between strings from p_seqA and p_seqB, and return a @@ -403,6 +395,42 @@ class HtmlDiff: i -= 1 return l + def getLineReplacement(self, lineA, lineB, previousDiffsA, outerTagA): + '''p_lineA has been replaced with p_lineB. Here, we will investigate + further here and explore differences at the *word* level between + p_lineA and p_lineB. + + p_previousDiffsA may contain a series of updates (inserts, deletions) + that have already been performed on p_lineA. + + If p_lineA was a previously inserted line, p_lineA comes without his + outer tag, that lies in p_outerTagA (as a re.MatchObject instance + computed from regex htmlTag). In that case, we will wrap the result + with that tag.''' + # As a preamble, and in order to restrict annoyances due to the presence + # of XHTML tags, we will remove start and end tags from p_lineA and + # p_lineB if present. + matchA = htmlTag.match(lineA) + contentA = matchA and matchA.group(3) or lineA + matchB = htmlTag.match(lineB) + contentB = matchB and matchB.group(3) or lineB + # Perform the diff at the level fo words + diff = self.getHtmlDiff(contentA, contentB, ' ') + if matchB: + res = self.computeTag(matchB, diff) + else: + res = diff + # Merge potential previous inner diff tags that + # were found (but extracted from) lineA. + if previousDiffsA: + merger = Merger(lineA, res, previousDiffsA, self) + res = merger.merge() + # Rewrap line into outerTagA if lineA was a line tagged as previously + # inserted. + if outerTagA: + res = self.computeTag(outerTagA, res) + return res + def getHtmlDiff(self, old, new, sep): '''Returns the differences between p_old and p_new. Result is a string containing the comparison in HTML format. p_sep is used for turning @@ -440,40 +468,20 @@ class HtmlDiff: toAdd = self.getModifiedChunk(chunkA, 'delete', sep) else: # At least, a true replacement if sep == '\n': + toAdd = [] # We know that some lines have been replaced from a to # b. By identifying similarities between those lines, # consider some as having been deleted, modified or # inserted. - toAdd = '' for sAction, line in self.getSeqDiff(chunkA, chunkB): if sAction in ('insert', 'delete'): - toAdd += self.getModifiedChunk(line,sAction,sep) + mChunk = self.getModifiedChunk(line,sAction,sep) + toAdd.append(mChunk) elif sAction == 'equal': - toAdd += line + toAdd.append(line) elif sAction == 'replace': - lineA, lineB, previousDiffsA, outerTag = line - # Investigate further here and explore - # differences at the *word* level between lineA - # and lineB. As a preamble, and in order to - # restrict annoyances due to the presence of - # XHTML tags, we will compute start and end - # parts wich are similar between lineA and - # lineB: they may correspond to opening and - # closing XHTML tags. - i, ja, jb = self.getStringDiff(lineA, lineB) - diff = self.getHtmlDiff(lineA[i:ja], - lineB[i:jb], ' ') - toAdd += lineB[:i] + diff + lineB[jb:] - # Merge potential previous inner diff tags that - # were found (but extracted from) lineA. - if previousDiffsA: - merger = Merger(lineA, toAdd, - previousDiffsA, self) - toAdd = merger.merge() - # Rewrap line into outerTag if lineA was a line - # tagged as previously inserted. - if outerTag: - toAdd = outerTag + toAdd + '' + toAdd.append(self.getLineReplacement(*line)) + toAdd = sep.join(toAdd) else: toAdd = self.getModifiedChunk(chunkA, 'delete', sep) toAdd += self.getModifiedChunk(chunkB, 'insert', sep) diff --git a/shared/utils.py b/shared/utils.py index b5c48a5..000a407 100644 --- a/shared/utils.py +++ b/shared/utils.py @@ -164,7 +164,7 @@ def executeCommand(cmd): return res # ------------------------------------------------------------------------------ -unwantedChars = ('\\', '/', ':', '*', '?', '"', '<', '>', '|', ' ') +unwantedChars = ('\\', '/', ':', '*', '?', '"', '<', '>', '|', ' ', '\t') alphaRex = re.compile('[a-zA-Z]') alphanumRex = re.compile('[a-zA-Z0-9]') def normalizeString(s, usage='fileName'): @@ -212,14 +212,16 @@ def formatNumber(n, sep=',', precision=2, tsep=' '): # Insert p_tsep every 3 chars in the integer part of the number splitted = res.split(sep) res = '' - i = len(splitted[0])-1 - j = 0 - while i >= 0: - j += 1 - res = splitted[0][i] + res - if (j % 3) == 0: - res = tsep + res - i -= 1 + if len(splitted[0]) < 4: res = splitted[0] + else: + i = len(splitted[0])-1 + j = 0 + while i >= 0: + j += 1 + res = splitted[0][i] + res + if (j % 3) == 0: + res = tsep + res + i -= 1 # Add the decimal part if not 0 if len(splitted) > 1: try: