appy.shared.diff: more work.
This commit is contained in:
		
							parent
							
								
									8c6301b901
								
							
						
					
					
						commit
						10398e770a
					
				
					 1 changed files with 124 additions and 106 deletions
				
			
		
							
								
								
									
										230
									
								
								shared/diff.py
									
										
									
									
									
								
							
							
						
						
									
										230
									
								
								shared/diff.py
									
										
									
									
									
								
							|  | @ -86,15 +86,18 @@ class Merger: | ||||||
|         while oldText: |         while oldText: | ||||||
|             # Get the overlapping (new) diff. |             # Get the overlapping (new) diff. | ||||||
|             newDiff, newDiffStart, isPrevious = self.getNextDiff() |             newDiff, newDiffStart, isPrevious = self.getNextDiff() | ||||||
|             if not newDiff: |             if not newDiff or (newDiffStart >= (self.i + len(oldText))): | ||||||
|                 # No more new diff. So normally, we should find what remains in |                 # No more new diff, or a new diff but far away, not within | ||||||
|                 # oldText at self.lineB[self.i:] |                 # oldText. So insert the rest of p_oldText. | ||||||
|                 if not self.lineB[self.i:].startswith(oldText): |                 # Invariant: at this point, we should find what remains in | ||||||
|                     raise 'Error!!!!' |                 # oldText at self.lineB[self.i:]. | ||||||
|                 res += self.differ.getModifiedChunk(oldText, 'insert', '', |                 res += self.differ.getModifiedChunk(oldText, 'insert', '', | ||||||
|                                                     msg=oldDiff.group(2)) |                                                     msg=oldDiff.group(2)) | ||||||
|                 self.i += len(oldText) |                 self.i += len(oldText) | ||||||
|                 oldText = '' |                 oldText = '' | ||||||
|  |                 # If we have "popped" a new diff, dump it anyway. | ||||||
|  |                 if newDiff: | ||||||
|  |                     res = self.dumpDiff(res, newDiff, newDiffStart, isPrevious) | ||||||
|                 break |                 break | ||||||
|             # Dump the part of the old text that has been untouched by the new |             # Dump the part of the old text that has been untouched by the new | ||||||
|             # diff. |             # diff. | ||||||
|  | @ -119,47 +122,54 @@ class Merger: | ||||||
|                 self.deltaPrevious -= len(newDiff.group(3)) |                 self.deltaPrevious -= len(newDiff.group(3)) | ||||||
|         return res |         return res | ||||||
| 
 | 
 | ||||||
|  |     def dumpDiff(self, res, diff, diffStart, isPrevious): | ||||||
|  |         '''Dumps the next p_diff (starting at p_diffStart) into p_res and | ||||||
|  |            returns p_res. If p_isPrevious is True, the diff is an old one | ||||||
|  |            (from self.lineA); else, it is a new one (from self.lineB).''' | ||||||
|  |         # Dump the part of lineB between self.i and diffStart | ||||||
|  |         res += self.lineB[self.i:diffStart] | ||||||
|  |         self.i = diffStart | ||||||
|  |         if isPrevious: | ||||||
|  |             # Dump the old diff (from self.lineA) | ||||||
|  |             if diff.group(1) == 'insert': | ||||||
|  |                 # Check if the inserted text is still present in lineB | ||||||
|  |                 if self.lineB[self.i:].startswith(diff.group(3)): | ||||||
|  |                     # Yes. Dump the diff and go ahead within lineB | ||||||
|  |                     res += diff.group(0) | ||||||
|  |                     self.i += len(diff.group(3)) | ||||||
|  |                 else: | ||||||
|  |                     # The inserted text can't be found as is in lineB. | ||||||
|  |                     # Must have been (partly) re-edited or removed. | ||||||
|  |                     overlap = self.manageOverlap(diff) | ||||||
|  |                     res += overlap | ||||||
|  |             elif diff.group(1) == 'delete': | ||||||
|  |                 res += diff.group(0) | ||||||
|  |         else: | ||||||
|  |             # Dump the new diff (from self.lineB) | ||||||
|  |             res += diff.group(0) | ||||||
|  |             # Move forward within self.lineB | ||||||
|  |             self.i += len(diff.group(0)) | ||||||
|  |             # Because of this new diff, all indexes computed on lineA are | ||||||
|  |             # now wrong because we express them relative to lineB. So: | ||||||
|  |             # update self.deltaPrevious to take this into account. | ||||||
|  |             self.deltaPrevious += len(diff.group(0)) | ||||||
|  |             if diff.group(1) == 'delete': | ||||||
|  |                 # The indexes in self.lineA do not take the deleted text into | ||||||
|  |                 # account, because it wasn't deleted at this time. So remove | ||||||
|  |                 # from self.deltaPrevious the length of removed text. | ||||||
|  |                 self.deltaPrevious -= len(diff.group(3)) | ||||||
|  |         return res | ||||||
|  | 
 | ||||||
|     def merge(self): |     def merge(self): | ||||||
|         '''Merges self.previousDiffs into self.lineB.''' |         '''Merges self.previousDiffs into self.lineB.''' | ||||||
|         res = '' |         res = '' | ||||||
|         diff, diffStart, isPrevious = self.getNextDiff() |         diff, diffStart, isPrevious = self.getNextDiff() | ||||||
|         while diff: |         while diff: | ||||||
|             # Dump the part of lineB between self.i and diffStart |             res = self.dumpDiff(res, diff, diffStart, isPrevious) | ||||||
|             res += self.lineB[self.i:diffStart] |             # Load the next diff, if any | ||||||
|             self.i = diffStart |  | ||||||
|             if isPrevious: |  | ||||||
|                 if diff.group(1) == 'insert': |  | ||||||
|                     # Check if the inserted text is still present in lineB |  | ||||||
|                     if self.lineB[self.i:].startswith(diff.group(3)): |  | ||||||
|                         # Yes. Dump the diff and go ahead within lineB |  | ||||||
|                         res += diff.group(0) |  | ||||||
|                         self.i += len(diff.group(3)) |  | ||||||
|                     else: |  | ||||||
|                         # The inserted text can't be found as is in lineB. |  | ||||||
|                         # Must have been (partly) re-edited or removed. |  | ||||||
|                          |  | ||||||
|                         overlap = self.manageOverlap(diff) |  | ||||||
|                         res += overlap |  | ||||||
|                 elif diff.group(1) == 'delete': |  | ||||||
|                     res += diff.group(0) |  | ||||||
|             else: |  | ||||||
|                 # Dump the diff and update self.i |  | ||||||
|                 res += diff.group(0) |  | ||||||
|                 self.i += len(diff.group(0)) |  | ||||||
|                 # Because of this new diff, all indexes computed on lineA are |  | ||||||
|                 # now wrong because we express them relative to lineB. So: |  | ||||||
|                 # update self.deltaPrevious to take this into account. |  | ||||||
|                 self.deltaPrevious += len(diff.group(0)) |  | ||||||
|                 if diff.group(1) == 'delete': |  | ||||||
|                     # The indexes in lineA do not take the deleted text into |  | ||||||
|                     # account, because it wasn't deleted at this time. So remove |  | ||||||
|                     # from self.deltaPrevious the length of removed text. |  | ||||||
|                     self.deltaPrevious -= len(diff.group(3)) |  | ||||||
|             # Load next diff |  | ||||||
|             diff, diffStart, isPrevious = self.getNextDiff() |             diff, diffStart, isPrevious = self.getNextDiff() | ||||||
|         # Dump the end of self.lineB if not completely consumed |         # Dump the end of self.lineB if not completely consumed | ||||||
|         if self.i < len(self.lineB): |         if self.i < len(self.lineB): res += self.lineB[self.i:] | ||||||
|             res += self.lineB[self.i:] |  | ||||||
|         return res |         return res | ||||||
| 
 | 
 | ||||||
| # ------------------------------------------------------------------------------ | # ------------------------------------------------------------------------------ | ||||||
|  | @ -405,12 +415,14 @@ class HtmlDiff: | ||||||
|         return res |         return res | ||||||
| 
 | 
 | ||||||
|     garbage = ('', '\r') |     garbage = ('', '\r') | ||||||
|     def removeGarbage(self, l): |     def removeGarbage(self, l, sep): | ||||||
|         '''Removes from list p_l elements that have no interest, like blank |         '''Removes from list p_l elements that have no interest, like blank | ||||||
|            strings or considered as is.''' |            strings or considered as is. Also: strip lines (ie, if sep is a | ||||||
|  |            carriage return).''' | ||||||
|         i = len(l)-1 |         i = len(l)-1 | ||||||
|         while i >= 0: |         while i >= 0: | ||||||
|             if l[i] in self.garbage: del l[i] |             if l[i] in self.garbage: del l[i] | ||||||
|  |             if sep == '\n': l[i] = l[i].strip() | ||||||
|             i -= 1 |             i -= 1 | ||||||
|         return l |         return l | ||||||
| 
 | 
 | ||||||
|  | @ -444,33 +456,64 @@ class HtmlDiff: | ||||||
|             if old[jo] != new[jn]: diffFound=True |             if old[jo] != new[jn]: diffFound=True | ||||||
|         return i, jo+1, jn+1 |         return i, jo+1, jn+1 | ||||||
| 
 | 
 | ||||||
|     def getReplacement(self, sep, lineA, lineB, previousDiffsA, outerTagA): |     def getDumpPrefix(self, res, add, previousAdd, sep): | ||||||
|         '''p_lineA has been replaced with p_lineB. Here, we will investigate |         '''In most cases, when concatenating the next diff (p_add) to the | ||||||
|            further here and explore differences at the *word* level between |            global result (p_res), I must prefix it with p_sep (except if p_res | ||||||
|            p_lineA and p_lineB. |            is still empty). But when p_sep is a space, no space must be inserted | ||||||
|  |            between 2 adjacent updates (p_add and p_previousAdd), because such a | ||||||
|  |            space was not in the original version. This method computes the | ||||||
|  |            prefix, that can thus be empty if this latter case is met.''' | ||||||
|  |         prefix = '' | ||||||
|  |         if not res: return prefix | ||||||
|  |         if (sep == ' ') and previousAdd and \ | ||||||
|  |            previousAdd.endswith('</span>') and add.startswith('<span'): | ||||||
|  |             pass | ||||||
|  |         else: | ||||||
|  |             prefix = sep | ||||||
|  |         return prefix | ||||||
| 
 | 
 | ||||||
|            p_previousDiffsA may contain a series of updates (inserts, deletions) |     def getReplacement(self, chunkA, chunkB, sep): | ||||||
|            that have already been performed on p_lineA. |         '''p_chunkA has been replaced with p_chunkB. Compute this update and | ||||||
|  |            return it.''' | ||||||
|  |         res = '' | ||||||
|  |         # We know that some lines have been replaced from chunkA to chunkB. By | ||||||
|  |         # identifying similarities between those lines, consider some as having | ||||||
|  |         # been deleted, modified or inserted. | ||||||
|  |         previousAdd = None | ||||||
|  |         for action, line in self.getSeqDiff(chunkA, chunkB, sep): | ||||||
|  |             add = None | ||||||
|  |             if action in ('insert', 'delete'): | ||||||
|  |                 add = self.getModifiedChunk(line, action, sep) | ||||||
|  |             elif action == 'equal': | ||||||
|  |                 add = line | ||||||
|  |             elif action == 'replace': | ||||||
|  |                 lineA, lineB, previousDiffsA, outerTagA = line | ||||||
|  |                 # lineA has been replaced with lineB. Here, we will investigate | ||||||
|  |                 # further and explore differences at the *word* level | ||||||
|  |                 # between lineA and lineB. previousDiffsA may contain a series | ||||||
|  |                 # of updates (inserts, deletions) that have already been | ||||||
|  |                 # performed on lineA. If lineA was a previously inserted line, | ||||||
|  |                 # lineA comes without its outer tag, which lies in outerTagA | ||||||
|  |                 # (as a re.MatchObject instance computed from regex htmlTag). | ||||||
|  |                 # In that case, we will wrap the result with that tag. | ||||||
| 
 | 
 | ||||||
|            If p_lineA was a previously inserted line, p_lineA comes without his |                 # As a preamble, and in order to restrict annoyances due to the | ||||||
|            outer tag, that lies in p_outerTagA (as a re.MatchObject instance |                 # presence of XHTML tags, we will remove start and end tags | ||||||
|            computed from regex htmlTag). In that case, we will wrap the result |                 # from lineA and lineB if present. | ||||||
|            with that tag.''' |                 i, ja, jb = self.getStringDiff(lineA, lineB) | ||||||
|         # As a preamble, and in order to restrict annoyances due to the presence |                 diff = self.getHtmlDiff(lineA[i:ja], lineB[i:jb], ' ') | ||||||
|         # of XHTML tags, we will remove start and end tags from p_lineA and |                 add = lineB[:i] + diff + lineB[jb:] | ||||||
|         # p_lineB if present. |                 # Merge potential previous inner diff tags that were found (but | ||||||
|         i, ja, jb = self.getStringDiff(lineA, lineB) |                 # extracted from) lineA. | ||||||
|         diff = self.getHtmlDiff(lineA[i:ja], lineB[i:jb], ' ') |                 if previousDiffsA: | ||||||
|         res = lineB[:i] + diff + lineB[jb:] |                     merger = Merger(lineA, add, previousDiffsA, self) | ||||||
|         # Merge potential previous inner diff tags that were found (but |                     add = merger.merge() | ||||||
|         # extracted from) lineA. |                 # Rewrap line into outerTagA if lineA was a line tagged as | ||||||
|         if previousDiffsA: |                 # previously inserted. | ||||||
|             merger = Merger(lineA, res, previousDiffsA, self) |                 if outerTagA: | ||||||
|             res = merger.merge() |                     add = self.computeTag(outerTagA, add) | ||||||
|         # Rewrap line into outerTagA if lineA was a line tagged as previously |             if add: res += self.getDumpPrefix(res, add, previousAdd, sep) + add | ||||||
|         # inserted. |             previousAdd = add | ||||||
|         if outerTagA: |  | ||||||
|             res = self.computeTag(outerTagA, res) |  | ||||||
|         return res |         return res | ||||||
| 
 | 
 | ||||||
|     def getHtmlDiff(self, old, new, sep): |     def getHtmlDiff(self, old, new, sep): | ||||||
|  | @ -482,65 +525,40 @@ class HtmlDiff: | ||||||
|            word-by-word comparison within 2 lines that have been detected as |            word-by-word comparison within 2 lines that have been detected as | ||||||
|            similar in a previous call to m_getHtmlDiff with sep=carriage |            similar in a previous call to m_getHtmlDiff with sep=carriage | ||||||
|            return.''' |            return.''' | ||||||
|         res = [] |         res = '' | ||||||
|         a = self.split(old, sep) |         a = self.split(old, sep) | ||||||
|         b = self.split(new, sep) |         b = self.split(new, sep) | ||||||
|         matcher = difflib.SequenceMatcher() |         matcher = difflib.SequenceMatcher() | ||||||
|         matcher.set_seqs(a, b) |         matcher.set_seqs(a, b) | ||||||
|  |         previousAdd = None | ||||||
|         for action, i1, i2, j1, j2 in matcher.get_opcodes(): |         for action, i1, i2, j1, j2 in matcher.get_opcodes(): | ||||||
|  |             add = None | ||||||
|             # When sep is a space, we need to remember if we are dealing with |             # When sep is a space, we need to remember if we are dealing with | ||||||
|             # the last diff within the line or not. |             # the last diff within the line or not. | ||||||
|             chunkA = self.removeGarbage(a[i1:i2]) |             chunkA = self.removeGarbage(a[i1:i2], sep) | ||||||
|             chunkB = self.removeGarbage(b[j1:j2]) |             chunkB = self.removeGarbage(b[j1:j2], sep) | ||||||
|             toAdd = None |  | ||||||
|             if action == 'equal': |             if action == 'equal': | ||||||
|                 if chunkA: toAdd = sep.join(chunkA) |                 if chunkA: add = sep.join(chunkA) | ||||||
|             elif action == 'insert': |             elif action == 'insert': | ||||||
|                 if chunkB: |                 if chunkB: | ||||||
|                     toAdd = self.getModifiedChunk(chunkB, action, sep) |                     add = self.getModifiedChunk(chunkB, action, sep) | ||||||
|             elif action == 'delete': |             elif action == 'delete': | ||||||
|                 if chunkA: |                 if chunkA: | ||||||
|                     toAdd = self.getModifiedChunk(chunkA, action, sep) |                     add = self.getModifiedChunk(chunkA, action, sep) | ||||||
|             elif action == 'replace': |             elif action == 'replace': | ||||||
|                 if not chunkA and not chunkB: |                 if not chunkA and not chunkB: | ||||||
|                     toAdd = '' |                     pass | ||||||
|                 elif not chunkA: |                 elif not chunkA: | ||||||
|                     # Was an addition, not a replacement |                     # Was an addition, not a replacement | ||||||
|                     toAdd = self.getModifiedChunk(chunkB, 'insert', sep) |                     add = self.getModifiedChunk(chunkB, 'insert', sep) | ||||||
|                 elif not chunkB: |                 elif not chunkB: | ||||||
|                     # Was a deletion, not a replacement |                     # Was a deletion, not a replacement | ||||||
|                     toAdd = self.getModifiedChunk(chunkA, 'delete', sep) |                     add = self.getModifiedChunk(chunkA, 'delete', sep) | ||||||
|                 else: # At least, a true replacement |                 else: # At least, a true replacement | ||||||
|                     toAdd = '' |                     add = self.getReplacement(chunkA, chunkB, sep) | ||||||
|                     # We know that some lines have been replaced from a to b. |             if add: res += self.getDumpPrefix(res, add, previousAdd, sep) + add | ||||||
|                     # By identifying similarities between those lines, consider |             previousAdd = add | ||||||
|                     # some as having been deleted, modified or inserted. |         return res | ||||||
|                     previousAdd = None |  | ||||||
|                     for sAction, line in self.getSeqDiff(chunkA, chunkB, sep): |  | ||||||
|                         if sAction in ('insert', 'delete'): |  | ||||||
|                             add = self.getModifiedChunk(line, sAction, sep) |  | ||||||
|                         elif sAction == 'equal': |  | ||||||
|                             add = line |  | ||||||
|                         elif sAction == 'replace': |  | ||||||
|                             add = self.getReplacement(sep, *line) |  | ||||||
|                         # In most cases, I must prefix "add" with "sep" before |  | ||||||
|                         # concatenating it to "toAdd" (excepted if toAdd is |  | ||||||
|                         # still empty). But when "sep" is a space, no space |  | ||||||
|                         # must be inserted between 2 adjacent updates, because |  | ||||||
|                         # such a space was not in the original version. |  | ||||||
|                         prefix = '' |  | ||||||
|                         if toAdd: |  | ||||||
|                             if (sep == ' ') and previousAdd and \ |  | ||||||
|                                previousAdd.endswith('</span>') and \ |  | ||||||
|                                add.startswith('<span'): |  | ||||||
|                                 pass |  | ||||||
|                             else: |  | ||||||
|                                 prefix = sep |  | ||||||
|                         toAdd += prefix + add |  | ||||||
|                         if sep == ' ': |  | ||||||
|                         previousAdd = add |  | ||||||
|             if toAdd: res.append(toAdd) |  | ||||||
|         return sep.join(res) |  | ||||||
| 
 | 
 | ||||||
|     def get(self): |     def get(self): | ||||||
|         '''Produces the result.''' |         '''Produces the result.''' | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Gaetan Delannay
						Gaetan Delannay