// Source: Umbraco-CMS/src/Umbraco.Core/Strings/Diff.cs (C#, 511 lines, 21 KiB)

using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
namespace Umbraco.Core.Strings
{
/// <summary>
/// This Class implements the Difference Algorithm published in
/// "An O(ND) Difference Algorithm and its Variations" by Eugene Myers
/// Algorithmica Vol. 1 No. 2, 1986, p 251.
///
/// The algorithm itself is comparing 2 arrays of numbers so when comparing 2 text documents
/// each line is converted into a (hash) number. See DiffText().
///
/// diff.cs: A port of the algorithm to C#
/// Copyright (c) by Matthias Hertel, http://www.mathertel.de
/// This work is licensed under a BSD style license. See http://www.mathertel.de/License.aspx
/// </summary>
internal class Diff
{
/// <summary>
/// Holds one of the two compared inputs together with the per-element
/// "modified" flags that the diff run fills in.
/// </summary>
internal class DiffData
{
    /// <summary>The numbers (one per element/line) that are compared.</summary>
    internal int[] Data;

    /// <summary>Number of elements (lines) in <see cref="Data"/>.</summary>
    internal int Length;

    /// <summary>
    /// Result flags of the diff: a set flag marks an element that was deleted
    /// (when this instance is the first input) or inserted (when it is the second input).
    /// The array is allocated two entries longer than <see cref="Data"/>.
    /// </summary>
    internal bool[] Modified;

    /// <summary>
    /// Wraps the given buffer (no copy is made) and allocates the flag array.
    /// </summary>
    /// <param name="initData">The buffer of numbers to compare; stored by reference.</param>
    internal DiffData(int[] initData)
    {
        int count = initData.Length;

        Data = initData;
        Length = count;
        Modified = new bool[count + 2];
    }
}
/// <summary>
/// Describes one difference between the two inputs: a run of elements deleted
/// from A and/or a run of elements inserted into B, starting at the given positions.
/// </summary>
public struct Item
{
/// <summary>Zero-based index in data A at which the difference starts.</summary>
public int StartA;
/// <summary>Zero-based index in data B at which the difference starts.</summary>
public int StartB;
/// <summary>Number of elements deleted from data A, counted from <see cref="StartA"/>.</summary>
public int DeletedA;
/// <summary>Number of elements inserted into data B, counted from <see cref="StartB"/>.</summary>
public int InsertedB;
} // Item
/// <summary>
/// Shortest Middle Snake return data: the single split point found by the search.
/// </summary>
private struct Smsrd
{
    /// <summary>Position of the split point in sequence A.</summary>
    internal int X;

    /// <summary>Position of the split point in sequence B.</summary>
    internal int Y;
}
/// <summary>
/// Finds the differences between two texts, comparing line by line, with
/// whitespace trimming, whitespace collapsing and case folding all switched off.
/// </summary>
/// <param name="textA">A-version of the text (usually the old one).</param>
/// <param name="textB">B-version of the text (usually the new one).</param>
/// <returns>An array of items that describe the differences.</returns>
public static Item[] DiffText(string textA, string textB)
{
    const bool trimSpace = false;
    const bool ignoreSpace = false;
    const bool ignoreCase = false;

    return DiffText(textA, textB, trimSpace, ignoreSpace, ignoreCase);
}
/// <summary>
/// Finds the differences between two texts, comparing character by character.
/// Each string is first converted into an array of character codes (case is kept),
/// and the two arrays are then compared with <see cref="DiffInt"/>.
/// </summary>
/// <param name="textA">A-version of the text (usually the old one).</param>
/// <param name="textB">B-version of the text (usually the new one).</param>
/// <returns>An array of items that describe the differences; positions and counts refer to characters.</returns>
public static Item[] DiffText1(string textA, string textB)
{
    int[] codesA = DiffCharCodes(textA, false);
    int[] codesB = DiffCharCodes(textB, false);

    return DiffInt(codesA, codesB);
}
/// <summary>
/// Finds the differences between two text documents, comparing line by line.
/// The algorithm itself compares two arrays of numbers, so each line is first converted
/// into a number: all lines of both texts are stored in one common table so that
/// duplicates map to the same number, and a new number is generated each time a
/// previously unseen line is inserted.
/// </summary>
/// <param name="textA">A-version of the text (usually the old one).</param>
/// <param name="textB">B-version of the text (usually the new one).</param>
/// <param name="trimSpace">When true, leading and trailing whitespace is stripped from every line before the comparison is done.</param>
/// <param name="ignoreSpace">When true, every run of whitespace characters is converted to a single space character before the comparison is done.</param>
/// <param name="ignoreCase">When true, all characters are converted to their lowercase equivalent before the comparison is done.</param>
/// <returns>An array of items that describe the differences.</returns>
public static Item[] DiffText(string textA, string textB, bool trimSpace, bool ignoreSpace, bool ignoreCase)
{
    // One table shared by both texts so that identical lines receive identical codes.
    // It holds one entry per distinct line, so it is left at its default capacity and
    // grows on demand; pre-sizing it with the combined character count of both texts
    // would reserve far more buckets than there can ever be lines.
    var lineCodes = new Hashtable();

    // The A-version of the data (original data) to be compared.
    var dataA = new DiffData(DiffCodes(textA, lineCodes, trimSpace, ignoreSpace, ignoreCase));

    // The B-version of the data (modified data) to be compared.
    var dataB = new DiffData(DiffCodes(textB, lineCodes, trimSpace, ignoreSpace, ignoreCase));

    var max = dataA.Length + dataB.Length + 1;

    // vector for the (0,0) to (x,y) search
    var downVector = new int[2 * max + 2];

    // vector for the (u,v) to (N,M) search
    var upVector = new int[2 * max + 2];

    Lcs(dataA, 0, dataA.Length, dataB, 0, dataB.Length, downVector, upVector);

    // Shift change blocks where possible so that the reported differences read more naturally.
    Optimize(dataA);
    Optimize(dataB);

    return CreateDiffs(dataA, dataB);
}
/// <summary>
/// Converts a string into an array that holds the numeric code of each of its characters,
/// in order, so that the text can be compared character by character.
/// </summary>
/// <param name="aText">The text to convert.</param>
/// <param name="ignoreCase">When true, the text is upper-cased (invariant culture) before the codes are taken.</param>
/// <returns>An array with one integer per character of the (optionally upper-cased) text.</returns>
private static int[] DiffCharCodes(string aText, bool ignoreCase)
{
    string source = ignoreCase ? aText.ToUpperInvariant() : aText;

    int[] result = new int[source.Length];
    int index = 0;
    foreach (char c in source)
    {
        result[index++] = c;
    }

    return result;
}
/// <summary>
/// Shifts blocks of modified lines downwards where that does not change the result:
/// if a block of modified lines starts with a line that has the same content as the
/// line directly following the block, the first line is unmarked and the following
/// line is marked instead. This is repeated for as long as it is possible.
/// This leads to more readable diff sequences when comparing text files.
/// </summary>
/// <param name="data">A diff data buffer containing the identified changes; its flags are updated in place.</param>
private static void Optimize(DiffData data)
{
    int length = data.Length;
    int[] values = data.Data;
    bool[] modified = data.Modified;

    int blockStart = 0;
    while (blockStart < length)
    {
        // Skip forward to the first modified line.
        while (blockStart < length && !modified[blockStart])
        {
            blockStart++;
        }

        // Find the first unmodified line after the block.
        int blockEnd = blockStart;
        while (blockEnd < length && modified[blockEnd])
        {
            blockEnd++;
        }

        bool canShift = blockEnd < length && values[blockStart] == values[blockEnd];
        if (!canShift)
        {
            // Nothing to move here; continue searching behind this block.
            blockStart = blockEnd;
            continue;
        }

        // Move the block one line down; the next pass re-examines the shifted block.
        modified[blockStart] = false;
        modified[blockEnd] = true;
    }
}
/// <summary>
/// Finds the differences between two arrays of integers.
/// </summary>
/// <param name="arrayA">A-version of the numbers (usually the old one).</param>
/// <param name="arrayB">B-version of the numbers (usually the new one).</param>
/// <returns>An array of items that describe the differences.</returns>
public static Item[] DiffInt(int[] arrayA, int[] arrayB)
{
    DiffData original = new DiffData(arrayA);
    DiffData changed = new DiffData(arrayB);

    // Both search vectors have the same size, derived from the combined input length.
    int max = original.Length + changed.Length + 1;
    int vectorSize = 2 * max + 2;
    int[] forwardVector = new int[vectorSize];   // (0,0) to (x,y) search
    int[] reverseVector = new int[vectorSize];   // (u,v) to (N,M) search

    Lcs(original, 0, original.Length, changed, 0, changed.Length, forwardVector, reverseVector);

    return CreateDiffs(original, changed);
}
/// <summary>
/// Converts every line of the text into a number such that equal lines (after the
/// requested normalisation) get the same number, so further work can operate on
/// simple numbers only.
/// </summary>
/// <param name="aText">The input text. Carriage returns are removed and the text is split at line feeds.</param>
/// <param name="h">
/// Externally created table that maps each (normalised) line seen so far to its number.
/// It is read and extended by this call, so passing the same table for several texts
/// makes identical lines share one number across those texts.
/// </param>
/// <param name="trimSpace">When true, leading and trailing whitespace of each line is ignored.</param>
/// <param name="ignoreSpace">When true, every run of whitespace inside a line is treated as a single space.</param>
/// <param name="ignoreCase">
/// When true, lines are compared case-insensitively. Lower-casing uses the invariant
/// culture so that the result does not depend on the current thread culture.
/// </param>
/// <returns>An array of integers, one per line of the text.</returns>
private static int[] DiffCodes(string aText, IDictionary h, bool trimSpace, bool ignoreSpace, bool ignoreCase)
{
    // New codes continue after the ones that are already in the table.
    var lastUsedCode = h.Count;

    // Strip off all CR, only use LF as the line separator.
    var lines = aText.Replace("\r", "").Split('\n');

    var codes = new int[lines.Length];
    for (var i = 0; i < lines.Length; i++)
    {
        var line = lines[i];

        if (trimSpace)
        {
            line = line.Trim();
        }

        if (ignoreSpace)
        {
            line = Regex.Replace(line, "\\s+", " ");
        }

        if (ignoreCase)
        {
            // Culture-invariant: a culture-sensitive ToLower() would, for example, map
            // "I" to a dotless i under Turkish cultures and make "I" and "i" differ.
            line = line.ToLowerInvariant();
        }

        var knownCode = h[line];
        if (knownCode == null)
        {
            // First occurrence of this line: hand out the next free number.
            lastUsedCode++;
            h[line] = lastUsedCode;
            codes[i] = lastUsedCode;
        }
        else
        {
            codes[i] = (int)knownCode;
        }
    }

    return codes;
}
/// <summary>
/// Finds the Shortest Middle Snake (SMS) for the given sub-ranges of A and B.
/// A forward search starting at (lowerA, lowerB) and a reverse search starting at
/// (upperA, upperB) are extended alternately, one step of d at a time, until the
/// two searches overlap on a common diagonal k (where k = x - y).
/// </summary>
/// <param name="dataA">sequence A</param>
/// <param name="lowerA">lower bound of the actual range in DataA</param>
/// <param name="upperA">upper bound of the actual range in DataA (exclusive)</param>
/// <param name="dataB">sequence B</param>
/// <param name="lowerB">lower bound of the actual range in DataB</param>
/// <param name="upperB">upper bound of the actual range in DataB (exclusive)</param>
/// <param name="downVector">a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.</param>
/// <param name="upVector">a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.</param>
/// <returns>
/// The point (X, Y) at which the overlap was detected, taken from the forward vector.
/// Only this one point is returned; the second point (u, v) is not needed by the caller.
/// </returns>
/// <exception cref="ApplicationException">Thrown if the search loop ends without finding an overlap.</exception>
private static Smsrd Sms(DiffData dataA, int lowerA, int upperA, DiffData dataB, int lowerB, int upperB, int[] downVector, int[] upVector)
{
int max = dataA.Length + dataB.Length + 1;
int downK = lowerA - lowerB; // the k-line to start the forward search
int upK = upperA - upperB; // the k-line to start the reverse search
// Difference between the lengths of the two sub-ranges; its parity decides
// whether the overlap is checked in the forward or in the reverse pass.
int delta = (upperA - lowerA) - (upperB - lowerB);
bool oddDelta = (delta & 1) != 0;
// The vectors in the publication accept negative indexes. The vectors implemented here are 0-based
// and are accessed using a specific offset: upOffset for upVector and downOffset for downVector.
// With these offsets the start diagonal of each search (downK / upK) lands on index max.
int downOffset = max - downK;
int upOffset = max - upK;
// Upper bound for d in the search loop below.
int maxD = ((upperA - lowerA + upperB - lowerB) / 2) + 1;
// init vectors: seed the neighbour entries that the d == 0 step reads.
downVector[downOffset + downK + 1] = lowerA;
upVector[upOffset + upK - 1] = upperA;
for (int d = 0; d <= maxD; d++)
{
// Extend the forward path.
Smsrd ret;
for (int k = downK - d; k <= downK + d; k += 2)
{
// find the only or better starting point
int x, y;
if (k == downK - d)
{
x = downVector[downOffset + k + 1]; // down
}
else
{
x = downVector[downOffset + k - 1] + 1; // a step to the right
if ((k < downK + d) && (downVector[downOffset + k + 1] >= x))
x = downVector[downOffset + k + 1]; // down
}
y = x - k;
// find the end of the furthest reaching forward D-path in diagonal k.
while ((x < upperA) && (y < upperB) && (dataA.Data[x] == dataB.Data[y]))
{
x++; y++;
}
downVector[downOffset + k] = x;
// overlap ? Checked here only for odd delta, and only on diagonals strictly
// between upK - d and upK + d.
if (oddDelta && (upK - d < k) && (k < upK + d))
{
if (upVector[upOffset + k] <= downVector[downOffset + k])
{
ret.X = downVector[downOffset + k];
ret.Y = downVector[downOffset + k] - k;
return (ret);
} // if
} // if
} // for k
// Extend the reverse path.
for (int k = upK - d; k <= upK + d; k += 2)
{
// find the only or better starting point
int x, y;
if (k == upK + d)
{
x = upVector[upOffset + k - 1]; // up
}
else
{
x = upVector[upOffset + k + 1] - 1; // left
if ((k > upK - d) && (upVector[upOffset + k - 1] < x))
x = upVector[upOffset + k - 1]; // up
} // if
y = x - k;
// follow equal elements backwards along diagonal k, down to the lower bounds.
while ((x > lowerA) && (y > lowerB) && (dataA.Data[x - 1] == dataB.Data[y - 1]))
{
x--; y--; // diagonal
}
upVector[upOffset + k] = x;
// overlap ? Checked here only for even delta, and only on diagonals within
// downK - d .. downK + d (inclusive).
if (!oddDelta && (downK - d <= k) && (k <= downK + d))
{
if (upVector[upOffset + k] <= downVector[downOffset + k])
{
ret.X = downVector[downOffset + k];
ret.Y = downVector[downOffset + k] - k;
return (ret);
} // if
} // if
} // for k
} // for D
// Reached only if no overlap was found for any d up to maxD.
throw new ApplicationException("the algorithm should never come here.");
} // SMS
/// <summary>
/// Divide-and-conquer implementation of the longest common subsequence (LCS) algorithm.
/// The published algorithm recursively passes parts of the A and B sequences; to avoid
/// copying the arrays, lower and upper bounds are passed instead while the sequences
/// themselves stay constant. The outcome is written to the Modified flags of both inputs.
/// </summary>
/// <param name="dataA">sequence A</param>
/// <param name="lowerA">lower bound of the actual range in DataA</param>
/// <param name="upperA">upper bound of the actual range in DataA (exclusive)</param>
/// <param name="dataB">sequence B</param>
/// <param name="lowerB">lower bound of the actual range in DataB</param>
/// <param name="upperB">upper bound of the actual range in DataB (exclusive)</param>
/// <param name="downVector">a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.</param>
/// <param name="upVector">a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.</param>
private static void Lcs(DiffData dataA, int lowerA, int upperA, DiffData dataB, int lowerB, int upperB, int[] downVector, int[] upVector)
{
    // Skip the equal elements at the start of both ranges.
    while (lowerA < upperA && lowerB < upperB && dataA.Data[lowerA] == dataB.Data[lowerB])
    {
        lowerA++;
        lowerB++;
    }

    // Skip the equal elements at the end of both ranges.
    while (lowerA < upperA && lowerB < upperB && dataA.Data[upperA - 1] == dataB.Data[upperB - 1])
    {
        upperA--;
        upperB--;
    }

    if (lowerA == upperA)
    {
        // Nothing left in A: whatever remains in B was inserted.
        for (int b = lowerB; b < upperB; b++)
        {
            dataB.Modified[b] = true;
        }

        return;
    }

    if (lowerB == upperB)
    {
        // Nothing left in B: whatever remains in A was deleted.
        for (int a = lowerA; a < upperA; a++)
        {
            dataA.Modified[a] = true;
        }

        return;
    }

    // Find the middle snake and split the problem at its (X, Y) point:
    // one half runs from the lower bounds to (X, Y), the other from (X, Y) to the upper bounds.
    Smsrd middle = Sms(dataA, lowerA, upperA, dataB, lowerB, upperB, downVector, upVector);

    Lcs(dataA, lowerA, middle.X, dataB, lowerB, middle.Y, downVector, upVector);
    Lcs(dataA, middle.X, upperA, dataB, middle.Y, upperB, downVector, upVector);
}
/// <summary>
/// Scans the tables of which lines are inserted and deleted,
/// producing an edit script in forward order.
/// </summary>
/// <param name="dataA">The first input with its Modified flags set for deleted lines.</param>
/// <param name="dataB">The second input with its Modified flags set for inserted lines.</param>
/// <returns>One item per contiguous run of deleted and/or inserted lines; empty when there is no difference.</returns>
private static Item[] CreateDiffs(DiffData dataA, DiffData dataB)
{
    // Typed list: avoids boxing every Item struct as a non-generic ArrayList would.
    var diffs = new List<Item>();

    int lineA = 0;
    int lineB = 0;
    while (lineA < dataA.Length || lineB < dataB.Length)
    {
        if ((lineA < dataA.Length) && (!dataA.Modified[lineA])
            && (lineB < dataB.Length) && (!dataB.Modified[lineB]))
        {
            // equal lines
            lineA++;
            lineB++;
            continue;
        }

        // maybe deleted and/or inserted lines
        int startA = lineA;
        int startB = lineB;

        // Consume deleted lines; once B is exhausted everything left in A counts as deleted.
        while (lineA < dataA.Length && (lineB >= dataB.Length || dataA.Modified[lineA]))
        {
            lineA++;
        }

        // Consume inserted lines; once A is exhausted everything left in B counts as inserted.
        while (lineB < dataB.Length && (lineA >= dataA.Length || dataB.Modified[lineB]))
        {
            lineB++;
        }

        if ((startA < lineA) || (startB < lineB))
        {
            // store a new difference-item
            diffs.Add(new Item
            {
                StartA = startA,
                StartB = startB,
                DeletedA = lineA - startA,
                InsertedB = lineB - startB
            });
        }
    }

    return diffs.ToArray();
}
} // class Diff
}