using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
namespace Umbraco.Core.Strings
{
/// <summary>
/// This class implements the difference algorithm published in
/// "An O(ND) Difference Algorithm and its Variations" by Eugene Myers,
/// Algorithmica Vol. 1 No. 2, 1986, p 251.
/// </summary>
/// <remarks>
/// The algorithm itself compares two arrays of numbers, so when comparing two text documents
/// each line is first converted into a (hash) number. See DiffText().
///
/// diff.cs: A port of the algorithm to C#
/// Copyright (c) by Matthias Hertel, http://www.mathertel.de
/// This work is licensed under a BSD style license. See http://www.mathertel.de/License.aspx
/// </remarks>
internal class Diff
{
/// <summary>
/// Holds one of the two inputs of a comparison together with its per-element change flags.
/// </summary>
internal class DiffData
{
    /// <summary>Number of elements (lines) in <see cref="Data"/>.</summary>
    internal int Length;

    /// <summary>The numbers that are compared against the other input.</summary>
    internal int[] Data;

    /// <summary>
    /// Change flags, one per element, which make up the result of the diff.
    /// A set flag means "deleted" when this is the first input and "inserted"
    /// when this is the second input.
    /// </summary>
    internal bool[] Modified;

    /// <summary>
    /// Wraps the given buffer (it is referenced, not copied) and allocates the change flags.
    /// </summary>
    /// <param name="initData">The numbers to compare.</param>
    internal DiffData(int[] initData)
    {
        int elementCount = initData.Length;

        Data = initData;
        Length = elementCount;
        // The flag array is allocated two slots larger than the data.
        Modified = new bool[elementCount + 2];
    }
}
/// <summary>
/// Describes one contiguous difference between input A and input B.
/// </summary>
public struct Item
{
    /// <summary>Index in A at which the difference starts.</summary>
    public int StartA;

    /// <summary>Index in B at which the difference starts.</summary>
    public int StartB;

    /// <summary>Number of elements removed from A at <see cref="StartA"/>.</summary>
    public int DeletedA;

    /// <summary>Number of elements added to B at <see cref="StartB"/>.</summary>
    public int InsertedB;
}
/// <summary>
/// Shortest Middle Snake Return Data: the single point (X, Y) reported by the
/// middle-snake search.
/// </summary>
private struct Smsrd
{
    /// <summary>Position in sequence A.</summary>
    internal int X;

    /// <summary>Position in sequence B.</summary>
    internal int Y;
}
/// <summary>
/// Compares two texts line by line without any normalization
/// (no trimming, no whitespace folding, case-sensitive).
/// </summary>
/// <param name="textA">A-version of the text (usually the old one).</param>
/// <param name="textB">B-version of the text (usually the new one).</param>
/// <returns>An array of <see cref="Item"/> values describing the differences.</returns>
public static Item[] DiffText(string textA, string textB)
{
    // All three normalization switches of the full overload stay off.
    const bool disabled = false;
    return DiffText(textA, textB, disabled, disabled, disabled);
}
/// <summary>
/// Compares two texts character by character: each string is converted into its
/// character codes (case-sensitively) and the resulting arrays are handed to DiffInt.
/// </summary>
/// <param name="textA">A-version of the text (usually the old one).</param>
/// <param name="textB">B-version of the text (usually the new one).</param>
/// <returns>An array of <see cref="Item"/> values describing the differences.</returns>
public static Item[] DiffText1(string textA, string textB)
{
    int[] codesA = DiffCharCodes(textA, false);
    int[] codesB = DiffCharCodes(textB, false);
    return DiffInt(codesA, codesB);
}
/// <summary>
/// Finds the differences between two text documents, comparing them line by line.
/// </summary>
/// <remarks>
/// The algorithm itself compares two arrays of numbers, so every line is first converted
/// into a number. Both texts share one lookup table, so identical lines in A and B
/// receive the same number and a new number is generated only for a line not seen before.
/// </remarks>
/// <param name="textA">A-version of the text (usually the old one).</param>
/// <param name="textB">B-version of the text (usually the new one).</param>
/// <param name="trimSpace">When true, leading and trailing whitespace is stripped from every line before comparing.</param>
/// <param name="ignoreSpace">When true, every run of whitespace is converted to a single space before comparing.</param>
/// <param name="ignoreCase">When true, all characters are converted to lowercase before comparing.</param>
/// <returns>An array of <see cref="Item"/> values describing the differences.</returns>
/// <exception cref="ArgumentNullException"><paramref name="textA"/> or <paramref name="textB"/> is null.</exception>
public static Item[] DiffText(string textA, string textB, bool trimSpace, bool ignoreSpace, bool ignoreCase)
{
    if (textA == null)
    {
        throw new ArgumentNullException("textA");
    }
    if (textB == null)
    {
        throw new ArgumentNullException("textB");
    }

    // The table holds one entry per distinct line, so it is left to grow on demand.
    // Pre-sizing it with the total character count of both texts over-allocated
    // heavily for large inputs.
    var lineCodes = new Hashtable();

    // The A-version of the data (original data) to be compared.
    var dataA = new DiffData(DiffCodes(textA, lineCodes, trimSpace, ignoreSpace, ignoreCase));
    // The B-version of the data (modified data) to be compared.
    var dataB = new DiffData(DiffCodes(textB, lineCodes, trimSpace, ignoreSpace, ignoreCase));

    var max = dataA.Length + dataB.Length + 1;
    // vector for the (0,0) to (x,y) search
    var downVector = new int[2 * max + 2];
    // vector for the (u,v) to (N,M) search
    var upVector = new int[2 * max + 2];

    Lcs(dataA, 0, dataA.Length, dataB, 0, dataB.Length, downVector, upVector);

    // Shift change runs where possible so the reported blocks read more naturally.
    Optimize(dataA);
    Optimize(dataB);
    return CreateDiffs(dataA, dataB);
}
/// <summary>
/// Converts a text into an array holding the numeric code of each of its characters.
/// </summary>
/// <param name="aText">The text to convert.</param>
/// <param name="ignoreCase">When true, the text is upper-cased (invariant culture) before conversion.</param>
/// <returns>One integer per character of the (possibly upper-cased) text.</returns>
private static int[] DiffCharCodes(string aText, bool ignoreCase)
{
    string source = ignoreCase ? aText.ToUpperInvariant() : aText;

    var result = new int[source.Length];
    int position = 0;
    foreach (char ch in source)
    {
        result[position] = ch;
        position++;
    }
    return result;
}
/// <summary>
/// Shifts runs of modified lines downwards where possible: when the first line of a run
/// has the same content as the line directly following the run, the first line is
/// unmarked and the following line is marked instead. This is repeated until the run
/// cannot be shifted any further, which gives more readable diffs for text files.
/// </summary>
/// <param name="data">A diff data buffer containing the identified changes.</param>
private static void Optimize(DiffData data)
{
    int length = data.Length;
    bool[] flags = data.Modified;
    int[] values = data.Data;

    int runStart = 0;
    while (runStart < length)
    {
        // Advance to the first modified line at or after runStart.
        while (runStart < length && !flags[runStart])
        {
            runStart++;
        }

        // Find the first unmodified line after the run (or the end of the data).
        int runEnd = runStart;
        while (runEnd < length && flags[runEnd])
        {
            runEnd++;
        }

        bool shiftable = runEnd < length && values[runStart] == values[runEnd];
        if (!shiftable)
        {
            // Nothing to shift; continue the scan behind this run.
            runStart = runEnd;
            continue;
        }

        // Move the run one line down; the next iteration re-examines the shifted run.
        flags[runStart] = false;
        flags[runEnd] = true;
    }
}
/// <summary>
/// Finds the differences between two arrays of integers.
/// </summary>
/// <param name="arrayA">A-version of the numbers (usually the old one).</param>
/// <param name="arrayB">B-version of the numbers (usually the new one).</param>
/// <returns>An array of <see cref="Item"/> values describing the differences.</returns>
public static Item[] DiffInt(int[] arrayA, int[] arrayB)
{
    DiffData original = new DiffData(arrayA);
    DiffData changed = new DiffData(arrayB);

    // Both search vectors share the same size.
    int max = original.Length + changed.Length + 1;
    int vectorSize = 2 * max + 2;
    int[] forwardVector = new int[vectorSize];   // (0,0) to (x,y) search
    int[] reverseVector = new int[vectorSize];   // (u,v) to (N,M) search

    Lcs(original, 0, original.Length, changed, 0, changed.Length, forwardVector, reverseVector);
    return CreateDiffs(original, changed);
}
/// <summary>
/// Converts every line of a text into a number such that equal lines (after the requested
/// normalization) get equal numbers, so further work can operate on plain integers.
/// </summary>
/// <param name="aText">The input text. Carriage returns are removed and the text is split on line feeds.</param>
/// <param name="h">
/// Externally created table mapping each line seen so far to its number. It is read and
/// extended by this method; new numbers continue after the current entry count.
/// </param>
/// <param name="trimSpace">When true, leading and trailing whitespace is stripped from each line.</param>
/// <param name="ignoreSpace">When true, every run of whitespace is replaced by a single space.</param>
/// <param name="ignoreCase">When true, each line is lower-cased using the invariant culture.</param>
/// <returns>An array with one number per line of the text.</returns>
private static int[] DiffCodes(string aText, IDictionary h, bool trimSpace, bool ignoreSpace, bool ignoreCase)
{
    // New codes continue after the ones already stored in the table.
    var lastUsedCode = h.Count;

    // strip off all cr, only use lf as textline separator.
    aText = aText.Replace("\r", "");
    var lines = aText.Split('\n');

    var codes = new int[lines.Length];
    for (int i = 0; i < lines.Length; ++i)
    {
        string s = lines[i];
        if (trimSpace)
        {
            s = s.Trim();
        }
        if (ignoreSpace)
        {
            s = Regex.Replace(s, "\\s+", " "); // TODO: optimization: faster blank removal.
        }
        if (ignoreCase)
        {
            // Invariant casing keeps the result independent of the current thread culture
            // (with a culture-sensitive ToLower, "I" and "i" differ under tr-TR).
            s = s.ToLowerInvariant();
        }

        object aCode = h[s];
        if (aCode == null)
        {
            // First occurrence of this line: assign the next free code.
            lastUsedCode++;
            h[s] = lastUsedCode;
            codes[i] = lastUsedCode;
        }
        else
        {
            codes[i] = (int)aCode;
        }
    }
    return codes;
}
/// <summary>
/// This is the algorithm to find the Shortest Middle Snake (SMS).
/// A forward search starting at (lowerA, lowerB) and a reverse search starting at
/// (upperA, upperB) are extended alternately, one step of d at a time, until they
/// overlap on a common diagonal k (where k = x - y).
/// </summary>
/// <param name="dataA">sequence A</param>
/// <param name="lowerA">lower bound of the actual range in DataA</param>
/// <param name="upperA">upper bound of the actual range in DataA (exclusive)</param>
/// <param name="dataB">sequence B</param>
/// <param name="lowerB">lower bound of the actual range in DataB</param>
/// <param name="upperB">upper bound of the actual range in DataB (exclusive)</param>
/// <param name="downVector">a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.</param>
/// <param name="upVector">a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.</param>
/// <returns>
/// The point (X, Y) read from the forward vector on the diagonal where the overlap was
/// detected. Only this one point is returned (no second u,v point).
/// </returns>
/// <exception cref="ApplicationException">No overlap was found within maxD steps.</exception>
private static Smsrd Sms(DiffData dataA, int lowerA, int upperA, DiffData dataB, int lowerB, int upperB, int[] downVector, int[] upVector)
{
int max = dataA.Length + dataB.Length + 1;
int downK = lowerA - lowerB; // the k-line to start the forward search
int upK = upperA - upperB; // the k-line to start the reverse search
// Difference between the lengths of the two ranges; its parity decides which of the
// two searches performs the overlap check below.
int delta = (upperA - lowerA) - (upperB - lowerB);
bool oddDelta = (delta & 1) != 0;
// The vectors in the publication accept negative indexes. The vectors implemented here are 0-based
// and are accessed using a specific offset: upOffset for upVector and downOffset for downVector.
// With these offsets the start diagonal of each search maps to index max.
int downOffset = max - downK;
int upOffset = max - upK;
// Upper bound for d: half of the combined range length, plus one.
int maxD = ((upperA - lowerA + upperB - lowerB) / 2) + 1;
// init vectors: seed the neighbour slots that the d == 0 iteration reads from.
downVector[downOffset + downK + 1] = lowerA;
upVector[upOffset + upK - 1] = upperA;
for (int d = 0; d <= maxD; d++)
{
// Extend the forward path.
// ret is only assigned (both fields) immediately before it is returned.
Smsrd ret;
for (int k = downK - d; k <= downK + d; k += 2)
{
// find the only or better starting point
int x, y;
if (k == downK - d)
{
x = downVector[downOffset + k + 1]; // down
}
else
{
x = downVector[downOffset + k - 1] + 1; // a step to the right
if ((k < downK + d) && (downVector[downOffset + k + 1] >= x))
x = downVector[downOffset + k + 1]; // down
}
y = x - k;
// find the end of the furthest reaching forward D-path in diagonal k.
while ((x < upperA) && (y < upperB) && (dataA.Data[x] == dataB.Data[y]))
{
x++; y++;
}
downVector[downOffset + k] = x;
// overlap ? Checked here only for odd delta, and only when k lies strictly
// between upK - d and upK + d.
if (oddDelta && (upK - d < k) && (k < upK + d))
{
if (upVector[upOffset + k] <= downVector[downOffset + k])
{
ret.X = downVector[downOffset + k];
ret.Y = downVector[downOffset + k] - k;
return (ret);
} // if
} // if
} // for k
// Extend the reverse path.
for (int k = upK - d; k <= upK + d; k += 2)
{
// find the only or better starting point
int x, y;
if (k == upK + d)
{
x = upVector[upOffset + k - 1]; // up
}
else
{
x = upVector[upOffset + k + 1] - 1; // left
if ((k > upK - d) && (upVector[upOffset + k - 1] < x))
x = upVector[upOffset + k - 1]; // up
} // if
y = x - k;
// follow equal elements backwards along diagonal k.
while ((x > lowerA) && (y > lowerB) && (dataA.Data[x - 1] == dataB.Data[y - 1]))
{
x--; y--; // diagonal
}
upVector[upOffset + k] = x;
// overlap ? Checked here only for even delta, and only when k lies within
// downK - d and downK + d (inclusive).
if (!oddDelta && (downK - d <= k) && (k <= downK + d))
{
if (upVector[upOffset + k] <= downVector[downOffset + k])
{
ret.X = downVector[downOffset + k];
ret.Y = downVector[downOffset + k] - k;
return (ret);
} // if
} // if
} // for k
} // for D
throw new ApplicationException("the algorithm should never come here.");
} // SMS
/// <summary>
/// Divide-and-conquer implementation of the longest common subsequence (LCS) algorithm.
/// Instead of passing copies of parts of the A and B sequences down the recursion, the
/// sequences stay constant and only the lower and upper bounds of the current box change.
/// The result is recorded in the Modified flags of both inputs.
/// </summary>
/// <param name="dataA">sequence A</param>
/// <param name="lowerA">lower bound of the actual range in DataA</param>
/// <param name="upperA">upper bound of the actual range in DataA (exclusive)</param>
/// <param name="dataB">sequence B</param>
/// <param name="lowerB">lower bound of the actual range in DataB</param>
/// <param name="upperB">upper bound of the actual range in DataB (exclusive)</param>
/// <param name="downVector">a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.</param>
/// <param name="upVector">a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.</param>
private static void Lcs(DiffData dataA, int lowerA, int upperA, DiffData dataB, int lowerB, int upperB, int[] downVector, int[] upVector)
{
    // Skip the elements that are equal at the start of both ranges.
    while (lowerA < upperA && lowerB < upperB && dataA.Data[lowerA] == dataB.Data[lowerB])
    {
        lowerA += 1;
        lowerB += 1;
    }

    // Skip the elements that are equal at the end of both ranges.
    while (lowerA < upperA && lowerB < upperB && dataA.Data[upperA - 1] == dataB.Data[upperB - 1])
    {
        upperA -= 1;
        upperB -= 1;
    }

    if (lowerA == upperA)
    {
        // Nothing left in A: whatever remains in B was inserted.
        for (int line = lowerB; line < upperB; line++)
        {
            dataB.Modified[line] = true;
        }
        return;
    }

    if (lowerB == upperB)
    {
        // Nothing left in B: whatever remains in A was deleted.
        for (int line = lowerA; line < upperA; line++)
        {
            dataA.Modified[line] = true;
        }
        return;
    }

    // Split the box at the middle snake and solve both halves recursively:
    // from the lower bounds to (X, Y), then from (X, Y) to the upper bounds.
    Smsrd middle = Sms(dataA, lowerA, upperA, dataB, lowerB, upperB, downVector, upVector);
    Lcs(dataA, lowerA, middle.X, dataB, lowerB, middle.Y, downVector, upVector);
    Lcs(dataA, middle.X, upperA, dataB, middle.Y, upperB, downVector, upVector);
}
/// <summary>
/// Scans the Modified flags of both inputs, which tell which lines are deleted and
/// inserted, and produces an edit script in forward order.
/// </summary>
/// <param name="dataA">The A-version of the data with its deletion flags.</param>
/// <param name="dataB">The B-version of the data with its insertion flags.</param>
/// <returns>One <see cref="Item"/> per contiguous block of deleted and/or inserted lines.</returns>
private static Item[] CreateDiffs(DiffData dataA, DiffData dataB)
{
    // A typed list avoids boxing every Item struct, as the non-generic ArrayList did.
    var diffs = new List<Item>();
    int lineA = 0;
    int lineB = 0;

    while (lineA < dataA.Length || lineB < dataB.Length)
    {
        if ((lineA < dataA.Length) && (!dataA.Modified[lineA])
            && (lineB < dataB.Length) && (!dataB.Modified[lineB]))
        {
            // equal lines
            lineA++;
            lineB++;
            continue;
        }

        // maybe deleted and/or inserted lines
        int startA = lineA;
        int startB = lineB;

        // Consume deleted lines; once B is exhausted, everything left in A counts as deleted.
        while (lineA < dataA.Length && (lineB >= dataB.Length || dataA.Modified[lineA]))
        {
            lineA++;
        }

        // Consume inserted lines; once A is exhausted, everything left in B counts as inserted.
        while (lineB < dataB.Length && (lineA >= dataA.Length || dataB.Modified[lineB]))
        {
            lineB++;
        }

        if ((startA < lineA) || (startB < lineB))
        {
            // store a new difference-item
            var item = new Item();
            item.StartA = startA;
            item.StartB = startB;
            item.DeletedA = lineA - startA;
            item.InsertedB = lineB - startB;
            diffs.Add(item);
        }
    }

    return diffs.ToArray();
}
} // class Diff
}