Added check for invalid HTML and sanitization on said HTML

Added HtmlSanitization package
Added check for invalid HTML and sanitization on WordsToLength and Truncate methods
Addes some extra comments
This commit is contained in:
Robert
2017-08-31 11:43:06 +02:00
parent 8c1996a7a1
commit 73937172dc
5 changed files with 42 additions and 8 deletions

View File

@@ -5,6 +5,7 @@ using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using Ganss.XSS;
using HtmlAgilityPack;
using Umbraco.Web.WebApi.Filters;
@@ -89,6 +90,18 @@ namespace Umbraco.Web
{
string hellip = "…";
HtmlDocument doc = new HtmlDocument();
HtmlSanitizer sanitizer = new HtmlSanitizer();
doc.LoadHtml(html);
//Check for invalid HTML
if (doc.ParseErrors.Any())
{
//Sanitize invalid HTML, it will not be pretty, but it will be valid
html = sanitizer.Sanitize(html);
}
using (var outputms = new MemoryStream())
{
using (var outputtw = new StreamWriter(outputms))
@@ -255,7 +268,7 @@ namespace Umbraco.Web
//if there is, remove it
if (string.IsNullOrWhiteSpace(firstTrim) == false)
{
result = firstTrim[firstTrim.Length - hellip.Length -1] == ' ' ? firstTrim.Remove(firstTrim.Length - hellip.Length -1, 1) : firstTrim;
result = firstTrim[firstTrim.Length - hellip.Length - 1] == ' ' ? firstTrim.Remove(firstTrim.Length - hellip.Length - 1, 1) : firstTrim;
}
return new HtmlString(result);
}
@@ -272,10 +285,13 @@ namespace Umbraco.Web
public int WordsToLength(string html, int words, bool tagsAsContent)
{
HtmlDocument doc = new HtmlDocument();
HtmlSanitizer sanitizer = new HtmlSanitizer();
doc.LoadHtml(html);
int wordCount = 0,
length = 0,
insideTagCounter = length,
insideTagCounter = 0,
maxWords = words;
string strippedOfTags = string.Empty;
@@ -283,14 +299,21 @@ namespace Umbraco.Web
//If tagsAsContent is on, use the string stripped of html tags
if (tagsAsContent == false)
{
doc.LoadHtml(html);
foreach (var node in doc.DocumentNode.ChildNodes)
{
strippedOfTags += node.InnerText;
}
html = strippedOfTags;
}
else
{
//Check for invalid HTML
if (doc.ParseErrors.Any())
{
//Sanitize invalid HTML, it will not be pretty, but it will be valid
html = sanitizer.Sanitize(html);
}
}
while (length < html.Length)
{
@@ -300,14 +323,13 @@ namespace Umbraco.Web
while (length < html.Length && char.IsWhiteSpace(html[length]) == false)
{
//Check if we have a space inside a tag and increase the length if we do
//We ignore the end tag as it is added by the Truncate method
if (html[length].Equals('<') && html[length + 1].Equals('/') == false && tagsAsContent)
{
insideTagCounter = length;
while (html[insideTagCounter].Equals('>') == false)
{
if (html[insideTagCounter].Equals(' '))
{
length++;
}
length++;
insideTagCounter++;
}
}