Added check for invalid HTML and sanitization on said HTML

Added HtmlSanitization package. Added check for invalid HTML and sanitization on WordsToLength and Truncate methods. Added some extra comments.
2017-08-31 11:43:06 +02:00
parent 8c1996a7a1
commit 73937172dc
5 changed files with 42 additions and 8 deletions
--- a/src/Umbraco.Web/HtmlStringUtilities.cs
+++ b/src/Umbraco.Web/HtmlStringUtilities.cs
@@ -5,6 +5,7 @@ using System.Linq;
 using System.Text;
 using System.Text.RegularExpressions;
 using System.Web;
+using Ganss.XSS;
 using HtmlAgilityPack;
 using Umbraco.Web.WebApi.Filters;

@@ -89,6 +90,18 @@ namespace Umbraco.Web
        {
            string hellip = "&hellip;";

+            HtmlDocument doc = new HtmlDocument();
+            HtmlSanitizer sanitizer = new HtmlSanitizer();
+
+            doc.LoadHtml(html);
+
+            //Check for invalid HTML
+            if (doc.ParseErrors.Any())
+            {
+                //Sanitize invalid HTML, it will not be pretty, but it will be valid
+                html = sanitizer.Sanitize(html);
+            }
+
            using (var outputms = new MemoryStream())
            {
                using (var outputtw = new StreamWriter(outputms))
@@ -255,7 +268,7 @@ namespace Umbraco.Web
                        //if there is, remove it
                        if (string.IsNullOrWhiteSpace(firstTrim) == false)
                        {
-                            result = firstTrim[firstTrim.Length - hellip.Length -1] == ' ' ? firstTrim.Remove(firstTrim.Length - hellip.Length -1, 1) : firstTrim;
+                            result = firstTrim[firstTrim.Length - hellip.Length - 1] == ' ' ? firstTrim.Remove(firstTrim.Length - hellip.Length - 1, 1) : firstTrim;
                        }
                        return new HtmlString(result);
                    }
@@ -272,10 +285,13 @@ namespace Umbraco.Web
        public int WordsToLength(string html, int words, bool tagsAsContent)
        {
            HtmlDocument doc = new HtmlDocument();
+            HtmlSanitizer sanitizer = new HtmlSanitizer();
+
+            doc.LoadHtml(html);

            int wordCount = 0,
                length = 0,
-                insideTagCounter = length,
+                insideTagCounter = 0,
                maxWords = words;

            string strippedOfTags = string.Empty;
@@ -283,14 +299,21 @@ namespace Umbraco.Web
            //If tagsAsContent is on, use the string stripped of html tags
            if (tagsAsContent == false)
            {
-                doc.LoadHtml(html);
-
                foreach (var node in doc.DocumentNode.ChildNodes)
                {
                    strippedOfTags += node.InnerText;
                }
                html = strippedOfTags;
            }
+            else
+            {
+                //Check for invalid HTML
+                if (doc.ParseErrors.Any())
+                {
+                    //Sanitize invalid HTML, it will not be pretty, but it will be valid
+                    html = sanitizer.Sanitize(html);
+                }
+            }

            while (length < html.Length)
            {
@@ -300,14 +323,13 @@ namespace Umbraco.Web
                while (length < html.Length && char.IsWhiteSpace(html[length]) == false)
                {
                    //Check if we have a space inside a tag and increase the length if we do
+                    //We ignore the end tag as it is added by the Truncate method
                    if (html[length].Equals('<') && html[length + 1].Equals('/') == false && tagsAsContent)
                    {
+                        insideTagCounter = length;
                        while (html[insideTagCounter].Equals('>') == false)
                        {
-                            if (html[insideTagCounter].Equals(' '))
-                            {
-                                length++;
-                            }
+                            length++;
                            insideTagCounter++;
                        }
                    }