Files
Umbraco-CMS/src/Umbraco.Web/HtmlStringUtilities.cs
Robert 73937172dc Added check for invalid HTML and sanitization on said HTML
Added HtmlSanitization package
Added check for invalid HTML and sanitization on WordsToLength and Truncate methods
Added some extra comments
2017-08-31 11:43:06 +02:00

350 lines
15 KiB
C#

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using Ganss.XSS;
using HtmlAgilityPack;
using Umbraco.Web.WebApi.Filters;
namespace Umbraco.Web
{
/// <summary>
/// Utility class for working with strings and HTML in views
/// </summary>
/// <remarks>
/// The UmbracoHelper uses this class for its string methods
/// </remarks>
public sealed class HtmlStringUtilities
{
/// <summary>
/// Inserts an html line break in front of every newline character of the given text
/// </summary>
/// <param name="text">The text.</param>
/// <returns>The text with every <c>\n</c> replaced by <c>&lt;br/&gt;</c> followed by <c>\n</c></returns>
public string ReplaceLineBreaksForHtml(string text)
{
    const string htmlLineBreak = "<br/>\n";

    // Splitting on '\n' and re-joining with the html break puts the break at every
    // position where a newline used to be, and nowhere else.
    var lines = text.Split('\n');
    return string.Join(htmlLineBreak, lines);
}
/// <summary>
/// Strips html tags from a string, keeping the text found inside the stripped elements
/// </summary>
/// <param name="html">The html to strip.</param>
/// <param name="tags">Names of the tags to strip; when null or empty, every tag is stripped.</param>
/// <returns>
/// The html with every matching element replaced by its inner text, or the input as-is
/// when the node query finds nothing to inspect.
/// </returns>
public HtmlString StripHtmlTags(string html, params string[] tags)
{
    var doc = new HtmlDocument();
    // Wrap the fragment so there is a single root element to query and read back from
    doc.LoadHtml("<p>" + html + "</p>");

    var nodes = doc.DocumentNode.FirstChild.SelectNodes(".//*");
    if (nodes == null)
    {
        // Nothing was selected, so there is nothing to strip
        return new HtmlString(html);
    }

    // Does not depend on the node being inspected, so decide once
    var filterAllTags = tags == null || tags.Length == 0;

    // Collect first and replace afterwards, so the node collection is not modified while it is enumerated
    var targets = new List<HtmlNode>();
    foreach (var node in nodes)
    {
        //is element
        if (node.NodeType != HtmlNodeType.Element) continue;

        // Tag names are identifiers, not linguistic text: compare ordinally so the match does not
        // depend on the current culture (e.g. "IMG" vs "img" under Turkish casing rules)
        if (filterAllTags || tags.Any(tag => string.Equals(tag, node.Name, StringComparison.OrdinalIgnoreCase)))
        {
            targets.Add(node);
        }
    }

    foreach (var target in targets)
    {
        HtmlNode content = doc.CreateTextNode(target.InnerText);
        target.ParentNode.ReplaceChild(content, target);
    }

    return new HtmlString(doc.DocumentNode.FirstChild.InnerHtml);
}
/// <summary>
/// Joins the string form of the given arguments with a separator
/// </summary>
/// <typeparam name="TIgnore">Arguments whose runtime type is exactly this type are skipped.</typeparam>
/// <param name="seperator">The text placed between two consecutive values.</param>
/// <param name="args">The values to join; nulls and values that format to an empty or whitespace-only string are skipped.</param>
/// <returns>The remaining values, formatted and joined in their original order.</returns>
internal string Join<TIgnore>(string seperator, params object[] args)
{
    var usableParts =
        from arg in args
        where arg != null && arg.GetType() != typeof(TIgnore)
        let formatted = string.Format("{0}", arg)
        where !string.IsNullOrWhiteSpace(formatted)
        select formatted;

    return string.Join(seperator, usableParts);
}
/// <summary>
/// Concatenates the string form of the given arguments without any separator
/// </summary>
/// <typeparam name="TIgnore">Arguments whose runtime type is exactly this type are skipped.</typeparam>
/// <param name="args">The values to concatenate; nulls and values that format to an empty or whitespace-only string are skipped.</param>
/// <returns>The remaining values, formatted and concatenated in their original order.</returns>
internal string Concatenate<TIgnore>(params object[] args)
{
    var usableParts =
        from arg in args
        where arg != null && arg.GetType() != typeof(TIgnore)
        let formatted = string.Format("{0}", arg)
        where !string.IsNullOrWhiteSpace(formatted)
        select formatted;

    return string.Concat(usableParts);
}
/// <summary>
/// Returns the string form of the first usable argument
/// </summary>
/// <typeparam name="TIgnore">Arguments whose runtime type is exactly this type are skipped.</typeparam>
/// <param name="args">The candidate values; nulls and values that format to an empty or whitespace-only string are skipped.</param>
/// <returns>The first remaining value, formatted as a string, or an empty string when there is none.</returns>
internal string Coalesce<TIgnore>(params object[] args)
{
    var firstUsable = args
        .Where(arg => arg != null && arg.GetType() != typeof(TIgnore))
        .Select(arg => string.Format("{0}", arg))
        .FirstOrDefault(formatted => !string.IsNullOrWhiteSpace(formatted));

    // FirstOrDefault yields null only when no candidate survived the filters
    return firstUsable ?? string.Empty;
}
/// <summary>
/// Truncates a block of html to a given number of characters while keeping the markup balanced
/// </summary>
/// <remarks>
/// The html is read character by character. Opening tags met before the limit are written out and
/// remembered on a stack; closing tags (also those met after the limit) pop that stack, so elements
/// opened before the cut are still closed in the output.
/// </remarks>
/// <param name="html">The html to truncate. If the parser reports errors the html is sanitized first.</param>
/// <param name="length">The number of characters to keep.</param>
/// <param name="addElipsis">When true, <c>&amp;hellip;</c> is written at the point where the limit is reached.</param>
/// <param name="treatTagsAsContent">When true, characters that belong to tags count towards <paramref name="length"/> as well.</param>
/// <returns>The truncated html.</returns>
public IHtmlString Truncate(string html, int length, bool addElipsis, bool treatTagsAsContent)
{
    const string hellip = "&hellip;";

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);
    //Check for invalid HTML
    if (doc.ParseErrors.Any())
    {
        //Sanitize invalid HTML, it will not be pretty, but it will be valid
        html = new HtmlSanitizer().Sanitize(html);
    }

    // Declared outside the reader scope because the clean-up at the end needs to know
    // whether the limit was hit (and therefore whether an ellipsis may have been written)
    bool lengthReached = false;

    using (var outputms = new MemoryStream())
    {
        using (var outputtw = new StreamWriter(outputms))
        {
            using (var ms = new MemoryStream())
            {
                using (var tw = new StreamWriter(ms))
                {
                    tw.Write(html);
                    tw.Flush();
                    ms.Position = 0;
                    // Names of the elements that were opened in the output and not closed yet
                    var tagStack = new Stack<string>();
                    using (TextReader tr = new StreamReader(ms))
                    {
                        bool isInsideElement = false,
                            insideTagSpaceEncountered = false,
                            isTagClose = false;
                        int ic = 0,
                            currentTextLength = 0;
                        string currentTag = string.Empty,
                            tagContents = string.Empty;
                        while ((ic = tr.Read()) != -1)
                        {
                            bool write = true;
                            switch ((char)ic)
                            {
                                case '<':
                                    // Tags that start after the limit are not tracked as elements
                                    if (!lengthReached)
                                    {
                                        isInsideElement = true;
                                    }
                                    insideTagSpaceEncountered = false;
                                    currentTag = string.Empty;
                                    tagContents = string.Empty;
                                    isTagClose = false;
                                    if (tr.Peek() == (int)'/')
                                    {
                                        isTagClose = true;
                                    }
                                    break;
                                case '>':
                                    isInsideElement = false;
                                    if (isTagClose && tagStack.Count > 0)
                                    {
                                        // Close whatever element is on top of the stack
                                        string thisTag = tagStack.Pop();
                                        outputtw.Write("</" + thisTag + ">");
                                        if (treatTagsAsContent)
                                        {
                                            currentTextLength++;
                                        }
                                    }
                                    if (!isTagClose && currentTag.Length > 0)
                                    {
                                        if (!lengthReached)
                                        {
                                            tagStack.Push(currentTag);
                                            outputtw.Write("<" + currentTag);
                                            if (treatTagsAsContent)
                                            {
                                                currentTextLength++;
                                            }
                                            if (!string.IsNullOrEmpty(tagContents))
                                            {
                                                if (tagContents.EndsWith("/"))
                                                {
                                                    // No end tag e.g. <br />.
                                                    tagStack.Pop();
                                                }
                                                outputtw.Write(tagContents);
                                                insideTagSpaceEncountered = false;
                                            }
                                            outputtw.Write(">");
                                        }
                                    }
                                    // Continue to next iteration of the text reader.
                                    continue;
                                default:
                                    if (isInsideElement)
                                    {
                                        if (ic == (int)' ')
                                        {
                                            if (!insideTagSpaceEncountered)
                                            {
                                                insideTagSpaceEncountered = true;
                                            }
                                        }
                                        // Everything up to the first space inside a tag is the tag name
                                        if (!insideTagSpaceEncountered)
                                        {
                                            currentTag += (char)ic;
                                        }
                                    }
                                    break;
                            }
                            if (isInsideElement || insideTagSpaceEncountered)
                            {
                                // Tag characters are buffered and written as a whole when '>' is read
                                write = false;
                                if (insideTagSpaceEncountered)
                                {
                                    tagContents += (char)ic;
                                }
                            }
                            if (!isInsideElement || treatTagsAsContent)
                            {
                                currentTextLength++;
                            }
                            if (currentTextLength <= length || (lengthReached && isInsideElement))
                            {
                                if (write)
                                {
                                    var charToWrite = (char)ic;
                                    outputtw.Write(charToWrite);
                                }
                            }
                            if (!lengthReached && currentTextLength >= length)
                            {
                                // if the last character added was the first of a two character unicode pair, add the second character
                                if (Char.IsHighSurrogate((char)ic))
                                {
                                    var lowSurrogate = tr.Read();
                                    // Read returns -1 at the end of the input; there is nothing to append then
                                    if (lowSurrogate != -1)
                                    {
                                        outputtw.Write((char)lowSurrogate);
                                    }
                                }
                                // Reached truncate limit.
                                if (addElipsis)
                                {
                                    outputtw.Write(hellip);
                                }
                                lengthReached = true;
                            }
                        }
                    }
                }
            }
            outputtw.Flush();
            outputms.Position = 0;
            using (TextReader outputtr = new StreamReader(outputms))
            {
                // Collapse double spaces and drop leading/trailing whitespace
                string result = outputtr.ReadToEnd().Replace("  ", " ").Trim();

                //Check to see if there is an empty char between the hellip and the output string
                //if there is, remove it.
                // Only look when an ellipsis was actually written and the text is long enough to
                // hold one; otherwise the index is negative or points at an unrelated character.
                int beforeHellip = result.Length - hellip.Length - 1;
                if (addElipsis && lengthReached && beforeHellip >= 0 && result[beforeHellip] == ' ')
                {
                    result = result.Remove(beforeHellip, 1);
                }
                return new HtmlString(result);
            }
        }
    }
}
/// <summary>
/// Returns the length, in characters, taken up by a given number of words of a html block
/// </summary>
/// <param name="html">Html text</param>
/// <param name="words">Amount of words you would like to measure</param>
/// <param name="tagsAsContent">
/// When true the markup itself is measured, and an opening tag (including any spaces inside it) is counted
/// as part of the word it occurs in; when false only the text with the tags removed is measured
/// </param>
/// <returns>
/// The character offset at which the requested number of words ends, or the length of the measured text
/// when it holds fewer words
/// </returns>
public int WordsToLength(string html, int words, bool tagsAsContent)
{
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    if (tagsAsContent == false)
    {
        //Tags do not count as content, so measure the text of the parsed nodes only
        var strippedOfTags = new StringBuilder();
        foreach (var node in doc.DocumentNode.ChildNodes)
        {
            strippedOfTags.Append(node.InnerText);
        }
        html = strippedOfTags.ToString();
    }
    else if (doc.ParseErrors.Any())
    {
        //Sanitize invalid HTML, it will not be pretty, but it will be valid
        html = new HtmlSanitizer().Sanitize(html);
    }

    int wordCount = 0,
        length = 0;

    // Stop at the end of the text or as soon as the requested amount of words has been counted
    while (length < html.Length && wordCount != words)
    {
        // Consume the characters of the current word
        while (length < html.Length && char.IsWhiteSpace(html[length]) == false)
        {
            //Spaces inside an opening tag must not end the word, so jump to the end of the tag
            //We ignore the end tag as it is added by the Truncate method
            bool opensTag = tagsAsContent
                && html[length] == '<'
                && (length + 1 >= html.Length || html[length + 1] != '/');
            if (opensTag)
            {
                while (length < html.Length && html[length] != '>')
                {
                    length++;
                }
                // Unterminated tag: the rest of the text belongs to it, do not step past the end
                if (length >= html.Length) break;
            }
            length++;
        }
        wordCount++;
        // Skip whitespace until the next word
        while (length < html.Length && char.IsWhiteSpace(html[length]) && wordCount != words)
        {
            length++;
        }
    }
    return length;
}
}
}