using System.Text.RegularExpressions;
using Umbraco.Cms.Core.Routing;
using Umbraco.Extensions;

namespace Umbraco.Cms.Core.Templates;

/// <summary>
///     Locates <c>&lt;img&gt;</c> tags carrying a <c>data-udi</c> attribute in an HTML string and
///     rewrites or clears their <c>src</c> attribute; also extracts UDIs from <c>data-udi</c> attributes.
/// </summary>
public sealed class HtmlImageSourceParser
{
    // Matches a whole <img ...> tag that has a data-udi attribute.
    // Groups: 1 = the complete data-udi attribute, 2 = the data-udi attribute value.
    private static readonly Regex ResolveImgPattern = new(
        @"<img[^>]*(data-udi=""([^""]*)"")[^>]*>",
        RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace);

    // Matches a src="..." attribute.
    // Groups: 1 = the value up to (not including) the query string, 2 = the query string including the '?'.
    private static readonly Regex SrcAttributeRegex = new(
        @"src=""([^""\?]*)(\?[^""]*)?""",
        RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace);

    // Matches a data-udi attribute (single or double quoted, optionally backslash-escaped quotes)
    // and captures the umb:// value in the named group "udi".
    // NOTE(review): the [A-z] range also admits the characters between 'Z' and 'a' ([ \ ] ^ _ `);
    // left unchanged here so that no currently accepted value stops matching.
    private static readonly Regex DataUdiAttributeRegex = new(
        @"data-udi=\\?(?:""|')(?<udi>umb://[A-z0-9\-]+/[A-z0-9]+)\\?(?:""|')",
        RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase);

    private readonly IPublishedUrlProvider? _publishedUrlProvider;
    private readonly Func<Guid, string?>? _getMediaUrl;

    /// <summary>
    ///     Initializes a new instance that resolves media URLs through the supplied delegate.
    /// </summary>
    /// <param name="getMediaUrl">Returns the URL for a media key, or <c>null</c> when there is none.</param>
    public HtmlImageSourceParser(Func<Guid, string?> getMediaUrl) => _getMediaUrl = getMediaUrl;

    /// <summary>
    ///     Initializes a new instance that resolves media URLs through the supplied URL provider.
    /// </summary>
    /// <param name="publishedUrlProvider">The provider whose <c>GetMediaUrl</c> is called for each media key.</param>
    public HtmlImageSourceParser(IPublishedUrlProvider publishedUrlProvider) =>
        _publishedUrlProvider = publishedUrlProvider;

    /// <summary>
    ///     Parses out media UDIs from an html string based on 'data-udi' html attributes
    /// </summary>
    /// <param name="text">The HTML to scan.</param>
    /// <returns>
    ///     One UDI per <c>data-udi</c> attribute whose value parses successfully, in document order;
    ///     values that fail to parse are skipped.
    /// </returns>
    public IEnumerable<Udi> FindUdisFromDataAttributes(string text)
    {
        foreach (Match match in DataUdiAttributeRegex.Matches(text))
        {
            if (UdiParser.TryParse(match.Groups["udi"].Value, out Udi? udi))
            {
                yield return udi;
            }
        }
    }

    /// <summary>
    ///     Parses the string looking for Umbraco image tags and updates them to their up-to-date image sources.
    /// </summary>
    /// <param name="text">The HTML to process.</param>
    /// <returns>
    ///     The HTML with the <c>src</c> path of each resolvable image replaced by its current media URL.
    ///     The original query string is kept. Tags without a parsable UDI, without a <c>src</c> attribute,
    ///     or whose media URL resolves to <c>null</c> are returned unchanged.
    /// </returns>
    /// <remarks>Umbraco image tags are identified by their data-udi attributes</remarks>
    public string EnsureImageSources(string text)
    {
        // Resolve the lookup once per call instead of lazily mutating the field.
        Func<Guid, string?> getMediaUrl =
            _getMediaUrl ?? (guid => _publishedUrlProvider?.GetMediaUrl(guid));

        return ResolveImgPattern.Replace(text, match =>
        {
            // match groups:
            // - 1 = the data-udi attribute
            // - 2 = the data-udi attribute value
            var udi = match.Groups[2].Value;
            if (udi.IsNullOrWhiteSpace() || !UdiParser.TryParse(udi, out GuidUdi? guidUdi))
            {
                return match.Value;
            }

            // src match groups:
            // - 1 = the src attribute value until the query string
            // - 2 = the src attribute query string including the '?'
            Match src = SrcAttributeRegex.Match(match.Value);
            if (src.Success == false)
            {
                // the src attribute isn't found, return the original value
                return match.Value;
            }

            var mediaUrl = getMediaUrl(guidUdi.Guid);
            if (mediaUrl == null)
            {
                // image does not exist - we could choose to remove the image entirely here (return empty string),
                // but that would leave the editors completely in the dark as to why the image doesn't show
                return match.Value;
            }

            return match.Value.Replace(src.Value, $"src=\"{mediaUrl}{src.Groups[2].Value}\"");
        });
    }

    /// <summary>
    ///     Removes media URLs from &lt;img&gt; tags where a data-udi attribute is present
    /// </summary>
    /// <param name="text">The HTML to process.</param>
    /// <returns>
    ///     The HTML with the path part of the <c>src</c> attribute emptied for every matching image tag;
    ///     a query string, if present, is left in place. Tags without a <c>src</c> attribute, or whose
    ///     <c>src</c> path is empty or whitespace, are returned unchanged.
    /// </returns>
    public string RemoveImageSources(string text)
        => ResolveImgPattern.Replace(text, match =>
        {
            // Find the src attribute (see the group reference in EnsureImageSources).
            Match src = SrcAttributeRegex.Match(match.Value);
            Group path = src.Groups[1];
            if (src.Success == false || string.IsNullOrWhiteSpace(path.Value))
            {
                return match.Value;
            }

            // Cut the path out of the matched attribute text only. Replacing the bare path across
            // the whole tag would also strip it from unrelated attributes (alt, srcset, title, ...).
            var emptiedSrc = src.Value.Remove(path.Index - src.Index, path.Length);
            return match.Value.Replace(src.Value, emptiedSrc);
        });
}