using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;
using Umbraco.Core.Configuration;

namespace Umbraco.Core.Strings
{
    /// <summary>
    /// Legacy implementation of string functions for short strings such as aliases or url segments.
    /// </summary>
    /// <remarks>
    /// <para>Not necessarily optimized to work on large bodies of text.</para>
    /// <para>Can expose surprising or bogus behavior.</para>
    /// <para>Uses invariant culture everywhere.</para>
    /// </remarks>
    internal class LegacyShortStringHelper : IShortStringHelper
    {
        #region Ctor and vars

        /// <summary>
        /// Freezes the helper so it can prevent its configuration from being modified.
        /// </summary>
        /// <remarks>Will be called by <c>ShortStringHelperResolver</c> when resolution freezes.</remarks>
        public void Freeze()
        {
            // we have nothing to protect.
        }

        // characters (compared lower-cased) that are allowed in a safe alias
        const string UmbracoValidAliasCharacters = "_-abcdefghijklmnopqrstuvwxyz1234567890";

        // characters that are dropped while the alias being built is still empty
        const string UmbracoInvalidFirstCharacters = "0123456789";

        // Matches a leading run of upper-case ASCII letters followed by the rest of the string.
        // Built once: constructing a compiled Regex on every call is needlessly expensive.
        private static readonly Regex LeadingUpperCaseRun =
            new Regex("^([A-Z]*)([A-Z].*)$", RegexOptions.Singleline | RegexOptions.Compiled);

        #endregion

        #region Short string services JavaScript

        const string SssjsValidCharacters = "_-abcdefghijklmnopqrstuvwxyz1234567890";
        const string SssjsInvalidFirstCharacters = "0123456789";

        // Template for string.Format: {0} = force-safe-alias flag, {1} = valid characters,
        // {2} = invalid first characters. Literal braces of the script are doubled.
        // NOTE(review): the script tests "indexOf(...) > 0" for invalid first characters, so a
        // leading '0' (index 0) is not rejected client-side, whereas CleanStringForSafeAlias
        // rejects it server-side. The script also lower-cases the first character, which the
        // server-side code does not. Left untouched because this is legacy behavior.
        private const string SssjsFormat = @"
var UMBRACO_FORCE_SAFE_ALIAS = {0};
var UMBRACO_FORCE_SAFE_ALIAS_VALIDCHARS = '{1}';
var UMBRACO_FORCE_SAFE_ALIAS_INVALID_FIRST_CHARS = '{2}';

function safeAlias(alias) {{
    if (UMBRACO_FORCE_SAFE_ALIAS) {{
        var safeAlias = '';
        var aliasLength = alias.length;
        for (var i = 0; i < aliasLength; i++) {{
            currentChar = alias.substring(i, i + 1);
            if (UMBRACO_FORCE_SAFE_ALIAS_VALIDCHARS.indexOf(currentChar.toLowerCase()) > -1) {{
                // check for camel (if previous character is a space, we'll upper case the current one
                if (safeAlias == '' && UMBRACO_FORCE_SAFE_ALIAS_INVALID_FIRST_CHARS.indexOf(currentChar.toLowerCase()) > 0) {{
                    currentChar = '';
                }} else {{
                    // first char should always be lowercase (camel style)
                    if (safeAlias.length == 0)
                        currentChar = currentChar.toLowerCase();

                    if (i < aliasLength - 1 && safeAlias != '' && alias.substring(i - 1, i) == ' ')
                        currentChar = currentChar.toUpperCase();

                    safeAlias += currentChar;
                }}
            }}
        }}

        alias = safeAlias;
    }}
    return alias;
}}

function getSafeAlias(id, value, immediate, callback) {{
    callback(safeAlias(value));
}}

function validateSafeAlias(id, value, immediate, callback) {{
    callback(value == safeAlias(value));
}}

// legacy backward compatibility requires that one
function isValidAlias(alias) {{
    return alias == safeAlias(alias);
}}
";

        /// <summary>
        /// Gets the JavaScript code defining client-side short string services.
        /// </summary>
        /// <param name="controllerPath">Not used by the legacy implementation.</param>
        /// <returns>The script, with the force-safe-alias flag and the character sets filled in.</returns>
        public string GetShortStringServicesJavaScript(string controllerPath)
        {
            return string.Format(SssjsFormat,
                UmbracoSettings.ForceSafeAliases ? "true" : "false",
                SssjsValidCharacters,
                SssjsInvalidFirstCharacters);
        }

        #endregion

        #region IShortStringHelper CleanFor...

        /// <summary>
        /// Cleans a string to produce a string that can safely be used in an alias.
        /// </summary>
        /// <param name="text">The text to filter.</param>
        /// <returns>The safe alias.</returns>
        /// <remarks>The string will be cleaned in the context of invariant culture.</remarks>
        public string CleanStringForSafeAlias(string text)
        {
            // ported from StringExtensions.ToSafeAlias()

            const string validAliasCharacters = UmbracoValidAliasCharacters;
            const string invalidFirstCharacters = UmbracoInvalidFirstCharacters;

            var safeString = new StringBuilder();
            int aliasLength = text.Length;

            for (var i = 0; i < aliasLength; i++)
            {
                var currentChar = text.Substring(i, 1);
                var lowered = currentChar.ToLowerInvariant();

                // anything outside the valid set is silently dropped
                if (!validAliasCharacters.Contains(lowered))
                {
                    continue;
                }

                // digits are dropped for as long as nothing has been kept yet
                if (safeString.Length == 0 && invalidFirstCharacters.Contains(lowered))
                {
                    continue;
                }

                // camel-casing: a character that directly follows a space in the input is
                // upper-cased, except when it is the very last character of the input
                if (i < aliasLength - 1 && i > 0 && text.Substring(i - 1, 1) == " ")
                {
                    currentChar = currentChar.ToUpperInvariant();
                }

                safeString.Append(currentChar);
            }

            return safeString.ToString();
        }

        /// <summary>
        /// Cleans a string, in the context of the invariant culture, to produce a string that can safely be used in an alias.
        /// </summary>
        /// <param name="text">The text to filter.</param>
        /// <param name="culture">The culture. Ignored.</param>
        /// <returns>The safe alias.</returns>
        /// <remarks>Legacy does not support culture contexts.</remarks>
        public string CleanStringForSafeAlias(string text, CultureInfo culture)
        {
            return CleanStringForSafeAlias(text);
        }

        /// <summary>
        /// Cleans a string to produce a string that can safely be used in an url segment, in the context of the invariant culture.
        /// </summary>
        /// <param name="text">The text to filter.</param>
        /// <returns>The safe url segment.</returns>
        public string CleanStringForUrlSegment(string text)
        {
            return LegacyFormatUrl(text);
        }

        /// <summary>
        /// Cleans a string, in the context of the invariant culture, to produce a string that can safely be used in an url segment.
        /// </summary>
        /// <param name="text">The text to filter.</param>
        /// <param name="culture">The culture. Ignored.</param>
        /// <returns>The safe url segment.</returns>
        /// <remarks>Legacy does not support culture contexts.</remarks>
        public string CleanStringForUrlSegment(string text, CultureInfo culture)
        {
            return CleanStringForUrlSegment(text);
        }

        /// <summary>
        /// Cleans a string, in the context of the invariant culture, to produce a string that can safely be used as a filename,
        /// both internally (on disk) and externally (as a url).
        /// </summary>
        /// <param name="text">The text to filter.</param>
        /// <returns>The safe filename; an empty string when <paramref name="text"/> is null, empty or white space.</returns>
        /// <remarks>
        /// <para>Legacy says this was used to "overcome an issue when Umbraco is used in IE in an intranet environment"
        /// but that issue is not documented.</para>
        /// <para>The configured url replacement characters are applied to the part before the last '.' only;
        /// a name without any '.' is treated as having no extension.</para>
        /// </remarks>
        public string CleanStringForSafeFileName(string text)
        {
            // ported from Core.IO.IOHelper.SafeFileName()

            if (String.IsNullOrWhiteSpace(text))
            {
                return String.Empty;
            }

            var filePath = text;
            foreach (var character in Path.GetInvalidFileNameChars())
            {
                filePath = filePath.Replace(character, '-');
            }

            // Break up the file in name and extension before applying the UrlReplaceCharacters.
            // LastIndexOf returns -1 when there is no dot: feeding that to Substring would throw,
            // so such a name is handled as "all name, no extension".
            var extensionIndex = filePath.LastIndexOf('.');
            var fileNamePart = extensionIndex < 0 ? filePath : filePath.Substring(0, extensionIndex);
            var ext = extensionIndex < 0 ? String.Empty : filePath.Substring(extensionIndex);

            // Because the file usually is downloadable as well we check characters against
            // 'UmbracoSettings.UrlReplaceCharacters'
            fileNamePart = ApplyUrlReplaceCharacters(fileNamePart);
            filePath = string.Concat(fileNamePart, ext);

            // Adapted from: http://stackoverflow.com/a/4827510/5018
            // Combined both Reserved Characters and Character Data
            // from http://en.wikipedia.org/wiki/Percent-encoding
            const string reservedCharacters = "!*'();:@&=+$,/?%#[]-~{}\"<>\\^`| ";
            var stringBuilder = new StringBuilder(filePath.Length);
            foreach (var character in filePath)
            {
                stringBuilder.Append(reservedCharacters.IndexOf(character) == -1 ? character : '-');
            }

            // Remove repeating dashes
            // From: http://stackoverflow.com/questions/5111967/regex-to-remove-a-specific-repeated-character
            return Regex.Replace(stringBuilder.ToString(), "-+", "-");
        }

        /// <summary>
        /// Cleans a string, in the context of the invariant culture, to produce a string that can safely be used as a filename,
        /// both internally (on disk) and externally (as a url).
        /// </summary>
        /// <param name="text">The text to filter.</param>
        /// <param name="culture">The culture. Ignored.</param>
        /// <returns>The safe filename.</returns>
        /// <remarks>Legacy does not support culture contexts.</remarks>
        public string CleanStringForSafeFileName(string text, CultureInfo culture)
        {
            return CleanStringForSafeFileName(text);
        }

        #endregion

        #region CleanString

        // legacy does not implement these: every overload returns the text unchanged

        public string CleanString(string text, CleanStringType stringType)
        {
            return text;
        }

        public string CleanString(string text, CleanStringType stringType, char separator)
        {
            return text;
        }

        public string CleanString(string text, CleanStringType stringType, CultureInfo culture)
        {
            return text;
        }

        public string CleanString(string text, CleanStringType stringType, char separator, CultureInfo culture)
        {
            return text;
        }

        #endregion

        #region SplitPascalCasing

        /// <summary>
        /// Splits a pascal-cased string by inserting a separator in between each term.
        /// </summary>
        /// <param name="text">The text to split.</param>
        /// <param name="separator">The separator.</param>
        /// <returns>The splitted string.</returns>
        /// <remarks>Probably only supports Ascii strings.</remarks>
        public string SplitPascalCasing(string text, char separator)
        {
            // ported from StringExtensions.SplitPascalCasing()

            // "${1}" rather than "$1": with a digit separator, "$1" + '2' would read as group 12
            var replacement = "${1}" + separator;
            return Regex.Replace(text, "([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))", replacement);
        }

        #endregion

        #region Legacy

        /// <summary>
        /// Cleans a string to produce a string that can safely be used in an alias.
        /// </summary>
        /// <param name="text">The text to filter.</param>
        /// <returns>The safe alias; an empty string when <paramref name="text"/> is null or empty.</returns>
        /// <remarks>The string will be cleaned in the context of invariant culture.</remarks>
        public string LegacyCleanStringForUmbracoAlias(string text)
        {
            // ported from StringExtensions.ToUmbracoAlias()
            // kept here for reference, not used anymore

            if (string.IsNullOrEmpty(text))
            {
                return string.Empty;
            }

            // convert case first -- note: always Camel anyway
            var tmp = LegacyConvertStringCase(text, CleanStringType.CamelCase);

            // remove non-alphanumeric chars
            // note: spaces are always removed anyway (by the case conversion above)
            return Regex.Replace(tmp, @"[^a-zA-Z0-9\s\.-]+", "", RegexOptions.Compiled);
        }

        /// <summary>
        /// Filters a string to convert case, and more.
        /// </summary>
        /// <param name="phrase">The text to filter.</param>
        /// <param name="cases">The string case type; only the bits in <c>CleanStringType.CaseMask</c> are considered.</param>
        /// <returns>The filtered text.</returns>
        /// <remarks>
        /// <para>This is the legacy method, so we can't really change it, although it has issues (see unit tests).</para>
        /// <para>It does more than "converting the case", and also removes spaces, etc.</para>
        /// </remarks>
        public string LegacyConvertStringCase(string phrase, CleanStringType cases)
        {
            // ported from StringExtensions.ConvertCase

            cases &= CleanStringType.CaseMask;

            // every character that is not an ASCII letter, a digit or an apostrophe is a term
            // boundary and is dropped
            var terms = Regex.Split(phrase, @"[^a-zA-Z0-9\']", RegexOptions.Compiled);

            if (cases == CleanStringType.Unchanged)
            {
                return string.Join("", terms);
            }

            // upper-case the first character of each term and glue the terms together
            var sb = new StringBuilder();
            foreach (var term in terms)
            {
                if (term.Length == 0)
                {
                    continue;
                }

                sb.Append(char.ToUpperInvariant(term[0]));
                sb.Append(term, 1, term.Length - 1);
            }

            var result = sb.ToString();

            if (cases != CleanStringType.CamelCase)
            {
                return result;
            }

            if (result.Length <= 1)
            {
                return result.ToLowerInvariant();
            }

            var match = LeadingUpperCaseRun.Match(result);
            if (!match.Success)
            {
                return result;
            }

            // lower-case the leading upper-case run, except its last letter, which starts the
            // next term ... and then lower-case the very first character anyway
            result = match.Groups[1].Value.ToLowerInvariant() + match.Groups[2].Value;
            return result.Substring(0, 1).ToLowerInvariant() + result.Substring(1);
        }

        /// <summary>
        /// Converts string to a URL alias.
        /// </summary>
        /// <param name="value">The value.</param>
        /// <param name="charReplacements">The char replacements.</param>
        /// <param name="replaceDoubleDashes">if set to <c>true</c> replace double dashes.</param>
        /// <param name="stripNonAscii">if set to <c>true</c> strip non ASCII.</param>
        /// <param name="urlEncode">if set to <c>true</c> URL encode.</param>
        /// <returns>The URL alias.</returns>
        /// <remarks>
        /// This ensures that ONLY ascii chars are allowed and of those ascii chars, only digits and lowercase chars, all
        /// punctuation, etc... are stripped out, however this method allows you to pass in string's to replace with the
        /// specified replacement character before the string is converted to ascii and it has invalid characters stripped out.
        /// This allows you to replace strings like &amp; , etc.. with your replacement character before the automatic
        /// reduction.
        /// </remarks>
        public string LegacyToUrlAlias(string value, IDictionary<string, string> charReplacements, bool replaceDoubleDashes, bool stripNonAscii, bool urlEncode)
        {
            // to lower case invariant
            // replace chars one by one using charReplacements
            // (opt) convert to ASCII then remove anything that's not ASCII
            // trim - and _ then (opt) remove double -
            // (opt) url-encode

            // charReplacement is actually *string* replacement ie it can replace a multi-character
            // string (e.g. an HTML entity) by a single character
            // so it's kind of a pre-filter actually...
            // we need pre-filters, and post-filters, within each token...
            // not so... we may want to replace such a string with a space BEFORE cutting into tokens...

            // first to lower case
            value = value.ToLowerInvariant();

            // then replacement chars
            value = charReplacements.Aggregate(value, (current, kvp) => current.Replace(kvp.Key, kvp.Value));

            // then convert to only ascii, this will remove the rest of any invalid chars
            if (stripNonAscii)
            {
                value = Encoding.ASCII.GetString(
                    Encoding.Convert(
                        Encoding.UTF8,
                        Encoding.GetEncoding(
                            Encoding.ASCII.EncodingName,
                            new EncoderReplacementFallback(String.Empty),
                            new DecoderExceptionFallback()),
                        Encoding.UTF8.GetBytes(value)));

                // remove all characters that are neither an ASCII digit, nor an ASCII lowercase
                // letter, nor one of the replacement values
                var sb = new StringBuilder(value.Length);
                foreach (var c in value)
                {
                    var isDigitOrLowercase = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z');
                    if (isDigitOrLowercase
                        || charReplacements.Values.Contains(c.ToString(CultureInfo.InvariantCulture)))
                    {
                        sb.Append(c);
                    }
                }

                value = sb.ToString();
            }

            // trim dashes and underscores from both ends
            value = value.Trim('-', '_');

            // replace double occurances of - or _
            if (replaceDoubleDashes)
            {
                value = Regex.Replace(value, @"([-_]){2,}", "$1", RegexOptions.Compiled);
            }

            // url encode result
            return urlEncode ? System.Web.HttpUtility.UrlEncode(value) : value;
        }

        /// <summary>
        /// Cleans a string to produce a string that can safely be used in an url segment.
        /// </summary>
        /// <param name="url">The text to filter.</param>
        /// <returns>The safe url segment.</returns>
        /// <remarks>
        /// <para>Uses <c>UmbracoSettings.UrlReplaceCharacters</c>
        /// and <c>UmbracoSettings.RemoveDoubleDashesFromUrlReplacing</c>.</para>
        /// </remarks>
        public string LegacyFormatUrl(string url)
        {
            var newUrl = ApplyUrlReplaceCharacters(url.ToLowerInvariant());

            // check for double dashes
            if (UmbracoSettings.RemoveDoubleDashesFromUrlReplacing)
            {
                newUrl = Regex.Replace(newUrl, @"[-]{2,}", "-");
            }

            return newUrl;
        }

        // Applies each "char" child of UmbracoSettings.UrlReplaceCharacters to the text, in document
        // order: the non-empty "org" attribute value is replaced by the node value.
        private static string ApplyUrlReplaceCharacters(string text)
        {
            XmlNode replaceChars = UmbracoSettings.UrlReplaceCharacters;
            foreach (XmlNode n in replaceChars.SelectNodes("char"))
            {
                var org = n.Attributes.GetNamedItem("org");
                if (org != null && org.Value != "")
                {
                    text = text.Replace(org.Value, XmlHelper.GetNodeValue(n));
                }
            }

            return text;
        }

        #endregion

        #region ReplaceMany

        /// <summary>
        /// Returns a new string in which all occurences of specified strings are replaced by other specified strings.
        /// </summary>
        /// <param name="text">The string to filter.</param>
        /// <param name="replacements">The replacements definition.</param>
        /// <returns>The filtered string.</returns>
        public string ReplaceMany(string text, IDictionary<string, string> replacements)
        {
            // Have done various tests, implementing my own "super fast" state machine to handle
            // replacement of many items, or via regexes, but on short strings and not too
            // many replacements (which prob. is going to be our case) nothing can beat this...
            // (at least with safe and checked code -- we don't want unsafe/unchecked here)

            // Note that it will do chained-replacements ie replaced items can be replaced
            // in turn by another replacement (ie the order of replacements is important)

            return replacements.Aggregate(text, (current, kvp) => current.Replace(kvp.Key, kvp.Value));
        }

        /// <summary>
        /// Returns a new string in which all occurences of specified characters are replaced by a specified character.
        /// </summary>
        /// <param name="text">The string to filter.</param>
        /// <param name="chars">The characters to replace.</param>
        /// <param name="replacement">The replacement character.</param>
        /// <returns>The filtered string.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> or <paramref name="chars"/> is null.</exception>
        public string ReplaceMany(string text, char[] chars, char replacement)
        {
            // be safe
            if (text == null) throw new ArgumentNullException("text");
            if (chars == null) throw new ArgumentNullException("chars");

            // same approach as the string-based overload: plain successive replacements
            return chars.Aggregate(text, (current, c) => current.Replace(c, replacement));
        }

        #endregion
    }
}