using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Globalization;
using Umbraco.Core.Configuration;

namespace Umbraco.Core.Strings
{
    /// <summary>
    /// New default implementation of string functions for short strings such as aliases or url segments.
    /// </summary>
    /// <remarks>
    /// <para>Not optimized to work on large bodies of text.</para>
    /// <para>Meant to replace LegacyShortStringHelper where/when backward compatibility is not an issue.</para>
    /// <para>Full-unicode support is probably not so good.</para>
    /// <para>NOTE: pre-filters run _before_ the string is re-encoded.</para>
    /// </remarks>
    public class DefaultShortStringHelper : IShortStringHelper
    {
        #region Ctor and vars

        static DefaultShortStringHelper()
        {
            InitializeLegacyUrlReplaceCharacters();
        }

        /// <summary>
        /// Freezes the helper so as to prevent its configuration from being modified.
        /// </summary>
        /// <remarks>Will be called by <c>ShortStringHelperResolver</c> when resolution freezes.</remarks>
        public void Freeze()
        {
            _frozen = true;
        }

        // see notes for CleanAsciiString
        //// beware! the order is quite important here!
        //const string ValidStringCharactersSource = "abcdefghijklmnopqrstuvwxyz0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
        //readonly static char[] ValidStringCharacters;

        private CultureInfo _defaultCulture = CultureInfo.InvariantCulture;
        private bool _frozen;

        // configurations, per culture then per string role
        private readonly Dictionary<CultureInfo, Dictionary<CleanStringType, HelperConfig>> _configs
            = new Dictionary<CultureInfo, Dictionary<CleanStringType, HelperConfig>>();

        // see notes for CleanAsciiString
        //static DefaultShortStringHelper()
        //{
        //    ValidStringCharacters = ValidStringCharactersSource.ToCharArray();
        //}

        #endregion

        #region Legacy UrlReplaceCharacters

        static readonly Dictionary<string, string> UrlReplaceCharacters = new Dictionary<string, string>();

        // Loads the "char" nodes of the UrlReplaceCharacters setting into the
        // replacement table, keyed by their "org" attribute. Runs once, from the
        // static constructor, so it must never throw.
        static void InitializeLegacyUrlReplaceCharacters()
        {
            var replaceChars = UmbracoSettings.UrlReplaceCharacters;
            if (replaceChars == null) return;

            var nodes = replaceChars.SelectNodes("char");
            if (nodes == null) return;

            foreach (var node in nodes.Cast<System.Xml.XmlNode>())
            {
                var attributes = node.Attributes;
                if (attributes == null) continue;

                var org = attributes.GetNamedItem("org");
                // a null or empty key is meaningless (and a null key would throw)
                if (org != null && string.IsNullOrEmpty(org.Value) == false)
                    UrlReplaceCharacters[org.Value] = XmlHelper.GetNodeValue(node);
            }
        }

        /// <summary>
        /// Returns a new string in which characters have been replaced according to the Umbraco settings UrlReplaceCharacters.
        /// </summary>
        /// <param name="s">The string to filter.</param>
        /// <returns>The filtered string.</returns>
        public static string ApplyUrlReplaceCharacters(string s)
        {
            return s.ReplaceMany(UrlReplaceCharacters);
        }

        #endregion

        #region Configuration

        private void EnsureNotFrozen()
        {
            if (_frozen)
                throw new InvalidOperationException("Cannot configure the helper once it is frozen.");
        }

        /// <summary>
        /// Sets the default culture of the helper.
        /// </summary>
        /// <param name="culture">The culture to use when none is specified.</param>
        /// <returns>The helper, so calls can be chained.</returns>
        /// <exception cref="InvalidOperationException">The helper is frozen.</exception>
        /// <exception cref="ArgumentNullException"><paramref name="culture"/> is null.</exception>
        public DefaultShortStringHelper WithDefaultCulture(CultureInfo culture)
        {
            EnsureNotFrozen();
            // fail here rather than on every subsequent CleanString call
            if (culture == null) throw new ArgumentNullException("culture");
            _defaultCulture = culture;
            return this;
        }

        /// <summary>
        /// Registers a configuration for the default culture, applying to all string roles.
        /// </summary>
        public DefaultShortStringHelper WithConfig(
            Func<string, string> preFilter = null,
            bool breakTermsOnUpper = true, bool allowLeadingDigits = false, bool allowUnderscoreInTerm = false)
        {
            return WithConfig(_defaultCulture, CleanStringType.RoleMask,
                preFilter, breakTermsOnUpper, allowLeadingDigits, allowUnderscoreInTerm);
        }

        /// <summary>
        /// Registers a configuration for the default culture and a specified string role.
        /// </summary>
        public DefaultShortStringHelper WithConfig(CleanStringType stringRole,
            Func<string, string> preFilter = null,
            bool breakTermsOnUpper = true, bool allowLeadingDigits = false, bool allowUnderscoreInTerm = false)
        {
            return WithConfig(_defaultCulture, stringRole,
                preFilter, breakTermsOnUpper, allowLeadingDigits, allowUnderscoreInTerm);
        }

        /// <summary>
        /// Registers a configuration for a specified culture and string role,
        /// replacing any configuration already registered for that pair.
        /// </summary>
        /// <param name="culture">The culture.</param>
        /// <param name="stringRole">The string role.</param>
        /// <param name="preFilter">An optional filter applied to the text before it is recoded.</param>
        /// <param name="breakTermsOnUpper">Whether an uppercase within a term starts a new term.</param>
        /// <param name="allowLeadingDigits">Whether leading digits are legal.</param>
        /// <param name="allowUnderscoreInTerm">Whether underscore is a valid term character.</param>
        /// <returns>The helper, so calls can be chained.</returns>
        /// <exception cref="InvalidOperationException">The helper is frozen.</exception>
        public DefaultShortStringHelper WithConfig(CultureInfo culture, CleanStringType stringRole,
            Func<string, string> preFilter = null,
            bool breakTermsOnUpper = true, bool allowLeadingDigits = false, bool allowUnderscoreInTerm = false)
        {
            EnsureNotFrozen();

            Dictionary<CleanStringType, HelperConfig> roles;
            if (_configs.TryGetValue(culture, out roles) == false)
                _configs[culture] = roles = new Dictionary<CleanStringType, HelperConfig>();

            roles[stringRole] = new HelperConfig(preFilter, breakTermsOnUpper, allowLeadingDigits, allowUnderscoreInTerm);
            return this;
        }

        internal sealed class HelperConfig
        {
            private HelperConfig()
            {
                PreFilter = null;
                BreakTermsOnUpper = true;
                AllowLeadingDigits = false;
            }

            public HelperConfig(Func<string, string> preFilter, bool breakTermsOnUpper, bool allowLeadingDigits, bool allowUnderscoreInTerm)
                : this()
            {
                PreFilter = preFilter;
                BreakTermsOnUpper = breakTermsOnUpper;
                AllowLeadingDigits = allowLeadingDigits;
                AllowUnderscoreInTerm = allowUnderscoreInTerm;
            }

            public Func<string, string> PreFilter { get; private set; }

            // indicate whether an uppercase within a term eg "fooBar" is to break
            // into a new term, or to be considered as part of the current term
            public bool BreakTermsOnUpper { get; private set; }

            // indicates whether it is legal to have leading digits, or whether they
            // should be stripped as any other illegal character
            public bool AllowLeadingDigits { get; private set; }

            // indicates whether underscore is a valid character in a term or is
            // to be considered as a separator
            public bool AllowUnderscoreInTerm { get; private set; }

            // indicates whether acronyms parsing is greedy ie whether "FOObar" is
            // "FOO" + "bar" (greedy) or "FO" + "Obar" (non-greedy)
            public bool GreedyAcronyms { get { return false; } }

            public static readonly HelperConfig Empty = new HelperConfig();
        }

        // Resolves the configuration for a role and culture.
        // The configs of the specified culture are used when that culture has any;
        // otherwise the configs of the default culture are used. Within the selected
        // culture, the role-specific config wins over the generic (RoleMask) one.
        // Note: a culture that has configs but none matching does NOT fall back to
        // the default culture - it gets HelperConfig.Empty.
        private HelperConfig GetConfig(CleanStringType stringType, CultureInfo culture)
        {
            Dictionary<CleanStringType, HelperConfig> roles;
            if (_configs.TryGetValue(culture, out roles) == false
                && _configs.TryGetValue(_defaultCulture, out roles) == false)
                return HelperConfig.Empty;

            HelperConfig config;
            if (roles.TryGetValue(stringType, out config)) // have we got a config for _that_ role?
                return config;
            if (roles.TryGetValue(CleanStringType.RoleMask, out config)) // have we got a generic config for _all_ roles?
                return config;

            return HelperConfig.Empty;
        }

        #endregion

        #region JavaScript

        // Format string for string.Format: {0} is the force-safe-alias flag, {1} the
        // controller path; literal JavaScript braces are doubled.
        private const string SssjsFormat =
            " var UMBRACO_FORCE_SAFE_ALIAS = {0};" +
            " var UMBRACO_FORCE_SAFE_ALIAS_URL = '{1}';" +
            " var UMBRACO_FORCE_SAFE_ALIAS_TIMEOUT = 666;" +
            " var UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS = {{ }};" +
            " function getSafeAliasFromServer(value, callback) {{" +
            " $.getJSON(UMBRACO_FORCE_SAFE_ALIAS_URL + 'ToSafeAlias?value=' + encodeURIComponent(value), function(json) {{" +
            " if (json.alias) {{ callback(json.alias); }}" +
            " }});" +
            " }}" +
            " function getSafeAlias(id, value, immediate, callback) {{" +
            " if (!UMBRACO_FORCE_SAFE_ALIAS) {{ callback(value); return; }}" +
            " if (UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id]) clearTimeout(UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id]);" +
            " UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id] = setTimeout(function() {{" +
            " UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id] = null;" +
            " getSafeAliasFromServer(value, function(alias) {{ callback(alias); }});" +
            " }}, UMBRACO_FORCE_SAFE_ALIAS_TIMEOUT);" +
            " }}" +
            " function validateSafeAlias(id, value, immediate, callback) {{" +
            " if (!UMBRACO_FORCE_SAFE_ALIAS) {{ callback(true); return; }}" +
            " if (UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id]) clearTimeout(UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id]);" +
            " UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id] = setTimeout(function() {{" +
            " UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id] = null;" +
            " getSafeAliasFromServer(value, function(alias) {{ callback(value.toLowerCase() == alias.toLowerCase()); }});" +
            " }}, UMBRACO_FORCE_SAFE_ALIAS_TIMEOUT);" +
            " }}" +
            " ";

        /// <summary>
        /// Gets the JavaScript code defining client-side short string services.
        /// </summary>
        /// <param name="controllerPath">The url prefix of the controller exposing <c>ToSafeAlias</c>.</param>
        /// <returns>The JavaScript code.</returns>
        public string GetShortStringServicesJavaScript(string controllerPath)
        {
            return string.Format(SssjsFormat,
                UmbracoSettings.ForceSafeAliases ? "true" : "false", controllerPath);
        }

        #endregion

        #region IShortStringHelper CleanFor...

        /// <summary>
        /// Cleans a string to produce a string that can safely be used in an alias.
        /// </summary>
        /// <param name="text">The text to filter.</param>
        /// <returns>The safe alias.</returns>
        /// <remarks>
        /// <para>The string will be cleaned in the context of the default culture.</para>
        /// <para>Safe aliases are Ascii only.</para>
        /// </remarks>
        public virtual string CleanStringForSafeAlias(string text)
        {
            return CleanString(text, CleanStringType.Ascii | CleanStringType.UmbracoCase | CleanStringType.Alias);
        }

        /// <summary>
        /// Cleans a string, in the context of a specified culture, to produce a string that can safely be used in an alias.
        /// </summary>
        /// <param name="text">The text to filter.</param>
        /// <param name="culture">The culture.</param>
        /// <returns>The safe alias.</returns>
        /// <remarks>
        /// <para>Safe aliases are Ascii only.</para>
        /// </remarks>
        public virtual string CleanStringForSafeAlias(string text, CultureInfo culture)
        {
            return CleanString(text, CleanStringType.Ascii | CleanStringType.UmbracoCase | CleanStringType.Alias, culture);
        }

        /// <summary>
        /// Cleans a string to produce a string that can safely be used in an url segment.
        /// </summary>
        /// <param name="text">The text to filter.</param>
        /// <returns>The safe url segment.</returns>
        /// <remarks>
        /// <para>The string will be cleaned in the context of the default culture.</para>
        /// <para>Url segments are Ascii only (no accents...).</para>
        /// </remarks>
        public virtual string CleanStringForUrlSegment(string text)
        {
            return CleanString(text, CleanStringType.Ascii | CleanStringType.LowerCase | CleanStringType.Url, '-');
        }

        /// <summary>
        /// Cleans a string, in the context of a specified culture, to produce a string that can safely be used in an url segment.
        /// </summary>
        /// <param name="text">The text to filter.</param>
        /// <param name="culture">The culture.</param>
        /// <returns>The safe url segment.</returns>
        /// <remarks>
        /// <para>Url segments are Ascii only (no accents...).</para>
        /// </remarks>
        public virtual string CleanStringForUrlSegment(string text, CultureInfo culture)
        {
            return CleanString(text, CleanStringType.Ascii | CleanStringType.LowerCase | CleanStringType.Url, '-', culture);
        }

        /// <summary>
        /// Cleans a string, in the context of the default culture, to produce a string that can safely be used as a filename,
        /// both internally (on disk) and externally (as a url).
        /// </summary>
        /// <param name="text">The text to filter.</param>
        /// <returns>The safe filename, or an empty string if <paramref name="text"/> is null or whitespace.</returns>
        /// <remarks>Legacy says this was used to "overcome an issue when Umbraco is used in IE in an intranet environment" but that issue is not documented.</remarks>
        public virtual string CleanStringForSafeFileName(string text)
        {
            return CleanFileName(text, _defaultCulture);
        }

        /// <summary>
        /// Cleans a string, in the context of a specified culture, to produce a string that can safely be used as a filename,
        /// both internally (on disk) and externally (as a url).
        /// </summary>
        /// <param name="text">The text to filter.</param>
        /// <param name="culture">The culture.</param>
        /// <returns>The safe filename, or an empty string if <paramref name="text"/> is null or whitespace.</returns>
        public virtual string CleanStringForSafeFileName(string text, CultureInfo culture)
        {
            return CleanFileName(text, culture);
        }

        // Shared implementation of the CleanStringForSafeFileName overloads: replaces
        // the invalid filename chars, then cleans the name and the extension (the part
        // after the last dot) separately so the dot itself is preserved.
        private string CleanFileName(string text, CultureInfo culture)
        {
            if (string.IsNullOrWhiteSpace(text))
                return string.Empty;

            const CleanStringType fileNameType = CleanStringType.Ascii | CleanStringType.Alias | CleanStringType.LowerCase;

            text = text.ReplaceMany(Path.GetInvalidFileNameChars(), '-');

            var pos = text.LastIndexOf('.');
            var name = pos < 0 ? text : text.Substring(0, pos);
            var ext = pos < 0 ? string.Empty : text.Substring(pos + 1);

            name = CleanString(name, fileNameType, '-', culture);
            ext = CleanString(ext, fileNameType, '-', culture);

            return pos < 0 ? name : (name + "." + ext);
        }

        #endregion

        #region CleanString

        // MS rules & guidelines:
        // - Do capitalize both characters of two-character acronyms, except the first word of a camel-cased identifier.
        //     eg "DBRate" (pascal) or "ioHelper" (camel) - "specialDBRate" (pascal) or "specialIOHelper" (camel)
        // - Do capitalize only the first character of acronyms with three or more characters, except the first word of a camel-cased identifier.
        //     eg "XmlWriter (pascal) or "htmlReader" (camel) - "SpecialXmlWriter" (pascal) or "specialHtmlReader" (camel)
        // - Do not capitalize any of the characters of any acronyms, whatever their length, at the beginning of a camel-cased identifier.
        //     eg "xmlWriter" or "dbWriter" (camel)
        //
        // Our additional stuff:
        // - Leading digits are removed.
        // - Many consecutive separators are folded into one unique separator.

        // states of the term-parsing machine used by CleanUtf8String
        const byte StateBreak = 1;   // between terms
        const byte StateUp = 2;      // seen exactly one uppercase char starting a term
        const byte StateWord = 3;    // within a word
        const byte StateAcronym = 4; // within a run of uppercase chars

        /// <summary>
        /// Cleans a string.
        /// </summary>
        /// <param name="text">The text to clean.</param>
        /// <param name="stringType">A flag indicating the target casing and encoding of the string. By default,
        /// strings are cleaned up to camelCase and Ascii.</param>
        /// <returns>The clean string.</returns>
        /// <remarks>The string is cleaned in the context of the default culture.</remarks>
        public string CleanString(string text, CleanStringType stringType)
        {
            return CleanString(text, stringType, char.MinValue, _defaultCulture);
        }

        /// <summary>
        /// Cleans a string, using a specified separator.
        /// </summary>
        /// <param name="text">The text to clean.</param>
        /// <param name="stringType">A flag indicating the target casing and encoding of the string. By default,
        /// strings are cleaned up to camelCase and Ascii.</param>
        /// <param name="separator">The separator.</param>
        /// <returns>The clean string.</returns>
        /// <remarks>The string is cleaned in the context of the default culture.</remarks>
        public string CleanString(string text, CleanStringType stringType, char separator)
        {
            return CleanString(text, stringType, separator, _defaultCulture);
        }

        /// <summary>
        /// Cleans a string in the context of a specified culture.
        /// </summary>
        /// <param name="text">The text to clean.</param>
        /// <param name="stringType">A flag indicating the target casing and encoding of the string. By default,
        /// strings are cleaned up to camelCase and Ascii.</param>
        /// <param name="culture">The culture.</param>
        /// <returns>The clean string.</returns>
        public string CleanString(string text, CleanStringType stringType, CultureInfo culture)
        {
            return CleanString(text, stringType, char.MinValue, culture);
        }

        /// <summary>
        /// Cleans a string in the context of a specified culture, using a specified separator.
        /// </summary>
        /// <param name="text">The text to clean.</param>
        /// <param name="stringType">A flag indicating the target casing and encoding of the string. By default,
        /// strings are cleaned up to camelCase and Ascii.</param>
        /// <param name="separator">The separator.</param>
        /// <param name="culture">The culture.</param>
        /// <returns>The clean string.</returns>
        public virtual string CleanString(string text, CleanStringType stringType, char separator, CultureInfo culture)
        {
            var config = GetConfig(stringType & CleanStringType.RoleMask, culture);
            return CleanString(text, stringType, separator, culture, config);
        }

        /// <summary>
        /// Cleans a string in the context of a specified culture, using a specified separator and configuration.
        /// </summary>
        /// <param name="text">The text to clean.</param>
        /// <param name="stringType">A flag indicating the target casing and encoding of the string. By default,
        /// strings are cleaned up to camelCase and Ascii.</param>
        /// <param name="separator">The separator.</param>
        /// <param name="culture">The culture.</param>
        /// <param name="config">The configuration.</param>
        /// <returns>The clean string.</returns>
        private string CleanString(string text, CleanStringType stringType, char separator, CultureInfo culture, HelperConfig config)
        {
            // be safe
            if (text == null)
                throw new ArgumentNullException("text");
            if (culture == null)
                throw new ArgumentNullException("culture");

            // apply defaults
            if ((stringType & CleanStringType.CaseMask) == CleanStringType.None)
                stringType |= CleanStringType.CamelCase;
            if ((stringType & CleanStringType.CodeMask) == CleanStringType.None)
                stringType |= CleanStringType.Ascii;

            var codeType = stringType & CleanStringType.CodeMask;

            // apply pre-filter
            if (config.PreFilter != null)
                text = config.PreFilter(text);

            // apply replacements
            //if (config.Replacements != null)
            //    text = ReplaceMany(text, config.Replacements);

            // recode
            text = Recode(text, stringType);

            // clean
            switch (codeType)
            {
                case CleanStringType.Ascii:
                    // see note below - don't use CleanAsciiString
                    //text = CleanAsciiString(text, stringType, separator);
                    //break;
                case CleanStringType.Utf8:
                    text = CleanUtf8String(text, stringType, separator, culture, config);
                    break;
                case CleanStringType.Unicode:
                    throw new NotImplementedException("DefaultShortStringHelper does not handle unicode yet.");
                default:
                    throw new ArgumentOutOfRangeException("stringType");
            }

            return text;
        }

        // however proud I can be of that subtle, ascii-optimized code,
        // benchmarking shows it is an order of magnitude slower than the utf8 version
        // don't use it - keep it here should anyone be tempted to micro-optimize again...
        //
        // beware, it has bugs that are fixed in CleanUtf8String but I'm not going to
        // bugfix commented code....

        /*
        internal string CleanAsciiString(string text)
        {
            return CleanAsciiString(text, CleanStringType.CamelCase, char.MinValue);
        }

        internal string CleanAsciiString(string text, CleanStringType caseType, char separator)
        {
            int opos = 0, ipos = 0;
            var state = StateBreak;

            caseType &= CleanStringType.CaseMask;

            //switch (caseType)
            //{
            //    case CleanStringType.LowerCase:
            //        input = text.ToLowerInvariant().ToCharArray();
            //        break;
            //    case CleanStringType.UpperCase:
            //        input = text.ToUpperInvariant().ToCharArray();
            //        break;
            //    default:
            //        input = text.ToCharArray();
            //        break;
            //}
            // if we apply global ToUpper or ToLower to text here
            // then we cannot break words on uppercase chars
            var input = text;

            // because we shouldn't be adding any extra char
            // it's faster to use an array than a StringBuilder
            var ilen = input.Length;
            var output = new char[ilen];

            Func<string, string> termFilter = null;

            for (var i = 0; i < ilen; i++)
            {
                var idx = ValidStringCharacters.IndexOf(input[i]);

                switch (state)
                {
                    case StateBreak:
                        if (idx >= 0 && (opos > 0 || idx < 26 || idx >= 36))
                        {
                            ipos = i;
                            if (opos > 0 && separator != char.MinValue)
                                output[opos++] = separator;
                            state = idx < 36 ? StateWord : StateUp;
                        }
                        break;

                    case StateWord:
                        if (idx < 0 || (_breakTermsOnUpper && idx >= 36))
                        {
                            CopyAsciiTerm(input, ipos, output, ref opos, i - ipos, caseType, termFilter, false);
                            ipos = i;
                            state = idx < 0 ? StateBreak : StateUp;
                            if (state != StateBreak && separator != char.MinValue)
                                output[opos++] = separator;
                        }
                        break;

                    case StateAcronym:
                        if (idx < 36)
                        {
                            CopyAsciiTerm(input, ipos, output, ref opos, i - ipos, caseType, termFilter, true);
                            ipos = i;
                            state = idx < 0 ? StateBreak : StateWord;
                            if (state != StateBreak && separator != char.MinValue)
                                output[opos++] = separator;
                        }
                        break;

                    case StateUp:
                        if (idx >= 0)
                        {
                            state = idx < 36 ? StateWord : StateAcronym;
                        }
                        else
                        {
                            CopyAsciiTerm(input, ipos, output, ref opos, 1, caseType, termFilter, false);
                            state = StateBreak;
                        }
                        break;

                    default:
                        throw new Exception("Invalid state.");
                }
            }

            //Console.WriteLine("xx: ({0}) {1}, {2}, {3}", state, input.Length, ipos, opos);
            switch (state)
            {
                case StateBreak:
                    break;

                case StateWord:
                    CopyAsciiTerm(input, ipos, output, ref opos, input.Length - ipos, caseType, termFilter, false);
                    break;

                case StateAcronym:
                case StateUp:
                    CopyAsciiTerm(input, ipos, output, ref opos, input.Length - ipos, caseType, termFilter, true);
                    break;

                default:
                    throw new Exception("Invalid state.");
            }

            return new string(output, 0, opos);
        }

        internal void CopyAsciiTerm(string input, int ipos, char[] output, ref int opos, int len,
            CleanStringType caseType, Func<string, string> termFilter, bool isAcronym)
        {
            var term = input.Substring(ipos, len);
            ipos = 0;

            if (termFilter != null)
            {
                term = termFilter(term);
                len = term.Length;
            }

            if (isAcronym)
            {
                if (caseType == CleanStringType.CamelCase && len <= 2 && opos > 0)
                    caseType = CleanStringType.Unchanged;
                else if (caseType == CleanStringType.PascalCase && len <= 2)
                    caseType = CleanStringType.Unchanged;
            }

            int idx;
            switch (caseType)
            {
                //case CleanStringType.LowerCase:
                //case CleanStringType.UpperCase:
                case CleanStringType.Unchanged:
                    term.CopyTo(ipos, output, opos, len);
                    opos += len;
                    break;

                case CleanStringType.LowerCase:
                    for (var i = ipos; i < ipos + len; i++)
                    {
                        idx = ValidStringCharacters.IndexOf(term[i]);
                        output[opos++] = ValidStringCharacters[idx >= 36 ? idx - 36 : idx];
                    }
                    break;

                case CleanStringType.UpperCase:
                    for (var i = ipos; i < ipos + len; i++)
                    {
                        idx = ValidStringCharacters.IndexOf(term[i]);
                        output[opos++] = ValidStringCharacters[idx < 26 ? idx + 36 : idx];
                    }
                    break;

                case CleanStringType.CamelCase:
                    idx = ValidStringCharacters.IndexOf(term[ipos]);
                    if (opos == 0)
                        output[opos++] = ValidStringCharacters[idx >= 36 ? idx - 36 : idx];
                    else
                        output[opos++] = ValidStringCharacters[idx < 26 ? idx + 36 : idx];
                    for (var i = ipos + 1; i < ipos + len; i++)
                    {
                        idx = ValidStringCharacters.IndexOf(term[i]);
                        output[opos++] = ValidStringCharacters[idx >= 36 ? idx - 36 : idx];
                    }
                    break;

                case CleanStringType.PascalCase:
                    idx = ValidStringCharacters.IndexOf(term[ipos]);
                    output[opos++] = ValidStringCharacters[idx < 26 ? idx + 36 : idx];
                    for (var i = ipos + 1; i < ipos + len; i++)
                    {
                        idx = ValidStringCharacters.IndexOf(term[i]);
                        output[opos++] = ValidStringCharacters[idx >= 36 ? idx - 36 : idx];
                    }
                    break;

                default:
                    throw new ArgumentOutOfRangeException("caseType");
            }
        }
        */

        // that's the default code that will work for utf8 strings
        // will not handle unicode, though

        internal string CleanUtf8String(string text)
        {
            return CleanUtf8String(text, CleanStringType.CamelCase, char.MinValue, _defaultCulture, HelperConfig.Empty);
        }

        // Splits the text into terms (runs of letters/digits, optionally underscores),
        // drops everything else, re-cases each term according to caseType and joins the
        // terms with the separator (no separator when it is char.MinValue).
        internal string CleanUtf8String(string text, CleanStringType caseType, char separator, CultureInfo culture, HelperConfig config)
        {
            int opos = 0, ipos = 0;
            var state = StateBreak;

            caseType &= CleanStringType.CaseMask;

            // if we apply global ToUpper or ToLower to text here
            // then we cannot break words on uppercase chars
            var input = text;

            // it's faster to use an array than a StringBuilder
            var ilen = input.Length;
            var output = new char[ilen * 2]; // twice the length should be OK in all cases

            //var termFilter = config.TermFilter;

            for (var i = 0; i < ilen; i++)
            {
                var c = input[i];
                var isDigit = char.IsDigit(c);
                var isUpper = char.IsUpper(c); // false for digits, symbols...
                var isLower = char.IsLower(c); // false for digits, symbols...
                var isUnder = config.AllowUnderscoreInTerm && c == '_';
                var isTerm = char.IsLetterOrDigit(c) || isUnder;

                switch (state)
                {
                    case StateBreak:
                        // the very first term cannot start with an underscore, nor with
                        // a digit unless leading digits are allowed
                        if (isTerm && (opos > 0 || (isUnder == false && (config.AllowLeadingDigits || isDigit == false))))
                        {
                            ipos = i;
                            if (opos > 0 && separator != char.MinValue)
                                output[opos++] = separator;
                            state = isUpper ? StateUp : StateWord;
                        }
                        break;

                    case StateWord:
                        if (isTerm == false || (config.BreakTermsOnUpper && isUpper))
                        {
                            CopyUtf8Term(input, ipos, output, ref opos, i - ipos, caseType, culture, /*termFilter,*/ false);
                            ipos = i;
                            state = isTerm ? StateUp : StateBreak;
                            if (state != StateBreak && separator != char.MinValue)
                                output[opos++] = separator;
                        }
                        break;

                    case StateAcronym:
                        if (isTerm == false || isLower || isDigit)
                        {
                            // non-greedy: the last uppercase char belongs to the next word,
                            // so step back one char - the loop increment then re-reads the
                            // current (lowercase) char in StateWord
                            if (isLower && config.GreedyAcronyms == false)
                                i -= 1;
                            CopyUtf8Term(input, ipos, output, ref opos, i - ipos, caseType, culture, /*termFilter,*/ true);
                            ipos = i;
                            state = isTerm ? StateWord : StateBreak;
                            if (state != StateBreak && separator != char.MinValue)
                                output[opos++] = separator;
                        }
                        break;

                    case StateUp:
                        if (isTerm)
                        {
                            state = isUpper ? StateAcronym : StateWord;
                        }
                        else
                        {
                            CopyUtf8Term(input, ipos, output, ref opos, 1, caseType, culture, /*termFilter,*/ false);
                            state = StateBreak;
                        }
                        break;

                    default:
                        throw new InvalidOperationException("Invalid state.");
                }
            }

            // flush the term that was being read when the input ended
            switch (state)
            {
                case StateBreak:
                    break;

                case StateWord:
                    CopyUtf8Term(input, ipos, output, ref opos, input.Length - ipos, caseType, culture, /*termFilter,*/ false);
                    break;

                case StateAcronym:
                case StateUp:
                    CopyUtf8Term(input, ipos, output, ref opos, input.Length - ipos, caseType, culture, /*termFilter,*/ true);
                    break;

                default:
                    throw new InvalidOperationException("Invalid state.");
            }

            return new string(output, 0, opos);
        }

        // Copies the term input[ipos .. ipos+len) to output at opos, re-cased according
        // to caseType, and advances opos by the number of chars written.
        internal void CopyUtf8Term(string input, int ipos, char[] output, ref int opos, int len,
            CleanStringType caseType, CultureInfo culture, /*Func<string, string> termFilter,*/ bool isAcronym)
        {
            var term = input.Substring(ipos, len);

            //if (termFilter != null)
            //{
            //    term = termFilter(term);
            //    len = term.Length;
            //}

            if (isAcronym)
            {
                // short acronyms keep their casing (see the MS guidelines above),
                // and so does any acronym in Umbraco case
                if ((caseType == CleanStringType.CamelCase && len <= 2 && opos > 0) ||
                    (caseType == CleanStringType.PascalCase && len <= 2) ||
                    (caseType == CleanStringType.UmbracoCase))
                    caseType = CleanStringType.Unchanged;
            }

            char c;
            switch (caseType)
            {
                //case CleanStringType.LowerCase:
                //case CleanStringType.UpperCase:
                case CleanStringType.Unchanged:
                    term.CopyTo(0, output, opos, len);
                    opos += len;
                    break;

                case CleanStringType.LowerCase:
                    term.ToLower(culture).CopyTo(0, output, opos, len);
                    opos += len;
                    break;

                case CleanStringType.UpperCase:
                    term.ToUpper(culture).CopyTo(0, output, opos, len);
                    opos += len;
                    break;

                case CleanStringType.CamelCase:
                    // first char is lower for the very first term, upper for the others
                    c = term[0];
                    output[opos] = opos == 0 ? char.ToLower(c, culture) : char.ToUpper(c, culture);
                    opos++;
                    if (len > 1)
                        term.ToLower(culture).CopyTo(1, output, opos, len - 1);
                    opos += len - 1;
                    break;

                case CleanStringType.PascalCase:
                    c = term[0];
                    output[opos++] = char.ToUpper(c, culture);
                    if (len > 1)
                        term.ToLower(culture).CopyTo(1, output, opos, len - 1);
                    opos += len - 1;
                    break;

                case CleanStringType.UmbracoCase:
                    // first char is unchanged for the very first term, upper for the
                    // others; the rest of the term is left unchanged
                    c = term[0];
                    output[opos] = opos == 0 ? c : char.ToUpper(c, culture);
                    opos++;
                    if (len > 1)
                        term.CopyTo(1, output, opos, len - 1);
                    opos += len - 1;
                    break;

                default:
                    throw new ArgumentOutOfRangeException("caseType");
            }
        }

        #endregion

        #region SplitPascalCasing

        /// <summary>
        /// Splits a Pascal-cased string into a phrase separated by a separator.
        /// </summary>
        /// <param name="text">The text to split.</param>
        /// <param name="separator">The separator, which defaults to a whitespace.</param>
        /// <returns>The split text.</returns>
        /// <remarks>Supports Utf8 and Ascii strings, not Unicode strings.</remarks>
        public virtual string SplitPascalCasing(string text, char separator)
        {
            // be safe
            if (text == null)
                throw new ArgumentNullException("text");

            var input = text.ToCharArray();
            var output = new char[input.Length * 2]; // at most one separator per char
            var opos = 0;

            // 'a' is the previous char, written one iteration late so that a separator
            // can be inserted before or after it once the current char is known;
            // 'upos' is non-zero while within a run of uppercase chars
            var a = input.Length > 0 ? input[0] : char.MinValue;
            var upos = char.IsUpper(a) ? 1 : 0;

            for (var i = 1; i < input.Length; i++)
            {
                var c = input[i];
                if (char.IsUpper(c))
                {
                    output[opos++] = a;
                    if (upos == 0)
                    {
                        if (opos > 0)
                            output[opos++] = separator;
                        upos = i + 1;
                    }
                }
                else
                {
                    if (upos > 0)
                    {
                        if (upos < i && opos > 0)
                            output[opos++] = separator;
                        upos = 0;
                    }
                    output[opos++] = a;
                }
                a = c;
            }

            if (a != char.MinValue)
                output[opos++] = a;

            return new string(output, 0, opos);
        }

        #endregion

        #region Recode

        /// <summary>
        /// Returns a new string containing only characters within the specified code type.
        /// </summary>
        /// <param name="text">The string to filter.</param>
        /// <param name="stringType">The string type.</param>
        /// <returns>The filtered string.</returns>
        /// <remarks>If <paramref name="stringType"/> is not Unicode then non-utf8 characters are
        /// removed. If it is Ascii we try to do some intelligent replacement of accents, etc.</remarks>
        public virtual string Recode(string text, CleanStringType stringType)
        {
            // be safe
            if (text == null)
                throw new ArgumentNullException("text");

            var codeType = stringType & CleanStringType.CodeMask;

            // unicode to utf8 or ascii: just remove the unicode chars
            // utf8 to ascii: try to be clever and replace some chars

            // what's the point?
            if (codeType == CleanStringType.Unicode)
                return text;

            return codeType == CleanStringType.Utf8
                ? RemoveNonUtf8(text)
                : Utf8ToAsciiConverter.ToAsciiString(text);
        }

        // Removes all surrogate chars from the text.
        private string RemoveNonUtf8(string text)
        {
            var len = text.Length;
            var output = new char[len]; // we won't be adding chars
            int opos = 0;

            for (var ipos = 0; ipos < len; ipos++)
            {
                var c = text[ipos];
                if (char.IsSurrogate(c))
                {
                    // drop the surrogate, and skip its partner only when the two form a
                    // well-formed pair - a lone surrogate must not swallow the valid
                    // char that follows it
                    if (char.IsHighSurrogate(c) && ipos + 1 < len && char.IsLowSurrogate(text[ipos + 1]))
                        ipos++;
                }
                else
                {
                    output[opos++] = c;
                }
            }

            return new string(output, 0, opos);
        }

        #endregion

        #region ReplaceMany

        /// <summary>
        /// Returns a new string in which all occurrences of specified strings are replaced by other specified strings.
        /// </summary>
        /// <param name="text">The string to filter.</param>
        /// <param name="replacements">The replacements definition.</param>
        /// <returns>The filtered string.</returns>
        public virtual string ReplaceMany(string text, IDictionary<string, string> replacements)
        {
            // be safe
            if (text == null)
                throw new ArgumentNullException("text");
            if (replacements == null)
                throw new ArgumentNullException("replacements");

            // Have done various tests, implementing my own "super fast" state machine to handle
            // replacement of many items, or via regexes, but on short strings and not too
            // many replacements (which prob. is going to be our case) nothing can beat this...
            // (at least with safe and checked code -- we don't want unsafe/unchecked here)

            // Note that it will do chained-replacements ie replaced items can be replaced
            // in turn by another replacement (ie the order of replacements is important)

            return replacements.Aggregate(text, (current, kvp) => current.Replace(kvp.Key, kvp.Value));
        }

        /// <summary>
        /// Returns a new string in which all occurrences of specified characters are replaced by a specified character.
        /// </summary>
        /// <param name="text">The string to filter.</param>
        /// <param name="chars">The characters to replace.</param>
        /// <param name="replacement">The replacement character.</param>
        /// <returns>The filtered string.</returns>
        public virtual string ReplaceMany(string text, char[] chars, char replacement)
        {
            // be safe
            if (text == null)
                throw new ArgumentNullException("text");
            if (chars == null)
                throw new ArgumentNullException("chars");

            // same approach as the string-based overload: plain successive Replace calls
            return chars.Aggregate(text, (current, c) => current.Replace(c, replacement));
        }

        #endregion
    }
}