src/Umbraco.Core/Strings/DefaultShortStringHelper.cs


// debugging
// define WRTCONS to write cleaning details & steps to console
// leave it wrapped within #if DEBUG to make sure it does not leak
// into RELEASE, see http://issues.umbraco.org/issue/U4-4199
#if DEBUG
#undef WRTCONS
#endif

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Globalization;
using System.Text;
using System.Text.RegularExpressions;
using Umbraco.Core.Configuration;

namespace Umbraco.Core.Strings
{
    /// <summary>
    /// New default implementation of string functions for short strings such as aliases or url segments.
    /// </summary>
    /// <remarks>
    /// <para>Not optimized to work on large bodies of text.</para>
    /// <para>Meant to replace <c>LegacyShortStringHelper</c> where/when backward compatibility is not an issue.</para>
    /// <para>NOTE: pre-filters run _before_ the string is re-encoded.</para>
    /// </remarks>
    public class DefaultShortStringHelper : IShortStringHelper
    {
        #region Ctor and vars

        /// <summary>
        /// Initializes a new instance of the <see cref="DefaultShortStringHelper"/> class.
        /// </summary>
        /// <remarks>Loads the url replacement characters from the Umbraco settings; no cleaning
        /// configuration is registered here.</remarks>
        public DefaultShortStringHelper()
        {
            InitializeLegacyUrlReplaceCharacters();
        }

        /// <summary>
        /// Freezes the helper so that its configuration can no longer be modified.
        /// </summary>
        /// <remarks>
        /// <para>Will be called by <c>ShortStringHelperResolver</c> when resolution freezes.</para>
        /// <para>Once frozen, the configuration methods that call <c>EnsureNotFrozen</c> throw
        /// an <see cref="InvalidOperationException"/>.</para>
        /// </remarks>
        public void Freeze()
        {
            _frozen = true;
        }

        // see notes for CleanAsciiString
        //// beware! the order is quite important here!
        //const string ValidStringCharactersSource = "abcdefghijklmnopqrstuvwxyz0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
        //readonly static char[] ValidStringCharacters;

        // culture used by the overloads that do not take a culture; replaced by WithDefaultCulture
        private CultureInfo _defaultCulture = CultureInfo.InvariantCulture;
        // set to true by Freeze; checked by EnsureNotFrozen before any configuration change
        private bool _frozen;
        // registered configurations, indexed by culture and then by string role
        private readonly Dictionary<CultureInfo, Dictionary<CleanStringType, Config>> _configs = new Dictionary<CultureInfo, Dictionary<CleanStringType, Config>>();

        // see notes for CleanAsciiString
        //static DefaultShortStringHelper()
        //{
        //    ValidStringCharacters = ValidStringCharactersSource.ToCharArray();
        //}

        #endregion

        #region Filters

        // replacement map (string to find -> replacement), filled once from the
        // UrlReplaceCharacters settings by InitializeLegacyUrlReplaceCharacters
        private readonly Dictionary<string, string> _urlReplaceCharacters = new Dictionary<string, string>();

        /// <summary>
        /// Fills the replacement map from the <c>char</c> nodes of the Umbraco settings UrlReplaceCharacters.
        /// </summary>
        /// <remarks>Each node contributes one entry: its non-empty <c>org</c> attribute is the key,
        /// and the node value is the replacement. Nodes without attributes or without a usable
        /// <c>org</c> attribute are ignored.</remarks>
        private void InitializeLegacyUrlReplaceCharacters()
        {
            var settings = UmbracoSettings.UrlReplaceCharacters;
            var charNodes = settings == null ? null : settings.SelectNodes("char");
            if (charNodes == null) return;

            foreach (System.Xml.XmlNode charNode in charNodes)
            {
                var nodeAttributes = charNode.Attributes;
                if (nodeAttributes == null) continue;

                var original = nodeAttributes.GetNamedItem("org");
                if (original == null || original.Value == "") continue;

                // a later node with the same "org" overrides an earlier one
                _urlReplaceCharacters[original.Value] = XmlHelper.GetNodeValue(charNode);
            }
        }

        /// <summary>
        /// Gets a value indicating whether the Umbraco settings UrlReplaceCharacters carry
        /// a <c>toAscii</c> attribute whose value is exactly <c>true</c>.
        /// </summary>
        private static bool UrlReplacingToAscii
        {
            get
            {
                var settings = UmbracoSettings.UrlReplaceCharacters;
                if (settings == null) return false;

                var settingsAttributes = settings.Attributes;
                if (settingsAttributes == null) return false;

                var toAscii = settingsAttributes.GetNamedItem("toAscii");
                if (toAscii == null) return false;

                return toAscii.Value == "true";
            }
        }

        /// <summary>
        /// Applies the replacements defined by the Umbraco settings UrlReplaceCharacters to a string.
        /// </summary>
        /// <param name="s">The string to filter.</param>
        /// <returns>A new string in which the configured characters have been replaced.</returns>
        public string ApplyUrlReplaceCharacters(string s)
        {
            var replacements = _urlReplaceCharacters;
            return s.ReplaceMany(replacements);
        }

        // characters rejected by IsValidFileNameChar: the ones reported by the framework
        // plus a fixed list of url-unsafe characters (including the space).
        // static is fine, this is not configurable in any way.
        private static readonly char[] InvalidFileNameChars =
            Path.GetInvalidFileNameChars()
            .Concat("!*'();:@&=+$,/?%#[]-~{}\"<>\\^`| ")
            .Distinct()
            .ToArray();

        /// <summary>
        /// Determines whether a character is allowed in a file name.
        /// </summary>
        /// <param name="c">The character to test.</param>
        /// <returns><c>true</c> if the character is not one of the invalid file name characters; otherwise <c>false</c>.</returns>
        public static bool IsValidFileNameChar(char c)
        {
            var index = Array.IndexOf(InvalidFileNameChars, c);
            return index < 0;
        }

        /// <summary>
        /// Truncates a string to a maximum number of characters.
        /// </summary>
        /// <param name="text">The text to truncate.</param>
        /// <param name="length">The maximum number of characters to keep.</param>
        /// <returns>The original text if it is not longer than <paramref name="length"/>,
        /// otherwise its first <paramref name="length"/> characters.</returns>
        public static string CutMaxLength(string text, int length)
        {
            if (text.Length > length)
                return text.Substring(0, length);

            return text;
        }

        #endregion

        #region Configuration

        /// <summary>
        /// Guards configuration changes: throws if the helper has been frozen.
        /// </summary>
        /// <exception cref="InvalidOperationException">The helper is frozen.</exception>
        private void EnsureNotFrozen()
        {
            if (_frozen == false) return;

            throw new InvalidOperationException("Cannot configure the helper once it is frozen.");
        }

        /// <summary>
        /// Sets a default culture.
        /// </summary>
        /// <param name="culture">The default culture.</param>
        /// <returns>The short string helper.</returns>
        /// <remarks>Configurations already registered for the previous default culture are not moved.</remarks>
        /// <exception cref="ArgumentNullException"><paramref name="culture"/> is <c>null</c>.</exception>
        /// <exception cref="InvalidOperationException">The helper is frozen.</exception>
        public DefaultShortStringHelper WithDefaultCulture(CultureInfo culture)
        {
            // fail here rather than later: a null default culture is used as a dictionary key
            // and is rejected by CleanString, so storing it would only delay the error
            if (culture == null)
                throw new ArgumentNullException("culture");

            EnsureNotFrozen();
            _defaultCulture = culture;
            return this;
        }

        /// <summary>
        /// Registers a configuration that applies to all string roles, for the current default culture.
        /// </summary>
        /// <param name="config">The configuration.</param>
        /// <returns>The short string helper.</returns>
        /// <remarks>The configuration is stored under <c>CleanStringType.RoleMask</c>, which is the
        /// entry looked up when no configuration exists for the specific role.</remarks>
        public DefaultShortStringHelper WithConfig(Config config)
        {
            var culture = _defaultCulture;
            const CleanStringType allRoles = CleanStringType.RoleMask;
            return WithConfig(culture, allRoles, config);
        }

        /// <summary>
        /// Registers a configuration for a string role, for the current default culture.
        /// </summary>
        /// <param name="stringRole">The string role.</param>
        /// <param name="config">The configuration.</param>
        /// <returns>The short string helper.</returns>
        public DefaultShortStringHelper WithConfig(CleanStringType stringRole, Config config)
        {
            var culture = _defaultCulture;
            return WithConfig(culture, stringRole, config);
        }

        /// <summary>
        /// Registers a configuration for a string role and a culture.
        /// </summary>
        /// <param name="culture">The culture.</param>
        /// <param name="stringRole">The string role.</param>
        /// <param name="config">The configuration.</param>
        /// <returns>The short string helper.</returns>
        /// <remarks>A copy of the configuration is stored, and it replaces any configuration
        /// previously registered for the same culture and role.</remarks>
        /// <exception cref="ArgumentNullException"><paramref name="config"/> or <paramref name="culture"/> is <c>null</c>.</exception>
        /// <exception cref="InvalidOperationException">The helper is frozen.</exception>
        public DefaultShortStringHelper WithConfig(CultureInfo culture, CleanStringType stringRole, Config config)
        {
            if (config == null)
                throw new ArgumentNullException("config");
            // the culture is a dictionary key: report a null with the right parameter name
            // instead of letting the dictionary complain about its "key"
            if (culture == null)
                throw new ArgumentNullException("culture");

            EnsureNotFrozen();

            Dictionary<CleanStringType, Config> roleConfigs;
            if (_configs.TryGetValue(culture, out roleConfigs) == false)
            {
                roleConfigs = new Dictionary<CleanStringType, Config>();
                _configs[culture] = roleConfigs;
            }

            roleConfigs[stringRole] = config.Clone(); // clone so it can't be changed
            return this;
        }

        /// <summary>
        /// Registers the default configurations for the url segment, file name, alias
        /// and convert-case roles, for the current default culture.
        /// </summary>
        /// <returns>The short string helper.</returns>
        public DefaultShortStringHelper WithDefaultConfig()
        {
            // term rule shared by every role but aliases: letter, digit or underscore
            Func<char, bool, bool> letterDigitOrUnderscore = (c, leading) => char.IsLetterOrDigit(c) || c == '_';

            // url segments: lowercase, '-' separated, cut at 240 chars,
            // ascii only if the settings ask for it
            var urlSegment = new Config();
            urlSegment.PreFilter = ApplyUrlReplaceCharacters;
            urlSegment.PostFilter = x => CutMaxLength(x, 240);
            urlSegment.IsTerm = letterDigitOrUnderscore;
            urlSegment.StringType = (UrlReplacingToAscii ? CleanStringType.Ascii : CleanStringType.Utf8) | CleanStringType.LowerCase;
            urlSegment.BreakTermsOnUpper = false;
            urlSegment.Separator = '-';
            WithConfig(CleanStringType.UrlSegment, urlSegment);

            // file names: lowercase utf8, '-' separated
            var fileName = new Config();
            fileName.PreFilter = ApplyUrlReplaceCharacters;
            fileName.IsTerm = letterDigitOrUnderscore;
            fileName.StringType = CleanStringType.Utf8 | CleanStringType.LowerCase;
            fileName.BreakTermsOnUpper = false;
            fileName.Separator = '-';
            WithConfig(CleanStringType.FileName, fileName);

            // aliases: ascii, umbraco case, must begin with a letter
            var alias = new Config();
            alias.PreFilter = ApplyUrlReplaceCharacters;
            alias.IsTerm = (c, leading) =>
            {
                if (leading)
                    return char.IsLetter(c); // only letters
                return char.IsLetterOrDigit(c) || c == '_'; // letter, digit or underscore
            };
            alias.StringType = CleanStringType.Ascii | CleanStringType.UmbracoCase;
            alias.BreakTermsOnUpper = false;
            WithConfig(CleanStringType.Alias, alias);

            // case conversion: ascii, no pre-filter, terms break on uppercase
            var convertCase = new Config();
            convertCase.PreFilter = null;
            convertCase.IsTerm = letterDigitOrUnderscore;
            convertCase.StringType = CleanStringType.Ascii;
            convertCase.BreakTermsOnUpper = true;
            return WithConfig(CleanStringType.ConvertCase, convertCase);
        }

        /// <summary>
        /// Holds the settings that drive the cleaning of a string for one role and one culture.
        /// </summary>
        public sealed class Config
        {
            /// <summary>
            /// Initializes a new configuration with the defaults: utf8, unchanged casing, no filters,
            /// no separator, and terms made of letters and digits that must begin with a letter.
            /// </summary>
            public Config()
            {
                PreFilter = null;
                PostFilter = null;
                IsTerm = DefaultIsTerm;
                StringType = CleanStringType.Utf8 | CleanStringType.Unchanged;
                Separator = Char.MinValue;
                BreakTermsOnUpper = false;
                CutAcronymOnNonUpper = false;
                GreedyAcronyms = false;
            }

            // default term rule: a leading char must be a letter, other chars can be letters or digits
            private static bool DefaultIsTerm(char c, bool leading)
            {
                return leading ? char.IsLetter(c) : char.IsLetterOrDigit(c);
            }

            /// <summary>
            /// Creates a copy of this configuration.
            /// </summary>
            /// <returns>A new configuration with the same property values; the delegates are shared.</returns>
            public Config Clone()
            {
                // every member is a delegate reference or a value: a shallow copy is a full copy
                return (Config) MemberwiseClone();
            }

            // filter applied to the text before it is cleaned, or null for none
            public Func<string, string> PreFilter { get; set; }

            // filter applied to the text after it has been cleaned, or null for none
            public Func<string, string> PostFilter { get; set; }

            // tells whether a char can be part of a term; the second argument
            // tells whether the char would be the leading char of the result
            public Func<char, bool, bool> IsTerm { get; set; }

            // the casing and encoding applied when the caller does not specify them
            public CleanStringType StringType { get; set; }

            // indicates whether an uppercase within a term eg "fooBar" is to break
            // into a new term, or to be considered as part of the current term
            public bool BreakTermsOnUpper { get; set; }

            // indicates whether a non-uppercase within an acronym eg "FOOBar" is to cut
            // the acronym (at "B" or "a" depending on GreedyAcronyms) or to give
            // up the acronym and treat the term as a word
            public bool CutAcronymOnNonUpper { get; set; }

            // indicates whether acronyms parsing is greedy ie whether "FOObar" is
            // "FOO" + "bar" (greedy) or "FO" + "Obar" (non-greedy)
            public bool GreedyAcronyms { get; set; }

            // the separator char written between terms;
            // char.MinValue means that no separator is written
            public char Separator { get; set; }

            /// <summary>
            /// Merges a requested string type into the configured one.
            /// </summary>
            /// <param name="stringType">The requested string type.</param>
            /// <returns>The configured string type, where the casing and/or the encoding have been
            /// replaced by those of <paramref name="stringType"/> when it specifies them.</returns>
            public CleanStringType StringTypeExtend(CleanStringType stringType)
            {
                var extended = StringType;
                extended = OverridePart(extended, stringType, CleanStringType.CaseMask);
                extended = OverridePart(extended, stringType, CleanStringType.CodeMask);
                return extended;
            }

            // replaces the masked part of 'current' with the masked part of 'requested',
            // unless 'requested' specifies nothing for that mask
            private static CleanStringType OverridePart(CleanStringType current, CleanStringType requested, CleanStringType mask)
            {
                var part = requested & mask;
                if (part == 0) return current;

                return (current & ~mask) | part;
            }

            // returned when no configuration has been registered
            internal static readonly Config NotConfigured = new Config();
        }

        /// <summary>
        /// Gets the configuration to use for a string type and a culture.
        /// </summary>
        /// <param name="stringType">The string type; only its role part is considered.</param>
        /// <param name="culture">The culture.</param>
        /// <returns>The configuration registered for the role, else the one registered for all roles,
        /// else <c>Config.NotConfigured</c>.</returns>
        /// <remarks>The configurations of the default culture are considered only when nothing at all
        /// is registered for <paramref name="culture"/>: a culture that has some configurations, but
        /// none for the role, does not fall back to the default culture.</remarks>
        private Config GetConfig(CleanStringType stringType, CultureInfo culture)
        {
            var role = stringType & CleanStringType.RoleMask;

            // pick the set of configurations: the culture's own, else the default culture's
            Dictionary<CleanStringType, Config> roleConfigs;
            if (_configs.TryGetValue(culture, out roleConfigs) == false
                && _configs.TryGetValue(_defaultCulture, out roleConfigs) == false)
                return Config.NotConfigured;

            Config match;
            if (roleConfigs.TryGetValue(role, out match)) // have we got a config for _that_ role?
                return match;
            if (roleConfigs.TryGetValue(CleanStringType.RoleMask, out match)) // have we got a generic config for _all_ roles?
                return match;

            return Config.NotConfigured;
        }

        #endregion

        #region JavaScript

        // composite format string of the client-side short string services script, meant
        // to be passed to string.Format by GetShortStringServicesJavaScript:
        //   {0} is the JavaScript boolean literal telling whether safe aliases are forced
        //   {1} is the path of the controller that serves 'ToSafeAlias'
        // every literal JavaScript brace is doubled because of string.Format escaping.
        private const string SssjsFormat = @"
var UMBRACO_FORCE_SAFE_ALIAS = {0};
var UMBRACO_FORCE_SAFE_ALIAS_URL = '{1}';
var UMBRACO_FORCE_SAFE_ALIAS_TIMEOUT = 666;
var UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS = {{ }};

function getSafeAliasFromServer(value, callback) {{
    $.getJSON(UMBRACO_FORCE_SAFE_ALIAS_URL + 'ToSafeAlias?value=' + encodeURIComponent(value), function(json) {{
        if (json.alias) {{ callback(json.alias); }}
    }});
}}

function getSafeAlias(id, value, immediate, callback) {{
    if (!UMBRACO_FORCE_SAFE_ALIAS) {{
        callback(value);
        return;
    }}
    if (UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id]) clearTimeout(UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id]);
    UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id] = setTimeout(function() {{
        UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id] = null;
        getSafeAliasFromServer(value, function(alias) {{ callback(alias); }});
    }}, UMBRACO_FORCE_SAFE_ALIAS_TIMEOUT);
}}

function validateSafeAlias(id, value, immediate, callback) {{
    if (!UMBRACO_FORCE_SAFE_ALIAS) {{
        callback(true);
        return;
    }}
    if (UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id]) clearTimeout(UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id]);
    UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id] = setTimeout(function() {{
        UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id] = null;
        getSafeAliasFromServer(value, function(alias) {{ callback(value.toLowerCase() == alias.toLowerCase()); }});
    }}, UMBRACO_FORCE_SAFE_ALIAS_TIMEOUT);
}}
";

        /// <summary>
        /// Gets the JavaScript code defining client-side short string services.
        /// </summary>
        /// <param name="controllerPath">The path of the controller, inserted as-is into the script
        /// as the base url of the safe alias service.</param>
        /// <returns>The JavaScript code.</returns>
        public string GetShortStringServicesJavaScript(string controllerPath)
        {
            var forceSafeAliases = UmbracoSettings.ForceSafeAliases ? "true" : "false";
            return string.Format(SssjsFormat, forceSafeAliases, controllerPath);
        }

        #endregion

        #region IShortStringHelper CleanFor...

        /// <summary>
        /// Cleans a string, in the context of the default culture, to produce a string that can safely be used in an alias.
        /// </summary>
        /// <param name="text">The text to filter.</param>
        /// <returns>The safe alias.</returns>
        /// <remarks>
        /// <para>With the default configuration, safe aliases are Ascii only.</para>
        /// </remarks>
        public virtual string CleanStringForSafeAlias(string text)
        {
            var culture = _defaultCulture;
            return CleanStringForSafeAlias(text, culture);
        }

        /// <summary>
        /// Cleans a string, in the context of a specified culture, to produce a string that can safely be used in an alias.
        /// </summary>
        /// <param name="text">The text to filter.</param>
        /// <param name="culture">The culture.</param>
        /// <returns>The safe alias.</returns>
        /// <remarks>
        /// <para>The string is cleaned with the configuration of the alias role.</para>
        /// <para>With the default configuration, safe aliases are Ascii only.</para>
        /// </remarks>
        public virtual string CleanStringForSafeAlias(string text, CultureInfo culture)
        {
            const CleanStringType role = CleanStringType.Alias;
            return CleanString(text, role, culture);
        }

        /// <summary>
        /// Cleans a string, in the context of the default culture, to produce a string that can safely be used in an url segment.
        /// </summary>
        /// <param name="text">The text to filter.</param>
        /// <returns>The safe url segment.</returns>
        /// <remarks>
        /// <para>With the default configuration, url segments are Ascii only (no accents...) when the
        /// url replace characters settings have <c>toAscii</c> set to <c>true</c>, and Utf8 otherwise.</para>
        /// </remarks>
        public virtual string CleanStringForUrlSegment(string text)
        {
            var culture = _defaultCulture;
            return CleanStringForUrlSegment(text, culture);
        }

        /// <summary>
        /// Cleans a string, in the context of a specified culture, to produce a string that can safely be used in an url segment.
        /// </summary>
        /// <param name="text">The text to filter.</param>
        /// <param name="culture">The culture.</param>
        /// <returns>The safe url segment.</returns>
        /// <remarks>
        /// <para>The string is cleaned with the configuration of the url segment role.</para>
        /// <para>With the default configuration, url segments are Ascii only (no accents...) when the
        /// url replace characters settings have <c>toAscii</c> set to <c>true</c>, and Utf8 otherwise.</para>
        /// </remarks>
        public virtual string CleanStringForUrlSegment(string text, CultureInfo culture)
        {
            const CleanStringType role = CleanStringType.UrlSegment;
            return CleanString(text, role, culture);
        }

        /// <summary>
        /// Cleans a string, in the context of the default culture, to produce a string that can safely be used as a filename,
        /// both internally (on disk) and externally (as a url).
        /// </summary>
        /// <param name="text">The text to filter.</param>
        /// <returns>The safe filename.</returns>
        /// <remarks>Legacy says this was used to "overcome an issue when Umbraco is used in IE in an intranet environment" but that issue is not documented.</remarks>
        public virtual string CleanStringForSafeFileName(string text)
        {
            var culture = _defaultCulture;
            return CleanStringForSafeFileName(text, culture);
        }

        /// <summary>
        /// Cleans a string, in the context of a specified culture, to produce a string that can safely be used as a filename,
        /// both internally (on disk) and externally (as a url).
        /// </summary>
        /// <param name="text">The text to filter.</param>
        /// <param name="culture">The culture.</param>
        /// <returns>The safe filename, or an empty string if the text is null, empty or white space.</returns>
        /// <remarks>The name and the extension are cleaned separately, with the configuration of
        /// the file name role, then joined again with a dot.</remarks>
        public virtual string CleanStringForSafeFileName(string text, CultureInfo culture)
        {
            if (string.IsNullOrWhiteSpace(text))
                return string.Empty;

            // neutralize what the file system rejects before splitting name and extension
            var sanitized = text.ReplaceMany(Path.GetInvalidFileNameChars(), '-');

            var name = Path.GetFileNameWithoutExtension(sanitized);
            var ext = Path.GetExtension(sanitized); // includes the dot, empty if no extension

            Debug.Assert(name != null, "name != null");
            var cleanName = name.Length == 0
                ? name
                : CleanString(name, CleanStringType.FileName, culture);

            Debug.Assert(ext != null, "ext != null");
            var cleanExt = ext.Length == 0
                ? ext
                : CleanString(ext.Substring(1), CleanStringType.FileName, culture); // without the dot

            // the extension may have been cleaned to nothing
            if (cleanExt.Length == 0)
                return cleanName;

            return cleanName + "." + cleanExt;
        }

        #endregion

        #region CleanString

        // MS rules & guidelines:
        // - Do capitalize both characters of two-character acronyms, except the first word of a camel-cased identifier.
        //     eg "DBRate" (pascal) or "ioHelper" (camel) - "SpecialDBRate" (pascal) or "specialIOHelper" (camel)
        // - Do capitalize only the first character of acronyms with three or more characters, except the first word of a camel-cased identifier.
        //     eg "XmlWriter" (pascal) or "htmlReader" (camel) - "SpecialXmlWriter" (pascal) or "specialHtmlReader" (camel)
        // - Do not capitalize any of the characters of any acronyms, whatever their length, at the beginning of a camel-cased identifier.
        //     eg "xmlWriter" or "dbWriter" (camel)
        //
        // Our additional stuff:
        // - Leading digits are removed.
        // - Many consecutive separators are folded into one unique separator.

        // states of the parser in CleanCodeString

        // within a break, ie not within a term
        const byte StateBreak = 1;
        // within a term that began with an uppercase char = could be a word or an acronym
        const byte StateUp = 2;
        // within a term / word
        const byte StateWord = 3;
        // within a term / acronym
        const byte StateAcronym = 4;

        /// <summary>
        /// Cleans a string, in the context of the default culture, using the configured separator.
        /// </summary>
        /// <param name="text">The text to clean.</param>
        /// <param name="stringType">A flag indicating the target casing and encoding of the string. By default, 
        /// strings are cleaned up to camelCase and Ascii.</param>
        /// <returns>The clean string.</returns>
        public string CleanString(string text, CleanStringType stringType)
        {
            char? configuredSeparator = null;
            return CleanString(text, stringType, _defaultCulture, configuredSeparator);
        }

        /// <summary>
        /// Cleans a string, in the context of the default culture, using a specified separator.
        /// </summary>
        /// <param name="text">The text to clean.</param>
        /// <param name="stringType">A flag indicating the target casing and encoding of the string. By default, 
        /// strings are cleaned up to camelCase and Ascii.</param>
        /// <param name="separator">The separator, which overrides the configured one.</param>
        /// <returns>The clean string.</returns>
        public string CleanString(string text, CleanStringType stringType, char separator)
        {
            char? explicitSeparator = separator;
            return CleanString(text, stringType, _defaultCulture, explicitSeparator);
        }

        /// <summary>
        /// Cleans a string, in the context of a specified culture, using the configured separator.
        /// </summary>
        /// <param name="text">The text to clean.</param>
        /// <param name="stringType">A flag indicating the target casing and encoding of the string. By default, 
        /// strings are cleaned up to camelCase and Ascii.</param>
        /// <param name="culture">The culture.</param>
        /// <returns>The clean string.</returns>
        public string CleanString(string text, CleanStringType stringType, CultureInfo culture)
        {
            char? configuredSeparator = null;
            return CleanString(text, stringType, culture, configuredSeparator);
        }

        /// <summary>
        /// Cleans a string, in the context of a specified culture, using a specified separator.
        /// </summary>
        /// <param name="text">The text to clean.</param>
        /// <param name="stringType">A flag indicating the target casing and encoding of the string. By default, 
        /// strings are cleaned up to camelCase and Ascii.</param>
        /// <param name="separator">The separator, which overrides the configured one.</param>
        /// <param name="culture">The culture.</param>
        /// <returns>The clean string.</returns>
        public string CleanString(string text, CleanStringType stringType, char separator, CultureInfo culture)
        {
            char? explicitSeparator = separator;
            return CleanString(text, stringType, culture, explicitSeparator);
        }

        /// <summary>
        /// Cleans a string: applies the pre-filter, re-encodes, splits into terms and applies
        /// casing and separator, then applies the post-filter.
        /// </summary>
        /// <param name="text">The text to clean.</param>
        /// <param name="stringType">A flag indicating the role, and optionally the target casing and
        /// encoding of the string. What is not specified comes from the configuration, and
        /// ultimately defaults to camelCase and Ascii.</param>
        /// <param name="culture">The culture.</param>
        /// <param name="separator">The separator, or <c>null</c> to use the configured one.</param>
        /// <returns>The clean string.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> or <paramref name="culture"/> is <c>null</c>.</exception>
        protected virtual string CleanString(string text, CleanStringType stringType, CultureInfo culture, char? separator)
        {
            // be safe
            if (text == null)
                throw new ArgumentNullException("text");
            if (culture == null)
                throw new ArgumentNullException("culture");

#if WRTCONS
            Console.WriteLine("STRING TYPE {0}", stringType);
#endif

            // the configuration provides whatever casing / encoding the caller did not specify
            var config = GetConfig(stringType, culture);
            var effectiveType = config.StringTypeExtend(stringType);

            // and if still not specified, fall back to camelCase and ascii
            if ((effectiveType & CleanStringType.CaseMask) == CleanStringType.None)
                effectiveType |= CleanStringType.CamelCase;
            if ((effectiveType & CleanStringType.CodeMask) == CleanStringType.None)
                effectiveType |= CleanStringType.Ascii;

            // an explicit separator wins over the configured one
            var effectiveSeparator = separator.HasValue ? separator.Value : config.Separator;

            // pre-filter runs before the string is re-encoded
            var preFilter = config.PreFilter;
            var filtered = preFilter == null ? text : preFilter(text);

            // recode: either down to ascii, or utf8 without surrogate pairs
            string recoded;
            if ((effectiveType & CleanStringType.CodeMask) == CleanStringType.Ascii)
                recoded = Utf8ToAsciiConverter.ToAsciiString(filtered);
            else
                recoded = RemoveSurrogatePairs(filtered);

            // clean
            var cleaned = CleanCodeString(recoded, effectiveType, effectiveSeparator, culture, config);

            // post-filter runs on the clean string
            var postFilter = config.PostFilter;
            return postFilter == null ? cleaned : postFilter(cleaned);
        }

        /// <summary>
        /// Replaces every surrogate in a string with a question mark.
        /// </summary>
        /// <param name="text">The text to process.</param>
        /// <returns>A string that contains no surrogate char: each well-formed surrogate pair is
        /// replaced with one '?', and each unpaired surrogate is replaced with one '?' too.</returns>
        /// <remarks>An unpaired surrogate never consumes the char that follows it.</remarks>
        private static string RemoveSurrogatePairs(string text)
        {
            // the result can only be shorter than, or as long as, the input
            var output = new char[text.Length];
            var opos = 0;

            for (var ipos = 0; ipos < text.Length; ipos++)
            {
                var c = text[ipos];
                if (char.IsSurrogate(c) == false)
                {
                    output[opos++] = c;
                    continue;
                }

                // skip the low surrogate only if it really completes a pair, otherwise
                // the next char is an ordinary char (or another surrogate) and must be
                // handled on its own
                if (char.IsSurrogatePair(text, ipos))
                    ipos++;

                output[opos++] = '?';
            }

            return new string(output, 0, opos);
        }

        // here was a subtle, ascii-optimized version of the cleaning code, and I was
        // very proud of it until benchmarking showed it was an order of magnitude slower
        // than the utf8 version. Micro-optimizing sometimes isn't such a good idea.

        /// <summary>
        /// Splits a string into terms and writes them, cased and separated, to a new string.
        /// </summary>
        /// <param name="text">The text to clean; must not contain surrogates.</param>
        /// <param name="caseType">The string type; only its casing part is used.</param>
        /// <param name="separator">The separator written between terms, or <c>char.MinValue</c> for none.</param>
        /// <param name="culture">The culture, passed to <c>CopyTerm</c> for casing.</param>
        /// <param name="config">The configuration that defines what a term char is and how terms and acronyms break.</param>
        /// <returns>The clean string.</returns>
        /// <remarks>Chars that are not term chars are dropped and act as breaks between terms.</remarks>
        /// <exception cref="NotSupportedException">The text contains a surrogate.</exception>
        // note: does NOT support surrogate pairs in text
        internal string CleanCodeString(string text, CleanStringType caseType, char separator, CultureInfo culture, Config config)
        {
            // opos is the write position in output, ipos is the position in input
            // where the current term begins
            int opos = 0, ipos = 0;
            var state = StateBreak;

            caseType &= CleanStringType.CaseMask;
#if WRTCONS
            Console.WriteLine("CASE {0}", caseType);
#endif

            // if we apply global ToUpper or ToLower to text here
            // then we cannot break words on uppercase chars
            var input = text;

            // it's faster to use an array than a StringBuilder
            var ilen = input.Length;
            var output = new char[ilen * 2]; // twice the length should be OK in all cases

            for (var i = 0; i < ilen; i++)
            {
                var c = input[i];
                // leading as long as StateBreak and ipos still zero
                var leading = state == StateBreak && ipos == 0;
                var isTerm = config.IsTerm(c, leading);

                //var isDigit = char.IsDigit(c);
                var isUpper = char.IsUpper(c); // false for digits, symbols...
                //var isLower = char.IsLower(c); // false for digits, symbols...

                // what should I do with surrogates?
                // no idea, really, so they are not supported at the moment
                var isPair = char.IsSurrogate(c);
                if (isPair)
                    throw new NotSupportedException("Surrogate pairs are not supported.");
#if WRTCONS
                Console.WriteLine("CHAR '{0}' {1} {2} - {3} - {4}/{5} {6}",
                    c,
                    isTerm ? "term" : "!term", isUpper ? "upper" : "!upper",
                    state,
                    i, ipos, leading ? "leading" : "!leading");
#endif
                switch (state)
                {
                    // within a break
                    case StateBreak:
                        // begin a new term if char is a term char,
                        // and ( pos > 0 or it's also a valid leading char )
                        if (isTerm)
                        {
                            ipos = i;
                            // separate from the previous term, if anything has been written already
                            if (opos > 0 && separator != char.MinValue)
                                output[opos++] = separator;
                            state = isUpper ? StateUp : StateWord;
                        }
                        break;

                    // within a term / word
                    case StateWord:
                        // end a term if char is not a term char,
                        // or ( it's uppercase and we break terms on uppercase)
                        if (isTerm == false || (config.BreakTermsOnUpper && isUpper))
                        {
                            CopyTerm(input, ipos, output, ref opos, i - ipos, caseType, culture, false);
                            ipos = i;
                            // a term char here is an uppercase char that begins the next term
                            state = isTerm ? StateUp : StateBreak;
                            if (state != StateBreak && separator != char.MinValue)
                                output[opos++] = separator;
                        }
                        break;

                    // within a term / acronym
                    case StateAcronym:
                        // end an acronym if char is not a term char,
                        // or if it's not uppercase / config
                        //Console.WriteLine("acro {0} {1}", c, (config.CutAcronymOnNonUpper && isUpper == false));
                        if (isTerm == false || (config.CutAcronymOnNonUpper && isUpper == false))
                        {
                            // whether it's part of the acronym depends on whether we're greedy
                            if (isTerm && config.GreedyAcronyms == false)
                                i -= 1; // handle that char again, in another state - not part of the acronym
                            if (i - ipos > 1) // single-char can't be an acronym
                            {
                                CopyTerm(input, ipos, output, ref opos, i - ipos, caseType, culture, true);
                                ipos = i;
                                state = isTerm ? StateWord : StateBreak;
                                if (state != StateBreak && separator != char.MinValue)
                                    output[opos++] = separator;
                            }
                            else if (isTerm)
                            {
                                state = StateWord;
                            }
                        }
                        else if (isUpper == false) // isTerm == true
                        {
                            // it's a term char and we don't cut...
                            // keep moving forward as a word
                            state = StateWord;
                        }
                        break;

                    // within a term / uppercase = could be a word or an acronym
                    case StateUp:
                        if (isTerm)
                        {
                            // add that char to the term and pick word or acronym
                            state = isUpper ? StateAcronym : StateWord;
                        }
                        else
                        {
                            // single char, copy then break
                            CopyTerm(input, ipos, output, ref opos, 1, caseType, culture, false);
                            state = StateBreak;
                        }
                        break;

                    default:
                        throw new Exception("Invalid state.");
                }
            }

            // end of input: flush the term that was being read, if any
            switch (state)
            {
                case StateBreak:
                    break;

                case StateWord:
                    CopyTerm(input, ipos, output, ref opos, input.Length - ipos, caseType, culture, false);
                    break;

                // a pending uppercase term is copied as an acronym
                case StateAcronym:
                case StateUp:
                    CopyTerm(input, ipos, output, ref opos, input.Length - ipos, caseType, culture, true);
                    break;

                default:
                    throw new Exception("Invalid state.");
            }

            return new string(output, 0, opos);
        }

        /// <summary>
        /// Copies one term of the input string into the output buffer, applying the requested casing.
        /// </summary>
        /// <param name="input">The string containing the term.</param>
        /// <param name="ipos">The index, in <paramref name="input"/>, of the first char of the term.</param>
        /// <param name="output">The buffer receiving the cased term.</param>
        /// <param name="opos">The index, in <paramref name="output"/>, where writing starts. It is advanced
        /// by the number of chars written. A value of zero means the term is the first one being written,
        /// which matters for camel and Umbraco casing.</param>
        /// <param name="len">The number of chars of the term.</param>
        /// <param name="caseType">The casing to apply to the term.</param>
        /// <param name="culture">The culture used for case conversions.</param>
        /// <param name="isAcronym">A value indicating whether the term is an acronym.</param>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="caseType"/> is not one of
        /// Unchanged, LowerCase, UpperCase, CamelCase, PascalCase or UmbracoCase.</exception>
        /// <remarks>
        /// <para>Supports surrogate pairs in the input string: a leading surrogate pair is cased as one unit.</para>
        /// <para>An acronym is copied unchanged when casing is Umbraco case, when casing is Pascal case and the
        /// acronym is at most two chars long, or when casing is camel case, the acronym is at most two chars long
        /// and it is not the first term written.</para>
        /// </remarks>
        internal void CopyTerm(string input, int ipos, char[] output, ref int opos, int len,
            CleanStringType caseType, CultureInfo culture, bool isAcronym)
        {
            var term = input.Substring(ipos, len);
#if WRTCONS
            Console.WriteLine("TERM \"{0}\" {1} {2}",
                term,
                isAcronym ? "acronym" : "word",
                caseType);
#endif
            if (isAcronym)
            {
                if ((caseType == CleanStringType.CamelCase && len <= 2 && opos > 0) ||
                        (caseType == CleanStringType.PascalCase && len <= 2) ||
                        (caseType == CleanStringType.UmbracoCase))
                    caseType = CleanStringType.Unchanged;
            }

            // note: MSDN seems to imply that ToUpper or ToLower preserve the length
            // of the string, but that this behavior is not guaranteed and could change.
            // so always advance opos by the length of what was actually written.

            char c;
            int i;
            string s;
            bool first;
            switch (caseType)
            {
                case CleanStringType.Unchanged:
                    term.CopyTo(0, output, opos, len);
                    opos += len;
                    break;

                case CleanStringType.LowerCase:
                    term = term.ToLower(culture);
                    term.CopyTo(0, output, opos, term.Length);
                    opos += term.Length;
                    break;

                case CleanStringType.UpperCase:
                    term = term.ToUpper(culture);
                    term.CopyTo(0, output, opos, term.Length);
                    opos += term.Length;
                    break;

                case CleanStringType.CamelCase:
                    // head: lower if first term written, upper otherwise - tail: lower
                    first = opos == 0;
                    i = 1;
                    if (char.IsSurrogatePair(term, 0))
                    {
                        // term already starts at ipos in input, so the pair is at index 0 of term
                        s = term.Substring(0, 2);
                        s = first ? s.ToLower(culture) : s.ToUpper(culture);
                        s.CopyTo(0, output, opos, s.Length);
                        opos += s.Length;
                        i++; // surrogate pair len is 2
                    }
                    else
                    {
                        c = term[0];
                        output[opos++] = first ? char.ToLower(c, culture) : char.ToUpper(c, culture);
                    }
                    if (len > i)
                    {
                        term = term.Substring(i).ToLower(culture);
                        term.CopyTo(0, output, opos, term.Length);
                        opos += term.Length;
                    }
                    break;

                case CleanStringType.PascalCase:
                    // head: upper - tail: lower
                    i = 1;
                    if (char.IsSurrogatePair(term, 0))
                    {
                        // term already starts at ipos in input, so the pair is at index 0 of term
                        s = term.Substring(0, 2);
                        s = s.ToUpper(culture);
                        s.CopyTo(0, output, opos, s.Length);
                        opos += s.Length;
                        i++; // surrogate pair len is 2
                    }
                    else
                    {
                        c = term[0];
                        output[opos++] = char.ToUpper(c, culture);
                    }
                    if (len > i)
                    {
                        term = term.Substring(i).ToLower(culture);
                        term.CopyTo(0, output, opos, term.Length);
                        opos += term.Length;
                    }
                    break;

                case CleanStringType.UmbracoCase:
                    // head: unchanged if first term written, upper otherwise - tail: unchanged
                    first = opos == 0;
                    i = 1;
                    if (char.IsSurrogatePair(term, 0))
                    {
                        // term already starts at ipos in input, so the pair is at index 0 of term
                        s = term.Substring(0, 2);
                        s = first ? s : s.ToUpper(culture);
                        s.CopyTo(0, output, opos, s.Length);
                        opos += s.Length;
                        i++; // surrogate pair len is 2
                    }
                    else
                    {
                        c = term[0];
                        output[opos++] = first ? c : char.ToUpper(c, culture);
                    }
                    if (len > i)
                    {
                        term = term.Substring(i);
                        term.CopyTo(0, output, opos, term.Length);
                        opos += term.Length;
                    }
                    break;

                default:
                    throw new ArgumentOutOfRangeException("caseType");
            }
        }

        #endregion

        #region SplitPascalCasing

        /// <summary>
        /// Splits a Pascal-cased string into a phrase separated by a separator.
        /// </summary>
        /// <param name="text">The text to split.</param>
        /// <param name="separator">The separator to insert between words.</param>
        /// <returns>The split text.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
        /// <remarks>
        /// <para>A separator is inserted before an uppercase char that follows a non-uppercase char, and
        /// before the last char of a run of several uppercase chars when that run is followed by a
        /// non-uppercase char (eg "HTMLParser" becomes "HTML Parser").</para>
        /// <para>Uppercase detection relies on <c>char.IsUpper</c> applied to individual chars, so
        /// surrogate pairs are not supported at the moment.</para>
        /// </remarks>
        public virtual string SplitPascalCasing(string text, char separator)
        {
            // be safe
            if (text == null)
                throw new ArgumentNullException("text");

            // nothing to split
            if (text.Length == 0)
                return string.Empty;

            // worst case is one separator between each pair of chars
            var output = new char[text.Length * 2];
            var opos = 0;

            // 1-based position of the first char of the current uppercase run,
            // or zero when not within an uppercase run
            var runStart = char.IsUpper(text[0]) ? 1 : 0;

            // chars are written one step behind: looking at the current char tells
            // whether the separator goes before or after the previous char
            for (var i = 1; i < text.Length; i++)
            {
                var previous = text[i - 1];

                if (char.IsUpper(text[i]))
                {
                    output[opos++] = previous;
                    if (runStart == 0)
                    {
                        // non-upper followed by upper: a new word begins at the current char
                        output[opos++] = separator;
                        runStart = i + 1;
                    }
                }
                else
                {
                    if (runStart > 0)
                    {
                        // the uppercase run ends here: if it had more than one char,
                        // its last char (previous) begins a new word
                        if (runStart < i)
                            output[opos++] = separator;
                        runStart = 0;
                    }
                    output[opos++] = previous;
                }
            }

            // the last char is always pending, whatever its value
            output[opos++] = text[text.Length - 1];
            return new string(output, 0, opos);
        }

        #endregion

        #region ReplaceMany

        /// <summary>
        /// Returns a new string in which all occurrences of specified strings are replaced by other specified strings.
        /// </summary>
        /// <param name="text">The string to filter.</param>
        /// <param name="replacements">The replacements definition, mapping each string to find to its substitute.</param>
        /// <returns>The filtered string.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> or <paramref name="replacements"/> is null.</exception>
        public virtual string ReplaceMany(string text, IDictionary<string, string> replacements)
        {
            // be safe
            if (text == null)
                throw new ArgumentNullException("text");
            if (replacements == null)
                throw new ArgumentNullException("replacements");

            // A plain sequence of string.Replace calls: on short strings with a handful of
            // replacements (our expected case) this was measured to beat custom state
            // machines and regexes, while remaining safe and checked code.

            // Replacements are chained: each one runs on the result of the previous one,
            // in the enumeration order of the dictionary, so what one replacement produces
            // can in turn be replaced by a later one.

            var filtered = text;
            foreach (var pair in replacements)
                filtered = filtered.Replace(pair.Key, pair.Value);
            return filtered;
        }

        /// <summary>
        /// Returns a new string in which all occurrences of specified characters are replaced by a specified character.
        /// </summary>
        /// <param name="text">The string to filter.</param>
        /// <param name="chars">The characters to replace.</param>
        /// <param name="replacement">The replacement character.</param>
        /// <returns>The filtered string.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> or <paramref name="chars"/> is null.</exception>
        public virtual string ReplaceMany(string text, char[] chars, char replacement)
        {
            // be safe
            if (text == null)
                throw new ArgumentNullException("text");
            if (chars == null)
                throw new ArgumentNullException("chars");

            // one string.Replace per char, each applied to the result of the previous one
            var filtered = text;
            foreach (var unwanted in chars)
                filtered = filtered.Replace(unwanted, replacement);
            return filtered;
        }

        #endregion
    }
}