931 lines
39 KiB
C#
931 lines
39 KiB
C#
using System;
|
|
using System.Collections.Generic;
|
|
using System.Diagnostics;
|
|
using System.IO;
|
|
using System.Linq;
|
|
using System.Globalization;
|
|
using Umbraco.Core.Configuration;
|
|
|
|
namespace Umbraco.Core.Strings
|
|
{
|
|
/// <summary>
|
|
/// New default implementation of string functions for short strings such as aliases or url segments.
|
|
/// </summary>
|
|
/// <remarks>
|
|
/// <para>Not optimized to work on large bodies of text.</para>
|
|
/// <para>Meant to replace <c>LegacyShortStringHelper</c> where/when backward compatibility is not an issue.</para>
|
|
/// <para>NOTE: pre-filters run _before_ the string is re-encoded.</para>
|
|
/// </remarks>
|
|
public class DefaultShortStringHelper : IShortStringHelper
|
|
{
|
|
#region Ctor and vars
|
|
|
|
public DefaultShortStringHelper()
|
|
{
|
|
InitializeLegacyUrlReplaceCharacters();
|
|
}
|
|
|
|
/// <summary>
|
|
/// Freezes the helper so it can prevents its configuration from being modified.
|
|
/// </summary>
|
|
/// <remarks>Will be called by <c>ShortStringHelperResolver</c> when resolution freezes.</remarks>
|
|
public void Freeze()
|
|
{
|
|
_frozen = true;
|
|
}
|
|
|
|
// see notes for CleanAsciiString
|
|
//// beware! the order is quite important here!
|
|
//const string ValidStringCharactersSource = "abcdefghijklmnopqrstuvwxyz0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
|
|
//readonly static char[] ValidStringCharacters;
|
|
|
|
private CultureInfo _defaultCulture = CultureInfo.InvariantCulture;
|
|
private bool _frozen;
|
|
private readonly Dictionary<CultureInfo, Dictionary<CleanStringType, Config>> _configs = new Dictionary<CultureInfo, Dictionary<CleanStringType, Config>>();
|
|
|
|
// see notes for CleanAsciiString
|
|
//static DefaultShortStringHelper()
|
|
//{
|
|
// ValidStringCharacters = ValidStringCharactersSource.ToCharArray();
|
|
//}
|
|
|
|
#endregion
|
|
|
|
#region Filters
|
|
|
|
private readonly Dictionary<string, string> _urlReplaceCharacters = new Dictionary<string, string>();
|
|
|
|
private void InitializeLegacyUrlReplaceCharacters()
|
|
{
|
|
var replaceChars = UmbracoSettings.UrlReplaceCharacters;
|
|
if (replaceChars == null) return;
|
|
var nodes = replaceChars.SelectNodes("char");
|
|
if (nodes == null) return;
|
|
foreach (var node in nodes.Cast<System.Xml.XmlNode>())
|
|
{
|
|
var attributes = node.Attributes;
|
|
if (attributes == null) continue;
|
|
var org = attributes.GetNamedItem("org");
|
|
if (org != null && org.Value != "")
|
|
_urlReplaceCharacters[org.Value] = XmlHelper.GetNodeValue(node);
|
|
}
|
|
}
|
|
|
|
private static bool UrlReplacingToAscii
|
|
{
|
|
get
|
|
{
|
|
var replaceChars = UmbracoSettings.UrlReplaceCharacters;
|
|
if (replaceChars == null || replaceChars.Attributes == null) return false;
|
|
var attr = replaceChars.Attributes.GetNamedItem("toAscii");
|
|
return attr != null && attr.Value == "true";
|
|
}
|
|
}
|
|
|
|
/// <summary>
|
|
/// Returns a new string in which characters have been replaced according to the Umbraco settings UrlReplaceCharacters.
|
|
/// </summary>
|
|
/// <param name="s">The string to filter.</param>
|
|
/// <returns>The filtered string.</returns>
|
|
public string ApplyUrlReplaceCharacters(string s)
|
|
{
|
|
return s.ReplaceMany(_urlReplaceCharacters);
|
|
}
|
|
|
|
// ok to be static here because it's not configureable in any way
|
|
private static readonly char[] InvalidFileNameChars =
|
|
Path.GetInvalidFileNameChars()
|
|
.Union("!*'();:@&=+$,/?%#[]-~{}\"<>\\^`| ".ToCharArray())
|
|
.Distinct()
|
|
.ToArray();
|
|
|
|
public static bool IsValidFileNameChar(char c)
|
|
{
|
|
return InvalidFileNameChars.Contains(c) == false;
|
|
}
|
|
|
|
#endregion
|
|
|
|
#region Configuration
|
|
|
|
private void EnsureNotFrozen()
|
|
{
|
|
if (_frozen)
|
|
throw new InvalidOperationException("Cannot configure the helper once it is frozen.");
|
|
}
|
|
|
|
/// <summary>
|
|
/// Sets a default culture.
|
|
/// </summary>
|
|
/// <param name="culture">The default culture.</param>
|
|
/// <returns>The short string helper.</returns>
|
|
public DefaultShortStringHelper WithDefaultCulture(CultureInfo culture)
|
|
{
|
|
EnsureNotFrozen();
|
|
_defaultCulture = culture;
|
|
return this;
|
|
}
|
|
|
|
public DefaultShortStringHelper WithConfig(Config config)
|
|
{
|
|
return WithConfig(_defaultCulture, CleanStringType.RoleMask, config);
|
|
}
|
|
|
|
public DefaultShortStringHelper WithConfig(CleanStringType stringRole, Config config)
|
|
{
|
|
return WithConfig(_defaultCulture, stringRole, config);
|
|
}
|
|
|
|
public DefaultShortStringHelper WithConfig(CultureInfo culture, CleanStringType stringRole, Config config)
|
|
{
|
|
if (config == null)
|
|
throw new ArgumentNullException("config");
|
|
|
|
EnsureNotFrozen();
|
|
if (_configs.ContainsKey(culture) == false)
|
|
_configs[culture] = new Dictionary<CleanStringType, Config>();
|
|
_configs[culture][stringRole] = config.Clone(); // clone so it can't be changed
|
|
return this;
|
|
}
|
|
|
|
/// <summary>
|
|
/// Sets the default configuration.
|
|
/// </summary>
|
|
/// <returns>The short string helper.</returns>
|
|
public DefaultShortStringHelper WithDefaultConfig()
|
|
{
|
|
return WithConfig(CleanStringType.UrlSegment, new Config
|
|
{
|
|
PreFilter = ApplyUrlReplaceCharacters,
|
|
IsTerm = (c, leading) => char.IsLetterOrDigit(c) || c == '_', // letter, digit or underscore
|
|
StringType = (UrlReplacingToAscii ? CleanStringType.Ascii : CleanStringType.Utf8) | CleanStringType.LowerCase,
|
|
BreakTermsOnUpper = false,
|
|
Separator = '-'
|
|
}).WithConfig(CleanStringType.FileName, new Config
|
|
{
|
|
PreFilter = ApplyUrlReplaceCharacters,
|
|
IsTerm = (c, leading) => char.IsLetterOrDigit(c) || c == '_', // letter, digit or underscore
|
|
StringType = CleanStringType.Utf8 | CleanStringType.LowerCase,
|
|
BreakTermsOnUpper = false,
|
|
Separator = '-'
|
|
}).WithConfig(CleanStringType.Alias, new Config
|
|
{
|
|
PreFilter = ApplyUrlReplaceCharacters,
|
|
IsTerm = (c, leading) => leading
|
|
? char.IsLetter(c) // only letters
|
|
: (char.IsLetterOrDigit(c) || c == '_'), // letter, digit or underscore
|
|
StringType = CleanStringType.Ascii | CleanStringType.UmbracoCase,
|
|
BreakTermsOnUpper = false
|
|
});
|
|
}
|
|
|
|
public sealed class Config
|
|
{
|
|
public Config()
|
|
{
|
|
StringType = CleanStringType.Utf8 | CleanStringType.Unchanged;
|
|
PreFilter = null;
|
|
IsTerm = (c, leading) => leading ? char.IsLetter(c) : char.IsLetterOrDigit(c);
|
|
BreakTermsOnUpper = false;
|
|
CutAcronymOnNonUpper = false;
|
|
GreedyAcronyms = false;
|
|
Separator = Char.MinValue;
|
|
}
|
|
|
|
public Config Clone()
|
|
{
|
|
return new Config
|
|
{
|
|
PreFilter = PreFilter,
|
|
IsTerm = IsTerm,
|
|
StringType = StringType,
|
|
BreakTermsOnUpper = BreakTermsOnUpper,
|
|
CutAcronymOnNonUpper = CutAcronymOnNonUpper,
|
|
GreedyAcronyms = GreedyAcronyms,
|
|
Separator = Separator
|
|
};
|
|
}
|
|
|
|
public Func<string, string> PreFilter { get; set; }
|
|
public Func<char, bool, bool> IsTerm { get; set; }
|
|
|
|
public CleanStringType StringType { get; set; }
|
|
|
|
// indicate whether an uppercase within a term eg "fooBar" is to break
|
|
// into a new term, or to be considered as part of the current term
|
|
public bool BreakTermsOnUpper { get; set; }
|
|
|
|
// indicate whether a non-uppercase within an acronym eg "FOOBar" is to cut
|
|
// the acronym (at "B" or "a" depending on GreedyAcronyms) or to give
|
|
// up the acronym and treat the term as a word
|
|
public bool CutAcronymOnNonUpper { get; set; }
|
|
|
|
// indicates whether acronyms parsing is greedy ie whether "FOObar" is
|
|
// "FOO" + "bar" (greedy) or "FO" + "Obar" (non-greedy)
|
|
public bool GreedyAcronyms { get; set; }
|
|
|
|
// the separator char
|
|
// but then how can we tell we dont want any?
|
|
public char Separator { get; set; }
|
|
|
|
// extends the config
|
|
public CleanStringType StringTypeExtend(CleanStringType stringType)
|
|
{
|
|
var st = StringType;
|
|
foreach (var mask in new[] { CleanStringType.CaseMask, CleanStringType.CodeMask })
|
|
{
|
|
var a = stringType & mask;
|
|
if (a == 0) continue;
|
|
|
|
st = st & ~mask; // clear what we have
|
|
st = st | a; // set the new value
|
|
}
|
|
return st;
|
|
}
|
|
|
|
internal static readonly Config NotConfigured = new Config();
|
|
}
|
|
|
|
private Config GetConfig(CleanStringType stringType, CultureInfo culture)
|
|
{
|
|
stringType = stringType & CleanStringType.RoleMask;
|
|
|
|
Dictionary<CleanStringType, Config> config;
|
|
if (_configs.ContainsKey(culture))
|
|
{
|
|
config = _configs[culture];
|
|
if (config.ContainsKey(stringType)) // have we got a config for _that_ role?
|
|
return config[stringType];
|
|
if (config.ContainsKey(CleanStringType.RoleMask)) // have we got a generic config for _all_ roles?
|
|
return config[CleanStringType.RoleMask];
|
|
}
|
|
else if (_configs.ContainsKey(_defaultCulture))
|
|
{
|
|
config = _configs[_defaultCulture];
|
|
if (config.ContainsKey(stringType)) // have we got a config for _that_ role?
|
|
return config[stringType];
|
|
if (config.ContainsKey(CleanStringType.RoleMask)) // have we got a generic config for _all_ roles?
|
|
return config[CleanStringType.RoleMask];
|
|
}
|
|
|
|
return Config.NotConfigured;
|
|
}
|
|
|
|
#endregion
|
|
|
|
#region JavaScript
|
|
|
|
private const string SssjsFormat = @"
|
|
var UMBRACO_FORCE_SAFE_ALIAS = {0};
|
|
var UMBRACO_FORCE_SAFE_ALIAS_URL = '{1}';
|
|
var UMBRACO_FORCE_SAFE_ALIAS_TIMEOUT = 666;
|
|
var UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS = {{ }};
|
|
|
|
function getSafeAliasFromServer(value, callback) {{
|
|
$.getJSON(UMBRACO_FORCE_SAFE_ALIAS_URL + 'ToSafeAlias?value=' + encodeURIComponent(value), function(json) {{
|
|
if (json.alias) {{ callback(json.alias); }}
|
|
}});
|
|
}}
|
|
|
|
function getSafeAlias(id, value, immediate, callback) {{
|
|
if (!UMBRACO_FORCE_SAFE_ALIAS) {{
|
|
callback(value);
|
|
return;
|
|
}}
|
|
if (UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id]) clearTimeout(UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id]);
|
|
UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id] = setTimeout(function() {{
|
|
UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id] = null;
|
|
getSafeAliasFromServer(value, function(alias) {{ callback(alias); }});
|
|
}}, UMBRACO_FORCE_SAFE_ALIAS_TIMEOUT);
|
|
}}
|
|
|
|
function validateSafeAlias(id, value, immediate, callback) {{
|
|
if (!UMBRACO_FORCE_SAFE_ALIAS) {{
|
|
callback(true);
|
|
return;
|
|
}}
|
|
if (UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id]) clearTimeout(UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id]);
|
|
UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id] = setTimeout(function() {{
|
|
UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id] = null;
|
|
getSafeAliasFromServer(value, function(alias) {{ callback(value.toLowerCase() == alias.toLowerCase()); }});
|
|
}}, UMBRACO_FORCE_SAFE_ALIAS_TIMEOUT);
|
|
}}
|
|
";
|
|
|
|
/// <summary>
|
|
/// Gets the JavaScript code defining client-side short string services.
|
|
/// </summary>
|
|
public string GetShortStringServicesJavaScript(string controllerPath)
|
|
{
|
|
return string.Format(SssjsFormat,
|
|
UmbracoSettings.ForceSafeAliases ? "true" : "false", controllerPath);
|
|
}
|
|
|
|
#endregion
|
|
|
|
#region IShortStringHelper CleanFor...
|
|
|
|
/// <summary>
|
|
/// Cleans a string to produce a string that can safely be used in an alias.
|
|
/// </summary>
|
|
/// <param name="text">The text to filter.</param>
|
|
/// <returns>The safe alias.</returns>
|
|
/// <remarks>
|
|
/// <para>The string will be cleaned in the context of the default culture.</para>
|
|
/// <para>Safe aliases are Ascii only.</para>
|
|
/// </remarks>
|
|
public virtual string CleanStringForSafeAlias(string text)
|
|
{
|
|
return CleanStringForSafeAlias(text, _defaultCulture);
|
|
}
|
|
|
|
/// <summary>
|
|
/// Cleans a string, in the context of a specified culture, to produce a string that can safely be used in an alias.
|
|
/// </summary>
|
|
/// <param name="text">The text to filter.</param>
|
|
/// <param name="culture">The culture.</param>
|
|
/// <returns>The safe alias.</returns>
|
|
/// <remarks>
|
|
/// <para>Safe aliases are Ascii only.</para>
|
|
/// </remarks>
|
|
public virtual string CleanStringForSafeAlias(string text, CultureInfo culture)
|
|
{
|
|
return CleanString(text, CleanStringType.Alias, culture);
|
|
}
|
|
|
|
/// <summary>
|
|
/// Cleans a string to produce a string that can safely be used in an url segment.
|
|
/// </summary>
|
|
/// <param name="text">The text to filter.</param>
|
|
/// <returns>The safe url segment.</returns>
|
|
/// <remarks>
|
|
/// <para>The string will be cleaned in the context of the default culture.</para>
|
|
/// <para>Url segments are Ascii only (no accents...).</para>
|
|
/// </remarks>
|
|
public virtual string CleanStringForUrlSegment(string text)
|
|
{
|
|
return CleanStringForUrlSegment(text, _defaultCulture);
|
|
}
|
|
|
|
/// <summary>
|
|
/// Cleans a string, in the context of a specified culture, to produce a string that can safely be used in an url segment.
|
|
/// </summary>
|
|
/// <param name="text">The text to filter.</param>
|
|
/// <param name="culture">The culture.</param>
|
|
/// <returns>The safe url segment.</returns>
|
|
/// <remarks>
|
|
/// <para>Url segments are Ascii only (no accents...).</para>
|
|
/// </remarks>
|
|
public virtual string CleanStringForUrlSegment(string text, CultureInfo culture)
|
|
{
|
|
return CleanString(text, CleanStringType.UrlSegment, culture);
|
|
}
|
|
|
|
/// <summary>
|
|
/// Cleans a string, in the context of the default culture, to produce a string that can safely be used as a filename,
|
|
/// both internally (on disk) and externally (as a url).
|
|
/// </summary>
|
|
/// <param name="text">The text to filter.</param>
|
|
/// <returns>The safe filename.</returns>
|
|
/// <remarks>Legacy says this was used to "overcome an issue when Umbraco is used in IE in an intranet environment" but that issue is not documented.</remarks>
|
|
public virtual string CleanStringForSafeFileName(string text)
|
|
{
|
|
return CleanStringForSafeFileName(text, _defaultCulture);
|
|
}
|
|
|
|
/// <summary>
|
|
/// Cleans a string to produce a string that can safely be used as a filename,
|
|
/// both internally (on disk) and externally (as a url).
|
|
/// </summary>
|
|
/// <param name="text">The text to filter.</param>
|
|
/// <param name="culture">The culture.</param>
|
|
/// <returns>The safe filename.</returns>
|
|
public virtual string CleanStringForSafeFileName(string text, CultureInfo culture)
|
|
{
|
|
if (string.IsNullOrWhiteSpace(text))
|
|
return string.Empty;
|
|
|
|
text = text.ReplaceMany(Path.GetInvalidFileNameChars(), '-');
|
|
|
|
var name = Path.GetFileNameWithoutExtension(text);
|
|
var ext = Path.GetExtension(text); // includes the dot, empty if no extension
|
|
|
|
Debug.Assert(name != null, "name != null");
|
|
if (name.Length > 0)
|
|
name = CleanString(name, CleanStringType.FileName, culture);
|
|
Debug.Assert(ext != null, "ext != null");
|
|
if (ext.Length > 0)
|
|
ext = CleanString(ext.Substring(1), CleanStringType.FileName, culture);
|
|
|
|
return ext.Length > 0 ? (name + "." + ext) : name;
|
|
}
|
|
|
|
#endregion
|
|
|
|
#region CleanString
|
|
|
|
// MS rules & guidelines:
|
|
// - Do capitalize both characters of two-character acronyms, except the first word of a camel-cased identifier.
|
|
// eg "DBRate" (pascal) or "ioHelper" (camel) - "SpecialDBRate" (pascal) or "specialIOHelper" (camel)
|
|
// - Do capitalize only the first character of acronyms with three or more characters, except the first word of a camel-cased identifier.
|
|
// eg "XmlWriter (pascal) or "htmlReader" (camel) - "SpecialXmlWriter" (pascal) or "specialHtmlReader" (camel)
|
|
// - Do not capitalize any of the characters of any acronyms, whatever their length, at the beginning of a camel-cased identifier.
|
|
// eg "xmlWriter" or "dbWriter" (camel)
|
|
//
|
|
// Our additional stuff:
|
|
// - Leading digits are removed.
|
|
// - Many consecutive separators are folded into one unique separator.
|
|
|
|
const byte StateBreak = 1;
|
|
const byte StateUp = 2;
|
|
const byte StateWord = 3;
|
|
const byte StateAcronym = 4;
|
|
|
|
/// <summary>
|
|
/// Cleans a string.
|
|
/// </summary>
|
|
/// <param name="text">The text to clean.</param>
|
|
/// <param name="stringType">A flag indicating the target casing and encoding of the string. By default,
|
|
/// strings are cleaned up to camelCase and Ascii.</param>
|
|
/// <returns>The clean string.</returns>
|
|
/// <remarks>The string is cleaned in the context of the default culture.</remarks>
|
|
public string CleanString(string text, CleanStringType stringType)
|
|
{
|
|
return CleanString(text, stringType, _defaultCulture, null);
|
|
}
|
|
|
|
/// <summary>
|
|
/// Cleans a string, using a specified separator.
|
|
/// </summary>
|
|
/// <param name="text">The text to clean.</param>
|
|
/// <param name="stringType">A flag indicating the target casing and encoding of the string. By default,
|
|
/// strings are cleaned up to camelCase and Ascii.</param>
|
|
/// <param name="separator">The separator.</param>
|
|
/// <returns>The clean string.</returns>
|
|
/// <remarks>The string is cleaned in the context of the default culture.</remarks>
|
|
public string CleanString(string text, CleanStringType stringType, char separator)
|
|
{
|
|
return CleanString(text, stringType, _defaultCulture, separator);
|
|
}
|
|
|
|
/// <summary>
|
|
/// Cleans a string in the context of a specified culture.
|
|
/// </summary>
|
|
/// <param name="text">The text to clean.</param>
|
|
/// <param name="stringType">A flag indicating the target casing and encoding of the string. By default,
|
|
/// strings are cleaned up to camelCase and Ascii.</param>
|
|
/// <param name="culture">The culture.</param>
|
|
/// <returns>The clean string.</returns>
|
|
public string CleanString(string text, CleanStringType stringType, CultureInfo culture)
|
|
{
|
|
return CleanString(text, stringType, culture, null);
|
|
}
|
|
|
|
/// <summary>
|
|
/// Cleans a string in the context of a specified culture, using a specified separator.
|
|
/// </summary>
|
|
/// <param name="text">The text to clean.</param>
|
|
/// <param name="stringType">A flag indicating the target casing and encoding of the string. By default,
|
|
/// strings are cleaned up to camelCase and Ascii.</param>
|
|
/// <param name="separator">The separator.</param>
|
|
/// <param name="culture">The culture.</param>
|
|
/// <returns>The clean string.</returns>
|
|
public string CleanString(string text, CleanStringType stringType, char separator, CultureInfo culture)
|
|
{
|
|
return CleanString(text, stringType, culture, separator);
|
|
}
|
|
|
|
protected virtual string CleanString(string text, CleanStringType stringType, CultureInfo culture, char? separator)
|
|
{
|
|
// be safe
|
|
if (text == null)
|
|
throw new ArgumentNullException("text");
|
|
if (culture == null)
|
|
throw new ArgumentNullException("culture");
|
|
|
|
// get config
|
|
var config = GetConfig(stringType, culture);
|
|
stringType = config.StringTypeExtend(stringType);
|
|
|
|
// apply defaults
|
|
if ((stringType & CleanStringType.CaseMask) == CleanStringType.None)
|
|
stringType |= CleanStringType.CamelCase;
|
|
if ((stringType & CleanStringType.CodeMask) == CleanStringType.None)
|
|
stringType |= CleanStringType.Ascii;
|
|
|
|
// use configured unless specified
|
|
separator = separator ?? config.Separator;
|
|
|
|
// apply pre-filter
|
|
if (config.PreFilter != null)
|
|
text = config.PreFilter(text);
|
|
|
|
// apply replacements
|
|
//if (config.Replacements != null)
|
|
// text = ReplaceMany(text, config.Replacements);
|
|
|
|
// recode
|
|
var codeType = stringType & CleanStringType.CodeMask;
|
|
text = codeType == CleanStringType.Ascii
|
|
? Utf8ToAsciiConverter.ToAsciiString(text)
|
|
: RemoveSurrogatePairs(text);
|
|
|
|
// clean
|
|
text = CleanCodeString(text, stringType, separator.Value, culture, config);
|
|
|
|
return text;
|
|
}
|
|
|
|
private static string RemoveSurrogatePairs(string text)
|
|
{
|
|
var input = text.ToCharArray();
|
|
var output = new char[input.Length];
|
|
var opos = 0;
|
|
|
|
for (var ipos = 0; ipos < input.Length; ipos++)
|
|
{
|
|
var c = input[ipos];
|
|
if (char.IsSurrogate(c)) // ignore high surrogate
|
|
{
|
|
ipos++; // and skip low surrogate
|
|
output[opos++] = '?';
|
|
}
|
|
else
|
|
{
|
|
output[opos++] = c;
|
|
}
|
|
}
|
|
|
|
return new string(output, 0, opos);
|
|
}
|
|
|
|
// here was a subtle, ascii-optimized version of the cleaning code, and I was
|
|
// very proud of it until benchmarking showed it was an order of magnitude slower
|
|
// that the utf8 version. Micro-optimizing sometimes isn't such a good idea.
|
|
|
|
// note: does NOT support surrogate pairs in text
|
|
internal string CleanCodeString(string text, CleanStringType caseType, char separator, CultureInfo culture, Config config)
|
|
{
|
|
int opos = 0, ipos = 0;
|
|
var state = StateBreak;
|
|
|
|
caseType &= CleanStringType.CaseMask;
|
|
|
|
// if we apply global ToUpper or ToLower to text here
|
|
// then we cannot break words on uppercase chars
|
|
var input = text;
|
|
|
|
// it's faster to use an array than a StringBuilder
|
|
var ilen = input.Length;
|
|
var output = new char[ilen * 2]; // twice the length should be OK in all cases
|
|
|
|
for (var i = 0; i < ilen; i++)
|
|
{
|
|
var c = input[i];
|
|
// leading as long as StateBreak and ipos still zero
|
|
var leading = state == StateBreak && ipos == 0;
|
|
var isTerm = config.IsTerm(c, leading);
|
|
|
|
//var isDigit = char.IsDigit(c);
|
|
var isUpper = char.IsUpper(c); // false for digits, symbols...
|
|
//var isLower = char.IsLower(c); // false for digits, symbols...
|
|
|
|
// what should I do with surrogates?
|
|
// no idea, really, so they are not supported at the moment
|
|
var isPair = char.IsSurrogate(c);
|
|
if (isPair)
|
|
throw new NotSupportedException("Surrogate pairs are not supported.");
|
|
|
|
//Console.WriteLine("CHAR '{0}' {1} {2} - {3} - {4}/{5} {6}",
|
|
// c,
|
|
// isTerm ? "term" : "!term", isUpper ? "upper" : "!upper",
|
|
// state,
|
|
// i, ipos, leading ? "leading" : "!leading");
|
|
|
|
switch (state)
|
|
{
|
|
// within a break
|
|
case StateBreak:
|
|
// begin a new term if char is a term char,
|
|
// and ( pos > 0 or it's also a valid leading char )
|
|
if (isTerm)
|
|
{
|
|
ipos = i;
|
|
if (opos > 0 && separator != char.MinValue)
|
|
output[opos++] = separator;
|
|
state = isUpper ? StateUp : StateWord;
|
|
}
|
|
break;
|
|
|
|
// within a term / word
|
|
case StateWord:
|
|
// end a term if char is not a term char,
|
|
// or ( it's uppercase and we break terms on uppercase)
|
|
if (isTerm == false || (config.BreakTermsOnUpper && isUpper))
|
|
{
|
|
CopyTerm(input, ipos, output, ref opos, i - ipos, caseType, culture, false);
|
|
ipos = i;
|
|
state = isTerm ? StateUp : StateBreak;
|
|
if (state != StateBreak && separator != char.MinValue)
|
|
output[opos++] = separator;
|
|
}
|
|
break;
|
|
|
|
// within a term / acronym
|
|
case StateAcronym:
|
|
// end an acronym if char is not a term char,
|
|
// or if it's not uppercase / config
|
|
//Console.WriteLine("acro {0} {1}", c, (config.CutAcronymOnNonUpper && isUpper == false));
|
|
if (isTerm == false || (config.CutAcronymOnNonUpper && isUpper == false))
|
|
{
|
|
// whether it's part of the acronym depends on whether we're greedy
|
|
if (isTerm && config.GreedyAcronyms == false)
|
|
i -= 1; // handle that char again, in another state - not part of the acronym
|
|
if (i - ipos > 1) // single-char can't be an acronym
|
|
{
|
|
CopyTerm(input, ipos, output, ref opos, i - ipos, caseType, culture, true);
|
|
ipos = i;
|
|
state = isTerm ? StateWord : StateBreak;
|
|
if (state != StateBreak && separator != char.MinValue)
|
|
output[opos++] = separator;
|
|
}
|
|
else if (isTerm)
|
|
{
|
|
state = StateWord;
|
|
}
|
|
}
|
|
else if (isUpper == false) // isTerm == true
|
|
{
|
|
// it's a term char and we don't cut...
|
|
// keep moving forward as a word
|
|
state = StateWord;
|
|
}
|
|
break;
|
|
|
|
// within a term / uppercase = could be a word or an acronym
|
|
case StateUp:
|
|
if (isTerm)
|
|
{
|
|
// add that char to the term and pick word or acronym
|
|
state = isUpper ? StateAcronym : StateWord;
|
|
}
|
|
else
|
|
{
|
|
// single char, copy then break
|
|
CopyTerm(input, ipos, output, ref opos, 1, caseType, culture, false);
|
|
state = StateBreak;
|
|
}
|
|
break;
|
|
|
|
default:
|
|
throw new Exception("Invalid state.");
|
|
}
|
|
}
|
|
|
|
switch (state)
|
|
{
|
|
case StateBreak:
|
|
break;
|
|
|
|
case StateWord:
|
|
CopyTerm(input, ipos, output, ref opos, input.Length - ipos, caseType, culture, false);
|
|
break;
|
|
|
|
case StateAcronym:
|
|
case StateUp:
|
|
CopyTerm(input, ipos, output, ref opos, input.Length - ipos, caseType, culture, true);
|
|
break;
|
|
|
|
default:
|
|
throw new Exception("Invalid state.");
|
|
}
|
|
|
|
return new string(output, 0, opos);
|
|
}
|
|
|
|
// note: supports surrogate pairs in input string
|
|
internal void CopyTerm(string input, int ipos, char[] output, ref int opos, int len,
|
|
CleanStringType caseType, CultureInfo culture, bool isAcronym)
|
|
{
|
|
var term = input.Substring(ipos, len);
|
|
//Console.WriteLine("TERM \"{0}\" {1} {2}",
|
|
// term,
|
|
// isAcronym ? "acronym" : "word",
|
|
// caseType);
|
|
|
|
if (isAcronym)
|
|
{
|
|
if ((caseType == CleanStringType.CamelCase && len <= 2 && opos > 0) ||
|
|
(caseType == CleanStringType.PascalCase && len <= 2) ||
|
|
(caseType == CleanStringType.UmbracoCase))
|
|
caseType = CleanStringType.Unchanged;
|
|
}
|
|
|
|
// note: MSDN seems to imply that ToUpper or ToLower preserve the length
|
|
// of the string, but that this behavior is not guaranteed and could change.
|
|
|
|
char c;
|
|
int i;
|
|
string s;
|
|
switch (caseType)
|
|
{
|
|
//case CleanStringType.LowerCase:
|
|
//case CleanStringType.UpperCase:
|
|
case CleanStringType.Unchanged:
|
|
term.CopyTo(0, output, opos, len);
|
|
opos += len;
|
|
break;
|
|
|
|
case CleanStringType.LowerCase:
|
|
term = term.ToLower(culture);
|
|
term.CopyTo(0, output, opos, term.Length);
|
|
opos += term.Length;
|
|
break;
|
|
|
|
case CleanStringType.UpperCase:
|
|
term = term.ToUpper(culture);
|
|
term.CopyTo(0, output, opos, term.Length);
|
|
opos += term.Length;
|
|
break;
|
|
|
|
case CleanStringType.CamelCase:
|
|
c = term[0];
|
|
i = 1;
|
|
if (char.IsSurrogate(c))
|
|
{
|
|
s = term.Substring(ipos, 2);
|
|
s = opos == 0 ? s.ToLower(culture) : s.ToUpper(culture);
|
|
s.CopyTo(0, output, opos, s.Length);
|
|
opos += s.Length;
|
|
i++; // surrogate pair len is 2
|
|
}
|
|
else
|
|
{
|
|
output[opos] = opos++ == 0 ? char.ToLower(c, culture) : char.ToUpper(c, culture);
|
|
}
|
|
if (len > i)
|
|
{
|
|
term = term.Substring(i).ToLower(culture);
|
|
term.CopyTo(0, output, opos, term.Length);
|
|
opos += term.Length;
|
|
}
|
|
break;
|
|
|
|
case CleanStringType.PascalCase:
|
|
c = term[0];
|
|
i = 1;
|
|
if (char.IsSurrogate(c))
|
|
{
|
|
s = term.Substring(ipos, 2);
|
|
s = s.ToUpper(culture);
|
|
s.CopyTo(0, output, opos, s.Length);
|
|
opos += s.Length;
|
|
i++; // surrogate pair len is 2
|
|
}
|
|
else
|
|
{
|
|
output[opos++] = char.ToUpper(c, culture);
|
|
}
|
|
if (len > i)
|
|
{
|
|
term = term.Substring(i).ToLower(culture);
|
|
term.CopyTo(0, output, opos, term.Length);
|
|
opos += term.Length;
|
|
}
|
|
break;
|
|
|
|
case CleanStringType.UmbracoCase:
|
|
c = term[0];
|
|
i = 1;
|
|
if (char.IsSurrogate(c))
|
|
{
|
|
s = term.Substring(ipos, 2);
|
|
s = opos == 0 ? s : s.ToUpper(culture);
|
|
s.CopyTo(0, output, opos, s.Length);
|
|
opos += s.Length;
|
|
i++; // surrogate pair len is 2
|
|
}
|
|
else
|
|
{
|
|
output[opos] = opos++ == 0 ? c : char.ToUpper(c, culture);
|
|
}
|
|
if (len > i)
|
|
{
|
|
term = term.Substring(i);
|
|
term.CopyTo(0, output, opos, term.Length);
|
|
opos += term.Length;
|
|
}
|
|
break;
|
|
|
|
default:
|
|
throw new ArgumentOutOfRangeException("caseType");
|
|
}
|
|
}
|
|
|
|
#endregion
|
|
|
|
#region SplitPascalCasing
|
|
|
|
/// <summary>
|
|
/// Splits a Pascal-cased string into a phrase separated by a separator.
|
|
/// </summary>
|
|
/// <param name="text">The text to split.</param>
|
|
/// <param name="separator">The separator, which defaults to a whitespace.</param>
|
|
/// <returns>The splitted text.</returns>
|
|
/// <remarks>Supports Utf8 and Ascii strings, not Unicode strings.</remarks>
|
|
// NOTE does not support surrogates pairs at the moment
|
|
public virtual string SplitPascalCasing(string text, char separator)
|
|
{
|
|
// be safe
|
|
if (text == null)
|
|
throw new ArgumentNullException("text");
|
|
|
|
var input = text.ToCharArray();
|
|
var output = new char[input.Length * 2];
|
|
var opos = 0;
|
|
var a = input.Length > 0 ? input[0] : char.MinValue;
|
|
var upos = char.IsUpper(a) ? 1 : 0;
|
|
|
|
for (var i = 1; i < input.Length; i++)
|
|
{
|
|
var c = input[i];
|
|
if (char.IsUpper(c))
|
|
{
|
|
output[opos++] = a;
|
|
if (upos == 0)
|
|
{
|
|
if (opos > 0)
|
|
output[opos++] = separator;
|
|
upos = i + 1;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (upos > 0)
|
|
{
|
|
if (upos < i && opos > 0)
|
|
output[opos++] = separator;
|
|
upos = 0;
|
|
}
|
|
output[opos++] = a;
|
|
}
|
|
a = c;
|
|
}
|
|
if (a != char.MinValue)
|
|
output[opos++] = a;
|
|
return new string(output, 0, opos);
|
|
}
|
|
|
|
#endregion
|
|
|
|
#region ReplaceMany
|
|
|
|
/// <summary>
|
|
/// Returns a new string in which all occurences of specified strings are replaced by other specified strings.
|
|
/// </summary>
|
|
/// <param name="text">The string to filter.</param>
|
|
/// <param name="replacements">The replacements definition.</param>
|
|
/// <returns>The filtered string.</returns>
|
|
public virtual string ReplaceMany(string text, IDictionary<string, string> replacements)
|
|
{
|
|
// be safe
|
|
if (text == null)
|
|
throw new ArgumentNullException("text");
|
|
if (replacements == null)
|
|
throw new ArgumentNullException("replacements");
|
|
|
|
// Have done various tests, implementing my own "super fast" state machine to handle
|
|
// replacement of many items, or via regexes, but on short strings and not too
|
|
// many replacements (which prob. is going to be our case) nothing can beat this...
|
|
// (at least with safe and checked code -- we don't want unsafe/unchecked here)
|
|
|
|
// Note that it will do chained-replacements ie replaced items can be replaced
|
|
// in turn by another replacement (ie the order of replacements is important)
|
|
|
|
return replacements.Aggregate(text, (current, kvp) => current.Replace(kvp.Key, kvp.Value));
|
|
}
|
|
|
|
/// <summary>
|
|
/// Returns a new string in which all occurences of specified characters are replaced by a specified character.
|
|
/// </summary>
|
|
/// <param name="text">The string to filter.</param>
|
|
/// <param name="chars">The characters to replace.</param>
|
|
/// <param name="replacement">The replacement character.</param>
|
|
/// <returns>The filtered string.</returns>
|
|
public virtual string ReplaceMany(string text, char[] chars, char replacement)
|
|
{
|
|
// be safe
|
|
if (text == null)
|
|
throw new ArgumentNullException("text");
|
|
if (chars == null)
|
|
throw new ArgumentNullException("chars");
|
|
|
|
// see note above
|
|
|
|
return chars.Aggregate(text, (current, c) => current.Replace(c, replacement));
|
|
}
|
|
|
|
#endregion
|
|
}
|
|
}
|