2014-02-12 10:32:07 +01:00
// debugging
// define WRTCONS to write cleaning details & steps to console
// leave it wrapped within #if DEBUG to make sure it does not leak
// into RELEASE, see http://issues.umbraco.org/issue/U4-4199
#if DEBUG
#undef WRTCONS
#endif
using System ;
2013-02-07 13:30:50 -01:00
using System.Collections.Generic ;
2013-12-13 12:06:56 +01:00
using System.Diagnostics ;
2013-03-11 14:58:07 -01:00
using System.IO ;
2013-02-07 13:30:50 -01:00
using System.Linq ;
using System.Globalization ;
2014-02-12 10:32:07 +01:00
using System.Text ;
using System.Text.RegularExpressions ;
2013-02-19 06:26:58 -01:00
using Umbraco.Core.Configuration ;
2013-02-07 13:30:50 -01:00
namespace Umbraco.Core.Strings
{
/// <summary>
/// New default implementation of string functions for short strings such as aliases or url segments.
/// </summary>
/// <remarks>
/// <para>Not optimized to work on large bodies of text.</para>
/// <para>Meant to replace <c>LegacyShortStringHelper</c> where/when backward compatibility is not an issue.</para>
/// <para>NOTE: pre-filters run _before_ the string is re-encoded.</para>
/// </remarks>
2013-02-19 06:37:24 -01:00
public class DefaultShortStringHelper : IShortStringHelper
2013-02-07 13:30:50 -01:00
{
#region Ctor and vars
2013-12-13 12:06:56 +01:00
/// <summary>
/// Initializes a new instance of the <see cref="DefaultShortStringHelper"/> class.
/// </summary>
/// <remarks>Loads the legacy url replacement characters as part of construction.</remarks>
public DefaultShortStringHelper()
{
    this.InitializeLegacyUrlReplaceCharacters();
}
2013-02-07 13:30:50 -01:00
/// <summary>
/// Freezes the helper, preventing its configuration from being modified any further.
/// </summary>
/// <remarks>Will be called by <c>ShortStringHelperResolver</c> when resolution freezes.</remarks>
public void Freeze()
{
    // checked by EnsureNotFrozen before every configuration change
    this._frozen = true;
}
// see notes for CleanAsciiString
//// beware! the order is quite important here!
//const string ValidStringCharactersSource = "abcdefghijklmnopqrstuvwxyz0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
//readonly static char[] ValidStringCharacters;
// culture used by the overloads that do not take an explicit culture
private CultureInfo _defaultCulture = CultureInfo.InvariantCulture;

// once true, EnsureNotFrozen rejects any further configuration change
private bool _frozen;

// registered configurations, by culture, then by string role
private readonly Dictionary<CultureInfo, Dictionary<CleanStringType, Config>> _configs
    = new Dictionary<CultureInfo, Dictionary<CleanStringType, Config>>();
2013-02-07 13:30:50 -01:00
// see notes for CleanAsciiString
//static DefaultShortStringHelper()
//{
// ValidStringCharacters = ValidStringCharactersSource.ToCharArray();
//}
#endregion
2013-12-13 12:06:56 +01:00
#region Filters
2013-03-22 17:39:35 -01:00
2013-12-13 12:06:56 +01:00
// legacy url replacement map: string to replace => replacement
private readonly Dictionary<string, string> _urlReplaceCharacters = new Dictionary<string, string>();

/// <summary>
/// Populates the url replacement map from the <c>char</c> nodes of the
/// <c>UmbracoSettings.UrlReplaceCharacters</c> xml fragment.
/// </summary>
/// <remarks>
/// <para>Does nothing when the setting, or its <c>char</c> nodes, are missing.</para>
/// <para>Nodes without an <c>org</c> attribute, or with an empty one, are ignored.</para>
/// </remarks>
private void InitializeLegacyUrlReplaceCharacters()
{
    var settings = UmbracoSettings.UrlReplaceCharacters;
    var charNodes = settings == null ? null : settings.SelectNodes("char");
    if (charNodes == null) return;

    foreach (var charNode in charNodes.Cast<System.Xml.XmlNode>())
    {
        var attributes = charNode.Attributes;
        var original = attributes == null ? null : attributes.GetNamedItem("org");
        if (original == null || original.Value == "") continue;

        // whatever XmlHelper.GetNodeValue returns for the node becomes the replacement
        _urlReplaceCharacters[original.Value] = XmlHelper.GetNodeValue(charNode);
    }
}
2014-01-16 17:03:51 +01:00
/// <summary>
/// Gets a value indicating whether the <c>UmbracoSettings.UrlReplaceCharacters</c> setting
/// carries a <c>toAscii</c> attribute whose value is exactly <c>"true"</c>.
/// </summary>
private static bool UrlReplacingToAscii
{
    get
    {
        var settings = UmbracoSettings.UrlReplaceCharacters;
        if (settings == null) return false;

        var attributes = settings.Attributes;
        if (attributes == null) return false;

        var toAscii = attributes.GetNamedItem("toAscii");
        if (toAscii == null) return false;

        return toAscii.Value == "true";
    }
}
2013-03-22 17:39:35 -01:00
/// <summary>
/// Applies the replacements loaded from the Umbraco settings UrlReplaceCharacters to a string.
/// </summary>
/// <param name="s">The string to filter.</param>
/// <returns>A new string, as produced by <c>ReplaceMany</c> with the configured replacements.</returns>
public string ApplyUrlReplaceCharacters(string s)
{
    var replacements = _urlReplaceCharacters;
    return s.ReplaceMany(replacements);
}
// ok to be static here because it's not configurable in any way:
// the platform's invalid filename chars, plus chars that are troublesome in urls
private static readonly char[] InvalidFileNameChars =
    Path.GetInvalidFileNameChars()
        .Concat("!*'();:@&=+$,/?%#[]-~{}\"<>\\^`| ")
        .Distinct()
        .ToArray();

/// <summary>
/// Determines whether a char is acceptable in a filename.
/// </summary>
/// <param name="c">The char to test.</param>
/// <returns><c>true</c> if the char is not one of the invalid filename chars; otherwise <c>false</c>.</returns>
public static bool IsValidFileNameChar(char c)
{
    return Array.IndexOf(InvalidFileNameChars, c) < 0;
}
2014-02-12 23:27:48 +01:00
/// <summary>
/// Truncates a string so that it is at most <paramref name="length"/> chars long.
/// </summary>
/// <param name="text">The string to truncate.</param>
/// <param name="length">The maximum number of chars to keep.</param>
/// <returns>The original string if it is short enough, else its first <paramref name="length"/> chars.</returns>
public static string CutMaxLength(string text, int length)
{
    if (text.Length > length)
        return text.Substring(0, length);

    return text;
}
2013-03-22 17:39:35 -01:00
#endregion
2013-02-07 13:30:50 -01:00
#region Configuration
/// <summary>
/// Throws if the helper has been frozen.
/// </summary>
/// <exception cref="InvalidOperationException">The helper is frozen.</exception>
private void EnsureNotFrozen()
{
    if (_frozen == false) return;

    throw new InvalidOperationException("Cannot configure the helper once it is frozen.");
}
2013-12-13 12:06:56 +01:00
/// <summary>
/// Sets a default culture.
/// </summary>
/// <param name="culture">The default culture.</param>
/// <returns>The short string helper.</returns>
/// <exception cref="ArgumentNullException"><paramref name="culture"/> is <c>null</c>.</exception>
/// <exception cref="InvalidOperationException">The helper is frozen.</exception>
public DefaultShortStringHelper WithDefaultCulture(CultureInfo culture)
{
    // fail here rather than later: a null default culture would make every
    // culture-less overload throw when it reaches the configuration lookup
    if (culture == null)
        throw new ArgumentNullException("culture");

    EnsureNotFrozen();
    _defaultCulture = culture;
    return this;
}
2013-12-13 12:06:56 +01:00
/// <summary>
/// Registers a configuration for the default culture, that applies to all string roles.
/// </summary>
/// <param name="config">The configuration.</param>
/// <returns>The short string helper.</returns>
public DefaultShortStringHelper WithConfig(Config config)
{
    // RoleMask is the key GetConfig falls back to when a role has no configuration of its own
    const CleanStringType allRoles = CleanStringType.RoleMask;
    return WithConfig(_defaultCulture, allRoles, config);
}
2013-12-13 12:06:56 +01:00
/// <summary>
/// Registers a configuration for the default culture and a string role.
/// </summary>
/// <param name="stringRole">The string role.</param>
/// <param name="config">The configuration.</param>
/// <returns>The short string helper.</returns>
public DefaultShortStringHelper WithConfig(CleanStringType stringRole, Config config)
{
    var culture = _defaultCulture;
    return WithConfig(culture, stringRole, config);
}
2013-12-13 12:06:56 +01:00
/// <summary>
/// Registers a configuration for a culture and a string role.
/// </summary>
/// <param name="culture">The culture.</param>
/// <param name="stringRole">The string role.</param>
/// <param name="config">The configuration. It is cloned, so later changes to the instance have no effect.</param>
/// <returns>The short string helper.</returns>
/// <exception cref="ArgumentNullException"><paramref name="config"/> or <paramref name="culture"/> is <c>null</c>.</exception>
/// <exception cref="InvalidOperationException">The helper is frozen.</exception>
public DefaultShortStringHelper WithConfig(CultureInfo culture, CleanStringType stringRole, Config config)
{
    if (config == null)
        throw new ArgumentNullException("config");
    // report the actual parameter name instead of the dictionary's "key"
    if (culture == null)
        throw new ArgumentNullException("culture");

    EnsureNotFrozen();

    // single lookup; create the per-culture map on first use
    Dictionary<CleanStringType, Config> roleConfigs;
    if (_configs.TryGetValue(culture, out roleConfigs) == false)
    {
        roleConfigs = new Dictionary<CleanStringType, Config>();
        _configs[culture] = roleConfigs;
    }

    roleConfigs[stringRole] = config.Clone(); // clone so it can't be changed
    return this;
}
2013-12-13 12:06:56 +01:00
/// <summary>
/// Sets the default configuration.
/// </summary>
/// <remarks>Registers, for the default culture, one configuration for each of the
/// <c>UrlSegment</c>, <c>FileName</c>, <c>Alias</c> and <c>ConvertCase</c> roles.</remarks>
/// <returns>The short string helper.</returns>
public DefaultShortStringHelper WithDefaultConfig()
{
    // letter, digit or underscore
    Func<char, bool, bool> letterDigitOrUnderscore =
        (c, leading) => char.IsLetterOrDigit(c) || c == '_';

    // url segments are Ascii only when the settings ask for it
    var urlSegmentCode = UrlReplacingToAscii ? CleanStringType.Ascii : CleanStringType.Utf8;

    var urlSegment = new Config
    {
        PreFilter = ApplyUrlReplaceCharacters,
        PostFilter = x => CutMaxLength(x, 240),
        IsTerm = letterDigitOrUnderscore,
        StringType = urlSegmentCode | CleanStringType.LowerCase,
        BreakTermsOnUpper = false,
        Separator = '-'
    };

    var fileName = new Config
    {
        PreFilter = ApplyUrlReplaceCharacters,
        IsTerm = letterDigitOrUnderscore,
        StringType = CleanStringType.Utf8 | CleanStringType.LowerCase,
        BreakTermsOnUpper = false,
        Separator = '-'
    };

    var alias = new Config
    {
        PreFilter = ApplyUrlReplaceCharacters,
        // leading char: only letters - then: letter, digit or underscore
        IsTerm = (c, leading) => leading
            ? char.IsLetter(c)
            : (char.IsLetterOrDigit(c) || c == '_'),
        StringType = CleanStringType.Ascii | CleanStringType.UmbracoCase,
        BreakTermsOnUpper = false
    };

    var convertCase = new Config
    {
        PreFilter = null,
        IsTerm = letterDigitOrUnderscore,
        StringType = CleanStringType.Ascii,
        BreakTermsOnUpper = true
    };

    return WithConfig(CleanStringType.UrlSegment, urlSegment)
        .WithConfig(CleanStringType.FileName, fileName)
        .WithConfig(CleanStringType.Alias, alias)
        .WithConfig(CleanStringType.ConvertCase, convertCase);
}
/// <summary>
/// Describes how strings are cleaned for a given culture and string role.
/// </summary>
public sealed class Config
{
    // returned by the configuration lookup when nothing has been registered
    internal static readonly Config NotConfigured = new Config();

    /// <summary>
    /// Initializes a new instance of the <see cref="Config"/> class with default values.
    /// </summary>
    public Config()
    {
        StringType = CleanStringType.Utf8 | CleanStringType.Unchanged;
        IsTerm = (c, leading) => leading ? char.IsLetter(c) : char.IsLetterOrDigit(c);

        // everything else keeps its CLR default: no pre- nor post-filter, all
        // flags false, and Separator == char.MinValue
    }

    /// <summary>
    /// Creates a shallow copy of this configuration.
    /// </summary>
    /// <returns>A new configuration with the same property values.</returns>
    public Config Clone()
    {
        // the class only holds delegates, an enum, booleans and a char,
        // so a member-wise copy duplicates every property
        return (Config) MemberwiseClone();
    }

    /// <summary>Gets or sets the filter applied to the text before it is re-encoded and cleaned; may be null.</summary>
    public Func<string, string> PreFilter { get; set; }

    /// <summary>Gets or sets the filter applied to the text after it has been cleaned; may be null.</summary>
    public Func<string, string> PostFilter { get; set; }

    /// <summary>Gets or sets the function telling whether a char (first argument) is part of a term,
    /// knowing whether it is in leading position (second argument).</summary>
    public Func<char, bool, bool> IsTerm { get; set; }

    /// <summary>Gets or sets the configured casing and encoding flags.</summary>
    public CleanStringType StringType { get; set; }

    /// <summary>Gets or sets a value indicating whether an uppercase within a term eg "fooBar" is to break
    /// into a new term, or to be considered as part of the current term.</summary>
    public bool BreakTermsOnUpper { get; set; }

    /// <summary>Gets or sets a value indicating whether a non-uppercase within an acronym eg "FOOBar" is to cut
    /// the acronym (at "B" or "a" depending on <see cref="GreedyAcronyms"/>) or to give
    /// up the acronym and treat the term as a word.</summary>
    public bool CutAcronymOnNonUpper { get; set; }

    /// <summary>Gets or sets a value indicating whether acronyms parsing is greedy ie whether "FOObar" is
    /// "FOO" + "bar" (greedy) or "FO" + "Obar" (non-greedy).</summary>
    public bool GreedyAcronyms { get; set; }

    /// <summary>Gets or sets the separator char.</summary>
    /// <remarks>The cleaning code writes no separator when this is <c>char.MinValue</c>.</remarks>
    public char Separator { get; set; }

    /// <summary>
    /// Extends the configured string type with the case and code flags of a requested string type.
    /// </summary>
    /// <param name="stringType">The requested string type.</param>
    /// <returns>The configured string type, where the case (resp. code) flags have been replaced
    /// by those of <paramref name="stringType"/> whenever it specifies any.</returns>
    public CleanStringType StringTypeExtend(CleanStringType stringType)
    {
        var extended = StringType;
        extended = OverrideMasked(extended, stringType, CleanStringType.CaseMask);
        extended = OverrideMasked(extended, stringType, CleanStringType.CodeMask);
        return extended;
    }

    // replaces the bits of 'current' selected by 'mask' with those of 'requested',
    // unless 'requested' has none of them set
    private static CleanStringType OverrideMasked(CleanStringType current, CleanStringType requested, CleanStringType mask)
    {
        var requestedBits = requested & mask;
        if (requestedBits == 0) return current;

        return (current & ~mask) | requestedBits;
    }
}
2013-12-13 12:06:56 +01:00
/// <summary>
/// Gets the configuration to use for a string type and a culture.
/// </summary>
/// <param name="stringType">The string type; only its role flags are considered.</param>
/// <param name="culture">The culture.</param>
/// <returns>The configuration registered for the role, else the generic (all roles) one,
/// else <c>Config.NotConfigured</c>.</returns>
/// <remarks>Configurations of the default culture are considered only when nothing at all
/// has been registered for <paramref name="culture"/>.</remarks>
private Config GetConfig(CleanStringType stringType, CultureInfo culture)
{
    var role = stringType & CleanStringType.RoleMask;

    // pick the configurations of the culture or, if it has none, of the default culture
    Dictionary<CleanStringType, Config> roleConfigs;
    if (_configs.TryGetValue(culture, out roleConfigs) == false
        && _configs.TryGetValue(_defaultCulture, out roleConfigs) == false)
        return Config.NotConfigured;

    Config found;
    if (roleConfigs.TryGetValue(role, out found)) // have we got a config for _that_ role?
        return found;
    if (roleConfigs.TryGetValue(CleanStringType.RoleMask, out found)) // have we got a generic config for _all_ roles?
        return found;

    return Config.NotConfigured;
}
#endregion
2013-02-19 06:26:58 -01:00
#region JavaScript
2013-02-19 06:30:19 -01:00
// composite format string: {0} is the force-safe-alias flag (true/false), {1} is
// the controller path - literal JavaScript braces are escaped as {{ and }}
private const string SssjsFormat = @"
var UMBRACO_FORCE_SAFE_ALIAS = {0};
var UMBRACO_FORCE_SAFE_ALIAS_URL = '{1}';
var UMBRACO_FORCE_SAFE_ALIAS_TIMEOUT = 666;
var UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS = {{ }};

function getSafeAliasFromServer(value, callback) {{
    $.getJSON(UMBRACO_FORCE_SAFE_ALIAS_URL + 'ToSafeAlias?value=' + encodeURIComponent(value), function(json) {{
        if (json.alias) {{ callback(json.alias); }}
    }});
}}

function getSafeAlias(id, value, immediate, callback) {{
    if (!UMBRACO_FORCE_SAFE_ALIAS) {{
        callback(value);
        return;
    }}
    if (UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id]) clearTimeout(UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id]);
    UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id] = setTimeout(function() {{
        UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id] = null;
        getSafeAliasFromServer(value, function(alias) {{ callback(alias); }});
    }}, UMBRACO_FORCE_SAFE_ALIAS_TIMEOUT);
}}

function validateSafeAlias(id, value, immediate, callback) {{
    if (!UMBRACO_FORCE_SAFE_ALIAS) {{
        callback(true);
        return;
    }}
    if (UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id]) clearTimeout(UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id]);
    UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id] = setTimeout(function() {{
        UMBRACO_FORCE_SAFE_ALIAS_TIMEOUTS[id] = null;
        getSafeAliasFromServer(value, function(alias) {{ callback(value.toLowerCase() == alias.toLowerCase()); }});
    }}, UMBRACO_FORCE_SAFE_ALIAS_TIMEOUT);
}}
";

/// <summary>
/// Gets the JavaScript code defining client-side short string services.
/// </summary>
/// <param name="controllerPath">The path inserted, verbatim, as the value of the script's
/// <c>UMBRACO_FORCE_SAFE_ALIAS_URL</c> variable; the script appends <c>ToSafeAlias?value=...</c> to it.</param>
/// <returns>The JavaScript code.</returns>
public string GetShortStringServicesJavaScript(string controllerPath)
{
    return string.Format(SssjsFormat,
        UmbracoSettings.ForceSafeAliases ? "true" : "false", controllerPath);
}
#endregion
2013-02-07 13:30:50 -01:00
#region IShortStringHelper CleanFor . . .
/// <summary>
/// Cleans a string to produce a string that can safely be used in an alias.
/// </summary>
/// <param name="text">The text to filter.</param>
/// <returns>The safe alias.</returns>
/// <remarks>
/// <para>The string is cleaned in the context of the default culture.</para>
/// <para>With the default configuration, safe aliases are Ascii only.</para>
/// </remarks>
public virtual string CleanStringForSafeAlias(string text)
{
    var culture = _defaultCulture;
    return CleanStringForSafeAlias(text, culture);
}
/// <summary>
/// Cleans a string, in the context of a specified culture, to produce a string that can safely be used in an alias.
/// </summary>
/// <param name="text">The text to filter.</param>
/// <param name="culture">The culture.</param>
/// <returns>The safe alias.</returns>
/// <remarks>
/// <para>The string is cleaned using the configuration of the <c>Alias</c> role.</para>
/// <para>With the default configuration, safe aliases are Ascii only.</para>
/// </remarks>
public virtual string CleanStringForSafeAlias(string text, CultureInfo culture)
{
    const CleanStringType role = CleanStringType.Alias;
    return CleanString(text, role, culture);
}
/// <summary>
/// Cleans a string to produce a string that can safely be used in an url segment.
/// </summary>
/// <param name="text">The text to filter.</param>
/// <returns>The safe url segment.</returns>
/// <remarks>
/// <para>The string is cleaned in the context of the default culture.</para>
/// <para>Whether url segments are Ascii or Utf8 depends on the configuration of the <c>UrlSegment</c> role.</para>
/// </remarks>
public virtual string CleanStringForUrlSegment(string text)
{
    var culture = _defaultCulture;
    return CleanStringForUrlSegment(text, culture);
}
/// <summary>
/// Cleans a string, in the context of a specified culture, to produce a string that can safely be used in an url segment.
/// </summary>
/// <param name="text">The text to filter.</param>
/// <param name="culture">The culture.</param>
/// <returns>The safe url segment.</returns>
/// <remarks>
/// <para>Whether url segments are Ascii or Utf8 depends on the configuration of the <c>UrlSegment</c> role.</para>
/// </remarks>
public virtual string CleanStringForUrlSegment(string text, CultureInfo culture)
{
    const CleanStringType role = CleanStringType.UrlSegment;
    return CleanString(text, role, culture);
}
2013-03-11 14:58:07 -01:00
/// <summary>
/// Cleans a string, in the context of the default culture, to produce a string that can safely be used as a filename,
/// both internally (on disk) and externally (as a url).
/// </summary>
/// <param name="text">The text to filter.</param>
/// <returns>The safe filename.</returns>
/// <remarks>Legacy says this was used to "overcome an issue when Umbraco is used in IE in an intranet environment" but that issue is not documented.</remarks>
public virtual string CleanStringForSafeFileName(string text)
{
    var culture = _defaultCulture;
    return CleanStringForSafeFileName(text, culture);
}
/// <summary>
/// Cleans a string, in the context of a specified culture, to produce a string that can safely be used as a filename,
/// both internally (on disk) and externally (as a url).
/// </summary>
/// <param name="text">The text to filter.</param>
/// <param name="culture">The culture.</param>
/// <returns>The safe filename, or an empty string if <paramref name="text"/> is null, empty or white space.</returns>
/// <remarks>The name and the extension are cleaned separately, then joined again with a dot.</remarks>
public virtual string CleanStringForSafeFileName(string text, CultureInfo culture)
{
    if (string.IsNullOrWhiteSpace(text))
        return string.Empty;

    // neutralize the chars that are invalid in filenames before splitting name and extension
    var safe = text.ReplaceMany(Path.GetInvalidFileNameChars(), '-');

    var name = Path.GetFileNameWithoutExtension(safe);
    Debug.Assert(name != null, "name != null");
    if (name.Length > 0)
        name = CleanString(name, CleanStringType.FileName, culture);

    var ext = Path.GetExtension(safe); // includes the dot, empty if no extension
    Debug.Assert(ext != null, "ext != null");
    if (ext.Length > 0)
        ext = CleanString(ext.Substring(1), CleanStringType.FileName, culture);

    // the extension may have been cleaned down to nothing
    if (ext.Length == 0)
        return name;

    return name + "." + ext;
}
2013-02-07 13:30:50 -01:00
#endregion
#region CleanString
// MS rules & guidelines:
// - Do capitalize both characters of two-character acronyms, except the first word of a camel-cased identifier.
2013-12-13 12:06:56 +01:00
// eg "DBRate" (pascal) or "ioHelper" (camel) - "SpecialDBRate" (pascal) or "specialIOHelper" (camel)
2013-02-07 13:30:50 -01:00
// - Do capitalize only the first character of acronyms with three or more characters, except the first word of a camel-cased identifier.
// eg "XmlWriter" (pascal) or "htmlReader" (camel) - "SpecialXmlWriter" (pascal) or "specialHtmlReader" (camel)
// - Do not capitalize any of the characters of any acronyms, whatever their length, at the beginning of a camel-cased identifier.
// eg "xmlWriter" or "dbWriter" (camel)
//
// Our additional stuff:
// - Leading digits are removed.
// - Many consecutive separators are folded into one unique separator.
// states of the CleanCodeString parser
private const byte StateBreak = 1;   // within a break, ie between terms
private const byte StateUp = 2;      // within a term that began with an uppercase: could be a word or an acronym
private const byte StateWord = 3;    // within a term / word
private const byte StateAcronym = 4; // within a term / acronym
/// <summary>
/// Cleans a string.
/// </summary>
/// <param name="text">The text to clean.</param>
/// <param name="stringType">A flag indicating the target casing and encoding of the string. By default,
/// strings are cleaned up to camelCase and Ascii.</param>
/// <returns>The clean string.</returns>
/// <remarks>The string is cleaned in the context of the default culture, with the configured separator.</remarks>
public string CleanString(string text, CleanStringType stringType)
{
    char? configuredSeparator = null; // null = use the separator of the configuration
    return CleanString(text, stringType, _defaultCulture, configuredSeparator);
}
/// <summary>
/// Cleans a string, using a specified separator.
/// </summary>
/// <param name="text">The text to clean.</param>
/// <param name="stringType">A flag indicating the target casing and encoding of the string. By default,
/// strings are cleaned up to camelCase and Ascii.</param>
/// <param name="separator">The separator, which takes precedence over the configured one.</param>
/// <returns>The clean string.</returns>
/// <remarks>The string is cleaned in the context of the default culture.</remarks>
public string CleanString(string text, CleanStringType stringType, char separator)
{
    char? requestedSeparator = separator;
    return CleanString(text, stringType, _defaultCulture, requestedSeparator);
}
/// <summary>
/// Cleans a string in the context of a specified culture.
/// </summary>
/// <param name="text">The text to clean.</param>
/// <param name="stringType">A flag indicating the target casing and encoding of the string. By default,
/// strings are cleaned up to camelCase and Ascii.</param>
/// <param name="culture">The culture.</param>
/// <returns>The clean string.</returns>
/// <remarks>The separator is the configured one.</remarks>
public string CleanString(string text, CleanStringType stringType, CultureInfo culture)
{
    char? configuredSeparator = null; // null = use the separator of the configuration
    return CleanString(text, stringType, culture, configuredSeparator);
}
/// <summary>
/// Cleans a string in the context of a specified culture, using a specified separator.
/// </summary>
/// <param name="text">The text to clean.</param>
/// <param name="stringType">A flag indicating the target casing and encoding of the string. By default,
/// strings are cleaned up to camelCase and Ascii.</param>
/// <param name="separator">The separator, which takes precedence over the configured one.</param>
/// <param name="culture">The culture.</param>
/// <returns>The clean string.</returns>
public string CleanString(string text, CleanStringType stringType, char separator, CultureInfo culture)
{
    char? requestedSeparator = separator;
    return CleanString(text, stringType, culture, requestedSeparator);
}
2013-12-13 12:06:56 +01:00
/// <summary>
/// Cleans a string in the context of a specified culture, optionally using a specified separator.
/// </summary>
/// <param name="text">The text to clean.</param>
/// <param name="stringType">The role of the string, and optionally its target casing and encoding.</param>
/// <param name="culture">The culture.</param>
/// <param name="separator">The separator, or <c>null</c> to use the configured one.</param>
/// <returns>The clean string.</returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> or <paramref name="culture"/> is <c>null</c>.</exception>
/// <remarks>Steps are: pre-filter, re-encoding, cleaning, post-filter.</remarks>
protected virtual string CleanString(string text, CleanStringType stringType, CultureInfo culture, char? separator)
{
    // be safe
    if (text == null) throw new ArgumentNullException("text");
    if (culture == null) throw new ArgumentNullException("culture");

#if WRTCONS
    Console.WriteLine("STRING TYPE {0}", stringType);
#endif

    // get config - case and code flags of stringType, if any, override the configured ones
    var config = GetConfig(stringType, culture);
    var effectiveType = config.StringTypeExtend(stringType);

    // apply defaults
    if ((effectiveType & CleanStringType.CaseMask) == CleanStringType.None)
        effectiveType |= CleanStringType.CamelCase;
    if ((effectiveType & CleanStringType.CodeMask) == CleanStringType.None)
        effectiveType |= CleanStringType.Ascii;

    // use configured unless specified
    var effectiveSeparator = separator.HasValue ? separator.Value : config.Separator;

    // apply pre-filter
    var preFilter = config.PreFilter;
    if (preFilter != null)
        text = preFilter(text);

    // recode
    var toAscii = (effectiveType & CleanStringType.CodeMask) == CleanStringType.Ascii;
    text = toAscii
        ? Utf8ToAsciiConverter.ToAsciiString(text)
        : RemoveSurrogatePairs(text);

    // clean
    text = CleanCodeString(text, effectiveType, effectiveSeparator, culture, config);

    // apply post-filter
    var postFilter = config.PostFilter;
    return postFilter == null ? text : postFilter(text);
}
2013-12-13 12:06:56 +01:00
/// <summary>
/// Replaces surrogates in a string with question marks.
/// </summary>
/// <param name="text">The text.</param>
/// <returns>A string where each well-formed surrogate pair, and each unpaired surrogate,
/// has been replaced by one single '?' char. Other chars are copied unchanged.</returns>
private static string RemoveSurrogatePairs(string text)
{
    var output = new char[text.Length];
    var opos = 0;

    for (var ipos = 0; ipos < text.Length; ipos++)
    {
        var c = text[ipos];
        if (char.IsSurrogate(c) == false)
        {
            output[opos++] = c;
            continue;
        }

        // skip the second half only when there really is a pair: after an unpaired
        // surrogate, the next char is a regular char and must not be swallowed
        if (ipos + 1 < text.Length && char.IsSurrogatePair(c, text[ipos + 1]))
            ipos++;

        output[opos++] = '?';
    }

    return new string(output, 0, opos);
}
2013-12-13 12:06:56 +01:00
// here was a subtle, ascii-optimized version of the cleaning code, and I was
// very proud of it until benchmarking showed it was an order of magnitude slower
// than the utf8 version. Micro-optimizing sometimes isn't such a good idea.
2013-02-07 13:30:50 -01:00
2013-12-13 12:06:56 +01:00
/// <summary>
/// Splits a text into terms, and writes the terms, cased and separated, to a new string.
/// </summary>
/// <param name="text">The text to clean.</param>
/// <param name="caseType">The string type; only its case flags are considered.</param>
/// <param name="separator">The separator written between terms; <c>char.MinValue</c> means none.</param>
/// <param name="culture">The culture, passed on to <c>CopyTerm</c>.</param>
/// <param name="config">The configuration: what is a term char, and how terms and acronyms break.</param>
/// <returns>The clean string.</returns>
/// <exception cref="NotSupportedException">The text contains a surrogate.</exception>
/// <remarks>Does NOT support surrogate pairs in text.</remarks>
internal string CleanCodeString(string text, CleanStringType caseType, char separator, CultureInfo culture, Config config)
{
    int written = 0, termStart = 0;
    byte state = StateBreak;

    caseType &= CleanStringType.CaseMask;

#if WRTCONS
    Console.WriteLine("CASE {0}", caseType);
#endif

    // if we apply global ToUpper or ToLower to text here
    // then we cannot break words on uppercase chars

    // it's faster to use an array than a StringBuilder
    var length = text.Length;
    var output = new char[length * 2]; // twice the length should be OK in all cases
    var hasSeparator = separator != char.MinValue;

    for (var i = 0; i < length; i++)
    {
        var c = text[i];

        // leading as long as StateBreak and termStart still zero
        var leading = state == StateBreak && termStart == 0;
        var isTerm = config.IsTerm(c, leading);
        var isUpper = char.IsUpper(c); // false for digits, symbols...

        // what should I do with surrogates?
        // no idea, really, so they are not supported at the moment
        if (char.IsSurrogate(c))
            throw new NotSupportedException("Surrogate pairs are not supported.");

#if WRTCONS
        Console.WriteLine("CHAR '{0}' {1} {2} - {3} - {4}/{5} {6}",
            c,
            isTerm ? "term" : "!term", isUpper ? "upper" : "!upper",
            state,
            i, termStart, leading ? "leading" : "!leading");
#endif

        switch (state)
        {
            // within a break
            case StateBreak:
                // begin a new term if char is a term char
                if (isTerm)
                {
                    termStart = i;
                    if (written > 0 && hasSeparator)
                        output[written++] = separator;
                    state = isUpper ? StateUp : StateWord;
                }
                break;

            // within a term / word
            case StateWord:
                // end a term if char is not a term char,
                // or (it's uppercase and we break terms on uppercase)
                if (isTerm == false || (config.BreakTermsOnUpper && isUpper))
                {
                    CopyTerm(text, termStart, output, ref written, i - termStart, caseType, culture, false);
                    termStart = i;
                    state = isTerm ? StateUp : StateBreak;
                    // a separator is written only when a new term begins right away
                    if (state != StateBreak && hasSeparator)
                        output[written++] = separator;
                }
                break;

            // within a term / acronym
            case StateAcronym:
                // end an acronym if char is not a term char,
                // or if it's not uppercase / config
                if (isTerm == false || (config.CutAcronymOnNonUpper && isUpper == false))
                {
                    // whether it's part of the acronym depends on whether we're greedy
                    if (isTerm && config.GreedyAcronyms == false)
                        i -= 1; // handle that char again, in another state - not part of the acronym

                    if (i - termStart > 1) // single-char can't be an acronym
                    {
                        CopyTerm(text, termStart, output, ref written, i - termStart, caseType, culture, true);
                        termStart = i;
                        state = isTerm ? StateWord : StateBreak;
                        if (state != StateBreak && hasSeparator)
                            output[written++] = separator;
                    }
                    else if (isTerm)
                    {
                        state = StateWord;
                    }
                }
                else if (isUpper == false) // isTerm == true
                {
                    // it's a term char and we don't cut...
                    // keep moving forward as a word
                    state = StateWord;
                }
                break;

            // within a term / uppercase = could be a word or an acronym
            case StateUp:
                if (isTerm)
                {
                    // add that char to the term and pick word or acronym
                    state = isUpper ? StateAcronym : StateWord;
                }
                else
                {
                    // single char, copy then break
                    CopyTerm(text, termStart, output, ref written, 1, caseType, culture, false);
                    state = StateBreak;
                }
                break;

            default:
                throw new Exception("Invalid state.");
        }
    }

    // flush the term that was still open when the end of the text was reached
    switch (state)
    {
        case StateBreak:
            break;

        case StateWord:
            CopyTerm(text, termStart, output, ref written, text.Length - termStart, caseType, culture, false);
            break;

        case StateAcronym:
        case StateUp:
            CopyTerm(text, termStart, output, ref written, text.Length - termStart, caseType, culture, true);
            break;

        default:
            throw new Exception("Invalid state.");
    }

    return new string(output, 0, written);
}
2013-12-13 12:06:56 +01:00
/// <summary>
/// Copies one term of the input string to the output buffer, applying the requested casing.
/// </summary>
/// <param name="input">The string containing the term.</param>
/// <param name="ipos">The index in <paramref name="input"/> of the first char of the term.</param>
/// <param name="output">The buffer receiving the cased term.</param>
/// <param name="opos">The write position in <paramref name="output"/>; advanced by the number of chars written.</param>
/// <param name="len">The length of the term, in chars.</param>
/// <param name="caseType">The casing to apply to the term.</param>
/// <param name="culture">The culture used for upper- and lower-casing.</param>
/// <param name="isAcronym">A value indicating whether the term is an acronym.</param>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="caseType"/> is not a casing handled here.</exception>
/// <remarks>
/// <para>Supports surrogate pairs in the input string: a pair that begins the term is cased as one character.</para>
/// <para>An acronym is copied unchanged in Umbraco case, in Pascal case when it has at most 2 chars,
/// and in camel case when it has at most 2 chars and something has already been written (opos &gt; 0).</para>
/// </remarks>
internal void CopyTerm(string input, int ipos, char[] output, ref int opos, int len,
    CleanStringType caseType, CultureInfo culture, bool isAcronym)
{
    var term = input.Substring(ipos, len);
#if WRTCONS
    Console.WriteLine("TERM \"{0}\" {1} {2}",
        term,
        isAcronym ? "acronym" : "word",
        caseType);
#endif
    if (isAcronym)
    {
        if ((caseType == CleanStringType.CamelCase && len <= 2 && opos > 0) ||
            (caseType == CleanStringType.PascalCase && len <= 2) ||
            (caseType == CleanStringType.UmbracoCase))
        {
            caseType = CleanStringType.Unchanged;
        }
    }

    // note: MSDN seems to imply that ToUpper or ToLower preserve the length
    // of the string, but that this behavior is not guaranteed and could change.
    // so always advance opos by the length of what was actually written.
    switch (caseType)
    {
        case CleanStringType.Unchanged:
            term.CopyTo(0, output, opos, len);
            opos += len;
            break;

        case CleanStringType.LowerCase:
            term = term.ToLower(culture);
            term.CopyTo(0, output, opos, term.Length);
            opos += term.Length;
            break;

        case CleanStringType.UpperCase:
            term = term.ToUpper(culture);
            term.CopyTo(0, output, opos, term.Length);
            opos += term.Length;
            break;

        case CleanStringType.CamelCase:
        case CleanStringType.PascalCase:
        case CleanStringType.UmbracoCase:
        {
            // an empty term has no leading character to case
            if (len == 0)
                break;

            // the head is upper-cased, except when nothing has been written yet:
            // then camel case lower-cases it and Umbraco case leaves it as-is
            var atStart = opos == 0;
            var upperHead = caseType == CleanStringType.PascalCase || !atStart;
            var lowerHead = !upperHead && caseType == CleanStringType.CamelCase;

            int headLen;
            if (char.IsSurrogatePair(term, 0))
            {
                // term is already relative to ipos, so the pair is at index 0 of term
                // (indexing term with ipos reads the wrong chars or goes out of range)
                headLen = 2;
                var head = term.Substring(0, headLen);
                if (upperHead)
                    head = head.ToUpper(culture);
                else if (lowerHead)
                    head = head.ToLower(culture);
                head.CopyTo(0, output, opos, head.Length);
                opos += head.Length;
            }
            else
            {
                headLen = 1;
                var c = term[0];
                if (upperHead)
                    c = char.ToUpper(c, culture);
                else if (lowerHead)
                    c = char.ToLower(c, culture);
                output[opos++] = c;
            }

            if (len > headLen)
            {
                // camel and Pascal case lower-case the rest of the term, Umbraco case keeps it as-is
                var tail = term.Substring(headLen);
                if (caseType != CleanStringType.UmbracoCase)
                    tail = tail.ToLower(culture);
                tail.CopyTo(0, output, opos, tail.Length);
                opos += tail.Length;
            }
            break;
        }

        default:
            throw new ArgumentOutOfRangeException("caseType");
    }
}
#endregion
#region SplitPascalCasing
/// <summary>
/// Breaks a Pascal-cased text into words, inserting a separator between the words.
/// </summary>
/// <param name="text">The Pascal-cased text.</param>
/// <param name="separator">The char inserted between two words.</param>
/// <returns>The text with a separator at each detected word boundary.</returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
/// <remarks>
/// <para>A word boundary is detected before an upper-case char that follows a non-upper-case char,
/// and before the last char of a run of two or more upper-case chars that is followed by a
/// non-upper-case char (eg "HTMLParser" becomes "HTML Parser").</para>
/// <para>Chars are examined one at a time: surrogate pairs are not supported.</para>
/// </remarks>
public virtual string SplitPascalCasing(string text, char separator)
{
    if (text == null)
    {
        throw new ArgumentNullException("text");
    }

    // nothing to split
    if (text.Length == 0)
    {
        return string.Empty;
    }

    // at most one separator is inserted per char
    var result = new StringBuilder(text.Length * 2);

    // the char that has been read but not written yet: it can only be written
    // once the next char is known, as a separator may have to come first
    var pending = text[0];

    // zero while the pending char is not part of an upper-case run, else the
    // 1-based position of the first char of the current upper-case run
    var runStart = char.IsUpper(pending) ? 1 : 0;

    for (var index = 1; index < text.Length; index++)
    {
        var current = text[index];

        if (char.IsUpper(current))
        {
            result.Append(pending);
            if (runStart == 0)
            {
                // non-upper followed by upper: the current char begins a new word
                result.Append(separator);
                runStart = index + 1;
            }
        }
        else
        {
            // a run of 2+ upper-case chars just ended: its last char (pending) begins a new word
            // (runStart < index implies index >= 2, so something has already been written)
            if (runStart > 0 && runStart < index)
            {
                result.Append(separator);
            }
            runStart = 0;
            result.Append(pending);
        }

        pending = current;
    }

    // note: a NUL final char is not written
    if (pending != char.MinValue)
    {
        result.Append(pending);
    }

    return result.ToString();
}
#endregion
#region ReplaceMany
/// <summary>
/// Applies a set of string replacements to a text, one after the other.
/// </summary>
/// <param name="text">The text to process.</param>
/// <param name="replacements">The replacements: each key found in the text is replaced by its value.</param>
/// <returns>The text once every replacement has been applied.</returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> or <paramref name="replacements"/> is null.</exception>
/// <remarks>
/// <para>Replacements are applied in the enumeration order of <paramref name="replacements"/>, each one
/// to the result of the previous one: a replaced item can be replaced again by a later
/// replacement, so the order matters.</para>
/// <para>Plain successive <c>string.Replace</c> calls are used on purpose: hand-made state machines and
/// regexes were tried, but on short strings with few replacements none was faster (with safe and
/// checked code).</para>
/// </remarks>
public virtual string ReplaceMany(string text, IDictionary<string, string> replacements)
{
    if (text == null)
    {
        throw new ArgumentNullException("text");
    }
    if (replacements == null)
    {
        throw new ArgumentNullException("replacements");
    }

    var result = text;
    foreach (var replacement in replacements)
    {
        result = result.Replace(replacement.Key, replacement.Value);
    }
    return result;
}
2013-03-11 14:58:07 -01:00
/// <summary>
/// Replaces every occurence of any of the specified chars in a text by a single replacement char.
/// </summary>
/// <param name="text">The text to process.</param>
/// <param name="chars">The chars to look for.</param>
/// <param name="replacement">The char that replaces each char found.</param>
/// <returns>The text once every specified char has been replaced.</returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> or <paramref name="chars"/> is null.</exception>
/// <remarks>Chars are replaced one after the other, in the order of <paramref name="chars"/>,
/// with plain successive <c>string.Replace</c> calls.</remarks>
public virtual string ReplaceMany(string text, char[] chars, char replacement)
{
    if (text == null)
    {
        throw new ArgumentNullException("text");
    }
    if (chars == null)
    {
        throw new ArgumentNullException("chars");
    }

    var result = text;
    foreach (var candidate in chars)
    {
        result = result.Replace(candidate, replacement);
    }
    return result;
}
2013-02-07 13:30:50 -01:00
#endregion
}
}