From 34df5009e040463335d76571938e649008800b79 Mon Sep 17 00:00:00 2001 From: Shannon Date: Fri, 13 Dec 2013 16:58:21 +1100 Subject: [PATCH 1/3] Upgraded installer to do a two part install - first install then if an upgrade is required do the upgrade in a second call - this way we can have the progress bar update since the media xml installer can take some time. --- src/Umbraco.Core/DatabaseContext.cs | 180 ++++++++++++------ .../install/InstallerRestService.aspx.cs | 43 ++++- .../install/steps/database.ascx | 92 ++++++--- .../install/utills/LegacyClasses.cs | 6 +- 4 files changed, 233 insertions(+), 88 deletions(-) diff --git a/src/Umbraco.Core/DatabaseContext.cs b/src/Umbraco.Core/DatabaseContext.cs index 4d5170ec80..e074297eb8 100644 --- a/src/Umbraco.Core/DatabaseContext.cs +++ b/src/Umbraco.Core/DatabaseContext.cs @@ -433,24 +433,19 @@ namespace Umbraco.Core return _result; } - internal Result CreateDatabaseSchemaAndDataOrUpgrade() + internal Result CreateDatabaseSchemaAndData() { - if (_configured == false || (string.IsNullOrEmpty(_connectionString) || string.IsNullOrEmpty(ProviderName))) + var readyForInstall = CheckReadyForInstall(); + if (readyForInstall.Success == false) { - return new Result - { - Message = - "Database configuration is invalid. Please check that the entered database exists and that the provided username and password has write access to the database.", - Success = false, - Percentage = "10" - }; + return readyForInstall.Result; } - + try { LogHelper.Info("Database configuration status: Started"); - var message = string.Empty; + string message; var database = new UmbracoDatabase(_connectionString, ProviderName); var supportsCaseInsensitiveQueries = SqlSyntaxContext.SqlSyntaxProvider.SupportsCaseInsensitiveQueries(database); @@ -465,50 +460,73 @@ namespace Umbraco.Core return new Result { Message = message, Success = false, Percentage = "15" }; } - else if (supportsCaseInsensitiveQueries == null) - { - message = "

 

Warning! Could not check if your database type supports case insensitive queries.
We currently do not support these databases that do not support case insensitive queries.

" + - "

You can check this by looking for the following setting in your my.ini file in your MySQL installation directory:

" + - "
lower_case_table_names=1

" + - "

Note: Make sure to check with your hosting provider if they support case insensitive queries as well.

" + - "

For more technical information on case sensitivity in MySQL, have a look at " + - "the documentation on the subject

"; - } - else - { - if (SqlSyntaxContext.SqlSyntaxProvider.GetType() == typeof(MySqlSyntaxProvider)) - { - message = "

 

Congratulations, the database step ran successfully!

" + - "

Note: You're using MySQL and the database instance you're connecting to seems to support case insensitive queries.

" + - "

However, your hosting provider may not support this option. Umbraco does not currently support MySQL installs that do not support case insensitive queries

" + - "

Make sure to check with your hosting provider if they support case insensitive queries as well.

" + - "

They can check this by looking for the following setting in the my.ini file in their MySQL installation directory:

" + - "
lower_case_table_names=1

" + - "

For more technical information on case sensitivity in MySQL, have a look at " + - "the documentation on the subject

"; - } - } + + message = GetResultMessageForMySql(supportsCaseInsensitiveQueries); var schemaResult = ValidateDatabaseSchema(); var installedVersion = schemaResult.DetermineInstalledVersion(); - //If Configuration Status is empty and the determined version is "empty" its a new install - otherwise upgrade the existing if (string.IsNullOrEmpty(GlobalSettings.ConfigurationStatus) && installedVersion.Equals(new Version(0, 0, 0))) { database.CreateDatabaseSchema(); message = message + "

Installation completed!

"; + + //now that everything is done, we need to determine the version of SQL server that is executing + LogHelper.Info("Database configuration status: " + message); + return new Result { Message = message, Success = true, Percentage = "100" }; } - else - { - var configuredVersion = string.IsNullOrEmpty(GlobalSettings.ConfigurationStatus) + + //we need to do an upgrade so return a new status message and it will need to be done during the next step + LogHelper.Info("Database requires upgrade"); + message = "

Upgrading database, this may take some time...

"; + return new Result + { + RequiresUpgrade = true, + Message = message, + Success = true, + Percentage = "30" + }; + } + catch (Exception ex) + { + return HandleInstallException(ex); + } + } + + /// + /// This assumes all of the previous checks are done! + /// + /// + internal Result UpgradeSchemaAndData() + { + var readyForInstall = CheckReadyForInstall(); + if (readyForInstall.Success == false) + { + return readyForInstall.Result; + } + + try + { + LogHelper.Info("Database upgrade started"); + + var database = new UmbracoDatabase(_connectionString, ProviderName); + var supportsCaseInsensitiveQueries = SqlSyntaxContext.SqlSyntaxProvider.SupportsCaseInsensitiveQueries(database); + + var message = GetResultMessageForMySql(supportsCaseInsensitiveQueries); + + var schemaResult = ValidateDatabaseSchema(); + var installedVersion = schemaResult.DetermineInstalledVersion(); + + //DO the upgrade! + + var configuredVersion = string.IsNullOrEmpty(GlobalSettings.ConfigurationStatus) ? installedVersion : new Version(GlobalSettings.ConfigurationStatus); - var targetVersion = UmbracoVersion.Current; - var runner = new MigrationRunner(configuredVersion, targetVersion, GlobalSettings.UmbracoMigrationName); - var upgraded = runner.Execute(database, true); - message = message + "

Upgrade completed!

"; - } + var targetVersion = UmbracoVersion.Current; + var runner = new MigrationRunner(configuredVersion, targetVersion, GlobalSettings.UmbracoMigrationName); + var upgraded = runner.Execute(database, true); + message = message + "

Upgrade completed!

"; //now that everything is done, we need to determine the version of SQL server that is executing @@ -518,26 +536,72 @@ namespace Umbraco.Core } catch (Exception ex) { - LogHelper.Info("Database configuration failed with the following error and stack trace: " + ex.Message + "\n" + ex.StackTrace); - - if (_result != null) - { - LogHelper.Info("The database schema validation produced the following summary: \n" + _result.GetSummary()); - } - - return new Result - { - Message = - "The database configuration failed with the following message: " + ex.Message + - "\n Please check log file for additional information (can be found in '/App_Data/Logs/UmbracoTraceLog.txt')", - Success = false, - Percentage = "90" - }; + return HandleInstallException(ex); } } + private string GetResultMessageForMySql(bool? supportsCaseInsensitiveQueries) + { + if (supportsCaseInsensitiveQueries == null) + { + return "

 

Warning! Could not check if your database type supports case insensitive queries.
We currently do not support these databases that do not support case insensitive queries.

" + + "

You can check this by looking for the following setting in your my.ini file in your MySQL installation directory:

" + + "
lower_case_table_names=1

" + + "

Note: Make sure to check with your hosting provider if they support case insensitive queries as well.

" + + "

For more technical information on case sensitivity in MySQL, have a look at " + + "the documentation on the subject

"; + } + if (SqlSyntaxContext.SqlSyntaxProvider.GetType() == typeof(MySqlSyntaxProvider)) + { + return "

 

Congratulations, the database step ran successfully!

" + + "

Note: You're using MySQL and the database instance you're connecting to seems to support case insensitive queries.

" + + "

However, your hosting provider may not support this option. Umbraco does not currently support MySQL installs that do not support case insensitive queries

" + + "

Make sure to check with your hosting provider if they support case insensitive queries as well.

" + + "

They can check this by looking for the following setting in the my.ini file in their MySQL installation directory:

" + + "
lower_case_table_names=1

" + + "

For more technical information on case sensitivity in MySQL, have a look at " + + "the documentation on the subject

"; + } + return string.Empty; + } + + private Attempt CheckReadyForInstall() + { + if (_configured == false || (string.IsNullOrEmpty(_connectionString) || string.IsNullOrEmpty(ProviderName))) + { + return Attempt.Fail(new Result + { + Message = + "Database configuration is invalid. Please check that the entered database exists and that the provided username and password has write access to the database.", + Success = false, + Percentage = "10" + }); + } + return Attempt.Succeed(); + } + + private Result HandleInstallException(Exception ex) + { + LogHelper.Info("Database configuration failed with the following error and stack trace: " + ex.Message + "\n" + ex.StackTrace); + + if (_result != null) + { + LogHelper.Info("The database schema validation produced the following summary: \n" + _result.GetSummary()); + } + + return new Result + { + Message = + "The database configuration failed with the following message: " + ex.Message + + "\n Please check log file for additional information (can be found in '/App_Data/Logs/UmbracoTraceLog.txt')", + Success = false, + Percentage = "90" + }; + } + internal class Result { + public bool RequiresUpgrade { get; set; } public string Message { get; set; } public bool Success { get; set; } public string Percentage { get; set; } diff --git a/src/Umbraco.Web.UI/install/InstallerRestService.aspx.cs b/src/Umbraco.Web.UI/install/InstallerRestService.aspx.cs index a2857c5885..248e6c17d2 100644 --- a/src/Umbraco.Web.UI/install/InstallerRestService.aspx.cs +++ b/src/Umbraco.Web.UI/install/InstallerRestService.aspx.cs @@ -59,7 +59,7 @@ namespace Umbraco.Web.UI.Install [WebMethod] [ScriptMethod(ResponseFormat = ResponseFormat.Json)] - public static string InstallOrUpgrade() + public static string Install() { //if its not configured then we can continue if (ApplicationContext.Current == null || ApplicationContext.Current.IsConfigured) @@ -67,10 +67,43 @@ namespace Umbraco.Web.UI.Install throw new AuthenticationException("The application is already configured"); } - LogHelper.Info("Running 'InstallOrUpgrade' service"); + LogHelper.Info("Running 'Install' service"); - var result = ApplicationContext.Current.DatabaseContext.CreateDatabaseSchemaAndDataOrUpgrade(); + var result = ApplicationContext.Current.DatabaseContext.CreateDatabaseSchemaAndData(); + if (result.RequiresUpgrade == false) + { + HandleConnectionStrings(); + } + + var js = new JavaScriptSerializer(); + var jsonResult = js.Serialize(result); + return jsonResult; + } + + [WebMethod] + [ScriptMethod(ResponseFormat = ResponseFormat.Json)] + public static string Upgrade() + { + //if its not configured then we can continue + if (ApplicationContext.Current == null || ApplicationContext.Current.IsConfigured) + { + throw new AuthenticationException("The application is already configured"); + } + + LogHelper.Info("Running 'Upgrade' service"); + + var result = ApplicationContext.Current.DatabaseContext.UpgradeSchemaAndData(); + + HandleConnectionStrings(); + + var js = new JavaScriptSerializer(); + var jsonResult = js.Serialize(result); + return jsonResult; + } + + private static void HandleConnectionStrings() + { // Remove legacy umbracoDbDsn configuration setting if it exists and connectionstring also exists if (ConfigurationManager.ConnectionStrings[Core.Configuration.GlobalSettings.UmbracoConnectionName] != null) { @@ -82,10 +115,6 @@ namespace Umbraco.Web.UI.Install LogHelper.Error("", ex); throw ex; } - - var js = new JavaScriptSerializer(); - var jsonResult = js.Serialize(result); - return jsonResult; } } } \ No newline at end of file diff --git a/src/Umbraco.Web.UI/install/steps/database.ascx b/src/Umbraco.Web.UI/install/steps/database.ascx index 6b6f1c4496..c39eb00980 100644 --- a/src/Umbraco.Web.UI/install/steps/database.ascx +++ b/src/Umbraco.Web.UI/install/steps/database.ascx @@ -368,33 +368,81 @@ diff --git a/src/Umbraco.Web/umbraco.presentation/install/utills/LegacyClasses.cs b/src/Umbraco.Web/umbraco.presentation/install/utills/LegacyClasses.cs index 52fd5d3b9a..c068ac2c01 100644 --- a/src/Umbraco.Web/umbraco.presentation/install/utills/LegacyClasses.cs +++ b/src/Umbraco.Web/umbraco.presentation/install/utills/LegacyClasses.cs @@ -66,7 +66,11 @@ namespace umbraco.presentation.install.utills { LogHelper.Info

("Running 'installOrUpgrade' service"); - var result = ApplicationContext.Current.DatabaseContext.CreateDatabaseSchemaAndDataOrUpgrade(); + var result = ApplicationContext.Current.DatabaseContext.CreateDatabaseSchemaAndData(); + if (result.RequiresUpgrade) + { + result = ApplicationContext.Current.DatabaseContext.UpgradeSchemaAndData(); + } // Remove legacy umbracoDbDsn configuration setting if it exists and connectionstring also exists if (ConfigurationManager.ConnectionStrings[Umbraco.Core.Configuration.GlobalSettings.UmbracoConnectionName] != null) From 51da5343eae89c71ecda9504fa6255580ceb2151 Mon Sep 17 00:00:00 2001 From: Shannon Date: Fri, 13 Dec 2013 17:07:29 +1100 Subject: [PATCH 2/3] Fixes installation issue with rebuilding media cache Conflicts: src/Umbraco.Core/Umbraco.Core.csproj src/Umbraco.Web.UI/config/trees.config src/Umbraco.Web/Umbraco.Web.csproj --- .../RemoveCachedRecycleMediaXml.cs | 31 --------------- src/Umbraco.Core/Umbraco.Core.csproj | 1 - .../RebuildMediaXmlCacheAfterUpgrade.cs | 38 +++++++++++++++++++ src/Umbraco.Web/Umbraco.Web.csproj | 1 + 4 files changed, 39 insertions(+), 32 deletions(-) delete mode 100644 src/Umbraco.Core/Persistence/Migrations/Upgrades/TargetVersionSixTwoZero/RemoveCachedRecycleMediaXml.cs create mode 100644 src/Umbraco.Web/Strategies/Migrations/RebuildMediaXmlCacheAfterUpgrade.cs diff --git a/src/Umbraco.Core/Persistence/Migrations/Upgrades/TargetVersionSixTwoZero/RemoveCachedRecycleMediaXml.cs b/src/Umbraco.Core/Persistence/Migrations/Upgrades/TargetVersionSixTwoZero/RemoveCachedRecycleMediaXml.cs deleted file mode 100644 index d0945db957..0000000000 --- a/src/Umbraco.Core/Persistence/Migrations/Upgrades/TargetVersionSixTwoZero/RemoveCachedRecycleMediaXml.cs +++ /dev/null @@ -1,31 +0,0 @@ -using Umbraco.Core.Configuration; -using Umbraco.Core.Services; - -namespace Umbraco.Core.Persistence.Migrations.Upgrades.TargetVersionSixTwoZero -{ - ///

- /// Due to this bug: http://issues.umbraco.org/issue/U4-3820 we need to remove the cached media - /// xml found in the cmsContentXml table for any media that has been recycled. - /// - [Migration("6.2.0", 1, GlobalSettings.UmbracoMigrationName)] - public class RemoveCachedRecycleMediaXml : MigrationBase - { - public override void Up() - { - //now that the controlId column is renamed and now a string we need to convert - if (Context == null || Context.Database == null) return; - - Execute.Code(database => - { - var mediasvc = (MediaService)ApplicationContext.Current.Services.MediaService; - mediasvc.RebuildXmlStructures(); - - return string.Empty; - }); - } - - public override void Down() - { - } - } -} \ No newline at end of file diff --git a/src/Umbraco.Core/Umbraco.Core.csproj b/src/Umbraco.Core/Umbraco.Core.csproj index d9ff8081c2..021ae04633 100644 --- a/src/Umbraco.Core/Umbraco.Core.csproj +++ b/src/Umbraco.Core/Umbraco.Core.csproj @@ -189,7 +189,6 @@ - diff --git a/src/Umbraco.Web/Strategies/Migrations/RebuildMediaXmlCacheAfterUpgrade.cs b/src/Umbraco.Web/Strategies/Migrations/RebuildMediaXmlCacheAfterUpgrade.cs new file mode 100644 index 0000000000..72cf6c24ea --- /dev/null +++ b/src/Umbraco.Web/Strategies/Migrations/RebuildMediaXmlCacheAfterUpgrade.cs @@ -0,0 +1,38 @@ +using System; +using Umbraco.Core; +using Umbraco.Core.Persistence.Migrations; +using Umbraco.Core.Services; +using umbraco.interfaces; + +namespace Umbraco.Web.Strategies.Migrations +{ + /// + /// This will execute after upgrading to rebuild the xml cache + /// + /// + /// This cannot execute as part of a db migration since we need access to the services/repos. + /// + /// This will execute for specific versions - + /// + /// * If current is less than or equal to 7.0.0 + /// + public class RebuildMediaXmlCacheAfterUpgrade : IApplicationStartupHandler + { + public RebuildMediaXmlCacheAfterUpgrade() + { + MigrationRunner.Migrated += MigrationRunner_Migrated; + } + + void MigrationRunner_Migrated(MigrationRunner sender, Core.Events.MigrationEventArgs e) + { + var target70 = new Version(7, 0, 0); + + if (e.ConfiguredVersion <= target70) + { + var mediasvc = (MediaService)ApplicationContext.Current.Services.MediaService; + mediasvc.RebuildXmlStructures(); + } + + } + } +} \ No newline at end of file diff --git a/src/Umbraco.Web/Umbraco.Web.csproj b/src/Umbraco.Web/Umbraco.Web.csproj index c9a8e14740..42379fb990 100644 --- a/src/Umbraco.Web/Umbraco.Web.csproj +++ b/src/Umbraco.Web/Umbraco.Web.csproj @@ -389,6 +389,7 @@ + From 5aec75385947b7a1e85f3d705dfc3781b52781be Mon Sep 17 00:00:00 2001 From: Stephan Date: Fri, 13 Dec 2013 12:06:56 +0100 Subject: [PATCH 3/3] U4-3710, -3732 - Fix default ShortStringHelper --- src/Umbraco.Core/CoreBootManager.cs | 13 +- src/Umbraco.Core/StringExtensions.cs | 8 +- src/Umbraco.Core/Strings/CleanStringType.cs | 27 +- .../Strings/DefaultShortStringHelper.cs | 700 ++++++++---------- .../Strings/Utf8ToAsciiConverter.cs | 7 +- .../DefaultShortStringHelperTests.cs | 583 ++++++++++----- 6 files changed, 734 insertions(+), 604 deletions(-) diff --git a/src/Umbraco.Core/CoreBootManager.cs b/src/Umbraco.Core/CoreBootManager.cs index 91ee5db40f..7b8b0b5b9d 100644 --- a/src/Umbraco.Core/CoreBootManager.cs +++ b/src/Umbraco.Core/CoreBootManager.cs @@ -266,17 +266,10 @@ namespace Umbraco.Core PropertyValueConvertersResolver.Current = new PropertyValueConvertersResolver( PluginManager.Current.ResolveTypes()); - // use the new DefaultShortStringHelper but sort-of remain compatible - // - use UmbracoSettings UrlReplaceCharacters - // - allow underscores in terms, allow leading digits + // use the new DefaultShortStringHelper ShortStringHelperResolver.Current = new ShortStringHelperResolver( - new DefaultShortStringHelper() - .WithConfig(CleanStringType.Url, DefaultShortStringHelper.ApplyUrlReplaceCharacters, - allowUnderscoreInTerm: true, allowLeadingDigits: true)); - - // that was the old one - //ShortStringHelperResolver.Current = new ShortStringHelperResolver( - // new LegacyShortStringHelper()); + //new LegacyShortStringHelper()); + new DefaultShortStringHelper().WithDefaultConfig()); UrlSegmentProviderResolver.Current = new UrlSegmentProviderResolver( typeof (DefaultUrlSegmentProvider)); diff --git a/src/Umbraco.Core/StringExtensions.cs b/src/Umbraco.Core/StringExtensions.cs index 4c3cfbeba5..8f4c7a57b1 100644 --- a/src/Umbraco.Core/StringExtensions.cs +++ b/src/Umbraco.Core/StringExtensions.cs @@ -801,9 +801,11 @@ namespace Umbraco.Core if (_helper != null) return _helper; - // there *has* to be a short string helper, even if the resolver has not - // been initialized - used the default one with default configuration. - _helper = new DefaultShortStringHelper().WithConfig(allowLeadingDigits: true); + // we don't want Umbraco to die because the resolver hasn't been initialized + // as the ShortStringHelper is too important, so as long as it's not there + // already, we use a default one. That should never happen, but... + Logging.LogHelper.Warn("ShortStringHelperResolver.HasCurrent == false, fallback to default."); + _helper = new DefaultShortStringHelper().WithDefaultConfig(); _helper.Freeze(); return _helper; } diff --git a/src/Umbraco.Core/Strings/CleanStringType.cs b/src/Umbraco.Core/Strings/CleanStringType.cs index 28a801aa54..f681c42d4a 100644 --- a/src/Umbraco.Core/Strings/CleanStringType.cs +++ b/src/Umbraco.Core/Strings/CleanStringType.cs @@ -14,6 +14,9 @@ namespace Umbraco.Core.Strings // note: you have 32 bits at your disposal // 0xffffffff + + // masks + /// /// Flag mask for casing. /// @@ -27,13 +30,19 @@ namespace Umbraco.Core.Strings /// /// Flag mask for role. /// - RoleMask = 0x030000, // 0xff0000 - 8 possible values + RoleMask = 0x070000, // 0xff0000 - 8 possible values + + + // no value /// /// No value. /// None = 0x00, + + // casing values + /// /// Pascal casing eg "PascalCase". /// @@ -66,9 +75,13 @@ namespace Umbraco.Core.Strings /// and is pascal otherwise. UmbracoCase = 0x20, + + // encoding values + /// /// Unicode encoding. /// + [Obsolete("Use .Utf8 instead.")] Unicode = 0x0100, /// @@ -81,14 +94,22 @@ namespace Umbraco.Core.Strings /// Ascii = 0x0400, + + // role values + /// /// Url role. /// - Url = 0x010000, + UrlSegment = 0x010000, /// /// Alias role. /// - Alias = 0x020000 + Alias = 0x020000, + + /// + /// FileName role. + /// + FileName = 0x040000 } } diff --git a/src/Umbraco.Core/Strings/DefaultShortStringHelper.cs b/src/Umbraco.Core/Strings/DefaultShortStringHelper.cs index c3845e7318..bb85984d0d 100644 --- a/src/Umbraco.Core/Strings/DefaultShortStringHelper.cs +++ b/src/Umbraco.Core/Strings/DefaultShortStringHelper.cs @@ -1,5 +1,6 @@ using System; using System.Collections.Generic; +using System.Diagnostics; using System.IO; using System.Linq; using System.Globalization; @@ -13,14 +14,13 @@ namespace Umbraco.Core.Strings /// /// Not optimized to work on large bodies of text. /// Meant to replace LegacyShortStringHelper where/when backward compatibility is not an issue. - /// Full-unicode support is probably not so good. /// NOTE: pre-filters run _before_ the string is re-encoded. /// public class DefaultShortStringHelper : IShortStringHelper { #region Ctor and vars - static DefaultShortStringHelper() + public DefaultShortStringHelper() { InitializeLegacyUrlReplaceCharacters(); } @@ -41,7 +41,7 @@ namespace Umbraco.Core.Strings private CultureInfo _defaultCulture = CultureInfo.InvariantCulture; private bool _frozen; - private readonly Dictionary> _configs = new Dictionary>(); + private readonly Dictionary> _configs = new Dictionary>(); // see notes for CleanAsciiString //static DefaultShortStringHelper() @@ -51,11 +51,11 @@ namespace Umbraco.Core.Strings #endregion - #region Legacy UrlReplaceCharacters + #region Filters - static readonly Dictionary UrlReplaceCharacters = new Dictionary(); + private readonly Dictionary _urlReplaceCharacters = new Dictionary(); - static void InitializeLegacyUrlReplaceCharacters() + private void InitializeLegacyUrlReplaceCharacters() { var replaceChars = UmbracoSettings.UrlReplaceCharacters; if (replaceChars == null) return; @@ -67,7 +67,7 @@ namespace Umbraco.Core.Strings if (attributes == null) continue; var org = attributes.GetNamedItem("org"); if (org != null && org.Value != "") - UrlReplaceCharacters[org.Value] = XmlHelper.GetNodeValue(node); + _urlReplaceCharacters[org.Value] = XmlHelper.GetNodeValue(node); } } @@ -76,9 +76,21 @@ namespace Umbraco.Core.Strings /// /// The string to filter. /// The filtered string. - public static string ApplyUrlReplaceCharacters(string s) + public string ApplyUrlReplaceCharacters(string s) { - return s.ReplaceMany(UrlReplaceCharacters); + return s.ReplaceMany(_urlReplaceCharacters); + } + + // ok to be static here because it's not configureable in any way + private static readonly char[] InvalidFileNameChars = + Path.GetInvalidFileNameChars() + .Union("!*'();:@&=+$,/?%#[]-~{}\"<>\\^`| ".ToCharArray()) + .Distinct() + .ToArray(); + + public static bool IsValidFileNameChar(char c) + { + return InvalidFileNameChars.Contains(c) == false; } #endregion @@ -91,6 +103,11 @@ namespace Umbraco.Core.Strings throw new InvalidOperationException("Cannot configure the helper once it is frozen."); } + /// + /// Sets a default culture. + /// + /// The default culture. + /// The short string helper. public DefaultShortStringHelper WithDefaultCulture(CultureInfo culture) { EnsureNotFrozen(); @@ -98,75 +115,131 @@ namespace Umbraco.Core.Strings return this; } - public DefaultShortStringHelper WithConfig( - Func preFilter = null, - bool breakTermsOnUpper = true, bool allowLeadingDigits = false, bool allowUnderscoreInTerm = false) + public DefaultShortStringHelper WithConfig(Config config) { - return WithConfig(_defaultCulture, CleanStringType.RoleMask, - preFilter, breakTermsOnUpper, allowLeadingDigits, allowUnderscoreInTerm); + return WithConfig(_defaultCulture, CleanStringType.RoleMask, config); } - public DefaultShortStringHelper WithConfig(CleanStringType stringRole, - Func preFilter = null, - bool breakTermsOnUpper = true, bool allowLeadingDigits = false, bool allowUnderscoreInTerm = false) + public DefaultShortStringHelper WithConfig(CleanStringType stringRole, Config config) { - return WithConfig(_defaultCulture, stringRole, - preFilter, breakTermsOnUpper, allowLeadingDigits, allowUnderscoreInTerm); + return WithConfig(_defaultCulture, stringRole, config); } - public DefaultShortStringHelper WithConfig(CultureInfo culture, CleanStringType stringRole, - Func preFilter = null, - bool breakTermsOnUpper = true, bool allowLeadingDigits = false, bool allowUnderscoreInTerm = false) + public DefaultShortStringHelper WithConfig(CultureInfo culture, CleanStringType stringRole, Config config) { + if (config == null) + throw new ArgumentNullException("config"); + EnsureNotFrozen(); if (_configs.ContainsKey(culture) == false) - _configs[culture] = new Dictionary(); - _configs[culture][stringRole] = new HelperConfig(preFilter, breakTermsOnUpper, allowLeadingDigits, allowUnderscoreInTerm); + _configs[culture] = new Dictionary(); + _configs[culture][stringRole] = config.Clone(); // clone so it can't be changed return this; } - internal sealed class HelperConfig + /// + /// Sets the default configuration. + /// + /// The short string helper. + public DefaultShortStringHelper WithDefaultConfig() { - private HelperConfig() + return WithConfig(CleanStringType.UrlSegment, new Config { + PreFilter = ApplyUrlReplaceCharacters, + IsTerm = (c, leading) => char.IsLetterOrDigit(c) || c == '_', // letter, digit or underscore + StringType = CleanStringType.Utf8 | CleanStringType.LowerCase, + BreakTermsOnUpper = false, + Separator = '-' + }).WithConfig(CleanStringType.FileName, new Config + { + PreFilter = ApplyUrlReplaceCharacters, + IsTerm = (c, leading) => char.IsLetterOrDigit(c) || c == '_', // letter, digit or underscore + StringType = CleanStringType.Utf8 | CleanStringType.LowerCase, + BreakTermsOnUpper = false, + Separator = '-' + }).WithConfig(CleanStringType.Alias, new Config + { + PreFilter = ApplyUrlReplaceCharacters, + IsTerm = (c, leading) => leading + ? char.IsLetter(c) // only letters + : (char.IsLetterOrDigit(c) || c == '_'), // letter, digit or underscore + StringType = CleanStringType.Ascii | CleanStringType.UmbracoCase, + BreakTermsOnUpper = false + }); + } + + public sealed class Config + { + public Config() + { + StringType = CleanStringType.Utf8 | CleanStringType.Unchanged; PreFilter = null; - BreakTermsOnUpper = true; - AllowLeadingDigits = false; + IsTerm = (c, leading) => leading ? char.IsLetter(c) : char.IsLetterOrDigit(c); + BreakTermsOnUpper = false; + CutAcronymOnNonUpper = false; + GreedyAcronyms = false; + Separator = Char.MinValue; } - public HelperConfig(Func preFilter, bool breakTermsOnUpper, bool allowLeadingDigits, bool allowUnderscoreInTerm) - : this() + public Config Clone() { - PreFilter = preFilter; - BreakTermsOnUpper = breakTermsOnUpper; - AllowLeadingDigits = allowLeadingDigits; - AllowUnderscoreInTerm = allowUnderscoreInTerm; + return new Config + { + PreFilter = PreFilter, + IsTerm = IsTerm, + StringType = StringType, + BreakTermsOnUpper = BreakTermsOnUpper, + CutAcronymOnNonUpper = CutAcronymOnNonUpper, + GreedyAcronyms = GreedyAcronyms, + Separator = Separator + }; } - public Func PreFilter { get; private set; } + public Func PreFilter { get; set; } + public Func IsTerm { get; set; } + + public CleanStringType StringType { get; set; } // indicate whether an uppercase within a term eg "fooBar" is to break // into a new term, or to be considered as part of the current term - public bool BreakTermsOnUpper { get; private set; } + public bool BreakTermsOnUpper { get; set; } - // indicates whether it is legal to have leading digits, or whether they - // should be stripped as any other illegal character - public bool AllowLeadingDigits { get; private set; } - - // indicates whether underscore is a valid character in a term or is - // to be considered as a separator - public bool AllowUnderscoreInTerm { get; private set; } + // indicate whether a non-uppercase within an acronym eg "FOOBar" is to cut + // the acronym (at "B" or "a" depending on GreedyAcronyms) or to give + // up the acronym and treat the term as a word + public bool CutAcronymOnNonUpper { get; set; } // indicates whether acronyms parsing is greedy ie whether "FOObar" is // "FOO" + "bar" (greedy) or "FO" + "Obar" (non-greedy) - public bool GreedyAcronyms { get { return false; } } + public bool GreedyAcronyms { get; set; } - public static readonly HelperConfig Empty = new HelperConfig(); + // the separator char + // but then how can we tell we dont want any? + public char Separator { get; set; } + + // extends the config + public CleanStringType StringTypeExtend(CleanStringType stringType) + { + var st = StringType; + foreach (var mask in new[] { CleanStringType.CaseMask, CleanStringType.CodeMask }) + { + var a = stringType & mask; + if (a == 0) continue; + + st = st & ~mask; // clear what we have + st = st | a; // set the new value + } + return st; + } + + internal static readonly Config NotConfigured = new Config(); } - private HelperConfig GetConfig(CleanStringType stringType, CultureInfo culture) + private Config GetConfig(CleanStringType stringType, CultureInfo culture) { - Dictionary config; + stringType = stringType & CleanStringType.RoleMask; + + Dictionary config; if (_configs.ContainsKey(culture)) { config = _configs[culture]; @@ -184,7 +257,7 @@ namespace Umbraco.Core.Strings return config[CleanStringType.RoleMask]; } - return HelperConfig.Empty; + return Config.NotConfigured; } #endregion @@ -252,7 +325,7 @@ function validateSafeAlias(id, value, immediate, callback) {{ /// public virtual string CleanStringForSafeAlias(string text) { - return CleanString(text, CleanStringType.Ascii | CleanStringType.UmbracoCase | CleanStringType.Alias); + return CleanStringForSafeAlias(text, _defaultCulture); } /// @@ -266,7 +339,7 @@ function validateSafeAlias(id, value, immediate, callback) {{ /// public virtual string CleanStringForSafeAlias(string text, CultureInfo culture) { - return CleanString(text, CleanStringType.Ascii | CleanStringType.UmbracoCase | CleanStringType.Alias, culture); + return CleanString(text, CleanStringType.Alias, culture); } /// @@ -280,7 +353,7 @@ function validateSafeAlias(id, value, immediate, callback) {{ /// public virtual string CleanStringForUrlSegment(string text) { - return CleanString(text, CleanStringType.Ascii | CleanStringType.LowerCase | CleanStringType.Url, '-'); + return CleanStringForUrlSegment(text, _defaultCulture); } /// @@ -294,11 +367,11 @@ function validateSafeAlias(id, value, immediate, callback) {{ /// public virtual string CleanStringForUrlSegment(string text, CultureInfo culture) { - return CleanString(text, CleanStringType.Ascii | CleanStringType.LowerCase | CleanStringType.Url, '-', culture); + return CleanString(text, CleanStringType.UrlSegment, culture); } /// - /// Cleans a string, in the context of the invariant culture, to produce a string that can safely be used as a filename, + /// Cleans a string, in the context of the default culture, to produce a string that can safely be used as a filename, /// both internally (on disk) and externally (as a url). /// /// The text to filter. @@ -306,23 +379,11 @@ function validateSafeAlias(id, value, immediate, callback) {{ /// Legacy says this was used to "overcome an issue when Umbraco is used in IE in an intranet environment" but that issue is not documented. public virtual string CleanStringForSafeFileName(string text) { - if (string.IsNullOrWhiteSpace(text)) - return string.Empty; - - text = text.ReplaceMany(Path.GetInvalidFileNameChars(), '-'); - - var pos = text.LastIndexOf('.'); - var name = pos < 0 ? text : text.Substring(0, pos); - var ext = pos < 0 ? string.Empty : text.Substring(pos + 1); - - name = CleanString(name, CleanStringType.Ascii | CleanStringType.Alias | CleanStringType.LowerCase, '-'); - ext = CleanString(ext, CleanStringType.Ascii | CleanStringType.Alias | CleanStringType.LowerCase, '-'); - - return pos < 0 ? name : (name + "." + ext); + return CleanStringForSafeFileName(text, _defaultCulture); } /// - /// Cleans a string, in the context of the invariant culture, to produce a string that can safely be used as a filename, + /// Cleans a string to produce a string that can safely be used as a filename, /// both internally (on disk) and externally (as a url). /// /// The text to filter. @@ -335,14 +396,17 @@ function validateSafeAlias(id, value, immediate, callback) {{ text = text.ReplaceMany(Path.GetInvalidFileNameChars(), '-'); - var pos = text.LastIndexOf('.'); - var name = pos < 0 ? text : text.Substring(0, pos); - var ext = pos < 0 ? string.Empty : text.Substring(pos + 1); + var name = Path.GetFileNameWithoutExtension(text); + var ext = Path.GetExtension(text); // includes the dot, empty if no extension - name = CleanString(name, CleanStringType.Ascii | CleanStringType.Alias | CleanStringType.LowerCase, '-', culture); - ext = CleanString(ext, CleanStringType.Ascii | CleanStringType.Alias | CleanStringType.LowerCase, '-', culture); + Debug.Assert(name != null, "name != null"); + if (name.Length > 0) + name = CleanString(name, CleanStringType.FileName, culture); + Debug.Assert(ext != null, "ext != null"); + if (ext.Length > 0) + ext = CleanString(ext.Substring(1), CleanStringType.FileName, culture); - return pos < 0 ? name : (name + "." + ext); + return ext.Length > 0 ? (name + "." + ext) : name; } #endregion @@ -351,7 +415,7 @@ function validateSafeAlias(id, value, immediate, callback) {{ // MS rules & guidelines: // - Do capitalize both characters of two-character acronyms, except the first word of a camel-cased identifier. - // eg "DBRate" (pascal) or "ioHelper" (camel) - "specialDBRate" (pascal) or "specialIOHelper" (camel) + // eg "DBRate" (pascal) or "ioHelper" (camel) - "SpecialDBRate" (pascal) or "specialIOHelper" (camel) // - Do capitalize only the first character of acronyms with three or more characters, except the first word of a camel-cased identifier. // eg "XmlWriter (pascal) or "htmlReader" (camel) - "SpecialXmlWriter" (pascal) or "specialHtmlReader" (camel) // - Do not capitalize any of the characters of any acronyms, whatever their length, at the beginning of a camel-cased identifier. @@ -376,7 +440,7 @@ function validateSafeAlias(id, value, immediate, callback) {{ /// The string is cleaned in the context of the default culture. public string CleanString(string text, CleanStringType stringType) { - return CleanString(text, stringType, char.MinValue, _defaultCulture); + return CleanString(text, stringType, _defaultCulture, null); } /// @@ -390,7 +454,7 @@ function validateSafeAlias(id, value, immediate, callback) {{ /// The string is cleaned in the context of the default culture. public string CleanString(string text, CleanStringType stringType, char separator) { - return CleanString(text, stringType, separator, _defaultCulture); + return CleanString(text, stringType, _defaultCulture, separator); } /// @@ -403,7 +467,7 @@ function validateSafeAlias(id, value, immediate, callback) {{ /// The clean string. public string CleanString(string text, CleanStringType stringType, CultureInfo culture) { - return CleanString(text, stringType, char.MinValue, culture); + return CleanString(text, stringType, culture, null); } /// @@ -415,23 +479,12 @@ function validateSafeAlias(id, value, immediate, callback) {{ /// The separator. /// The culture. /// The clean string. - public virtual string CleanString(string text, CleanStringType stringType, char separator, CultureInfo culture) + public string CleanString(string text, CleanStringType stringType, char separator, CultureInfo culture) { - var config = GetConfig(stringType & CleanStringType.RoleMask, culture); - return CleanString(text, stringType, separator, culture, config); + return CleanString(text, stringType, culture, separator); } - /// - /// Cleans a string in the context of a specified culture, using a specified separator and configuration. - /// - /// The text to clean. - /// A flag indicating the target casing and encoding of the string. By default, - /// strings are cleaned up to camelCase and Ascii. - /// The separator. - /// The culture. - /// The configuration. - /// The clean string. - private string CleanString(string text, CleanStringType stringType, char separator, CultureInfo culture, HelperConfig config) + protected virtual string CleanString(string text, CleanStringType stringType, CultureInfo culture, char? separator) { // be safe if (text == null) @@ -439,13 +492,18 @@ function validateSafeAlias(id, value, immediate, callback) {{ if (culture == null) throw new ArgumentNullException("culture"); + // get config + var config = GetConfig(stringType, culture); + stringType = config.StringTypeExtend(stringType); + // apply defaults if ((stringType & CleanStringType.CaseMask) == CleanStringType.None) stringType |= CleanStringType.CamelCase; if ((stringType & CleanStringType.CodeMask) == CleanStringType.None) stringType |= CleanStringType.Ascii; - var codeType = stringType & CleanStringType.CodeMask; + // use configured unless specified + separator = separator ?? config.Separator; // apply pre-filter if (config.PreFilter != null) @@ -456,231 +514,46 @@ function validateSafeAlias(id, value, immediate, callback) {{ // text = ReplaceMany(text, config.Replacements); // recode - text = Recode(text, stringType); + var codeType = stringType & CleanStringType.CodeMask; + text = codeType == CleanStringType.Ascii + ? Utf8ToAsciiConverter.ToAsciiString(text) + : RemoveSurrogatePairs(text); // clean - switch (codeType) - { - case CleanStringType.Ascii: - // see note below - don't use CleanAsciiString - //text = CleanAsciiString(text, stringType, separator); - //break; - case CleanStringType.Utf8: - text = CleanUtf8String(text, stringType, separator, culture, config); - break; - case CleanStringType.Unicode: - throw new NotImplementedException("DefaultShortStringHelper does not handle unicode yet."); - default: - throw new ArgumentOutOfRangeException("stringType"); - } + text = CleanCodeString(text, stringType, separator.Value, culture, config); return text; } - // however proud I can be of that subtle, ascii-optimized code, - // benchmarking shows it is an order of magnitude slower that the utf8 version - // don't use it - keep it here should anyone be tempted to micro-optimize again... - // - // beware, it has bugs that are fixed in CleanUtf8String but I'm not going to - // bugfix commented code.... - - /* - internal string CleanAsciiString(string text) + private static string RemoveSurrogatePairs(string text) { - return CleanAsciiString(text, CleanStringType.CamelCase, char.MinValue); - } + var input = text.ToCharArray(); + var output = new char[input.Length]; + var opos = 0; - internal string CleanAsciiString(string text, CleanStringType caseType, char separator) - { - int opos = 0, ipos = 0; - var state = StateBreak; - - caseType &= CleanStringType.CaseMask; - - //switch (caseType) - //{ - // case CleanStringType.LowerCase: - // input = text.ToLowerInvariant().ToCharArray(); - // break; - // case CleanStringType.UpperCase: - // input = text.ToUpperInvariant().ToCharArray(); - // break; - // default: - // input = text.ToCharArray(); - // break; - //} - // if we apply global ToUpper or ToLower to text here - // then we cannot break words on uppercase chars - var input = text; - - // because we shouldn't be adding any extra char - // it's faster to use an array than a StringBuilder - var ilen = input.Length; - var output = new char[ilen]; - - Func termFilter = null; - - for (var i = 0; i < ilen; i++) + for (var ipos = 0; ipos < input.Length; ipos++) { - var idx = ValidStringCharacters.IndexOf(input[i]); - - switch (state) + var c = input[ipos]; + if (char.IsSurrogate(c)) // ignore high surrogate { - case StateBreak: - if (idx >= 0 && (opos > 0 || idx < 26 || idx >= 36)) - { - ipos = i; - if (opos > 0 && separator != char.MinValue) - output[opos++] = separator; - state = idx < 36 ? StateWord : StateUp; - } - break; - - case StateWord: - if (idx < 0 || (_breakTermsOnUpper && idx >= 36)) - { - CopyAsciiTerm(input, ipos, output, ref opos, i - ipos, caseType, termFilter, false); - ipos = i; - state = idx < 0 ? StateBreak : StateUp; - if (state != StateBreak && separator != char.MinValue) - output[opos++] = separator; - } - break; - - case StateAcronym: - if (idx < 36) - { - CopyAsciiTerm(input, ipos, output, ref opos, i - ipos, caseType, termFilter, true); - ipos = i; - state = idx < 0 ? StateBreak : StateWord; - if (state != StateBreak && separator != char.MinValue) - output[opos++] = separator; - } - break; - - case StateUp: - if (idx >= 0) - { - state = idx < 36 ? StateWord : StateAcronym; - } - else - { - CopyAsciiTerm(input, ipos, output, ref opos, 1, caseType, termFilter, false); - state = StateBreak; - } - break; - - default: - throw new Exception("Invalid state."); + ipos++; // and skip low surrogate + output[opos++] = '?'; + } + else + { + output[opos++] = c; } - } - - //Console.WriteLine("xx: ({0}) {1}, {2}, {3}", state, input.Length, ipos, opos); - switch (state) - { - case StateBreak: - break; - - case StateWord: - CopyAsciiTerm(input, ipos, output, ref opos, input.Length - ipos, caseType, termFilter, false); - break; - - case StateAcronym: - case StateUp: - CopyAsciiTerm(input, ipos, output, ref opos, input.Length - ipos, caseType, termFilter, true); - break; - - default: - throw new Exception("Invalid state."); } return new string(output, 0, opos); } - internal void CopyAsciiTerm(string input, int ipos, char[] output, ref int opos, int len, - CleanStringType caseType, Func termFilter, bool isAcronym) - { - var term = input.Substring(ipos, len); - ipos = 0; + // here was a subtle, ascii-optimized version of the cleaning code, and I was + // very proud of it until benchmarking showed it was an order of magnitude slower + // that the utf8 version. Micro-optimizing sometimes isn't such a good idea. - if (termFilter != null) - { - term = termFilter(term); - len = term.Length; - } - - if (isAcronym) - { - if (caseType == CleanStringType.CamelCase && len <= 2 && opos > 0) - caseType = CleanStringType.Unchanged; - else if (caseType == CleanStringType.PascalCase && len <= 2) - caseType = CleanStringType.Unchanged; - } - - int idx; - switch (caseType) - { - //case CleanStringType.LowerCase: - //case CleanStringType.UpperCase: - case CleanStringType.Unchanged: - term.CopyTo(ipos, output, opos, len); - opos += len; - break; - - case CleanStringType.LowerCase: - for (var i = ipos; i < ipos + len; i++) - { - idx = ValidStringCharacters.IndexOf(term[i]); - output[opos++] = ValidStringCharacters[idx >= 36 ? idx - 36 : idx]; - } - break; - - case CleanStringType.UpperCase: - for (var i = ipos; i < ipos + len; i++) - { - idx = ValidStringCharacters.IndexOf(term[i]); - output[opos++] = ValidStringCharacters[idx < 26 ? idx + 36 : idx]; - } - break; - - case CleanStringType.CamelCase: - idx = ValidStringCharacters.IndexOf(term[ipos]); - if (opos == 0) - output[opos++] = ValidStringCharacters[idx >= 36 ? idx - 36 : idx]; - else - output[opos++] = ValidStringCharacters[idx < 26 ? idx + 36 : idx]; - for (var i = ipos + 1; i < ipos + len; i++) - { - idx = ValidStringCharacters.IndexOf(term[i]); - output[opos++] = ValidStringCharacters[idx >= 36 ? idx - 36 : idx]; - } - break; - - case CleanStringType.PascalCase: - idx = ValidStringCharacters.IndexOf(term[ipos]); - output[opos++] = ValidStringCharacters[idx < 26 ? idx + 36 : idx]; - for (var i = ipos + 1; i < ipos + len; i++) - { - idx = ValidStringCharacters.IndexOf(term[i]); - output[opos++] = ValidStringCharacters[idx >= 36 ? idx - 36 : idx]; - } - break; - - default: - throw new ArgumentOutOfRangeException("caseType"); - } - } - */ - - // that's the default code that will work for utf8 strings - // will not handle unicode, though - - internal string CleanUtf8String(string text) - { - return CleanUtf8String(text, CleanStringType.CamelCase, char.MinValue, _defaultCulture, HelperConfig.Empty); - } - - internal string CleanUtf8String(string text, CleanStringType caseType, char separator, CultureInfo culture, HelperConfig config) + // note: does NOT support surrogate pairs in text + internal string CleanCodeString(string text, CleanStringType caseType, char separator, CultureInfo culture, Config config) { int opos = 0, ipos = 0; var state = StateBreak; @@ -695,21 +568,28 @@ function validateSafeAlias(id, value, immediate, callback) {{ var ilen = input.Length; var output = new char[ilen * 2]; // twice the length should be OK in all cases - //var termFilter = config.TermFilter; - for (var i = 0; i < ilen; i++) { var c = input[i]; - var isDigit = char.IsDigit(c); + var isTerm = config.IsTerm(c, opos == 0); + + //var isDigit = char.IsDigit(c); var isUpper = char.IsUpper(c); // false for digits, symbols... - var isLower = char.IsLower(c); // false for digits, symbols... - var isUnder = config.AllowUnderscoreInTerm && c == '_'; - var isTerm = char.IsLetterOrDigit(c) || isUnder; + //var isLower = char.IsLower(c); // false for digits, symbols... + + // what should I do with surrogates? + // no idea, really, so they are not supported at the moment + var isPair = char.IsSurrogate(c); + if (isPair) + throw new NotSupportedException("Surrogate pairs are not supported."); switch (state) { + // within a break case StateBreak: - if (isTerm && (opos > 0 || (isUnder == false && (config.AllowLeadingDigits || isDigit == false)))) + // begin a new term if char is a term char, + // and ( pos > 0 or it's also a valid leading char ) + if (isTerm) { ipos = i; if (opos > 0 && separator != char.MinValue) @@ -718,10 +598,13 @@ function validateSafeAlias(id, value, immediate, callback) {{ } break; + // within a term / word case StateWord: + // end a term if char is not a term char, + // or ( it's uppercase and we break terms on uppercase) if (isTerm == false || (config.BreakTermsOnUpper && isUpper)) { - CopyUtf8Term(input, ipos, output, ref opos, i - ipos, caseType, culture, /*termFilter,*/ false); + CopyTerm(input, ipos, output, ref opos, i - ipos, caseType, culture, false); ipos = i; state = isTerm ? StateUp : StateBreak; if (state != StateBreak && separator != char.MinValue) @@ -729,27 +612,48 @@ function validateSafeAlias(id, value, immediate, callback) {{ } break; + // within a term / acronym case StateAcronym: - if (isTerm == false || isLower || isDigit) + // end an acronym if char is not a term char, + // or if it's not uppercase / config + //Console.WriteLine("acro {0} {1}", c, (config.CutAcronymOnNonUpper && isUpper == false)); + if (isTerm == false || (config.CutAcronymOnNonUpper && isUpper == false)) { - if (isLower && config.GreedyAcronyms == false) - i -= 1; - CopyUtf8Term(input, ipos, output, ref opos, i - ipos, caseType, culture, /*termFilter,*/ true); - ipos = i; - state = isTerm ? StateWord : StateBreak; - if (state != StateBreak && separator != char.MinValue) - output[opos++] = separator; + // whether it's part of the acronym depends on whether we're greedy + if (isTerm && config.GreedyAcronyms == false) + i -= 1; // handle that char again, in another state - not part of the acronym + if (i - ipos > 1) // single-char can't be an acronym + { + CopyTerm(input, ipos, output, ref opos, i - ipos, caseType, culture, true); + ipos = i; + state = isTerm ? StateWord : StateBreak; + if (state != StateBreak && separator != char.MinValue) + output[opos++] = separator; + } + else if (isTerm) + { + state = StateWord; + } + } + else if (isUpper == false) // isTerm == true + { + // it's a term char and we don't cut... + // keep moving forward as a word + state = StateWord; } break; + // within a term / uppercase = could be a word or an acronym case StateUp: if (isTerm) { + // add that char to the term and pick word or acronym state = isUpper ? StateAcronym : StateWord; } else { - CopyUtf8Term(input, ipos, output, ref opos, 1, caseType, culture, /*termFilter,*/ false); + // single char, copy then break + CopyTerm(input, ipos, output, ref opos, 1, caseType, culture, false); state = StateBreak; } break; @@ -765,12 +669,12 @@ function validateSafeAlias(id, value, immediate, callback) {{ break; case StateWord: - CopyUtf8Term(input, ipos, output, ref opos, input.Length - ipos, caseType, culture, /*termFilter,*/ false); + CopyTerm(input, ipos, output, ref opos, input.Length - ipos, caseType, culture, false); break; case StateAcronym: case StateUp: - CopyUtf8Term(input, ipos, output, ref opos, input.Length - ipos, caseType, culture, /*termFilter,*/ true); + CopyTerm(input, ipos, output, ref opos, input.Length - ipos, caseType, culture, true); break; default: @@ -780,17 +684,15 @@ function validateSafeAlias(id, value, immediate, callback) {{ return new string(output, 0, opos); } - internal void CopyUtf8Term(string input, int ipos, char[] output, ref int opos, int len, - CleanStringType caseType, CultureInfo culture, /*Func termFilter,*/ bool isAcronym) + // note: supports surrogate pairs in input string + internal void CopyTerm(string input, int ipos, char[] output, ref int opos, int len, + CleanStringType caseType, CultureInfo culture, bool isAcronym) { var term = input.Substring(ipos, len); - ipos = 0; - - //if (termFilter != null) - //{ - // term = termFilter(term); - // len = term.Length; - //} + //Console.WriteLine("TERM \"{0}\" {1} {2}", + // term, + // isAcronym ? "acronym" : "word", + // caseType); if (isAcronym) { @@ -800,48 +702,100 @@ function validateSafeAlias(id, value, immediate, callback) {{ caseType = CleanStringType.Unchanged; } + // note: MSDN seems to imply that ToUpper or ToLower preserve the length + // of the string, but that this behavior is not guaranteed and could change. + char c; + int i; + string s; switch (caseType) { //case CleanStringType.LowerCase: //case CleanStringType.UpperCase: case CleanStringType.Unchanged: - term.CopyTo(ipos, output, opos, len); + term.CopyTo(0, output, opos, len); opos += len; break; case CleanStringType.LowerCase: - term.ToLower(culture).CopyTo(ipos, output, opos, len); - opos += len; + term = term.ToLower(culture); + term.CopyTo(0, output, opos, term.Length); + opos += term.Length; break; case CleanStringType.UpperCase: - term.ToUpper(culture).CopyTo(ipos, output, opos, len); - opos += len; + term = term.ToUpper(culture); + term.CopyTo(0, output, opos, term.Length); + opos += term.Length; break; case CleanStringType.CamelCase: - c = term[ipos++]; - output[opos] = opos++ == 0 ? char.ToLower(c, culture) : char.ToUpper(c, culture); - if (len > 1) - term.ToLower(culture).CopyTo(ipos, output, opos, len - 1); - opos += len - 1; + c = term[0]; + i = 1; + if (char.IsSurrogate(c)) + { + s = term.Substring(ipos, 2); + s = opos == 0 ? s.ToLower(culture) : s.ToUpper(culture); + s.CopyTo(0, output, opos, s.Length); + opos += s.Length; + i++; // surrogate pair len is 2 + } + else + { + output[opos] = opos++ == 0 ? char.ToLower(c, culture) : char.ToUpper(c, culture); + } + if (len > i) + { + term = term.Substring(i).ToLower(culture); + term.CopyTo(0, output, opos, term.Length); + opos += term.Length; + } break; case CleanStringType.PascalCase: - c = term[ipos++]; - output[opos++] = char.ToUpper(c, culture); - if (len > 1) - term.ToLower(culture).CopyTo(ipos, output, opos, len - 1); - opos += len - 1; + c = term[0]; + i = 1; + if (char.IsSurrogate(c)) + { + s = term.Substring(ipos, 2); + s = s.ToUpper(culture); + s.CopyTo(0, output, opos, s.Length); + opos += s.Length; + i++; // surrogate pair len is 2 + } + else + { + output[opos++] = char.ToUpper(c, culture); + } + if (len > i) + { + term = term.Substring(i).ToLower(culture); + term.CopyTo(0, output, opos, term.Length); + opos += term.Length; + } break; case CleanStringType.UmbracoCase: - c = term[ipos++]; - output[opos] = opos++ == 0 ? c : char.ToUpper(c, culture); - if (len > 1) - term.CopyTo(ipos, output, opos, len - 1); - opos += len - 1; + c = term[0]; + i = 1; + if (char.IsSurrogate(c)) + { + s = term.Substring(ipos, 2); + s = opos == 0 ? s : s.ToUpper(culture); + s.CopyTo(0, output, opos, s.Length); + opos += s.Length; + i++; // surrogate pair len is 2 + } + else + { + output[opos] = opos++ == 0 ? c : char.ToUpper(c, culture); + } + if (len > i) + { + term = term.Substring(i); + term.CopyTo(0, output, opos, term.Length); + opos += term.Length; + } break; default: @@ -860,6 +814,7 @@ function validateSafeAlias(id, value, immediate, callback) {{ /// The separator, which defaults to a whitespace. /// The splitted text. /// Supports Utf8 and Ascii strings, not Unicode strings. + // NOTE does not support surrogates pairs at the moment public virtual string SplitPascalCasing(string text, char separator) { // be safe @@ -904,55 +859,6 @@ function validateSafeAlias(id, value, immediate, callback) {{ #endregion - #region Recode - - /// - /// Returns a new string containing only characters within the specified code type. - /// - /// The string to filter. - /// The string type. - /// The filtered string. - /// If is not Unicode then non-utf8 characters are - /// removed. If it is Ascii we try to do some intelligent replacement of accents, etc. - public virtual string Recode(string text, CleanStringType stringType) - { - // be safe - if (text == null) - throw new ArgumentNullException("text"); - - var codeType = stringType & CleanStringType.CodeMask; - - // unicode to utf8 or ascii: just remove the unicode chars - // utf8 to ascii: try to be clever and replace some chars - - // what's the point? - if (codeType == CleanStringType.Unicode) - return text; - - return codeType == CleanStringType.Utf8 - ? RemoveNonUtf8(text) - : Utf8ToAsciiConverter.ToAsciiString(text); - } - - private string RemoveNonUtf8(string text) - { - var len = text.Length; - var output = new char[len]; // we won't be adding chars - int opos = 0; - - for (var ipos = 0; ipos < len; ipos++) - { - var c = text[ipos]; - if (char.IsSurrogate(c)) - ipos++; - else - output[opos++] = c; - } - return new string(output, 0, opos); - } - - #endregion - #region ReplaceMany /// diff --git a/src/Umbraco.Core/Strings/Utf8ToAsciiConverter.cs b/src/Umbraco.Core/Strings/Utf8ToAsciiConverter.cs index f93c461fa3..23ac4e3931 100644 --- a/src/Umbraco.Core/Strings/Utf8ToAsciiConverter.cs +++ b/src/Umbraco.Core/Strings/Utf8ToAsciiConverter.cs @@ -72,8 +72,11 @@ namespace Umbraco.Core.Strings var opos = 0; for (var ipos = 0; ipos < input.Length; ipos++) - if (char.IsSurrogate(input[ipos])) - ipos++; + if (char.IsSurrogate(input[ipos])) // ignore high surrogate + { + ipos++; // and skip low surrogate + output[opos++] = '?'; + } else ToAscii(input, ipos, output, ref opos); diff --git a/src/Umbraco.Tests/CoreStrings/DefaultShortStringHelperTests.cs b/src/Umbraco.Tests/CoreStrings/DefaultShortStringHelperTests.cs index b9188730d7..09df7d0abf 100644 --- a/src/Umbraco.Tests/CoreStrings/DefaultShortStringHelperTests.cs +++ b/src/Umbraco.Tests/CoreStrings/DefaultShortStringHelperTests.cs @@ -1,6 +1,10 @@ -using System.Collections.Generic; +using System; +using System.Collections.Generic; +using System.Diagnostics; using System.Globalization; +using System.IO; using System.Linq; +using System.Text; using System.Text.RegularExpressions; using NUnit.Framework; using Umbraco.Core; @@ -26,10 +30,39 @@ namespace Umbraco.Tests.CoreStrings // so there still may be utf8 chars even though you want ascii _helper = new DefaultShortStringHelper() - .WithConfig(CleanStringType.Url, StripQuotes, allowLeadingDigits: true) - .WithConfig(new CultureInfo("fr-FR"), CleanStringType.Url, FilterFrenchElisions, allowLeadingDigits: true) - .WithConfig(CleanStringType.Alias, StripQuotes) - .WithConfig(new CultureInfo("fr-FR"), CleanStringType.Alias, WhiteQuotes); + .WithConfig(CleanStringType.FileName, new DefaultShortStringHelper.Config + { + //PreFilter = ClearFileChars, // done in IsTerm + IsTerm = (c, leading) => (char.IsLetterOrDigit(c) || c == '_') && DefaultShortStringHelper.IsValidFileNameChar(c), + StringType = CleanStringType.LowerCase | CleanStringType.Ascii, + Separator = '-' + }) + .WithConfig(CleanStringType.UrlSegment, new DefaultShortStringHelper.Config + { + PreFilter = StripQuotes, + IsTerm = (c, leading) => char.IsLetterOrDigit(c) || c == '_', + StringType = CleanStringType.LowerCase | CleanStringType.Ascii, + Separator = '-' + }) + .WithConfig(new CultureInfo("fr-FR"), CleanStringType.UrlSegment, new DefaultShortStringHelper.Config + { + PreFilter = FilterFrenchElisions, + IsTerm = (c, leading) => leading ? char.IsLetter(c) : (char.IsLetterOrDigit(c) || c == '_'), + StringType = CleanStringType.LowerCase | CleanStringType.Ascii, + Separator = '-' + }) + .WithConfig(CleanStringType.Alias, new DefaultShortStringHelper.Config + { + PreFilter = StripQuotes, + IsTerm = (c, leading) => leading ? char.IsLetter(c) : char.IsLetterOrDigit(c), + StringType = CleanStringType.UmbracoCase | CleanStringType.Ascii + }) + .WithConfig(new CultureInfo("fr-FR"), CleanStringType.Alias, new DefaultShortStringHelper.Config + { + PreFilter = WhiteQuotes, + IsTerm = (c, leading) => leading ? char.IsLetter(c) : char.IsLetterOrDigit(c), + StringType = CleanStringType.UmbracoCase | CleanStringType.Ascii + }); ShortStringHelperResolver.Reset(); ShortStringHelperResolver.Current = new ShortStringHelperResolver(_helper); @@ -61,6 +94,333 @@ namespace Umbraco.Tests.CoreStrings return s; } + [Test] + public void CleanStringUnderscoreInTerm() + { + var helper = new DefaultShortStringHelper() + .WithConfig(CleanStringType.Alias, new DefaultShortStringHelper.Config + { + // underscore is accepted within terms + IsTerm = (c, leading) => char.IsLetterOrDigit(c) || c == '_', + StringType = CleanStringType.Utf8 | CleanStringType.Unchanged, + Separator = '*' + }); + Assert.AreEqual("foo_bar*nil", helper.CleanString("foo_bar nil", CleanStringType.Alias)); + + helper = new DefaultShortStringHelper() + .WithConfig(CleanStringType.Alias, new DefaultShortStringHelper.Config + { + // underscore is not accepted within terms + IsTerm = (c, leading) => char.IsLetterOrDigit(c), + StringType = CleanStringType.Utf8 | CleanStringType.Unchanged, + Separator = '*' + }); + Assert.AreEqual("foo*bar*nil", helper.CleanString("foo_bar nil", CleanStringType.Alias)); + } + + [Test] + public void CleanStringLeadingChars() + { + var helper = new DefaultShortStringHelper() + .WithConfig(CleanStringType.Alias, new DefaultShortStringHelper.Config + { + // letters and digits are valid leading chars + IsTerm = (c, leading) => char.IsLetterOrDigit(c), + StringType = CleanStringType.Utf8 | CleanStringType.Unchanged, + Separator = '*' + }); + Assert.AreEqual("0123foo*bar*nil", helper.CleanString("0123foo_bar nil", CleanStringType.Alias)); + + helper = new DefaultShortStringHelper() + .WithConfig(CleanStringType.Alias, new DefaultShortStringHelper.Config + { + // only letters are valid leading chars + IsTerm = (c, leading) => leading ? char.IsLetter(c) : char.IsLetterOrDigit(c), + StringType = CleanStringType.Utf8 | CleanStringType.Unchanged, + Separator = '*' + }); + Assert.AreEqual("foo*bar*nil", helper.CleanString("0123foo_bar nil", CleanStringType.Alias)); + Assert.AreEqual("foo*bar*nil", helper.CleanString("0123 foo_bar nil", CleanStringType.Alias)); + } + + [Test] + public void CleanStringTermOnUpper() + { + var helper = new DefaultShortStringHelper() + .WithConfig(CleanStringType.Alias, new DefaultShortStringHelper.Config + { + StringType = CleanStringType.Utf8 | CleanStringType.Unchanged, + // uppercase letter means new term + BreakTermsOnUpper = true, + Separator = '*' + }); + Assert.AreEqual("foo*Bar", helper.CleanString("fooBar", CleanStringType.Alias)); + + helper = new DefaultShortStringHelper() + .WithConfig(CleanStringType.Alias, new DefaultShortStringHelper.Config + { + StringType = CleanStringType.Utf8 | CleanStringType.Unchanged, + // uppercase letter is part of term + BreakTermsOnUpper = false, + Separator = '*' + }); + Assert.AreEqual("fooBar", helper.CleanString("fooBar", CleanStringType.Alias)); + } + + [Test] + public void CleanStringAcronymOnNonUpper() + { + var helper = new DefaultShortStringHelper() + .WithConfig(CleanStringType.Alias, new DefaultShortStringHelper.Config + { + StringType = CleanStringType.Utf8 | CleanStringType.Unchanged, + // non-uppercase letter means cut acronym + CutAcronymOnNonUpper = true, + Separator = '*' + }); + Assert.AreEqual("foo*BAR*Rnil", helper.CleanString("foo BARRnil", CleanStringType.Alias)); + Assert.AreEqual("foo*BA*Rnil", helper.CleanString("foo BARnil", CleanStringType.Alias)); + Assert.AreEqual("foo*BAnil", helper.CleanString("foo BAnil", CleanStringType.Alias)); + Assert.AreEqual("foo*Bnil", helper.CleanString("foo Bnil", CleanStringType.Alias)); + + helper = new DefaultShortStringHelper() + .WithConfig(CleanStringType.Alias, new DefaultShortStringHelper.Config + { + StringType = CleanStringType.Utf8 | CleanStringType.Unchanged, + // non-uppercase letter means word + CutAcronymOnNonUpper = false, + Separator = '*' + }); + Assert.AreEqual("foo*BARRnil", helper.CleanString("foo BARRnil", CleanStringType.Alias)); + Assert.AreEqual("foo*BARnil", helper.CleanString("foo BARnil", CleanStringType.Alias)); + Assert.AreEqual("foo*BAnil", helper.CleanString("foo BAnil", CleanStringType.Alias)); + Assert.AreEqual("foo*Bnil", helper.CleanString("foo Bnil", CleanStringType.Alias)); + } + + [Test] + public void CleanStringGreedyAcronyms() + { + var helper = new DefaultShortStringHelper() + .WithConfig(CleanStringType.Alias, new DefaultShortStringHelper.Config + { + StringType = CleanStringType.Utf8 | CleanStringType.Unchanged, + CutAcronymOnNonUpper = true, + GreedyAcronyms = true, + Separator = '*' + }); + Assert.AreEqual("foo*BARR*nil", helper.CleanString("foo BARRnil", CleanStringType.Alias)); + Assert.AreEqual("foo*BAR*nil", helper.CleanString("foo BARnil", CleanStringType.Alias)); + Assert.AreEqual("foo*BA*nil", helper.CleanString("foo BAnil", CleanStringType.Alias)); + Assert.AreEqual("foo*Bnil", helper.CleanString("foo Bnil", CleanStringType.Alias)); + + helper = new DefaultShortStringHelper() + .WithConfig(CleanStringType.Alias, new DefaultShortStringHelper.Config + { + StringType = CleanStringType.Utf8 | CleanStringType.Unchanged, + CutAcronymOnNonUpper = true, + GreedyAcronyms = false, + Separator = '*' + }); + Assert.AreEqual("foo*BAR*Rnil", helper.CleanString("foo BARRnil", CleanStringType.Alias)); + Assert.AreEqual("foo*BA*Rnil", helper.CleanString("foo BARnil", CleanStringType.Alias)); + Assert.AreEqual("foo*BAnil", helper.CleanString("foo BAnil", CleanStringType.Alias)); + Assert.AreEqual("foo*Bnil", helper.CleanString("foo Bnil", CleanStringType.Alias)); + } + + [Test] + public void CleanStringWhiteSpace() + { + var helper = new DefaultShortStringHelper() + .WithConfig(CleanStringType.Alias, new DefaultShortStringHelper.Config + { + StringType = CleanStringType.Utf8 | CleanStringType.Unchanged, + Separator = '*' + }); + Assert.AreEqual("foo", helper.CleanString(" foo ", CleanStringType.Alias)); + Assert.AreEqual("foo*bar", helper.CleanString(" foo bar ", CleanStringType.Alias)); + } + + [Test] + public void CleanStringSeparator() + { + var helper = new DefaultShortStringHelper() + .WithConfig(CleanStringType.Alias, new DefaultShortStringHelper.Config + { + StringType = CleanStringType.Utf8 | CleanStringType.Unchanged, + Separator = '*' + }); + Assert.AreEqual("foo*bar", helper.CleanString("foo bar", CleanStringType.Alias)); + + helper = new DefaultShortStringHelper() + .WithConfig(CleanStringType.Alias, new DefaultShortStringHelper.Config + { + StringType = CleanStringType.Utf8 | CleanStringType.Unchanged, + Separator = ' ' + }); + Assert.AreEqual("foo bar", helper.CleanString("foo bar", CleanStringType.Alias)); + + helper = new DefaultShortStringHelper() + .WithConfig(CleanStringType.Alias, new DefaultShortStringHelper.Config + { + StringType = CleanStringType.Utf8 | CleanStringType.Unchanged + }); + Assert.AreEqual("foobar", helper.CleanString("foo bar", CleanStringType.Alias)); + + helper = new DefaultShortStringHelper() + .WithConfig(CleanStringType.Alias, new DefaultShortStringHelper.Config + { + StringType = CleanStringType.Utf8 | CleanStringType.Unchanged, + Separator = '文' + }); + Assert.AreEqual("foo文bar", helper.CleanString("foo bar", CleanStringType.Alias)); + } + + [Test] + public void CleanStringSymbols() + { + var helper = new DefaultShortStringHelper() + .WithConfig(CleanStringType.Alias, new DefaultShortStringHelper.Config + { + StringType = CleanStringType.Utf8 | CleanStringType.Unchanged, + Separator = '*' + }); + Assert.AreEqual("house*2", helper.CleanString("house (2)", CleanStringType.Alias)); + + // FIXME but for a filename we want to keep them! + // FIXME and what about a url? + } + + [Test] + public void Utf8Surrogates() + { + // Unicode values between 0x10000 and 0x10FFF are represented by two 16-bit "surrogate" characters + const string str = "a\U00010F00z\uA74Ft"; + Assert.AreEqual(6, str.Length); + Assert.IsTrue(char.IsSurrogate(str[1])); + Assert.IsTrue(char.IsHighSurrogate(str[1])); + Assert.IsTrue(char.IsSurrogate(str[2])); + Assert.IsTrue(char.IsLowSurrogate(str[2])); + Assert.AreEqual('z', str[3]); + Assert.IsFalse(char.IsSurrogate(str[4])); + Assert.AreEqual('\uA74F', str[4]); + Assert.AreEqual('t', str[5]); + + Assert.AreEqual("z", str.Substring(3, 1)); + Assert.AreEqual("\U00010F00", str.Substring(1, 2)); + + var bytes = Encoding.UTF8.GetBytes(str); + Assert.AreEqual(10, bytes.Length); + Assert.AreEqual('a', bytes[0]); + // then next string element is two chars (surrogate pair) or 4 bytes, 21 bits of code point + Assert.AreEqual('z', bytes[5]); + // then next string element is one char and 3 bytes, 16 bits of code point + Assert.AreEqual('t', bytes[9]); + //foreach (var b in bytes) + // Console.WriteLine("{0:X}", b); + + Console.WriteLine("\U00010B70"); + } + + [Test] + public void Utf8ToAsciiConverter() + { + const string str = "a\U00010F00z\uA74Ftéô"; + var output = Core.Strings.Utf8ToAsciiConverter.ToAsciiString(str); + Assert.AreEqual("a?zooteo", output); + } + + [Test] + public void CleanStringEncoding() + { + var helper = new DefaultShortStringHelper() + .WithConfig(CleanStringType.Alias, new DefaultShortStringHelper.Config + { + StringType = CleanStringType.Utf8 | CleanStringType.Unchanged, + Separator = '*' + }); + Assert.AreEqual("中文测试", helper.CleanString("中文测试", CleanStringType.Alias)); + Assert.AreEqual("léger*中文测试*ZÔRG", helper.CleanString("léger 中文测试 ZÔRG", CleanStringType.Alias)); + + helper = new DefaultShortStringHelper() + .WithConfig(CleanStringType.Alias, new DefaultShortStringHelper.Config + { + StringType = CleanStringType.Ascii | CleanStringType.Unchanged, + Separator = '*' + }); + Assert.AreEqual("", helper.CleanString("中文测试", CleanStringType.Alias)); + Assert.AreEqual("leger*ZORG", helper.CleanString("léger 中文测试 ZÔRG", CleanStringType.Alias)); + } + + [Test] + public void CleanStringDefaultConfig() + { + var helper = new DefaultShortStringHelper().WithDefaultConfig(); + + const string input = "0123 中文测试 中文测试 léger ZÔRG (2) a?? *x"; + + var alias = helper.CleanStringForSafeAlias(input); + var filename = helper.CleanStringForSafeFileName(input); + var segment = helper.CleanStringForUrlSegment(input); + + // umbraco-cased ascii alias, must begin with a proper letter + Assert.AreEqual("legerZORG2AX", alias, "alias"); + + // lower-cased, utf8 filename, removing illegal filename chars, using dash-separator + Assert.AreEqual("0123-中文测试-中文测试-léger-zôrg-2-a-x", filename, "filename"); + + // lower-cased, utf8 url segment, only letters and digits, using dash-separator + Assert.AreEqual("0123-中文测试-中文测试-léger-zôrg-2-a-x", segment, "segment"); + } + + [Test] + public void CleanStringCasing() + { + var helper = new DefaultShortStringHelper() + .WithConfig(CleanStringType.Alias, new DefaultShortStringHelper.Config + { + StringType = CleanStringType.Utf8 | CleanStringType.Unchanged, + Separator = ' ' + }); + + // BBB is an acronym + // E is a word (too short to be an acronym) + // FF is an acronym + + // FIXME "C" can't be an acronym + // FIXME "DBXreview" = acronym?! + + Assert.AreEqual("aaa BBB CCc Ddd E FF", helper.CleanString("aaa BBB CCc Ddd E FF", CleanStringType.Alias)); // unchanged + Assert.AreEqual("aaa Bbb Ccc Ddd E FF", helper.CleanString("aaa BBB CCc Ddd E FF", CleanStringType.Alias | CleanStringType.CamelCase)); + Assert.AreEqual("Aaa Bbb Ccc Ddd E FF", helper.CleanString("aaa BBB CCc Ddd E FF", CleanStringType.Alias | CleanStringType.PascalCase)); + Assert.AreEqual("aaa bbb ccc ddd e ff", helper.CleanString("aaa BBB CCc Ddd E FF", CleanStringType.Alias | CleanStringType.LowerCase)); + Assert.AreEqual("AAA BBB CCC DDD E FF", helper.CleanString("aaa BBB CCc Ddd E FF", CleanStringType.Alias | CleanStringType.UpperCase)); + Assert.AreEqual("aaa BBB CCc Ddd E FF", helper.CleanString("aaa BBB CCc Ddd E FF", CleanStringType.Alias | CleanStringType.UmbracoCase)); + + // MS rules & guidelines: + // - Do capitalize both characters of two-character acronyms, except the first word of a camel-cased identifier. + // eg "DBRate" (pascal) or "ioHelper" (camel) - "SpecialDBRate" (pascal) or "specialIOHelper" (camel) + // - Do capitalize only the first character of acronyms with three or more characters, except the first word of a camel-cased identifier. + // eg "XmlWriter (pascal) or "htmlReader" (camel) - "SpecialXmlWriter" (pascal) or "specialHtmlReader" (camel) + // - Do not capitalize any of the characters of any acronyms, whatever their length, at the beginning of a camel-cased identifier. + // eg "xmlWriter" or "dbWriter" (camel) + + Assert.AreEqual("aaa BB Ccc", helper.CleanString("aaa BB ccc", CleanStringType.Alias | CleanStringType.CamelCase)); + Assert.AreEqual("aa Bb Ccc", helper.CleanString("AA bb ccc", CleanStringType.Alias | CleanStringType.CamelCase)); + Assert.AreEqual("aaa Bb Ccc", helper.CleanString("AAA bb ccc", CleanStringType.Alias | CleanStringType.CamelCase)); + Assert.AreEqual("db Rate", helper.CleanString("DB rate", CleanStringType.Alias | CleanStringType.CamelCase)); + Assert.AreEqual("special DB Rate", helper.CleanString("special DB rate", CleanStringType.Alias | CleanStringType.CamelCase)); + Assert.AreEqual("xml Writer", helper.CleanString("XML writer", CleanStringType.Alias | CleanStringType.CamelCase)); + Assert.AreEqual("special Xml Writer", helper.CleanString("special XML writer", CleanStringType.Alias | CleanStringType.CamelCase)); + + Assert.AreEqual("Aaa BB Ccc", helper.CleanString("aaa BB ccc", CleanStringType.Alias | CleanStringType.PascalCase)); + Assert.AreEqual("AA Bb Ccc", helper.CleanString("AA bb ccc", CleanStringType.Alias | CleanStringType.PascalCase)); + Assert.AreEqual("Aaa Bb Ccc", helper.CleanString("AAA bb ccc", CleanStringType.Alias | CleanStringType.PascalCase)); + Assert.AreEqual("DB Rate", helper.CleanString("DB rate", CleanStringType.Alias | CleanStringType.PascalCase)); + Assert.AreEqual("Special DB Rate", helper.CleanString("special DB rate", CleanStringType.Alias | CleanStringType.PascalCase)); + Assert.AreEqual("Xml Writer", helper.CleanString("XML writer", CleanStringType.Alias | CleanStringType.PascalCase)); + Assert.AreEqual("Special Xml Writer", helper.CleanString("special XML writer", CleanStringType.Alias | CleanStringType.PascalCase)); + } + #region Cases [TestCase("foo", "foo")] [TestCase(" foo ", "foo")] @@ -100,29 +460,29 @@ namespace Umbraco.Tests.CoreStrings Assert.AreEqual(expected, output); } - #region Cases - [TestCase("This is my_little_house so cute.", "thisIsMyLittleHouseSoCute", false)] - [TestCase("This is my_little_house so cute.", "thisIsMy_little_houseSoCute", true)] - [TestCase("This is my_Little_House so cute.", "thisIsMyLittleHouseSoCute", false)] - [TestCase("This is my_Little_House so cute.", "thisIsMy_Little_HouseSoCute", true)] - [TestCase("An UPPER_CASE_TEST to check", "anUpperCaseTestToCheck", false)] - [TestCase("An UPPER_CASE_TEST to check", "anUpper_case_testToCheck", true)] - [TestCase("Trailing_", "trailing", false)] - [TestCase("Trailing_", "trailing_", true)] - [TestCase("_Leading", "leading", false)] - [TestCase("_Leading", "leading", true)] - [TestCase("Repeat___Repeat", "repeatRepeat", false)] - [TestCase("Repeat___Repeat", "repeat___Repeat", true)] - [TestCase("Repeat___repeat", "repeatRepeat", false)] - [TestCase("Repeat___repeat", "repeat___repeat", true)] - #endregion - public void CleanStringWithUnderscore(string input, string expected, bool allowUnderscoreInTerm) - { - var helper = new DefaultShortStringHelper() - .WithConfig(allowUnderscoreInTerm: allowUnderscoreInTerm); - var output = helper.CleanString(input, CleanStringType.Alias | CleanStringType.Ascii | CleanStringType.CamelCase); - Assert.AreEqual(expected, output); - } + //#region Cases + //[TestCase("This is my_little_house so cute.", "thisIsMyLittleHouseSoCute", false)] + //[TestCase("This is my_little_house so cute.", "thisIsMy_little_houseSoCute", true)] + //[TestCase("This is my_Little_House so cute.", "thisIsMyLittleHouseSoCute", false)] + //[TestCase("This is my_Little_House so cute.", "thisIsMy_Little_HouseSoCute", true)] + //[TestCase("An UPPER_CASE_TEST to check", "anUpperCaseTestToCheck", false)] + //[TestCase("An UPPER_CASE_TEST to check", "anUpper_case_testToCheck", true)] + //[TestCase("Trailing_", "trailing", false)] + //[TestCase("Trailing_", "trailing_", true)] + //[TestCase("_Leading", "leading", false)] + //[TestCase("_Leading", "leading", true)] + //[TestCase("Repeat___Repeat", "repeatRepeat", false)] + //[TestCase("Repeat___Repeat", "repeat___Repeat", true)] + //[TestCase("Repeat___repeat", "repeatRepeat", false)] + //[TestCase("Repeat___repeat", "repeat___repeat", true)] + //#endregion + //public void CleanStringWithUnderscore(string input, string expected, bool allowUnderscoreInTerm) + //{ + // var helper = new DefaultShortStringHelper() + // .WithConfig(allowUnderscoreInTerm: allowUnderscoreInTerm); + // var output = helper.CleanString(input, CleanStringType.Alias | CleanStringType.Ascii | CleanStringType.CamelCase); + // Assert.AreEqual(expected, output); + //} #region Cases [TestCase("Home Page", "home-page")] @@ -133,7 +493,6 @@ namespace Umbraco.Tests.CoreStrings [TestCase("汉#字*/漢?字", "")] [TestCase("Réalösk fix bran#lo'sk", "realosk-fix-bran-losk")] [TestCase("200 ways to be happy", "200-ways-to-be-happy")] - [TestCase("aBCdEfGhIJK", "a-b-cd-ef-gh-ijk")] #endregion public void CleanStringForUrlSegment(string input, string expected) { @@ -162,173 +521,19 @@ namespace Umbraco.Tests.CoreStrings } #region Cases - [TestCase("foo", "foo")] - [TestCase(" foo ", "foo")] - [TestCase("Foo", "foo")] - [TestCase("FoO", "foO")] - [TestCase("FoO bar", "foOBar")] - [TestCase("FoO bar NIL", "foOBarNil")] - [TestCase("FoO 33bar 22NIL", "foO33bar22Nil")] - [TestCase("FoO 33bar 22NI", "foO33bar22NI")] - [TestCase("0foo", "foo")] - [TestCase("2foo bar", "fooBar")] - [TestCase("9FOO", "foo")] - [TestCase("foo-BAR", "fooBar")] - [TestCase("foo-BA-dang", "fooBADang")] - [TestCase("foo_BAR", "fooBar")] - [TestCase("foo'BAR", "fooBar")] - [TestCase("sauté dans l'espace", "sautéDansLEspace")] - [TestCase("foo\"\"bar", "fooBar")] - [TestCase("-foo-", "foo")] - [TestCase("_foo_", "foo")] - [TestCase("spécial", "spécial")] - [TestCase("brô dëk ", "brôDëk")] - [TestCase("1235brô dëk ", "brôDëk")] - [TestCase("汉#字*/漢?字", "汉字漢字")] - [TestCase("aa DB cd EFG X KLMN OP qrst", "aaDBCdEfgXKlmnOPQrst")] - [TestCase("AA db cd EFG X KLMN OP qrst", "aaDbCdEfgXKlmnOPQrst")] - [TestCase("AAA db cd EFG X KLMN OP qrst", "aaaDbCdEfgXKlmnOPQrst")] - [TestCase("quelle élévation à partir", "quelleÉlévationÀPartir")] - #endregion - public void CleanUtf8String(string input, string expected) - { - input = _helper.Recode(input, CleanStringType.Utf8); - var output = _helper.CleanUtf8String(input); - Assert.AreEqual(expected, output); - } - - #region Cases - [TestCase("sauté dans l'espace", "saute-dans-espace", "fr-FR", CleanStringType.Url | CleanStringType.Ascii | CleanStringType.LowerCase)] - [TestCase("sauté dans l'espace", "sauté-dans-espace", "fr-FR", CleanStringType.Url | CleanStringType.Utf8 | CleanStringType.LowerCase)] + [TestCase("sauté dans l'espace", "saute-dans-espace", "fr-FR", CleanStringType.UrlSegment | CleanStringType.Ascii | CleanStringType.LowerCase)] + [TestCase("sauté dans l'espace", "sauté-dans-espace", "fr-FR", CleanStringType.UrlSegment | CleanStringType.Utf8 | CleanStringType.LowerCase)] [TestCase("sauté dans l'espace", "SauteDansLEspace", "fr-FR", CleanStringType.Alias | CleanStringType.Ascii | CleanStringType.PascalCase)] - [TestCase("he doesn't want", "he-doesnt-want", null, CleanStringType.Url | CleanStringType.Ascii | CleanStringType.LowerCase)] + [TestCase("he doesn't want", "he-doesnt-want", null, CleanStringType.UrlSegment | CleanStringType.Ascii | CleanStringType.LowerCase)] [TestCase("he doesn't want", "heDoesntWant", null, CleanStringType.Alias | CleanStringType.Ascii | CleanStringType.CamelCase)] #endregion public void CleanStringWithTypeAndCulture(string input, string expected, string culture, CleanStringType stringType) { var cinfo = culture == null ? CultureInfo.InvariantCulture : new CultureInfo(culture); - var separator = (stringType & CleanStringType.Url) == CleanStringType.Url ? '-' : char.MinValue; - var output = _helper.CleanString(input, stringType, separator, cinfo); - Assert.AreEqual(expected, output); - } - #region Cases - [TestCase("foo", "foo")] - [TestCase(" foo ", "foo")] - [TestCase("Foo", "foo")] - [TestCase("FoO", "foO")] - [TestCase("FoO bar", "foOBar")] - [TestCase("FoO bar NIL", "foOBarNil")] - [TestCase("FoO 33bar 22NIL", "foO33bar22Nil")] - [TestCase("FoO 33bar 22NI", "foO33bar22NI")] - [TestCase("0foo", "foo")] - [TestCase("2foo bar", "fooBar")] - [TestCase("9FOO", "foo")] - [TestCase("foo-BAR", "fooBar")] - [TestCase("foo-BA-dang", "fooBADang")] - [TestCase("foo_BAR", "fooBar")] - [TestCase("foo'BAR", "fooBar")] - [TestCase("sauté dans l'espace", "sauteDansLEspace")] - [TestCase("foo\"\"bar", "fooBar")] - [TestCase("-foo-", "foo")] - [TestCase("_foo_", "foo")] - [TestCase("spécial", "special")] - [TestCase("brô dëk ", "broDek")] - [TestCase("1235brô dëk ", "broDek")] - [TestCase("汉#字*/漢?字", "")] - [TestCase("aa DB cd EFG X KLMN OP qrst", "aaDBCdEfgXKlmnOPQrst")] - [TestCase("AA db cd EFG X KLMN OP qrst", "aaDbCdEfgXKlmnOPQrst")] - [TestCase("AAA db cd EFG X KLMN OP qrst", "aaaDbCdEfgXKlmnOPQrst")] - #endregion - public void CleanStringToAscii(string input, string expected) - { - var output = _helper.CleanString(input, CleanStringType.Ascii | CleanStringType.CamelCase); - Assert.AreEqual(expected, output); - } - - #region Cases - [TestCase("1235brô dëK tzARlan ban123!pOo", "brodeKtzARlanban123pOo", CleanStringType.Unchanged)] - [TestCase(" 1235brô dëK tzARlan ban123!pOo ", "brodeKtzARlanban123pOo", CleanStringType.Unchanged)] - [TestCase("1235brô dëK tzARlan ban123!pOo", "BroDeKTzARlanBan123POo", CleanStringType.PascalCase)] - [TestCase("1235brô dëK tzARlan ban123!pOo", "broDeKTzARlanBan123POo", CleanStringType.CamelCase)] - [TestCase("1235brô dëK tzARlan ban123!pOo", "BRODEKTZARLANBAN123POO", CleanStringType.UpperCase)] - [TestCase("1235brô dëK tzARlan ban123!pOo", "brodektzarlanban123poo", CleanStringType.LowerCase)] - [TestCase("aa DB cd EFG X KLMN OP qrst", "aaDBCdEfgXKlmnOPQrst", CleanStringType.CamelCase)] - [TestCase("aaa DB cd EFG X KLMN OP qrst", "aaaDBCdEfgXKlmnOPQrst", CleanStringType.CamelCase)] - [TestCase("aa DB cd EFG X KLMN OP qrst", "AaDBCdEfgXKlmnOPQrst", CleanStringType.PascalCase)] - [TestCase("aaa DB cd EFG X KLMN OP qrst", "AaaDBCdEfgXKlmnOPQrst", CleanStringType.PascalCase)] - [TestCase("AA db cd EFG X KLMN OP qrst", "aaDbCdEfgXKlmnOPQrst", CleanStringType.CamelCase)] - [TestCase("AAA db cd EFG X KLMN OP qrst", "aaaDbCdEfgXKlmnOPQrst", CleanStringType.CamelCase)] - [TestCase("AA db cd EFG X KLMN OP qrst", "AADbCdEfgXKlmnOPQrst", CleanStringType.PascalCase)] - [TestCase("AAA db cd EFG X KLMN OP qrst", "AaaDbCdEfgXKlmnOPQrst", CleanStringType.PascalCase)] - [TestCase("We store some HTML in the DB for performance", "WeStoreSomeHtmlInTheDBForPerformance", CleanStringType.PascalCase)] - [TestCase("We store some HTML in the DB for performance", "weStoreSomeHtmlInTheDBForPerformance", CleanStringType.CamelCase)] - [TestCase("X is true", "XIsTrue", CleanStringType.PascalCase)] - [TestCase("X is true", "xIsTrue", CleanStringType.CamelCase)] - [TestCase("IO are slow", "IOAreSlow", CleanStringType.PascalCase)] - [TestCase("IO are slow", "ioAreSlow", CleanStringType.CamelCase)] - [TestCase("RAM is fast", "RamIsFast", CleanStringType.PascalCase)] - [TestCase("RAM is fast", "ramIsFast", CleanStringType.CamelCase)] - [TestCase("Tab 1", "tab1", CleanStringType.CamelCase)] - [TestCase("Home - Page", "homePage", CleanStringType.CamelCase)] - [TestCase("Shannon's Document Type", "shannonSDocumentType", CleanStringType.CamelCase)] - [TestCase("Shannon's Document Type", "shannonsDocumentType", CleanStringType.CamelCase | CleanStringType.Alias)] - [TestCase("!BADDLY nam-ed Document Type", "baddlyNamEdDocumentType", CleanStringType.CamelCase)] - [TestCase(" !BADDLY nam-ed Document Type", "BADDLYnamedDocumentType", CleanStringType.Unchanged)] - [TestCase("!BADDLY nam-ed Document Type", "BaddlyNamEdDocumentType", CleanStringType.PascalCase)] - [TestCase("i %Want!thisTo end up In Proper@case", "IWantThisToEndUpInProperCase", CleanStringType.PascalCase)] - [TestCase("Räksmörgås %%$£¤¤¤§ kéKé", "raksmorgasKeKe", CleanStringType.CamelCase)] - [TestCase("Räksmörgås %%$£¤¤¤§ kéKé", "RaksmorgasKeKe", CleanStringType.PascalCase)] - [TestCase("Räksmörgås %%$£¤¤¤§ kéKé", "RaksmorgaskeKe", CleanStringType.Unchanged)] - [TestCase("TRii", "TRii", CleanStringType.Unchanged)] - [TestCase("**TRii", "TRii", CleanStringType.Unchanged)] - [TestCase("TRii", "tRii", CleanStringType.CamelCase)] - [TestCase("TRXii", "trXii", CleanStringType.CamelCase)] - [TestCase("**TRii", "tRii", CleanStringType.CamelCase)] - [TestCase("TRii", "TRii", CleanStringType.PascalCase)] - [TestCase("TRXii", "TRXii", CleanStringType.PascalCase)] - [TestCase("**TRii", "TRii", CleanStringType.PascalCase)] - [TestCase("trII", "trII", CleanStringType.Unchanged)] - [TestCase("**trII", "trII", CleanStringType.Unchanged)] - [TestCase("trII", "trII", CleanStringType.CamelCase)] - [TestCase("**trII", "trII", CleanStringType.CamelCase)] - [TestCase("trII", "TrII", CleanStringType.PascalCase)] - [TestCase("**trII", "TrII", CleanStringType.PascalCase)] - [TestCase("trIIX", "trIix", CleanStringType.CamelCase)] - [TestCase("**trIIX", "trIix", CleanStringType.CamelCase)] - [TestCase("trIIX", "TrIix", CleanStringType.PascalCase)] - [TestCase("**trIIX", "TrIix", CleanStringType.PascalCase)] - #endregion - public void CleanStringToAsciiWithType(string input, string expected, CleanStringType caseType) - { - var output = _helper.CleanString(input, caseType | CleanStringType.Ascii); - Assert.AreEqual(expected, output); - } - - #region Cases - [TestCase("1235brô dëK tzARlan ban123!pOo", "bro de K tz A Rlan ban123 p Oo", ' ', CleanStringType.Unchanged)] - [TestCase(" 1235brô dëK tzARlan ban123!pOo ", "bro de K tz A Rlan ban123 p Oo", ' ', CleanStringType.Unchanged)] - [TestCase("1235brô dëK tzARlan ban123!pOo", "Bro De K Tz A Rlan Ban123 P Oo", ' ', CleanStringType.PascalCase)] - [TestCase("1235brô dëK tzARlan ban123!pOo", "Bro De K Tz A Rlan Ban123 P Oo", ' ', CleanStringType.PascalCase)] - [TestCase("1235brô dëK tzARlan ban123!pOo", "bro De K Tz A Rlan Ban123 P Oo", ' ', CleanStringType.CamelCase)] - [TestCase("1235brô dëK tzARlan ban123!pOo", "bro-De-K-Tz-A-Rlan-Ban123-P-Oo", '-', CleanStringType.CamelCase)] - [TestCase("1235brô dëK tzARlan ban123!pOo", "BRO-DE-K-TZ-A-RLAN-BAN123-P-OO", '-', CleanStringType.UpperCase)] - [TestCase("1235brô dëK tzARlan ban123!pOo", "bro-de-k-tz-a-rlan-ban123-p-oo", '-', CleanStringType.LowerCase)] - [TestCase("Tab 1", "tab 1", ' ', CleanStringType.CamelCase)] - [TestCase("Home - Page", "home Page", ' ', CleanStringType.CamelCase)] - [TestCase("Shannon's Document Type", "shannon S Document Type", ' ', CleanStringType.CamelCase)] - [TestCase("Shannon's Document Type", "shannons Document Type", ' ', CleanStringType.CamelCase | CleanStringType.Alias)] - [TestCase("!BADDLY nam-ed Document Type", "baddly Nam Ed Document Type", ' ', CleanStringType.CamelCase)] - [TestCase(" !BADDLY nam-ed Document Type", "BADDLY nam ed Document Type", ' ', CleanStringType.Unchanged)] - [TestCase("!BADDLY nam-ed Document Type", "Baddly Nam Ed Document Type", ' ', CleanStringType.PascalCase)] - [TestCase("i %Want!thisTo end up In Proper@case", "I Want This To End Up In Proper Case", ' ', CleanStringType.PascalCase)] - [TestCase("Räksmörgås %%$£¤¤¤§ kéKé", "raksmorgas Ke Ke", ' ', CleanStringType.CamelCase)] - [TestCase("Räksmörgås %%$£¤¤¤§ kéKé", "Raksmorgas Ke Ke", ' ', CleanStringType.PascalCase)] - [TestCase("Räksmörgås %%$£¤¤¤§ kéKé", "Raksmorgas ke Ke", ' ', CleanStringType.Unchanged)] - #endregion - public void CleanStringToAsciiWithTypeAndSeparator(string input, string expected, char separator, CleanStringType caseType) - { - var output = _helper.CleanString(input, caseType | CleanStringType.Ascii, separator); + // picks the proper config per culture + // and overrides some stringType params (ascii...) + var output = _helper.CleanString(input, stringType, cinfo); Assert.AreEqual(expected, output); }