From 326309e61ee022755b428afa0409b8938c9d607b Mon Sep 17 00:00:00 2001 From: Stephan Date: Thu, 16 Jan 2014 17:03:51 +0100 Subject: [PATCH] U4-4056 - better unicode/ascii support for urls --- .../Configuration/UmbracoSettings.cs | 2 + .../Strings/DefaultShortStringHelper.cs | 13 +- .../Strings/Utf8ToAsciiConverter.cs | 237 ++++++++++++++++++ .../DefaultShortStringHelperTests.cs | 20 ++ 4 files changed, 271 insertions(+), 1 deletion(-) diff --git a/src/Umbraco.Core/Configuration/UmbracoSettings.cs b/src/Umbraco.Core/Configuration/UmbracoSettings.cs index a6f81b593c..0bdb0cd5c9 100644 --- a/src/Umbraco.Core/Configuration/UmbracoSettings.cs +++ b/src/Umbraco.Core/Configuration/UmbracoSettings.cs @@ -855,6 +855,8 @@ namespace Umbraco.Core.Configuration /// /// Whether to replace double dashes from url (ie my--story----from--dash.aspx caused by multiple url replacement chars /// + // was used by the legacy short string helper, is not used anymore by the new default short string helper + // should update documentation internal static bool RemoveDoubleDashesFromUrlReplacing { get diff --git a/src/Umbraco.Core/Strings/DefaultShortStringHelper.cs b/src/Umbraco.Core/Strings/DefaultShortStringHelper.cs index 38e9edc4d8..fbbc25ea20 100644 --- a/src/Umbraco.Core/Strings/DefaultShortStringHelper.cs +++ b/src/Umbraco.Core/Strings/DefaultShortStringHelper.cs @@ -71,6 +71,17 @@ namespace Umbraco.Core.Strings } } + private static bool UrlReplacingToAscii + { + get + { + var replaceChars = UmbracoSettings.UrlReplaceCharacters; + if (replaceChars == null || replaceChars.Attributes == null) return false; + var attr = replaceChars.Attributes.GetNamedItem("toAscii"); + return attr != null && attr.Value == "true"; + } + } + /// /// Returns a new string in which characters have been replaced according to the Umbraco settings UrlReplaceCharacters. /// @@ -147,7 +158,7 @@ namespace Umbraco.Core.Strings { PreFilter = ApplyUrlReplaceCharacters, IsTerm = (c, leading) => char.IsLetterOrDigit(c) || c == '_', // letter, digit or underscore - StringType = CleanStringType.Utf8 | CleanStringType.LowerCase, + StringType = (UrlReplacingToAscii ? CleanStringType.Ascii : CleanStringType.Utf8) | CleanStringType.LowerCase, BreakTermsOnUpper = false, Separator = '-' }).WithConfig(CleanStringType.FileName, new Config diff --git a/src/Umbraco.Core/Strings/Utf8ToAsciiConverter.cs b/src/Umbraco.Core/Strings/Utf8ToAsciiConverter.cs index 23ac4e3931..e08defdedd 100644 --- a/src/Umbraco.Core/Strings/Utf8ToAsciiConverter.cs +++ b/src/Umbraco.Core/Strings/Utf8ToAsciiConverter.cs @@ -3316,6 +3316,243 @@ namespace Umbraco.Core.Strings output[opos++] = '~'; break; + // BEGIN CUSTOM TRANSLITERATION OF CYRILIC CHARS + + #region Cyrilic chars + + // russian uppercase "А Б В Г Д Е Ё Ж З И Й К Л М Н О П Р С Т У Ф Х Ц Ч Ш Щ Ъ Ы Ь Э Ю Я" + // russian lowercase "а б в г д е ё ж з и й к л м н о п р с т у ф х ц ч ш щ ъ ы ь э ю я" + + // notes + // read http://www.vesic.org/english/blog/c-sharp/transliteration-easy-way-microsoft-transliteration-utility/ + // should we look into MS Transliteration Utility (http://msdn.microsoft.com/en-US/goglobal/bb688104.aspx) + // also UnicodeSharpFork https://bitbucket.org/DimaStefantsov/unidecodesharpfork + // also Transliterator http://transliterator.codeplex.com/ + // + // in any case it would be good to generate all those "case" statements instead of writing them by hand + // time for a T4 template? + // also we should support extensibility so ppl can register more cases in external code + + // fixme + // transliterates Анастасия as Anastasiya, and not Anastasia + // Ольга --> Ol'ga, Татьяна --> Tat'yana -- that's bad (?) + // Note: should ä (german umlaut) become a or ae ? + + case '\u0410': // А + output[opos++] = 'A'; + break; + case '\u0430': // а + output[opos++] = 'a'; + break; + case '\u0411': // Б + output[opos++] = 'B'; + break; + case '\u0431': // б + output[opos++] = 'b'; + break; + case '\u0412': // В + output[opos++] = 'V'; + break; + case '\u0432': // в + output[opos++] = 'v'; + break; + case '\u0413': // Г + output[opos++] = 'G'; + break; + case '\u0433': // г + output[opos++] = 'g'; + break; + case '\u0414': // Д + output[opos++] = 'D'; + break; + case '\u0434': // д + output[opos++] = 'd'; + break; + case '\u0415': // Е + output[opos++] = 'E'; + break; + case '\u0435': // е + output[opos++] = 'e'; + break; + case '\u0401': // Ё + output[opos++] = 'E'; // alt. Yo + break; + case '\u0451': // ё + output[opos++] = 'e'; // alt. yo + break; + case '\u0416': // Ж + output[opos++] = 'Z'; + output[opos++] = 'h'; + break; + case '\u0436': // ж + output[opos++] = 'z'; + output[opos++] = 'h'; + break; + case '\u0417': // З + output[opos++] = 'Z'; + break; + case '\u0437': // з + output[opos++] = 'z'; + break; + case '\u0418': // И + output[opos++] = 'I'; + break; + case '\u0438': // и + output[opos++] = 'i'; + break; + case '\u0419': // Й + output[opos++] = 'I'; // alt. Y, J + break; + case '\u0439': // й + output[opos++] = 'i'; // alt. y, j + break; + case '\u041A': // К + output[opos++] = 'K'; + break; + case '\u043A': // к + output[opos++] = 'k'; + break; + case '\u041B': // Л + output[opos++] = 'L'; + break; + case '\u043B': // л + output[opos++] = 'l'; + break; + case '\u041C': // М + output[opos++] = 'M'; + break; + case '\u043C': // м + output[opos++] = 'm'; + break; + case '\u041D': // Н + output[opos++] = 'N'; + break; + case '\u043D': // н + output[opos++] = 'n'; + break; + case '\u041E': // О + output[opos++] = 'O'; + break; + case '\u043E': // о + output[opos++] = 'o'; + break; + case '\u041F': // П + output[opos++] = 'P'; + break; + case '\u043F': // п + output[opos++] = 'p'; + break; + case '\u0420': // Р + output[opos++] = 'R'; + break; + case '\u0440': // р + output[opos++] = 'r'; + break; + case '\u0421': // С + output[opos++] = 'S'; + break; + case '\u0441': // с + output[opos++] = 's'; + break; + case '\u0422': // Т + output[opos++] = 'T'; + break; + case '\u0442': // т + output[opos++] = 't'; + break; + case '\u0423': // У + output[opos++] = 'U'; + break; + case '\u0443': // у + output[opos++] = 'u'; + break; + case '\u0424': // Ф + output[opos++] = 'F'; + break; + case '\u0444': // ф + output[opos++] = 'f'; + break; + case '\u0425': // Х + output[opos++] = 'K'; // alt. X + output[opos++] = 'h'; + break; + case '\u0445': // х + output[opos++] = 'k'; // alt. x + output[opos++] = 'h'; + break; + case '\u0426': // Ц + output[opos++] = 'F'; + break; + case '\u0446': // ц + output[opos++] = 'f'; + break; + case '\u0427': // Ч + output[opos++] = 'C'; // alt. Ts, C + output[opos++] = 'h'; + break; + case '\u0447': // ч + output[opos++] = 'c'; // alt. ts, c + output[opos++] = 'h'; + break; + case '\u0428': // Ш + output[opos++] = 'S'; // alt. Ch, S + output[opos++] = 'h'; + break; + case '\u0448': // ш + output[opos++] = 's'; // alt. ch, s + output[opos++] = 'h'; + break; + case '\u0429': // Щ + output[opos++] = 'S'; // alt. Shch, Sc + output[opos++] = 'h'; + break; + case '\u0449': // щ + output[opos++] = 's'; // alt. shch, sc + output[opos++] = 'h'; + break; + case '\u042A': // Ъ + output[opos++] = '"'; // " + break; + case '\u044A': // ъ + output[opos++] = '"'; // " + break; + case '\u042B': // Ы + output[opos++] = 'Y'; + break; + case '\u044B': // ы + output[opos++] = 'y'; + break; + case '\u042C': // Ь + output[opos++] = '\''; // ' + break; + case '\u044C': // ь + output[opos++] = '\''; // ' + break; + case '\u042D': // Э + output[opos++] = 'E'; + break; + case '\u044D': // э + output[opos++] = 'e'; + break; + case '\u042E': // Ю + output[opos++] = 'Y'; // alt. Ju + output[opos++] = 'u'; + break; + case '\u044E': // ю + output[opos++] = 'y'; // alt. ju + output[opos++] = 'u'; + break; + case '\u042F': // Я + output[opos++] = 'Y'; // alt. Ja + output[opos++] = 'a'; + break; + case '\u044F': // я + output[opos++] = 'y'; // alt. ja + output[opos++] = 'a'; + break; + + #endregion + // BEGIN EXTRA /* case '£': diff --git a/src/Umbraco.Tests/CoreStrings/DefaultShortStringHelperTests.cs b/src/Umbraco.Tests/CoreStrings/DefaultShortStringHelperTests.cs index 6828e4ea88..83328c5135 100644 --- a/src/Umbraco.Tests/CoreStrings/DefaultShortStringHelperTests.cs +++ b/src/Umbraco.Tests/CoreStrings/DefaultShortStringHelperTests.cs @@ -94,6 +94,26 @@ namespace Umbraco.Tests.CoreStrings return s; } + [Test] + public void U4_4056() + { + const string input = "ÆØÅ and æøå and 中文测试 and אודות האתר and größer БбДдЖж page"; + + var helper = new DefaultShortStringHelper().WithDefaultConfig(); // unicode + var output = helper.CleanStringForUrlSegment(input); + Assert.AreEqual("æøå-and-æøå-and-中文测试-and-אודות-האתר-and-größer-ббдджж-page", output); + + helper = new DefaultShortStringHelper() + .WithConfig(CleanStringType.UrlSegment, new DefaultShortStringHelper.Config + { + IsTerm = (c, leading) => char.IsLetterOrDigit(c) || c == '_', + StringType = CleanStringType.LowerCase | CleanStringType.Ascii, // ascii + Separator = '-' + }); + output = helper.CleanStringForUrlSegment(input); + Assert.AreEqual("aeoa-and-aeoa-and-and-and-grosser-bbddzhzh-page", output); + } + [Test] public void CleanStringUnderscoreInTerm() {