diff --git a/src/Umbraco.Core/Strings/CharacterMappings/cyrillic.json b/src/Umbraco.Core/Strings/CharacterMappings/cyrillic.json
index 6dc4904618..5b45a4abf0 100644
--- a/src/Umbraco.Core/Strings/CharacterMappings/cyrillic.json
+++ b/src/Umbraco.Core/Strings/CharacterMappings/cyrillic.json
@@ -1,6 +1,6 @@
{
"name": "Cyrillic",
- "description": "Russian Cyrillic to Latin transliteration",
+ "description": "Russian Cyrillic to Latin transliteration. NOTE: Uses original Umbraco mappings for backward compatibility with existing URLs, not ISO 9 or BGN/PCGN standard transliteration.",
"priority": 0,
"mappings": {
"А": "A",
diff --git a/src/Umbraco.Core/Strings/Utf8ToAsciiConverterNew.cs b/src/Umbraco.Core/Strings/Utf8ToAsciiConverterNew.cs
index 247b5d37e5..d1327bab6f 100644
--- a/src/Umbraco.Core/Strings/Utf8ToAsciiConverterNew.cs
+++ b/src/Umbraco.Core/Strings/Utf8ToAsciiConverterNew.cs
@@ -8,8 +8,29 @@ namespace Umbraco.Cms.Core.Strings;
///
/// SIMD-optimized UTF-8 to ASCII converter with extensible character mappings.
///
+///
+///
+/// This converter uses a multi-step fallback strategy:
+/// 1. Dictionary lookup for special cases (ligatures, Cyrillic, special Latin)
+/// 2. Unicode normalization (FormD) for accented Latin characters
+/// 3. Control character stripping
+/// 4. Whitespace normalization
+/// 5. Fallback character for unmapped characters
+///
+///
+/// Most accented Latin characters (À, é, ñ, etc.) are handled automatically via
+/// Unicode normalization. Dictionary mappings are only needed for characters that
+/// don't decompose correctly (ligatures like Æ→AE, Cyrillic, special Latin like Ø→O).
+///
+///
public sealed class Utf8ToAsciiConverterNew : IUtf8ToAsciiConverter
{
+ ///
+ /// Maximum expansion ratio for output buffer sizing.
+ /// Worst case: single char becomes 4 chars (e.g., Щ→Shch in standard transliteration).
+ ///
+ private const int MaxExpansionRatio = 4;
+
// SIMD-optimized ASCII detection (uses AVX-512 when available)
private static readonly SearchValues AsciiPrintable =
SearchValues.Create(" !\"#$%&'()*+,-./0123456789:;<=>?@" +
@@ -39,8 +60,8 @@ public sealed class Utf8ToAsciiConverterNew : IUtf8ToAsciiConverter
return text;
}
- // Allocate output buffer (worst case: each char becomes 4, e.g., Щ→Shch)
- var maxLen = text.Length * 4;
+ // Allocate output buffer for worst-case expansion
+ var maxLen = text.Length * MaxExpansionRatio;
char[] arrayBuffer = ArrayPool.Shared.Rent(maxLen);
try
{
diff --git a/tests/Umbraco.Tests.UnitTests/Umbraco.Core/Strings/Utf8ToAsciiConverterGoldenTests.cs b/tests/Umbraco.Tests.UnitTests/Umbraco.Core/Strings/Utf8ToAsciiConverterGoldenTests.cs
index 2b9ea1ea97..d833a95e16 100644
--- a/tests/Umbraco.Tests.UnitTests/Umbraco.Core/Strings/Utf8ToAsciiConverterGoldenTests.cs
+++ b/tests/Umbraco.Tests.UnitTests/Umbraco.Core/Strings/Utf8ToAsciiConverterGoldenTests.cs
@@ -22,18 +22,24 @@ public class Utf8ToAsciiConverterGoldenTests
"TestData",
"golden-mappings.json");
- if (File.Exists(testDataPath))
+ if (!File.Exists(testDataPath))
{
- var json = File.ReadAllText(testDataPath);
- var doc = JsonDocument.Parse(json);
- GoldenMappings = doc.RootElement
- .GetProperty("mappings")
- .EnumerateObject()
- .ToDictionary(p => p.Name, p => p.Value.GetString() ?? "");
+ throw new InvalidOperationException(
+ $"Golden mappings file not found at: {testDataPath}. " +
+ "Ensure the test data is configured to copy to output directory.");
}
- else
+
+ var json = File.ReadAllText(testDataPath);
+ var doc = JsonDocument.Parse(json);
+ GoldenMappings = doc.RootElement
+ .GetProperty("mappings")
+ .EnumerateObject()
+ .ToDictionary(p => p.Name, p => p.Value.GetString() ?? "");
+
+ if (GoldenMappings.Count == 0)
{
- GoldenMappings = new Dictionary();
+ throw new InvalidOperationException(
+ "Golden mappings file is empty. Test data may be corrupted.");
}
}
@@ -69,9 +75,23 @@ public class Utf8ToAsciiConverterGoldenTests
public void NewConverter_MatchesOriginalBehavior(string input, string expected)
{
// Compare new implementation against original
- var originalResult = Utf8ToAsciiConverter.ToAsciiString(input);
- var newResult = _newConverter.Convert(input);
+ // Note: Original has buffer overflow bugs for chars that expand to 4+ chars (e.g., ⑽→(10))
+ string? originalResult;
+ try
+ {
+ originalResult = Utf8ToAsciiConverter.ToAsciiString(input);
+ }
+ catch (IndexOutOfRangeException)
+ {
+ // Original converter has known buffer bugs for high-expansion characters
+ // New converter fixes these - verify it produces the expected golden mapping
+ var newResult = _newConverter.Convert(input);
+ Assert.That(newResult, Is.EqualTo(expected),
+ $"Original throws IndexOutOfRangeException, but new converter should match golden mapping");
+ return;
+ }
- Assert.That(newResult, Is.EqualTo(originalResult));
+ var result = _newConverter.Convert(input);
+ Assert.That(result, Is.EqualTo(originalResult));
}
}
diff --git a/tests/Umbraco.Tests.UnitTests/Umbraco.Core/Strings/Utf8ToAsciiConverterNewTests.cs b/tests/Umbraco.Tests.UnitTests/Umbraco.Core/Strings/Utf8ToAsciiConverterNewTests.cs
index 2bcff40f15..db2369648d 100644
--- a/tests/Umbraco.Tests.UnitTests/Umbraco.Core/Strings/Utf8ToAsciiConverterNewTests.cs
+++ b/tests/Umbraco.Tests.UnitTests/Umbraco.Core/Strings/Utf8ToAsciiConverterNewTests.cs
@@ -112,4 +112,104 @@ public class Utf8ToAsciiConverterNewTests
Assert.That(_converter.Convert(input), Is.EqualTo(expected));
}
+
+ // === Edge Cases: Control Characters ===
+
+ [Test]
+ public void Convert_ControlCharacters_AreStripped()
+ {
+ // Tab, newline, carriage return should be stripped
+ var input = "hello\t\n\rworld";
+ var result = _converter.Convert(input);
+
+ // Control characters are stripped (not converted to space)
+ Assert.That(result, Is.EqualTo("helloworld"));
+ }
+
+ [Test]
+ public void Convert_NullCharacter_IsStripped()
+ {
+ var input = "hello\0world";
+ var result = _converter.Convert(input);
+
+ Assert.That(result, Is.EqualTo("helloworld"));
+ }
+
+ // === Edge Cases: Whitespace Variants ===
+
+ [Test]
+ public void Convert_NonBreakingSpace_NormalizesToSpace()
+ {
+ // Non-breaking space (U+00A0)
+ var input = "hello\u00A0world";
+ var result = _converter.Convert(input);
+
+ Assert.That(result, Is.EqualTo("hello world"));
+ }
+
+ [Test]
+ public void Convert_EmSpace_NormalizesToSpace()
+ {
+ // Em space (U+2003)
+ var input = "hello\u2003world";
+ var result = _converter.Convert(input);
+
+ Assert.That(result, Is.EqualTo("hello world"));
+ }
+
+ // === Edge Cases: Empty Mappings ===
+
+ [Test]
+ public void Convert_CyrillicHardSign_MapsToQuote()
+ {
+ // Ъ maps to " in original Umbraco implementation
+ var input = "Ъ";
+ var result = _converter.Convert(input);
+
+ Assert.That(result, Is.EqualTo("\""));
+ }
+
+ [Test]
+ public void Convert_CyrillicSoftSign_MapsToApostrophe()
+ {
+ // Ь maps to ' in original Umbraco implementation
+ var input = "Ь";
+ var result = _converter.Convert(input);
+
+ Assert.That(result, Is.EqualTo("'"));
+ }
+
+ // === Edge Cases: Surrogate Pairs (Emoji) ===
+
+ [Test]
+ public void Convert_Emoji_ReplacedWithFallback()
+ {
+ // Emoji (surrogate pair)
+ var input = "hello 😀 world";
+ var result = _converter.Convert(input);
+
+ Assert.That(result, Is.EqualTo("hello ? world"));
+ }
+
+ [Test]
+ public void Convert_Emoji_CustomFallback()
+ {
+ var input = "test 🎉 emoji";
+ var result = _converter.Convert(input, fallback: '*');
+
+ Assert.That(result, Is.EqualTo("test * emoji"));
+ }
+
+ // === Edge Cases: Long Input ===
+
+ [Test]
+ public void Convert_LongAsciiString_ReturnsSameReference()
+ {
+ // Pure ASCII should return same string instance (no allocation)
+ var input = new string('a', 10000);
+ var result = _converter.Convert(input);
+
+ Assert.That(ReferenceEquals(input, result), Is.True,
+ "Pure ASCII input should return same string instance");
+ }
}