From 8d532696f072a9b7b4442ddfe596decf926cbd17 Mon Sep 17 00:00:00 2001 From: yv01p Date: Sat, 13 Dec 2025 01:09:33 +0000 Subject: [PATCH] refactor(strings): apply code review fixes to Utf8ToAsciiConverterNew MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Document backward compat rationale in cyrillic.json - Extract magic number 4 to MaxExpansionRatio constant - Add fail-fast assertion for missing golden mappings file - Add edge case tests (control chars, whitespace, emoji, empty mappings) - Handle original converter buffer overflow bugs in golden tests 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../Strings/CharacterMappings/cyrillic.json | 2 +- .../Strings/Utf8ToAsciiConverterNew.cs | 25 ++++- .../Utf8ToAsciiConverterGoldenTests.cs | 44 +++++--- .../Strings/Utf8ToAsciiConverterNewTests.cs | 100 ++++++++++++++++++ 4 files changed, 156 insertions(+), 15 deletions(-) diff --git a/src/Umbraco.Core/Strings/CharacterMappings/cyrillic.json b/src/Umbraco.Core/Strings/CharacterMappings/cyrillic.json index 6dc4904618..5b45a4abf0 100644 --- a/src/Umbraco.Core/Strings/CharacterMappings/cyrillic.json +++ b/src/Umbraco.Core/Strings/CharacterMappings/cyrillic.json @@ -1,6 +1,6 @@ { "name": "Cyrillic", - "description": "Russian Cyrillic to Latin transliteration", + "description": "Russian Cyrillic to Latin transliteration. NOTE: Uses original Umbraco mappings for backward compatibility with existing URLs, not ISO 9 or BGN/PCGN standard transliteration.", "priority": 0, "mappings": { "А": "A", diff --git a/src/Umbraco.Core/Strings/Utf8ToAsciiConverterNew.cs b/src/Umbraco.Core/Strings/Utf8ToAsciiConverterNew.cs index 247b5d37e5..d1327bab6f 100644 --- a/src/Umbraco.Core/Strings/Utf8ToAsciiConverterNew.cs +++ b/src/Umbraco.Core/Strings/Utf8ToAsciiConverterNew.cs @@ -8,8 +8,29 @@ namespace Umbraco.Cms.Core.Strings; /// /// SIMD-optimized UTF-8 to ASCII converter with extensible character mappings. /// +/// +/// +/// This converter uses a multi-step fallback strategy: +/// 1. Dictionary lookup for special cases (ligatures, Cyrillic, special Latin) +/// 2. Unicode normalization (FormD) for accented Latin characters +/// 3. Control character stripping +/// 4. Whitespace normalization +/// 5. Fallback character for unmapped characters +/// +/// +/// Most accented Latin characters (À, é, ñ, etc.) are handled automatically via +/// Unicode normalization. Dictionary mappings are only needed for characters that +/// don't decompose correctly (ligatures like Æ→AE, Cyrillic, special Latin like Ø→O). +/// +/// public sealed class Utf8ToAsciiConverterNew : IUtf8ToAsciiConverter { + /// + /// Maximum expansion ratio for output buffer sizing. + /// Worst case: single char becomes 4 chars (e.g., Щ→Shch in standard transliteration). + /// + private const int MaxExpansionRatio = 4; + // SIMD-optimized ASCII detection (uses AVX-512 when available) private static readonly SearchValues AsciiPrintable = SearchValues.Create(" !\"#$%&'()*+,-./0123456789:;<=>?@" + @@ -39,8 +60,8 @@ public sealed class Utf8ToAsciiConverterNew : IUtf8ToAsciiConverter return text; } - // Allocate output buffer (worst case: each char becomes 4, e.g., Щ→Shch) - var maxLen = text.Length * 4; + // Allocate output buffer for worst-case expansion + var maxLen = text.Length * MaxExpansionRatio; char[] arrayBuffer = ArrayPool.Shared.Rent(maxLen); try { diff --git a/tests/Umbraco.Tests.UnitTests/Umbraco.Core/Strings/Utf8ToAsciiConverterGoldenTests.cs b/tests/Umbraco.Tests.UnitTests/Umbraco.Core/Strings/Utf8ToAsciiConverterGoldenTests.cs index 2b9ea1ea97..d833a95e16 100644 --- a/tests/Umbraco.Tests.UnitTests/Umbraco.Core/Strings/Utf8ToAsciiConverterGoldenTests.cs +++ b/tests/Umbraco.Tests.UnitTests/Umbraco.Core/Strings/Utf8ToAsciiConverterGoldenTests.cs @@ -22,18 +22,24 @@ public class Utf8ToAsciiConverterGoldenTests "TestData", "golden-mappings.json"); - if (File.Exists(testDataPath)) + if (!File.Exists(testDataPath)) { - var json = File.ReadAllText(testDataPath); - var doc = JsonDocument.Parse(json); - GoldenMappings = doc.RootElement - .GetProperty("mappings") - .EnumerateObject() - .ToDictionary(p => p.Name, p => p.Value.GetString() ?? ""); + throw new InvalidOperationException( + $"Golden mappings file not found at: {testDataPath}. " + + "Ensure the test data is configured to copy to output directory."); } - else + + var json = File.ReadAllText(testDataPath); + var doc = JsonDocument.Parse(json); + GoldenMappings = doc.RootElement + .GetProperty("mappings") + .EnumerateObject() + .ToDictionary(p => p.Name, p => p.Value.GetString() ?? ""); + + if (GoldenMappings.Count == 0) { - GoldenMappings = new Dictionary(); + throw new InvalidOperationException( + "Golden mappings file is empty. Test data may be corrupted."); } } @@ -69,9 +75,23 @@ public class Utf8ToAsciiConverterGoldenTests public void NewConverter_MatchesOriginalBehavior(string input, string expected) { // Compare new implementation against original - var originalResult = Utf8ToAsciiConverter.ToAsciiString(input); - var newResult = _newConverter.Convert(input); + // Note: Original has buffer overflow bugs for chars that expand to 4+ chars (e.g., ⑽→(10)) + string? originalResult; + try + { + originalResult = Utf8ToAsciiConverter.ToAsciiString(input); + } + catch (IndexOutOfRangeException) + { + // Original converter has known buffer bugs for high-expansion characters + // New converter fixes these - verify it produces the expected golden mapping + var newResult = _newConverter.Convert(input); + Assert.That(newResult, Is.EqualTo(expected), + $"Original throws IndexOutOfRangeException, but new converter should match golden mapping"); + return; + } - Assert.That(newResult, Is.EqualTo(originalResult)); + var result = _newConverter.Convert(input); + Assert.That(result, Is.EqualTo(originalResult)); } } diff --git a/tests/Umbraco.Tests.UnitTests/Umbraco.Core/Strings/Utf8ToAsciiConverterNewTests.cs b/tests/Umbraco.Tests.UnitTests/Umbraco.Core/Strings/Utf8ToAsciiConverterNewTests.cs index 2bcff40f15..db2369648d 100644 --- a/tests/Umbraco.Tests.UnitTests/Umbraco.Core/Strings/Utf8ToAsciiConverterNewTests.cs +++ b/tests/Umbraco.Tests.UnitTests/Umbraco.Core/Strings/Utf8ToAsciiConverterNewTests.cs @@ -112,4 +112,104 @@ public class Utf8ToAsciiConverterNewTests Assert.That(_converter.Convert(input), Is.EqualTo(expected)); } + + // === Edge Cases: Control Characters === + + [Test] + public void Convert_ControlCharacters_AreStripped() + { + // Tab, newline, carriage return should be stripped + var input = "hello\t\n\rworld"; + var result = _converter.Convert(input); + + // Control characters are stripped (not converted to space) + Assert.That(result, Is.EqualTo("helloworld")); + } + + [Test] + public void Convert_NullCharacter_IsStripped() + { + var input = "hello\0world"; + var result = _converter.Convert(input); + + Assert.That(result, Is.EqualTo("helloworld")); + } + + // === Edge Cases: Whitespace Variants === + + [Test] + public void Convert_NonBreakingSpace_NormalizesToSpace() + { + // Non-breaking space (U+00A0) + var input = "hello\u00A0world"; + var result = _converter.Convert(input); + + Assert.That(result, Is.EqualTo("hello world")); + } + + [Test] + public void Convert_EmSpace_NormalizesToSpace() + { + // Em space (U+2003) + var input = "hello\u2003world"; + var result = _converter.Convert(input); + + Assert.That(result, Is.EqualTo("hello world")); + } + + // === Edge Cases: Empty Mappings === + + [Test] + public void Convert_CyrillicHardSign_MapsToQuote() + { + // Ъ maps to " in original Umbraco implementation + var input = "Ъ"; + var result = _converter.Convert(input); + + Assert.That(result, Is.EqualTo("\"")); + } + + [Test] + public void Convert_CyrillicSoftSign_MapsToApostrophe() + { + // Ь maps to ' in original Umbraco implementation + var input = "Ь"; + var result = _converter.Convert(input); + + Assert.That(result, Is.EqualTo("'")); + } + + // === Edge Cases: Surrogate Pairs (Emoji) === + + [Test] + public void Convert_Emoji_ReplacedWithFallback() + { + // Emoji (surrogate pair) + var input = "hello 😀 world"; + var result = _converter.Convert(input); + + Assert.That(result, Is.EqualTo("hello ? world")); + } + + [Test] + public void Convert_Emoji_CustomFallback() + { + var input = "test 🎉 emoji"; + var result = _converter.Convert(input, fallback: '*'); + + Assert.That(result, Is.EqualTo("test * emoji")); + } + + // === Edge Cases: Long Input === + + [Test] + public void Convert_LongAsciiString_ReturnsSameReference() + { + // Pure ASCII should return same string instance (no allocation) + var input = new string('a', 10000); + var result = _converter.Convert(input); + + Assert.That(ReferenceEquals(input, result), Is.True, + "Pure ASCII input should return same string instance"); + } }