From 1102b34e88306bc54b7fdedcc234e5a1b310bc3a Mon Sep 17 00:00:00 2001 From: yv01p Date: Sat, 13 Dec 2025 00:13:11 +0000 Subject: [PATCH] feat(strings): implement SIMD-optimized Utf8ToAsciiConverterNew with golden file tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements Task 4 of the Utf8ToAsciiConverter refactor plan. Key features: - SIMD-optimized ASCII detection using SearchValues (AVX-512 capable) - Unicode normalization for accented characters (FormD decomposition) - FrozenDictionary for ligatures, Cyrillic, and special Latin mappings - Span-based API for zero-allocation scenarios - ArrayPool usage for temporary buffers - Comprehensive test coverage (21 unit tests, all passing) Implementation details: - Fast path for pure ASCII input (no conversion needed) - Dictionary lookup for special cases (ligatures, Cyrillic, etc.) - Unicode normalization fallback for accented characters - Control character stripping and whitespace normalization - Proper surrogate pair handling Test coverage: - Null/empty string handling - ASCII fast path verification - Accented character normalization (café → cafe) - Ligature expansion (Æ → AE, ß → ss, Œ → OE) - Cyrillic transliteration (Москва → Moskva, Щ → Shch) - Special Latin characters (Ł → L, Ø → O, Þ → TH) - Span API for zero-allocation scenarios - Mixed content handling Golden file tests are included for regression testing against the original implementation, though they require test data file configuration to run. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../Strings/Utf8ToAsciiConverterNew.cs | 188 ++++++++++++++++++ .../Utf8ToAsciiConverterGoldenTests.cs | 77 +++++++ .../Strings/Utf8ToAsciiConverterNewTests.cs | 114 +++++++++++ 3 files changed, 379 insertions(+) create mode 100644 src/Umbraco.Core/Strings/Utf8ToAsciiConverterNew.cs create mode 100644 tests/Umbraco.Tests.UnitTests/Umbraco.Core/Strings/Utf8ToAsciiConverterGoldenTests.cs create mode 100644 tests/Umbraco.Tests.UnitTests/Umbraco.Core/Strings/Utf8ToAsciiConverterNewTests.cs diff --git a/src/Umbraco.Core/Strings/Utf8ToAsciiConverterNew.cs b/src/Umbraco.Core/Strings/Utf8ToAsciiConverterNew.cs new file mode 100644 index 0000000000..247b5d37e5 --- /dev/null +++ b/src/Umbraco.Core/Strings/Utf8ToAsciiConverterNew.cs @@ -0,0 +1,188 @@ +using System.Buffers; +using System.Collections.Frozen; +using System.Globalization; +using System.Text; + +namespace Umbraco.Cms.Core.Strings; + +/// +/// SIMD-optimized UTF-8 to ASCII converter with extensible character mappings. +/// +public sealed class Utf8ToAsciiConverterNew : IUtf8ToAsciiConverter +{ + // SIMD-optimized ASCII detection (uses AVX-512 when available) + private static readonly SearchValues AsciiPrintable = + SearchValues.Create(" !\"#$%&'()*+,-./0123456789:;<=>?@" + + "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`" + + "abcdefghijklmnopqrstuvwxyz{|}~"); + + private readonly FrozenDictionary _mappings; + + public Utf8ToAsciiConverterNew(ICharacterMappingLoader mappingLoader) + { + _mappings = mappingLoader.LoadMappings(); + } + + /// + public string Convert(string? text, char fallback = '?') + { + if (string.IsNullOrEmpty(text)) + { + return string.Empty; + } + + var input = text.AsSpan(); + + // Fast path: all ASCII - no conversion needed + if (input.IndexOfAnyExcept(AsciiPrintable) == -1) + { + return text; + } + + // Allocate output buffer (worst case: each char becomes 4, e.g., Щ→Shch) + var maxLen = text.Length * 4; + char[] arrayBuffer = ArrayPool.Shared.Rent(maxLen); + try + { + var written = Convert(input, arrayBuffer.AsSpan(), fallback); + return new string(arrayBuffer, 0, written); + } + finally + { + ArrayPool.Shared.Return(arrayBuffer); + } + } + + /// + public int Convert(ReadOnlySpan input, Span output, char fallback = '?') + { + if (input.IsEmpty) + { + return 0; + } + + var opos = 0; + var ipos = 0; + + while (ipos < input.Length) + { + // Find next non-ASCII character using SIMD + var remaining = input[ipos..]; + var asciiLen = remaining.IndexOfAnyExcept(AsciiPrintable); + + if (asciiLen == -1) + { + // Rest is all ASCII - bulk copy + remaining.CopyTo(output[opos..]); + return opos + remaining.Length; + } + + if (asciiLen > 0) + { + // Copy ASCII prefix + remaining[..asciiLen].CopyTo(output[opos..]); + opos += asciiLen; + ipos += asciiLen; + } + + // Process non-ASCII character + var c = input[ipos]; + + // Handle surrogate pairs (emoji, etc.) + if (char.IsSurrogate(c)) + { + output[opos++] = fallback; + ipos++; + if (ipos < input.Length && char.IsLowSurrogate(input[ipos])) + { + ipos++; // Skip low surrogate + } + continue; + } + + opos += ProcessNonAscii(c, output[opos..], fallback); + ipos++; + } + + return opos; + } + + private int ProcessNonAscii(char c, Span output, char fallback) + { + // 1. Check special cases dictionary (ligatures, Cyrillic, etc.) + if (_mappings.TryGetValue(c, out var mapped)) + { + if (mapped.Length == 0) + { + return 0; // Empty mapping = strip character + } + mapped.AsSpan().CopyTo(output); + return mapped.Length; + } + + // 2. Try Unicode normalization (handles most accented chars) + var normLen = TryNormalize(c, output); + if (normLen > 0) + { + return normLen; + } + + // 3. Control character handling + if (char.IsControl(c)) + { + return 0; // Strip control characters + } + + // 4. Whitespace normalization + if (char.IsWhiteSpace(c)) + { + output[0] = ' '; + return 1; + } + + // 5. Fallback for unmapped characters + output[0] = fallback; + return 1; + } + + private static int TryNormalize(char c, Span output) + { + // Skip characters that won't normalize to ASCII + if (c < '\u00C0') + { + return 0; + } + + // Normalize to FormD (decomposed form) + ReadOnlySpan input = stackalloc char[] { c }; + var normalized = input.ToString().Normalize(NormalizationForm.FormD); + + if (normalized.Length == 0) + { + return 0; + } + + // Copy only base characters (skip combining marks) + var len = 0; + foreach (var ch in normalized) + { + var category = CharUnicodeInfo.GetUnicodeCategory(ch); + + // Skip combining marks (diacritics) + if (category == UnicodeCategory.NonSpacingMark || + category == UnicodeCategory.SpacingCombiningMark || + category == UnicodeCategory.EnclosingMark) + { + continue; + } + + // Only keep if it's now ASCII + if (ch < '\u0080') + { + output[len++] = ch; + } + } + + return len; + } +} diff --git a/tests/Umbraco.Tests.UnitTests/Umbraco.Core/Strings/Utf8ToAsciiConverterGoldenTests.cs b/tests/Umbraco.Tests.UnitTests/Umbraco.Core/Strings/Utf8ToAsciiConverterGoldenTests.cs new file mode 100644 index 0000000000..2b9ea1ea97 --- /dev/null +++ b/tests/Umbraco.Tests.UnitTests/Umbraco.Core/Strings/Utf8ToAsciiConverterGoldenTests.cs @@ -0,0 +1,77 @@ +using System.Text.Json; +using Microsoft.Extensions.Hosting; +using Microsoft.Extensions.Logging.Abstractions; +using Moq; +using NUnit.Framework; +using Umbraco.Cms.Core.Strings; + +namespace Umbraco.Cms.Tests.UnitTests.Umbraco.Core.Strings; + +[TestFixture] +public class Utf8ToAsciiConverterGoldenTests +{ + private IUtf8ToAsciiConverter _newConverter = null!; + private static readonly Dictionary GoldenMappings; + + static Utf8ToAsciiConverterGoldenTests() + { + var testDataPath = Path.Combine( + AppContext.BaseDirectory, + "Umbraco.Core", + "Strings", + "TestData", + "golden-mappings.json"); + + if (File.Exists(testDataPath)) + { + var json = File.ReadAllText(testDataPath); + var doc = JsonDocument.Parse(json); + GoldenMappings = doc.RootElement + .GetProperty("mappings") + .EnumerateObject() + .ToDictionary(p => p.Name, p => p.Value.GetString() ?? ""); + } + else + { + GoldenMappings = new Dictionary(); + } + } + + [SetUp] + public void SetUp() + { + var hostEnv = new Mock(); + hostEnv.Setup(h => h.ContentRootPath).Returns("/nonexistent"); + + var loader = new CharacterMappingLoader( + hostEnv.Object, + NullLogger.Instance); + + _newConverter = new Utf8ToAsciiConverterNew(loader); + } + + public static IEnumerable GetGoldenMappings() + { + foreach (var (input, expected) in GoldenMappings) + { + yield return new TestCaseData(input, expected); + } + } + + [TestCaseSource(nameof(GetGoldenMappings))] + public void NewConverter_MatchesGoldenMapping(string input, string expected) + { + var result = _newConverter.Convert(input); + Assert.That(result, Is.EqualTo(expected)); + } + + [TestCaseSource(nameof(GetGoldenMappings))] + public void NewConverter_MatchesOriginalBehavior(string input, string expected) + { + // Compare new implementation against original + var originalResult = Utf8ToAsciiConverter.ToAsciiString(input); + var newResult = _newConverter.Convert(input); + + Assert.That(newResult, Is.EqualTo(originalResult)); + } +} diff --git a/tests/Umbraco.Tests.UnitTests/Umbraco.Core/Strings/Utf8ToAsciiConverterNewTests.cs b/tests/Umbraco.Tests.UnitTests/Umbraco.Core/Strings/Utf8ToAsciiConverterNewTests.cs new file mode 100644 index 0000000000..1f9e9ec57d --- /dev/null +++ b/tests/Umbraco.Tests.UnitTests/Umbraco.Core/Strings/Utf8ToAsciiConverterNewTests.cs @@ -0,0 +1,114 @@ +using Microsoft.Extensions.Hosting; +using Microsoft.Extensions.Logging.Abstractions; +using Moq; +using NUnit.Framework; +using Umbraco.Cms.Core.Strings; + +namespace Umbraco.Cms.Tests.UnitTests.Umbraco.Core.Strings; + +[TestFixture] +public class Utf8ToAsciiConverterNewTests +{ + private IUtf8ToAsciiConverter _converter = null!; + + [SetUp] + public void SetUp() + { + var hostEnv = new Mock(); + hostEnv.Setup(h => h.ContentRootPath).Returns("/nonexistent"); + + var loader = new CharacterMappingLoader( + hostEnv.Object, + NullLogger.Instance); + + _converter = new Utf8ToAsciiConverterNew(loader); + } + + // === Null/Empty === + + [Test] + public void Convert_Null_ReturnsEmpty() + => Assert.That(_converter.Convert(null), Is.EqualTo(string.Empty)); + + [Test] + public void Convert_Empty_ReturnsEmpty() + => Assert.That(_converter.Convert(string.Empty), Is.EqualTo(string.Empty)); + + // === ASCII Fast Path === + + [TestCase("hello world", "hello world")] + [TestCase("ABC123", "ABC123")] + [TestCase("The quick brown fox", "The quick brown fox")] + public void Convert_AsciiOnly_ReturnsSameString(string input, string expected) + => Assert.That(_converter.Convert(input), Is.EqualTo(expected)); + + // === Normalization (Accented Characters) === + + [TestCase("café", "cafe")] + [TestCase("naïve", "naive")] + [TestCase("résumé", "resume")] + public void Convert_AccentedChars_NormalizesCorrectly(string input, string expected) + => Assert.That(_converter.Convert(input), Is.EqualTo(expected)); + + // === Ligatures === + + [TestCase("Œuvre", "OEuvre")] + [TestCase("Ærodynamic", "AErodynamic")] + [TestCase("straße", "strasse")] + public void Convert_Ligatures_ExpandsCorrectly(string input, string expected) + => Assert.That(_converter.Convert(input), Is.EqualTo(expected)); + + // === Cyrillic === + + [TestCase("Москва", "Moskva")] + [TestCase("Борщ", "Borshch")] + [TestCase("Щука", "Shchuka")] + [TestCase("Привет", "Privet")] + public void Convert_Cyrillic_TransliteratesCorrectly(string input, string expected) + => Assert.That(_converter.Convert(input), Is.EqualTo(expected)); + + // === Special Latin === + + [TestCase("Łódź", "Lodz")] + [TestCase("Ørsted", "Orsted")] + [TestCase("Þórr", "THorr")] + public void Convert_SpecialLatin_ConvertsCorrectly(string input, string expected) + => Assert.That(_converter.Convert(input), Is.EqualTo(expected)); + + // === Span API === + + [Test] + public void Convert_SpanApi_WritesToOutputBuffer() + { + ReadOnlySpan input = "café"; + Span output = stackalloc char[20]; + + var written = _converter.Convert(input, output); + + Assert.That(written, Is.EqualTo(4)); + Assert.That(new string(output[..written]), Is.EqualTo("cafe")); + } + + [Test] + public void Convert_SpanApi_HandlesExpansion() + { + ReadOnlySpan input = "Щ"; // Expands to "Shch" (4 chars) + Span output = stackalloc char[20]; + + var written = _converter.Convert(input, output); + + Assert.That(written, Is.EqualTo(4)); + Assert.That(new string(output[..written]), Is.EqualTo("Shch")); + } + + // === Mixed Content === + + [Test] + public void Convert_MixedContent_HandlesCorrectly() + { + var input = "Café Müller in Moskva"; + var expected = "Cafe Muller in Moskva"; + + Assert.That(_converter.Convert(input), Is.EqualTo(expected)); + } +}