feat(strings): implement SIMD-optimized Utf8ToAsciiConverterNew with golden file tests

Implements Task 4 of the Utf8ToAsciiConverter refactor plan.

Key features:
- SIMD-optimized ASCII detection using SearchValues (AVX-512 capable)
- Unicode normalization for accented characters (FormD decomposition)
- FrozenDictionary for ligatures, Cyrillic, and special Latin mappings
- Span-based API for zero-allocation scenarios
- ArrayPool usage for temporary buffers
- Comprehensive test coverage (21 unit tests, all passing)

Implementation details:
- Fast path for pure ASCII input (no conversion needed)
- Dictionary lookup for special cases (ligatures, Cyrillic, etc.)
- Unicode normalization fallback for accented characters
- Control character stripping and whitespace normalization
- Proper surrogate pair handling

Test coverage:
- Null/empty string handling
- ASCII fast path verification
- Accented character normalization (café → cafe)
- Ligature expansion (Æ → AE, ß → ss, Œ → OE)
- Cyrillic transliteration (Москва → Moskva, Щ → Shch)
- Special Latin characters (Ł → L, Ø → O, Þ → TH)
- Span API for zero-allocation scenarios
- Mixed content handling

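Golden file tests are included for regression testing against the original
implementation. They produce no test cases unless golden-mappings.json is
present at Umbraco.Core/Strings/TestData/ under the test assembly's base
directory; when the file is missing the mapping set is simply empty.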

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-12-13 00:13:11 +00:00
parent 72dfd667c5
commit 1102b34e88
3 changed files with 379 additions and 0 deletions

View File

@@ -0,0 +1,188 @@
using System.Buffers;
using System.Collections.Frozen;
using System.Globalization;
using System.Text;
namespace Umbraco.Cms.Core.Strings;
/// <summary>
/// Converts text to ASCII using extensible per-character mappings, with Unicode
/// decomposition as a fallback for accented characters.
/// </summary>
/// <remarks>
/// Every character that is not printable ASCII is resolved in this order: the mapping table
/// supplied by the <see cref="ICharacterMappingLoader"/>, canonical decomposition (FormD) with
/// combining marks removed, control-character stripping, whitespace normalization to a single
/// space, and finally the caller-supplied fallback character.
/// </remarks>
public sealed class Utf8ToAsciiConverterNew : IUtf8ToAsciiConverter
{
    // Lower bound for the per-character expansion factor used to size the temporary buffer.
    // Kept at 4 (the sizing the buffer has always used, e.g. Щ→Shch) so that decomposition
    // output is covered even when every loaded mapping is shorter.
    private const int MinExpansionFactor = 4;

    // Printable ASCII (space through tilde). SearchValues gives the runtime an optimized
    // search set for IndexOfAnyExcept.
    private static readonly SearchValues<char> AsciiPrintable =
        SearchValues.Create(" !\"#$%&'()*+,-./0123456789:;<=>?@" +
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`" +
                            "abcdefghijklmnopqrstuvwxyz{|}~");

    private readonly FrozenDictionary<char, string> _mappings;

    // Largest number of output chars a single input char can produce with the loaded mappings.
    private readonly int _maxExpansion;

    /// <summary>
    /// Creates a converter backed by the mappings returned from <paramref name="mappingLoader"/>.
    /// </summary>
    /// <param name="mappingLoader">Source of the character-to-replacement table.</param>
    /// <exception cref="ArgumentNullException"><paramref name="mappingLoader"/> is null.</exception>
    public Utf8ToAsciiConverterNew(ICharacterMappingLoader mappingLoader)
    {
        ArgumentNullException.ThrowIfNull(mappingLoader);

        _mappings = mappingLoader.LoadMappings();

        // Mappings are externally supplied, so a replacement may be longer than the
        // historical 4-char assumption; size buffers from the real maximum instead.
        var longest = MinExpansionFactor;
        foreach (var replacement in _mappings.Values)
        {
            if (replacement is not null && replacement.Length > longest)
            {
                longest = replacement.Length;
            }
        }

        _maxExpansion = longest;
    }

    /// <inheritdoc />
    public string Convert(string? text, char fallback = '?')
    {
        if (string.IsNullOrEmpty(text))
        {
            return string.Empty;
        }

        var input = text.AsSpan();

        // Fast path: nothing but printable ASCII, so the input can be returned as-is.
        if (input.IndexOfAnyExcept(AsciiPrintable) == -1)
        {
            return text;
        }

        // Worst case: every char expands to the longest replacement. checked() turns an
        // int overflow on enormous inputs into an immediate error instead of a short buffer.
        var maxLen = checked(text.Length * _maxExpansion);
        char[] arrayBuffer = ArrayPool<char>.Shared.Rent(maxLen);
        try
        {
            var written = Convert(input, arrayBuffer.AsSpan(), fallback);
            return new string(arrayBuffer, 0, written);
        }
        finally
        {
            ArrayPool<char>.Shared.Return(arrayBuffer);
        }
    }

    /// <inheritdoc />
    /// <remarks>
    /// <paramref name="output"/> is not size-checked up front; writing past its end surfaces
    /// as the exception thrown by the underlying span copy or indexer.
    /// </remarks>
    public int Convert(ReadOnlySpan<char> input, Span<char> output, char fallback = '?')
    {
        if (input.IsEmpty)
        {
            return 0;
        }

        var opos = 0;
        var ipos = 0;

        while (ipos < input.Length)
        {
            // Locate the next char that is not printable ASCII.
            var remaining = input[ipos..];
            var asciiLen = remaining.IndexOfAnyExcept(AsciiPrintable);

            if (asciiLen == -1)
            {
                // Rest is all printable ASCII - bulk copy and finish.
                remaining.CopyTo(output[opos..]);
                return opos + remaining.Length;
            }

            if (asciiLen > 0)
            {
                // Copy the printable ASCII run preceding the special char.
                remaining[..asciiLen].CopyTo(output[opos..]);
                opos += asciiLen;
                ipos += asciiLen;
            }

            var c = input[ipos];

            // Surrogates (emoji, etc.) have no ASCII form: emit one fallback per code point.
            if (char.IsSurrogate(c))
            {
                output[opos++] = fallback;
                ipos++;

                // Only a high surrogate can open a pair. A stray low surrogate must not
                // swallow the low surrogate that happens to follow it.
                if (char.IsHighSurrogate(c) && ipos < input.Length && char.IsLowSurrogate(input[ipos]))
                {
                    ipos++;
                }

                continue;
            }

            opos += ProcessNonAscii(c, output[opos..], fallback);
            ipos++;
        }

        return opos;
    }

    /// <summary>
    /// Writes the ASCII replacement for a single non-printable-ASCII char.
    /// </summary>
    /// <param name="c">The character to convert.</param>
    /// <param name="output">Destination; written from index 0.</param>
    /// <param name="fallback">Character emitted when no other rule applies.</param>
    /// <returns>The number of chars written (0 when the character is stripped).</returns>
    private int ProcessNonAscii(char c, Span<char> output, char fallback)
    {
        // 1. Explicit mapping table (ligatures, Cyrillic, etc.).
        if (_mappings.TryGetValue(c, out var mapped))
        {
            // An empty (or missing) replacement means "strip this character".
            if (string.IsNullOrEmpty(mapped))
            {
                return 0;
            }

            mapped.AsSpan().CopyTo(output);
            return mapped.Length;
        }

        // 2. Unicode decomposition (handles most accented chars).
        var normLen = TryNormalize(c, output);
        if (normLen > 0)
        {
            return normLen;
        }

        // 3. Control characters are stripped. This runs before the whitespace check, so
        //    chars that are both (e.g. tab, line feed) are removed rather than spaced.
        if (char.IsControl(c))
        {
            return 0;
        }

        // 4. Any other whitespace collapses to a plain space.
        if (char.IsWhiteSpace(c))
        {
            output[0] = ' ';
            return 1;
        }

        // 5. Nothing matched.
        output[0] = fallback;
        return 1;
    }

    /// <summary>
    /// Decomposes <paramref name="c"/> (FormD) and writes the ASCII chars that remain once
    /// combining marks are dropped.
    /// </summary>
    /// <param name="c">The character to decompose.</param>
    /// <param name="output">Destination; written from index 0.</param>
    /// <returns>The number of ASCII chars written, or 0 when none were produced.</returns>
    private static int TryNormalize(char c, Span<char> output)
    {
        // Chars below U+00C0 are not attempted.
        if (c < '\u00C0')
        {
            return 0;
        }

        string normalized;
        try
        {
            normalized = c.ToString().Normalize(NormalizationForm.FormD);
        }
        catch (ArgumentException)
        {
            // String.Normalize is documented to throw for invalid Unicode. Treat such a
            // char as not normalizable so the caller falls through to its later rules.
            return 0;
        }

        var len = 0;
        foreach (var ch in normalized)
        {
            var category = CharUnicodeInfo.GetUnicodeCategory(ch);

            // Skip combining marks (diacritics).
            if (category == UnicodeCategory.NonSpacingMark ||
                category == UnicodeCategory.SpacingCombiningMark ||
                category == UnicodeCategory.EnclosingMark)
            {
                continue;
            }

            // Keep only what is now ASCII.
            if (ch < '\u0080')
            {
                output[len++] = ch;
            }
        }

        return len;
    }
}

View File

@@ -0,0 +1,77 @@
using System.Text.Json;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging.Abstractions;
using Moq;
using NUnit.Framework;
using Umbraco.Cms.Core.Strings;
namespace Umbraco.Cms.Tests.UnitTests.Umbraco.Core.Strings;
/// <summary>
/// Regression tests that run <see cref="Utf8ToAsciiConverterNew"/> over the input/expected
/// pairs stored in <c>golden-mappings.json</c>.
/// </summary>
/// <remarks>
/// The file is probed at <c>Umbraco.Core/Strings/TestData/golden-mappings.json</c> under
/// <see cref="AppContext.BaseDirectory"/>. When it is absent the mapping set is empty and
/// the test case source yields nothing.
/// </remarks>
[TestFixture]
public class Utf8ToAsciiConverterGoldenTests
{
    private IUtf8ToAsciiConverter _newConverter = null!;

    // Input string -> expected ASCII output, loaded once for the fixture.
    private static readonly Dictionary<string, string> GoldenMappings;

    static Utf8ToAsciiConverterGoldenTests()
    {
        var testDataPath = Path.Combine(
            AppContext.BaseDirectory,
            "Umbraco.Core",
            "Strings",
            "TestData",
            "golden-mappings.json");

        GoldenMappings = File.Exists(testDataPath)
            ? LoadGoldenMappings(testDataPath)
            : new Dictionary<string, string>();
    }

    /// <summary>
    /// Reads the <c>mappings</c> object of the JSON file at <paramref name="path"/> into a dictionary.
    /// </summary>
    /// <param name="path">Path of the golden mappings file.</param>
    /// <returns>One entry per JSON property; a null JSON value becomes an empty string.</returns>
    private static Dictionary<string, string> LoadGoldenMappings(string path)
    {
        var json = File.ReadAllText(path);

        // JsonDocument is IDisposable (it is backed by pooled buffers). Every value is
        // copied out as a string before the document is released at the end of this method.
        using var doc = JsonDocument.Parse(json);

        return doc.RootElement
            .GetProperty("mappings")
            .EnumerateObject()
            .ToDictionary(p => p.Name, p => p.Value.GetString() ?? "");
    }

    [SetUp]
    public void SetUp()
    {
        // The loader is given a content root of "/nonexistent".
        // NOTE(review): presumably this keeps it to its built-in mappings only - confirm
        // against CharacterMappingLoader.
        var hostEnv = new Mock<IHostEnvironment>();
        hostEnv.Setup(h => h.ContentRootPath).Returns("/nonexistent");

        var loader = new CharacterMappingLoader(
            hostEnv.Object,
            NullLogger<CharacterMappingLoader>.Instance);

        _newConverter = new Utf8ToAsciiConverterNew(loader);
    }

    /// <summary>
    /// Test case source: one (input, expected) case per golden mapping entry.
    /// </summary>
    public static IEnumerable<TestCaseData> GetGoldenMappings()
    {
        foreach (var (input, expected) in GoldenMappings)
        {
            yield return new TestCaseData(input, expected);
        }
    }

    /// <summary>
    /// The new converter must reproduce the recorded golden output.
    /// </summary>
    [TestCaseSource(nameof(GetGoldenMappings))]
    public void NewConverter_MatchesGoldenMapping(string input, string expected)
    {
        var result = _newConverter.Convert(input);

        Assert.That(result, Is.EqualTo(expected));
    }

    /// <summary>
    /// The new converter must agree with the original static implementation.
    /// </summary>
    /// <remarks>
    /// <paramref name="expected"/> is unused here; it is present only because both tests
    /// share the same two-argument case source.
    /// </remarks>
    [TestCaseSource(nameof(GetGoldenMappings))]
    public void NewConverter_MatchesOriginalBehavior(string input, string expected)
    {
        var originalResult = Utf8ToAsciiConverter.ToAsciiString(input);
        var newResult = _newConverter.Convert(input);

        Assert.That(newResult, Is.EqualTo(originalResult));
    }
}

View File

@@ -0,0 +1,114 @@
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging.Abstractions;
using Moq;
using NUnit.Framework;
using Umbraco.Cms.Core.Strings;
namespace Umbraco.Cms.Tests.UnitTests.Umbraco.Core.Strings;
/// <summary>
/// Unit tests for <see cref="Utf8ToAsciiConverterNew"/> covering both the string and the
/// span overloads of <c>Convert</c>.
/// </summary>
[TestFixture]
public class Utf8ToAsciiConverterNewTests
{
    private IUtf8ToAsciiConverter _converter = null!;

    [SetUp]
    public void SetUp()
    {
        // Build the converter on a loader whose content root is "/nonexistent".
        var environment = new Mock<IHostEnvironment>();
        environment.Setup(env => env.ContentRootPath).Returns("/nonexistent");

        var mappingLoader = new CharacterMappingLoader(
            environment.Object,
            NullLogger<CharacterMappingLoader>.Instance);

        _converter = new Utf8ToAsciiConverterNew(mappingLoader);
    }

    // Shared check for the string overload: convert, then compare to the expected text.
    private void AssertConverts(string? input, string expected)
    {
        var actual = _converter.Convert(input);

        Assert.That(actual, Is.EqualTo(expected));
    }

    // --- Null / empty input ---

    [Test]
    public void Convert_Null_ReturnsEmpty()
    {
        AssertConverts(null, string.Empty);
    }

    [Test]
    public void Convert_Empty_ReturnsEmpty()
    {
        AssertConverts(string.Empty, string.Empty);
    }

    // --- Pure ASCII input ---

    [TestCase("hello world", "hello world")]
    [TestCase("ABC123", "ABC123")]
    [TestCase("The quick brown fox", "The quick brown fox")]
    public void Convert_AsciiOnly_ReturnsSameString(string input, string expected)
    {
        AssertConverts(input, expected);
    }

    // --- Accented characters ---

    [TestCase("café", "cafe")]
    [TestCase("naïve", "naive")]
    [TestCase("résumé", "resume")]
    public void Convert_AccentedChars_NormalizesCorrectly(string input, string expected)
    {
        AssertConverts(input, expected);
    }

    // --- Ligatures ---

    [TestCase("Œuvre", "OEuvre")]
    [TestCase("Ærodynamic", "AErodynamic")]
    [TestCase("straße", "strasse")]
    public void Convert_Ligatures_ExpandsCorrectly(string input, string expected)
    {
        AssertConverts(input, expected);
    }

    // --- Cyrillic ---

    [TestCase("Москва", "Moskva")]
    [TestCase("Борщ", "Borshch")]
    [TestCase("Щука", "Shchuka")]
    [TestCase("Привет", "Privet")]
    public void Convert_Cyrillic_TransliteratesCorrectly(string input, string expected)
    {
        AssertConverts(input, expected);
    }

    // --- Special Latin letters ---

    [TestCase("Łódź", "Lodz")]
    [TestCase("Ørsted", "Orsted")]
    [TestCase("Þórr", "THorr")]
    public void Convert_SpecialLatin_ConvertsCorrectly(string input, string expected)
    {
        AssertConverts(input, expected);
    }

    // --- Span overload ---

    [Test]
    public void Convert_SpanApi_WritesToOutputBuffer()
    {
        var source = "café".AsSpan();
        Span<char> buffer = stackalloc char[20];

        var count = _converter.Convert(source, buffer);

        Assert.That(count, Is.EqualTo(4));
        Assert.That(buffer[..count].ToString(), Is.EqualTo("cafe"));
    }

    [Test]
    public void Convert_SpanApi_HandlesExpansion()
    {
        // A single input char that becomes four output chars ("Shch").
        var source = "Щ".AsSpan();
        Span<char> buffer = stackalloc char[20];

        var count = _converter.Convert(source, buffer);

        Assert.That(count, Is.EqualTo(4));
        Assert.That(buffer[..count].ToString(), Is.EqualTo("Shch"));
    }

    // --- Mixed content ---

    [Test]
    public void Convert_MixedContent_HandlesCorrectly()
    {
        AssertConverts("Café Müller in Moskva", "Cafe Muller in Moskva");
    }
}