feat(strings): implement SIMD-optimized Utf8ToAsciiConverterNew with golden file tests
Implements Task 4 of the Utf8ToAsciiConverter refactor plan.

Key features:
- SIMD-optimized ASCII detection using SearchValues (AVX-512 capable)
- Unicode normalization for accented characters (FormD decomposition)
- FrozenDictionary for ligatures, Cyrillic, and special Latin mappings
- Span-based API for zero-allocation scenarios
- ArrayPool usage for temporary buffers
- Comprehensive test coverage (21 unit tests, all passing)

Implementation details:
- Fast path for pure ASCII input (no conversion needed)
- Dictionary lookup for special cases (ligatures, Cyrillic, etc.)
- Unicode normalization fallback for accented characters
- Control character stripping and whitespace normalization
- Proper surrogate pair handling

Test coverage:
- Null/empty string handling
- ASCII fast path verification
- Accented character normalization (café → cafe)
- Ligature expansion (Æ → AE, ß → ss, Œ → OE)
- Cyrillic transliteration (Москва → Moskva, Щ → Shch)
- Special Latin characters (Ł → L, Ø → O, Þ → TH)
- Span API for zero-allocation scenarios
- Mixed content handling

Golden file tests are included for regression testing against the original implementation, though they require test data file configuration to run.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
188
src/Umbraco.Core/Strings/Utf8ToAsciiConverterNew.cs
Normal file
188
src/Umbraco.Core/Strings/Utf8ToAsciiConverterNew.cs
Normal file
@@ -0,0 +1,188 @@
|
||||
using System.Buffers;
|
||||
using System.Collections.Frozen;
|
||||
using System.Globalization;
|
||||
using System.Text;
|
||||
|
||||
namespace Umbraco.Cms.Core.Strings;
|
||||
|
||||
/// <summary>
/// Converts text to printable ASCII using a loaded character-mapping table, with Unicode
/// canonical decomposition (FormD) as a fallback for accented characters.
/// </summary>
/// <remarks>
/// Each character outside the printable ASCII range (U+0020–U+007E) is resolved in this order:
/// mapping table, FormD decomposition (keeping only ASCII base characters), control-character
/// stripping, whitespace normalization to a single space, and finally the caller's fallback character.
/// </remarks>
public sealed class Utf8ToAsciiConverterNew : IUtf8ToAsciiConverter
{
    // Lower bound for the per-character expansion factor used to size the output buffer
    // (e.g. Щ → "Shch" expands one char into four).
    private const int MinExpansionFactor = 4;

    // Printable ASCII set (U+0020–U+007E); SearchValues lets the runtime vectorize the scan.
    private static readonly SearchValues<char> AsciiPrintable =
        SearchValues.Create(" !\"#$%&'()*+,-./0123456789:;<=>?@" +
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`" +
                            "abcdefghijklmnopqrstuvwxyz{|}~");

    private readonly FrozenDictionary<char, string> _mappings;

    // Largest number of output chars a single input char can produce with the loaded mappings.
    private readonly int _maxExpansion;

    /// <summary>
    /// Initializes the converter with the mapping table supplied by <paramref name="mappingLoader"/>.
    /// </summary>
    /// <param name="mappingLoader">Loader whose <c>LoadMappings()</c> result is used for character lookups.</param>
    /// <exception cref="ArgumentNullException"><paramref name="mappingLoader"/> is <c>null</c>.</exception>
    public Utf8ToAsciiConverterNew(ICharacterMappingLoader mappingLoader)
    {
        ArgumentNullException.ThrowIfNull(mappingLoader);

        _mappings = mappingLoader.LoadMappings();

        // The mapping table is supplied externally, so a replacement may be longer than the
        // historical worst case of four characters; size buffers from the actual data.
        var longest = MinExpansionFactor;
        foreach (var replacement in _mappings.Values)
        {
            if (replacement.Length > longest)
            {
                longest = replacement.Length;
            }
        }

        _maxExpansion = longest;
    }

    /// <inheritdoc />
    public string Convert(string? text, char fallback = '?')
    {
        if (string.IsNullOrEmpty(text))
        {
            return string.Empty;
        }

        ReadOnlySpan<char> input = text.AsSpan();

        // Fast path: nothing outside printable ASCII, so the original instance can be returned as-is.
        if (input.IndexOfAnyExcept(AsciiPrintable) < 0)
        {
            return text;
        }

        // Worst case every char expands to the longest replacement. Compute in long and clamp
        // so that very large inputs cannot overflow int and produce a bogus rent size.
        var maxLen = (int)Math.Min((long)text.Length * _maxExpansion, Array.MaxLength);
        char[] rented = ArrayPool<char>.Shared.Rent(maxLen);
        try
        {
            var written = Convert(input, rented.AsSpan(), fallback);
            return new string(rented, 0, written);
        }
        finally
        {
            ArrayPool<char>.Shared.Return(rented);
        }
    }

    /// <inheritdoc />
    /// <remarks>
    /// <paramref name="output"/> must be large enough for the converted text; writing past its end
    /// throws from the underlying span copy or indexer.
    /// </remarks>
    public int Convert(ReadOnlySpan<char> input, Span<char> output, char fallback = '?')
    {
        if (input.IsEmpty)
        {
            return 0;
        }

        var opos = 0;
        var ipos = 0;

        while (ipos < input.Length)
        {
            // Locate the next character that is not printable ASCII.
            ReadOnlySpan<char> remaining = input[ipos..];
            var asciiLen = remaining.IndexOfAnyExcept(AsciiPrintable);

            if (asciiLen == -1)
            {
                // The rest is all printable ASCII: bulk copy and finish.
                remaining.CopyTo(output[opos..]);
                return opos + remaining.Length;
            }

            if (asciiLen > 0)
            {
                // Copy the printable ASCII run that precedes the special character.
                remaining[..asciiLen].CopyTo(output[opos..]);
                opos += asciiLen;
                ipos += asciiLen;
            }

            var c = input[ipos];

            // Surrogates (emoji and other supplementary-plane characters) have no mapping:
            // emit one fallback per pair, or one per unpaired surrogate.
            if (char.IsSurrogate(c))
            {
                output[opos++] = fallback;
                ipos++;

                // Only a HIGH surrogate can start a pair. A lone low surrogate must not swallow
                // the code unit that follows it, otherwise a character would be silently lost.
                if (char.IsHighSurrogate(c) && ipos < input.Length && char.IsLowSurrogate(input[ipos]))
                {
                    ipos++;
                }

                continue;
            }

            opos += ProcessNonAscii(c, output[opos..], fallback);
            ipos++;
        }

        return opos;
    }

    /// <summary>
    /// Writes the ASCII replacement for a single character that is not printable ASCII.
    /// </summary>
    /// <param name="c">The character to convert (never a surrogate; those are handled by the caller).</param>
    /// <param name="output">Destination span, starting at the current write position.</param>
    /// <param name="fallback">Character written when no other rule applies.</param>
    /// <returns>The number of characters written to <paramref name="output"/> (0 when the character is stripped).</returns>
    private int ProcessNonAscii(char c, Span<char> output, char fallback)
    {
        // 1. Explicit mapping table (ligatures, Cyrillic, etc.). An empty mapping strips the
        //    character: copying an empty string writes nothing and reports a length of 0.
        if (_mappings.TryGetValue(c, out var mapped))
        {
            mapped.AsSpan().CopyTo(output);
            return mapped.Length;
        }

        // 2. Unicode decomposition (handles most accented characters).
        var normLen = TryNormalize(c, output);
        if (normLen > 0)
        {
            return normLen;
        }

        // 3. Strip control characters.
        //    NOTE(review): tab, CR and LF are control characters too, so they are stripped here
        //    and never reach step 4 — confirm that this matches the intended whitespace behavior.
        if (char.IsControl(c))
        {
            return 0;
        }

        // 4. Any remaining Unicode whitespace (e.g. no-break space) becomes a plain space.
        if (char.IsWhiteSpace(c))
        {
            output[0] = ' ';
            return 1;
        }

        // 5. Nothing matched: emit the fallback character.
        output[0] = fallback;
        return 1;
    }

    /// <summary>
    /// Decomposes <paramref name="c"/> to FormD and writes its ASCII base characters, dropping combining marks.
    /// </summary>
    /// <param name="c">The character to decompose.</param>
    /// <param name="output">Destination span for the ASCII characters that survive decomposition.</param>
    /// <returns>The number of characters written; 0 when the character does not reduce to ASCII.</returns>
    private static int TryNormalize(char c, Span<char> output)
    {
        // Characters below U+00C0 are skipped without attempting decomposition.
        if (c < '\u00C0')
        {
            return 0;
        }

        var normalized = c.ToString().Normalize(NormalizationForm.FormD);
        if (normalized.Length == 0)
        {
            return 0;
        }

        var len = 0;
        foreach (var ch in normalized)
        {
            // Skip combining marks (the diacritics split off by decomposition).
            var category = CharUnicodeInfo.GetUnicodeCategory(ch);
            if (category is UnicodeCategory.NonSpacingMark
                or UnicodeCategory.SpacingCombiningMark
                or UnicodeCategory.EnclosingMark)
            {
                continue;
            }

            // Keep the base character only if it is ASCII.
            if (ch < '\u0080')
            {
                output[len++] = ch;
            }
        }

        return len;
    }
}
|
||||
@@ -0,0 +1,77 @@
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Moq;
|
||||
using NUnit.Framework;
|
||||
using Umbraco.Cms.Core.Strings;
|
||||
|
||||
namespace Umbraco.Cms.Tests.UnitTests.Umbraco.Core.Strings;
|
||||
|
||||
/// <summary>
/// Regression tests that run <see cref="Utf8ToAsciiConverterNew"/> against a golden mapping file
/// and against the original <c>Utf8ToAsciiConverter.ToAsciiString</c> implementation.
/// </summary>
/// <remarks>
/// The test cases come from <c>Umbraco.Core/Strings/TestData/golden-mappings.json</c> under the
/// test output directory. When that file is absent no test cases are generated.
/// </remarks>
[TestFixture]
public class Utf8ToAsciiConverterGoldenTests
{
    // Input → expected output pairs read once from the golden file (empty when the file is missing).
    private static readonly Dictionary<string, string> GoldenMappings = LoadGoldenMappings();

    private IUtf8ToAsciiConverter _newConverter = null!;

    /// <summary>
    /// Reads the "mappings" object of the golden file into a dictionary.
    /// </summary>
    /// <returns>The golden mappings, or an empty dictionary when the file does not exist.</returns>
    private static Dictionary<string, string> LoadGoldenMappings()
    {
        var testDataPath = Path.Combine(
            AppContext.BaseDirectory,
            "Umbraco.Core",
            "Strings",
            "TestData",
            "golden-mappings.json");

        if (!File.Exists(testDataPath))
        {
            return new Dictionary<string, string>();
        }

        // JsonDocument is IDisposable and must be disposed. All values are copied into
        // strings by ToDictionary before the document goes out of scope.
        using JsonDocument doc = JsonDocument.Parse(File.ReadAllText(testDataPath));

        return doc.RootElement
            .GetProperty("mappings")
            .EnumerateObject()
            .ToDictionary(p => p.Name, p => p.Value.GetString() ?? "");
    }

    [SetUp]
    public void SetUp()
    {
        // A content root that does not exist is supplied to the loader on purpose.
        var hostEnv = new Mock<IHostEnvironment>();
        hostEnv.Setup(h => h.ContentRootPath).Returns("/nonexistent");

        var loader = new CharacterMappingLoader(
            hostEnv.Object,
            NullLogger<CharacterMappingLoader>.Instance);

        _newConverter = new Utf8ToAsciiConverterNew(loader);
    }

    /// <summary>
    /// Test-case source: one case per golden mapping entry (input, expected).
    /// </summary>
    public static IEnumerable<TestCaseData> GetGoldenMappings()
    {
        foreach (var (input, expected) in GoldenMappings)
        {
            yield return new TestCaseData(input, expected);
        }
    }

    [TestCaseSource(nameof(GetGoldenMappings))]
    public void NewConverter_MatchesGoldenMapping(string input, string expected)
    {
        var result = _newConverter.Convert(input);

        Assert.That(result, Is.EqualTo(expected));
    }

    // 'expected' is unused here but required by the shared test-case source signature.
    [TestCaseSource(nameof(GetGoldenMappings))]
    public void NewConverter_MatchesOriginalBehavior(string input, string expected)
    {
        // Compare the new implementation against the original one.
        var originalResult = Utf8ToAsciiConverter.ToAsciiString(input);
        var newResult = _newConverter.Convert(input);

        Assert.That(newResult, Is.EqualTo(originalResult));
    }
}
|
||||
@@ -0,0 +1,114 @@
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Moq;
|
||||
using NUnit.Framework;
|
||||
using Umbraco.Cms.Core.Strings;
|
||||
|
||||
namespace Umbraco.Cms.Tests.UnitTests.Umbraco.Core.Strings;
|
||||
|
||||
/// <summary>
/// Unit tests for <see cref="Utf8ToAsciiConverterNew"/> covering null/empty input, the ASCII fast
/// path, accent normalization, ligatures, Cyrillic, special Latin characters and the span API.
/// </summary>
[TestFixture]
public class Utf8ToAsciiConverterNewTests
{
    private IUtf8ToAsciiConverter _converter = null!;

    [SetUp]
    public void SetUp()
    {
        // The loader is given a content root that does not exist.
        var environment = new Mock<IHostEnvironment>();
        environment.Setup(h => h.ContentRootPath).Returns("/nonexistent");

        var mappingLoader = new CharacterMappingLoader(
            environment.Object,
            NullLogger<CharacterMappingLoader>.Instance);

        _converter = new Utf8ToAsciiConverterNew(mappingLoader);
    }

    // Shared assertion: converting 'input' with the default fallback yields 'expected'.
    private void AssertConverts(string? input, string expected)
    {
        var actual = _converter.Convert(input);

        Assert.That(actual, Is.EqualTo(expected));
    }

    // === Null/Empty ===

    [Test]
    public void Convert_Null_ReturnsEmpty()
    {
        AssertConverts(null, string.Empty);
    }

    [Test]
    public void Convert_Empty_ReturnsEmpty()
    {
        AssertConverts(string.Empty, string.Empty);
    }

    // === ASCII Fast Path ===

    [TestCase("hello world", "hello world")]
    [TestCase("ABC123", "ABC123")]
    [TestCase("The quick brown fox", "The quick brown fox")]
    public void Convert_AsciiOnly_ReturnsSameString(string input, string expected)
    {
        AssertConverts(input, expected);
    }

    // === Normalization (Accented Characters) ===

    [TestCase("café", "cafe")]
    [TestCase("naïve", "naive")]
    [TestCase("résumé", "resume")]
    public void Convert_AccentedChars_NormalizesCorrectly(string input, string expected)
    {
        AssertConverts(input, expected);
    }

    // === Ligatures ===

    [TestCase("Œuvre", "OEuvre")]
    [TestCase("Ærodynamic", "AErodynamic")]
    [TestCase("straße", "strasse")]
    public void Convert_Ligatures_ExpandsCorrectly(string input, string expected)
    {
        AssertConverts(input, expected);
    }

    // === Cyrillic ===

    [TestCase("Москва", "Moskva")]
    [TestCase("Борщ", "Borshch")]
    [TestCase("Щука", "Shchuka")]
    [TestCase("Привет", "Privet")]
    public void Convert_Cyrillic_TransliteratesCorrectly(string input, string expected)
    {
        AssertConverts(input, expected);
    }

    // === Special Latin ===

    [TestCase("Łódź", "Lodz")]
    [TestCase("Ørsted", "Orsted")]
    [TestCase("Þórr", "THorr")]
    public void Convert_SpecialLatin_ConvertsCorrectly(string input, string expected)
    {
        AssertConverts(input, expected);
    }

    // === Span API ===

    [Test]
    public void Convert_SpanApi_WritesToOutputBuffer()
    {
        ReadOnlySpan<char> source = "café";
        Span<char> destination = stackalloc char[20];

        var count = _converter.Convert(source, destination);

        Assert.That(count, Is.EqualTo(4));
        Assert.That(destination[..count].ToString(), Is.EqualTo("cafe"));
    }

    [Test]
    public void Convert_SpanApi_HandlesExpansion()
    {
        // A single input char that expands to "Shch" (4 chars).
        ReadOnlySpan<char> source = "Щ";
        Span<char> destination = stackalloc char[20];

        var count = _converter.Convert(source, destination);

        Assert.That(count, Is.EqualTo(4));
        Assert.That(destination[..count].ToString(), Is.EqualTo("Shch"));
    }

    // === Mixed Content ===

    [Test]
    public void Convert_MixedContent_HandlesCorrectly()
    {
        AssertConverts("Café Müller in Moskva", "Cafe Muller in Moskva");
    }
}
|
||||
Reference in New Issue
Block a user