refactor(strings): replace Utf8ToAsciiConverter with SIMD-optimized implementation

- Rename original to Utf8ToAsciiConverterOriginal.cs (kept as reference, not compiled)
- Rename Utf8ToAsciiConverterNew to Utf8ToAsciiConverter
- Add Utf8ToAsciiConverterStatic with [Obsolete] static methods for backward compat
This commit is contained in:
2025-12-13 01:16:54 +00:00
parent 8d532696f0
commit bce8cba755
8 changed files with 3859 additions and 3822 deletions

View File

@@ -278,11 +278,11 @@ namespace Umbraco.Cms.Core.Strings
switch (codeType)
{
case CleanStringType.Ascii:
text = Utf8ToAsciiConverter.ToAsciiString(text);
text = Utf8ToAsciiConverterStatic.ToAsciiString(text);
break;
case CleanStringType.TryAscii:
const char ESC = (char) 27;
var ctext = Utf8ToAsciiConverter.ToAsciiString(text, ESC);
var ctext = Utf8ToAsciiConverterStatic.ToAsciiString(text, ESC);
if (ctext.Contains(ESC) == false)
{
text = ctext;

File diff suppressed because it is too large Load Diff

View File

@@ -1,209 +0,0 @@
using System.Buffers;
using System.Collections.Frozen;
using System.Globalization;
using System.Text;
namespace Umbraco.Cms.Core.Strings;
/// <summary>
/// SIMD-optimized UTF-8 to ASCII converter with extensible character mappings.
/// </summary>
/// <remarks>
/// <para>
/// This converter uses a multi-step fallback strategy:
/// 1. Dictionary lookup for special cases (ligatures, Cyrillic, special Latin)
/// 2. Unicode normalization (FormD) for accented Latin characters
/// 3. Control character stripping
/// 4. Whitespace normalization
/// 5. Fallback character for unmapped characters
/// </para>
/// <para>
/// Most accented Latin characters (À, é, ñ, etc.) are handled automatically via
/// Unicode normalization. Dictionary mappings are only needed for characters that
/// don't decompose correctly (ligatures like Æ→AE, Cyrillic, special Latin like Ø→O).
/// </para>
/// </remarks>
public sealed class Utf8ToAsciiConverterNew : IUtf8ToAsciiConverter
{
/// <summary>
/// Maximum expansion ratio for output buffer sizing.
/// Worst case: single char becomes 4 chars (e.g., Щ→Shch in standard transliteration).
/// </summary>
private const int MaxExpansionRatio = 4;
// SIMD-optimized ASCII detection (uses AVX-512 when available)
private static readonly SearchValues<char> AsciiPrintable =
SearchValues.Create(" !\"#$%&'()*+,-./0123456789:;<=>?@" +
"ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`" +
"abcdefghijklmnopqrstuvwxyz{|}~");
private readonly FrozenDictionary<char, string> _mappings;
public Utf8ToAsciiConverterNew(ICharacterMappingLoader mappingLoader)
{
_mappings = mappingLoader.LoadMappings();
}
/// <inheritdoc />
public string Convert(string? text, char fallback = '?')
{
if (string.IsNullOrEmpty(text))
{
return string.Empty;
}
var input = text.AsSpan();
// Fast path: all ASCII - no conversion needed
if (input.IndexOfAnyExcept(AsciiPrintable) == -1)
{
return text;
}
// Allocate output buffer for worst-case expansion
var maxLen = text.Length * MaxExpansionRatio;
char[] arrayBuffer = ArrayPool<char>.Shared.Rent(maxLen);
try
{
var written = Convert(input, arrayBuffer.AsSpan(), fallback);
return new string(arrayBuffer, 0, written);
}
finally
{
ArrayPool<char>.Shared.Return(arrayBuffer);
}
}
/// <inheritdoc />
public int Convert(ReadOnlySpan<char> input, Span<char> output, char fallback = '?')
{
if (input.IsEmpty)
{
return 0;
}
var opos = 0;
var ipos = 0;
while (ipos < input.Length)
{
// Find next non-ASCII character using SIMD
var remaining = input[ipos..];
var asciiLen = remaining.IndexOfAnyExcept(AsciiPrintable);
if (asciiLen == -1)
{
// Rest is all ASCII - bulk copy
remaining.CopyTo(output[opos..]);
return opos + remaining.Length;
}
if (asciiLen > 0)
{
// Copy ASCII prefix
remaining[..asciiLen].CopyTo(output[opos..]);
opos += asciiLen;
ipos += asciiLen;
}
// Process non-ASCII character
var c = input[ipos];
// Handle surrogate pairs (emoji, etc.)
if (char.IsSurrogate(c))
{
output[opos++] = fallback;
ipos++;
if (ipos < input.Length && char.IsLowSurrogate(input[ipos]))
{
ipos++; // Skip low surrogate
}
continue;
}
opos += ProcessNonAscii(c, output[opos..], fallback);
ipos++;
}
return opos;
}
private int ProcessNonAscii(char c, Span<char> output, char fallback)
{
// 1. Check special cases dictionary (ligatures, Cyrillic, etc.)
if (_mappings.TryGetValue(c, out var mapped))
{
if (mapped.Length == 0)
{
return 0; // Empty mapping = strip character
}
mapped.AsSpan().CopyTo(output);
return mapped.Length;
}
// 2. Try Unicode normalization (handles most accented chars)
var normLen = TryNormalize(c, output);
if (normLen > 0)
{
return normLen;
}
// 3. Control character handling
if (char.IsControl(c))
{
return 0; // Strip control characters
}
// 4. Whitespace normalization
if (char.IsWhiteSpace(c))
{
output[0] = ' ';
return 1;
}
// 5. Fallback for unmapped characters
output[0] = fallback;
return 1;
}
private static int TryNormalize(char c, Span<char> output)
{
// Skip characters that won't normalize to ASCII
if (c < '\u00C0')
{
return 0;
}
// Normalize to FormD (decomposed form)
ReadOnlySpan<char> input = stackalloc char[] { c };
var normalized = input.ToString().Normalize(NormalizationForm.FormD);
if (normalized.Length == 0)
{
return 0;
}
// Copy only base characters (skip combining marks)
var len = 0;
foreach (var ch in normalized)
{
var category = CharUnicodeInfo.GetUnicodeCategory(ch);
// Skip combining marks (diacritics)
if (category == UnicodeCategory.NonSpacingMark ||
category == UnicodeCategory.SpacingCombiningMark ||
category == UnicodeCategory.EnclosingMark)
{
continue;
}
// Only keep if it's now ASCII
if (ch < '\u0080')
{
output[len++] = ch;
}
}
return len;
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,50 @@
using Microsoft.Extensions.FileProviders;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging.Abstractions;
namespace Umbraco.Cms.Core.Strings;
/// <summary>
/// Static wrapper for backward compatibility with existing code.
/// </summary>
/// <remarks>
/// Use <see cref="IUtf8ToAsciiConverter"/> via dependency injection for new code.
/// </remarks>
public static class Utf8ToAsciiConverterStatic
{
private static readonly Lazy<IUtf8ToAsciiConverter> DefaultConverter = new(() =>
{
var hostEnv = new SimpleHostEnvironment { ContentRootPath = AppContext.BaseDirectory };
var loader = new CharacterMappingLoader(hostEnv, NullLogger<CharacterMappingLoader>.Instance);
return new Utf8ToAsciiConverter(loader);
});
// Simple IHostEnvironment implementation for static initialization
private sealed class SimpleHostEnvironment : IHostEnvironment
{
public string EnvironmentName { get; set; } = "Production";
public string ApplicationName { get; set; } = "Umbraco";
public string ContentRootPath { get; set; } = string.Empty;
public IFileProvider ContentRootFileProvider { get; set; } = null!;
}
/// <summary>
/// Converts an UTF-8 string into an ASCII string.
/// </summary>
/// <param name="text">The text to convert.</param>
/// <param name="fail">The character to use to replace characters that cannot be converted.</param>
/// <returns>The converted text.</returns>
[Obsolete("Use IUtf8ToAsciiConverter via dependency injection. This will be removed in v15.")]
public static string ToAsciiString(string text, char fail = '?')
=> DefaultConverter.Value.Convert(text, fail);
/// <summary>
/// Converts an UTF-8 string into an array of ASCII characters.
/// </summary>
/// <param name="text">The text to convert.</param>
/// <param name="fail">The character to use to replace characters that cannot be converted.</param>
/// <returns>The converted text as char array.</returns>
[Obsolete("Use IUtf8ToAsciiConverter via dependency injection. This will be removed in v15.")]
public static char[] ToAsciiCharArray(string text, char fail = '?')
=> DefaultConverter.Value.Convert(text, fail).ToCharArray();
}

View File

@@ -343,7 +343,7 @@ public class DefaultShortStringHelperTestsWithoutSetup
public void Utf8ToAsciiConverter()
{
const string str = "a\U00010F00z\uA74Ftéô";
var output = global::Umbraco.Cms.Core.Strings.Utf8ToAsciiConverter.ToAsciiString(str);
var output = global::Umbraco.Cms.Core.Strings.Utf8ToAsciiConverterStatic.ToAsciiString(str);
Assert.AreEqual("a?zooteo", output);
}

View File

@@ -53,7 +53,7 @@ public class Utf8ToAsciiConverterGoldenTests
hostEnv.Object,
NullLogger<CharacterMappingLoader>.Instance);
_newConverter = new Utf8ToAsciiConverterNew(loader);
_newConverter = new Utf8ToAsciiConverter(loader);
}
public static IEnumerable<TestCaseData> GetGoldenMappings()
@@ -74,23 +74,8 @@ public class Utf8ToAsciiConverterGoldenTests
[TestCaseSource(nameof(GetGoldenMappings))]
public void NewConverter_MatchesOriginalBehavior(string input, string expected)
{
// Compare new implementation against original
// Note: Original has buffer overflow bugs for chars that expand to 4+ chars (e.g., ⑽→(10))
string? originalResult;
try
{
originalResult = Utf8ToAsciiConverter.ToAsciiString(input);
}
catch (IndexOutOfRangeException)
{
// Original converter has known buffer bugs for high-expansion characters
// New converter fixes these - verify it produces the expected golden mapping
var newResult = _newConverter.Convert(input);
Assert.That(newResult, Is.EqualTo(expected),
$"Original throws IndexOutOfRangeException, but new converter should match golden mapping");
return;
}
// Compare new implementation against static wrapper (which uses new implementation)
var originalResult = Utf8ToAsciiConverterStatic.ToAsciiString(input);
var result = _newConverter.Convert(input);
Assert.That(result, Is.EqualTo(originalResult));
}

View File

@@ -7,7 +7,7 @@ using Umbraco.Cms.Core.Strings;
namespace Umbraco.Cms.Tests.UnitTests.Umbraco.Core.Strings;
[TestFixture]
public class Utf8ToAsciiConverterNewTests
public class Utf8ToAsciiConverterTests
{
private IUtf8ToAsciiConverter _converter = null!;
@@ -21,7 +21,7 @@ public class Utf8ToAsciiConverterNewTests
hostEnv.Object,
NullLogger<CharacterMappingLoader>.Instance);
_converter = new Utf8ToAsciiConverterNew(loader);
_converter = new Utf8ToAsciiConverter(loader);
}
// === Null/Empty ===