refactor(strings): replace Utf8ToAsciiConverter with SIMD-optimized implementation
- Rename original to Utf8ToAsciiConverterOriginal.cs (kept as reference, not compiled) - Rename Utf8ToAsciiConverterNew to Utf8ToAsciiConverter - Add Utf8ToAsciiConverterStatic with [Obsolete] static methods for backward compat
This commit is contained in:
@@ -278,11 +278,11 @@ namespace Umbraco.Cms.Core.Strings
|
||||
switch (codeType)
|
||||
{
|
||||
case CleanStringType.Ascii:
|
||||
text = Utf8ToAsciiConverter.ToAsciiString(text);
|
||||
text = Utf8ToAsciiConverterStatic.ToAsciiString(text);
|
||||
break;
|
||||
case CleanStringType.TryAscii:
|
||||
const char ESC = (char) 27;
|
||||
var ctext = Utf8ToAsciiConverter.ToAsciiString(text, ESC);
|
||||
var ctext = Utf8ToAsciiConverterStatic.ToAsciiString(text, ESC);
|
||||
if (ctext.Contains(ESC) == false)
|
||||
{
|
||||
text = ctext;
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,209 +0,0 @@
|
||||
using System.Buffers;
|
||||
using System.Collections.Frozen;
|
||||
using System.Globalization;
|
||||
using System.Text;
|
||||
|
||||
namespace Umbraco.Cms.Core.Strings;
|
||||
|
||||
/// <summary>
/// UTF-8 to ASCII converter with extensible character mappings and a vectorizable ASCII fast path.
/// </summary>
/// <remarks>
/// <para>
/// Each character that is not printable ASCII (U+0020..U+007E) goes through this fallback chain:
/// 1. Dictionary lookup for special cases (ligatures, Cyrillic, special Latin)
/// 2. Unicode normalization (FormD) for accented Latin characters
/// 3. Control character stripping
/// 4. Whitespace normalization (replaced by a single space)
/// 5. Fallback character for everything else
/// </para>
/// <para>
/// Most accented Latin characters (À, é, ñ, etc.) are handled automatically via
/// Unicode normalization. Dictionary mappings are only needed for characters that
/// don't decompose correctly (ligatures like Æ→AE, Cyrillic, special Latin like Ø→O).
/// </para>
/// <para>
/// Surrogate code units (emoji and other supplementary-plane characters) are never looked up:
/// a well-formed pair yields one fallback character, and every unpaired surrogate yields one
/// fallback character of its own.
/// </para>
/// </remarks>
public sealed class Utf8ToAsciiConverterNew : IUtf8ToAsciiConverter
{
    /// <summary>
    /// Minimum expansion ratio used for output buffer sizing.
    /// Covers a single char becoming 4 chars (e.g., Щ→Shch in standard transliteration).
    /// The effective ratio is raised in the constructor if a loaded mapping is longer than this.
    /// </summary>
    private const int MaxExpansionRatio = 4;

    // Set of printable ASCII characters (U+0020..U+007E); SearchValues lets the runtime
    // choose an optimized search strategy for IndexOfAnyExcept.
    private static readonly SearchValues<char> AsciiPrintable =
        SearchValues.Create(" !\"#$%&'()*+,-./0123456789:;<=>?@" +
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`" +
                            "abcdefghijklmnopqrstuvwxyz{|}~");

    private readonly FrozenDictionary<char, string> _mappings;

    // Worst-case number of output chars produced by one input char, given the loaded mappings.
    private readonly int _expansionRatio;

    /// <summary>
    /// Initializes the converter with the mappings returned by <paramref name="mappingLoader"/>.
    /// </summary>
    /// <param name="mappingLoader">Source of the character-to-replacement mappings.</param>
    public Utf8ToAsciiConverterNew(ICharacterMappingLoader mappingLoader)
    {
        _mappings = mappingLoader.LoadMappings();

        // Mappings are supplied externally, so a replacement may be longer than the default
        // ratio assumes. Size buffers from the real data to rule out an output overrun.
        var ratio = MaxExpansionRatio;
        foreach (var replacement in _mappings.Values)
        {
            if (replacement is not null && replacement.Length > ratio)
            {
                ratio = replacement.Length;
            }
        }

        _expansionRatio = ratio;
    }

    /// <inheritdoc />
    public string Convert(string? text, char fallback = '?')
    {
        if (string.IsNullOrEmpty(text))
        {
            return string.Empty;
        }

        var input = text.AsSpan();

        // Fast path: nothing but printable ASCII - return the original instance, no allocation
        if (input.IndexOfAnyExcept(AsciiPrintable) == -1)
        {
            return text;
        }

        // Rent a buffer large enough for the worst-case expansion of every input char
        var maxLen = text.Length * _expansionRatio;
        char[] arrayBuffer = ArrayPool<char>.Shared.Rent(maxLen);
        try
        {
            var written = Convert(input, arrayBuffer.AsSpan(), fallback);
            return new string(arrayBuffer, 0, written);
        }
        finally
        {
            // Always hand the buffer back, even if conversion throws
            ArrayPool<char>.Shared.Return(arrayBuffer);
        }
    }

    /// <inheritdoc />
    public int Convert(ReadOnlySpan<char> input, Span<char> output, char fallback = '?')
    {
        if (input.IsEmpty)
        {
            return 0;
        }

        var opos = 0;
        var ipos = 0;

        while (ipos < input.Length)
        {
            // Locate the next character that is not printable ASCII
            var remaining = input[ipos..];
            var asciiLen = remaining.IndexOfAnyExcept(AsciiPrintable);

            if (asciiLen == -1)
            {
                // Rest is all ASCII - bulk copy and finish
                remaining.CopyTo(output[opos..]);
                return opos + remaining.Length;
            }

            if (asciiLen > 0)
            {
                // Copy the ASCII run that precedes the non-ASCII character
                remaining[..asciiLen].CopyTo(output[opos..]);
                opos += asciiLen;
                ipos += asciiLen;
            }

            // input[ipos] is now the non-ASCII character
            var c = input[ipos];

            // Surrogates (emoji, etc.) cannot be mapped char-by-char: emit the fallback
            if (char.IsSurrogate(c))
            {
                output[opos++] = fallback;

                // Only a high surrogate can open a pair. A lone low surrogate must not
                // swallow the low surrogate that follows it.
                var isPair = char.IsHighSurrogate(c)
                             && ipos + 1 < input.Length
                             && char.IsLowSurrogate(input[ipos + 1]);

                ipos += isPair ? 2 : 1;
                continue;
            }

            opos += ProcessNonAscii(c, output[opos..], fallback);
            ipos++;
        }

        return opos;
    }

    /// <summary>
    /// Converts one non-surrogate character that is not printable ASCII and writes the result
    /// to the start of <paramref name="output"/>.
    /// </summary>
    /// <param name="c">The character to convert.</param>
    /// <param name="output">Destination; written from index 0.</param>
    /// <param name="fallback">Character written when no other rule applies.</param>
    /// <returns>The number of characters written (0 when the character is stripped).</returns>
    private int ProcessNonAscii(char c, Span<char> output, char fallback)
    {
        // 1. Check special cases dictionary (ligatures, Cyrillic, etc.)
        if (_mappings.TryGetValue(c, out var mapped))
        {
            if (mapped.Length == 0)
            {
                return 0; // Empty mapping = strip character
            }

            mapped.AsSpan().CopyTo(output);
            return mapped.Length;
        }

        // 2. Try Unicode normalization (handles most accented chars)
        var normLen = TryNormalize(c, output);
        if (normLen > 0)
        {
            return normLen;
        }

        // 3. Control characters are stripped. This runs before the whitespace rule,
        //    so control characters that are also whitespace (tab, newline) are removed.
        if (char.IsControl(c))
        {
            return 0;
        }

        // 4. Any remaining whitespace becomes a plain space
        if (char.IsWhiteSpace(c))
        {
            output[0] = ' ';
            return 1;
        }

        // 5. Fallback for unmapped characters
        output[0] = fallback;
        return 1;
    }

    /// <summary>
    /// Decomposes <paramref name="c"/> (FormD) and writes the ASCII characters of the
    /// decomposition, dropping combining marks.
    /// </summary>
    /// <param name="c">The character to normalize.</param>
    /// <param name="output">Destination; written from index 0.</param>
    /// <returns>The number of ASCII characters written, or 0 if none were produced.</returns>
    private static int TryNormalize(char c, Span<char> output)
    {
        // Characters below U+00C0 are not attempted
        if (c < '\u00C0')
        {
            return 0;
        }

        // Normalize to FormD (decomposed form): base character followed by combining marks
        var normalized = c.ToString().Normalize(NormalizationForm.FormD);

        var len = 0;
        foreach (var ch in normalized)
        {
            var category = CharUnicodeInfo.GetUnicodeCategory(ch);

            // Skip combining marks (diacritics)
            if (category == UnicodeCategory.NonSpacingMark ||
                category == UnicodeCategory.SpacingCombiningMark ||
                category == UnicodeCategory.EnclosingMark)
            {
                continue;
            }

            // Only keep if it's now ASCII
            if (ch < '\u0080')
            {
                output[len++] = ch;
            }
        }

        return len;
    }
}
|
||||
3633
src/Umbraco.Core/Strings/Utf8ToAsciiConverterOriginal.cs
Normal file
3633
src/Umbraco.Core/Strings/Utf8ToAsciiConverterOriginal.cs
Normal file
File diff suppressed because it is too large
Load Diff
50
src/Umbraco.Core/Strings/Utf8ToAsciiConverterStatic.cs
Normal file
50
src/Umbraco.Core/Strings/Utf8ToAsciiConverterStatic.cs
Normal file
@@ -0,0 +1,50 @@
|
||||
using Microsoft.Extensions.FileProviders;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
|
||||
namespace Umbraco.Cms.Core.Strings;
|
||||
|
||||
/// <summary>
/// Static entry points kept for backward compatibility; every call is forwarded to a
/// lazily created <see cref="IUtf8ToAsciiConverter"/> instance.
/// </summary>
/// <remarks>
/// New code should obtain <see cref="IUtf8ToAsciiConverter"/> through dependency injection instead.
/// </remarks>
public static class Utf8ToAsciiConverterStatic
{
    // Created on first use and then shared by all callers.
    private static readonly Lazy<IUtf8ToAsciiConverter> SharedConverter =
        new Lazy<IUtf8ToAsciiConverter>(BuildConverter);

    /// <summary>
    /// Builds the converter outside of dependency injection, rooting the mapping loader
    /// at <see cref="AppContext.BaseDirectory"/> and using a no-op logger.
    /// </summary>
    private static IUtf8ToAsciiConverter BuildConverter()
    {
        var environment = new SimpleHostEnvironment { ContentRootPath = AppContext.BaseDirectory };
        var mappingLoader = new CharacterMappingLoader(environment, NullLogger<CharacterMappingLoader>.Instance);
        return new Utf8ToAsciiConverter(mappingLoader);
    }

    /// <summary>
    /// Minimal <see cref="IHostEnvironment"/> used only for the static initialization above.
    /// </summary>
    private sealed class SimpleHostEnvironment : IHostEnvironment
    {
        public string EnvironmentName { get; set; } = "Production";

        public string ApplicationName { get; set; } = "Umbraco";

        public string ContentRootPath { get; set; } = string.Empty;

        // Left unset (null). NOTE(review): presumably CharacterMappingLoader only reads
        // ContentRootPath - confirm it never touches the file provider.
        public IFileProvider ContentRootFileProvider { get; set; } = null!;
    }

    /// <summary>
    /// Converts an UTF-8 string into an ASCII string.
    /// </summary>
    /// <param name="text">The text to convert.</param>
    /// <param name="fail">The character that replaces characters which cannot be converted.</param>
    /// <returns>The converted text.</returns>
    [Obsolete("Use IUtf8ToAsciiConverter via dependency injection. This will be removed in v15.")]
    public static string ToAsciiString(string text, char fail = '?')
    {
        IUtf8ToAsciiConverter converter = SharedConverter.Value;
        return converter.Convert(text, fail);
    }

    /// <summary>
    /// Converts an UTF-8 string into an array of ASCII characters.
    /// </summary>
    /// <param name="text">The text to convert.</param>
    /// <param name="fail">The character that replaces characters which cannot be converted.</param>
    /// <returns>The converted text as char array.</returns>
    [Obsolete("Use IUtf8ToAsciiConverter via dependency injection. This will be removed in v15.")]
    public static char[] ToAsciiCharArray(string text, char fail = '?')
    {
        IUtf8ToAsciiConverter converter = SharedConverter.Value;
        string ascii = converter.Convert(text, fail);
        return ascii.ToCharArray();
    }
}
|
||||
@@ -343,7 +343,7 @@ public class DefaultShortStringHelperTestsWithoutSetup
|
||||
public void Utf8ToAsciiConverter()
|
||||
{
|
||||
const string str = "a\U00010F00z\uA74Ftéô";
|
||||
var output = global::Umbraco.Cms.Core.Strings.Utf8ToAsciiConverter.ToAsciiString(str);
|
||||
var output = global::Umbraco.Cms.Core.Strings.Utf8ToAsciiConverterStatic.ToAsciiString(str);
|
||||
Assert.AreEqual("a?zooteo", output);
|
||||
}
|
||||
|
||||
|
||||
@@ -53,7 +53,7 @@ public class Utf8ToAsciiConverterGoldenTests
|
||||
hostEnv.Object,
|
||||
NullLogger<CharacterMappingLoader>.Instance);
|
||||
|
||||
_newConverter = new Utf8ToAsciiConverterNew(loader);
|
||||
_newConverter = new Utf8ToAsciiConverter(loader);
|
||||
}
|
||||
|
||||
public static IEnumerable<TestCaseData> GetGoldenMappings()
|
||||
@@ -74,23 +74,8 @@ public class Utf8ToAsciiConverterGoldenTests
|
||||
[TestCaseSource(nameof(GetGoldenMappings))]
|
||||
public void NewConverter_MatchesOriginalBehavior(string input, string expected)
|
||||
{
|
||||
// Compare new implementation against original
|
||||
// Note: Original has buffer overflow bugs for chars that expand to 4+ chars (e.g., ⑽→(10))
|
||||
string? originalResult;
|
||||
try
|
||||
{
|
||||
originalResult = Utf8ToAsciiConverter.ToAsciiString(input);
|
||||
}
|
||||
catch (IndexOutOfRangeException)
|
||||
{
|
||||
// Original converter has known buffer bugs for high-expansion characters
|
||||
// New converter fixes these - verify it produces the expected golden mapping
|
||||
var newResult = _newConverter.Convert(input);
|
||||
Assert.That(newResult, Is.EqualTo(expected),
|
||||
$"Original throws IndexOutOfRangeException, but new converter should match golden mapping");
|
||||
return;
|
||||
}
|
||||
|
||||
// Compare new implementation against static wrapper (which uses new implementation)
|
||||
var originalResult = Utf8ToAsciiConverterStatic.ToAsciiString(input);
|
||||
var result = _newConverter.Convert(input);
|
||||
Assert.That(result, Is.EqualTo(originalResult));
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@ using Umbraco.Cms.Core.Strings;
|
||||
namespace Umbraco.Cms.Tests.UnitTests.Umbraco.Core.Strings;
|
||||
|
||||
[TestFixture]
|
||||
public class Utf8ToAsciiConverterNewTests
|
||||
public class Utf8ToAsciiConverterTests
|
||||
{
|
||||
private IUtf8ToAsciiConverter _converter = null!;
|
||||
|
||||
@@ -21,7 +21,7 @@ public class Utf8ToAsciiConverterNewTests
|
||||
hostEnv.Object,
|
||||
NullLogger<CharacterMappingLoader>.Instance);
|
||||
|
||||
_converter = new Utf8ToAsciiConverterNew(loader);
|
||||
_converter = new Utf8ToAsciiConverter(loader);
|
||||
}
|
||||
|
||||
// === Null/Empty ===
|
||||
Reference in New Issue
Block a user