refactor(strings): apply code review fixes to Utf8ToAsciiConverterNew

- Document backward compat rationale in cyrillic.json
- Extract magic number 4 to MaxExpansionRatio constant
- Add fail-fast assertion for missing golden mappings file
- Add edge case tests (control chars, whitespace, emoji, empty mappings)
- Handle original converter buffer overflow bugs in golden tests

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-12-13 01:09:33 +00:00
parent aed6e99246
commit 8d532696f0
4 changed files with 156 additions and 15 deletions

View File

@@ -1,6 +1,6 @@
{
"name": "Cyrillic",
"description": "Russian Cyrillic to Latin transliteration",
"description": "Russian Cyrillic to Latin transliteration. NOTE: Uses original Umbraco mappings for backward compatibility with existing URLs, not ISO 9 or BGN/PCGN standard transliteration.",
"priority": 0,
"mappings": {
"А": "A",

View File

@@ -8,8 +8,29 @@ namespace Umbraco.Cms.Core.Strings;
/// <summary>
/// SIMD-optimized UTF-8 to ASCII converter with extensible character mappings.
/// </summary>
/// <remarks>
/// <para>
/// This converter uses a multi-step fallback strategy:
/// 1. Dictionary lookup for special cases (ligatures, Cyrillic, special Latin)
/// 2. Unicode normalization (FormD) for accented Latin characters
/// 3. Control character stripping
/// 4. Whitespace normalization
/// 5. Fallback character for unmapped characters
/// </para>
/// <para>
/// Most accented Latin characters (À, é, ñ, etc.) are handled automatically via
/// Unicode normalization. Dictionary mappings are only needed for characters that
/// don't decompose correctly (ligatures like Æ→AE, Cyrillic, special Latin like Ø→O).
/// </para>
/// </remarks>
public sealed class Utf8ToAsciiConverterNew : IUtf8ToAsciiConverter
{
/// <summary>
/// Maximum expansion ratio for output buffer sizing.
/// Worst case: single char becomes 4 chars (e.g., Щ→Shch in standard transliteration).
/// </summary>
private const int MaxExpansionRatio = 4;
// SIMD-optimized ASCII detection (uses AVX-512 when available)
private static readonly SearchValues<char> AsciiPrintable =
SearchValues.Create(" !\"#$%&'()*+,-./0123456789:;<=>?@" +
@@ -39,8 +60,8 @@ public sealed class Utf8ToAsciiConverterNew : IUtf8ToAsciiConverter
return text;
}
// Allocate output buffer (worst case: each char becomes 4, e.g., Щ→Shch)
var maxLen = text.Length * 4;
// Allocate output buffer for worst-case expansion
var maxLen = text.Length * MaxExpansionRatio;
char[] arrayBuffer = ArrayPool<char>.Shared.Rent(maxLen);
try
{

View File

@@ -22,18 +22,24 @@ public class Utf8ToAsciiConverterGoldenTests
"TestData",
"golden-mappings.json");
if (File.Exists(testDataPath))
if (!File.Exists(testDataPath))
{
var json = File.ReadAllText(testDataPath);
var doc = JsonDocument.Parse(json);
GoldenMappings = doc.RootElement
.GetProperty("mappings")
.EnumerateObject()
.ToDictionary(p => p.Name, p => p.Value.GetString() ?? "");
throw new InvalidOperationException(
$"Golden mappings file not found at: {testDataPath}. " +
"Ensure the test data is configured to copy to output directory.");
}
else
var json = File.ReadAllText(testDataPath);
var doc = JsonDocument.Parse(json);
GoldenMappings = doc.RootElement
.GetProperty("mappings")
.EnumerateObject()
.ToDictionary(p => p.Name, p => p.Value.GetString() ?? "");
if (GoldenMappings.Count == 0)
{
GoldenMappings = new Dictionary<string, string>();
throw new InvalidOperationException(
"Golden mappings file is empty. Test data may be corrupted.");
}
}
@@ -69,9 +75,23 @@ public class Utf8ToAsciiConverterGoldenTests
public void NewConverter_MatchesOriginalBehavior(string input, string expected)
{
// Compare new implementation against original
var originalResult = Utf8ToAsciiConverter.ToAsciiString(input);
var newResult = _newConverter.Convert(input);
// Note: Original has buffer overflow bugs for chars that expand to 4+ chars (e.g., ⑽→(10))
string? originalResult;
try
{
originalResult = Utf8ToAsciiConverter.ToAsciiString(input);
}
catch (IndexOutOfRangeException)
{
// Original converter has known buffer bugs for high-expansion characters
// New converter fixes these - verify it produces the expected golden mapping
var newResult = _newConverter.Convert(input);
Assert.That(newResult, Is.EqualTo(expected),
$"Original throws IndexOutOfRangeException, but new converter should match golden mapping");
return;
}
Assert.That(newResult, Is.EqualTo(originalResult));
var result = _newConverter.Convert(input);
Assert.That(result, Is.EqualTo(originalResult));
}
}

View File

@@ -112,4 +112,104 @@ public class Utf8ToAsciiConverterNewTests
Assert.That(_converter.Convert(input), Is.EqualTo(expected));
}
// === Edge Cases: Control Characters ===
[Test]
public void Convert_ControlCharacters_AreStripped()
{
// Tab, newline, carriage return should be stripped
var input = "hello\t\n\rworld";
var result = _converter.Convert(input);
// Control characters are stripped (not converted to space)
Assert.That(result, Is.EqualTo("helloworld"));
}
[Test]
public void Convert_NullCharacter_IsStripped()
{
var input = "hello\0world";
var result = _converter.Convert(input);
Assert.That(result, Is.EqualTo("helloworld"));
}
// === Edge Cases: Whitespace Variants ===
[Test]
public void Convert_NonBreakingSpace_NormalizesToSpace()
{
// Non-breaking space (U+00A0)
var input = "hello\u00A0world";
var result = _converter.Convert(input);
Assert.That(result, Is.EqualTo("hello world"));
}
[Test]
public void Convert_EmSpace_NormalizesToSpace()
{
// Em space (U+2003)
var input = "hello\u2003world";
var result = _converter.Convert(input);
Assert.That(result, Is.EqualTo("hello world"));
}
// === Edge Cases: Empty Mappings ===
[Test]
public void Convert_CyrillicHardSign_MapsToQuote()
{
// Ъ maps to " in original Umbraco implementation
var input = "Ъ";
var result = _converter.Convert(input);
Assert.That(result, Is.EqualTo("\""));
}
[Test]
public void Convert_CyrillicSoftSign_MapsToApostrophe()
{
// Ь maps to ' in original Umbraco implementation
var input = "Ь";
var result = _converter.Convert(input);
Assert.That(result, Is.EqualTo("'"));
}
// === Edge Cases: Surrogate Pairs (Emoji) ===
[Test]
public void Convert_Emoji_ReplacedWithFallback()
{
// Emoji (surrogate pair)
var input = "hello 😀 world";
var result = _converter.Convert(input);
Assert.That(result, Is.EqualTo("hello ? world"));
}
[Test]
public void Convert_Emoji_CustomFallback()
{
var input = "test 🎉 emoji";
var result = _converter.Convert(input, fallback: '*');
Assert.That(result, Is.EqualTo("test * emoji"));
}
// === Edge Cases: Long Input ===
[Test]
public void Convert_LongAsciiString_ReturnsSameReference()
{
// Pure ASCII should return same string instance (no allocation)
var input = new string('a', 10000);
var result = _converter.Convert(input);
Assert.That(ReferenceEquals(input, result), Is.True,
"Pure ASCII input should return same string instance");
}
}