refactor(strings): apply code review fixes to Utf8ToAsciiConverterNew
- Document backward compat rationale in cyrillic.json - Extract magic number 4 to MaxExpansionRatio constant - Add fail-fast assertion for missing golden mappings file - Add edge case tests (control chars, whitespace, emoji, empty mappings) - Handle original converter buffer overflow bugs in golden tests 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "Cyrillic",
|
||||
"description": "Russian Cyrillic to Latin transliteration",
|
||||
"description": "Russian Cyrillic to Latin transliteration. NOTE: Uses original Umbraco mappings for backward compatibility with existing URLs, not ISO 9 or BGN/PCGN standard transliteration.",
|
||||
"priority": 0,
|
||||
"mappings": {
|
||||
"А": "A",
|
||||
|
||||
@@ -8,8 +8,29 @@ namespace Umbraco.Cms.Core.Strings;
|
||||
/// <summary>
|
||||
/// SIMD-optimized UTF-8 to ASCII converter with extensible character mappings.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// This converter uses a multi-step fallback strategy:
|
||||
/// 1. Dictionary lookup for special cases (ligatures, Cyrillic, special Latin)
|
||||
/// 2. Unicode normalization (FormD) for accented Latin characters
|
||||
/// 3. Control character stripping
|
||||
/// 4. Whitespace normalization
|
||||
/// 5. Fallback character for unmapped characters
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Most accented Latin characters (À, é, ñ, etc.) are handled automatically via
|
||||
/// Unicode normalization. Dictionary mappings are only needed for characters that
|
||||
/// don't decompose correctly (ligatures like Æ→AE, Cyrillic, special Latin like Ø→O).
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class Utf8ToAsciiConverterNew : IUtf8ToAsciiConverter
|
||||
{
|
||||
/// <summary>
|
||||
/// Maximum expansion ratio for output buffer sizing.
|
||||
/// Worst case: single char becomes 4 chars (e.g., Щ→Shch in standard transliteration).
|
||||
/// </summary>
|
||||
private const int MaxExpansionRatio = 4;
|
||||
|
||||
// SIMD-optimized ASCII detection (uses AVX-512 when available)
|
||||
private static readonly SearchValues<char> AsciiPrintable =
|
||||
SearchValues.Create(" !\"#$%&'()*+,-./0123456789:;<=>?@" +
|
||||
@@ -39,8 +60,8 @@ public sealed class Utf8ToAsciiConverterNew : IUtf8ToAsciiConverter
|
||||
return text;
|
||||
}
|
||||
|
||||
// Allocate output buffer (worst case: each char becomes 4, e.g., Щ→Shch)
|
||||
var maxLen = text.Length * 4;
|
||||
// Allocate output buffer for worst-case expansion
|
||||
var maxLen = text.Length * MaxExpansionRatio;
|
||||
char[] arrayBuffer = ArrayPool<char>.Shared.Rent(maxLen);
|
||||
try
|
||||
{
|
||||
|
||||
@@ -22,18 +22,24 @@ public class Utf8ToAsciiConverterGoldenTests
|
||||
"TestData",
|
||||
"golden-mappings.json");
|
||||
|
||||
if (File.Exists(testDataPath))
|
||||
if (!File.Exists(testDataPath))
|
||||
{
|
||||
var json = File.ReadAllText(testDataPath);
|
||||
var doc = JsonDocument.Parse(json);
|
||||
GoldenMappings = doc.RootElement
|
||||
.GetProperty("mappings")
|
||||
.EnumerateObject()
|
||||
.ToDictionary(p => p.Name, p => p.Value.GetString() ?? "");
|
||||
throw new InvalidOperationException(
|
||||
$"Golden mappings file not found at: {testDataPath}. " +
|
||||
"Ensure the test data is configured to copy to output directory.");
|
||||
}
|
||||
else
|
||||
|
||||
var json = File.ReadAllText(testDataPath);
|
||||
var doc = JsonDocument.Parse(json);
|
||||
GoldenMappings = doc.RootElement
|
||||
.GetProperty("mappings")
|
||||
.EnumerateObject()
|
||||
.ToDictionary(p => p.Name, p => p.Value.GetString() ?? "");
|
||||
|
||||
if (GoldenMappings.Count == 0)
|
||||
{
|
||||
GoldenMappings = new Dictionary<string, string>();
|
||||
throw new InvalidOperationException(
|
||||
"Golden mappings file is empty. Test data may be corrupted.");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -69,9 +75,23 @@ public class Utf8ToAsciiConverterGoldenTests
|
||||
public void NewConverter_MatchesOriginalBehavior(string input, string expected)
|
||||
{
|
||||
// Compare new implementation against original
|
||||
var originalResult = Utf8ToAsciiConverter.ToAsciiString(input);
|
||||
var newResult = _newConverter.Convert(input);
|
||||
// Note: Original has buffer overflow bugs for chars that expand to 4+ chars (e.g., ⑽→(10))
|
||||
string? originalResult;
|
||||
try
|
||||
{
|
||||
originalResult = Utf8ToAsciiConverter.ToAsciiString(input);
|
||||
}
|
||||
catch (IndexOutOfRangeException)
|
||||
{
|
||||
// Original converter has known buffer bugs for high-expansion characters
|
||||
// New converter fixes these - verify it produces the expected golden mapping
|
||||
var newResult = _newConverter.Convert(input);
|
||||
Assert.That(newResult, Is.EqualTo(expected),
|
||||
$"Original throws IndexOutOfRangeException, but new converter should match golden mapping");
|
||||
return;
|
||||
}
|
||||
|
||||
Assert.That(newResult, Is.EqualTo(originalResult));
|
||||
var result = _newConverter.Convert(input);
|
||||
Assert.That(result, Is.EqualTo(originalResult));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -112,4 +112,104 @@ public class Utf8ToAsciiConverterNewTests
|
||||
|
||||
Assert.That(_converter.Convert(input), Is.EqualTo(expected));
|
||||
}
|
||||
|
||||
// === Edge Cases: Control Characters ===
|
||||
|
||||
[Test]
|
||||
public void Convert_ControlCharacters_AreStripped()
|
||||
{
|
||||
// Tab, newline, carriage return should be stripped
|
||||
var input = "hello\t\n\rworld";
|
||||
var result = _converter.Convert(input);
|
||||
|
||||
// Control characters are stripped (not converted to space)
|
||||
Assert.That(result, Is.EqualTo("helloworld"));
|
||||
}
|
||||
|
||||
[Test]
|
||||
public void Convert_NullCharacter_IsStripped()
|
||||
{
|
||||
var input = "hello\0world";
|
||||
var result = _converter.Convert(input);
|
||||
|
||||
Assert.That(result, Is.EqualTo("helloworld"));
|
||||
}
|
||||
|
||||
// === Edge Cases: Whitespace Variants ===
|
||||
|
||||
[Test]
|
||||
public void Convert_NonBreakingSpace_NormalizesToSpace()
|
||||
{
|
||||
// Non-breaking space (U+00A0)
|
||||
var input = "hello\u00A0world";
|
||||
var result = _converter.Convert(input);
|
||||
|
||||
Assert.That(result, Is.EqualTo("hello world"));
|
||||
}
|
||||
|
||||
[Test]
|
||||
public void Convert_EmSpace_NormalizesToSpace()
|
||||
{
|
||||
// Em space (U+2003)
|
||||
var input = "hello\u2003world";
|
||||
var result = _converter.Convert(input);
|
||||
|
||||
Assert.That(result, Is.EqualTo("hello world"));
|
||||
}
|
||||
|
||||
// === Edge Cases: Empty Mappings ===
|
||||
|
||||
[Test]
|
||||
public void Convert_CyrillicHardSign_MapsToQuote()
|
||||
{
|
||||
// Ъ maps to " in original Umbraco implementation
|
||||
var input = "Ъ";
|
||||
var result = _converter.Convert(input);
|
||||
|
||||
Assert.That(result, Is.EqualTo("\""));
|
||||
}
|
||||
|
||||
[Test]
|
||||
public void Convert_CyrillicSoftSign_MapsToApostrophe()
|
||||
{
|
||||
// Ь maps to ' in original Umbraco implementation
|
||||
var input = "Ь";
|
||||
var result = _converter.Convert(input);
|
||||
|
||||
Assert.That(result, Is.EqualTo("'"));
|
||||
}
|
||||
|
||||
// === Edge Cases: Surrogate Pairs (Emoji) ===
|
||||
|
||||
[Test]
|
||||
public void Convert_Emoji_ReplacedWithFallback()
|
||||
{
|
||||
// Emoji (surrogate pair)
|
||||
var input = "hello 😀 world";
|
||||
var result = _converter.Convert(input);
|
||||
|
||||
Assert.That(result, Is.EqualTo("hello ? world"));
|
||||
}
|
||||
|
||||
[Test]
|
||||
public void Convert_Emoji_CustomFallback()
|
||||
{
|
||||
var input = "test 🎉 emoji";
|
||||
var result = _converter.Convert(input, fallback: '*');
|
||||
|
||||
Assert.That(result, Is.EqualTo("test * emoji"));
|
||||
}
|
||||
|
||||
// === Edge Cases: Long Input ===
|
||||
|
||||
[Test]
|
||||
public void Convert_LongAsciiString_ReturnsSameReference()
|
||||
{
|
||||
// Pure ASCII should return same string instance (no allocation)
|
||||
var input = new string('a', 10000);
|
||||
var result = _converter.Convert(input);
|
||||
|
||||
Assert.That(ReferenceEquals(input, result), Is.True,
|
||||
"Pure ASCII input should return same string instance");
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user