Fixed issue #15 (incorrect handling of Unicode code points that require two consecutive C# chars, i.e. a surrogate pair)

carlosirna · carlosirna · commit 785d26eef8c4 · 2023-07-03T14:57:38.000+02:00
diff --git a/Convert.bat b/Convert.bat
@@ -0,0 +1,2 @@
+@python py2cs.py
+@copy unidecoder-decodemap.txt assets /y
diff --git a/benchmark/Unidecode.NET.Benchmark/Benchmarks.cs b/benchmark/Unidecode.NET.Benchmark/Benchmarks.cs
@@ -1,9 +1,4 @@
-using System;
-using System.Collections.Generic;
-using System.Linq;
 using System.Text;
-using System.Threading.Tasks;
-using BenchmarkDotNet;
 using BenchmarkDotNet.Attributes;
 
 namespace Unidecode.NET.Benchmark;
@@ -12,15 +7,27 @@ public class Benchmarks
 {
 
   [Benchmark]
-  public void UnidecodeRussian()
+  public void FastUnidecodeRussian()
   {
-    var converted = "Работа с кириллицей".Unidecode();
+    var converted = "Работа с кириллицей".Unidecode(UnidecodeAlgorithm.Fast);
   }
 
   [Benchmark]
-  public void UnidecodeAscii()
+  public void CompleteUnidecodeRussian()
   {
-    var converted = "Hello World!".Unidecode();
+    var converted = "Работа с кириллицей".Unidecode(UnidecodeAlgorithm.Complete);
+  }
+
+  [Benchmark]
+  public void FastUnidecodeAscii()
+  {
+    var converted = "Hello World!".Unidecode(UnidecodeAlgorithm.Fast);
+  }
+  
+  [Benchmark]
+  public void CompleteUnidecodeAscii()
+  {
+    var converted = "Hello World!".Unidecode(UnidecodeAlgorithm.Complete);
   }
 
   [Benchmark]
@@ -35,4 +42,22 @@ public void UnidecodeAsciiChar()
     var converted = 'Z'.Unidecode();
   }
 
+
+  private readonly static Rune russianRune = new('и');
+  [Benchmark]
+  public void UnidecodeRussianRune()
+  {
+
+    var converted = russianRune.Unidecode();
+  }
+
+  private readonly static Rune AsciiRune = new('Z');
+
+  [Benchmark]
+  public void UnidecodeAsciiRune()
+  {
+    var converted = AsciiRune.Unidecode();
+  }
+
+
 }
diff --git a/src/Unidecoder.cs b/src/Unidecoder.cs
@@ -7,21 +7,40 @@
 using System.Text;
 using System.Text.RegularExpressions;
 
-// this IntenralsVisibleTo attribute is here to allow benchmarking and
-// testing of SlowUnidecode, which normally, due to the stackalloc optimization,
-// is called only when Unidecode receives a long string
-[assembly: InternalsVisibleTo("Unidecode.Net.Benchmark")]
-[assembly: InternalsVisibleTo("Unidecode.Net.Tests")]
-
 namespace Unidecode.NET
 {
+
+
+  public enum UnidecodeAlgorithm
+  {
+    /// <summary>
+    /// optimized decoding algorithm (up to 3 times faster), but does not work properly for unicode codepoints >65535.
+    /// </summary>
+    Fast,
+    /// <summary>
+    /// Slower algorithm that correctly handles all code points, including those above 65535 (for languages like Chinese, Japanese, ...).
+    /// </summary>
+    Complete
+  };
+
   /// <summary>
   /// ASCII transliterations of Unicode text
   /// </summary>
   public static class Unidecoder
   {
+    // For short strings the fast decoding algorithm uses a buffer allocated
+    // on the stack instead of a StringBuilder.
+    private const int MAX_STACKALLOC_BUFFER_SIZE = 16384;
     private static readonly int MaxDecodedCharLength;
     private static string[][] characters;
+
+
+
+    /// <summary>
+    /// Sets the algorithm used by the extension methods that do not explicitly receive an algorithm argument.
+    /// </summary>
+    static public UnidecodeAlgorithm Algorithm { get; set; } = UnidecodeAlgorithm.Fast;
+
     static Unidecoder()
     {
       MaxDecodedCharLength = 0;
@@ -60,14 +79,26 @@ static Unidecoder()
         characters[pair.Key] = pair.Value;
     }
 
+    public static string Unidecode(this string input, int? tempStringBuilderCapacity = null)
+    {
+      if (Algorithm == UnidecodeAlgorithm.Complete)
+        return CompleteUnidecode(input, tempStringBuilderCapacity);
+      return FastUnidecode(input, tempStringBuilderCapacity);
+    }
+
+    public static string Unidecode(this string input, UnidecodeAlgorithm Algorithm, int? tempStringBuilderCapacity = null)
+    {
+      if (Algorithm == UnidecodeAlgorithm.Complete)
+        return CompleteUnidecode(input, tempStringBuilderCapacity);
+      return FastUnidecode(input, tempStringBuilderCapacity);
+    }
 
-    // for short strings I use a buffer allocated in the stack instead of a stringbuilder.
-    // (this is faster and gives less work to the garbage collector)
-    private const int MAX_STACKALLOC_BUFFER_SIZE = 8192;
 
-    [SkipLocalsInit] // this is to avoid the local raw buffer variable stackBuffer do be zeroed for every call: we don't need it and is very cpu intensive (this attribute needs unsafe compliation)
+    // The [SkipLocalsInit] attribute applied below avoids zeroing the local raw buffer variable stackBuffer on every call: we don't need it zeroed and zeroing is very CPU intensive (this attribute needs unsafe compilation)
     /// <summary>
-    /// Transliterate Unicode string to ASCII string.
+    /// Transliterate Unicode string to ASCII. 
+    ///  This one is a fast implementation that does NOT work properly on code points >65535. Use it only if you have to deal with languages that
+    ///  do not use such unicode symbols (like European languages), and decoding performance is an actual issue in your application<br/>
     /// </summary>
     /// <param name="input">String you want to transliterate into ASCII</param>
     /// <param name="tempStringBuilderCapacity">
@@ -79,22 +110,23 @@ static Unidecoder()
     ///     ASCII string. There are [?] (3 characters) in places of some unknown(?) unicode characters.
     ///     It is this way in Python code as well.
     /// </returns>
-    public static string Unidecode(this string input, int? tempStringBuilderCapacity = null)
+    [SkipLocalsInit]
+    private static string FastUnidecode(string input, int? tempStringBuilderCapacity = null)
     {
       if (string.IsNullOrEmpty(input))
         return "";
       var neededBufferSize = input.Length * MaxDecodedCharLength + 1;
       if (neededBufferSize >= MAX_STACKALLOC_BUFFER_SIZE)
-        return SlowUnidecode(input, tempStringBuilderCapacity);
+        return CompleteUnidecode(input, tempStringBuilderCapacity);
 
       bool noConversionNeeded = true;
       Span<char> stackBuffer = stackalloc char[neededBufferSize];
       int buffIdx = 0;
-      foreach (char c in input)
+      foreach (var c in input)
       {
         if (c < 0x80)
         {
-          stackBuffer[buffIdx++] = c;
+          stackBuffer[buffIdx++] =  c;
           continue;
         }
         noConversionNeeded = false;
@@ -115,7 +147,7 @@ public static string Unidecode(this string input, int? tempStringBuilderCapacity
     }
 
     // this implementation is slower but it has no limits for the lenght of the input stirng. it gets called by Unidecode() for long strings
-    internal static string SlowUnidecode(this string input, int? tempStringBuilderCapacity = null)
+    private static string CompleteUnidecode(string input, int? tempStringBuilderCapacity = null)
     {
       if (string.IsNullOrEmpty(input))
         return "";
@@ -125,12 +157,13 @@ internal static string SlowUnidecode(this string input, int? tempStringBuilderCa
 
       // Unidecode result often can be at least two times longer than input string.
       var sb = new StringBuilder(tempStringBuilderCapacity ?? input.Length * 2);
-      foreach (var c in input)
+      foreach (var rune in input.EnumerateRunes())
       {
+        long c = rune.Value;
         // Copypaste is bad, but sb.Append(c.Unidecode()); would be a bit slower.
         if (c < 0x80)
         {
-          sb.Append(c);
+          sb.Append((char) c);
         }
         else
         {
@@ -151,6 +184,7 @@ internal static string SlowUnidecode(this string input, int? tempStringBuilderCa
 
     /// <summary>
     /// Transliterate Unicode character to ASCII string.
+    /// (for Unicode code points that do not fit in a single 16-bit char, use <see cref="Unidecode(in Rune)"/>)
     /// </summary>
     /// <param name="c">Character you want to transliterate into ASCII</param>
     /// <returns>
@@ -172,6 +206,22 @@ public static string Unidecode(this in char c)
       return bytes[c & 0xff];
     }
 
+    public static string Unidecode(this in Rune c)
+    {
+      var codepoint = c.Value;
+      if (codepoint < 0x80)
+        return AsciiCharacter.AsString[codepoint];
+
+      var high = codepoint >> 8;
+      if (high >= characters.Length)
+        return null;
+      var bytes = characters[high];
+      if (bytes == null)
+        return null;
+
+      return bytes[codepoint & 0xff];
+    }
+
     // I keep a precalculated cache of all the single character strings for ascii characters, so the Unidecode character extension method
     // does not instantiate a new string every time it has to return a single character string for a character <0x80
     private static class AsciiCharacter
diff --git a/test/CompleteUnidecoderTest.cs b/test/CompleteUnidecoderTest.cs
@@ -0,0 +1,73 @@
+using System;
+using System.Text;
+using Xunit;
+using Unidecode.NET;
+
+namespace Unidecode.NET.Tests
+{
+  // These are the same tests you find in UnidecoderTest, but here we explicitly select UnidecodeAlgorithm.Complete,
+  // i.e. the Unidecode implementation that the Fast algorithm falls back to only for very long strings
+  // (because it is slower, but it has no limits on the size of the input string)
+  public class CompleteUnidecoderTest
+  {
+    [Fact]
+    public void DocTest()
+    {
+      Assert.Equal("Bei Jing ", "\u5317\u4EB0".Unidecode(UnidecodeAlgorithm.Complete));
+    }
+
+    [Fact]
+    public void CustomTest()
+    {
+      Assert.Equal("Rabota s kirillitsei", "Работа с кириллицей".Unidecode(UnidecodeAlgorithm.Complete));
+      Assert.Equal("aouoAOUO", "äöűőÄÖŨŐ".Unidecode(UnidecodeAlgorithm.Complete));
+    }
+
+
+
+    [Fact]
+    public void PythonTest()
+    {
+      
+      Assert.Equal("Hello, World!", "Hello, World!".Unidecode(UnidecodeAlgorithm.Complete));
+      Assert.Equal("'\"\r\n", "'\"\r\n".Unidecode(UnidecodeAlgorithm.Complete));
+      Assert.Equal("CZSczs", "ČŽŠčžš".Unidecode(UnidecodeAlgorithm.Complete));
+      Assert.Equal("a", "ア".Unidecode(UnidecodeAlgorithm.Complete));
+      Assert.Equal("a", "α".Unidecode(UnidecodeAlgorithm.Complete));
+      Assert.Equal("a", "а".Unidecode(UnidecodeAlgorithm.Complete));
+      Assert.Equal("chateau", "ch\u00e2teau".Unidecode(UnidecodeAlgorithm.Complete));
+      Assert.Equal("vinedos", "vi\u00f1edos".Unidecode(UnidecodeAlgorithm.Complete));
+    }
+
+    [Fact]
+    public void RussianAlphabetTest()
+    {
+      const string russianAlphabetLowercase = "а б в г д е ё ж з и й к л м н о п р с т у ф х ц ч ш щ ъ ы ь э ю я";
+      const string russianAlphabetUppercase = "А Б В Г Д Е Ё Ж З И Й К Л М Н О П Р С Т У Ф Х Ц Ч Ш Щ Ъ Ы Ь Э Ю Я";
+
+      const string expectedLowercase = "a b v g d e io zh z i i k l m n o p r s t u f kh ts ch sh shch ' y ' e iu ia";
+      const string expectedUppercase = "A B V G D E Io Zh Z I I K L M N O P R S T U F Kh Ts Ch Sh Shch ' Y ' E Iu Ia";
+
+      Assert.Equal(expectedLowercase, russianAlphabetLowercase.Unidecode(UnidecodeAlgorithm.Complete));
+      Assert.Equal(expectedUppercase, russianAlphabetUppercase.Unidecode(UnidecodeAlgorithm.Complete));
+    }
+
+    [Fact]
+    public void UnidecodeOnNullShouldReturnEmptyString()
+    {
+      Assert.Equal("", ((string)null).Unidecode(UnidecodeAlgorithm.Complete));
+    }
+
+    [Fact]
+    public void TheCompleteAlgorithmShouldSupportAllCodePoints()
+    {
+      var a = "更".Unidecode(UnidecodeAlgorithm.Complete); // f901 Kayng
+      var b = "🄁".Unidecode(UnidecodeAlgorithm.Complete); // 1f101 0,
+      Assert.Equal("Kayng ", a);
+      Assert.Equal("0,", b);
+      
+    }
+
+
+  }
+}
diff --git a/test/SlowUnidecoderTest.cs b/test/SlowUnidecoderTest.cs
diff --git a/test/UnidecoderTest.cs b/test/UnidecoderTest.cs
@@ -1,4 +1,3 @@
-using System;
 using System.Linq;
 using System.Text;
 using Xunit;

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+@python py2cs.py`
	`2`	`+@copy unidecoder-decodemap.txt assets /y`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,3 @@`
`1`		`-using System;`
`2`	`1`	`using System.Linq;`
`3`	`2`	`using System.Text;`
`4`	`3`	`using Xunit;`