Skip to content

Commit 785d26e

Browse files
committed
fixed issue #15 (incorrect handling of unicode codepoints requiring 2 consecutive c# chars)
1 parent cb3225d commit 785d26e

File tree

6 files changed

+177
-91
lines changed

6 files changed

+177
-91
lines changed

Convert.bat

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
@python py2cs.py
2+
@copy unidecoder-decodemap.txt assets /y
Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,4 @@
1-
using System;
2-
using System.Collections.Generic;
3-
using System.Linq;
41
using System.Text;
5-
using System.Threading.Tasks;
6-
using BenchmarkDotNet;
72
using BenchmarkDotNet.Attributes;
83

94
namespace Unidecode.NET.Benchmark;
@@ -12,15 +7,27 @@ public class Benchmarks
127
{
138

149
[Benchmark]
15-
public void UnidecodeRussian()
10+
public void FastUnidecodeRussian()
1611
{
17-
var converted = "Работа с кириллицей".Unidecode();
12+
var converted = "Работа с кириллицей".Unidecode(UnidecodeAlgorithm.Fast);
1813
}
1914

2015
[Benchmark]
21-
public void UnidecodeAscii()
16+
public void CompleteUnidecodeRussian()
2217
{
23-
var converted = "Hello World!".Unidecode();
18+
var converted = "Работа с кириллицей".Unidecode(UnidecodeAlgorithm.Complete);
19+
}
20+
21+
[Benchmark]
22+
public void FastUnidecodeAscii()
23+
{
24+
var converted = "Hello World!".Unidecode(UnidecodeAlgorithm.Fast);
25+
}
26+
27+
[Benchmark]
28+
public void CompleteUnidecodeAscii()
29+
{
30+
var converted = "Hello World!".Unidecode(UnidecodeAlgorithm.Complete);
2431
}
2532

2633
[Benchmark]
@@ -35,4 +42,22 @@ public void UnidecodeAsciiChar()
3542
var converted = 'Z'.Unidecode();
3643
}
3744

45+
46+
private readonly static Rune russianRune = new('и');
47+
[Benchmark]
48+
public void UnidecodeRussianRune()
49+
{
50+
51+
var converted = russianRune.Unidecode();
52+
}
53+
54+
private readonly static Rune AsciiRune = new('Z');
55+
56+
[Benchmark]
57+
public void UnidecodeAsciiRune()
58+
{
59+
var converted = AsciiRune.Unidecode();
60+
}
61+
62+
3863
}

src/Unidecoder.cs

Lines changed: 68 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -7,21 +7,40 @@
77
using System.Text;
88
using System.Text.RegularExpressions;
99

10-
// this IntenralsVisibleTo attribute is here to allow benchmarking and
11-
// testing of SlowUnidecode, which normally, due to the stackalloc optimization,
12-
// is called only when Unidecode receives a long string
13-
[assembly: InternalsVisibleTo("Unidecode.Net.Benchmark")]
14-
[assembly: InternalsVisibleTo("Unidecode.Net.Tests")]
15-
1610
namespace Unidecode.NET
1711
{
12+
13+
14+
public enum UnidecodeAlgorithm
15+
{
16+
/// <summary>
17+
/// optimized decoding algorithm (up to 3 times faster), but does not work properly for unicode codepoints >65535.
18+
/// </summary>
19+
Fast,
20+
/// <summary>
21+
/// proper, slower algorithm that properly handles all codepoints (for languages like Chinese, Japanese..)
22+
/// </summary>
23+
Complete
24+
};
25+
1826
/// <summary>
1927
/// ASCII transliterations of Unicode text
2028
/// </summary>
2129
public static class Unidecoder
2230
{
31+
// for short strings the fast decoding algorithm uses a buffer allocated
32+
// on the stack instead of a StringBuilder.
33+
private const int MAX_STACKALLOC_BUFFER_SIZE = 16384;
2334
private static readonly int MaxDecodedCharLength;
2435
private static string[][] characters;
36+
37+
38+
39+
/// <summary>
40+
/// Sets the algorithm used by the extension methods that do not explicitly receive one.
41+
/// </summary>
42+
static public UnidecodeAlgorithm Algorithm { get; set; } = UnidecodeAlgorithm.Fast;
43+
2544
static Unidecoder()
2645
{
2746
MaxDecodedCharLength = 0;
@@ -60,14 +79,26 @@ static Unidecoder()
6079
characters[pair.Key] = pair.Value;
6180
}
6281

82+
public static string Unidecode(this string input, int? tempStringBuilderCapacity = null)
83+
{
84+
if (Algorithm == UnidecodeAlgorithm.Complete)
85+
return CompleteUnidecode(input, tempStringBuilderCapacity);
86+
return FastUnidecode(input, tempStringBuilderCapacity);
87+
}
88+
89+
public static string Unidecode(this string input, UnidecodeAlgorithm Algorithm, int? tempStringBuilderCapacity = null)
90+
{
91+
if (Algorithm == UnidecodeAlgorithm.Complete)
92+
return CompleteUnidecode(input, tempStringBuilderCapacity);
93+
return FastUnidecode(input, tempStringBuilderCapacity);
94+
}
6395

64-
// for short strings I use a buffer allocated in the stack instead of a stringbuilder.
65-
// (this is faster and gives less work to the garbage collector)
66-
private const int MAX_STACKALLOC_BUFFER_SIZE = 8192;
6796

68-
[SkipLocalsInit] // this is to avoid the local raw buffer variable stackBuffer do be zeroed for every call: we don't need it and is very cpu intensive (this attribute needs unsafe compliation)
97+
// [SkipLocalsInit] (applied to FastUnidecode below) avoids zeroing the local raw buffer variable stackBuffer on every call: we don't need it zeroed, and zeroing is very CPU-intensive (this attribute needs unsafe compilation)
6998
/// <summary>
70-
/// Transliterate Unicode string to ASCII string.
99+
/// Transliterate Unicode string to ASCII.
100+
/// This one is a fast implementation that does NOT work properly on code points >65535. Use it only if you have to deal with languages that
101+
/// do not use such unicode symbols (like European languages), and decoding performance is an actual issue in your application<br/>
71102
/// </summary>
72103
/// <param name="input">String you want to transliterate into ASCII</param>
73104
/// <param name="tempStringBuilderCapacity">
@@ -79,22 +110,23 @@ static Unidecoder()
79110
/// ASCII string. There are [?] (3 characters) in places of some unknown(?) unicode characters.
80111
/// It is this way in Python code as well.
81112
/// </returns>
82-
public static string Unidecode(this string input, int? tempStringBuilderCapacity = null)
113+
[SkipLocalsInit]
114+
private static string FastUnidecode(string input, int? tempStringBuilderCapacity = null)
83115
{
84116
if (string.IsNullOrEmpty(input))
85117
return "";
86118
var neededBufferSize = input.Length * MaxDecodedCharLength + 1;
87119
if (neededBufferSize >= MAX_STACKALLOC_BUFFER_SIZE)
88-
return SlowUnidecode(input, tempStringBuilderCapacity);
120+
return CompleteUnidecode(input, tempStringBuilderCapacity);
89121

90122
bool noConversionNeeded = true;
91123
Span<char> stackBuffer = stackalloc char[neededBufferSize];
92124
int buffIdx = 0;
93-
foreach (char c in input)
125+
foreach (var c in input)
94126
{
95127
if (c < 0x80)
96128
{
97-
stackBuffer[buffIdx++] = c;
129+
stackBuffer[buffIdx++] = c;
98130
continue;
99131
}
100132
noConversionNeeded = false;
@@ -115,7 +147,7 @@ public static string Unidecode(this string input, int? tempStringBuilderCapacity
115147
}
116148

117149
// this implementation is slower but it has no limits for the length of the input string. it gets called by Unidecode() for long strings
118-
internal static string SlowUnidecode(this string input, int? tempStringBuilderCapacity = null)
150+
private static string CompleteUnidecode(string input, int? tempStringBuilderCapacity = null)
119151
{
120152
if (string.IsNullOrEmpty(input))
121153
return "";
@@ -125,12 +157,13 @@ internal static string SlowUnidecode(this string input, int? tempStringBuilderCa
125157

126158
// Unidecode result often can be at least two times longer than input string.
127159
var sb = new StringBuilder(tempStringBuilderCapacity ?? input.Length * 2);
128-
foreach (var c in input)
160+
foreach (var rune in input.EnumerateRunes())
129161
{
162+
long c = rune.Value;
130163
// Copypaste is bad, but sb.Append(c.Unidecode()); would be a bit slower.
131164
if (c < 0x80)
132165
{
133-
sb.Append(c);
166+
sb.Append((char) c);
134167
}
135168
else
136169
{
@@ -151,6 +184,7 @@ internal static string SlowUnidecode(this string input, int? tempStringBuilderCa
151184

152185
/// <summary>
153186
/// Transliterate Unicode character to ASCII string.
187+
/// (for Unicode code points that do not fit in a single 2-byte char, use <see cref="Unidecode(in Rune)"/>)
154188
/// </summary>
155189
/// <param name="c">Character you want to transliterate into ASCII</param>
156190
/// <returns>
@@ -172,6 +206,22 @@ public static string Unidecode(this in char c)
172206
return bytes[c & 0xff];
173207
}
174208

209+
public static string Unidecode(this in Rune c)
210+
{
211+
var codepoint = c.Value;
212+
if (codepoint < 0x80)
213+
return AsciiCharacter.AsString[codepoint];
214+
215+
var high = codepoint >> 8;
216+
if (high >= characters.Length)
217+
return null;
218+
var bytes = characters[high];
219+
if (bytes == null)
220+
return null;
221+
222+
return bytes[codepoint & 0xff];
223+
}
224+
175225
// I keep a precalculated cache of all the single character strings for ascii characters, so the Unidecode character extension method
176226
// does not instantiate a new string every time it has to return a single character string for a character <0x80
177227
private static class AsciiCharacter

test/CompleteUnidecoderTest.cs

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
using System;
2+
using System.Text;
3+
using Xunit;
4+
using Unidecode.NET;
5+
6+
namespace Unidecode.NET.Tests
7+
{
8+
// these are the same tests you find in UnidecoderTest, but here they run against the Complete algorithm
9+
// (selected by passing UnidecodeAlgorithm.Complete), which is the slower implementation
10+
// that handles all unicode code points and has no limits on the size of the input string
11+
public class CompleteUnidecoderTest
12+
{
13+
[Fact]
14+
public void DocTest()
15+
{
16+
Assert.Equal("Bei Jing ", "\u5317\u4EB0".Unidecode(UnidecodeAlgorithm.Complete));
17+
}
18+
19+
[Fact]
20+
public void CustomTest()
21+
{
22+
Assert.Equal("Rabota s kirillitsei", "Работа с кириллицей".Unidecode(UnidecodeAlgorithm.Complete));
23+
Assert.Equal("aouoAOUO", "äöűőÄÖŨŐ".Unidecode(UnidecodeAlgorithm.Complete));
24+
}
25+
26+
27+
28+
[Fact]
29+
public void PythonTest()
30+
{
31+
32+
Assert.Equal("Hello, World!", "Hello, World!".Unidecode(UnidecodeAlgorithm.Complete));
33+
Assert.Equal("'\"\r\n", "'\"\r\n".Unidecode(UnidecodeAlgorithm.Complete));
34+
Assert.Equal("CZSczs", "ČŽŠčžš".Unidecode(UnidecodeAlgorithm.Complete));
35+
Assert.Equal("a", "ア".Unidecode(UnidecodeAlgorithm.Complete));
36+
Assert.Equal("a", "α".Unidecode(UnidecodeAlgorithm.Complete));
37+
Assert.Equal("a", "а".Unidecode(UnidecodeAlgorithm.Complete));
38+
Assert.Equal("chateau", "ch\u00e2teau".Unidecode(UnidecodeAlgorithm.Complete));
39+
Assert.Equal("vinedos", "vi\u00f1edos".Unidecode(UnidecodeAlgorithm.Complete));
40+
}
41+
42+
[Fact]
43+
public void RussianAlphabetTest()
44+
{
45+
const string russianAlphabetLowercase = "а б в г д е ё ж з и й к л м н о п р с т у ф х ц ч ш щ ъ ы ь э ю я";
46+
const string russianAlphabetUppercase = "А Б В Г Д Е Ё Ж З И Й К Л М Н О П Р С Т У Ф Х Ц Ч Ш Щ Ъ Ы Ь Э Ю Я";
47+
48+
const string expectedLowercase = "a b v g d e io zh z i i k l m n o p r s t u f kh ts ch sh shch ' y ' e iu ia";
49+
const string expectedUppercase = "A B V G D E Io Zh Z I I K L M N O P R S T U F Kh Ts Ch Sh Shch ' Y ' E Iu Ia";
50+
51+
Assert.Equal(expectedLowercase, russianAlphabetLowercase.Unidecode(UnidecodeAlgorithm.Complete));
52+
Assert.Equal(expectedUppercase, russianAlphabetUppercase.Unidecode(UnidecodeAlgorithm.Complete));
53+
}
54+
55+
[Fact]
56+
public void UnidecodeOnNullShouldReturnEmptyString()
57+
{
58+
Assert.Equal("", ((string)null).Unidecode(UnidecodeAlgorithm.Complete));
59+
}
60+
61+
[Fact]
62+
public void TheCompleteAlgorithmShouldSupportAllCodePoints()
63+
{
64+
var a = "更".Unidecode(UnidecodeAlgorithm.Complete); // f901 Kayng
65+
var b = "🄁".Unidecode(UnidecodeAlgorithm.Complete); // 1f101 0,
66+
Assert.Equal("Kayng ", a);
67+
Assert.Equal("0,", b);
68+
69+
}
70+
71+
72+
}
73+
}

test/SlowUnidecoderTest.cs

Lines changed: 0 additions & 63 deletions
This file was deleted.

test/UnidecoderTest.cs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
using System;
21
using System.Linq;
32
using System.Text;
43
using Xunit;

0 commit comments

Comments
 (0)