77using System . Text ;
88using System . Text . RegularExpressions ;
99
10- // this IntenralsVisibleTo attribute is here to allow benchmarking and
11- // testing of SlowUnidecode, which normally, due to the stackalloc optimization,
12- // is called only when Unidecode receives a long string
13- [ assembly: InternalsVisibleTo ( "Unidecode.Net.Benchmark" ) ]
14- [ assembly: InternalsVisibleTo ( "Unidecode.Net.Tests" ) ]
15-
1610namespace Unidecode . NET
1711{
12+
13+
/// <summary>
/// Selects which transliteration implementation <see cref="Unidecoder"/> uses.
/// </summary>
public enum UnidecodeAlgorithm
{
    /// <summary>
    /// Optimized decoding algorithm (up to 3 times faster), but it does not work properly
    /// for Unicode code points &gt; 65535.
    /// </summary>
    Fast = 0,

    /// <summary>
    /// Slower algorithm that properly handles all code points
    /// (for languages like Chinese, Japanese..).
    /// </summary>
    Complete = 1
}
25+
1826 /// <summary>
1927 /// ASCII transliterations of Unicode text
2028 /// </summary>
2129 public static class Unidecoder
2230 {
31+ // For short strings the fast decoding algorithm uses a buffer allocated
32+ // on the stack instead of a StringBuilder.
33+ private const int MAX_STACKALLOC_BUFFER_SIZE = 16384 ;
2334 private static readonly int MaxDecodedCharLength ;
2435 private static string [ ] [ ] characters ;
36+
37+
38+
/// <summary>
/// Sets the algorithm used by the extension methods that do not explicitly receive
/// the algorithm to be used. Defaults to <see cref="UnidecodeAlgorithm.Fast"/>.
/// </summary>
/// <remarks>
/// This is static state shared by every caller: changing it affects all subsequent calls
/// to the string overload that takes no algorithm argument.
/// </remarks>
public static UnidecodeAlgorithm Algorithm { get; set; } = UnidecodeAlgorithm.Fast;
43+
2544 static Unidecoder ( )
2645 {
2746 MaxDecodedCharLength = 0 ;
@@ -60,14 +79,26 @@ static Unidecoder()
6079 characters [ pair . Key ] = pair . Value ;
6180 }
6281
/// <summary>
/// Transliterate Unicode string to ASCII, using the implementation currently selected
/// by the static <see cref="Algorithm"/> property.
/// </summary>
/// <param name="input">String you want to transliterate into ASCII</param>
/// <param name="tempStringBuilderCapacity">
/// Optional capacity value, handed through unchanged to the selected implementation.
/// </param>
/// <returns>The ASCII string produced by the selected implementation.</returns>
public static string Unidecode(this string input, int? tempStringBuilderCapacity = null)
{
    // Only Complete selects the rune-based path; any other value uses the fast one.
    bool useComplete = Algorithm == UnidecodeAlgorithm.Complete;
    return useComplete
        ? CompleteUnidecode(input, tempStringBuilderCapacity)
        : FastUnidecode(input, tempStringBuilderCapacity);
}
88+
/// <summary>
/// Transliterate Unicode string to ASCII with an explicitly chosen algorithm,
/// ignoring the static default.
/// </summary>
/// <param name="input">String you want to transliterate into ASCII</param>
/// <param name="Algorithm">
/// Implementation to use for this call. Note that inside this method the parameter
/// hides the static property of the same name.
/// </param>
/// <param name="tempStringBuilderCapacity">
/// Optional capacity value, handed through unchanged to the selected implementation.
/// </param>
/// <returns>The ASCII string produced by the selected implementation.</returns>
public static string Unidecode(this string input, UnidecodeAlgorithm Algorithm, int? tempStringBuilderCapacity = null)
{
    switch (Algorithm)
    {
        case UnidecodeAlgorithm.Complete:
            return CompleteUnidecode(input, tempStringBuilderCapacity);

        // Every value other than Complete is routed to the fast implementation.
        default:
            return FastUnidecode(input, tempStringBuilderCapacity);
    }
}
6395
64- // for short strings I use a buffer allocated in the stack instead of a stringbuilder.
65- // (this is faster and gives less work to the garbage collector)
66- private const int MAX_STACKALLOC_BUFFER_SIZE = 8192 ;
6796
68- [ SkipLocalsInit ] // this is to avoid the local raw buffer variable stackBuffer do be zeroed for every call: we don't need it and is very cpu intensive (this attribute needs unsafe compliation)
97+ // [SkipLocalsInit] prevents the local raw buffer variable stackBuffer from being zeroed on every call: we don't need that, and it is very CPU intensive (this attribute needs unsafe compilation)
6998 /// <summary>
70- /// Transliterate Unicode string to ASCII string.
99+ /// Transliterate Unicode string to ASCII.
100+ /// This one is a fast implementation that does NOT work properly on code points >65535. Use it only if you have to deal with languages that
101+ /// do not use such unicode symbols (like European languages), and decoding performance is an actual issue in your application<br/>
71102 /// </summary>
72103 /// <param name="input">String you want to transliterate into ASCII</param>
73104 /// <param name="tempStringBuilderCapacity">
@@ -79,22 +110,23 @@ static Unidecoder()
79110 /// ASCII string. There are [?] (3 characters) in places of some unknown(?) unicode characters.
80111 /// It is this way in Python code as well.
81112 /// </returns>
82- public static string Unidecode ( this string input , int ? tempStringBuilderCapacity = null )
113+ [ SkipLocalsInit ]
114+ private static string FastUnidecode ( string input , int ? tempStringBuilderCapacity = null )
83115 {
84116 if ( string . IsNullOrEmpty ( input ) )
85117 return "" ;
86118 var neededBufferSize = input . Length * MaxDecodedCharLength + 1 ;
87119 if ( neededBufferSize >= MAX_STACKALLOC_BUFFER_SIZE )
88- return SlowUnidecode ( input , tempStringBuilderCapacity ) ;
120+ return CompleteUnidecode ( input , tempStringBuilderCapacity ) ;
89121
90122 bool noConversionNeeded = true ;
91123 Span < char > stackBuffer = stackalloc char [ neededBufferSize ] ;
92124 int buffIdx = 0 ;
93- foreach ( char c in input )
125+ foreach ( var c in input )
94126 {
95127 if ( c < 0x80 )
96128 {
97- stackBuffer [ buffIdx ++ ] = c ;
129+ stackBuffer [ buffIdx ++ ] = c ;
98130 continue ;
99131 }
100132 noConversionNeeded = false ;
@@ -115,7 +147,7 @@ public static string Unidecode(this string input, int? tempStringBuilderCapacity
115147 }
116148
118149 // this implementation is slower but it has no limits for the length of the input string. it gets called by Unidecode() for long strings
118- internal static string SlowUnidecode ( this string input , int ? tempStringBuilderCapacity = null )
150+ private static string CompleteUnidecode ( string input , int ? tempStringBuilderCapacity = null )
119151 {
120152 if ( string . IsNullOrEmpty ( input ) )
121153 return "" ;
@@ -125,12 +157,13 @@ internal static string SlowUnidecode(this string input, int? tempStringBuilderCa
125157
126158 // Unidecode result often can be at least two times longer than input string.
127159 var sb = new StringBuilder ( tempStringBuilderCapacity ?? input . Length * 2 ) ;
128- foreach ( var c in input )
160+ foreach ( var rune in input . EnumerateRunes ( ) )
129161 {
162+ long c = rune . Value ;
130163 // Copypaste is bad, but sb.Append(c.Unidecode()); would be a bit slower.
131164 if ( c < 0x80 )
132165 {
133- sb . Append ( c ) ;
166+ sb . Append ( ( char ) c ) ;
134167 }
135168 else
136169 {
@@ -151,6 +184,7 @@ internal static string SlowUnidecode(this string input, int? tempStringBuilderCa
151184
152185 /// <summary>
153186 /// Transliterate Unicode character to ASCII string.
187+ /// (for Unicode code points above 65535, which do not fit in a single char, use <see cref="Unidecode(in Rune)"/>)
154188 /// </summary>
155189 /// <param name="c">Character you want to transliterate into ASCII</param>
156190 /// <returns>
@@ -172,6 +206,22 @@ public static string Unidecode(this in char c)
172206 return bytes [ c & 0xff ] ;
173207 }
174208
/// <summary>
/// Transliterate a single Unicode code point, given as a <see cref="Rune"/>, to an ASCII string.
/// </summary>
/// <param name="c">Rune you want to transliterate into ASCII</param>
/// <returns>
/// The replacement string from the lookup table, or <c>null</c> when the table has no
/// page covering the code point.
/// </returns>
public static string Unidecode(this in Rune c)
{
    int value = c.Value;

    // ASCII maps to itself; reuse the precalculated single-character strings.
    if (value < 0x80)
    {
        return AsciiCharacter.AsString[value];
    }

    // The table is indexed first by the bits above the low byte (the page),
    // then by the low byte itself.
    int page = value >> 8;
    string[] row = page < characters.Length ? characters[page] : null;
    return row == null ? null : row[value & 0xff];
}
224+
175225 // I keep a precalculated cache of all the single character strings for ascii characters, so the Unidecode character extension method
176226 // does not instantiate a new string every time it has to return a single character string for a character <0x80
177227 private static class AsciiCharacter
0 commit comments