11using System ;
2+ using System . Collections . Generic ;
23using System . Linq ;
4+ using System . Reflection ;
35using System . Runtime . CompilerServices ;
46using System . Text ;
57
@@ -69,7 +71,7 @@ public static string Unidecode(this string input, int? tempStringBuilderCapacity
6971 return new string ( stackBuffer [ 0 ..buffIdx ] ) ;
7072 }
7173
72-
74+ // This implementation is slower, but it has no limit on the length of the input string. It gets called by Unidecode() for long strings.
7375 internal static string SlowUnidecode ( this string input , int ? tempStringBuilderCapacity = null )
7476 {
7577 if ( string . IsNullOrEmpty ( input ) )
@@ -127,6 +129,8 @@ public static string Unidecode(this in char c)
127129 return bytes [ c & 0xff ] ;
128130 }
129131
132+ // Precalculated cache of the single-character strings for all ASCII characters, so that the Unidecode character extension method
133+ // does not instantiate a new string every time it has to return a single-character string for a character < 0x80.
130134 private static class AsciiCharacter
131135 {
132136 public static readonly string [ ] AsString ;
@@ -139,5 +143,53 @@ static AsciiCharacter()
139143
140144 }
141145
/// <summary>
/// Translates a set of indexes found in a decoded string into the corresponding indexes
/// in the original (source) string. <br/>
/// For example "Süßigkeit" gets decoded to "Sussigkeit", so the first 'i' character has index 4
/// in the decoded string and index 3 in the source string:
/// <c>FindIndexesInSourceString("Süßigkeit", new int[] { 4 })</c> returns <c>{ 3 }</c>. <br/>
/// When a source character decodes to more than one character, every decoded index inside that
/// expansion maps back to that single source character (in the example above, decoded indexes 2 and 3
/// both map to source index 2), so the same source index can be returned more than once. <br/>
/// Source characters that decode to an empty string never appear in the result.
/// </summary>
/// <param name="sourceString">The original, not yet decoded string.</param>
/// <param name="indexesInDecodedString">
/// Zero-based indexes into the decoded string. The sequence must be strictly increasing.
/// </param>
/// <returns>
/// One source index per input index, in input order. The sequence is empty when
/// <paramref name="sourceString"/> is null or empty, or when <paramref name="indexesInDecodedString"/>
/// is null or empty. Indexes that lie beyond the end of the decoded text produce no output.
/// </returns>
/// <exception cref="ArgumentException">
/// Thrown while enumerating the result when an index is negative or when the indexes are not strictly
/// increasing. Because this is an iterator, validation happens lazily as the result is consumed.
/// </exception>
public static IEnumerable<int> FindIndexesInSourceString(string sourceString, IEnumerable<int> indexesInDecodedString)
{
    if (string.IsNullOrEmpty(sourceString))
    {
        yield break;
    }

    if (indexesInDecodedString == null)
    {
        yield break;
    }

    using var indexesEnumerator = indexesInDecodedString.GetEnumerator();
    if (!indexesEnumerator.MoveNext())
    {
        yield break;
    }

    var currIndex = indexesEnumerator.Current;
    if (currIndex < 0)
    {
        throw new ArgumentException("indexes can't be negative values", nameof(indexesInDecodedString));
    }

    // Offset in the decoded text just past the output of the source characters processed so far.
    int decodedEnd = 0;
    for (int srcIdx = 0; srcIdx < sourceString.Length; srcIdx++)
    {
        var ch = sourceString[srcIdx];
        var decodedCh = ch.Unidecode();
        if (string.IsNullOrEmpty(decodedCh))
        {
            // This character contributes nothing to the decoded text, so no decoded index can point at it.
            continue;
        }

        decodedEnd += decodedCh.Length;

        // The current character covers the decoded range [decodedEnd - decodedCh.Length, decodedEnd).
        // Emit it for every requested index in that range; several indexes may fall inside one expansion.
        while (currIndex < decodedEnd)
        {
            yield return srcIdx;

            var prevIndex = currIndex;
            if (!indexesEnumerator.MoveNext())
            {
                yield break; // every requested index has been translated
            }

            currIndex = indexesEnumerator.Current;
            if (currIndex < 0)
            {
                throw new ArgumentException("indexes can't be negative values", nameof(indexesInDecodedString));
            }

            if (currIndex <= prevIndex)
            {
                throw new ArgumentException("Input sequence of indexes must be strictly increasing", nameof(indexesInDecodedString));
            }
        }
    }
}
191+
192+
193+
142194 }
143195}
0 commit comments