Skip to content

Commit ceb8d2f

Browse files
committed
added FindIndexesInSourceString function and a test case for it
1 parent a32972f commit ceb8d2f

File tree

2 files changed

+83
-1
lines changed

2 files changed

+83
-1
lines changed

src/Unidecoder.cs

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
using System;
2+
using System.Collections.Generic;
23
using System.Linq;
4+
using System.Reflection;
35
using System.Runtime.CompilerServices;
46
using System.Text;
57

@@ -69,7 +71,7 @@ public static string Unidecode(this string input, int? tempStringBuilderCapacity
6971
return new string(stackBuffer[0..buffIdx]);
7072
}
7173

72-
74+
// This implementation is slower, but it has no limit on the length of the input string. It gets called by Unidecode() for long strings.
7375
internal static string SlowUnidecode(this string input, int? tempStringBuilderCapacity = null)
7476
{
7577
if (string.IsNullOrEmpty(input))
@@ -127,6 +129,8 @@ public static string Unidecode(this in char c)
127129
return bytes[c & 0xff];
128130
}
129131

132+
// I keep a precalculated cache of all the single character strings for ascii characters, so the Unidecode character extension method
133+
// does not instantiate a new string every time it has to return a single character string for a character <0x80
130134
private static class AsciiCharacter
131135
{
132136
public static readonly string[] AsString;
@@ -139,5 +143,53 @@ static AsciiCharacter()
139143

140144
}
141145

146+
/// <summary>
/// Translates a set of indexes found in a decoded string to the corresponding indexes in the original (source) string.<br/>
/// For example "Süßigkeit" gets decoded to "Sussigkeit", so the first 'i' character has index 4 in the decoded string and 3 in the source string:
/// Unidecoder.FindIndexesInSourceString("Süßigkeit", new int[] {4}) returns {3}.<br/>
/// A source character that decodes to several characters owns every one of them, so decoded indexes 2 and 3 (the "ss") both map to source index 2 (the 'ß').
/// Source characters that decode to nothing never appear in the result.<br/>
/// Warning: this implementation assumes that the input IEnumerable is sorted (strictly increasing)!
/// </summary>
/// <param name="sourceString">The original, not yet decoded string. When null or empty the result is empty.</param>
/// <param name="indexesInDecodedString">Strictly increasing, non-negative indexes into the decoded string. When null the result is empty.</param>
/// <returns>
/// One source index for each decoded index that falls inside the decoded string, in the same order as the input.
/// Decoded indexes past the end of the decoded string produce no output.
/// </returns>
/// <exception cref="ArgumentException">
/// Thrown lazily, while the result is being enumerated, when an index is negative or the sequence is not strictly increasing.
/// </exception>
public static IEnumerable<int> FindIndexesInSourceString(string sourceString, IEnumerable<int> indexesInDecodedString)
{
    if (string.IsNullOrEmpty(sourceString))
        yield break;
    if (indexesInDecodedString == null)
        yield break;

    using var indexesEnumerator = indexesInDecodedString.GetEnumerator();

    // -1 as "previous index" lets the first index go through the same validation as all the others
    if (!TryReadNextIndex(indexesEnumerator, -1, out var currIndex))
        yield break;

    // number of decoded characters produced by the source characters processed so far;
    // the current source character owns the decoded range [decodedEnd - length, decodedEnd)
    int decodedEnd = 0;
    for (int srcIdx = 0; srcIdx < sourceString.Length; srcIdx++)
    {
        var ch = sourceString[srcIdx];
        var decodedCh = ch.Unidecode();
        if (string.IsNullOrEmpty(decodedCh))
            continue; // this character produced no output, so no decoded index can point to it

        decodedEnd += decodedCh.Length;

        // a loop, not an if: several requested indexes may fall inside the same multi-character expansion
        while (currIndex < decodedEnd)
        {
            yield return srcIdx;
            if (!TryReadNextIndex(indexesEnumerator, currIndex, out currIndex))
                yield break; // we decoded all the indexes
        }
    }

    // reads and validates the next requested index; returns false when the input sequence is exhausted
    static bool TryReadNextIndex(IEnumerator<int> enumerator, int previousIndex, out int nextIndex)
    {
        nextIndex = previousIndex;
        if (!enumerator.MoveNext())
            return false;

        nextIndex = enumerator.Current;
        if (nextIndex < 0)
            throw new ArgumentException("indexes can't be negative values", nameof(indexesInDecodedString));
        if (nextIndex <= previousIndex)
            throw new ArgumentException("Input sequence of indexes must be strictly increasing", nameof(indexesInDecodedString));
        return true;
    }
}
191+
192+
193+
142194
}
143195
}

test/UnidecoderTest.cs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
using System;
2+
using System.Linq;
23
using System.Text;
34
using Xunit;
45

@@ -106,6 +107,35 @@ public void MaximumLowByteTest()
106107
}
107108
}
108109

110+
111+
/// <summary>
/// Decodes "Süßigkeit", locates a few letters in the decoded text and checks that
/// FindIndexesInSourceString maps those positions back to the right positions in the source text.
/// </summary>
[Fact]
public void test()
{
    const string source = "Süßigkeit";

    string decoded = source.Unidecode();
    Assert.Equal("Sussigkeit", decoded);

    // positions of some letters inside the decoded text
    int posU = decoded.IndexOf("u");
    int posI = decoded.IndexOf("i");
    int posE = decoded.IndexOf("e");
    int posT = decoded.IndexOf("t");

    Assert.Equal(1, posU);
    Assert.Equal(4, posI);
    Assert.Equal(7, posE);
    Assert.Equal(9, posT);

    // map the decoded positions back to where those letters sit in the source text
    int[] decodedPositions = { posU, posI, posE, posT };
    int[] sourcePositions = Unidecoder.FindIndexesInSourceString(source, decodedPositions).ToArray();

    Assert.Equal(1, sourcePositions[0]);
    Assert.Equal(3, sourcePositions[1]);
    Assert.Equal(6, sourcePositions[2]);
    Assert.Equal(8, sourcePositions[3]);
}
137+
138+
109139
/// <summary>
110140
/// Tests that Unidecode "stackAlloc" optimized implementation falls back to the slowest SlowUnidecode implementation for long strings,
111141
/// instead of raising an error

0 commit comments

Comments
 (0)