-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
4e6145e
commit ebd4b0f
Showing
3 changed files
with
325 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,183 @@ | ||
namespace Tests; | ||
|
||
using UAX29; | ||
using System.Linq; | ||
using System.Text; | ||
|
||
/// <summary>
/// Tests for the range-yielding tokenizer obtained from <c>Tokenizer.GetWords(...).Ranges</c>.
/// </summary>
/// <remarks>
/// The range tokenizer is a struct whose <c>GetEnumerator()</c> returns a copy of itself, so a
/// <c>foreach</c> never advances the local variable it iterates over. Tests that need to observe
/// state changes on the tokenizer (Reset, SetText) therefore drive it with <c>MoveNext()</c> directly.
/// </remarks>
[TestFixture]
public class TestRangeTokenizer
{
    [SetUp]
    public void Setup()
    {
        // No shared state to prepare.
    }

    /// <summary>
    /// After fully consuming the tokenizer, Reset() must allow the same ranges to be produced again.
    /// </summary>
    [Test]
    public void Reset()
    {
        var example = "Hello, how are you?";

        var words = Tokenizer.GetWords(example);
        var ranges = words.Ranges;

        // Advance `ranges` itself (not a foreach copy) so that Reset() has something to undo.
        var first = new List<Range>();
        while (ranges.MoveNext())
        {
            first.Add(ranges.Current);
        }

        Assert.That(first, Has.Count.GreaterThan(1)); // just make sure it did the thing

        ranges.Reset();

        var second = new List<Range>();
        while (ranges.MoveNext())
        {
            second.Add(ranges.Current);
        }

        Assert.That(first.SequenceEqual(second));
    }

    /// <summary>
    /// After fully consuming the tokenizer, SetText() must restart iteration from the beginning of the new text.
    /// </summary>
    [Test]
    public void SetText()
    {
        var example = "Hello, how are you?";

        var words = Tokenizer.GetWords(example);
        var ranges = words.Ranges;

        // Advance `ranges` itself (not a foreach copy) so that SetText() has to rewind it.
        var first = new List<Range>();
        while (ranges.MoveNext())
        {
            first.Add(ranges.Current);
        }

        Assert.That(first, Has.Count.GreaterThan(1)); // just make sure it did the thing

        ranges.SetText(example);

        var second = new List<Range>();
        while (ranges.MoveNext())
        {
            second.Add(ranges.Current);
        }

        Assert.That(first.SequenceEqual(second));
    }

    /// <summary>
    /// Every range must select exactly the text of the corresponding token from the span tokenizer,
    /// and both must yield the same number of tokens.
    /// </summary>
    [Test]
    public void MatchesTokenizer()
    {
        var example = "abcdefghijk lmnopq r stu vwxyz; ABC DEFG HIJKL MNOP Q RSTUV WXYZ! 你好,世界.";
        var tokens = Tokenizer.GetWords(example);
        var ranges = tokens.Ranges;

        foreach (var range in ranges)
        {
            // If the tokenizer runs out first, Current would be stale and the comparison meaningless.
            Assert.That(tokens.MoveNext(), Is.True, "Tokenizer ended before the range tokenizer");

            var ranged = example.AsSpan(range);
            var token = tokens.Current;
            Assert.That(token.SequenceEqual(ranged));
        }

        Assert.That(tokens.MoveNext(), Is.False, "Tokenizer produced more tokens than the range tokenizer");
    }

    /// <summary>
    /// Manual MoveNext()/Current iteration and foreach iteration must produce the same ranges.
    /// </summary>
    [Test]
    public void Enumerator()
    {
        var input = "Hello, how are you?";
        var mem = input.AsMemory();
        // Result intentionally unused; presumably only checks that the ReadOnlyMemory overload is callable.
        Tokenizer.GetWords(mem);

        var words = Tokenizer.GetWords(input);
        var ranges = words.Ranges;

        var first = new List<Range>();
        while (ranges.MoveNext())
        {
            first.Add(ranges.Current);
        }
        Assert.That(first, Has.Count.GreaterThan(1)); // just make sure it did the thing

        // Use a fresh tokenizer for the foreach pass.
        var tokens2 = Tokenizer.GetWords(input);
        var ranges2 = tokens2.Ranges;

        var second = new List<Range>();
        foreach (var range in ranges2)
        {
            second.Add(range);
        }
        Assert.That(first.SequenceEqual(second));
    }

    /// <summary>
    /// ToList() must match iteration, leave the tokenizer at the start, and throw once iteration has begun.
    /// </summary>
    [Test]
    public void ToList()
    {
        var example = "abcdefghijk lmnopq r stu vwxyz; ABC DEFG HIJKL MNOP Q RSTUV WXYZ! 你好,世界.";
        var words = Tokenizer.GetWords(example);
        var ranges = words.Ranges;
        var list = ranges.ToList();

        var i = 0;
        foreach (var range in ranges)
        {
            Assert.That(range, Is.EqualTo(list[i]));
            i++;
        }

        Assert.That(list, Has.Count.EqualTo(i), "ToList should return the same number of tokens as iteration");

        // Tokenizer should reset back to the beginning
        Assert.That(ranges.start, Is.EqualTo(0));
        Assert.That(ranges.end, Is.EqualTo(0));

        // try/catch rather than Assert.Throws: a ref struct local cannot be captured by a lambda.
        var threw = false;
        ranges.MoveNext();
        try
        {
            ranges.ToList();
        }
        catch (InvalidOperationException)
        {
            threw = true;
        }
        Assert.That(threw, Is.True, "Calling ToList after iteration has begun should throw");
    }

    /// <summary>
    /// ToArray() must match iteration, leave the tokenizer at the start, and throw once iteration has begun.
    /// </summary>
    [Test]
    public void ToArray()
    {
        var example = "abcdefghijk lmnopq r stu vwxyz; ABC DEFG HIJKL MNOP Q RSTUV WXYZ! 你好,世界.";
        var words = Tokenizer.GetWords(example);
        var ranges = words.Ranges;
        var array = ranges.ToArray();

        var i = 0;
        foreach (var range in ranges)
        {
            Assert.That(range, Is.EqualTo(array[i]));
            i++;
        }

        Assert.That(array, Has.Length.EqualTo(i), "ToArray should return the same number of tokens as iteration");

        // Tokenizer should reset back to the beginning
        Assert.That(ranges.start, Is.EqualTo(0));
        Assert.That(ranges.end, Is.EqualTo(0));

        // try/catch rather than Assert.Throws: a ref struct local cannot be captured by a lambda.
        var threw = false;
        ranges.MoveNext();
        try
        {
            ranges.ToArray();
        }
        catch (InvalidOperationException)
        {
            threw = true;
        }
        Assert.That(threw, Is.True, "Calling ToArray after iteration has begun should throw");
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
namespace UAX29; | ||
|
||
/// <summary>
/// RangeTokenizer splits strings or UTF-8 bytes into words, sentences or graphemes, per the Unicode UAX #29 spec,
/// and reports each token as a <see cref="Range"/> (start and end offsets) into the input instead of as a slice of it.
/// </summary>
/// <typeparam name="T">byte or char, indicating the type of the input.</typeparam>
public ref struct RangeTokenizer<T> where T : struct
{
    ReadOnlySpan<T> input;

    readonly Split<T> split;

    // Offsets of the current token within the input; both are 0 before the first MoveNext().
    internal int start = 0;
    internal int end = 0;

    // Set by the first MoveNext(); ToList() and ToArray() refuse to run once it is true.
    bool begun = false;

    /// <summary>
    /// Creates a tokenizer over <paramref name="input"/> that reports tokens as ranges.
    /// </summary>
    /// <param name="input">The text to tokenize.</param>
    /// <param name="split">
    /// The splitting function. It is called with the not-yet-consumed remainder of the input and must return
    /// the length of the next token; a return value of 0 is treated as end of input.
    /// </param>
    internal RangeTokenizer(ReadOnlySpan<T> input, Split<T> split)
    {
        this.input = input;
        this.split = split;
    }

    /// <summary>
    /// Move to the next token. Use <see cref="Current"/> to retrieve the token's range.
    /// </summary>
    /// <returns>Whether there are any more tokens. False typically means EOF.</returns>
    public bool MoveNext()
    {
        begun = true;

        if (end < input.Length)
        {
            // The second argument is always true here; presumably it tells the splitter that the
            // remaining input is complete (no more data will follow) — confirm against Split<T>.
            var advance = this.split(input[end..], true);
            // Interpret as EOF
            if (advance == 0)
            {
                return false;
            }

            start = end;
            end = start + advance;

            return true;
        }
        return false;
    }

    /// <summary>
    /// The range (start and end offsets into the input) of the current token.
    /// Before the first call to <see cref="MoveNext"/>, this is the empty range 0..0.
    /// </summary>
    public readonly Range Current
    {
        get
        {
            return new Range(start, end);
        }
    }

    /// <summary>
    /// Enables <c>foreach</c> over the tokenizer.
    /// </summary>
    /// <returns>
    /// A copy of this tokenizer in its current state. Because the tokenizer is a struct, enumerating the
    /// returned copy does not advance this instance.
    /// </returns>
    public readonly RangeTokenizer<T> GetEnumerator()
    {
        return this;
    }

    /// <summary>
    /// Resets the tokenizer back to the first token.
    /// </summary>
    public void Reset()
    {
        this.start = 0;
        this.end = 0;
        this.begun = false;
    }

    /// <summary>
    /// (Re)sets the text to be tokenized, and resets the iterator back to the start.
    /// </summary>
    /// <param name="input">The new text to tokenize.</param>
    public void SetText(ReadOnlySpan<T> input)
    {
        Reset();
        this.input = input;
    }

    /// <summary>
    /// Iterates over all tokens and collects their ranges into a list, then resets the tokenizer.
    /// </summary>
    /// <returns>A list containing the <see cref="Range"/> of every token, in order.</returns>
    /// <exception cref="InvalidOperationException">Iteration has already begun on this tokenizer.</exception>
    public List<Range> ToList()
    {
        if (begun)
        {
            throw new InvalidOperationException("ToList must not be called after iteration has begun. You may wish to call Reset() on the tokenizer.");
        }

        var result = new List<Range>();
        foreach (var token in this)
        {
            result.Add(token);
        }

        this.Reset();
        return result;
    }

    /// <summary>
    /// Iterates over all tokens and collects their ranges into an array, then resets the tokenizer.
    /// </summary>
    /// <returns>An array containing the <see cref="Range"/> of every token, in order.</returns>
    /// <exception cref="InvalidOperationException">Iteration has already begun on this tokenizer.</exception>
    public Range[] ToArray()
    {
        // Checked here as well as in ToList() so that the message names the method the caller used.
        if (begun)
        {
            throw new InvalidOperationException("ToArray must not be called after iteration has begun. You may wish to call Reset() on the tokenizer.");
        }

        return this.ToList().ToArray();
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters