Skip to content

Commit

Permalink
Implement Ranges
Browse files Browse the repository at this point in the history
  • Loading branch information
clipperhouse committed Jun 22, 2024
1 parent 4e6145e commit ebd4b0f
Show file tree
Hide file tree
Showing 3 changed files with 325 additions and 0 deletions.
183 changes: 183 additions & 0 deletions uax29/RangeTokenizer.Test.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
namespace Tests;

using UAX29;
using System.Linq;
using System.Text;

/// <summary>
/// Tests for the range enumerator obtained from a tokenizer's <c>Ranges</c> property.
/// </summary>
/// <remarks>
/// The range tokenizer is a struct whose <c>GetEnumerator()</c> returns a copy of itself,
/// so a <c>foreach</c> never advances the variable it iterates over. Tests that need to
/// observe or change the state of a particular instance must call <c>MoveNext()</c> on it directly.
/// </remarks>
[TestFixture]
public class TestRangeTokenizer
{
    [SetUp]
    public void Setup()
    {
    }

    /// <summary>
    /// After being fully consumed, Reset() rewinds the range tokenizer so that a second pass
    /// yields the same ranges as the first.
    /// </summary>
    [Test]
    public void Reset()
    {
        var example = "Hello, how are you?";

        var words = Tokenizer.GetWords(example);
        var ranges = words.Ranges;

        // Advance `ranges` itself; a foreach would only advance a copy and leave nothing to reset.
        var first = new List<Range>();
        while (ranges.MoveNext())
        {
            first.Add(ranges.Current);
        }

        Assert.That(first, Has.Count.GreaterThan(1)); // just make sure it did the thing
        Assert.That(ranges.end, Is.GreaterThan(0), "Iteration should have advanced the tokenizer");

        ranges.Reset();

        Assert.That(ranges.start, Is.EqualTo(0));
        Assert.That(ranges.end, Is.EqualTo(0));

        var second = new List<Range>();
        while (ranges.MoveNext())
        {
            second.Add(ranges.Current);
        }

        Assert.That(first.SequenceEqual(second));
    }

    /// <summary>
    /// SetText() on a consumed range tokenizer rewinds it, so that iterating the same text again
    /// yields the same ranges.
    /// </summary>
    [Test]
    public void SetText()
    {
        var example = "Hello, how are you?";

        var words = Tokenizer.GetWords(example);
        var ranges = words.Ranges;

        // Advance `ranges` itself so that SetText has state to discard.
        var first = new List<Range>();
        while (ranges.MoveNext())
        {
            first.Add(ranges.Current);
        }

        Assert.That(first, Has.Count.GreaterThan(1)); // just make sure it did the thing
        Assert.That(ranges.end, Is.GreaterThan(0), "Iteration should have advanced the tokenizer");

        ranges.SetText(example);

        Assert.That(ranges.start, Is.EqualTo(0));
        Assert.That(ranges.end, Is.EqualTo(0));

        var second = new List<Range>();
        while (ranges.MoveNext())
        {
            second.Add(ranges.Current);
        }

        Assert.That(first.SequenceEqual(second));
    }

    /// <summary>
    /// Each range, applied to the original input, equals the token produced by the tokenizer
    /// at the same position, and both enumerations end together.
    /// </summary>
    [Test]
    public void MatchesTokenizer()
    {
        var example = "abcdefghijk lmnopq r stu vwxyz; ABC DEFG HIJKL MNOP Q RSTUV WXYZ! 你好,世界.";
        var tokens = Tokenizer.GetWords(example);
        var ranges = tokens.Ranges;

        foreach (var range in ranges)
        {
            Assert.That(tokens.MoveNext(), Is.True, "Tokenizer ended before the ranges did");

            var ranged = example.AsSpan(range);
            var token = tokens.Current;
            Assert.That(token.SequenceEqual(ranged));
        }

        Assert.That(tokens.MoveNext(), Is.False, "Tokenizer produced more tokens than ranges");
    }

    /// <summary>
    /// Manual MoveNext()/Current iteration and foreach iteration yield the same ranges
    /// from two independently created tokenizers over the same input.
    /// </summary>
    [Test]
    public void Enumerator()
    {
        var input = "Hello, how are you?";

        var words = Tokenizer.GetWords(input);
        var ranges = words.Ranges;

        var first = new List<Range>();
        while (ranges.MoveNext())
        {
            first.Add(ranges.Current);
        }
        Assert.That(first, Has.Count.GreaterThan(1)); // just make sure it did the thing

        // Take the second set of ranges from the second tokenizer, not from `words` again.
        var tokens2 = Tokenizer.GetWords(input);
        var ranges2 = tokens2.Ranges;

        var second = new List<Range>();
        foreach (var range in ranges2)
        {
            second.Add(range);
        }
        Assert.That(first.SequenceEqual(second));
    }

    /// <summary>
    /// ToList() returns the same ranges as iteration, leaves the tokenizer at the beginning,
    /// and throws once iteration has begun.
    /// </summary>
    [Test]
    public void ToList()
    {
        var example = "abcdefghijk lmnopq r stu vwxyz; ABC DEFG HIJKL MNOP Q RSTUV WXYZ! 你好,世界.";
        var words = Tokenizer.GetWords(example);
        var ranges = words.Ranges;
        var list = ranges.ToList();

        var i = 0;
        foreach (var range in ranges)
        {
            Assert.That(range, Is.EqualTo(list[i]));
            i++;
        }

        Assert.That(list, Has.Count.EqualTo(i), "ToList should return the same number of tokens as iteration");

        // Tokenizer should reset back to the beginning
        Assert.That(ranges.start, Is.EqualTo(0));
        Assert.That(ranges.end, Is.EqualTo(0));

        // try/catch rather than Assert.Throws: a ref struct cannot be captured by a lambda.
        var threw = false;
        ranges.MoveNext();
        try
        {
            ranges.ToList();
        }
        catch (InvalidOperationException)
        {
            threw = true;
        }
        Assert.That(threw, Is.True, "Calling ToList after iteration has begun should throw");
    }

    /// <summary>
    /// ToArray() returns the same ranges as iteration, leaves the tokenizer at the beginning,
    /// and throws once iteration has begun.
    /// </summary>
    [Test]
    public void ToArray()
    {
        var example = "abcdefghijk lmnopq r stu vwxyz; ABC DEFG HIJKL MNOP Q RSTUV WXYZ! 你好,世界.";
        var words = Tokenizer.GetWords(example);
        var ranges = words.Ranges;
        var array = ranges.ToArray();

        var i = 0;
        foreach (var range in ranges)
        {
            Assert.That(range, Is.EqualTo(array[i]));
            i++;
        }

        Assert.That(array, Has.Length.EqualTo(i), "ToArray should return the same number of tokens as iteration");

        // Tokenizer should reset back to the beginning
        Assert.That(ranges.start, Is.EqualTo(0));
        Assert.That(ranges.end, Is.EqualTo(0));

        // try/catch rather than Assert.Throws: a ref struct cannot be captured by a lambda.
        var threw = false;
        ranges.MoveNext();
        try
        {
            ranges.ToArray();
        }
        catch (InvalidOperationException)
        {
            threw = true;
        }
        Assert.That(threw, Is.True, "Calling ToArray after iteration has begun should throw");
    }
}
125 changes: 125 additions & 0 deletions uax29/RangeTokenizer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
namespace UAX29;

/// <summary>
/// Enumerates the boundaries of the tokens in an input as <see cref="Range"/> values,
/// instead of producing the tokens themselves. Boundaries are determined by the
/// split function supplied at construction.
/// </summary>
/// <typeparam name="T">The element type of the input span.</typeparam>
public ref struct RangeTokenizer<T> where T : struct
{
    ReadOnlySpan<T> input;

    readonly Split<T> split;

    // Bounds of the current token within the input; both are 0 before iteration starts.
    internal int start = 0;
    internal int end = 0;

    // Set by the first call to MoveNext, cleared by Reset.
    bool begun = false;

    /// <summary>
    /// Creates a range tokenizer positioned at the beginning of <paramref name="input"/>.
    /// </summary>
    /// <param name="input">The data whose token boundaries will be enumerated.</param>
    /// <param name="split">The function that reports the length of the next token in the remaining input.</param>
    internal RangeTokenizer(ReadOnlySpan<T> input, Split<T> split)
    {
        this.split = split;
        this.input = input;
    }

    /// <summary>
    /// Advances to the next token. Use <see cref="Current"/> to retrieve its range.
    /// </summary>
    /// <returns>
    /// True if a token was found; false when the input is exhausted or the split
    /// function reports a zero-length advance.
    /// </returns>
    public bool MoveNext()
    {
        begun = true;

        if (end >= input.Length)
        {
            return false;
        }

        var remaining = input[end..];
        var advance = split(remaining, true);

        // A zero advance is interpreted as EOF
        if (advance == 0)
        {
            return false;
        }

        // The new token begins where the previous one ended.
        start = end;
        end += advance;

        return true;
    }

    /// <summary>
    /// The range of the current token within the input. Apply it to the original input
    /// to obtain the token. Before the first <see cref="MoveNext"/>, this is the empty range 0..0.
    /// </summary>
    public readonly Range Current => start..end;

    /// <summary>
    /// Returns a copy of this tokenizer for use by foreach. Because it is a copy,
    /// enumerating with foreach does not advance this instance.
    /// </summary>
    public readonly RangeTokenizer<T> GetEnumerator() => this;

    /// <summary>
    /// Resets the tokenizer back to the first token.
    /// </summary>
    public void Reset()
    {
        begun = false;
        start = 0;
        end = 0;
    }

    /// <summary>
    /// (Re)sets the input to be tokenized, and resets the iterator back to the start.
    /// </summary>
    /// <param name="input">The new input.</param>
    public void SetText(ReadOnlySpan<T> input)
    {
        this.input = input;
        Reset();
    }

    /// <summary>
    /// Iterates over all tokens and collects their ranges into a list.
    /// The tokenizer is reset to the beginning afterwards.
    /// </summary>
    /// <returns>A list with one <see cref="Range"/> per token, in input order.</returns>
    /// <exception cref="InvalidOperationException">Iteration has already begun on this instance.</exception>
    public List<Range> ToList()
    {
        if (begun)
        {
            throw new InvalidOperationException("ToList must not be called after iteration has begun. You may wish to call Reset() on the tokenizer.");
        }

        var collected = new List<Range>();
        var enumerator = GetEnumerator();
        while (enumerator.MoveNext())
        {
            collected.Add(enumerator.Current);
        }

        Reset();
        return collected;
    }

    /// <summary>
    /// Iterates over all tokens and collects their ranges into an array.
    /// The tokenizer is reset to the beginning afterwards.
    /// </summary>
    /// <returns>An array with one <see cref="Range"/> per token, in input order.</returns>
    /// <exception cref="InvalidOperationException">Iteration has already begun on this instance.</exception>
    public Range[] ToArray()
    {
        if (begun)
        {
            throw new InvalidOperationException("ToArray must not be called after iteration has begun. You may wish to call Reset() on the tokenizer.");
        }

        var collected = ToList();
        return collected.ToArray();
    }
}
17 changes: 17 additions & 0 deletions uax29/Tokenizer.cs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
using System.Collections.Immutable;

namespace UAX29;

/// <summary>
Expand Down Expand Up @@ -131,4 +133,19 @@ public T[][] ToArray()

return this.ToList().ToArray();
}

/// <summary>
/// The boundaries of the tokens, as ranges into the original input.
/// </summary>
/// <returns>
/// A new enumerator of Range on each access, constructed from this tokenizer's input and split function.
/// Use foreach to iterate over the ranges, and apply each one to your original input
/// using [range] or .AsSpan(range) to get the corresponding token.
/// </returns>
public RangeTokenizer<T> Ranges => new RangeTokenizer<T>(input, split);
}

0 comments on commit ebd4b0f

Please sign in to comment.