diff --git a/uax29/RangeTokenizer.Test.cs b/uax29/RangeTokenizer.Test.cs
new file mode 100644
index 0000000..63758bd
--- /dev/null
+++ b/uax29/RangeTokenizer.Test.cs
@@ -0,0 +1,185 @@
+namespace Tests;
+
+using UAX29;
+using System.Linq;
+using System.Text;
+
+[TestFixture]
+public class TestRangeTokenizer
+{
+    [SetUp]
+    public void Setup()
+    {
+    }
+
+    [Test]
+    public void Reset()
+    {
+        var example = "Hello, how are you?";
+
+        var words = Tokenizer.GetWords(example);
+        var ranges = words.Ranges;
+
+        // Call MoveNext on `ranges` directly: foreach would enumerate a copy (GetEnumerator
+        // returns the struct by value), leaving `ranges` untouched and Reset with nothing to undo.
+        var first = new List<Range>();
+        while (ranges.MoveNext())
+        {
+            first.Add(ranges.Current);
+        }
+
+        Assert.That(first, Has.Count.GreaterThan(1)); // just make sure it did the thing
+
+        ranges.Reset();
+
+        var second = new List<Range>();
+        while (ranges.MoveNext())
+        {
+            second.Add(ranges.Current);
+        }
+
+        Assert.That(first.SequenceEqual(second));
+    }
+
+    [Test]
+    public void SetText()
+    {
+        var example = "Hello, how are you?";
+
+        var words = Tokenizer.GetWords(example);
+        var ranges = words.Ranges;
+
+        // Exhaust `ranges` itself (not a foreach copy) so that SetText has state to reset.
+        var first = new List<Range>();
+        while (ranges.MoveNext())
+        {
+            first.Add(ranges.Current);
+        }
+
+        Assert.That(first, Has.Count.GreaterThan(1)); // just make sure it did the thing
+
+        ranges.SetText(example);
+
+        var second = new List<Range>();
+        while (ranges.MoveNext())
+        {
+            second.Add(ranges.Current);
+        }
+
+        Assert.That(first.SequenceEqual(second));
+    }
+
+    [Test]
+    public void MatchesTokenizer()
+    {
+        var example = "abcdefghijk lmnopq r stu vwxyz; ABC DEFG HIJKL MNOP Q RSTUV WXYZ! 你好,世界.";
+        var tokens = Tokenizer.GetWords(example);
+        var ranges = tokens.Ranges;
+
+        foreach (var range in ranges)
+        {
+            tokens.MoveNext();
+
+            var ranged = example.AsSpan(range);
+            var token = tokens.Current;
+            Assert.That(token.SequenceEqual(ranged));
+        }
+    }
+
+    [Test]
+    public void Enumerator()
+    {
+        var input = "Hello, how are you?";
+        var mem = input.AsMemory();
+        Tokenizer.GetWords(mem);
+
+        var words = Tokenizer.GetWords(input);
+        var ranges = words.Ranges;
+
+        var first = new List<Range>();
+        while (ranges.MoveNext())
+        {
+            first.Add(ranges.Current);
+        }
+        Assert.That(first, Has.Count.GreaterThan(1)); // just make sure it did the thing
+
+
+        var tokens2 = Tokenizer.GetWords(input);
+        var ranges2 = tokens2.Ranges;
+
+        var second = new List<Range>();
+        foreach (var range in ranges2)
+        {
+            second.Add(range);
+        }
+        Assert.That(first.SequenceEqual(second));
+    }
+
+    [Test]
+    public void ToList()
+    {
+        var example = "abcdefghijk lmnopq r stu vwxyz; ABC DEFG HIJKL MNOP Q RSTUV WXYZ! 你好,世界.";
+        var words = Tokenizer.GetWords(example);
+        var ranges = words.Ranges;
+        var list = ranges.ToList();
+
+        var i = 0;
+        foreach (var range in ranges)
+        {
+            Assert.That(range, Is.EqualTo(list[i]));
+            i++;
+        }
+
+        Assert.That(list, Has.Count.EqualTo(i), "ToList should return the same number of tokens as iteration");
+
+        // Tokenizer should reset back to the beginning
+        Assert.That(ranges.start, Is.EqualTo(0));
+        Assert.That(ranges.end, Is.EqualTo(0));
+
+        var threw = false;
+        ranges.MoveNext();
+        try
+        {
+            ranges.ToList();
+        }
+        catch (InvalidOperationException)
+        {
+            threw = true;
+        }
+        Assert.That(threw, Is.True, "Calling ToList after iteration has begun should throw");
+    }
+
+    [Test]
+    public void ToArray()
+    {
+        var example = "abcdefghijk lmnopq r stu vwxyz; ABC DEFG HIJKL MNOP Q RSTUV WXYZ! 你好,世界.";
+        var words = Tokenizer.GetWords(example);
+        var ranges = words.Ranges;
+        var array = ranges.ToArray();
+
+        var i = 0;
+        foreach (var range in ranges)
+        {
+            Assert.That(range, Is.EqualTo(array[i]));
+            i++;
+        }
+
+        Assert.That(array, Has.Length.EqualTo(i), "ToArray should return the same number of tokens as iteration");
+
+        // Tokenizer should reset back to the beginning
+        Assert.That(ranges.start, Is.EqualTo(0));
+        Assert.That(ranges.end, Is.EqualTo(0));
+
+        var threw = false;
+        ranges.MoveNext();
+        try
+        {
+            ranges.ToArray();
+        }
+        catch (InvalidOperationException)
+        {
+            threw = true;
+        }
+        Assert.That(threw, Is.True, "Calling ToArray after iteration has begun should throw");
+    }
+}
diff --git a/uax29/RangeTokenizer.cs b/uax29/RangeTokenizer.cs
new file mode 100644
index 0000000..5129b9e
--- /dev/null
+++ b/uax29/RangeTokenizer.cs
@@ -0,0 +1,133 @@
+namespace UAX29;
+
+/// <summary>
+/// RangeTokenizer splits strings or UTF-8 bytes as words, sentences or graphemes, per the Unicode UAX #29 spec,
+/// yielding the boundaries of each token as a <see cref="Range"/> into the input.
+/// </summary>
+/// <typeparam name="T">byte or char, indicating the type of the input.</typeparam>
+public ref struct RangeTokenizer<T> where T : struct
+{
+    ReadOnlySpan<T> input;
+
+    readonly Split<T> split;
+
+    internal int start = 0;
+    internal int end = 0;
+
+    bool begun = false;
+
+    /// <summary>
+    /// Creates a tokenizer that yields the ranges of the tokens found in <paramref name="input"/>.
+    /// </summary>
+    /// <param name="input">A string, or UTF-8 byte array.</param>
+    /// <param name="split">Choose to split words, graphemes or sentences.</param>
+    internal RangeTokenizer(ReadOnlySpan<T> input, Split<T> split)
+    {
+        this.input = input;
+        this.split = split;
+    }
+
+    /// <summary>
+    /// Move to the next token. Use <see cref="Current"/> to retrieve its range.
+    /// </summary>
+    /// <returns>Whether there are any more tokens. False typically means EOF.</returns>
+    public bool MoveNext()
+    {
+        begun = true;
+
+        if (end < input.Length)
+        {
+            var advance = this.split(input[end..], true);
+            // Interpret as EOF
+            if (advance == 0)
+            {
+                return false;
+            }
+
+            start = end;
+            end = start + advance;
+
+            return true;
+        }
+        return false;
+    }
+
+    /// <summary>
+    /// The range (start and end offsets into the input) of the current token (word, grapheme or sentence).
+    /// Apply it to the original input, e.g. with [range] or .AsSpan(range), to get the token itself.
+    /// Before the first call to <see cref="MoveNext"/>, the range is empty (0..0).
+    /// </summary>
+    public readonly Range Current
+    {
+        get
+        {
+            return new Range(start, end);
+        }
+    }
+
+    /// <summary>
+    /// Returns a copy of this tokenizer to serve as the enumerator, which allows use with foreach.
+    /// Because the copy is by value, enumerating with foreach does not advance this instance.
+    /// </summary>
+    public readonly RangeTokenizer<T> GetEnumerator()
+    {
+        return this;
+    }
+
+    /// <summary>
+    /// Resets the tokenizer back to the first token.
+    /// </summary>
+    public void Reset()
+    {
+        this.start = 0;
+        this.end = 0;
+        this.begun = false;
+    }
+
+    /// <summary>
+    /// (Re)sets the text to be tokenized, and resets the iterator back to the start.
+    /// </summary>
+    /// <param name="input">The new chars or UTF-8 bytes to tokenize.</param>
+    public void SetText(ReadOnlySpan<T> input)
+    {
+        Reset();
+        this.input = input;
+    }
+
+    /// <summary>
+    /// Iterates over all tokens and collects their ranges into a list.
+    /// </summary>
+    /// <returns>A list of the range of each token, in order.</returns>
+    /// <exception cref="InvalidOperationException">If iteration has already begun.</exception>
+    public List<Range> ToList()
+    {
+        if (begun)
+        {
+            throw new InvalidOperationException("ToList must not be called after iteration has begun. You may wish to call Reset() on the tokenizer.");
+        }
+
+        var result = new List<Range>();
+        foreach (var token in this)
+        {
+            result.Add(token);
+        }
+
+        this.Reset();
+        return result;
+    }
+
+    /// <summary>
+    /// Iterates over all tokens and collects their ranges into an array.
+    /// </summary>
+    /// <returns>An array of the range of each token, in order.</returns>
+    /// <exception cref="InvalidOperationException">If iteration has already begun.</exception>
+    public Range[] ToArray()
+    {
+        if (begun)
+        {
+            throw new InvalidOperationException("ToArray must not be called after iteration has begun. You may wish to call Reset() on the tokenizer.");
+        }
+
+        return this.ToList().ToArray();
+    }
+}
diff --git a/uax29/Tokenizer.cs b/uax29/Tokenizer.cs
index 8858f6d..fa2c4aa 100644
--- a/uax29/Tokenizer.cs
+++ b/uax29/Tokenizer.cs
@@ -1,3 +1,5 @@
+using System.Collections.Immutable;
+
 namespace UAX29;
 
 /// <summary>
@@ -131,4 +133,19 @@ public T[][] ToArray()
 
         return this.ToList().ToArray();
     }
+
+    /// <summary>
+    /// Get the ranges (boundaries) of the tokens.
+    /// </summary>
+    /// <returns>
+    /// An enumerator of Range. Use foreach to iterate over the ranges. Apply them to your original input
+    /// using [range] or .AsSpan(range) to get the tokens.
+    /// </returns>
+    public RangeTokenizer<T> Ranges
+    {
+        get
+        {
+            return new RangeTokenizer<T>(input, split);
+        }
+    }
 }