diff --git a/.gitignore b/.gitignore index bd9e9fb..58a9fbf 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ BenchmarkDotNet.Artifacts bin obj .vscode/tasks.json +global.json diff --git a/Benchmarks/Program.cs b/Benchmarks/Program.cs index 31e5afa..01b448b 100644 --- a/Benchmarks/Program.cs +++ b/Benchmarks/Program.cs @@ -1,3 +1,4 @@ +using System.Buffers; using System.Text; using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Configs; @@ -39,70 +40,62 @@ public void Setup() } [Benchmark] - public void TokenizeBytes() + public void SplitBytes() { - var tokens = Tokenizer.GetWords(sample); + var tokens = Split.Words(sample); foreach (var token in tokens) { } } [Benchmark] - public void TokenizeBytesOmitWhitespace() + public void SplitBytesOmitWhitespace() { - var tokens = Tokenizer.GetWords(sample, Options.OmitWhitespace); + var tokens = Split.Words(sample, Options.OmitWhitespace); foreach (var token in tokens) { } } [Benchmark] - public void TokenizeString() + public void SplitString() { - var tokens = Tokenizer.GetWords(sampleStr); + var tokens = Split.Words(sampleStr); foreach (var token in tokens) { } } [Benchmark] - public void TokenizeStringOmitWhitespace() + public void SplitStringOmitWhitespace() { - var tokens = Tokenizer.GetWords(sampleStr, Options.OmitWhitespace); + var tokens = Split.Words(sampleStr, Options.OmitWhitespace); foreach (var token in tokens) { } } [Benchmark] - public void TokenizeStream() + public void SplitStream() { - var stream = new MemoryStream(sample); - var tokens = Tokenizer.GetWords(stream); - foreach (var token in tokens) - { - } + sampleStream.Seek(0, SeekOrigin.Begin); + var tokens = Split.Words(sampleStream); + foreach (var token in tokens) { } } + static readonly ArrayPool pool = ArrayPool.Shared; + [Benchmark] - public void TokenizeSetStream() + public void SplitStreamArrayPool() { - // This is to test to observe allocations. 
+ var storage = pool.Rent(2048); - // The creation will allocate a buffer of 1024 bytes - var tokens = Tokenizer.GetWords(sampleStream); + sampleStream.Seek(0, SeekOrigin.Begin); + var tokens = Split.Words(sampleStream, minBufferBytes: 1024, bufferStorage: storage); + tokens.SetStream(sampleStream); + foreach (var token in tokens) { } - var runs = 10; - // keep in mind the 10 runs when interpreting the benchmark - for (var i = 0; i < runs; i++) - { - // subsequent runs should allocate less by using SetStream - sampleStream.Seek(0, SeekOrigin.Begin); - tokens.SetStream(sampleStream); - foreach (var token in tokens) - { - } - } + pool.Return(storage); } [Benchmark] @@ -115,9 +108,9 @@ public void StringInfoGraphemes() } [Benchmark] - public void TokenizerGraphemes() + public void SplitGraphemes() { - var tokens = Tokenizer.GetGraphemes(sample); + var tokens = Split.Graphemes(sample); foreach (var token in tokens) { } diff --git a/Benchmarks/Speed.cs b/Benchmarks/Speed.cs index 7a48db2..c1c1d81 100644 --- a/Benchmarks/Speed.cs +++ b/Benchmarks/Speed.cs @@ -11,9 +11,9 @@ public string GetValue(Summary summary, BenchmarkCase benchmarkCase) return "N/A"; } var ourReport = summary.Reports.First(x => x.BenchmarkCase.Equals(benchmarkCase)); - long length = new System.IO.FileInfo("sample.txt").Length; - var mean = ourReport.ResultStatistics.Mean; - return $"{(length / ourReport.ResultStatistics.Mean):#####.000}"; + long length = new FileInfo("sample.txt").Length; + var mean = ourReport.ResultStatistics!.Mean; + return $"{length / mean:#####.000} GB/s"; } public string GetValue(Summary summary, BenchmarkCase benchmarkCase, SummaryStyle style) => GetValue(summary, benchmarkCase); @@ -21,7 +21,7 @@ public string GetValue(Summary summary, BenchmarkCase benchmarkCase) public bool IsAvailable(Summary summary) => true; public string Id { get; } = nameof(Speed); - public string ColumnName { get; } = "Speed (GB/s)"; + public string ColumnName { get; } = "Throughput"; public bool AlwaysShow { get; } = true; public ColumnCategory Category { get; } = ColumnCategory.Custom; public int PriorityInCategory { get; } diff --git a/Codegen/Program.cs b/Codegen/Program.cs index 85bdda4..3ceaadc 100644 --- a/Codegen/Program.cs +++ b/Codegen/Program.cs @@ -115,11 +115,26 @@ static async Task WriteCategories(string typ) if (typ == "Word") { - // hack in a Tab category that the spec doesn't use, be we do - const string tab = "Tab"; + const string ws = "Whitespace"; currentCat <<= 1; - cats.Add(tab, currentCat); - catsByRune.Add(0x09, tab); + cats.Add(ws, currentCat); + + for (var i = 0; i < char.MaxValue; i++) + { + var ch = (char)i; + if (char.IsWhiteSpace(ch)) + { + var r = new Rune(ch); + if (catsByRune.TryGetValue(r.Value, out string? 
existing)) + { + catsByRune[r.Value] = $"{existing} | {ws}"; + } + else + { + catsByRune.Add(r.Value, ws); + } + } + } } // write the file @@ -142,7 +157,7 @@ internal static partial class {typ}s } dict.Write(@" - internal static readonly Dict Dict = new(GetDict()); + static readonly Dict Dict = new(GetDict()); static Dictionary GetDict() => new() { "); @@ -181,7 +196,41 @@ static async Task WriteTests(string typ) [TestFixture] public class {typ}sTests {{ - internal readonly static UnicodeTest[] UnicodeTests = [ + static UnicodeTest[] Tests => UnicodeTests; + + [Test, TestCaseSource(nameof(Tests))] + public void Bytes(UnicodeTest test) + {{ + var tokens = Split.{typ}s(test.input); + TestUnicode.TestBytes(tokens, test); + }} + + [Test, TestCaseSource(nameof(Tests))] + public void String(UnicodeTest test) + {{ + var s = Encoding.UTF8.GetString(test.input); + var tokens = Split.{typ}s(s); + TestUnicode.TestChars(tokens, test); + }} + + [Test, TestCaseSource(nameof(Tests))] + public void Stream(UnicodeTest test) + {{ + using var stream = new MemoryStream(test.input); + var tokens = Split.{typ}s(stream); + TestUnicode.TestStream(tokens, test); + }} + + [Test, TestCaseSource(nameof(Tests))] + public void TextReader(UnicodeTest test) + {{ + using var stream = new MemoryStream(test.input); + using var reader = new StreamReader(stream); + var tokens = Split.{typ}s(reader); + TestUnicode.TestTextReader(tokens, test); + }} + + readonly static UnicodeTest[] UnicodeTests = [ "); while (true) { @@ -241,40 +290,6 @@ public class {typ}sTests } dict.Write(@$" ]; - - static readonly UnicodeTest[] Tests = UnicodeTests; - - [Test, TestCaseSource(nameof(Tests))] - public void Bytes(UnicodeTest test) - {{ - var tokens = Tokenizer.Get{typ}s(test.input); - TestUnicode.TestTokenizerBytes(tokens, test); - }} - - [Test, TestCaseSource(nameof(Tests))] - public void String(UnicodeTest test) - {{ - var s = Encoding.UTF8.GetString(test.input); - var tokens = Tokenizer.Get{typ}s(s); - TestUnicode.TestTokenizerChars(tokens, test); - }} - - [Test, TestCaseSource(nameof(Tests))] - public void Stream(UnicodeTest test) - {{ - using var stream = new MemoryStream(test.input); - var tokens = Tokenizer.Get{typ}s(stream); - TestUnicode.TestTokenizerStream(tokens, test); - }} - - [Test, TestCaseSource(nameof(Tests))] - public void TextReader(UnicodeTest test) - {{ - using var stream = new MemoryStream(test.input); - using var reader = new StreamReader(stream); - var tokens = Tokenizer.Get{typ}s(reader); - TestUnicode.TestTokenizerTextReader(tokens, test); - }} }} "); } diff --git a/README.md b/README.md index 0a1a004..483938e 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,8 @@ Any time our code operates on individual words, we are tokenizing. Often, we do ### Example +_⚠️ This documentation on `main` refers to v3, which is not yet published on Nuget. See [v2 documentation](https://github.com/clipperhouse/uax29.net/blob/v2.2.0/README.md) until then._ + ``` dotnet add package UAX29 ``` @@ -19,7 +21,7 @@ var example = "Hello, 🌏 world. 你好,世界."; // The tokenizer can split words, graphemes or sentences. // It operates on strings, UTF-8 bytes, and streams. 
-var words = Tokenizer.GetWords(example); +var words = Split.Words(example); // Iterate over the tokens foreach (var word in words) @@ -47,7 +49,7 @@ world */ var utf8bytes = Encoding.UTF8.GetBytes(example); -var graphemes = Tokenizer.GetGraphemes(utf8bytes); +var graphemes = Split.Graphemes(utf8bytes); // Iterate over the tokens foreach (var grapheme in graphemes) @@ -84,19 +86,29 @@ d */ ``` +There are also optional extension methods in the spirit of `string.Split`: + +```csharp +using UAX29.Extensions; + +example.SplitWords(); +``` + ### Data types For UTF-8 bytes, pass `byte[]`, `Span` or `Stream`; the resulting tokens will be `ReadOnlySpan`. For strings/chars, pass `string`, `char[]`, `Span` or `TextReader`/`StreamReader`; the resulting tokens will be `ReadOnlySpan`. +If you have `Memory`, pass `Memory.Span`. + ### Conformance We use the official Unicode [test suites](https://unicode.org/reports/tr41/tr41-26.html#Tests29). Status: [![.NET](https://github.com/clipperhouse/uax29.net/actions/workflows/dotnet.yml/badge.svg)](https://github.com/clipperhouse/uax29.net/actions/workflows/dotnet.yml) -This is the same algorithm that is implemented in Lucene's [StandardTokenizer](https://lucene.apache.org/core/6_5_0/core/org/apache/lucene/analysis/standard/StandardTokenizer.html). +This is the same spec that is implemented in Lucene's [StandardTokenizer](https://lucene.apache.org/core/6_5_0/core/org/apache/lucene/analysis/standard/StandardTokenizer.html). ### Performance @@ -104,28 +116,34 @@ When tokenizing words, I get around 120MB/s on my Macbook M2. For typical text, The tokenizer is implemented as a `ref struct`, so you should see zero allocations for static text such as `byte[]` or `string`/`char`. -Calling `GetWords` et al returns a lazy enumerator, and will not allocate per-token. There are `ToList` and `ToArray` methods for convenience, which will allocate. +Calling `Split.Words` returns a lazy enumerator, and will not allocate per-token. There are `ToList` and `ToArray` methods for convenience, which will allocate. -For `Stream` or `TextReader`/`StreamReader`, a buffer needs to be allocated behind the scenes. You can specify the size when calling `GetWords`. You can also optionally pass your own `byte[]` or `char[]` to do your own allocation, perhaps with [ArrayPool](https://learn.microsoft.com/en-us/dotnet/api/system.buffers.arraypool-1). Or, you can re-use the buffer by calling `SetStream` on an existing tokenizer, which will avoid re-allocation. +For `Stream` or `TextReader`/`StreamReader`, a buffer needs to be allocated behind the scenes. You can specify the size when calling `Split.Words`. You can also optionally pass your own `byte[]` or `char[]` to do your own allocation, perhaps with [ArrayPool](https://github.com/clipperhouse/uax29.net/blob/main/Benchmarks/Program.cs#L89). Or, you can re-use the buffer by calling `SetStream` on an existing tokenizer, which will avoid re-allocation. ### Options -Pass `Options.OmitWhitespace` if you would like whitespace-only tokens not to be returned. +Pass `Options.OmitWhitespace` if you would like whitespace-only tokens not to be returned (for words only). ### Invalid inputs -The tokenizer expects valid (decodable) UTF-8 bytes or UTF-16 chars as input. We [make an effort](https://github.com/clipperhouse/uax29.net/blob/main/uax29/Unicode.Test.cs#L55) to ensure that all bytes will be returned even if invalid, i.e. to be lossless in any case, though the resulting tokenization may not be useful. Garbage in, garbage out. 
+The tokenizer expects valid (decodable) UTF-8 bytes or UTF-16 chars as input. We [make an effort](https://github.com/clipperhouse/uax29.net/blob/main/uax29/Unicode.Test.cs#L80) to ensure that all bytes will be returned even if invalid, i.e. to be lossless in any case, though the resulting tokenization may not be useful. Garbage in, garbage out. ### Major version changes -If you are using v1.x of this package, v2 has been renamed: +#### v2 → v3 + +Renamed methods: + +`Tokenizer.GetWords(input)` → `Split.Words(input)` + +#### v1 → v2 + +Renamed package, namespace and methods: `dotnet add package uax29.net` → `dotnet add package UAX29` `using uax29` → `using UAX29` -We renamed the methods: - `Tokenizer.Create(input)` → `Tokenizer.GetWords(input)` `Tokenizer.Create(input, TokenType.Graphemes)` → `Tokenizer.GetGraphemes(input)` diff --git a/uax29/Buffer.Test.cs b/uax29/Buffer.Test.cs index 06b4006..7917531 100644 --- a/uax29/Buffer.Test.cs +++ b/uax29/Buffer.Test.cs @@ -81,6 +81,44 @@ public void Moving() Assert.That(buffer.end, Is.EqualTo(storageSize)); } } + + [Test] + public void MinBufferSize() + { + var input = "Hello, how are you?"; + var bytes = Encoding.UTF8.GetBytes(input); + using var stream = new MemoryStream(bytes); + + { + var storage = new byte[1024]; + var minBufferBytes = 1024; + bool threw = false; + try + { + var words = new Buffer(stream.Read, minBufferBytes, storage); // ok + } + catch (ArgumentException) + { + threw = true; + } + Assert.That(threw, Is.False); + } + { + var storage = new byte[1024]; + var minBufferBytes = 1025; + + bool threw = false; + try + { + var words = new Buffer(stream.Read, minBufferBytes, storage); // not ok + } + catch (ArgumentException) + { + threw = true; + } + Assert.That(threw, Is.True); + } + } } diff --git a/uax29/Buffer.cs b/uax29/Buffer.cs index 689d391..3547d5b 100644 --- a/uax29/Buffer.cs +++ b/uax29/Buffer.cs @@ -19,15 +19,15 @@ public ref struct Buffer /// public bool EOF { get; private set; } - public Buffer(Read read, int minItems, T[]? storage = null) + public Buffer(Read read, int minBuffer, T[]? storage = null) { this.read = read; - this.minItems = minItems; - if (storage != null && storage.Length < minItems) + this.minItems = minBuffer; + if (storage != null && storage.Length < minBuffer) { - throw new ArgumentException($"Storage ({typeof(T)}[{storage.Length}]) must be at least as large as minItems ({minItems})."); + throw new ArgumentException($"Storage ({typeof(T)}[{storage.Length}]) must be at least as large as minBuffer ({minBuffer})."); } - storage ??= new T[minItems]; + storage ??= new T[minBuffer]; this.storage = storage; } diff --git a/uax29/Examples.Test.cs b/uax29/Examples.Test.cs index 608d3d8..6d77cfe 100644 --- a/uax29/Examples.Test.cs +++ b/uax29/Examples.Test.cs @@ -1,9 +1,9 @@ using System.Text; using UAX29; +using UAX29.Extensions; namespace Tests; - [TestFixture] public class TestExample { @@ -21,7 +21,7 @@ public void Readme() // The tokenizer can split words, graphemes or sentences. // It operates on strings, UTF-8 bytes, and streams. - var words = Tokenizer.GetWords(example); + var words = example.SplitWords(); // Iterate over the tokens foreach (var word in words) @@ -34,12 +34,12 @@ public void Readme() /* Hello , - + 🌏 - + world . 
- + 你 好 , @@ -49,9 +49,9 @@ public void Readme() */ var utf8bytes = Encoding.UTF8.GetBytes(example); - var graphemes = Tokenizer.GetGraphemes(utf8bytes); + var graphemes = utf8bytes.SplitGraphemes(); - // Iterate over the tokens + // Iterate over the tokens foreach (var grapheme in graphemes) { // grapheme is a ReadOnlySpan of UTF-8 bytes @@ -67,16 +67,16 @@ public void Readme() l o , - + 🌏 - + w o r l d . - + 你 好 , diff --git a/uax29/Tokenizer.Graphemes.cs b/uax29/Extensions/Extensions.Graphemes.cs similarity index 71% rename from uax29/Tokenizer.Graphemes.cs rename to uax29/Extensions/Extensions.Graphemes.cs index 7cb373a..88755b8 100644 --- a/uax29/Tokenizer.Graphemes.cs +++ b/uax29/Extensions/Extensions.Graphemes.cs @@ -1,6 +1,7 @@ -namespace UAX29; +namespace UAX29.Extensions; +using UAX29; -public static partial class Tokenizer +public static partial class Extensions { /// /// Split the graphemes in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ @@ -9,7 +10,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer GetGraphemes(Span input) => new(input, Graphemes.SplitBytes); + public static SplitEnumerator SplitGraphemes(this Span input) => Split.Graphemes(input); /// /// Split the graphemes in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ @@ -18,7 +19,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer GetGraphemes(ReadOnlySpan input) => new(input, Graphemes.SplitBytes); + public static SplitEnumerator SplitGraphemes(this ReadOnlySpan input) => Split.Graphemes(input); /// /// Split the graphemes in the given of UTF-8 encoded bytes. @@ -27,7 +28,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer GetGraphemes(Memory input) => new(input.Span, Graphemes.SplitBytes); + public static SplitEnumerator SplitGraphemes(this Memory input) => Split.Graphemes(input.Span); /// /// Split the graphemes in the given of UTF-8 encoded bytes. @@ -36,7 +37,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer GetGraphemes(ReadOnlyMemory input) => new(input.Span, Graphemes.SplitBytes); + public static SplitEnumerator SplitGraphemes(this ReadOnlyMemory input) => Split.Graphemes(input.Span); /// /// Split the graphemes in the given array of UTF-8 encoded bytes. @@ -45,7 +46,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer GetGraphemes(byte[] input) => new(input.AsSpan(), Graphemes.SplitBytes); + public static SplitEnumerator SplitGraphemes(this byte[] input) => Split.Graphemes(input); /// /// Split the graphemes in the given string. @@ -54,7 +55,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer GetGraphemes(string input) => new(input.AsSpan(), Graphemes.SplitChars); + public static SplitEnumerator SplitGraphemes(this string input) => Split.Graphemes(input); /// /// Split the graphemes in the given string. @@ -63,7 +64,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. 
Use foreach (var grapheme in graphemes). /// - public static Tokenizer GetGraphemes(char[] input) => new(input.AsSpan(), Graphemes.SplitChars); + public static SplitEnumerator SplitGraphemes(this char[] input) => Split.Graphemes(input); /// /// Split the graphemes in the given of . @@ -72,8 +73,8 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - /// - public static Tokenizer GetGraphemes(Span input) => new(input, Graphemes.SplitChars); + /// + public static SplitEnumerator SplitGraphemes(this Span input) => Split.Graphemes(input); /// /// Split the graphemes in the given of . @@ -82,7 +83,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer GetGraphemes(ReadOnlySpan input) => new(input, Graphemes.SplitChars); + public static SplitEnumerator SplitGraphemes(this ReadOnlySpan input) => Split.Graphemes(input); /// /// Split the graphemes in the given of . @@ -91,7 +92,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer GetGraphemes(Memory input) => new(input.Span, Graphemes.SplitChars); + public static SplitEnumerator SplitGraphemes(this Memory input) => Split.Graphemes(input.Span); /// /// Split the graphemes in the given of . @@ -100,7 +101,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer GetGraphemes(ReadOnlyMemory input) => new(input.Span, Graphemes.SplitChars); + public static SplitEnumerator SplitGraphemes(this ReadOnlyMemory input) => Split.Graphemes(input.Span); /// /// Split the graphemes in the given of UTF-8 encoded bytes. @@ -109,28 +110,23 @@ public static partial class Tokenizer /// /// Optional, the minimum bytes to buffer from the Stream. This determines the maximum grapheme token size. Tokens that exceed the bytes in the buffer /// will simply be cut off at this length, no error will occur. - /// + /// /// Default is 256 bytes. /// /// /// Optional, a byte array for underlying buffer storage. It must be at least as large at minBufferBytes. - /// + /// /// If not provided, storage of 2 * minBufferBytes will be allocated by default. - /// - /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferBytes allows fewer, larger reads the stream, + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferBytes allows fewer, larger reads the stream, /// which is more efficient, but will increase memory usage. - /// + /// /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. /// /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static StreamTokenizer GetGraphemes(Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) - { - bufferStorage ??= new byte[minBufferBytes * 2]; - var buffer = new Buffer(stream.Read, minBufferBytes, bufferStorage); - return new StreamTokenizer(buffer, Graphemes.SplitBytes); - } + public static StreamEnumerator SplitGraphemes(this Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) => Split.Graphemes(stream, minBufferBytes, bufferStorage); /// /// Split the graphemes in the given / . 
@@ -139,26 +135,21 @@ public static StreamTokenizer GetGraphemes(Stream stream, int minBufferByt /// /// Optional, the minimum chars to buffer from the reader. This determines the maximum grapheme token size. Tokens that exceed the chars in the buffer /// will simply be cut off at this length, no error will occur. - /// + /// /// Default is 256 chars. /// /// /// Optional, a char array for underlying buffer storage. It must be at least as large at minBufferChars. - /// + /// /// If not provided, storage of 2 * minBufferChars will be allocated by default. - /// - /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferChars allows fewer, larger reads the reader, + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferChars allows fewer, larger reads the reader, /// which is more efficient, but will increase memory usage. - /// + /// /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. /// /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static StreamTokenizer GetGraphemes(TextReader stream, int minBufferChars = 1024, char[]? bufferStorage = null) - { - bufferStorage ??= new char[minBufferChars * 2]; - var buffer = new Buffer(stream.Read, minBufferChars, bufferStorage); - return new StreamTokenizer(buffer, Graphemes.SplitChars); - } + public static StreamEnumerator SplitGraphemes(this TextReader stream, int minBufferChars = 1024, char[]? bufferStorage = null) => Split.Graphemes(stream, minBufferChars, bufferStorage); } diff --git a/uax29/Extensions/Extensions.Sentences.cs b/uax29/Extensions/Extensions.Sentences.cs new file mode 100644 index 0000000..37d2a4c --- /dev/null +++ b/uax29/Extensions/Extensions.Sentences.cs @@ -0,0 +1,155 @@ +namespace UAX29.Extensions; +using UAX29; + +public static partial class Extensions +{ + /// + /// Split the graphemes in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static SplitEnumerator SplitSentences(this Span input) => Split.Sentences(input); + + /// + /// Split the graphemes in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static SplitEnumerator SplitSentences(this ReadOnlySpan input) => Split.Sentences(input); + + /// + /// Split the graphemes in the given of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static SplitEnumerator SplitSentences(this Memory input) => Split.Sentences(input.Span); + + /// + /// Split the graphemes in the given of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static SplitEnumerator SplitSentences(this ReadOnlyMemory input) => Split.Sentences(input.Span); + + /// + /// Split the graphemes in the given array of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). 
+ /// + public static SplitEnumerator SplitSentences(this byte[] input) => Split.Sentences(input.AsSpan()); + + /// + /// Split the graphemes in the given string. + /// + /// The string to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static SplitEnumerator SplitSentences(this string input) => Split.Sentences(input); + + /// + /// Split the graphemes in the given string. + /// + /// The string to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static SplitEnumerator SplitSentences(this char[] input) => Split.Sentences(input.AsSpan()); + + /// + /// Split the graphemes in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + /// + public static SplitEnumerator SplitSentences(this Span input) => Split.Sentences(input); + + /// + /// Split the graphemes in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static SplitEnumerator SplitSentences(this ReadOnlySpan input) => Split.Sentences(input); + + /// + /// Split the graphemes in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static SplitEnumerator SplitSentences(this Memory input) => Split.Sentences(input.Span); + + /// + /// Split the graphemes in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static SplitEnumerator SplitSentences(this ReadOnlyMemory input) => Split.Sentences(input.Span); + + /// + /// Split the graphemes in the given of UTF-8 encoded bytes. + /// + /// The stream of UTF-8 bytes to tokenize. + /// + /// Optional, the minimum bytes to buffer from the Stream. This determines the maximum grapheme token size. Tokens that exceed the bytes in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 256 bytes. + /// + /// + /// Optional, a byte array for underlying buffer storage. It must be at least as large at minBufferBytes. + /// + /// If not provided, storage of 2 * minBufferBytes will be allocated by default. + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferBytes allows fewer, larger reads the stream, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static StreamEnumerator SplitSentences(this Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) => Split.Sentences(stream, minBufferBytes, bufferStorage); + + /// + /// Split the graphemes in the given / . + /// + /// The stream/text reader of char to tokenize. + /// + /// Optional, the minimum chars to buffer from the reader. This determines the maximum grapheme token size. Tokens that exceed the chars in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 256 chars. + /// + /// + /// Optional, a char array for underlying buffer storage. It must be at least as large at minBufferChars. + /// + /// If not provided, storage of 2 * minBufferChars will be allocated by default. 
+ /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferChars allows fewer, larger reads the reader, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static StreamEnumerator SplitSentences(this TextReader stream, int minBufferChars = 1024, char[]? bufferStorage = null) => Split.Sentences(stream, minBufferChars, bufferStorage); +} diff --git a/uax29/Extensions/Extensions.Test.cs b/uax29/Extensions/Extensions.Test.cs new file mode 100644 index 0000000..f8e365c --- /dev/null +++ b/uax29/Extensions/Extensions.Test.cs @@ -0,0 +1,178 @@ +namespace Tests; + +using UAX29.Extensions; +using System.Text; + +[TestFixture] +public class TestExtensions +{ + [SetUp] + public void Setup() + { + } + + static int ExpectedOverloads() + { + var expected = 0; + + expected++; // string + expected++; // char[] + expected++; // Span + expected++; // ReadOnlySpan + expected++; // Memory + expected++; // ReadOnlyMemory + + expected++; // byte[] + expected++; // Span + expected++; // ReadOnlySpan + expected++; // Memory + expected++; // ReadOnlyMemory + + expected++; // Stream + expected++; // TextReader + + expected *= 3; // Words, Graphemes, Sentences + + return expected; + } + + + [Test] + public void Overloads() + { + // no assertions, just needs to compile + + int expected = ExpectedOverloads(); + int got = 0; + + var input = "Hello, how are you?"; + var bytes = Encoding.UTF8.GetBytes(input); + using var stream = new MemoryStream(bytes); + using var reader = new StreamReader(stream); + + // Chars + { + // string + input.SplitWords(); got++; + + // char[] + input.ToCharArray().SplitWords(); got++; + + // ReadOnlySpan + input.AsSpan().SplitWords(); got++; + + // Span + var span = new Span(input.ToCharArray()); + span.SplitWords(); got++; + + // Memory + var mem = new Memory(input.ToCharArray()); + mem.SplitWords(); got++; + + // ReadOnlyMemory + ReadOnlyMemory rmem = input.AsMemory(); + rmem.SplitWords(); got++; + + reader.SplitWords(); got++; + } + { + input.SplitGraphemes(); got++; + + var array = input.ToCharArray(); + array.SplitGraphemes(); got++; + + var span = new Span(array); + span.SplitGraphemes(); got++; + + ReadOnlySpan rspan = input.AsSpan(); + rspan.SplitGraphemes(); got++; + + var mem = new Memory(array); + mem.SplitGraphemes(); got++; + + ReadOnlyMemory rmem = input.AsMemory(); + rmem.SplitGraphemes(); got++; + + reader.SplitGraphemes(); got++; + } + { + input.SplitSentences(); got++; + + var array = input.ToCharArray(); + array.SplitSentences(); got++; + + var span = new Span(array); + span.SplitSentences(); got++; + + ReadOnlySpan rspan = input.AsSpan(); + rspan.SplitSentences(); got++; + + var mem = new Memory(array); + mem.SplitSentences(); got++; + + ReadOnlyMemory rmem = input.AsMemory(); + rmem.SplitSentences(); got++; + + reader.SplitSentences(); got++; + } + + // Bytes + { + bytes.SplitWords(); got++; + + Span span = bytes.AsSpan(); + span.SplitWords(); got++; + + ReadOnlySpan rspan = bytes.AsSpan(); + rspan.SplitWords(); got++; + + Memory mem = bytes.AsMemory(); + mem.SplitWords(); got++; + + ReadOnlyMemory rmem = bytes.AsMemory(); + rmem.SplitWords(); got++; + + stream.SplitWords(); got++; + } + { + bytes.SplitGraphemes(); got++; + + Span span = bytes.AsSpan(); + span.SplitGraphemes(); got++; + + ReadOnlySpan rspan = 
bytes.AsSpan(); + rspan.SplitGraphemes(); got++; + + Memory mem = bytes.AsMemory(); + mem.SplitGraphemes(); got++; + + ReadOnlyMemory rmem = bytes.AsMemory(); + rmem.SplitGraphemes(); got++; + + stream.SplitGraphemes(); got++; + } + + + { + // bytes + + bytes.SplitSentences(); got++; + + Span span = bytes.AsSpan(); + span.SplitSentences(); got++; + + ReadOnlySpan rspan = bytes.AsSpan(); + rspan.SplitSentences(); got++; + + Memory mem = bytes.AsMemory(); + mem.SplitSentences(); got++; + + ReadOnlyMemory rmem = bytes.AsMemory(); + rmem.SplitSentences(); got++; + + stream.SplitSentences(); got++; + } + + Assert.That(got, Is.EqualTo(expected)); + } +} diff --git a/uax29/Tokenizer.Words.cs b/uax29/Extensions/Extensions.Words.cs similarity index 69% rename from uax29/Tokenizer.Words.cs rename to uax29/Extensions/Extensions.Words.cs index 1755ebf..884c06a 100644 --- a/uax29/Tokenizer.Words.cs +++ b/uax29/Extensions/Extensions.Words.cs @@ -1,164 +1,157 @@ -namespace UAX29; - -public static partial class Tokenizer -{ - /// - /// Split the words in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ - /// - /// The UTF-8 bytes to tokenize. - /// - /// An enumerator of words. Use foreach (var word in words). - /// - public static Tokenizer GetWords(Span input, Options options = Options.None) => new(input, Words.SplitBytes, options); - - /// - /// Split the words in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ - /// - /// The UTF-8 bytes to tokenize. - /// - /// An enumerator of words. Use foreach (var word in words). - /// - public static Tokenizer GetWords(ReadOnlySpan input, Options options = Options.None) => new(input, Words.SplitBytes, options); - - /// - /// Split the words in the given of UTF-8 encoded bytes. - /// - /// The UTF-8 bytes to tokenize. - /// - /// An enumerator of words. Use foreach (var word in words). - /// - public static Tokenizer GetWords(Memory input, Options options = Options.None) => new(input.Span, Words.SplitBytes, options); - - /// - /// Split the words in the given of UTF-8 encoded bytes. - /// - /// The UTF-8 bytes to tokenize. - /// - /// An enumerator of words. Use foreach (var word in words). - /// - public static Tokenizer GetWords(ReadOnlyMemory input, Options options = Options.None) => new(input.Span, Words.SplitBytes, options); - - /// - /// Split the words in the given array of UTF-8 encoded bytes. - /// - /// The UTF-8 bytes to tokenize. - /// - /// An enumerator of words. Use foreach (var word in words). - /// - public static Tokenizer GetWords(byte[] input, Options options = Options.None) => new(input.AsSpan(), Words.SplitBytes, options); - - /// - /// Split the words in the given string. - /// - /// The string to tokenize. - /// - /// An enumerator of words. Use foreach (var word in words). - /// - public static Tokenizer GetWords(string input, Options options = Options.None) => new(input.AsSpan(), Words.SplitChars, options); - - /// - /// Split the words in the given string. - /// - /// The string to tokenize. - /// - /// An enumerator of words. Use foreach (var word in words). - /// - public static Tokenizer GetWords(char[] input, Options options = Options.None) => new(input.AsSpan(), Words.SplitChars, options); - - /// - /// Split the words in the given of . - /// - /// The chars to tokenize. - /// - /// An enumerator of words. Use foreach (var word in words). 
- /// - /// - public static Tokenizer GetWords(Span input, Options options = Options.None) => new(input, Words.SplitChars, options); - - /// - /// Split the words in the given of . - /// - /// The chars to tokenize. - /// - /// An enumerator of words. Use foreach (var word in words). - /// - public static Tokenizer GetWords(ReadOnlySpan input, Options options = Options.None) => new(input, Words.SplitChars, options); - - /// - /// Split the words in the given of . - /// - /// The chars to tokenize. - /// - /// An enumerator of words. Use foreach (var word in words). - /// - public static Tokenizer GetWords(Memory input, Options options = Options.None) => new(input.Span, Words.SplitChars, options); - - /// - /// Split the words in the given of . - /// - /// The chars to tokenize. - /// - /// An enumerator of words. Use foreach (var word in words). - /// - public static Tokenizer GetWords(ReadOnlyMemory input, Options options = Options.None) => new(input.Span, Words.SplitChars, options); - - /// - /// Split the words in the given of UTF-8 encoded bytes. - /// - /// The stream of UTF-8 bytes to tokenize. - /// - /// Optional, the minimum bytes to buffer from the Stream. This determines the maximum word token size. Tokens that exceed the bytes in the buffer - /// will simply be cut off at this length, no error will occur. - /// - /// Default is 1024 bytes. The tokenizer is intended for natural language, so we don't expect you'll find text with a word beyond a couple of dozen bytes. - /// - /// - /// Optional, a byte array for underlying buffer storage. It must be at least as large at minBufferBytes. - /// - /// If not provided, storage of 2 * minBufferBytes will be allocated by default. - /// - /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferBytes allows fewer, larger reads the stream, - /// which is more efficient, but will increase memory usage. - /// - /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. - /// - /// - /// An enumerator of words. Use foreach (var word in words). - /// - public static StreamTokenizer GetWords(Stream stream, Options options = Options.None, int minBufferBytes = 1024, byte[]? bufferStorage = null) - { - bufferStorage ??= new byte[minBufferBytes * 2]; - var buffer = new Buffer(stream.Read, minBufferBytes, bufferStorage); - return new StreamTokenizer(buffer, Words.SplitBytes, options); - } - - /// - /// Split the words in the given / . - /// - /// The stream/text reader of char to tokenize. - /// - /// Optional, the minimum chars to buffer from the reader. This determines the maximum word token size. Tokens that exceed the chars in the buffer - /// will simply be cut off at this length, no error will occur. - /// - /// Default is 1024 chars. The tokenizer is intended for natural language, so we don't expect you'll find text with a word beyond a few dozen chars. - /// - /// - /// Optional, a char array for underlying buffer storage. It must be at least as large at minBufferChars. - /// - /// If not provided, storage of 2 * minBufferChars will be allocated by default. - /// - /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferChars allows fewer, larger reads the reader, - /// which is more efficient, but will increase memory usage. - /// - /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. - /// - /// - /// An enumerator of words. Use foreach (var word in words). 
- /// - public static StreamTokenizer GetWords(TextReader stream, Options options = Options.None, int minBufferChars = 1024, char[]? bufferStorage = null) - { - bufferStorage ??= new char[minBufferChars * 2]; - var buffer = new Buffer(stream.Read, minBufferChars, bufferStorage); - return new StreamTokenizer(buffer, Words.SplitChars, options); - } -} +namespace UAX29.Extensions; +using UAX29; + +public static partial class Extensions +{ + /// + /// Split the words in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static SplitEnumerator SplitWords(this Span input, Options options = Options.None) => Split.Words(input, options); + + /// + /// Split the words in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static SplitEnumerator SplitWords(this ReadOnlySpan input, Options options = Options.None) => Split.Words(input, options); + + /// + /// Split the words in the given of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static SplitEnumerator SplitWords(this Memory input, Options options = Options.None) => Split.Words(input.Span, options); + + /// + /// Split the words in the given of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static SplitEnumerator SplitWords(this ReadOnlyMemory input, Options options = Options.None) => Split.Words(input.Span, options); + + /// + /// Split the words in the given array of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static SplitEnumerator SplitWords(this byte[] input, Options options = Options.None) => Split.Words(input.AsSpan(), options); + + /// + /// Split the words in the given string. + /// + /// The string to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static SplitEnumerator SplitWords(this string input, Options options = Options.None) => Split.Words(input, options); + + /// + /// Split the words in the given string. + /// + /// The string to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static SplitEnumerator SplitWords(this char[] input, Options options = Options.None) => Split.Words(input.AsSpan(), options); + + /// + /// Split the words in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + /// + public static SplitEnumerator SplitWords(this Span input, Options options = Options.None) => Split.Words(input, options); + + /// + /// Split the words in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static SplitEnumerator SplitWords(this ReadOnlySpan input, Options options = Options.None) => Split.Words(input, options); + + /// + /// Split the words in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). 
+ /// + public static SplitEnumerator SplitWords(this Memory input, Options options = Options.None) => Split.Words(input.Span, options); + + /// + /// Split the words in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static SplitEnumerator SplitWords(this ReadOnlyMemory input, Options options = Options.None) => Split.Words(input.Span, options); + + /// + /// Split the words in the given of UTF-8 encoded bytes. + /// + /// The stream of UTF-8 bytes to tokenize. + /// + /// Optional, the minimum bytes to buffer from the Stream. This determines the maximum word token size. Tokens that exceed the bytes in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 1024 bytes. The tokenizer is intended for natural language, so we don't expect you'll find text with a word beyond a couple of dozen bytes. + /// + /// + /// Optional, a byte array for underlying buffer storage. It must be at least as large at minBufferBytes. + /// + /// If not provided, storage of 2 * minBufferBytes will be allocated by default. + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferBytes allows fewer, larger reads the stream, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static StreamEnumerator SplitWords(this Stream stream, Options options = Options.None, int minBufferBytes = 1024, byte[]? bufferStorage = null) + => Split.Words(stream, options, minBufferBytes, bufferStorage); + + /// + /// Split the words in the given / . + /// + /// The stream/text reader of char to tokenize. + /// + /// Optional, the minimum chars to buffer from the reader. This determines the maximum word token size. Tokens that exceed the chars in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 1024 chars. The tokenizer is intended for natural language, so we don't expect you'll find text with a word beyond a few dozen chars. + /// + /// + /// Optional, a char array for underlying buffer storage. It must be at least as large at minBufferChars. + /// + /// If not provided, storage of 2 * minBufferChars will be allocated by default. + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferChars allows fewer, larger reads the reader, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static StreamEnumerator SplitWords(this TextReader stream, Options options = Options.None, int minBufferChars = 1024, char[]? 
bufferStorage = null) + => Split.Words(stream, options, minBufferChars, bufferStorage); +} diff --git a/uax29/Graphemes.Dict.cs b/uax29/Graphemes.Dict.cs index 0c3838d..9d3f2aa 100644 --- a/uax29/Graphemes.Dict.cs +++ b/uax29/Graphemes.Dict.cs @@ -21,7 +21,7 @@ internal static partial class Graphemes const Property ZWJ = 4096; const Property Extended_Pictographic = 8192; - internal static readonly Dict Dict = new(GetDict()); + static readonly Dict Dict = new(GetDict()); static Dictionary GetDict() => new() { {0x0600, Prepend}, diff --git a/uax29/Graphemes.Splitter.cs b/uax29/Graphemes.Splitter.cs index 97e54cd..25be246 100644 --- a/uax29/Graphemes.Splitter.cs +++ b/uax29/Graphemes.Splitter.cs @@ -27,7 +27,7 @@ internal Splitter(Decoders decoders) /// The string in which to split graphemes. /// Ignore, only applicable to splitting words, not graphemes. /// The number of bytes/chars that comprise the grapheme. - internal int Split(ReadOnlySpan input, out Property _) // this out param is only relevant in Words.Splitter + internal int Split(ReadOnlySpan input, out bool _) // this out param is only relevant in Words.Splitter { Debug.Assert(input.Length > 0); @@ -163,7 +163,7 @@ internal int Split(ReadOnlySpan input, out Property _) // this out para break; } - _ = 0; // see the Property out parameter at tops + _ = false; // see the out parameter at top return pos; } } diff --git a/uax29/Graphemes.Test.cs b/uax29/Graphemes.Test.cs index 4bde5fc..03254f7 100644 --- a/uax29/Graphemes.Test.cs +++ b/uax29/Graphemes.Test.cs @@ -7,7 +7,41 @@ namespace Tests; [TestFixture] public class GraphemesTests { - internal readonly static UnicodeTest[] UnicodeTests = [ + static UnicodeTest[] Tests => UnicodeTests; + + [Test, TestCaseSource(nameof(Tests))] + public void Bytes(UnicodeTest test) + { + var tokens = Split.Graphemes(test.input); + TestUnicode.TestBytes(tokens, test); + } + + [Test, TestCaseSource(nameof(Tests))] + public void String(UnicodeTest test) + { + var s = Encoding.UTF8.GetString(test.input); + var tokens = Split.Graphemes(s); + TestUnicode.TestChars(tokens, test); + } + + [Test, TestCaseSource(nameof(Tests))] + public void Stream(UnicodeTest test) + { + using var stream = new MemoryStream(test.input); + var tokens = Split.Graphemes(stream); + TestUnicode.TestStream(tokens, test); + } + + [Test, TestCaseSource(nameof(Tests))] + public void TextReader(UnicodeTest test) + { + using var stream = new MemoryStream(test.input); + using var reader = new StreamReader(stream); + var tokens = Split.Graphemes(reader); + TestUnicode.TestTextReader(tokens, test); + } + + readonly static UnicodeTest[] UnicodeTests = [ new([0x0020, 0x0020], [[0x0020], [0x0020]], "÷ [0.2] SPACE (Other) ÷ [999.0] SPACE (Other) ÷ [0.3]"), new([0x0020, 0x00CC, 0x0088, 0x0020], [[0x0020, 0x00CC, 0x0088], [0x0020]], "÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend_ExtCccZwj) ÷ [999.0] SPACE (Other) ÷ [0.3]"), new([0x0020, 0x000D], [[0x0020], [0x000D]], "÷ [0.2] SPACE (Other) ÷ [5.0] (CR) ÷ [0.3]"), @@ -612,38 +646,4 @@ public class GraphemesTests new([0x0061, 0x00E2, 0x0080, 0x008D, 0x00E2, 0x009C, 0x0081], [[0x0061, 0x00E2, 0x0080, 0x008D], [0x00E2, 0x009C, 0x0081]], "÷ [0.2] LATIN SMALL LETTER A (Other) × [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) ÷ [999.0] UPPER BLADE SCISSORS (Other) ÷ [0.3]"), ]; - - static readonly UnicodeTest[] Tests = UnicodeTests; - - [Test, TestCaseSource(nameof(Tests))] - public void Bytes(UnicodeTest test) - { - var tokens = Tokenizer.GetGraphemes(test.input); - 
TestUnicode.TestTokenizerBytes(tokens, test); - } - - [Test, TestCaseSource(nameof(Tests))] - public void String(UnicodeTest test) - { - var s = Encoding.UTF8.GetString(test.input); - var tokens = Tokenizer.GetGraphemes(s); - TestUnicode.TestTokenizerChars(tokens, test); - } - - [Test, TestCaseSource(nameof(Tests))] - public void Stream(UnicodeTest test) - { - using var stream = new MemoryStream(test.input); - var tokens = Tokenizer.GetGraphemes(stream); - TestUnicode.TestTokenizerStream(tokens, test); - } - - [Test, TestCaseSource(nameof(Tests))] - public void TextReader(UnicodeTest test) - { - using var stream = new MemoryStream(test.input); - using var reader = new StreamReader(stream); - var tokens = Tokenizer.GetGraphemes(reader); - TestUnicode.TestTokenizerTextReader(tokens, test); - } } diff --git a/uax29/Legacy/Tokenizer.Graphemes.cs b/uax29/Legacy/Tokenizer.Graphemes.cs new file mode 100644 index 0000000..3a331a3 --- /dev/null +++ b/uax29/Legacy/Tokenizer.Graphemes.cs @@ -0,0 +1,177 @@ +namespace UAX29; + +public static partial class Tokenizer +{ + /// + /// Split the graphemes in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + [Obsolete("Use Split.Graphemes(input) or input.SplitGraphemes()")] + public static SplitEnumerator GetGraphemes(Span input) => new(input, Graphemes.SplitBytes); + + /// + /// Split the graphemes in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + [Obsolete("Use Split.Graphemes(input) or input.SplitGraphemes()")] + public static SplitEnumerator GetGraphemes(ReadOnlySpan input) => new(input, Graphemes.SplitBytes); + + /// + /// Split the graphemes in the given of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + [Obsolete("Use Split.Graphemes(input) or input.SplitGraphemes()")] + public static SplitEnumerator GetGraphemes(Memory input) => new(input.Span, Graphemes.SplitBytes); + + /// + /// Split the graphemes in the given of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + [Obsolete("Use Split.Graphemes(input) or input.SplitGraphemes()")] + public static SplitEnumerator GetGraphemes(ReadOnlyMemory input) => new(input.Span, Graphemes.SplitBytes); + + /// + /// Split the graphemes in the given array of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + [Obsolete("Use Split.Graphemes(input) or input.SplitGraphemes()")] + public static SplitEnumerator GetGraphemes(byte[] input) => new(input.AsSpan(), Graphemes.SplitBytes); + + /// + /// Split the graphemes in the given string. + /// + /// The string to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + [Obsolete("Use Split.Graphemes(input) or input.SplitGraphemes()")] + public static SplitEnumerator GetGraphemes(string input) => new(input.AsSpan(), Graphemes.SplitChars); + + /// + /// Split the graphemes in the given string. + /// + /// The string to tokenize. 
+ /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + [Obsolete("Use Split.Graphemes(input) or input.SplitGraphemes()")] + public static SplitEnumerator GetGraphemes(char[] input) => new(input.AsSpan(), Graphemes.SplitChars); + + /// + /// Split the graphemes in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + /// + [Obsolete("Use Split.Graphemes(input) or input.SplitGraphemes()")] + public static SplitEnumerator GetGraphemes(Span input) => new(input, Graphemes.SplitChars); + + /// + /// Split the graphemes in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + [Obsolete("Use Split.Graphemes(input) or input.SplitGraphemes()")] + public static SplitEnumerator GetGraphemes(ReadOnlySpan input) => new(input, Graphemes.SplitChars); + + /// + /// Split the graphemes in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + [Obsolete("Use Split.Graphemes(input) or input.SplitGraphemes()")] + public static SplitEnumerator GetGraphemes(Memory input) => new(input.Span, Graphemes.SplitChars); + + /// + /// Split the graphemes in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + [Obsolete("Use Split.Graphemes(input) or input.SplitGraphemes()")] + public static SplitEnumerator GetGraphemes(ReadOnlyMemory input) => new(input.Span, Graphemes.SplitChars); + + /// + /// Split the graphemes in the given of UTF-8 encoded bytes. + /// + /// The stream of UTF-8 bytes to tokenize. + /// + /// Optional, the minimum bytes to buffer from the Stream. This determines the maximum grapheme token size. Tokens that exceed the bytes in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 256 bytes. + /// + /// + /// Optional, a byte array for underlying buffer storage. It must be at least as large at minBufferBytes. + /// + /// If not provided, storage of 2 * minBufferBytes will be allocated by default. + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferBytes allows fewer, larger reads the stream, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + [Obsolete("Use Split.Graphemes(stream) or stream.SplitGraphemes()")] + public static StreamEnumerator GetGraphemes(Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) + { + bufferStorage ??= new byte[minBufferBytes * 2]; + var buffer = new Buffer(stream.Read, minBufferBytes, bufferStorage); + return new StreamEnumerator(buffer, Graphemes.SplitBytes); + } + + /// + /// Split the graphemes in the given / . + /// + /// The stream/text reader of char to tokenize. + /// + /// Optional, the minimum chars to buffer from the reader. This determines the maximum grapheme token size. Tokens that exceed the chars in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 256 chars. + /// + /// + /// Optional, a char array for underlying buffer storage. It must be at least as large at minBufferChars. 
+ /// + /// If not provided, storage of 2 * minBufferChars will be allocated by default. + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferChars allows fewer, larger reads the reader, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + [Obsolete("Use Split.Graphemes(stream) or stream.SplitGraphemes()")] + public static StreamEnumerator GetGraphemes(TextReader stream, int minBufferChars = 1024, char[]? bufferStorage = null) + { + bufferStorage ??= new char[minBufferChars * 2]; + var buffer = new Buffer(stream.Read, minBufferChars, bufferStorage); + return new StreamEnumerator(buffer, Graphemes.SplitChars); + } +} diff --git a/uax29/Tokenizer.Sentences.cs b/uax29/Legacy/Tokenizer.Sentences.cs similarity index 67% rename from uax29/Tokenizer.Sentences.cs rename to uax29/Legacy/Tokenizer.Sentences.cs index 0169ffd..d02a36b 100644 --- a/uax29/Tokenizer.Sentences.cs +++ b/uax29/Legacy/Tokenizer.Sentences.cs @@ -9,7 +9,8 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static Tokenizer GetSentences(Span input) => new(input, Sentences.SplitBytes); + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] + public static SplitEnumerator GetSentences(Span input) => new(input, Sentences.SplitBytes); /// /// Split the sentences in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ @@ -18,7 +19,8 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static Tokenizer GetSentences(ReadOnlySpan input) => new(input, Sentences.SplitBytes); + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] + public static SplitEnumerator GetSentences(ReadOnlySpan input) => new(input, Sentences.SplitBytes); /// /// Split the sentences in the given of UTF-8 encoded bytes. @@ -27,7 +29,8 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static Tokenizer GetSentences(Memory input) => new(input.Span, Sentences.SplitBytes); + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] + public static SplitEnumerator GetSentences(Memory input) => new(input.Span, Sentences.SplitBytes); /// /// Split the sentences in the given of UTF-8 encoded bytes. @@ -36,7 +39,8 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static Tokenizer GetSentences(ReadOnlyMemory input) => new(input.Span, Sentences.SplitBytes); + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] + public static SplitEnumerator GetSentences(ReadOnlyMemory input) => new(input.Span, Sentences.SplitBytes); /// /// Split the sentences in the given array of UTF-8 encoded bytes. @@ -45,7 +49,8 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). 
diff --git a/uax29/Tokenizer.Sentences.cs b/uax29/Legacy/Tokenizer.Sentences.cs similarity index 67% rename from uax29/Tokenizer.Sentences.cs rename to uax29/Legacy/Tokenizer.Sentences.cs index 0169ffd..d02a36b 100644 --- a/uax29/Tokenizer.Sentences.cs +++ b/uax29/Legacy/Tokenizer.Sentences.cs @@ -9,7 +9,8 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static Tokenizer GetSentences(Span input) => new(input, Sentences.SplitBytes); + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] + public static SplitEnumerator GetSentences(Span input) => new(input, Sentences.SplitBytes); /// /// Split the sentences in the given ReadOnlySpan of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ @@ -18,7 +19,8 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static Tokenizer GetSentences(ReadOnlySpan input) => new(input, Sentences.SplitBytes); + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] + public static SplitEnumerator GetSentences(ReadOnlySpan input) => new(input, Sentences.SplitBytes); /// /// Split the sentences in the given Memory of UTF-8 encoded bytes. @@ -27,7 +29,8 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static Tokenizer GetSentences(Memory input) => new(input.Span, Sentences.SplitBytes); + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] + public static SplitEnumerator GetSentences(Memory input) => new(input.Span, Sentences.SplitBytes); /// /// Split the sentences in the given ReadOnlyMemory of UTF-8 encoded bytes. @@ -36,7 +39,8 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static Tokenizer GetSentences(ReadOnlyMemory input) => new(input.Span, Sentences.SplitBytes); + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] + public static SplitEnumerator GetSentences(ReadOnlyMemory input) => new(input.Span, Sentences.SplitBytes); /// /// Split the sentences in the given array of UTF-8 encoded bytes. @@ -45,7 +49,8 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static Tokenizer GetSentences(byte[] input) => new(input.AsSpan(), Sentences.SplitBytes); + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] + public static SplitEnumerator GetSentences(byte[] input) => new(input.AsSpan(), Sentences.SplitBytes); /// /// Split the sentences in the given string. @@ -54,7 +59,8 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static Tokenizer GetSentences(string input) => new(input.AsSpan(), Sentences.SplitChars); + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] + public static SplitEnumerator GetSentences(string input) => new(input.AsSpan(), Sentences.SplitChars); /// /// Split the sentences in the given array of chars. @@ -63,7 +69,8 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static Tokenizer GetSentences(char[] input) => new(input.AsSpan(), Sentences.SplitChars); + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] + public static SplitEnumerator GetSentences(char[] input) => new(input.AsSpan(), Sentences.SplitChars); /// /// Split the sentences in the given Span of char. @@ -72,8 +79,8 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - /// - public static Tokenizer GetSentences(Span input) => new(input, Sentences.SplitChars); + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] + public static SplitEnumerator GetSentences(Span input) => new(input, Sentences.SplitChars); /// /// Split the sentences in the given ReadOnlySpan of char. @@ -82,7 +89,8 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static Tokenizer GetSentences(ReadOnlySpan input) => new(input, Sentences.SplitChars); + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] + public static SplitEnumerator GetSentences(ReadOnlySpan input) => new(input, Sentences.SplitChars); /// /// Split the sentences in the given Memory of char. @@ -91,7 +99,8 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static Tokenizer GetSentences(Memory input) => new(input.Span, Sentences.SplitChars); + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] + public static SplitEnumerator GetSentences(Memory input) => new(input.Span, Sentences.SplitChars); /// /// Split the sentences in the given ReadOnlyMemory of char. @@ -100,9 +109,8 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static Tokenizer GetSentences(ReadOnlyMemory input) => new(input.Span, Sentences.SplitChars); - - + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] + public static SplitEnumerator GetSentences(ReadOnlyMemory input) => new(input.Span, Sentences.SplitChars); /// /// Split the sentences in the given Stream of UTF-8 encoded bytes. @@ -111,27 +119,28 @@ public static partial class Tokenizer /// /// Optional, the minimum bytes to buffer from the Stream. This determines the maximum sentence token size. Tokens that exceed the bytes in the buffer /// will simply be cut off at this length, no error will occur. - /// + /// /// Default is 1024 bytes. /// /// /// Optional, a byte array for underlying buffer storage. It must be at least as large as minBufferBytes.
- /// + /// If not provided, storage of 2 * minBufferBytes will be allocated by default. - /// - /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferBytes allows fewer, larger reads the stream, + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferBytes allows fewer, larger reads of the stream, /// which is more efficient, but will increase memory usage. - /// + /// /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. /// /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static StreamTokenizer GetSentences(Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) + [Obsolete("Use Split.Sentences(stream) or stream.SplitSentences()")] + public static StreamEnumerator GetSentences(Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) { bufferStorage ??= new byte[minBufferBytes * 2]; var buffer = new Buffer(stream.Read, minBufferBytes, bufferStorage); - return new StreamTokenizer(buffer, Sentences.SplitBytes); + return new StreamEnumerator(buffer, Sentences.SplitBytes); } /// @@ -141,26 +150,27 @@ public static StreamTokenizer GetSentences(Stream stream, int minBufferByt /// /// Optional, the minimum chars to buffer from the reader. This determines the maximum sentence token size. Tokens that exceed the chars in the buffer /// will simply be cut off at this length, no error will occur. - /// + /// /// Default is 1024 chars. /// /// /// Optional, a char array for underlying buffer storage. It must be at least as large as minBufferChars. - /// + /// /// If not provided, storage of 2 * minBufferChars will be allocated by default. - /// - /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferChars allows fewer, larger reads the reader, + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferChars allows fewer, larger reads of the reader, /// which is more efficient, but will increase memory usage. - /// + /// /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. /// /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static StreamTokenizer GetSentences(TextReader stream, int minBufferChars = 1024, char[]? bufferStorage = null) + [Obsolete("Use Split.Sentences(stream) or stream.SplitSentences()")] + public static StreamEnumerator GetSentences(TextReader stream, int minBufferChars = 1024, char[]? bufferStorage = null) { bufferStorage ??= new char[minBufferChars * 2]; var buffer = new Buffer(stream.Read, minBufferChars, bufferStorage); - return new StreamTokenizer(buffer, Sentences.SplitChars); + return new StreamEnumerator(buffer, Sentences.SplitChars); } }
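For migration reference, the replacement named in these [Obsolete] messages is a direct rename. A small sketch of the new call; the expected split follows the UAX #29 convention that trailing spaces attach to the preceding sentence:

```csharp
using UAX29;

var text = "Hello, how are you? I am fine.";

// v2: var sentences = Tokenizer.GetSentences(text);
// v3:
var sentences = Split.Sentences(text);
foreach (var sentence in sentences)
{
    // expect "Hello, how are you? " then "I am fine."
    Console.WriteLine($"[{sentence.ToString()}]");
}
```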
+ /// + [Obsolete("Use Split.Words(input) or input.SplitWords()")] + public static SplitEnumerator GetWords(Span input, Options options = Options.None) => new(input, Words.SplitBytes, options); + + /// + /// Split the words in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + [Obsolete("Use Split.Words(input) or input.SplitWords()")] + public static SplitEnumerator GetWords(ReadOnlySpan input, Options options = Options.None) => new(input, Words.SplitBytes, options); + + /// + /// Split the words in the given of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + [Obsolete("Use Split.Words(input) or input.SplitWords()")] + public static SplitEnumerator GetWords(Memory input, Options options = Options.None) => new(input.Span, Words.SplitBytes, options); + + /// + /// Split the words in the given of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + [Obsolete("Use Split.Words(input) or input.SplitWords()")] + public static SplitEnumerator GetWords(ReadOnlyMemory input, Options options = Options.None) => new(input.Span, Words.SplitBytes, options); + + /// + /// Split the words in the given array of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + [Obsolete("Use Split.Words(input) or input.SplitWords()")] + public static SplitEnumerator GetWords(byte[] input, Options options = Options.None) => new(input.AsSpan(), Words.SplitBytes, options); + + /// + /// Split the words in the given string. + /// + /// The string to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + [Obsolete("Use Split.Words(input) or input.SplitWords()")] + public static SplitEnumerator GetWords(string input, Options options = Options.None) => new(input.AsSpan(), Words.SplitChars, options); + + /// + /// Split the words in the given string. + /// + /// The string to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + [Obsolete("Use Split.Words(input) or input.SplitWords()")] + public static SplitEnumerator GetWords(char[] input, Options options = Options.None) => new(input.AsSpan(), Words.SplitChars, options); + + /// + /// Split the words in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + [Obsolete("Use Split.Words(input) or input.SplitWords()")] + public static SplitEnumerator GetWords(Span input, Options options = Options.None) => new(input, Words.SplitChars, options); + + /// + /// Split the words in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + [Obsolete("Use Split.Words(input) or input.SplitWords()")] + public static SplitEnumerator GetWords(ReadOnlySpan input, Options options = Options.None) => new(input, Words.SplitChars, options); + + /// + /// Split the words in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). 
+ /// + [Obsolete("Use Split.Words(input) or input.SplitWords()")] + public static SplitEnumerator GetWords(Memory input, Options options = Options.None) => new(input.Span, Words.SplitChars, options); + + /// + /// Split the words in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + [Obsolete("Use Split.Words(input) or input.SplitWords()")] + public static SplitEnumerator GetWords(ReadOnlyMemory input, Options options = Options.None) => new(input.Span, Words.SplitChars, options); + + /// + /// Split the words in the given of UTF-8 encoded bytes. + /// + /// The stream of UTF-8 bytes to tokenize. + /// + /// Optional, the minimum bytes to buffer from the Stream. This determines the maximum word token size. Tokens that exceed the bytes in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 1024 bytes. The tokenizer is intended for natural language, so we don't expect you'll find text with a word beyond a couple of dozen bytes. + /// + /// + /// Optional, a byte array for underlying buffer storage. It must be at least as large at minBufferBytes. + /// + /// If not provided, storage of 2 * minBufferBytes will be allocated by default. + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferBytes allows fewer, larger reads the stream, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of words. Use foreach (var word in words). + /// + [Obsolete("Use Split.Words(stream) or stream.SplitWords()")] + public static StreamEnumerator GetWords(Stream stream, Options options = Options.None, int minBufferBytes = 1024, byte[]? bufferStorage = null) + { + bufferStorage ??= new byte[minBufferBytes * 2]; + var buffer = new Buffer(stream.Read, minBufferBytes, bufferStorage); + return new StreamEnumerator(buffer, Words.SplitBytes, options); + } + + /// + /// Split the words in the given / . + /// + /// The stream/text reader of char to tokenize. + /// + /// Optional, the minimum chars to buffer from the reader. This determines the maximum word token size. Tokens that exceed the chars in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 1024 chars. The tokenizer is intended for natural language, so we don't expect you'll find text with a word beyond a few dozen chars. + /// + /// + /// Optional, a char array for underlying buffer storage. It must be at least as large at minBufferChars. + /// + /// If not provided, storage of 2 * minBufferChars will be allocated by default. + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferChars allows fewer, larger reads the reader, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of words. Use foreach (var word in words). + /// + [Obsolete("Use Split.Words(stream) or stream.SplitWords()")] + public static StreamEnumerator GetWords(TextReader stream, Options options = Options.None, int minBufferChars = 1024, char[]? 
diff --git a/uax29/Options.cs b/uax29/Options.cs index dfe3b2b..8f83c15 100644 --- a/uax29/Options.cs +++ b/uax29/Options.cs @@ -12,12 +12,7 @@ public enum Options : byte None = 0, /// - /// Omit tokens that consist entirely of whitespace, defined as UAX #29 WSegSpace | CR | LF | Tab. - /// - /// “Whitespace” in this implementation includes those which delimit words, but not all characters that are categorically whitespace. - /// For example, “non-breaking space” is whitespace, but it’s not what you want when splitting words, and so - /// it is not considered whitespace for our purposes. - /// + /// Omit tokens that consist entirely of whitespace, as defined by char.IsWhiteSpace. /// * Only supported for splitting Words; ignored for Graphemes and Sentences. * /// OmitWhitespace = 1, diff --git a/uax29/README.md b/uax29/README.md index 0a1a004..483938e 100644 --- a/uax29/README.md +++ b/uax29/README.md @@ -6,6 +6,8 @@ Any time our code operates on individual words, we are tokenizing. Often, we do ### Example +_⚠️ This documentation on `main` refers to v3, which is not yet published on NuGet. See [v2 documentation](https://github.com/clipperhouse/uax29.net/blob/v2.2.0/README.md) until then._ + ``` dotnet add package UAX29 ``` @@ -19,7 +21,7 @@ var example = "Hello, 🌏 world. 你好,世界."; // The tokenizer can split words, graphemes or sentences. // It operates on strings, UTF-8 bytes, and streams. -var words = Tokenizer.GetWords(example); +var words = Split.Words(example); // Iterate over the tokens foreach (var word in words) @@ -47,7 +49,7 @@ world */ var utf8bytes = Encoding.UTF8.GetBytes(example); -var graphemes = Tokenizer.GetGraphemes(utf8bytes); +var graphemes = Split.Graphemes(utf8bytes); // Iterate over the tokens foreach (var grapheme in graphemes) @@ -84,19 +86,29 @@ d */ ``` +There are also optional extension methods in the spirit of `string.Split`: + +```csharp +using UAX29.Extensions; + +example.SplitWords(); +``` + ### Data types For UTF-8 bytes, pass `byte[]`, `Span` or `Stream`; the resulting tokens will be `ReadOnlySpan`. For strings/chars, pass `string`, `char[]`, `Span` or `TextReader`/`StreamReader`; the resulting tokens will be `ReadOnlySpan`. +If you have `Memory`, pass `Memory.Span`. + ### Conformance We use the official Unicode [test suites](https://unicode.org/reports/tr41/tr41-26.html#Tests29). Status: [![.NET](https://github.com/clipperhouse/uax29.net/actions/workflows/dotnet.yml/badge.svg)](https://github.com/clipperhouse/uax29.net/actions/workflows/dotnet.yml) -This is the same algorithm that is implemented in Lucene's [StandardTokenizer](https://lucene.apache.org/core/6_5_0/core/org/apache/lucene/analysis/standard/StandardTokenizer.html). +This is the same spec that is implemented in Lucene's [StandardTokenizer](https://lucene.apache.org/core/6_5_0/core/org/apache/lucene/analysis/standard/StandardTokenizer.html). ### Performance @@ -104,28 +116,34 @@ When tokenizing words, I get around 120MB/s on my MacBook M2. For typical text, The tokenizer is implemented as a `ref struct`, so you should see zero allocations for static text such as `byte[]` or `string`/`char`. -Calling `GetWords` et al returns a lazy enumerator, and will not allocate per-token. There are `ToList` and `ToArray` methods for convenience, which will allocate.
+Calling `Split.Words` returns a lazy enumerator, and will not allocate per-token. There are `ToList` and `ToArray` methods for convenience, which will allocate. -For `Stream` or `TextReader`/`StreamReader`, a buffer needs to be allocated behind the scenes. You can specify the size when calling `GetWords`. You can also optionally pass your own `byte[]` or `char[]` to do your own allocation, perhaps with [ArrayPool](https://learn.microsoft.com/en-us/dotnet/api/system.buffers.arraypool-1). Or, you can re-use the buffer by calling `SetStream` on an existing tokenizer, which will avoid re-allocation. +For `Stream` or `TextReader`/`StreamReader`, a buffer needs to be allocated behind the scenes. You can specify the size when calling `Split.Words`. You can also optionally pass your own `byte[]` or `char[]` to do your own allocation, perhaps with [ArrayPool](https://github.com/clipperhouse/uax29.net/blob/main/Benchmarks/Program.cs#L89). Or, you can re-use the buffer by calling `SetStream` on an existing tokenizer, which will avoid re-allocation. ### Options -Pass `Options.OmitWhitespace` if you would like whitespace-only tokens not to be returned. +Pass `Options.OmitWhitespace` if you would like whitespace-only tokens not to be returned (for words only). ### Invalid inputs -The tokenizer expects valid (decodable) UTF-8 bytes or UTF-16 chars as input. We [make an effort](https://github.com/clipperhouse/uax29.net/blob/main/uax29/Unicode.Test.cs#L55) to ensure that all bytes will be returned even if invalid, i.e. to be lossless in any case, though the resulting tokenization may not be useful. Garbage in, garbage out. +The tokenizer expects valid (decodable) UTF-8 bytes or UTF-16 chars as input. We [make an effort](https://github.com/clipperhouse/uax29.net/blob/main/uax29/Unicode.Test.cs#L80) to ensure that all bytes will be returned even if invalid, i.e. to be lossless in any case, though the resulting tokenization may not be useful. Garbage in, garbage out. ### Major version changes -If you are using v1.x of this package, v2 has been renamed: +#### v2 → v3 + +Renamed methods: + +`Tokenizer.GetWords(input)` → `Split.Words(input)` + +#### v1 → v2 + +Renamed package, namespace and methods: `dotnet add package uax29.net` → `dotnet add package UAX29` `using uax29` → `using UAX29` -We renamed the methods: - `Tokenizer.Create(input)` → `Tokenizer.GetWords(input)` `Tokenizer.Create(input, TokenType.Graphemes)` → `Tokenizer.GetGraphemes(input)`
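The `SetStream` reuse the README describes, condensed into a standalone sketch (stream contents assumed):

```csharp
using System.Text;
using UAX29;

using var first = new MemoryStream(Encoding.UTF8.GetBytes("Hello, world."));
var tokens = Split.Words(first);   // allocates the internal buffer once
foreach (var token in tokens) { }

using var second = new MemoryStream(Encoding.UTF8.GetBytes("你好,世界."));
tokens.SetStream(second);          // reuses that buffer; no re-allocation
foreach (var token in tokens) { }
```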
你好,世界.\r"; foreach (var option in options) { - var tokens = Tokenizer.GetWords(example, option); + var tokens = Split.Words(example, option); var ranges = tokens.Ranges; foreach (var range in ranges) @@ -67,27 +67,21 @@ public void MatchesTokenizer() public void Enumerator() { var input = "Hello, how are you?"; - var mem = input.AsMemory(); - Tokenizer.GetWords(mem); - var words = Tokenizer.GetWords(input); - var ranges = words.Ranges; - - var first = new List(); - while (ranges.MoveNext()) + var words = Split.Words(input); + var first = new List(); + foreach (var word in words) { - first.Add(ranges.Current); + first.Add(word.ToString()); } - Assert.That(first, Has.Count.GreaterThan(1)); // just make sure it did the thing - - var tokens2 = Tokenizer.GetWords(input); - var ranges2 = words.Ranges; + Assert.That(first, Has.Count.GreaterThan(1)); // just make sure it did the thing - var second = new List(); - foreach (var range in ranges2) + var ranges = Split.Words(input).Ranges; + var second = new List(); + foreach (var range in ranges) { - second.Add(range); + second.Add(input[range]); } Assert.That(first.SequenceEqual(second)); } @@ -96,7 +90,7 @@ public void Enumerator() public void ToList() { var example = "abcdefghijk lmnopq r stu vwxyz; ABC DEFG HIJKL MNOP Q RSTUV WXYZ! 你好,世界."; - var words = Tokenizer.GetWords(example); + var words = Split.Words(example); var ranges = words.Ranges; var list = ranges.ToList(); @@ -126,7 +120,7 @@ public void ToList() public void ToArray() { var example = "abcdefghijk lmnopq r stu vwxyz; ABC DEFG HIJKL MNOP Q RSTUV WXYZ! 你好,世界."; - var words = Tokenizer.GetWords(example); + var words = Split.Words(example); var ranges = words.Ranges; var array = ranges.ToArray(); diff --git a/uax29/RangeTokenizer.cs b/uax29/RangeEnumerator.cs similarity index 76% rename from uax29/RangeTokenizer.cs rename to uax29/RangeEnumerator.cs index 3eb131c..f73f1d9 100644 --- a/uax29/RangeTokenizer.cs +++ b/uax29/RangeEnumerator.cs @@ -4,19 +4,19 @@ namespace UAX29; using Property = uint; /// -/// Tokenizer splits strings or UTF-8 bytes as words, sentences or graphemes, per the Unicode UAX #29 spec. +/// RangeEnumerator splits strings or UTF-8 bytes as words, sentences or graphemes, per the Unicode UAX #29 spec. /// /// byte or char, indicating the type of the input, and by implication, the output. -public ref struct RangeTokenizer where T : struct +public ref struct RangeEnumerator where T : struct { - Tokenizer tokenizer; + SplitEnumerator tokenizer; bool begun = false; /// - /// Tokenizer splits strings (or UTF-8 bytes) as words, sentences or graphemes, per the Unicode UAX #29 spec. + /// RangeEnumerator splits strings (or UTF-8 bytes) as words, sentences or graphemes, per the Unicode UAX #29 spec. /// /// A string, or UTF-8 byte array. - internal RangeTokenizer(Tokenizer tokenizer) + internal RangeEnumerator(SplitEnumerator tokenizer) { this.tokenizer = tokenizer; } @@ -42,7 +42,7 @@ public readonly Range Current } } - public readonly RangeTokenizer GetEnumerator() + public readonly RangeEnumerator GetEnumerator() { return this; } @@ -55,7 +55,7 @@ public readonly List ToList() { if (begun) { - throw new InvalidOperationException("ToArray must not be called after iteration has begun. You may wish to call Reset() on the tokenizer."); + throw new InvalidOperationException("ToArray must not be called after iteration has begun. 
You may wish to call Reset() on the enumerator."); } var result = new List(); diff --git a/uax29/Sentences.Dict.cs b/uax29/Sentences.Dict.cs index 660ba7b..63756e8 100644 --- a/uax29/Sentences.Dict.cs +++ b/uax29/Sentences.Dict.cs @@ -21,7 +21,7 @@ internal static partial class Sentences const Property Close = 4096; const Property SContinue = 8192; - internal static readonly Dict Dict = new(GetDict()); + static readonly Dict Dict = new(GetDict()); static Dictionary GetDict() => new() { {0x000D, CR}, diff --git a/uax29/Sentences.Splitter.cs b/uax29/Sentences.Splitter.cs index 2606dcb..e2bc4fd 100644 --- a/uax29/Sentences.Splitter.cs +++ b/uax29/Sentences.Splitter.cs @@ -29,7 +29,7 @@ internal Splitter(Decoders decoders) /// The string in which to split sentences. /// Ignore, only applicable to splitting words, not sentences. /// The number of bytes/chars that comprise the sentence. - internal int Split(ReadOnlySpan input, out Property _) // this out param is only relevant in Words.Splitter + internal int Split(ReadOnlySpan input, out bool _) // this out param is only relevant in Words.Splitter { Debug.Assert(input.Length > 0); @@ -247,7 +247,7 @@ internal int Split(ReadOnlySpan input, out Property _) // this out para pos += w; } - _ = 0; // see the out Property parameter at top + _ = false; // see the out parameter at top return pos; diff --git a/uax29/Sentences.Test.cs b/uax29/Sentences.Test.cs index 65313ce..2d1359f 100644 --- a/uax29/Sentences.Test.cs +++ b/uax29/Sentences.Test.cs @@ -7,7 +7,41 @@ namespace Tests; [TestFixture] public class SentencesTests { - internal readonly static UnicodeTest[] UnicodeTests = [ + static UnicodeTest[] Tests => UnicodeTests; + + [Test, TestCaseSource(nameof(Tests))] + public void Bytes(UnicodeTest test) + { + var tokens = Split.Sentences(test.input); + TestUnicode.TestBytes(tokens, test); + } + + [Test, TestCaseSource(nameof(Tests))] + public void String(UnicodeTest test) + { + var s = Encoding.UTF8.GetString(test.input); + var tokens = Split.Sentences(s); + TestUnicode.TestChars(tokens, test); + } + + [Test, TestCaseSource(nameof(Tests))] + public void Stream(UnicodeTest test) + { + using var stream = new MemoryStream(test.input); + var tokens = Split.Sentences(stream); + TestUnicode.TestStream(tokens, test); + } + + [Test, TestCaseSource(nameof(Tests))] + public void TextReader(UnicodeTest test) + { + using var stream = new MemoryStream(test.input); + using var reader = new StreamReader(stream); + var tokens = Split.Sentences(reader); + TestUnicode.TestTextReader(tokens, test); + } + + readonly static UnicodeTest[] UnicodeTests = [ new([0x0001, 0x0001], [[0x0001, 0x0001]], "÷ [0.2] (Other) × [998.0] (Other) ÷ [0.3]"), new([0x0001, 0x00CC, 0x0088, 0x0001], [[0x0001, 0x00CC, 0x0088, 0x0001]], "÷ [0.2] (Other) × [5.0] COMBINING DIAERESIS (Extend_FE) × [998.0] (Other) ÷ [0.3]"), new([0x0001, 0x000D], [[0x0001, 0x000D]], "÷ [0.2] (Other) × [998.0] (CR) ÷ [0.3]"), @@ -512,38 +546,4 @@ public class SentencesTests new([0x00E2, 0x0081, 0x00A0, 0x0021, 0x00E2, 0x0081, 0x00A0, 0x0020, 0x00E2, 0x0081, 0x00A0, 0x0020, 0x00E2, 0x0081, 0x00A0, 0x00E2, 0x0081, 0x00A0], [[0x00E2, 0x0081, 0x00A0, 0x0021, 0x00E2, 0x0081, 0x00A0, 0x0020, 0x00E2, 0x0081, 0x00A0, 0x0020, 0x00E2, 0x0081, 0x00A0, 0x00E2, 0x0081, 0x00A0]], "÷ [0.2] WORD JOINER (Format_FE) × [998.0] EXCLAMATION MARK (STerm) × [5.0] WORD JOINER (Format_FE) × [9.0] SPACE (Sp) × [5.0] WORD JOINER (Format_FE) × [10.0] SPACE (Sp) × [5.0] WORD JOINER (Format_FE) × [5.0] WORD JOINER (Format_FE) ÷ [0.3]"), ]; - - 
static readonly UnicodeTest[] Tests = UnicodeTests; - - [Test, TestCaseSource(nameof(Tests))] - public void Bytes(UnicodeTest test) - { - var tokens = Tokenizer.GetSentences(test.input); - TestUnicode.TestTokenizerBytes(tokens, test); - } - - [Test, TestCaseSource(nameof(Tests))] - public void String(UnicodeTest test) - { - var s = Encoding.UTF8.GetString(test.input); - var tokens = Tokenizer.GetSentences(s); - TestUnicode.TestTokenizerChars(tokens, test); - } - - [Test, TestCaseSource(nameof(Tests))] - public void Stream(UnicodeTest test) - { - using var stream = new MemoryStream(test.input); - var tokens = Tokenizer.GetSentences(stream); - TestUnicode.TestTokenizerStream(tokens, test); - } - - [Test, TestCaseSource(nameof(Tests))] - public void TextReader(UnicodeTest test) - { - using var stream = new MemoryStream(test.input); - using var reader = new StreamReader(stream); - var tokens = Tokenizer.GetSentences(reader); - TestUnicode.TestTokenizerTextReader(tokens, test); - } } diff --git a/uax29/Split.Graphemes.cs b/uax29/Split.Graphemes.cs new file mode 100644 index 0000000..135d720 --- /dev/null +++ b/uax29/Split.Graphemes.cs @@ -0,0 +1,82 @@ +namespace UAX29; + +public static partial class Split +{ + /// + /// Split the graphemes in the given ReadOnlySpan of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static SplitEnumerator Graphemes(ReadOnlySpan input) => new(input, UAX29.Graphemes.SplitBytes); + + /// + /// Split the graphemes in the given ReadOnlySpan of char. + /// + /// The chars to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static SplitEnumerator Graphemes(ReadOnlySpan input) => new(input, UAX29.Graphemes.SplitChars); + + /// + /// Split the graphemes in the given Stream of UTF-8 encoded bytes. + /// + /// The stream of UTF-8 bytes to tokenize. + /// + /// Optional, the minimum bytes to buffer from the Stream. This determines the maximum grapheme token size. Tokens that exceed the bytes in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 1024 bytes. + /// + /// + /// Optional, a byte array for underlying buffer storage. It must be at least as large as minBufferBytes. + /// + /// If not provided, storage of 2 * minBufferBytes will be allocated by default. + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferBytes allows fewer, larger reads of the stream, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static StreamEnumerator Graphemes(Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) + { + bufferStorage ??= new byte[minBufferBytes * 2]; + var buffer = new Buffer(stream.Read, minBufferBytes, bufferStorage); + return new StreamEnumerator(buffer, UAX29.Graphemes.SplitBytes); + } + + /// + /// Split the graphemes in the given TextReader / StreamReader. + /// + /// The stream/text reader of char to tokenize. + /// + /// Optional, the minimum chars to buffer from the reader. This determines the maximum grapheme token size. Tokens that exceed the chars in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 1024 chars.
+ /// + /// + /// Optional, a char array for underlying buffer storage. It must be at least as large as minBufferChars. + /// + /// If not provided, storage of 2 * minBufferChars will be allocated by default. + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferChars allows fewer, larger reads of the reader, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static StreamEnumerator Graphemes(TextReader stream, int minBufferChars = 1024, char[]? bufferStorage = null) + { + bufferStorage ??= new char[minBufferChars * 2]; + var buffer = new Buffer(stream.Read, minBufferChars, bufferStorage); + return new StreamEnumerator(buffer, UAX29.Graphemes.SplitChars); + } +} diff --git a/uax29/Split.Sentences.cs b/uax29/Split.Sentences.cs new file mode 100644 index 0000000..09d7c37 --- /dev/null +++ b/uax29/Split.Sentences.cs @@ -0,0 +1,82 @@ +namespace UAX29; + +public static partial class Split +{ + /// + /// Split the sentences in the given ReadOnlySpan of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of sentences. Use foreach (var sentence in sentences). + /// + public static SplitEnumerator Sentences(ReadOnlySpan input) => new(input, UAX29.Sentences.SplitBytes); + + /// + /// Split the sentences in the given ReadOnlySpan of char. + /// + /// The chars to tokenize. + /// + /// An enumerator of sentences. Use foreach (var sentence in sentences). + /// + public static SplitEnumerator Sentences(ReadOnlySpan input) => new(input, UAX29.Sentences.SplitChars); + + /// + /// Split the sentences in the given Stream of UTF-8 encoded bytes. + /// + /// The stream of UTF-8 bytes to tokenize. + /// + /// Optional, the minimum bytes to buffer from the Stream. This determines the maximum sentence token size. Tokens that exceed the bytes in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 1024 bytes. + /// + /// + /// Optional, a byte array for underlying buffer storage. It must be at least as large as minBufferBytes. + /// + /// If not provided, storage of 2 * minBufferBytes will be allocated by default. + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferBytes allows fewer, larger reads of the stream, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of sentences. Use foreach (var sentence in sentences). + /// + public static StreamEnumerator Sentences(Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) + { + bufferStorage ??= new byte[minBufferBytes * 2]; + var buffer = new Buffer(stream.Read, minBufferBytes, bufferStorage); + return new StreamEnumerator(buffer, UAX29.Sentences.SplitBytes); + } + + /// + /// Split the sentences in the given TextReader / StreamReader. + /// + /// The stream/text reader of char to tokenize. + /// + /// Optional, the minimum chars to buffer from the reader. This determines the maximum sentence token size. Tokens that exceed the chars in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 1024 chars. + /// + /// + /// Optional, a char array for underlying buffer storage.
It must be at least as large as minBufferChars. + /// + /// If not provided, storage of 2 * minBufferChars will be allocated by default. + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferChars allows fewer, larger reads of the reader, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of sentences. Use foreach (var sentence in sentences). + /// + public static StreamEnumerator Sentences(TextReader stream, int minBufferChars = 1024, char[]? bufferStorage = null) + { + bufferStorage ??= new char[minBufferChars * 2]; + var buffer = new Buffer(stream.Read, minBufferChars, bufferStorage); + return new StreamEnumerator(buffer, UAX29.Sentences.SplitChars); + } +} diff --git a/uax29/Split.Words.cs b/uax29/Split.Words.cs new file mode 100644 index 0000000..1b6fad6 --- /dev/null +++ b/uax29/Split.Words.cs @@ -0,0 +1,82 @@ +namespace UAX29; + +public static partial class Split +{ + /// + /// Split the words in the given ReadOnlySpan of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static SplitEnumerator Words(ReadOnlySpan input, Options options = Options.None) => new(input, UAX29.Words.SplitBytes, options); + + /// + /// Split the words in the given ReadOnlySpan of char. + /// + /// The chars to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static SplitEnumerator Words(ReadOnlySpan input, Options options = Options.None) => new(input, UAX29.Words.SplitChars, options); + + /// + /// Split the words in the given Stream of UTF-8 encoded bytes. + /// + /// The stream of UTF-8 bytes to tokenize. + /// + /// Optional, the minimum bytes to buffer from the Stream. This determines the maximum word token size. Tokens that exceed the bytes in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 1024 bytes. The tokenizer is intended for natural language, so we don't expect you'll find text with a word beyond a couple of dozen bytes. + /// + /// + /// Optional, a byte array for underlying buffer storage. It must be at least as large as minBufferBytes. + /// + /// If not provided, storage of 2 * minBufferBytes will be allocated by default. + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferBytes allows fewer, larger reads of the stream, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static StreamEnumerator Words(Stream stream, Options options = Options.None, int minBufferBytes = 1024, byte[]? bufferStorage = null) + { + bufferStorage ??= new byte[minBufferBytes * 2]; + var buffer = new Buffer(stream.Read, minBufferBytes, bufferStorage); + return new StreamEnumerator(buffer, UAX29.Words.SplitBytes, options); + } + + /// + /// Split the words in the given TextReader / StreamReader. + /// + /// The stream/text reader of char to tokenize. + /// + /// Optional, the minimum chars to buffer from the reader. This determines the maximum word token size. Tokens that exceed the chars in the buffer + /// will simply be cut off at this length, no error will occur.
+ /// + /// Default is 1024 chars. The tokenizer is intended for natural language, so we don't expect you'll find text with a word beyond a few dozen chars. + /// + /// + /// Optional, a char array for underlying buffer storage. It must be at least as large as minBufferChars. + /// + /// If not provided, storage of 2 * minBufferChars will be allocated by default. + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferChars allows fewer, larger reads of the reader, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static StreamEnumerator Words(TextReader stream, Options options = Options.None, int minBufferChars = 1024, char[]? bufferStorage = null) + { + bufferStorage ??= new char[minBufferChars * 2]; + var buffer = new Buffer(stream.Read, minBufferChars, bufferStorage); + return new StreamEnumerator(buffer, UAX29.Words.SplitChars, options); + } +} diff --git a/uax29/Tokenizer.Test.cs b/uax29/SplitEnumerator.Test.cs similarity index 63% rename from uax29/Tokenizer.Test.cs rename to uax29/SplitEnumerator.Test.cs index 65179dc..84d4ba5 100644 --- a/uax29/Tokenizer.Test.cs +++ b/uax29/SplitEnumerator.Test.cs @@ -5,7 +5,7 @@ using System.Text; [TestFixture] -public class TestTokenizer +public class TestEnumerator { [SetUp] public void Setup() { } @@ -18,7 +18,7 @@ public void Reset() var example = "Hello, how are you?"; var bytes = Encoding.UTF8.GetBytes(example); - var tokens = Tokenizer.GetWords(example); + var tokens = Split.Words(example); var first = new List(); foreach (var token in tokens) @@ -44,7 +44,7 @@ public void SetText() { var example = "Hello, how are you?"; - var tokens = Tokenizer.GetWords(example); + var tokens = Split.Words(example); var first = new List(); foreach (var token in tokens) @@ -73,14 +73,10 @@ static int ExpectedOverloads() expected++; // char[] expected++; // Span expected++; // ReadOnlySpan - expected++; // Memory - expected++; // ReadOnlyMemory expected++; // byte[] expected++; // Span expected++; // ReadOnlySpan - expected++; // Memory - expected++; // ReadOnlyMemory expected++; // Stream expected++; // TextReader @@ -104,137 +100,83 @@ public void Overloads() using var stream = new MemoryStream(bytes); using var reader = new StreamReader(stream); + // Chars { - // chars - - Tokenizer.GetWords(input); got++; + Split.Words(input); got++; var array = input.ToCharArray(); - Tokenizer.GetWords(array); got++; + Split.Words(array); got++; var span = new Span(array); - Tokenizer.GetWords(span); got++; + Split.Words(span); got++; ReadOnlySpan rspan = input.AsSpan(); - Tokenizer.GetWords(rspan); got++; - - var mem = new Memory(array); - Tokenizer.GetWords(mem); got++; + Split.Words(rspan); got++; - ReadOnlyMemory rmem = input.AsMemory(); - Tokenizer.GetWords(rmem); got++; - - Tokenizer.GetWords(reader); got++; + Split.Words(reader); got++; } - - { - // chars - - Tokenizer.GetGraphemes(input); got++; + Split.Graphemes(input); got++; var array = input.ToCharArray(); - Tokenizer.GetGraphemes(array); got++; + Split.Graphemes(array); got++; var span = new Span(array); - Tokenizer.GetGraphemes(span); got++; + Split.Graphemes(span); got++; ReadOnlySpan rspan = input.AsSpan(); - Tokenizer.GetGraphemes(rspan); got++; - - var mem = new Memory(array); - Tokenizer.GetGraphemes(mem); got++; + Split.Graphemes(rspan); got++; - ReadOnlyMemory
rmem = input.AsMemory(); - Tokenizer.GetGraphemes(rmem); got++; - - Tokenizer.GetGraphemes(reader); got++; + Split.Graphemes(reader); got++; } - - { - // chars - - Tokenizer.GetSentences(input); got++; + Split.Sentences(input); got++; var array = input.ToCharArray(); - Tokenizer.GetSentences(array); got++; + Split.Sentences(array); got++; var span = new Span(array); - Tokenizer.GetSentences(span); got++; + Split.Sentences(span); got++; ReadOnlySpan rspan = input.AsSpan(); - Tokenizer.GetSentences(rspan); got++; - - var mem = new Memory(array); - Tokenizer.GetSentences(mem); got++; - - ReadOnlyMemory rmem = input.AsMemory(); - Tokenizer.GetSentences(rmem); got++; + Split.Sentences(rspan); got++; - Tokenizer.GetSentences(reader); got++; + Split.Sentences(reader); got++; } + // Bytes { - // bytes - - Tokenizer.GetWords(bytes); got++; + Split.Words(bytes); got++; Span span = bytes.AsSpan(); - Tokenizer.GetWords(span); got++; + Split.Words(span); got++; ReadOnlySpan rspan = bytes.AsSpan(); - Tokenizer.GetWords(rspan); got++; - - Memory mem = bytes.AsMemory(); - Tokenizer.GetWords(mem); got++; + Split.Words(rspan); got++; - ReadOnlyMemory rmem = bytes.AsMemory(); - Tokenizer.GetWords(rmem); got++; - - Tokenizer.GetWords(stream); got++; + Split.Words(stream); got++; } - - { - // bytes - - Tokenizer.GetGraphemes(bytes); got++; + Split.Graphemes(bytes); got++; Span span = bytes.AsSpan(); - Tokenizer.GetGraphemes(span); got++; + Split.Graphemes(span); got++; ReadOnlySpan rspan = bytes.AsSpan(); - Tokenizer.GetGraphemes(rspan); got++; - - Memory mem = bytes.AsMemory(); - Tokenizer.GetGraphemes(mem); got++; + Split.Graphemes(rspan); got++; - ReadOnlyMemory rmem = bytes.AsMemory(); - Tokenizer.GetGraphemes(rmem); got++; - - Tokenizer.GetGraphemes(stream); got++; + Split.Graphemes(stream); got++; } - - { - // bytes - - Tokenizer.GetSentences(bytes); got++; + Split.Sentences(bytes); got++; Span span = bytes.AsSpan(); - Tokenizer.GetSentences(span); got++; + Split.Sentences(span); got++; ReadOnlySpan rspan = bytes.AsSpan(); - Tokenizer.GetSentences(rspan); got++; - - Memory mem = bytes.AsMemory(); - Tokenizer.GetSentences(mem); got++; - - ReadOnlyMemory rmem = bytes.AsMemory(); - Tokenizer.GetSentences(rmem); got++; + Split.Sentences(rspan); got++; - Tokenizer.GetSentences(stream); got++; + Split.Sentences(stream); got++; } Assert.That(got, Is.EqualTo(expected)); @@ -244,11 +186,9 @@ public void Overloads() public void Enumerator() { var input = "Hello, how are you?"; - var mem = input.AsMemory(); var bytes = Encoding.UTF8.GetBytes(input); - Tokenizer.GetWords(mem); - var tokens = Tokenizer.GetWords(input); + var tokens = Split.Words(input); var first = new List(); while (tokens.MoveNext()) { @@ -257,11 +197,11 @@ public void Enumerator() } Assert.That(first, Has.Count.GreaterThan(1)); // just make sure it did the thing - var tokens2 = Tokenizer.GetWords(input); + var tokens2 = Split.Words(bytes); var second = new List(); foreach (var token in tokens2) { - var s = token.ToString(); + var s = Encoding.UTF8.GetString(token); second.Add(s); } Assert.That(first.SequenceEqual(second)); @@ -271,7 +211,7 @@ public void Enumerator() public void ToList() { var example = "abcdefghijk lmnopq r stu vwxyz; ABC DEFG HIJKL MNOP Q RSTUV WXYZ! 
你好,世界."; - var tokens = Tokenizer.GetWords(example); + var tokens = Split.Words(example); var list = tokens.ToList(); var i = 0; @@ -283,7 +223,7 @@ public void ToList() Assert.That(list, Has.Count.EqualTo(i), "ToList should return the same number of tokens as iteration"); - // Tokenizer should reset back to the beginning + // Enumerator should reset back to the beginning Assert.That(tokens.start, Is.EqualTo(0)); Assert.That(tokens.end, Is.EqualTo(0)); @@ -304,7 +244,7 @@ public void ToList() public void ToArray() { var example = "abcdefghijk lmnopq r stu vwxyz; ABC DEFG HIJKL MNOP Q RSTUV WXYZ! 你好,世界."; - var tokens = Tokenizer.GetWords(example); + var tokens = Split.Words(example); var array = tokens.ToArray(); var i = 0; @@ -316,7 +256,7 @@ public void ToArray() Assert.That(array, Has.Length.EqualTo(i), "ToArray should return the same number of tokens as iteration"); - // Tokenizer should reset back to the beginning + // Should reset back to the beginning Assert.That(tokens.start, Is.EqualTo(0)); Assert.That(tokens.end, Is.EqualTo(0)); @@ -339,7 +279,7 @@ public void Position() var example = "Hello, how are you?"; { - var tokens = Tokenizer.GetWords(example); + var tokens = Split.Words(example); tokens.MoveNext(); Assert.That(tokens.Position, Is.EqualTo(0)); tokens.MoveNext(); @@ -358,7 +298,7 @@ public void Position() var bytes = Encoding.UTF8.GetBytes(example); { - var tokens = Tokenizer.GetWords(bytes); + var tokens = Split.Words(bytes); tokens.MoveNext(); Assert.That(tokens.Position, Is.EqualTo(0)); tokens.MoveNext(); @@ -387,7 +327,7 @@ public void OmitWhitespace() // Options.None should be lossless var expected = example; var got = string.Concat( - Tokenizer.GetWords(example, Options.None) + Split.Words(example, Options.None) .ToList() .SelectMany(c => c) ); @@ -399,7 +339,7 @@ public void OmitWhitespace() // Options.OmitWhitespace should have no whitespace var expected = new string(example.Where(c => !char.IsWhiteSpace(c)).ToArray()); var got = string.Concat( - Tokenizer.GetWords(example, Options.OmitWhitespace) + Split.Words(example, Options.OmitWhitespace) .ToList() .SelectMany(c => c) ); diff --git a/uax29/Tokenizer.cs b/uax29/SplitEnumerator.cs similarity index 82% rename from uax29/Tokenizer.cs rename to uax29/SplitEnumerator.cs index 24972fe..10c8f53 100644 --- a/uax29/Tokenizer.cs +++ b/uax29/SplitEnumerator.cs @@ -6,7 +6,7 @@ namespace UAX29; /// Splits an input string (UTF-8 or UTF-16) and provides an enumerator over the splits. /// /// byte or char, indicating the type of the input, and by implication, the output. -public ref struct Tokenizer where T : struct +public ref struct SplitEnumerator where T : struct { ReadOnlySpan input; @@ -26,12 +26,12 @@ namespace UAX29; bool begun = false; /// - /// Tokenizer splits strings (or UTF-8 bytes) as words, sentences or graphemes, per the Unicode UAX #29 spec. + /// Splits strings (or UTF-8 bytes) as words, sentences or graphemes, per the Unicode UAX #29 spec. /// /// A string, or UTF-8 byte array. /// A func/method meeting the Split delegate signature. /// Options for handling the input text. 
diff --git a/uax29/Tokenizer.cs b/uax29/SplitEnumerator.cs similarity index 82% rename from uax29/Tokenizer.cs rename to uax29/SplitEnumerator.cs index 24972fe..10c8f53 100644 --- a/uax29/Tokenizer.cs +++ b/uax29/SplitEnumerator.cs @@ -6,7 +6,7 @@ namespace UAX29; /// Splits an input string (UTF-8 or UTF-16) and provides an enumerator over the splits. /// /// byte or char, indicating the type of the input, and by implication, the output. -public ref struct Tokenizer where T : struct +public ref struct SplitEnumerator where T : struct { ReadOnlySpan input; @@ -26,12 +26,12 @@ namespace UAX29; bool begun = false; /// - /// Tokenizer splits strings (or UTF-8 bytes) as words, sentences or graphemes, per the Unicode UAX #29 spec. + /// Splits strings (or UTF-8 bytes) as words, sentences or graphemes, per the Unicode UAX #29 spec. /// /// A string, or UTF-8 byte array. /// A func/method meeting the Split delegate signature. /// Options for handling the input text. - internal Tokenizer(ReadOnlySpan input, Split split, Options options = Options.None) + internal SplitEnumerator(ReadOnlySpan input, Split split, Options options = Options.None) { this.input = input; this.split = split; @@ -48,14 +48,14 @@ public bool MoveNext() while (end < input.Length) { - var advance = this.split(input[end..], out var seen); + var advance = this.split(input[end..], out var whitespace); Debug.Assert(advance > 0); start = end; end += advance; // This option is only supported for words; prevent other uses at the static API level - if (options.Includes(Options.OmitWhitespace) && seen.IsExclusively(Words.Whitespace)) + if (whitespace && options.Includes(Options.OmitWhitespace)) { continue; } @@ -78,13 +78,13 @@ public readonly ReadOnlySpan Current } } - public readonly Tokenizer GetEnumerator() + public readonly SplitEnumerator GetEnumerator() { return this; } /// - /// Resets the tokenizer back to the first token. + /// Resets the enumerator back to the first token. /// public void Reset() { @@ -144,11 +144,11 @@ public T[][] ToArray() /// An enumerator of Range. Use foreach to iterate over the ranges. Apply them to your original input /// using [range] or .AsSpan(range) to get the tokens. /// - public readonly RangeTokenizer Ranges + public readonly RangeEnumerator Ranges { get { - return new RangeTokenizer(this); + return new RangeEnumerator(this); } } }
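The Ranges property at the end of this file is the way to get offsets instead of token copies; a small usage sketch, consistent with the RangeEnumerator tests above:

```csharp
using UAX29;

var input = "Hello, how are you?";
foreach (var range in Split.Words(input).Ranges)
{
    // each Range indexes back into the original input
    Console.WriteLine($"{range}: {input[range]}");
}
```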
diff --git a/uax29/Splitter.Test.cs b/uax29/Splitter.Test.cs deleted file mode 100644 index 8b73096..0000000 --- a/uax29/Splitter.Test.cs +++ /dev/null @@ -1,63 +0,0 @@ -using System.Text; -using UAX29; - -namespace Tests; - -/// A bitmap of Unicode categories -using Property = uint; - -[TestFixture] -public class TestSplitter -{ - - [SetUp] - public void Setup() - { - } - - const Property Yes1 = 1; - const Property No1 = 2; - const Property Yes2 = 4; - const Property No2 = 8; - const Property Yes3 = 16; - const Property Yeses = Yes1 | Yes2 | Yes3; - - [Test] - public void TestIsExclusively() - { - { - var seen = Yes1; - Assert.That(seen.IsExclusively(Yeses), Is.True); - } - - { - var seen = Yes1 | Yes2; - Assert.That(seen.IsExclusively(Yeses), Is.True); - } - - { - var seen = No1; - Assert.That(seen.IsExclusively(Yeses), Is.False); - } - - { - var seen = No1 | No2; - Assert.That(seen.IsExclusively(Yeses), Is.False); - } - - { - var seen = Yes1 | No1; - Assert.That(seen.IsExclusively(Yeses), Is.False); - } - - { - var seen = Yes1 | Yes3 | No1; - Assert.That(seen.IsExclusively(Yeses), Is.False); - } - - { - Property seen = 0; - Assert.That(seen.IsExclusively(Yeses), Is.False); - } - } -} diff --git a/uax29/Splitter.cs b/uax29/Splitter.cs index ca3ce4c..bef428a 100644 @@ -11,9 +11,9 @@ /// byte or char, indicating the type of the input, and by implication, the output. /// The string to split/tokenize. /// How many bytes/chars were consumed from the input. -internal delegate int Split(ReadOnlySpan input, out Property seen); +internal delegate int Split(ReadOnlySpan input, out bool whitespace); -internal static class Extensions +internal static class PropertyExtensions { /// /// Determines whether two properties (bitstrings) match, i.e. intersect, i.e. share at least one bit. @@ -25,19 +25,4 @@ internal static bool Is(this Property lookup, Property properties) { return (lookup & properties) != 0; } - - /// - /// Determines if property consists entirely of compare, i.e. no other values (flags) besides the ones in compare. - /// - /// The property to test; the haystack. - /// The property to test against; the needle. - /// True if property consists entirely of compare, otherwise false. - internal static bool IsExclusively(this Property property, Property compare) - { - Debug.Assert(compare > 0); - return - (property & compare) != 0 && // compare appears in property - (property & ~compare) == 0 // but no others do - ; - } } diff --git a/uax29/StreamTokenizer.Test.cs b/uax29/StreamEnumerator.Test.cs similarity index 87% rename from uax29/StreamTokenizer.Test.cs rename to uax29/StreamEnumerator.Test.cs index ef8d2b2..fc02ed5 100644 --- a/uax29/StreamTokenizer.Test.cs +++ b/uax29/StreamEnumerator.Test.cs @@ -4,7 +4,7 @@ using UAX29; [TestFixture] -public class TestStreamTokenizer +public class TestStreamEnumerator { [SetUp] public void Setup() { } @@ -31,10 +31,10 @@ public void StreamMatchesStatic() foreach (var input in examples) { var bytes = Encoding.UTF8.GetBytes(input); - var staticTokens = Tokenizer.GetWords(bytes, Options.OmitWhitespace); + var staticTokens = Split.Words(bytes, Options.OmitWhitespace); using var stream = new MemoryStream(bytes); - var streamTokens = Tokenizer.GetWords(stream, Options.OmitWhitespace); + var streamTokens = Split.Words(stream, Options.OmitWhitespace); foreach (var streamToken in streamTokens) { @@ -70,11 +70,11 @@ public void StreamReaderMatchesStatic() foreach (var input in examples) { var bytes = Encoding.UTF8.GetBytes(input); - var staticTokens = Tokenizer.GetWords(bytes, option); + var staticTokens = Split.Words(bytes, option); using var stream = new MemoryStream(bytes); using var reader = new StreamReader(stream); - var streamTokens = Tokenizer.GetWords(reader, option); + var streamTokens = Split.Words(reader, option); foreach (var streamToken in streamTokens) { @@ -97,7 +97,7 @@ public void SetStream() var bytes = Encoding.UTF8.GetBytes(input); using var stream = new MemoryStream(bytes); - var tokens = Tokenizer.GetWords(stream); + var tokens = Split.Words(stream); var first = new List(); foreach (var token in tokens) @@ -130,7 +130,7 @@ public void SetStreamReader() using var stream = new MemoryStream(bytes); using var reader = new StreamReader(stream); - var tokens = Tokenizer.GetWords(reader); + var tokens = Split.Words(reader); var first = new List(); foreach (var token in tokens) @@ -163,7 +163,7 @@ public void StreamEnumerator() var bytes = Encoding.UTF8.GetBytes(input); using var stream = new MemoryStream(bytes); - var tokens = Tokenizer.GetWords(stream); + var tokens = Split.Words(stream); var first = new List(); while (tokens.MoveNext()) @@ -175,7 +175,7 @@ Assert.That(first, Has.Count.GreaterThan(1)); // just make sure it did the thing using var stream2 = new MemoryStream(bytes); - var tokens2 = Tokenizer.GetWords(stream2); + var tokens2 = Split.Words(stream2); var second = new List(); foreach (var token in tokens2) @@ -194,10 +194,10 @@ public void StreamToList() var bytes = Encoding.UTF8.GetBytes(example); using var stream = new MemoryStream(bytes); - var list = Tokenizer.GetWords(stream).ToList(); + var list = Split.Words(stream).ToList(); stream.Seek(0, SeekOrigin.Begin); - var tokens = Tokenizer.GetWords(stream); + var tokens = Split.Words(stream); var i = 0; foreach (var token in tokens) @@ -228,10 +228,10 @@ public void StreamToArray() var bytes = Encoding.UTF8.GetBytes(example); using var stream = new MemoryStream(bytes); - var list = Tokenizer.GetWords(stream).ToList(); + var list = Split.Words(stream).ToList(); stream.Seek(0, SeekOrigin.Begin); - var tokens =
Tokenizer.GetWords(stream); + var tokens = Split.Words(stream); var i = 0; foreach (var token in tokens) @@ -263,7 +263,7 @@ public void Position() { using var stream = new MemoryStream(bytes); - var tokens = Tokenizer.GetWords(stream, minBufferBytes: 8); + var tokens = Split.Words(stream, minBufferBytes: 8); tokens.MoveNext(); Assert.That(tokens.Position, Is.EqualTo(0)); // ab... tokens.MoveNext(); @@ -282,7 +282,7 @@ public void Position() { using var stream = new MemoryStream(bytes); - var tokens = Tokenizer.GetWords(stream, minBufferBytes: 8, options: Options.OmitWhitespace); + var tokens = Split.Words(stream, minBufferBytes: 8, options: Options.OmitWhitespace); tokens.MoveNext(); Assert.That(tokens.Position, Is.EqualTo(0)); // ab... // tokens.MoveNext(); diff --git a/uax29/StreamTokenizer.cs b/uax29/StreamEnumerator.cs similarity index 68% rename from uax29/StreamTokenizer.cs rename to uax29/StreamEnumerator.cs index 3da11a1..ede42d6 100644 --- a/uax29/StreamTokenizer.cs +++ b/uax29/StreamEnumerator.cs @@ -6,9 +6,9 @@ using Property = uint; /// -/// StreamTokenizer is a small data structure for splitting strings from Streams or TextReaders. It implements GetEnumerator. +/// StreamEnumerator is a small data structure for splitting strings from Streams or TextReaders. It implements GetEnumerator. /// -public ref struct StreamTokenizer where T : struct +public ref struct StreamEnumerator where T : struct { internal Buffer buffer; readonly Split split; @@ -27,11 +27,11 @@ bool begun = false; /// - /// StreamTokenizer is a small data structure for splitting strings. + /// StreamEnumerator is a small data structure for splitting strings. /// /// For backing storage, typically created from a Stream or TextReader. /// A delegate that does the tokenizing. See Split for details. - internal StreamTokenizer(Buffer buffer, Split split, Options options = Options.None) + internal StreamEnumerator(Buffer buffer, Split split, Options options = Options.None) { this.buffer = buffer; this.split = split; @@ -47,13 +47,13 @@ public bool MoveNext() count += end; buffer.Consume(this.Current.Length); // previous token - var advance = this.split(buffer.Contents, out Property seen); + var advance = this.split(buffer.Contents, out var whitespace); Debug.Assert(advance > 0); end = advance; // This option is only supported for words; prevent other uses at the static API level - if (options.Includes(Options.OmitWhitespace) && seen.IsExclusively(Words.Whitespace)) + if (whitespace && options.Includes(Options.OmitWhitespace)) { continue; } @@ -72,7 +72,7 @@ public ReadOnlySpan Current } } - public readonly StreamTokenizer GetEnumerator() + public readonly StreamEnumerator GetEnumerator() { return this; } @@ -115,19 +115,19 @@ public readonly T[][] ToArray() public static class StreamExtensions { /// - /// Resets an existing tokenizer with a new stream. You might choose this as an optimization, as it will re-use a buffer, avoiding allocations. + /// Resets an existing StreamEnumerator with a new stream. You might choose this as an optimization, as it will re-use a buffer, avoiding allocations. /// /// The new stream - public static void SetStream(ref this StreamTokenizer tokenizer, Stream stream) + public static void SetStream(ref this StreamEnumerator tokenizer, Stream stream) { tokenizer.buffer.SetRead(stream.Read); } /// - /// Resets an existing tokenizer with a new stream. You might choose this as an optimization, as it will re-use a buffer, avoiding allocations. 
diff --git a/uax29/Unicode.Test.cs b/uax29/Unicode.Test.cs
index 71053c9..4630d37 100644
--- a/uax29/Unicode.Test.cs
+++ b/uax29/Unicode.Test.cs
@@ -20,7 +20,7 @@ public void Setup()
     {
     }

-    internal static void TestTokenizerBytes(Tokenizer<byte> tokens, UnicodeTest test)
+    internal static void TestBytes(SplitEnumerator<byte> tokens, UnicodeTest test)
     {
         var i = 0;
         foreach (var token in tokens)
@@ -32,7 +32,7 @@ internal static void TestTokenizerBytes(Tokenizer<byte> tokens, UnicodeTest test)
         }
     }

-    internal static void TestTokenizerStream(StreamTokenizer<byte> tokens, UnicodeTest test)
+    internal static void TestStream(StreamEnumerator<byte> tokens, UnicodeTest test)
     {
         var i = 0;
         foreach (var token in tokens)
@@ -44,7 +44,7 @@ internal static void TestTokenizerStream(StreamTokenizer<byte> tokens, UnicodeTest test)
         }
     }

-    internal static void TestTokenizerChars(Tokenizer<char> tokens, UnicodeTest test)
+    internal static void TestChars(SplitEnumerator<char> tokens, UnicodeTest test)
     {
         var i = 0;
         foreach (var token in tokens)
@@ -56,7 +56,7 @@ internal static void TestTokenizerChars(Tokenizer<char> tokens, UnicodeTest test)
         }
     }

-    internal static void TestTokenizerTextReader(StreamTokenizer<char> tokens, UnicodeTest test)
+    internal static void TestTextReader(StreamEnumerator<char> tokens, UnicodeTest test)
     {
         var i = 0;
         foreach (var token in tokens)
@@ -68,13 +68,13 @@ internal static void TestTokenizerTextReader(StreamTokenizer<char> tokens, UnicodeTest test)
         }
     }

-    private delegate Tokenizer<byte> ByteMethod(byte[] input);
-    static readonly ByteMethod byteWords = (byte[] input) => Tokenizer.GetWords(input); // because of the optional parameter
-    static readonly ByteMethod[] byteMethods = [byteWords, Tokenizer.GetGraphemes, Tokenizer.GetSentences];
+    private delegate SplitEnumerator<byte> ByteMethod(ReadOnlySpan<byte> input);
+    static readonly ByteMethod byteWords = (ReadOnlySpan<byte> input) => Split.Words(input); // because of the optional parameter
+    static readonly ByteMethod[] byteMethods = [byteWords, Split.Graphemes, Split.Sentences];

-    private delegate Tokenizer<char> CharMethod(char[] input);
-    static readonly CharMethod charWords = (char[] input) => Tokenizer.GetWords(input); // because of the optional parameter
-    static readonly CharMethod[] charMethods = [charWords, Tokenizer.GetGraphemes, Tokenizer.GetSentences];
+    private delegate SplitEnumerator<char> CharMethod(ReadOnlySpan<char> input);
+    static readonly CharMethod charWords = (ReadOnlySpan<char> input) => Split.Words(input); // because of the optional parameter
+    static readonly CharMethod[] charMethods = [charWords, Split.Graphemes, Split.Sentences];

     [Test]
     public void InvalidEncoding()
@@ -130,7 +130,6 @@ public void InvalidEncoding()
         {
             var bytes = new byte[i];
             rng.GetBytes(bytes);
-            var s = Encoding.UTF8.GetChars(bytes);

             foreach (var method in byteMethods)
             {
@@ -146,6 +145,7 @@ public void InvalidEncoding()
             }
         }

+        var s = Encoding.UTF8.GetChars(bytes);
         foreach (var method in charMethods)
         {
             var tokens = method(s);
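The "// because of the optional parameter" comments reflect a C# rule: method group conversion ignores default arguments, so a method with an optional Options parameter cannot convert to a one-parameter delegate and needs a lambda, while Split.Graphemes converts directly. A sketch of the distinction, assuming the signatures in this diff:

    using UAX29;

    delegate SplitEnumerator<byte> ByteMethod(ReadOnlySpan<byte> input);

    class Example
    {
        // Split.Graphemes(ReadOnlySpan<byte>) matches the delegate exactly,
        // so the method group converts:
        static readonly ByteMethod graphemes = Split.Graphemes;

        // Split.Words(ReadOnlySpan<byte>, Options = Options.None) does not;
        // the optional parameter is invisible to method group conversion,
        // so a lambda pins the one-argument shape:
        static readonly ByteMethod words = (ReadOnlySpan<byte> input) => Split.Words(input);
    }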
diff --git a/uax29/Words.Dict.cs b/uax29/Words.Dict.cs
index 06395c9..428f20b 100644
--- a/uax29/Words.Dict.cs
+++ b/uax29/Words.Dict.cs
@@ -25,9 +25,9 @@ internal static partial class Words
     const Property ZWJ = 65536;
     const Property WSegSpace = 131072;
     const Property Extended_Pictographic = 262144;
-    const Property Tab = 524288;
+    const Property Whitespace = 524288;

-    internal static readonly Dict Dict = new(GetDict());
+    static readonly Dict Dict = new(GetDict());

     static Dictionary<int, Property> GetDict() => new()
     {
         {0x0022, Double_Quote},
@@ -107,13 +107,13 @@ internal static partial class Words
         {0xFB4D, Hebrew_Letter},
         {0xFB4E, Hebrew_Letter},
         {0xFB4F, Hebrew_Letter},
-        {0x000D, CR},
-        {0x000A, LF},
-        {0x000B, Newline},
-        {0x000C, Newline},
-        {0x0085, Newline},
-        {0x2028, Newline},
-        {0x2029, Newline},
+        {0x000D, CR | Whitespace},
+        {0x000A, LF | Whitespace},
+        {0x000B, Newline | Whitespace},
+        {0x000C, Newline | Whitespace},
+        {0x0085, Newline | Whitespace},
+        {0x2028, Newline | Whitespace},
+        {0x2029, Newline | Whitespace},
         {0x0300, Extend},
         {0x0301, Extend},
         {0x0302, Extend},
@@ -33298,7 +33298,7 @@ internal static partial class Words
         {0x1FBF8, Numeric},
         {0x1FBF9, Numeric},
         {0x005F, ExtendNumLet},
-        {0x202F, ExtendNumLet},
+        {0x202F, ExtendNumLet | Whitespace},
         {0x203F, ExtendNumLet},
         {0x2040, ExtendNumLet},
         {0x2054, ExtendNumLet},
@@ -33309,20 +33309,20 @@ internal static partial class Words
         {0xFE4F, ExtendNumLet},
         {0xFF3F, ExtendNumLet},
         {0x200D, ZWJ},
-        {0x0020, WSegSpace},
-        {0x1680, WSegSpace},
-        {0x2000, WSegSpace},
-        {0x2001, WSegSpace},
-        {0x2002, WSegSpace},
-        {0x2003, WSegSpace},
-        {0x2004, WSegSpace},
-        {0x2005, WSegSpace},
-        {0x2006, WSegSpace},
-        {0x2008, WSegSpace},
-        {0x2009, WSegSpace},
-        {0x200A, WSegSpace},
-        {0x205F, WSegSpace},
-        {0x3000, WSegSpace},
+        {0x0020, WSegSpace | Whitespace},
+        {0x1680, WSegSpace | Whitespace},
+        {0x2000, WSegSpace | Whitespace},
+        {0x2001, WSegSpace | Whitespace},
+        {0x2002, WSegSpace | Whitespace},
+        {0x2003, WSegSpace | Whitespace},
+        {0x2004, WSegSpace | Whitespace},
+        {0x2005, WSegSpace | Whitespace},
+        {0x2006, WSegSpace | Whitespace},
+        {0x2008, WSegSpace | Whitespace},
+        {0x2009, WSegSpace | Whitespace},
+        {0x200A, WSegSpace | Whitespace},
+        {0x205F, WSegSpace | Whitespace},
+        {0x3000, WSegSpace | Whitespace},
         {0x00A9, Extended_Pictographic},
         {0x00AE, Extended_Pictographic},
         {0x203C, Extended_Pictographic},
@@ -36854,6 +36854,8 @@ internal static partial class Words
         {0x1FFFB, Extended_Pictographic},
         {0x1FFFC, Extended_Pictographic},
         {0x1FFFD, Extended_Pictographic},
-        {0x0009, Tab},
+        {0x0009, Whitespace},
+        {0x00A0, Whitespace},
+        {0x2007, Whitespace},
     }; // end dict
 }; // end class
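Property values are single bits, so the new Whitespace flag can be OR-ed into an entry without disturbing its word-break category; one dictionary lookup then answers both questions. A sketch of the masking, assuming Is is the usual single-AND membership test in the style of this codebase:

    using System.Diagnostics;

    const uint WSegSpace  = 131072; // 1 << 17, from this diff
    const uint Whitespace = 524288; // 1 << 19, from this diff

    // The entry for U+0020 SPACE after this change:
    const uint space = WSegSpace | Whitespace;

    // Membership is a single AND:
    static bool Is(uint property, uint compare) => (property & compare) != 0;

    // One lookup serves both the segmentation rules and OmitWhitespace:
    Debug.Assert(Is(space, WSegSpace));
    Debug.Assert(Is(space, Whitespace));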
diff --git a/uax29/Words.Splitter.cs b/uax29/Words.Splitter.cs
index 9fdd754..7ad2c92 100644
--- a/uax29/Words.Splitter.cs
+++ b/uax29/Words.Splitter.cs
@@ -8,8 +8,6 @@
 internal static partial class Words
 {
-    internal const Property Whitespace = CR | LF | WSegSpace | Tab;
-
     internal static readonly Split<byte> SplitBytes = new Splitter<byte>(Decoders.Utf8).Split;
     internal static readonly Split<char> SplitChars = new Splitter<char>(Decoders.Char).Split;
@@ -31,14 +29,14 @@ internal Splitter(Decoders<TSpan> decoders)
     /// <param name="input">The string in which to split words.</param>
     /// <param name="seen">Categories that were seen in the first word.</param>
     /// <returns>The number of bytes/chars that comprise the word.</returns>
-    internal int Split(ReadOnlySpan<TSpan> input, out Property seen)
+    internal int Split(ReadOnlySpan<TSpan> input, out bool whitespace)
     {
         Debug.Assert(input.Length > 0);

         // These vars are stateful across loop iterations
         int pos = 0;
         int w;
-        seen = 0;
+        whitespace = true;

         Property current = 0;
         Property lastExIgnore = 0;      // "last excluding ignored categories"
         Property lastLastExIgnore = 0;  // "the last one before that"
@@ -58,7 +56,7 @@ internal int Split(ReadOnlySpan<TSpan> input, out Property seen)
             pos += w;

             current = Dict.Lookup(rune.Value);
-            seen |= current;
+            whitespace = whitespace && current.Is(Whitespace);
         }

         // https://unicode.org/reports/tr29/#WB2
@@ -79,7 +77,7 @@ internal int Split(ReadOnlySpan<TSpan> input, out Property seen)
                 lastExIgnore = last;
             }

-            seen |= last;
+            whitespace = whitespace && current.Is(Whitespace);

             current = Dict.Lookup(rune.Value);
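This replaces the removed Whitespace union (CR | LF | WSegSpace | Tab) and the IsExclusively test: rather than OR-ing every category into seen and asking afterwards whether only whitespace categories appeared, the splitter keeps a running AND of "was this rune whitespace" using the dedicated flag bit. A sketch contrasting the two representations; the CR and LF values are illustrative, while Whitespace and WSegSpace are from this diff:

    // Old: Whitespace was a union of category bits, tested for exclusivity.
    const uint CR = 4, LF = 8;             // illustrative values
    const uint WSegSpaceBit = 131072;
    const uint Tab = 524288;
    const uint WhitespaceUnion = CR | LF | WSegSpaceBit | Tab;

    uint seen = CR | LF;                   // categories OR-ed across a "\r\n" token
    bool oldWay = (seen & WhitespaceUnion) != 0     // some whitespace appeared...
               && (seen & ~WhitespaceUnion) == 0;   // ...and nothing else did

    // New: Whitespace is its own flag, OR-ed into each dict entry; AND as we go.
    const uint WhitespaceFlag = 524288;
    uint[] runeProps = [CR | WhitespaceFlag, LF | WhitespaceFlag]; // entries for '\r', '\n'
    bool newWay = true;
    foreach (var p in runeProps)
        newWay = newWay && (p & WhitespaceFlag) != 0;

    Console.WriteLine((oldWay, newWay)); // (True, True)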
diff --git a/uax29/Words.Test.cs b/uax29/Words.Test.cs
index 0f3d29c..d47218d 100644
--- a/uax29/Words.Test.cs
+++ b/uax29/Words.Test.cs
@@ -7,7 +7,41 @@ namespace Tests;

 [TestFixture]
 public class WordsTests
 {
-    internal readonly static UnicodeTest[] UnicodeTests = [
+    static UnicodeTest[] Tests => UnicodeTests;
+
+    [Test, TestCaseSource(nameof(Tests))]
+    public void Bytes(UnicodeTest test)
+    {
+        var tokens = Split.Words(test.input);
+        TestUnicode.TestBytes(tokens, test);
+    }
+
+    [Test, TestCaseSource(nameof(Tests))]
+    public void String(UnicodeTest test)
+    {
+        var s = Encoding.UTF8.GetString(test.input);
+        var tokens = Split.Words(s);
+        TestUnicode.TestChars(tokens, test);
+    }
+
+    [Test, TestCaseSource(nameof(Tests))]
+    public void Stream(UnicodeTest test)
+    {
+        using var stream = new MemoryStream(test.input);
+        var tokens = Split.Words(stream);
+        TestUnicode.TestStream(tokens, test);
+    }
+
+    [Test, TestCaseSource(nameof(Tests))]
+    public void TextReader(UnicodeTest test)
+    {
+        using var stream = new MemoryStream(test.input);
+        using var reader = new StreamReader(stream);
+        var tokens = Split.Words(reader);
+        TestUnicode.TestTextReader(tokens, test);
+    }
+
+    readonly static UnicodeTest[] UnicodeTests = [
     new([0x0001, 0x0001], [[0x0001], [0x0001]], "÷ [0.2] <START OF HEADING> (Other) ÷ [999.0] <START OF HEADING> (Other) ÷ [0.3]"),
     new([0x0001, 0x00CC, 0x0088, 0x0001], [[0x0001, 0x00CC, 0x0088], [0x0001]], "÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [999.0] <START OF HEADING> (Other) ÷ [0.3]"),
     new([0x0001, 0x000D], [[0x0001], [0x000D]], "÷ [0.2] <START OF HEADING> (Other) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]"),
@@ -1833,38 +1867,4 @@ public class WordsTests
     new([0x0061, 0x005F, 0x0061, 0x002C, 0x002C, 0x0061], [[0x0061, 0x005F, 0x0061], [0x002C], [0x002C], [0x0061]], "÷ [0.2] LATIN SMALL LETTER A (ALetter) × [13.1] LOW LINE (ExtendNumLet) × [13.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) ÷ [999.0] COMMA (MidNum) ÷ [999.0] LATIN SMALL LETTER A (ALetter) ÷ [0.3]"),
     ];
-
-    static readonly UnicodeTest[] Tests = UnicodeTests;
-
-    [Test, TestCaseSource(nameof(Tests))]
-    public void Bytes(UnicodeTest test)
-    {
-        var tokens = Tokenizer.GetWords(test.input);
-        TestUnicode.TestTokenizerBytes(tokens, test);
-    }
-
-    [Test, TestCaseSource(nameof(Tests))]
-    public void String(UnicodeTest test)
-    {
-        var s = Encoding.UTF8.GetString(test.input);
-        var tokens = Tokenizer.GetWords(s);
-        TestUnicode.TestTokenizerChars(tokens, test);
-    }
-
-    [Test, TestCaseSource(nameof(Tests))]
-    public void Stream(UnicodeTest test)
-    {
-        using var stream = new MemoryStream(test.input);
-        var tokens = Tokenizer.GetWords(stream);
-        TestUnicode.TestTokenizerStream(tokens, test);
-    }
-
-    [Test, TestCaseSource(nameof(Tests))]
-    public void TextReader(UnicodeTest test)
-    {
-        using var stream = new MemoryStream(test.input);
-        using var reader = new StreamReader(stream);
-        var tokens = Tokenizer.GetWords(reader);
-        TestUnicode.TestTokenizerTextReader(tokens, test);
-    }
 }
diff --git a/uax29/uax29.csproj b/uax29/uax29.csproj
index 97ad1ba..ca9331e 100644
--- a/uax29/uax29.csproj
+++ b/uax29/uax29.csproj
@@ -18,8 +18,8 @@
-
-
+
+
@@ -49,4 +49,8 @@
+
+
+
+
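(The csproj element contents above did not survive extraction and are left as-is.) End to end, the dictionary flag plus the splitter's out bool is what Options.OmitWhitespace consumes. A usage sketch against the renamed API; the expected output follows the UAX #29 word rules and is shown as comments:

    using UAX29;

    foreach (var word in Split.Words("Hello, how are you?", Options.OmitWhitespace))
    {
        Console.WriteLine(word.ToString());
    }
    // Hello
    // ,
    // how
    // are
    // you
    // ?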