From d4fa8645857027ca2ca7817534ca876a0044a1b9 Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Sun, 14 Jul 2024 20:52:57 -0400 Subject: [PATCH 01/17] Start extensions methods --- uax29/Extensions/Extensions.Graphemes.cs | 155 +++++++++++++++++++ uax29/Extensions/Extensions.Sentences.cs | 155 +++++++++++++++++++ uax29/Extensions/Extensions.Test.cs | 189 +++++++++++++++++++++++ uax29/Extensions/Extensions.Words.cs | 157 +++++++++++++++++++ uax29/Splitter.cs | 2 +- 5 files changed, 657 insertions(+), 1 deletion(-) create mode 100644 uax29/Extensions/Extensions.Graphemes.cs create mode 100644 uax29/Extensions/Extensions.Sentences.cs create mode 100644 uax29/Extensions/Extensions.Test.cs create mode 100644 uax29/Extensions/Extensions.Words.cs diff --git a/uax29/Extensions/Extensions.Graphemes.cs b/uax29/Extensions/Extensions.Graphemes.cs new file mode 100644 index 0000000..71a2ed6 --- /dev/null +++ b/uax29/Extensions/Extensions.Graphemes.cs @@ -0,0 +1,155 @@ +namespace UAX29.Extensions; +using UAX29; + +public static partial class Extensions +{ + /// + /// Split the graphemes in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static Tokenizer SplitGraphemes(this Span input) => Tokenizer.GetGraphemes(input); + + /// + /// Split the graphemes in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static Tokenizer SplitGraphemes(this ReadOnlySpan input) => Tokenizer.GetGraphemes(input); + + /// + /// Split the graphemes in the given of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). 
+ /// + public static Tokenizer SplitGraphemes(this Memory input) => Tokenizer.GetGraphemes(input); + + /// + /// Split the graphemes in the given of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static Tokenizer SplitGraphemes(this ReadOnlyMemory input) => Tokenizer.GetGraphemes(input); + + /// + /// Split the graphemes in the given array of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static Tokenizer SplitGraphemes(this byte[] input) => Tokenizer.GetGraphemes(input); + + /// + /// Split the graphemes in the given string. + /// + /// The string to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static Tokenizer SplitGraphemes(this string input) => Tokenizer.GetGraphemes(input); + + /// + /// Split the graphemes in the given string. + /// + /// The string to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static Tokenizer SplitGraphemes(this char[] input) => Tokenizer.GetGraphemes(input); + + /// + /// Split the graphemes in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + /// + public static Tokenizer SplitGraphemes(this Span input) => Tokenizer.GetGraphemes(input); + + /// + /// Split the graphemes in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static Tokenizer SplitGraphemes(this ReadOnlySpan input) => Tokenizer.GetGraphemes(input); + + /// + /// Split the graphemes in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). 
+ /// + public static Tokenizer SplitGraphemes(this Memory input) => Tokenizer.GetGraphemes(input); + + /// + /// Split the graphemes in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static Tokenizer SplitGraphemes(this ReadOnlyMemory input) => Tokenizer.GetGraphemes(input); + + /// + /// Split the graphemes in the given of UTF-8 encoded bytes. + /// + /// The stream of UTF-8 bytes to tokenize. + /// + /// Optional, the minimum bytes to buffer from the Stream. This determines the maximum grapheme token size. Tokens that exceed the bytes in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 256 bytes. + /// + /// + /// Optional, a byte array for underlying buffer storage. It must be at least as large at minBufferBytes. + /// + /// If not provided, storage of 2 * minBufferBytes will be allocated by default. + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferBytes allows fewer, larger reads the stream, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static StreamTokenizer SplitGraphemes(this Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) => Tokenizer.GetGraphemes(stream, minBufferBytes, bufferStorage); + + /// + /// Split the graphemes in the given / . + /// + /// The stream/text reader of char to tokenize. + /// + /// Optional, the minimum chars to buffer from the reader. This determines the maximum grapheme token size. Tokens that exceed the chars in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 256 chars. + /// + /// + /// Optional, a char array for underlying buffer storage. 
It must be at least as large at minBufferChars. + /// + /// If not provided, storage of 2 * minBufferChars will be allocated by default. + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferChars allows fewer, larger reads the reader, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static StreamTokenizer SplitGraphemes(this TextReader stream, int minBufferChars = 1024, char[]? bufferStorage = null) => Tokenizer.GetGraphemes(stream, minBufferChars, bufferStorage); +} diff --git a/uax29/Extensions/Extensions.Sentences.cs b/uax29/Extensions/Extensions.Sentences.cs new file mode 100644 index 0000000..169f03a --- /dev/null +++ b/uax29/Extensions/Extensions.Sentences.cs @@ -0,0 +1,155 @@ +namespace UAX29.Extensions; +using UAX29; + +public static partial class Extensions +{ + /// + /// Split the graphemes in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static Tokenizer SplitSentences(this Span input) => Tokenizer.GetSentences(input); + + /// + /// Split the graphemes in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static Tokenizer SplitSentences(this ReadOnlySpan input) => Tokenizer.GetSentences(input); + + /// + /// Split the graphemes in the given of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). 
+ /// + public static Tokenizer SplitSentences(this Memory input) => Tokenizer.GetSentences(input); + + /// + /// Split the graphemes in the given of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static Tokenizer SplitSentences(this ReadOnlyMemory input) => Tokenizer.GetSentences(input); + + /// + /// Split the graphemes in the given array of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static Tokenizer SplitSentences(this byte[] input) => Tokenizer.GetSentences(input); + + /// + /// Split the graphemes in the given string. + /// + /// The string to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static Tokenizer SplitSentences(this string input) => Tokenizer.GetSentences(input); + + /// + /// Split the graphemes in the given string. + /// + /// The string to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static Tokenizer SplitSentences(this char[] input) => Tokenizer.GetSentences(input); + + /// + /// Split the graphemes in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + /// + public static Tokenizer SplitSentences(this Span input) => Tokenizer.GetSentences(input); + + /// + /// Split the graphemes in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static Tokenizer SplitSentences(this ReadOnlySpan input) => Tokenizer.GetSentences(input); + + /// + /// Split the graphemes in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). 
+ /// + public static Tokenizer SplitSentences(this Memory input) => Tokenizer.GetSentences(input); + + /// + /// Split the graphemes in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static Tokenizer SplitSentences(this ReadOnlyMemory input) => Tokenizer.GetSentences(input); + + /// + /// Split the graphemes in the given of UTF-8 encoded bytes. + /// + /// The stream of UTF-8 bytes to tokenize. + /// + /// Optional, the minimum bytes to buffer from the Stream. This determines the maximum grapheme token size. Tokens that exceed the bytes in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 256 bytes. + /// + /// + /// Optional, a byte array for underlying buffer storage. It must be at least as large at minBufferBytes. + /// + /// If not provided, storage of 2 * minBufferBytes will be allocated by default. + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferBytes allows fewer, larger reads the stream, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static StreamTokenizer SplitSentences(this Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) => Tokenizer.GetSentences(stream, minBufferBytes, bufferStorage); + + /// + /// Split the graphemes in the given / . + /// + /// The stream/text reader of char to tokenize. + /// + /// Optional, the minimum chars to buffer from the reader. This determines the maximum grapheme token size. Tokens that exceed the chars in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 256 chars. + /// + /// + /// Optional, a char array for underlying buffer storage. 
It must be at least as large at minBufferChars. + /// + /// If not provided, storage of 2 * minBufferChars will be allocated by default. + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferChars allows fewer, larger reads the reader, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static StreamTokenizer SplitSentences(this TextReader stream, int minBufferChars = 1024, char[]? bufferStorage = null) => Tokenizer.GetSentences(stream, minBufferChars, bufferStorage); +} diff --git a/uax29/Extensions/Extensions.Test.cs b/uax29/Extensions/Extensions.Test.cs new file mode 100644 index 0000000..751059d --- /dev/null +++ b/uax29/Extensions/Extensions.Test.cs @@ -0,0 +1,189 @@ +namespace Tests; + +using UAX29.Extensions; +using System.Text; + +[TestFixture] +public class TestExtensions +{ + [SetUp] + public void Setup() + { + } + + static int ExpectedOverloads() + { + var expected = 0; + + expected++; // string + expected++; // char[] + expected++; // Span + expected++; // ReadOnlySpan + expected++; // Memory + expected++; // ReadOnlyMemory + + expected++; // byte[] + expected++; // Span + expected++; // ReadOnlySpan + expected++; // Memory + expected++; // ReadOnlyMemory + + expected++; // Stream + expected++; // TextReader + + expected *= 3; // Words, Graphemes, Sentences + + return expected; + } + + + [Test] + public void Overloads() + { + // no assertions, just needs to compile + + int expected = ExpectedOverloads(); + int got = 0; + + var input = "Hello, how are you?"; + var bytes = Encoding.UTF8.GetBytes(input); + using var stream = new MemoryStream(bytes); + using var reader = new StreamReader(stream); + + { + // string + input.SplitWords(); got++; + + // char[] + input.ToCharArray().SplitWords(); got++; + + 
// ReadOnlySpan + input.AsSpan().SplitWords(); got++; + + // Span + var span = new Span(input.ToCharArray()); + span.SplitWords(); got++; + + // Memory + var mem = new Memory(input.ToCharArray()); + mem.SplitWords(); got++; + + // ReadOnlyMemoryMemory + ReadOnlyMemory rmem = input.AsMemory(); + rmem.SplitWords(); got++; + + reader.SplitWords(); got++; + } + + { + // chars + + input.SplitGraphemes(); got++; + + var array = input.ToCharArray(); + array.SplitGraphemes(); got++; + + var span = new Span(array); + span.SplitGraphemes(); got++; + + ReadOnlySpan rspan = input.AsSpan(); + rspan.SplitGraphemes(); got++; + + var mem = new Memory(array); + mem.SplitGraphemes(); got++; + + ReadOnlyMemory rmem = input.AsMemory(); + rmem.SplitGraphemes(); got++; + + reader.SplitGraphemes(); got++; + } + + + { + // chars + + input.SplitSentences(); got++; + + var array = input.ToCharArray(); + array.SplitSentences(); got++; + + var span = new Span(array); + span.SplitSentences(); got++; + + ReadOnlySpan rspan = input.AsSpan(); + rspan.SplitSentences(); got++; + + var mem = new Memory(array); + mem.SplitSentences(); got++; + + ReadOnlyMemory rmem = input.AsMemory(); + rmem.SplitSentences(); got++; + + reader.SplitSentences(); got++; + } + + { + // bytes + + bytes.SplitWords(); got++; + + Span span = bytes.AsSpan(); + span.SplitWords(); got++; + + ReadOnlySpan rspan = bytes.AsSpan(); + rspan.SplitWords(); got++; + + Memory mem = bytes.AsMemory(); + mem.SplitWords(); got++; + + ReadOnlyMemory rmem = bytes.AsMemory(); + rmem.SplitWords(); got++; + + stream.SplitWords(); got++; + } + + + { + // bytes + + bytes.SplitGraphemes(); got++; + + Span span = bytes.AsSpan(); + span.SplitGraphemes(); got++; + + ReadOnlySpan rspan = bytes.AsSpan(); + rspan.SplitGraphemes(); got++; + + Memory mem = bytes.AsMemory(); + mem.SplitGraphemes(); got++; + + ReadOnlyMemory rmem = bytes.AsMemory(); + rmem.SplitGraphemes(); got++; + + stream.SplitGraphemes(); got++; + } + + + { + // bytes + + 
bytes.SplitSentences(); got++; + + Span span = bytes.AsSpan(); + span.SplitSentences(); got++; + + ReadOnlySpan rspan = bytes.AsSpan(); + rspan.SplitSentences(); got++; + + Memory mem = bytes.AsMemory(); + mem.SplitSentences(); got++; + + ReadOnlyMemory rmem = bytes.AsMemory(); + rmem.SplitSentences(); got++; + + stream.SplitSentences(); got++; + } + + Assert.That(got, Is.EqualTo(expected)); + } +} diff --git a/uax29/Extensions/Extensions.Words.cs b/uax29/Extensions/Extensions.Words.cs new file mode 100644 index 0000000..7274d65 --- /dev/null +++ b/uax29/Extensions/Extensions.Words.cs @@ -0,0 +1,157 @@ +namespace UAX29.Extensions; +using UAX29; + +public static partial class Extensions +{ + /// + /// Split the words in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static Tokenizer SplitWords(this Span input, Options options = Options.None) => Tokenizer.GetWords(input, options); + + /// + /// Split the words in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static Tokenizer SplitWords(this ReadOnlySpan input, Options options = Options.None) => Tokenizer.GetWords(input, options); + + /// + /// Split the words in the given of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static Tokenizer SplitWords(this Memory input, Options options = Options.None) => Tokenizer.GetWords(input, options); + + /// + /// Split the words in the given of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). 
+ /// + public static Tokenizer SplitWords(this ReadOnlyMemory input, Options options = Options.None) => Tokenizer.GetWords(input, options); + + /// + /// Split the words in the given array of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static Tokenizer SplitWords(this byte[] input, Options options = Options.None) => Tokenizer.GetWords(input, options); + + /// + /// Split the words in the given string. + /// + /// The string to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static Tokenizer SplitWords(this string input, Options options = Options.None) => Tokenizer.GetWords(input, options); + + /// + /// Split the words in the given string. + /// + /// The string to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static Tokenizer SplitWords(this char[] input, Options options = Options.None) => Tokenizer.GetWords(input, options); + + /// + /// Split the words in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + /// + public static Tokenizer SplitWords(this Span input, Options options = Options.None) => Tokenizer.GetWords(input, options); + + /// + /// Split the words in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static Tokenizer SplitWords(this ReadOnlySpan input, Options options = Options.None) => Tokenizer.GetWords(input, options); + + /// + /// Split the words in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static Tokenizer SplitWords(this Memory input, Options options = Options.None) => Tokenizer.GetWords(input, options); + + /// + /// Split the words in the given of . + /// + /// The chars to tokenize. 
+ /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static Tokenizer SplitWords(this ReadOnlyMemory input, Options options = Options.None) => Tokenizer.GetWords(input, options); + + /// + /// Split the words in the given of UTF-8 encoded bytes. + /// + /// The stream of UTF-8 bytes to tokenize. + /// + /// Optional, the minimum bytes to buffer from the Stream. This determines the maximum word token size. Tokens that exceed the bytes in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 1024 bytes. The tokenizer is intended for natural language, so we don't expect you'll find text with a word beyond a couple of dozen bytes. + /// + /// + /// Optional, a byte array for underlying buffer storage. It must be at least as large at minBufferBytes. + /// + /// If not provided, storage of 2 * minBufferBytes will be allocated by default. + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferBytes allows fewer, larger reads the stream, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static StreamTokenizer SplitWords(this Stream stream, Options options = Options.None, int minBufferBytes = 1024, byte[]? bufferStorage = null) + => Tokenizer.GetWords(stream, options, minBufferBytes, bufferStorage); + + /// + /// Split the words in the given / . + /// + /// The stream/text reader of char to tokenize. + /// + /// Optional, the minimum chars to buffer from the reader. This determines the maximum word token size. Tokens that exceed the chars in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 1024 chars. 
The tokenizer is intended for natural language, so we don't expect you'll find text with a word beyond a few dozen chars. + /// + /// + /// Optional, a char array for underlying buffer storage. It must be at least as large at minBufferChars. + /// + /// If not provided, storage of 2 * minBufferChars will be allocated by default. + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferChars allows fewer, larger reads the reader, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static StreamTokenizer SplitWords(this TextReader stream, Options options = Options.None, int minBufferChars = 1024, char[]? bufferStorage = null) + => Tokenizer.GetWords(stream, options, minBufferChars, bufferStorage); +} diff --git a/uax29/Splitter.cs b/uax29/Splitter.cs index ca3ce4c..b0198b4 100644 --- a/uax29/Splitter.cs +++ b/uax29/Splitter.cs @@ -13,7 +13,7 @@ /// How many bytes/chars were consumed from the input. internal delegate int Split(ReadOnlySpan input, out Property seen); -internal static class Extensions +internal static class PropertyExtensions { /// /// Determines whether two properties (bitstrings) match, i.e. intersect, i.e. share at least one bit. 
From 13e623915a34f1b0d5d0fd5028789a098a592ff5 Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Sun, 14 Jul 2024 22:30:48 -0400 Subject: [PATCH 02/17] More renames --- .gitignore | 1 + Benchmarks/Speed.cs | 8 ++--- README.md | 12 +++---- uax29/Examples.Test.cs | 20 +++++------ uax29/Extensions/Extensions.Graphemes.cs | 22 ++++++------ uax29/Extensions/Extensions.Sentences.cs | 22 ++++++------ uax29/Extensions/Extensions.Words.cs | 22 ++++++------ uax29/RangeTokenizer.cs | 4 +-- uax29/Tokenizer.Graphemes.cs | 44 ++++++++++++------------ uax29/Tokenizer.Sentences.cs | 44 ++++++++++++------------ uax29/Tokenizer.Words.cs | 22 ++++++------ uax29/Tokenizer.cs | 6 ++-- uax29/Unicode.Test.cs | 8 ++--- uax29/uax29.csproj | 4 +-- 14 files changed, 120 insertions(+), 119 deletions(-) diff --git a/.gitignore b/.gitignore index bd9e9fb..58a9fbf 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ BenchmarkDotNet.Artifacts bin obj .vscode/tasks.json +global.json diff --git a/Benchmarks/Speed.cs b/Benchmarks/Speed.cs index 7a48db2..c1c1d81 100644 --- a/Benchmarks/Speed.cs +++ b/Benchmarks/Speed.cs @@ -11,9 +11,9 @@ public string GetValue(Summary summary, BenchmarkCase benchmarkCase) return "N/A"; } var ourReport = summary.Reports.First(x => x.BenchmarkCase.Equals(benchmarkCase)); - long length = new System.IO.FileInfo("sample.txt").Length; - var mean = ourReport.ResultStatistics.Mean; - return $"{(length / ourReport.ResultStatistics.Mean):#####.000}"; + long length = new FileInfo("sample.txt").Length; + var mean = ourReport.ResultStatistics!.Mean; + return $"{length / mean:#####.000} GB/s"; } public string GetValue(Summary summary, BenchmarkCase benchmarkCase, SummaryStyle style) => GetValue(summary, benchmarkCase); @@ -21,7 +21,7 @@ public string GetValue(Summary summary, BenchmarkCase benchmarkCase) public bool IsAvailable(Summary summary) => true; public string Id { get; } = nameof(Speed); - public string ColumnName { get; } = "Speed (GB/s)"; + public string ColumnName { 
get; } = "Throughput"; public bool AlwaysShow { get; } = true; public ColumnCategory Category { get; } = ColumnCategory.Custom; public int PriorityInCategory { get; } diff --git a/README.md b/README.md index 0a1a004..3a911d4 100644 --- a/README.md +++ b/README.md @@ -11,15 +11,15 @@ dotnet add package UAX29 ``` ```csharp -using UAX29; using System.Text; +using UAX29.Extensions; var example = "Hello, 🌏 world. 你好,世界."; // The tokenizer can split words, graphemes or sentences. // It operates on strings, UTF-8 bytes, and streams. -var words = Tokenizer.GetWords(example); +var words = example.SplitWords(); // Iterate over the tokens foreach (var word in words) @@ -47,7 +47,7 @@ world */ var utf8bytes = Encoding.UTF8.GetBytes(example); -var graphemes = Tokenizer.GetGraphemes(utf8bytes); +var graphemes = utf8bytes.SplitGraphemes(); // Iterate over the tokens foreach (var grapheme in graphemes) @@ -96,7 +96,7 @@ We use the official Unicode [test suites](https://unicode.org/reports/tr41/tr41- [![.NET](https://github.com/clipperhouse/uax29.net/actions/workflows/dotnet.yml/badge.svg)](https://github.com/clipperhouse/uax29.net/actions/workflows/dotnet.yml) -This is the same algorithm that is implemented in Lucene's [StandardTokenizer](https://lucene.apache.org/core/6_5_0/core/org/apache/lucene/analysis/standard/StandardTokenizer.html). +This is the same spec that is implemented in Lucene's [StandardTokenizer](https://lucene.apache.org/core/6_5_0/core/org/apache/lucene/analysis/standard/StandardTokenizer.html). ### Performance @@ -104,9 +104,9 @@ When tokenizing words, I get around 120MB/s on my Macbook M2. For typical text, The tokenizer is implemented as a `ref struct`, so you should see zero allocations for static text such as `byte[]` or `string`/`char`. -Calling `GetWords` et al returns a lazy enumerator, and will not allocate per-token. There are `ToList` and `ToArray` methods for convenience, which will allocate. 
+Calling `SplitWords` et al returns a lazy enumerator, and will not allocate per-token. There are `ToList` and `ToArray` methods for convenience, which will allocate. -For `Stream` or `TextReader`/`StreamReader`, a buffer needs to be allocated behind the scenes. You can specify the size when calling `GetWords`. You can also optionally pass your own `byte[]` or `char[]` to do your own allocation, perhaps with [ArrayPool](https://learn.microsoft.com/en-us/dotnet/api/system.buffers.arraypool-1). Or, you can re-use the buffer by calling `SetStream` on an existing tokenizer, which will avoid re-allocation. +For `Stream` or `TextReader`/`StreamReader`, a buffer needs to be allocated behind the scenes. You can specify the size when calling `SplitWords`. You can also optionally pass your own `byte[]` or `char[]` to do your own allocation, perhaps with [ArrayPool](https://learn.microsoft.com/en-us/dotnet/api/system.buffers.arraypool-1). Or, you can re-use the buffer by calling `SetStream` on an existing tokenizer, which will avoid re-allocation. ### Options diff --git a/uax29/Examples.Test.cs b/uax29/Examples.Test.cs index 608d3d8..6d77cfe 100644 --- a/uax29/Examples.Test.cs +++ b/uax29/Examples.Test.cs @@ -1,9 +1,9 @@ using System.Text; using UAX29; +using UAX29.Extensions; namespace Tests; - [TestFixture] public class TestExample { @@ -21,7 +21,7 @@ public void Readme() // The tokenizer can split words, graphemes or sentences. // It operates on strings, UTF-8 bytes, and streams. - var words = Tokenizer.GetWords(example); + var words = example.SplitWords(); // Iterate over the tokens foreach (var word in words) @@ -34,12 +34,12 @@ public void Readme() /* Hello , - + 🌏 - + world . 
- + 你 好 , @@ -49,9 +49,9 @@ public void Readme() */ var utf8bytes = Encoding.UTF8.GetBytes(example); - var graphemes = Tokenizer.GetGraphemes(utf8bytes); + var graphemes = utf8bytes.SplitGraphemes(); - // Iterate over the tokens + // Iterate over the tokens foreach (var grapheme in graphemes) { // grapheme is a ReadOnlySpan of UTF-8 bytes @@ -67,16 +67,16 @@ public void Readme() l o , - + 🌏 - + w o r l d . - + 你 好 , diff --git a/uax29/Extensions/Extensions.Graphemes.cs b/uax29/Extensions/Extensions.Graphemes.cs index 71a2ed6..858a7c9 100644 --- a/uax29/Extensions/Extensions.Graphemes.cs +++ b/uax29/Extensions/Extensions.Graphemes.cs @@ -10,7 +10,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer SplitGraphemes(this Span input) => Tokenizer.GetGraphemes(input); + public static SplitEnumerator SplitGraphemes(this Span input) => Tokenizer.GetGraphemes(input); /// /// Split the graphemes in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ @@ -19,7 +19,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer SplitGraphemes(this ReadOnlySpan input) => Tokenizer.GetGraphemes(input); + public static SplitEnumerator SplitGraphemes(this ReadOnlySpan input) => Tokenizer.GetGraphemes(input); /// /// Split the graphemes in the given of UTF-8 encoded bytes. @@ -28,7 +28,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer SplitGraphemes(this Memory input) => Tokenizer.GetGraphemes(input); + public static SplitEnumerator SplitGraphemes(this Memory input) => Tokenizer.GetGraphemes(input); /// /// Split the graphemes in the given of UTF-8 encoded bytes. 
@@ -37,7 +37,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer SplitGraphemes(this ReadOnlyMemory input) => Tokenizer.GetGraphemes(input); + public static SplitEnumerator SplitGraphemes(this ReadOnlyMemory input) => Tokenizer.GetGraphemes(input); /// /// Split the graphemes in the given array of UTF-8 encoded bytes. @@ -46,7 +46,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer SplitGraphemes(this byte[] input) => Tokenizer.GetGraphemes(input); + public static SplitEnumerator SplitGraphemes(this byte[] input) => Tokenizer.GetGraphemes(input); /// /// Split the graphemes in the given string. @@ -55,7 +55,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer SplitGraphemes(this string input) => Tokenizer.GetGraphemes(input); + public static SplitEnumerator SplitGraphemes(this string input) => Tokenizer.GetGraphemes(input); /// /// Split the graphemes in the given string. @@ -64,7 +64,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer SplitGraphemes(this char[] input) => Tokenizer.GetGraphemes(input); + public static SplitEnumerator SplitGraphemes(this char[] input) => Tokenizer.GetGraphemes(input); /// /// Split the graphemes in the given of . @@ -74,7 +74,7 @@ public static partial class Extensions /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// /// - public static Tokenizer SplitGraphemes(this Span input) => Tokenizer.GetGraphemes(input); + public static SplitEnumerator SplitGraphemes(this Span input) => Tokenizer.GetGraphemes(input); /// /// Split the graphemes in the given of . 
@@ -83,7 +83,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer SplitGraphemes(this ReadOnlySpan input) => Tokenizer.GetGraphemes(input); + public static SplitEnumerator SplitGraphemes(this ReadOnlySpan input) => Tokenizer.GetGraphemes(input); /// /// Split the graphemes in the given of . @@ -92,7 +92,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer SplitGraphemes(this Memory input) => Tokenizer.GetGraphemes(input); + public static SplitEnumerator SplitGraphemes(this Memory input) => Tokenizer.GetGraphemes(input); /// /// Split the graphemes in the given of . @@ -101,7 +101,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer SplitGraphemes(this ReadOnlyMemory input) => Tokenizer.GetGraphemes(input); + public static SplitEnumerator SplitGraphemes(this ReadOnlyMemory input) => Tokenizer.GetGraphemes(input); /// /// Split the graphemes in the given of UTF-8 encoded bytes. diff --git a/uax29/Extensions/Extensions.Sentences.cs b/uax29/Extensions/Extensions.Sentences.cs index 169f03a..c776d22 100644 --- a/uax29/Extensions/Extensions.Sentences.cs +++ b/uax29/Extensions/Extensions.Sentences.cs @@ -10,7 +10,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer SplitSentences(this Span input) => Tokenizer.GetSentences(input); + public static SplitEnumerator SplitSentences(this Span input) => Tokenizer.GetSentences(input); /// /// Split the graphemes in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ @@ -19,7 +19,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. 
Use foreach (var grapheme in graphemes). /// - public static Tokenizer SplitSentences(this ReadOnlySpan input) => Tokenizer.GetSentences(input); + public static SplitEnumerator SplitSentences(this ReadOnlySpan input) => Tokenizer.GetSentences(input); /// /// Split the graphemes in the given of UTF-8 encoded bytes. @@ -28,7 +28,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer SplitSentences(this Memory input) => Tokenizer.GetSentences(input); + public static SplitEnumerator SplitSentences(this Memory input) => Tokenizer.GetSentences(input); /// /// Split the graphemes in the given of UTF-8 encoded bytes. @@ -37,7 +37,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer SplitSentences(this ReadOnlyMemory input) => Tokenizer.GetSentences(input); + public static SplitEnumerator SplitSentences(this ReadOnlyMemory input) => Tokenizer.GetSentences(input); /// /// Split the graphemes in the given array of UTF-8 encoded bytes. @@ -46,7 +46,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer SplitSentences(this byte[] input) => Tokenizer.GetSentences(input); + public static SplitEnumerator SplitSentences(this byte[] input) => Tokenizer.GetSentences(input); /// /// Split the graphemes in the given string. @@ -55,7 +55,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer SplitSentences(this string input) => Tokenizer.GetSentences(input); + public static SplitEnumerator SplitSentences(this string input) => Tokenizer.GetSentences(input); /// /// Split the graphemes in the given string. @@ -64,7 +64,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. 
Use foreach (var grapheme in graphemes). /// - public static Tokenizer SplitSentences(this char[] input) => Tokenizer.GetSentences(input); + public static SplitEnumerator SplitSentences(this char[] input) => Tokenizer.GetSentences(input); /// /// Split the graphemes in the given of . @@ -74,7 +74,7 @@ public static partial class Extensions /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// /// - public static Tokenizer SplitSentences(this Span input) => Tokenizer.GetSentences(input); + public static SplitEnumerator SplitSentences(this Span input) => Tokenizer.GetSentences(input); /// /// Split the graphemes in the given of . @@ -83,7 +83,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer SplitSentences(this ReadOnlySpan input) => Tokenizer.GetSentences(input); + public static SplitEnumerator SplitSentences(this ReadOnlySpan input) => Tokenizer.GetSentences(input); /// /// Split the graphemes in the given of . @@ -92,7 +92,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer SplitSentences(this Memory input) => Tokenizer.GetSentences(input); + public static SplitEnumerator SplitSentences(this Memory input) => Tokenizer.GetSentences(input); /// /// Split the graphemes in the given of . @@ -101,7 +101,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer SplitSentences(this ReadOnlyMemory input) => Tokenizer.GetSentences(input); + public static SplitEnumerator SplitSentences(this ReadOnlyMemory input) => Tokenizer.GetSentences(input); /// /// Split the graphemes in the given of UTF-8 encoded bytes. 
diff --git a/uax29/Extensions/Extensions.Words.cs b/uax29/Extensions/Extensions.Words.cs index 7274d65..3fed1f5 100644 --- a/uax29/Extensions/Extensions.Words.cs +++ b/uax29/Extensions/Extensions.Words.cs @@ -10,7 +10,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). /// - public static Tokenizer SplitWords(this Span input, Options options = Options.None) => Tokenizer.GetWords(input, options); + public static SplitEnumerator SplitWords(this Span input, Options options = Options.None) => Tokenizer.GetWords(input, options); /// /// Split the words in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ @@ -19,7 +19,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). /// - public static Tokenizer SplitWords(this ReadOnlySpan input, Options options = Options.None) => Tokenizer.GetWords(input, options); + public static SplitEnumerator SplitWords(this ReadOnlySpan input, Options options = Options.None) => Tokenizer.GetWords(input, options); /// /// Split the words in the given of UTF-8 encoded bytes. @@ -28,7 +28,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). /// - public static Tokenizer SplitWords(this Memory input, Options options = Options.None) => Tokenizer.GetWords(input, options); + public static SplitEnumerator SplitWords(this Memory input, Options options = Options.None) => Tokenizer.GetWords(input, options); /// /// Split the words in the given of UTF-8 encoded bytes. @@ -37,7 +37,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). 
/// - public static Tokenizer SplitWords(this ReadOnlyMemory input, Options options = Options.None) => Tokenizer.GetWords(input, options); + public static SplitEnumerator SplitWords(this ReadOnlyMemory input, Options options = Options.None) => Tokenizer.GetWords(input, options); /// /// Split the words in the given array of UTF-8 encoded bytes. @@ -46,7 +46,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). /// - public static Tokenizer SplitWords(this byte[] input, Options options = Options.None) => Tokenizer.GetWords(input, options); + public static SplitEnumerator SplitWords(this byte[] input, Options options = Options.None) => Tokenizer.GetWords(input, options); /// /// Split the words in the given string. @@ -55,7 +55,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). /// - public static Tokenizer SplitWords(this string input, Options options = Options.None) => Tokenizer.GetWords(input, options); + public static SplitEnumerator SplitWords(this string input, Options options = Options.None) => Tokenizer.GetWords(input, options); /// /// Split the words in the given string. @@ -64,7 +64,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). /// - public static Tokenizer SplitWords(this char[] input, Options options = Options.None) => Tokenizer.GetWords(input, options); + public static SplitEnumerator SplitWords(this char[] input, Options options = Options.None) => Tokenizer.GetWords(input, options); /// /// Split the words in the given of . @@ -74,7 +74,7 @@ public static partial class Extensions /// An enumerator of words. Use foreach (var word in words). 
/// /// - public static Tokenizer SplitWords(this Span input, Options options = Options.None) => Tokenizer.GetWords(input, options); + public static SplitEnumerator SplitWords(this Span input, Options options = Options.None) => Tokenizer.GetWords(input, options); /// /// Split the words in the given of . @@ -83,7 +83,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). /// - public static Tokenizer SplitWords(this ReadOnlySpan input, Options options = Options.None) => Tokenizer.GetWords(input, options); + public static SplitEnumerator SplitWords(this ReadOnlySpan input, Options options = Options.None) => Tokenizer.GetWords(input, options); /// /// Split the words in the given of . @@ -92,7 +92,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). /// - public static Tokenizer SplitWords(this Memory input, Options options = Options.None) => Tokenizer.GetWords(input, options); + public static SplitEnumerator SplitWords(this Memory input, Options options = Options.None) => Tokenizer.GetWords(input, options); /// /// Split the words in the given of . @@ -101,7 +101,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). /// - public static Tokenizer SplitWords(this ReadOnlyMemory input, Options options = Options.None) => Tokenizer.GetWords(input, options); + public static SplitEnumerator SplitWords(this ReadOnlyMemory input, Options options = Options.None) => Tokenizer.GetWords(input, options); /// /// Split the words in the given of UTF-8 encoded bytes. diff --git a/uax29/RangeTokenizer.cs b/uax29/RangeTokenizer.cs index 3eb131c..1612a1f 100644 --- a/uax29/RangeTokenizer.cs +++ b/uax29/RangeTokenizer.cs @@ -9,14 +9,14 @@ namespace UAX29; /// byte or char, indicating the type of the input, and by implication, the output. 
public ref struct RangeTokenizer where T : struct { - Tokenizer tokenizer; + SplitEnumerator tokenizer; bool begun = false; /// /// Tokenizer splits strings (or UTF-8 bytes) as words, sentences or graphemes, per the Unicode UAX #29 spec. /// /// A string, or UTF-8 byte array. - internal RangeTokenizer(Tokenizer tokenizer) + internal RangeTokenizer(SplitEnumerator tokenizer) { this.tokenizer = tokenizer; } diff --git a/uax29/Tokenizer.Graphemes.cs b/uax29/Tokenizer.Graphemes.cs index 7cb373a..e3828b6 100644 --- a/uax29/Tokenizer.Graphemes.cs +++ b/uax29/Tokenizer.Graphemes.cs @@ -9,7 +9,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer GetGraphemes(Span input) => new(input, Graphemes.SplitBytes); + public static SplitEnumerator GetGraphemes(Span input) => new(input, Graphemes.SplitBytes); /// /// Split the graphemes in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ @@ -18,7 +18,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer GetGraphemes(ReadOnlySpan input) => new(input, Graphemes.SplitBytes); + public static SplitEnumerator GetGraphemes(ReadOnlySpan input) => new(input, Graphemes.SplitBytes); /// /// Split the graphemes in the given of UTF-8 encoded bytes. @@ -27,7 +27,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer GetGraphemes(Memory input) => new(input.Span, Graphemes.SplitBytes); + public static SplitEnumerator GetGraphemes(Memory input) => new(input.Span, Graphemes.SplitBytes); /// /// Split the graphemes in the given of UTF-8 encoded bytes. @@ -36,7 +36,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). 
/// - public static Tokenizer GetGraphemes(ReadOnlyMemory input) => new(input.Span, Graphemes.SplitBytes); + public static SplitEnumerator GetGraphemes(ReadOnlyMemory input) => new(input.Span, Graphemes.SplitBytes); /// /// Split the graphemes in the given array of UTF-8 encoded bytes. @@ -45,7 +45,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer GetGraphemes(byte[] input) => new(input.AsSpan(), Graphemes.SplitBytes); + public static SplitEnumerator GetGraphemes(byte[] input) => new(input.AsSpan(), Graphemes.SplitBytes); /// /// Split the graphemes in the given string. @@ -54,7 +54,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer GetGraphemes(string input) => new(input.AsSpan(), Graphemes.SplitChars); + public static SplitEnumerator GetGraphemes(string input) => new(input.AsSpan(), Graphemes.SplitChars); /// /// Split the graphemes in the given string. @@ -63,7 +63,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer GetGraphemes(char[] input) => new(input.AsSpan(), Graphemes.SplitChars); + public static SplitEnumerator GetGraphemes(char[] input) => new(input.AsSpan(), Graphemes.SplitChars); /// /// Split the graphemes in the given of . @@ -72,8 +72,8 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - /// - public static Tokenizer GetGraphemes(Span input) => new(input, Graphemes.SplitChars); + /// + public static SplitEnumerator GetGraphemes(Span input) => new(input, Graphemes.SplitChars); /// /// Split the graphemes in the given of . @@ -82,7 +82,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). 
/// - public static Tokenizer GetGraphemes(ReadOnlySpan input) => new(input, Graphemes.SplitChars); + public static SplitEnumerator GetGraphemes(ReadOnlySpan input) => new(input, Graphemes.SplitChars); /// /// Split the graphemes in the given of . @@ -91,7 +91,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer GetGraphemes(Memory input) => new(input.Span, Graphemes.SplitChars); + public static SplitEnumerator GetGraphemes(Memory input) => new(input.Span, Graphemes.SplitChars); /// /// Split the graphemes in the given of . @@ -100,7 +100,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static Tokenizer GetGraphemes(ReadOnlyMemory input) => new(input.Span, Graphemes.SplitChars); + public static SplitEnumerator GetGraphemes(ReadOnlyMemory input) => new(input.Span, Graphemes.SplitChars); /// /// Split the graphemes in the given of UTF-8 encoded bytes. @@ -109,17 +109,17 @@ public static partial class Tokenizer /// /// Optional, the minimum bytes to buffer from the Stream. This determines the maximum grapheme token size. Tokens that exceed the bytes in the buffer /// will simply be cut off at this length, no error will occur. - /// + /// /// Default is 256 bytes. /// /// /// Optional, a byte array for underlying buffer storage. It must be at least as large at minBufferBytes. - /// + /// /// If not provided, storage of 2 * minBufferBytes will be allocated by default. - /// - /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferBytes allows fewer, larger reads the stream, + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferBytes allows fewer, larger reads the stream, /// which is more efficient, but will increase memory usage. 
- /// + /// /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. /// /// @@ -139,17 +139,17 @@ public static StreamTokenizer GetGraphemes(Stream stream, int minBufferByt /// /// Optional, the minimum chars to buffer from the reader. This determines the maximum grapheme token size. Tokens that exceed the chars in the buffer /// will simply be cut off at this length, no error will occur. - /// + /// /// Default is 256 chars. /// /// /// Optional, a char array for underlying buffer storage. It must be at least as large at minBufferChars. - /// + /// /// If not provided, storage of 2 * minBufferChars will be allocated by default. - /// - /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferChars allows fewer, larger reads the reader, + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferChars allows fewer, larger reads the reader, /// which is more efficient, but will increase memory usage. - /// + /// /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. /// /// diff --git a/uax29/Tokenizer.Sentences.cs b/uax29/Tokenizer.Sentences.cs index 0169ffd..6a1ed21 100644 --- a/uax29/Tokenizer.Sentences.cs +++ b/uax29/Tokenizer.Sentences.cs @@ -9,7 +9,7 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static Tokenizer GetSentences(Span input) => new(input, Sentences.SplitBytes); + public static SplitEnumerator GetSentences(Span input) => new(input, Sentences.SplitBytes); /// /// Split the sentences in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ @@ -18,7 +18,7 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). 
/// - public static Tokenizer GetSentences(ReadOnlySpan input) => new(input, Sentences.SplitBytes); + public static SplitEnumerator GetSentences(ReadOnlySpan input) => new(input, Sentences.SplitBytes); /// /// Split the sentences in the given of UTF-8 encoded bytes. @@ -27,7 +27,7 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static Tokenizer GetSentences(Memory input) => new(input.Span, Sentences.SplitBytes); + public static SplitEnumerator GetSentences(Memory input) => new(input.Span, Sentences.SplitBytes); /// /// Split the sentences in the given of UTF-8 encoded bytes. @@ -36,7 +36,7 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static Tokenizer GetSentences(ReadOnlyMemory input) => new(input.Span, Sentences.SplitBytes); + public static SplitEnumerator GetSentences(ReadOnlyMemory input) => new(input.Span, Sentences.SplitBytes); /// /// Split the sentences in the given array of UTF-8 encoded bytes. @@ -45,7 +45,7 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static Tokenizer GetSentences(byte[] input) => new(input.AsSpan(), Sentences.SplitBytes); + public static SplitEnumerator GetSentences(byte[] input) => new(input.AsSpan(), Sentences.SplitBytes); /// /// Split the sentences in the given string. @@ -54,7 +54,7 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static Tokenizer GetSentences(string input) => new(input.AsSpan(), Sentences.SplitChars); + public static SplitEnumerator GetSentences(string input) => new(input.AsSpan(), Sentences.SplitChars); /// /// Split the sentences in the given string. @@ -63,7 +63,7 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). 
/// - public static Tokenizer GetSentences(char[] input) => new(input.AsSpan(), Sentences.SplitChars); + public static SplitEnumerator GetSentences(char[] input) => new(input.AsSpan(), Sentences.SplitChars); /// /// Split the sentences in the given of . @@ -72,8 +72,8 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - /// - public static Tokenizer GetSentences(Span input) => new(input, Sentences.SplitChars); + /// + public static SplitEnumerator GetSentences(Span input) => new(input, Sentences.SplitChars); /// /// Split the sentences in the given of . @@ -82,7 +82,7 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static Tokenizer GetSentences(ReadOnlySpan input) => new(input, Sentences.SplitChars); + public static SplitEnumerator GetSentences(ReadOnlySpan input) => new(input, Sentences.SplitChars); /// /// Split the sentences in the given of . @@ -91,7 +91,7 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static Tokenizer GetSentences(Memory input) => new(input.Span, Sentences.SplitChars); + public static SplitEnumerator GetSentences(Memory input) => new(input.Span, Sentences.SplitChars); /// /// Split the sentences in the given of . @@ -100,7 +100,7 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static Tokenizer GetSentences(ReadOnlyMemory input) => new(input.Span, Sentences.SplitChars); + public static SplitEnumerator GetSentences(ReadOnlyMemory input) => new(input.Span, Sentences.SplitChars); @@ -111,17 +111,17 @@ public static partial class Tokenizer /// /// Optional, the minimum bytes to buffer from the Stream. This determines the maximum sentence token size. 
Tokens that exceed the bytes in the buffer /// will simply be cut off at this length, no error will occur. - /// + /// /// Default is 1024 bytes. /// /// /// Optional, a byte array for underlying buffer storage. It must be at least as large at minBufferBytes. - /// + /// /// If not provided, storage of 2 * minBufferBytes will be allocated by default. - /// - /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferBytes allows fewer, larger reads the stream, + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferBytes allows fewer, larger reads the stream, /// which is more efficient, but will increase memory usage. - /// + /// /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. /// /// @@ -141,17 +141,17 @@ public static StreamTokenizer GetSentences(Stream stream, int minBufferByt /// /// Optional, the minimum chars to buffer from the reader. This determines the maximum sentence token size. Tokens that exceed the chars in the buffer /// will simply be cut off at this length, no error will occur. - /// + /// /// Default is 1024 chars. /// /// /// Optional, a char array for underlying buffer storage. It must be at least as large at minBufferChars. - /// + /// /// If not provided, storage of 2 * minBufferChars will be allocated by default. - /// - /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferChars allows fewer, larger reads the reader, + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferChars allows fewer, larger reads the reader, /// which is more efficient, but will increase memory usage. - /// + /// /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. 
/// /// diff --git a/uax29/Tokenizer.Words.cs b/uax29/Tokenizer.Words.cs index 1755ebf..99d610d 100644 --- a/uax29/Tokenizer.Words.cs +++ b/uax29/Tokenizer.Words.cs @@ -9,7 +9,7 @@ public static partial class Tokenizer /// /// An enumerator of words. Use foreach (var word in words). /// - public static Tokenizer GetWords(Span input, Options options = Options.None) => new(input, Words.SplitBytes, options); + public static SplitEnumerator GetWords(Span input, Options options = Options.None) => new(input, Words.SplitBytes, options); /// /// Split the words in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ @@ -18,7 +18,7 @@ public static partial class Tokenizer /// /// An enumerator of words. Use foreach (var word in words). /// - public static Tokenizer GetWords(ReadOnlySpan input, Options options = Options.None) => new(input, Words.SplitBytes, options); + public static SplitEnumerator GetWords(ReadOnlySpan input, Options options = Options.None) => new(input, Words.SplitBytes, options); /// /// Split the words in the given of UTF-8 encoded bytes. @@ -27,7 +27,7 @@ public static partial class Tokenizer /// /// An enumerator of words. Use foreach (var word in words). /// - public static Tokenizer GetWords(Memory input, Options options = Options.None) => new(input.Span, Words.SplitBytes, options); + public static SplitEnumerator GetWords(Memory input, Options options = Options.None) => new(input.Span, Words.SplitBytes, options); /// /// Split the words in the given of UTF-8 encoded bytes. @@ -36,7 +36,7 @@ public static partial class Tokenizer /// /// An enumerator of words. Use foreach (var word in words). 
/// - public static Tokenizer GetWords(ReadOnlyMemory input, Options options = Options.None) => new(input.Span, Words.SplitBytes, options); + public static SplitEnumerator GetWords(ReadOnlyMemory input, Options options = Options.None) => new(input.Span, Words.SplitBytes, options); /// /// Split the words in the given array of UTF-8 encoded bytes. @@ -45,7 +45,7 @@ public static partial class Tokenizer /// /// An enumerator of words. Use foreach (var word in words). /// - public static Tokenizer GetWords(byte[] input, Options options = Options.None) => new(input.AsSpan(), Words.SplitBytes, options); + public static SplitEnumerator GetWords(byte[] input, Options options = Options.None) => new(input.AsSpan(), Words.SplitBytes, options); /// /// Split the words in the given string. @@ -54,7 +54,7 @@ public static partial class Tokenizer /// /// An enumerator of words. Use foreach (var word in words). /// - public static Tokenizer GetWords(string input, Options options = Options.None) => new(input.AsSpan(), Words.SplitChars, options); + public static SplitEnumerator GetWords(string input, Options options = Options.None) => new(input.AsSpan(), Words.SplitChars, options); /// /// Split the words in the given string. @@ -63,7 +63,7 @@ public static partial class Tokenizer /// /// An enumerator of words. Use foreach (var word in words). /// - public static Tokenizer GetWords(char[] input, Options options = Options.None) => new(input.AsSpan(), Words.SplitChars, options); + public static SplitEnumerator GetWords(char[] input, Options options = Options.None) => new(input.AsSpan(), Words.SplitChars, options); /// /// Split the words in the given of . @@ -73,7 +73,7 @@ public static partial class Tokenizer /// An enumerator of words. Use foreach (var word in words). 
/// /// - public static Tokenizer GetWords(Span input, Options options = Options.None) => new(input, Words.SplitChars, options); + public static SplitEnumerator GetWords(Span input, Options options = Options.None) => new(input, Words.SplitChars, options); /// /// Split the words in the given of . @@ -82,7 +82,7 @@ public static partial class Tokenizer /// /// An enumerator of words. Use foreach (var word in words). /// - public static Tokenizer GetWords(ReadOnlySpan input, Options options = Options.None) => new(input, Words.SplitChars, options); + public static SplitEnumerator GetWords(ReadOnlySpan input, Options options = Options.None) => new(input, Words.SplitChars, options); /// /// Split the words in the given of . @@ -91,7 +91,7 @@ public static partial class Tokenizer /// /// An enumerator of words. Use foreach (var word in words). /// - public static Tokenizer GetWords(Memory input, Options options = Options.None) => new(input.Span, Words.SplitChars, options); + public static SplitEnumerator GetWords(Memory input, Options options = Options.None) => new(input.Span, Words.SplitChars, options); /// /// Split the words in the given of . @@ -100,7 +100,7 @@ public static partial class Tokenizer /// /// An enumerator of words. Use foreach (var word in words). /// - public static Tokenizer GetWords(ReadOnlyMemory input, Options options = Options.None) => new(input.Span, Words.SplitChars, options); + public static SplitEnumerator GetWords(ReadOnlyMemory input, Options options = Options.None) => new(input.Span, Words.SplitChars, options); /// /// Split the words in the given of UTF-8 encoded bytes. diff --git a/uax29/Tokenizer.cs b/uax29/Tokenizer.cs index 24972fe..185b0b9 100644 --- a/uax29/Tokenizer.cs +++ b/uax29/Tokenizer.cs @@ -6,7 +6,7 @@ namespace UAX29; /// Splits an input string (UTF-8 or UTF-16) and provides an enumerator over the splits. /// /// byte or char, indicating the type of the input, and by implication, the output. 
-public ref struct Tokenizer where T : struct +public ref struct SplitEnumerator where T : struct { ReadOnlySpan input; @@ -31,7 +31,7 @@ namespace UAX29; /// A string, or UTF-8 byte array. /// A func/method meeting the Split delegate signature. /// Options for handling the input text. - internal Tokenizer(ReadOnlySpan input, Split split, Options options = Options.None) + internal SplitEnumerator(ReadOnlySpan input, Split split, Options options = Options.None) { this.input = input; this.split = split; @@ -78,7 +78,7 @@ public readonly ReadOnlySpan Current } } - public readonly Tokenizer GetEnumerator() + public readonly SplitEnumerator GetEnumerator() { return this; } diff --git a/uax29/Unicode.Test.cs b/uax29/Unicode.Test.cs index 71053c9..9639479 100644 --- a/uax29/Unicode.Test.cs +++ b/uax29/Unicode.Test.cs @@ -20,7 +20,7 @@ public void Setup() { } - internal static void TestTokenizerBytes(Tokenizer tokens, UnicodeTest test) + internal static void TestTokenizerBytes(SplitEnumerator tokens, UnicodeTest test) { var i = 0; foreach (var token in tokens) @@ -44,7 +44,7 @@ internal static void TestTokenizerStream(StreamTokenizer tokens, UnicodeTe } } - internal static void TestTokenizerChars(Tokenizer tokens, UnicodeTest test) + internal static void TestTokenizerChars(SplitEnumerator tokens, UnicodeTest test) { var i = 0; foreach (var token in tokens) @@ -68,11 +68,11 @@ internal static void TestTokenizerTextReader(StreamTokenizer tokens, Unico } } - private delegate Tokenizer ByteMethod(byte[] input); + private delegate SplitEnumerator ByteMethod(byte[] input); static readonly ByteMethod byteWords = (byte[] input) => Tokenizer.GetWords(input); // because of the optional parameter static readonly ByteMethod[] byteMethods = [byteWords, Tokenizer.GetGraphemes, Tokenizer.GetSentences]; - private delegate Tokenizer CharMethod(char[] input); + private delegate SplitEnumerator CharMethod(char[] input); static readonly CharMethod charWords = (char[] input) => 
Tokenizer.GetWords(input); // because of the optional parameter static readonly CharMethod[] charMethods = [charWords, Tokenizer.GetGraphemes, Tokenizer.GetSentences]; diff --git a/uax29/uax29.csproj b/uax29/uax29.csproj index 97ad1ba..5764d03 100644 --- a/uax29/uax29.csproj +++ b/uax29/uax29.csproj @@ -18,8 +18,8 @@ - - + + From 47107831f6f48c1ff9217e0db26100c6c4b27491 Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Sun, 14 Jul 2024 22:39:58 -0400 Subject: [PATCH 03/17] Renames --- uax29/Extensions/Extensions.Graphemes.cs | 4 ++-- uax29/Extensions/Extensions.Sentences.cs | 4 ++-- uax29/Extensions/Extensions.Words.cs | 4 ++-- uax29/{ => Legacy}/Tokenizer.Graphemes.cs | 8 ++++---- uax29/{ => Legacy}/Tokenizer.Sentences.cs | 8 ++++---- uax29/{ => Legacy}/Tokenizer.Words.cs | 8 ++++---- ...kenizer.Test.cs => RangeEnumerator.Test.cs} | 0 .../{RangeTokenizer.cs => RangeEnumerator.cs} | 12 ++++++------ ...kenizer.Test.cs => SplitEnumerator.Test.cs} | 2 +- uax29/{Tokenizer.cs => SplitEnumerator.cs} | 6 +++--- ...enizer.Test.cs => StreamEnumerator.Test.cs} | 0 ...{StreamTokenizer.cs => StreamEnumerator.cs} | 18 +++++++++--------- uax29/Unicode.Test.cs | 4 ++-- uax29/uax29.csproj | 4 ++++ 14 files changed, 43 insertions(+), 39 deletions(-) rename uax29/{ => Legacy}/Tokenizer.Graphemes.cs (92%) rename uax29/{ => Legacy}/Tokenizer.Sentences.cs (92%) rename uax29/{ => Legacy}/Tokenizer.Words.cs (92%) rename uax29/{RangeTokenizer.Test.cs => RangeEnumerator.Test.cs} (100%) rename uax29/{RangeTokenizer.cs => RangeEnumerator.cs} (77%) rename uax29/{Tokenizer.Test.cs => SplitEnumerator.Test.cs} (95%) rename uax29/{Tokenizer.cs => SplitEnumerator.cs} (93%) rename uax29/{StreamTokenizer.Test.cs => StreamEnumerator.Test.cs} (100%) rename uax29/{StreamTokenizer.cs => StreamEnumerator.cs} (71%) diff --git a/uax29/Extensions/Extensions.Graphemes.cs b/uax29/Extensions/Extensions.Graphemes.cs index 858a7c9..7c5c441 100644 --- a/uax29/Extensions/Extensions.Graphemes.cs +++ 
b/uax29/Extensions/Extensions.Graphemes.cs @@ -126,7 +126,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static StreamTokenizer SplitGraphemes(this Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) => Tokenizer.GetGraphemes(stream, minBufferBytes, bufferStorage); + public static StreamEnumerator SplitGraphemes(this Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) => Tokenizer.GetGraphemes(stream, minBufferBytes, bufferStorage); /// /// Split the graphemes in the given / . @@ -151,5 +151,5 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static StreamTokenizer SplitGraphemes(this TextReader stream, int minBufferChars = 1024, char[]? bufferStorage = null) => Tokenizer.GetGraphemes(stream, minBufferChars, bufferStorage); + public static StreamEnumerator SplitGraphemes(this TextReader stream, int minBufferChars = 1024, char[]? bufferStorage = null) => Tokenizer.GetGraphemes(stream, minBufferChars, bufferStorage); } diff --git a/uax29/Extensions/Extensions.Sentences.cs b/uax29/Extensions/Extensions.Sentences.cs index c776d22..08a6025 100644 --- a/uax29/Extensions/Extensions.Sentences.cs +++ b/uax29/Extensions/Extensions.Sentences.cs @@ -126,7 +126,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static StreamTokenizer SplitSentences(this Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) => Tokenizer.GetSentences(stream, minBufferBytes, bufferStorage); + public static StreamEnumerator SplitSentences(this Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) => Tokenizer.GetSentences(stream, minBufferBytes, bufferStorage); /// /// Split the graphemes in the given / . 
@@ -151,5 +151,5 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static StreamTokenizer SplitSentences(this TextReader stream, int minBufferChars = 1024, char[]? bufferStorage = null) => Tokenizer.GetSentences(stream, minBufferChars, bufferStorage); + public static StreamEnumerator SplitSentences(this TextReader stream, int minBufferChars = 1024, char[]? bufferStorage = null) => Tokenizer.GetSentences(stream, minBufferChars, bufferStorage); } diff --git a/uax29/Extensions/Extensions.Words.cs b/uax29/Extensions/Extensions.Words.cs index 3fed1f5..e7ac22e 100644 --- a/uax29/Extensions/Extensions.Words.cs +++ b/uax29/Extensions/Extensions.Words.cs @@ -126,7 +126,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). /// - public static StreamTokenizer SplitWords(this Stream stream, Options options = Options.None, int minBufferBytes = 1024, byte[]? bufferStorage = null) + public static StreamEnumerator SplitWords(this Stream stream, Options options = Options.None, int minBufferBytes = 1024, byte[]? bufferStorage = null) => Tokenizer.GetWords(stream, options, minBufferBytes, bufferStorage); /// @@ -152,6 +152,6 @@ public static StreamTokenizer SplitWords(this Stream stream, Options optio /// /// An enumerator of words. Use foreach (var word in words). /// - public static StreamTokenizer SplitWords(this TextReader stream, Options options = Options.None, int minBufferChars = 1024, char[]? bufferStorage = null) + public static StreamEnumerator SplitWords(this TextReader stream, Options options = Options.None, int minBufferChars = 1024, char[]? 
bufferStorage = null) => Tokenizer.GetWords(stream, options, minBufferChars, bufferStorage); } diff --git a/uax29/Tokenizer.Graphemes.cs b/uax29/Legacy/Tokenizer.Graphemes.cs similarity index 92% rename from uax29/Tokenizer.Graphemes.cs rename to uax29/Legacy/Tokenizer.Graphemes.cs index e3828b6..42dd1b7 100644 --- a/uax29/Tokenizer.Graphemes.cs +++ b/uax29/Legacy/Tokenizer.Graphemes.cs @@ -125,11 +125,11 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static StreamTokenizer GetGraphemes(Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) + public static StreamEnumerator GetGraphemes(Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) { bufferStorage ??= new byte[minBufferBytes * 2]; var buffer = new Buffer(stream.Read, minBufferBytes, bufferStorage); - return new StreamTokenizer(buffer, Graphemes.SplitBytes); + return new StreamEnumerator(buffer, Graphemes.SplitBytes); } /// @@ -155,10 +155,10 @@ public static StreamTokenizer GetGraphemes(Stream stream, int minBufferByt /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static StreamTokenizer GetGraphemes(TextReader stream, int minBufferChars = 1024, char[]? bufferStorage = null) + public static StreamEnumerator GetGraphemes(TextReader stream, int minBufferChars = 1024, char[]? 
bufferStorage = null) { bufferStorage ??= new char[minBufferChars * 2]; var buffer = new Buffer(stream.Read, minBufferChars, bufferStorage); - return new StreamTokenizer(buffer, Graphemes.SplitChars); + return new StreamEnumerator(buffer, Graphemes.SplitChars); } } diff --git a/uax29/Tokenizer.Sentences.cs b/uax29/Legacy/Tokenizer.Sentences.cs similarity index 92% rename from uax29/Tokenizer.Sentences.cs rename to uax29/Legacy/Tokenizer.Sentences.cs index 6a1ed21..ae70729 100644 --- a/uax29/Tokenizer.Sentences.cs +++ b/uax29/Legacy/Tokenizer.Sentences.cs @@ -127,11 +127,11 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static StreamTokenizer GetSentences(Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) + public static StreamEnumerator GetSentences(Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) { bufferStorage ??= new byte[minBufferBytes * 2]; var buffer = new Buffer(stream.Read, minBufferBytes, bufferStorage); - return new StreamTokenizer(buffer, Sentences.SplitBytes); + return new StreamEnumerator(buffer, Sentences.SplitBytes); } /// @@ -157,10 +157,10 @@ public static StreamTokenizer GetSentences(Stream stream, int minBufferByt /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - public static StreamTokenizer GetSentences(TextReader stream, int minBufferChars = 1024, char[]? bufferStorage = null) + public static StreamEnumerator GetSentences(TextReader stream, int minBufferChars = 1024, char[]? 
bufferStorage = null) { bufferStorage ??= new char[minBufferChars * 2]; var buffer = new Buffer(stream.Read, minBufferChars, bufferStorage); - return new StreamTokenizer(buffer, Sentences.SplitChars); + return new StreamEnumerator(buffer, Sentences.SplitChars); } } diff --git a/uax29/Tokenizer.Words.cs b/uax29/Legacy/Tokenizer.Words.cs similarity index 92% rename from uax29/Tokenizer.Words.cs rename to uax29/Legacy/Tokenizer.Words.cs index 99d610d..71a26e9 100644 --- a/uax29/Tokenizer.Words.cs +++ b/uax29/Legacy/Tokenizer.Words.cs @@ -125,11 +125,11 @@ public static partial class Tokenizer /// /// An enumerator of words. Use foreach (var word in words). /// - public static StreamTokenizer GetWords(Stream stream, Options options = Options.None, int minBufferBytes = 1024, byte[]? bufferStorage = null) + public static StreamEnumerator GetWords(Stream stream, Options options = Options.None, int minBufferBytes = 1024, byte[]? bufferStorage = null) { bufferStorage ??= new byte[minBufferBytes * 2]; var buffer = new Buffer(stream.Read, minBufferBytes, bufferStorage); - return new StreamTokenizer(buffer, Words.SplitBytes, options); + return new StreamEnumerator(buffer, Words.SplitBytes, options); } /// @@ -155,10 +155,10 @@ public static StreamTokenizer GetWords(Stream stream, Options options = Op /// /// An enumerator of words. Use foreach (var word in words). /// - public static StreamTokenizer GetWords(TextReader stream, Options options = Options.None, int minBufferChars = 1024, char[]? bufferStorage = null) + public static StreamEnumerator GetWords(TextReader stream, Options options = Options.None, int minBufferChars = 1024, char[]? 
bufferStorage = null) { bufferStorage ??= new char[minBufferChars * 2]; var buffer = new Buffer(stream.Read, minBufferChars, bufferStorage); - return new StreamTokenizer(buffer, Words.SplitChars, options); + return new StreamEnumerator(buffer, Words.SplitChars, options); } } diff --git a/uax29/RangeTokenizer.Test.cs b/uax29/RangeEnumerator.Test.cs similarity index 100% rename from uax29/RangeTokenizer.Test.cs rename to uax29/RangeEnumerator.Test.cs diff --git a/uax29/RangeTokenizer.cs b/uax29/RangeEnumerator.cs similarity index 77% rename from uax29/RangeTokenizer.cs rename to uax29/RangeEnumerator.cs index 1612a1f..f73f1d9 100644 --- a/uax29/RangeTokenizer.cs +++ b/uax29/RangeEnumerator.cs @@ -4,19 +4,19 @@ namespace UAX29; using Property = uint; /// -/// Tokenizer splits strings or UTF-8 bytes as words, sentences or graphemes, per the Unicode UAX #29 spec. +/// RangeEnumerator splits strings or UTF-8 bytes as words, sentences or graphemes, per the Unicode UAX #29 spec. /// /// byte or char, indicating the type of the input, and by implication, the output. -public ref struct RangeTokenizer where T : struct +public ref struct RangeEnumerator where T : struct { SplitEnumerator tokenizer; bool begun = false; /// - /// Tokenizer splits strings (or UTF-8 bytes) as words, sentences or graphemes, per the Unicode UAX #29 spec. + /// RangeEnumerator splits strings (or UTF-8 bytes) as words, sentences or graphemes, per the Unicode UAX #29 spec. /// /// A string, or UTF-8 byte array. - internal RangeTokenizer(SplitEnumerator tokenizer) + internal RangeEnumerator(SplitEnumerator tokenizer) { this.tokenizer = tokenizer; } @@ -42,7 +42,7 @@ public readonly Range Current } } - public readonly RangeTokenizer GetEnumerator() + public readonly RangeEnumerator GetEnumerator() { return this; } @@ -55,7 +55,7 @@ public readonly List ToList() { if (begun) { - throw new InvalidOperationException("ToArray must not be called after iteration has begun. 
You may wish to call Reset() on the tokenizer."); + throw new InvalidOperationException("ToArray must not be called after iteration has begun. You may wish to call Reset() on the enumerator."); } var result = new List(); diff --git a/uax29/Tokenizer.Test.cs b/uax29/SplitEnumerator.Test.cs similarity index 95% rename from uax29/Tokenizer.Test.cs rename to uax29/SplitEnumerator.Test.cs index 65179dc..3533ea1 100644 --- a/uax29/Tokenizer.Test.cs +++ b/uax29/SplitEnumerator.Test.cs @@ -283,7 +283,7 @@ public void ToList() Assert.That(list, Has.Count.EqualTo(i), "ToList should return the same number of tokens as iteration"); - // Tokenizer should reset back to the beginning + // Enumerator should reset back to the beginning Assert.That(tokens.start, Is.EqualTo(0)); Assert.That(tokens.end, Is.EqualTo(0)); diff --git a/uax29/Tokenizer.cs b/uax29/SplitEnumerator.cs similarity index 93% rename from uax29/Tokenizer.cs rename to uax29/SplitEnumerator.cs index 185b0b9..c3914a2 100644 --- a/uax29/Tokenizer.cs +++ b/uax29/SplitEnumerator.cs @@ -84,7 +84,7 @@ public readonly SplitEnumerator GetEnumerator() } /// - /// Resets the tokenizer back to the first token. + /// Resets the enumerator back to the first token. /// public void Reset() { @@ -144,11 +144,11 @@ public T[][] ToArray() /// An enumerator of Range. Use foreach to iterate over the ranges. Apply them to your original input /// using [range] or .AsSpan(range) to get the tokens. 
/// - public readonly RangeTokenizer Ranges + public readonly RangeEnumerator Ranges { get { - return new RangeTokenizer(this); + return new RangeEnumerator(this); } } } diff --git a/uax29/StreamTokenizer.Test.cs b/uax29/StreamEnumerator.Test.cs similarity index 100% rename from uax29/StreamTokenizer.Test.cs rename to uax29/StreamEnumerator.Test.cs diff --git a/uax29/StreamTokenizer.cs b/uax29/StreamEnumerator.cs similarity index 71% rename from uax29/StreamTokenizer.cs rename to uax29/StreamEnumerator.cs index 3da11a1..803351f 100644 --- a/uax29/StreamTokenizer.cs +++ b/uax29/StreamEnumerator.cs @@ -6,9 +6,9 @@ using Property = uint; /// -/// StreamTokenizer is a small data structure for splitting strings from Streams or TextReaders. It implements GetEnumerator. +/// StreamEnumerator is a small data structure for splitting strings from Streams or TextReaders. It implements GetEnumerator. /// -public ref struct StreamTokenizer where T : struct +public ref struct StreamEnumerator where T : struct { internal Buffer buffer; readonly Split split; @@ -27,11 +27,11 @@ bool begun = false; /// - /// StreamTokenizer is a small data structure for splitting strings. + /// StreamEnumerator is a small data structure for splitting strings. /// /// For backing storage, typically created from a Stream or TextReader. /// A delegate that does the tokenizing. See Split for details. - internal StreamTokenizer(Buffer buffer, Split split, Options options = Options.None) + internal StreamEnumerator(Buffer buffer, Split split, Options options = Options.None) { this.buffer = buffer; this.split = split; @@ -72,7 +72,7 @@ public ReadOnlySpan Current } } - public readonly StreamTokenizer GetEnumerator() + public readonly StreamEnumerator GetEnumerator() { return this; } @@ -115,19 +115,19 @@ public readonly T[][] ToArray() public static class StreamExtensions { /// - /// Resets an existing tokenizer with a new stream. 
You might choose this as an optimization, as it will re-use a buffer, avoiding allocations. + /// Resets an existing StreamEnumerator with a new stream. You might choose this as an optimization, as it will re-use a buffer, avoiding allocations. /// /// The new stream - public static void SetStream(ref this StreamTokenizer tokenizer, Stream stream) + public static void SetStream(ref this StreamEnumerator tokenizer, Stream stream) { tokenizer.buffer.SetRead(stream.Read); } /// - /// Resets an existing tokenizer with a new stream. You might choose this as an optimization, as it will re-use a buffer, avoiding allocations. + /// Resets an existing StreamEnumerator with a new stream. You might choose this as an optimization, as it will re-use a buffer, avoiding allocations. /// /// The new stream - public static void SetStream(ref this StreamTokenizer tokenizer, TextReader stream) + public static void SetStream(ref this StreamEnumerator tokenizer, TextReader stream) { tokenizer.buffer.SetRead(stream.Read); } diff --git a/uax29/Unicode.Test.cs b/uax29/Unicode.Test.cs index 9639479..f0f343b 100644 --- a/uax29/Unicode.Test.cs +++ b/uax29/Unicode.Test.cs @@ -32,7 +32,7 @@ internal static void TestTokenizerBytes(SplitEnumerator tokens, UnicodeTes } } - internal static void TestTokenizerStream(StreamTokenizer tokens, UnicodeTest test) + internal static void TestTokenizerStream(StreamEnumerator tokens, UnicodeTest test) { var i = 0; foreach (var token in tokens) @@ -56,7 +56,7 @@ internal static void TestTokenizerChars(SplitEnumerator tokens, UnicodeTes } } - internal static void TestTokenizerTextReader(StreamTokenizer tokens, UnicodeTest test) + internal static void TestTokenizerTextReader(StreamEnumerator tokens, UnicodeTest test) { var i = 0; foreach (var token in tokens) diff --git a/uax29/uax29.csproj b/uax29/uax29.csproj index 5764d03..ca9331e 100644 --- a/uax29/uax29.csproj +++ b/uax29/uax29.csproj @@ -49,4 +49,8 @@ + + + + From 2b6c7bbb3cae1e437362ec157689f8248618d988 
Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Mon, 15 Jul 2024 10:41:39 -0400 Subject: [PATCH 04/17] Renames & obsolete --- Benchmarks/Program.cs | 14 +- uax29/Extensions/Extensions.Graphemes.cs | 26 ++-- uax29/Extensions/Extensions.Sentences.cs | 26 ++-- uax29/Extensions/Extensions.Words.cs | 26 ++-- uax29/Graphemes.Test.cs | 92 ++++++------- uax29/Legacy/Tokenizer.Graphemes.cs | 13 ++ uax29/Legacy/Tokenizer.Sentences.cs | 16 ++- uax29/Legacy/Tokenizer.Words.cs | 14 +- uax29/README.md | 16 ++- uax29/RangeEnumerator.Test.cs | 14 +- uax29/Sentences.Test.cs | 92 ++++++------- uax29/Split.Graphemes.cs | 164 ++++++++++++++++++++++ uax29/Split.Sentences.cs | 166 +++++++++++++++++++++++ uax29/Split.Words.cs | 164 ++++++++++++++++++++++ uax29/SplitEnumerator.Test.cs | 100 +++++++------- uax29/StreamEnumerator.Test.cs | 28 ++-- uax29/Unicode.Test.cs | 8 +- uax29/Words.Test.cs | 92 ++++++------- 18 files changed, 805 insertions(+), 266 deletions(-) create mode 100644 uax29/Split.Graphemes.cs create mode 100644 uax29/Split.Sentences.cs create mode 100644 uax29/Split.Words.cs diff --git a/Benchmarks/Program.cs b/Benchmarks/Program.cs index 31e5afa..93a7e22 100644 --- a/Benchmarks/Program.cs +++ b/Benchmarks/Program.cs @@ -41,7 +41,7 @@ public void Setup() [Benchmark] public void TokenizeBytes() { - var tokens = Tokenizer.GetWords(sample); + var tokens = Split.Words(sample); foreach (var token in tokens) { } @@ -50,7 +50,7 @@ public void TokenizeBytes() [Benchmark] public void TokenizeBytesOmitWhitespace() { - var tokens = Tokenizer.GetWords(sample, Options.OmitWhitespace); + var tokens = Split.Words(sample, Options.OmitWhitespace); foreach (var token in tokens) { } @@ -59,7 +59,7 @@ public void TokenizeBytesOmitWhitespace() [Benchmark] public void TokenizeString() { - var tokens = Tokenizer.GetWords(sampleStr); + var tokens = Split.Words(sampleStr); foreach (var token in tokens) { } @@ -68,7 +68,7 @@ public void TokenizeString() [Benchmark] public void 
TokenizeStringOmitWhitespace() { - var tokens = Tokenizer.GetWords(sampleStr, Options.OmitWhitespace); + var tokens = Split.Words(sampleStr, Options.OmitWhitespace); foreach (var token in tokens) { } @@ -78,7 +78,7 @@ public void TokenizeStringOmitWhitespace() public void TokenizeStream() { var stream = new MemoryStream(sample); - var tokens = Tokenizer.GetWords(stream); + var tokens = Split.Words(stream); foreach (var token in tokens) { } @@ -90,7 +90,7 @@ public void TokenizeSetStream() // This is to test to observe allocations. // The creation will allocate a buffer of 1024 bytes - var tokens = Tokenizer.GetWords(sampleStream); + var tokens = Split.Words(sampleStream); var runs = 10; // keep in mind the 10 runs when interpreting the benchmark @@ -117,7 +117,7 @@ public void StringInfoGraphemes() [Benchmark] public void TokenizerGraphemes() { - var tokens = Tokenizer.GetGraphemes(sample); + var tokens = Split.Graphemes(sample); foreach (var token in tokens) { } diff --git a/uax29/Extensions/Extensions.Graphemes.cs b/uax29/Extensions/Extensions.Graphemes.cs index 7c5c441..9909691 100644 --- a/uax29/Extensions/Extensions.Graphemes.cs +++ b/uax29/Extensions/Extensions.Graphemes.cs @@ -10,7 +10,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitGraphemes(this Span input) => Tokenizer.GetGraphemes(input); + public static SplitEnumerator SplitGraphemes(this Span input) => Split.Graphemes(input); /// /// Split the graphemes in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ @@ -19,7 +19,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). 
/// - public static SplitEnumerator SplitGraphemes(this ReadOnlySpan input) => Tokenizer.GetGraphemes(input); + public static SplitEnumerator SplitGraphemes(this ReadOnlySpan input) => Split.Graphemes(input); /// /// Split the graphemes in the given of UTF-8 encoded bytes. @@ -28,7 +28,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitGraphemes(this Memory input) => Tokenizer.GetGraphemes(input); + public static SplitEnumerator SplitGraphemes(this Memory input) => Split.Graphemes(input); /// /// Split the graphemes in the given of UTF-8 encoded bytes. @@ -37,7 +37,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitGraphemes(this ReadOnlyMemory input) => Tokenizer.GetGraphemes(input); + public static SplitEnumerator SplitGraphemes(this ReadOnlyMemory input) => Split.Graphemes(input); /// /// Split the graphemes in the given array of UTF-8 encoded bytes. @@ -46,7 +46,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitGraphemes(this byte[] input) => Tokenizer.GetGraphemes(input); + public static SplitEnumerator SplitGraphemes(this byte[] input) => Split.Graphemes(input); /// /// Split the graphemes in the given string. @@ -55,7 +55,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitGraphemes(this string input) => Tokenizer.GetGraphemes(input); + public static SplitEnumerator SplitGraphemes(this string input) => Split.Graphemes(input); /// /// Split the graphemes in the given string. @@ -64,7 +64,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). 
/// - public static SplitEnumerator SplitGraphemes(this char[] input) => Tokenizer.GetGraphemes(input); + public static SplitEnumerator SplitGraphemes(this char[] input) => Split.Graphemes(input); /// /// Split the graphemes in the given of . @@ -74,7 +74,7 @@ public static partial class Extensions /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// /// - public static SplitEnumerator SplitGraphemes(this Span input) => Tokenizer.GetGraphemes(input); + public static SplitEnumerator SplitGraphemes(this Span input) => Split.Graphemes(input); /// /// Split the graphemes in the given of . @@ -83,7 +83,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitGraphemes(this ReadOnlySpan input) => Tokenizer.GetGraphemes(input); + public static SplitEnumerator SplitGraphemes(this ReadOnlySpan input) => Split.Graphemes(input); /// /// Split the graphemes in the given of . @@ -92,7 +92,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitGraphemes(this Memory input) => Tokenizer.GetGraphemes(input); + public static SplitEnumerator SplitGraphemes(this Memory input) => Split.Graphemes(input); /// /// Split the graphemes in the given of . @@ -101,7 +101,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitGraphemes(this ReadOnlyMemory input) => Tokenizer.GetGraphemes(input); + public static SplitEnumerator SplitGraphemes(this ReadOnlyMemory input) => Split.Graphemes(input); /// /// Split the graphemes in the given of UTF-8 encoded bytes. @@ -126,7 +126,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). 
/// - public static StreamEnumerator SplitGraphemes(this Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) => Tokenizer.GetGraphemes(stream, minBufferBytes, bufferStorage); + public static StreamEnumerator SplitGraphemes(this Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) => Split.Graphemes(stream, minBufferBytes, bufferStorage); /// /// Split the graphemes in the given / . @@ -151,5 +151,5 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static StreamEnumerator SplitGraphemes(this TextReader stream, int minBufferChars = 1024, char[]? bufferStorage = null) => Tokenizer.GetGraphemes(stream, minBufferChars, bufferStorage); + public static StreamEnumerator SplitGraphemes(this TextReader stream, int minBufferChars = 1024, char[]? bufferStorage = null) => Split.Graphemes(stream, minBufferChars, bufferStorage); } diff --git a/uax29/Extensions/Extensions.Sentences.cs b/uax29/Extensions/Extensions.Sentences.cs index 08a6025..173ec3c 100644 --- a/uax29/Extensions/Extensions.Sentences.cs +++ b/uax29/Extensions/Extensions.Sentences.cs @@ -10,7 +10,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitSentences(this Span input) => Tokenizer.GetSentences(input); + public static SplitEnumerator SplitSentences(this Span input) => Split.Sentences(input); /// /// Split the graphemes in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ @@ -19,7 +19,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). 
/// - public static SplitEnumerator SplitSentences(this ReadOnlySpan input) => Tokenizer.GetSentences(input); + public static SplitEnumerator SplitSentences(this ReadOnlySpan input) => Split.Sentences(input); /// /// Split the graphemes in the given of UTF-8 encoded bytes. @@ -28,7 +28,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitSentences(this Memory input) => Tokenizer.GetSentences(input); + public static SplitEnumerator SplitSentences(this Memory input) => Split.Sentences(input); /// /// Split the graphemes in the given of UTF-8 encoded bytes. @@ -37,7 +37,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitSentences(this ReadOnlyMemory input) => Tokenizer.GetSentences(input); + public static SplitEnumerator SplitSentences(this ReadOnlyMemory input) => Split.Sentences(input); /// /// Split the graphemes in the given array of UTF-8 encoded bytes. @@ -46,7 +46,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitSentences(this byte[] input) => Tokenizer.GetSentences(input); + public static SplitEnumerator SplitSentences(this byte[] input) => Split.Sentences(input); /// /// Split the graphemes in the given string. @@ -55,7 +55,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitSentences(this string input) => Tokenizer.GetSentences(input); + public static SplitEnumerator SplitSentences(this string input) => Split.Sentences(input); /// /// Split the graphemes in the given string. @@ -64,7 +64,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). 
/// - public static SplitEnumerator SplitSentences(this char[] input) => Tokenizer.GetSentences(input); + public static SplitEnumerator SplitSentences(this char[] input) => Split.Sentences(input); /// /// Split the graphemes in the given of . @@ -74,7 +74,7 @@ public static partial class Extensions /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// /// - public static SplitEnumerator SplitSentences(this Span input) => Tokenizer.GetSentences(input); + public static SplitEnumerator SplitSentences(this Span input) => Split.Sentences(input); /// /// Split the graphemes in the given of . @@ -83,7 +83,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitSentences(this ReadOnlySpan input) => Tokenizer.GetSentences(input); + public static SplitEnumerator SplitSentences(this ReadOnlySpan input) => Split.Sentences(input); /// /// Split the graphemes in the given of . @@ -92,7 +92,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitSentences(this Memory input) => Tokenizer.GetSentences(input); + public static SplitEnumerator SplitSentences(this Memory input) => Split.Sentences(input); /// /// Split the graphemes in the given of . @@ -101,7 +101,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitSentences(this ReadOnlyMemory input) => Tokenizer.GetSentences(input); + public static SplitEnumerator SplitSentences(this ReadOnlyMemory input) => Split.Sentences(input); /// /// Split the graphemes in the given of UTF-8 encoded bytes. @@ -126,7 +126,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). 
/// - public static StreamEnumerator SplitSentences(this Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) => Tokenizer.GetSentences(stream, minBufferBytes, bufferStorage); + public static StreamEnumerator SplitSentences(this Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) => Split.Sentences(stream, minBufferBytes, bufferStorage); /// /// Split the graphemes in the given / . @@ -151,5 +151,5 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static StreamEnumerator SplitSentences(this TextReader stream, int minBufferChars = 1024, char[]? bufferStorage = null) => Tokenizer.GetSentences(stream, minBufferChars, bufferStorage); + public static StreamEnumerator SplitSentences(this TextReader stream, int minBufferChars = 1024, char[]? bufferStorage = null) => Split.Sentences(stream, minBufferChars, bufferStorage); } diff --git a/uax29/Extensions/Extensions.Words.cs b/uax29/Extensions/Extensions.Words.cs index e7ac22e..efeb3b7 100644 --- a/uax29/Extensions/Extensions.Words.cs +++ b/uax29/Extensions/Extensions.Words.cs @@ -10,7 +10,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). /// - public static SplitEnumerator SplitWords(this Span input, Options options = Options.None) => Tokenizer.GetWords(input, options); + public static SplitEnumerator SplitWords(this Span input, Options options = Options.None) => Split.Words(input, options); /// /// Split the words in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ @@ -19,7 +19,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). 
/// - public static SplitEnumerator SplitWords(this ReadOnlySpan input, Options options = Options.None) => Tokenizer.GetWords(input, options); + public static SplitEnumerator SplitWords(this ReadOnlySpan input, Options options = Options.None) => Split.Words(input, options); /// /// Split the words in the given of UTF-8 encoded bytes. @@ -28,7 +28,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). /// - public static SplitEnumerator SplitWords(this Memory input, Options options = Options.None) => Tokenizer.GetWords(input, options); + public static SplitEnumerator SplitWords(this Memory input, Options options = Options.None) => Split.Words(input, options); /// /// Split the words in the given of UTF-8 encoded bytes. @@ -37,7 +37,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). /// - public static SplitEnumerator SplitWords(this ReadOnlyMemory input, Options options = Options.None) => Tokenizer.GetWords(input, options); + public static SplitEnumerator SplitWords(this ReadOnlyMemory input, Options options = Options.None) => Split.Words(input, options); /// /// Split the words in the given array of UTF-8 encoded bytes. @@ -46,7 +46,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). /// - public static SplitEnumerator SplitWords(this byte[] input, Options options = Options.None) => Tokenizer.GetWords(input, options); + public static SplitEnumerator SplitWords(this byte[] input, Options options = Options.None) => Split.Words(input, options); /// /// Split the words in the given string. @@ -55,7 +55,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). 
/// - public static SplitEnumerator SplitWords(this string input, Options options = Options.None) => Tokenizer.GetWords(input, options); + public static SplitEnumerator SplitWords(this string input, Options options = Options.None) => Split.Words(input, options); /// /// Split the words in the given string. @@ -64,7 +64,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). /// - public static SplitEnumerator SplitWords(this char[] input, Options options = Options.None) => Tokenizer.GetWords(input, options); + public static SplitEnumerator SplitWords(this char[] input, Options options = Options.None) => Split.Words(input, options); /// /// Split the words in the given of . @@ -74,7 +74,7 @@ public static partial class Extensions /// An enumerator of words. Use foreach (var word in words). /// /// - public static SplitEnumerator SplitWords(this Span input, Options options = Options.None) => Tokenizer.GetWords(input, options); + public static SplitEnumerator SplitWords(this Span input, Options options = Options.None) => Split.Words(input, options); /// /// Split the words in the given of . @@ -83,7 +83,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). /// - public static SplitEnumerator SplitWords(this ReadOnlySpan input, Options options = Options.None) => Tokenizer.GetWords(input, options); + public static SplitEnumerator SplitWords(this ReadOnlySpan input, Options options = Options.None) => Split.Words(input, options); /// /// Split the words in the given of . @@ -92,7 +92,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). 
/// - public static SplitEnumerator SplitWords(this Memory input, Options options = Options.None) => Tokenizer.GetWords(input, options); + public static SplitEnumerator SplitWords(this Memory input, Options options = Options.None) => Split.Words(input, options); /// /// Split the words in the given of . @@ -101,7 +101,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). /// - public static SplitEnumerator SplitWords(this ReadOnlyMemory input, Options options = Options.None) => Tokenizer.GetWords(input, options); + public static SplitEnumerator SplitWords(this ReadOnlyMemory input, Options options = Options.None) => Split.Words(input, options); /// /// Split the words in the given of UTF-8 encoded bytes. @@ -127,7 +127,7 @@ public static partial class Extensions /// An enumerator of words. Use foreach (var word in words). /// public static StreamEnumerator SplitWords(this Stream stream, Options options = Options.None, int minBufferBytes = 1024, byte[]? bufferStorage = null) - => Tokenizer.GetWords(stream, options, minBufferBytes, bufferStorage); + => Split.Words(stream, options, minBufferBytes, bufferStorage); /// /// Split the words in the given / . @@ -153,5 +153,5 @@ public static StreamEnumerator SplitWords(this Stream stream, Options opti /// An enumerator of words. Use foreach (var word in words). /// public static StreamEnumerator SplitWords(this TextReader stream, Options options = Options.None, int minBufferChars = 1024, char[]? 
bufferStorage = null) - => Tokenizer.GetWords(stream, options, minBufferChars, bufferStorage); + => Split.Words(stream, options, minBufferChars, bufferStorage); } diff --git a/uax29/Graphemes.Test.cs b/uax29/Graphemes.Test.cs index 4bde5fc..f891573 100644 --- a/uax29/Graphemes.Test.cs +++ b/uax29/Graphemes.Test.cs @@ -1,13 +1,13 @@ // generated from https://www.unicode.org/Public/15.0.0/ucd/auxiliary/GraphemeBreakTest.txt -namespace Tests; - -using System.Text; -using UAX29; - -[TestFixture] -public class GraphemesTests -{ - internal readonly static UnicodeTest[] UnicodeTests = [ +namespace Tests; + +using System.Text; +using UAX29; + +[TestFixture] +public class GraphemesTests +{ + internal readonly static UnicodeTest[] UnicodeTests = [ new([0x0020, 0x0020], [[0x0020], [0x0020]], "÷ [0.2] SPACE (Other) ÷ [999.0] SPACE (Other) ÷ [0.3]"), new([0x0020, 0x00CC, 0x0088, 0x0020], [[0x0020, 0x00CC, 0x0088], [0x0020]], "÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend_ExtCccZwj) ÷ [999.0] SPACE (Other) ÷ [0.3]"), new([0x0020, 0x000D], [[0x0020], [0x000D]], "÷ [0.2] SPACE (Other) ÷ [5.0] (CR) ÷ [0.3]"), @@ -610,40 +610,40 @@ public class GraphemesTests new([0x0061, 0x00E2, 0x0080, 0x008D, 0x00F0, 0x009F, 0x009B, 0x0091], [[0x0061, 0x00E2, 0x0080, 0x008D], [0x00F0, 0x009F, 0x009B, 0x0091]], "÷ [0.2] LATIN SMALL LETTER A (Other) × [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) ÷ [999.0] OCTAGONAL SIGN (ExtPict) ÷ [0.3]"), new([0x00E2, 0x009C, 0x0081, 0x00E2, 0x0080, 0x008D, 0x00E2, 0x009C, 0x0081], [[0x00E2, 0x009C, 0x0081, 0x00E2, 0x0080, 0x008D, 0x00E2, 0x009C, 0x0081]], "÷ [0.2] UPPER BLADE SCISSORS (Other) × [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) × [11.0] UPPER BLADE SCISSORS (Other) ÷ [0.3]"), new([0x0061, 0x00E2, 0x0080, 0x008D, 0x00E2, 0x009C, 0x0081], [[0x0061, 0x00E2, 0x0080, 0x008D], [0x00E2, 0x009C, 0x0081]], "÷ [0.2] LATIN SMALL LETTER A (Other) × [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) ÷ [999.0] UPPER BLADE SCISSORS (Other) ÷ [0.3]"), - - ]; - - static 
readonly UnicodeTest[] Tests = UnicodeTests; - - [Test, TestCaseSource(nameof(Tests))] - public void Bytes(UnicodeTest test) - { - var tokens = Tokenizer.GetGraphemes(test.input); - TestUnicode.TestTokenizerBytes(tokens, test); - } - - [Test, TestCaseSource(nameof(Tests))] - public void String(UnicodeTest test) - { - var s = Encoding.UTF8.GetString(test.input); - var tokens = Tokenizer.GetGraphemes(s); - TestUnicode.TestTokenizerChars(tokens, test); - } - - [Test, TestCaseSource(nameof(Tests))] - public void Stream(UnicodeTest test) - { - using var stream = new MemoryStream(test.input); - var tokens = Tokenizer.GetGraphemes(stream); - TestUnicode.TestTokenizerStream(tokens, test); - } - - [Test, TestCaseSource(nameof(Tests))] - public void TextReader(UnicodeTest test) - { - using var stream = new MemoryStream(test.input); - using var reader = new StreamReader(stream); - var tokens = Tokenizer.GetGraphemes(reader); - TestUnicode.TestTokenizerTextReader(tokens, test); - } -} + + ]; + + static readonly UnicodeTest[] Tests = UnicodeTests; + + [Test, TestCaseSource(nameof(Tests))] + public void Bytes(UnicodeTest test) + { + var tokens = Split.Graphemes(test.input); + TestUnicode.TestTokenizerBytes(tokens, test); + } + + [Test, TestCaseSource(nameof(Tests))] + public void String(UnicodeTest test) + { + var s = Encoding.UTF8.GetString(test.input); + var tokens = Split.Graphemes(s); + TestUnicode.TestTokenizerChars(tokens, test); + } + + [Test, TestCaseSource(nameof(Tests))] + public void Stream(UnicodeTest test) + { + using var stream = new MemoryStream(test.input); + var tokens = Split.Graphemes(stream); + TestUnicode.TestTokenizerStream(tokens, test); + } + + [Test, TestCaseSource(nameof(Tests))] + public void TextReader(UnicodeTest test) + { + using var stream = new MemoryStream(test.input); + using var reader = new StreamReader(stream); + var tokens = Split.Graphemes(reader); + TestUnicode.TestTokenizerTextReader(tokens, test); + } +} diff --git 
a/uax29/Legacy/Tokenizer.Graphemes.cs b/uax29/Legacy/Tokenizer.Graphemes.cs index 42dd1b7..3a331a3 100644 --- a/uax29/Legacy/Tokenizer.Graphemes.cs +++ b/uax29/Legacy/Tokenizer.Graphemes.cs @@ -9,6 +9,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// + [Obsolete("Use Split.Graphemes(input) or input.SplitGraphemes()")] public static SplitEnumerator GetGraphemes(Span input) => new(input, Graphemes.SplitBytes); /// @@ -18,6 +19,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// + [Obsolete("Use Split.Graphemes(input) or input.SplitGraphemes()")] public static SplitEnumerator GetGraphemes(ReadOnlySpan input) => new(input, Graphemes.SplitBytes); /// @@ -27,6 +29,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// + [Obsolete("Use Split.Graphemes(input) or input.SplitGraphemes()")] public static SplitEnumerator GetGraphemes(Memory input) => new(input.Span, Graphemes.SplitBytes); /// @@ -36,6 +39,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// + [Obsolete("Use Split.Graphemes(input) or input.SplitGraphemes()")] public static SplitEnumerator GetGraphemes(ReadOnlyMemory input) => new(input.Span, Graphemes.SplitBytes); /// @@ -45,6 +49,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// + [Obsolete("Use Split.Graphemes(input) or input.SplitGraphemes()")] public static SplitEnumerator GetGraphemes(byte[] input) => new(input.AsSpan(), Graphemes.SplitBytes); /// @@ -54,6 +59,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). 
/// + [Obsolete("Use Split.Graphemes(input) or input.SplitGraphemes()")] public static SplitEnumerator GetGraphemes(string input) => new(input.AsSpan(), Graphemes.SplitChars); /// @@ -63,6 +69,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// + [Obsolete("Use Split.Graphemes(input) or input.SplitGraphemes()")] public static SplitEnumerator GetGraphemes(char[] input) => new(input.AsSpan(), Graphemes.SplitChars); /// @@ -73,6 +80,7 @@ public static partial class Tokenizer /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// /// + [Obsolete("Use Split.Graphemes(input) or input.SplitGraphemes()")] public static SplitEnumerator GetGraphemes(Span input) => new(input, Graphemes.SplitChars); /// @@ -82,6 +90,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// + [Obsolete("Use Split.Graphemes(input) or input.SplitGraphemes()")] public static SplitEnumerator GetGraphemes(ReadOnlySpan input) => new(input, Graphemes.SplitChars); /// @@ -91,6 +100,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// + [Obsolete("Use Split.Graphemes(input) or input.SplitGraphemes()")] public static SplitEnumerator GetGraphemes(Memory input) => new(input.Span, Graphemes.SplitChars); /// @@ -100,6 +110,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// + [Obsolete("Use Split.Graphemes(input) or input.SplitGraphemes()")] public static SplitEnumerator GetGraphemes(ReadOnlyMemory input) => new(input.Span, Graphemes.SplitChars); /// @@ -125,6 +136,7 @@ public static partial class Tokenizer /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). 
/// + [Obsolete("Use Split.Graphemes(stream) or stream.SplitGraphemes()")] public static StreamEnumerator GetGraphemes(Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) { bufferStorage ??= new byte[minBufferBytes * 2]; @@ -155,6 +167,7 @@ public static StreamEnumerator GetGraphemes(Stream stream, int minBufferBy /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// + [Obsolete("Use Split.Graphemes(stream) or stream.SplitGraphemes()")] public static StreamEnumerator GetGraphemes(TextReader stream, int minBufferChars = 1024, char[]? bufferStorage = null) { bufferStorage ??= new char[minBufferChars * 2]; diff --git a/uax29/Legacy/Tokenizer.Sentences.cs b/uax29/Legacy/Tokenizer.Sentences.cs index ae70729..d02a36b 100644 --- a/uax29/Legacy/Tokenizer.Sentences.cs +++ b/uax29/Legacy/Tokenizer.Sentences.cs @@ -9,6 +9,7 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] public static SplitEnumerator GetSentences(Span input) => new(input, Sentences.SplitBytes); /// @@ -18,6 +19,7 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] public static SplitEnumerator GetSentences(ReadOnlySpan input) => new(input, Sentences.SplitBytes); /// @@ -27,6 +29,7 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] public static SplitEnumerator GetSentences(Memory input) => new(input.Span, Sentences.SplitBytes); /// @@ -36,6 +39,7 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). 
/// + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] public static SplitEnumerator GetSentences(ReadOnlyMemory input) => new(input.Span, Sentences.SplitBytes); /// @@ -45,6 +49,7 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] public static SplitEnumerator GetSentences(byte[] input) => new(input.AsSpan(), Sentences.SplitBytes); /// @@ -54,6 +59,7 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] public static SplitEnumerator GetSentences(string input) => new(input.AsSpan(), Sentences.SplitChars); /// @@ -63,6 +69,7 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] public static SplitEnumerator GetSentences(char[] input) => new(input.AsSpan(), Sentences.SplitChars); /// @@ -72,7 +79,7 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// - /// + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] public static SplitEnumerator GetSentences(Span input) => new(input, Sentences.SplitChars); /// @@ -82,6 +89,7 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] public static SplitEnumerator GetSentences(ReadOnlySpan input) => new(input, Sentences.SplitChars); /// @@ -91,6 +99,7 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). 
/// + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] public static SplitEnumerator GetSentences(Memory input) => new(input.Span, Sentences.SplitChars); /// @@ -100,10 +109,9 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// + [Obsolete("Use Split.Sentences(input) or input.SplitSentences()")] public static SplitEnumerator GetSentences(ReadOnlyMemory input) => new(input.Span, Sentences.SplitChars); - - /// /// Split the sentences in the given of UTF-8 encoded bytes. /// @@ -127,6 +135,7 @@ public static partial class Tokenizer /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// + [Obsolete("Use Split.Sentences(stream) or stream.SplitSentences()")] public static StreamEnumerator GetSentences(Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) { bufferStorage ??= new byte[minBufferBytes * 2]; @@ -157,6 +166,7 @@ public static StreamEnumerator GetSentences(Stream stream, int minBufferBy /// /// An enumerator of sentences. Use foreach (var sentence in sentences). /// + [Obsolete("Use Split.Sentences(stream) or stream.SplitSentences()")] public static StreamEnumerator GetSentences(TextReader stream, int minBufferChars = 1024, char[]? bufferStorage = null) { bufferStorage ??= new char[minBufferChars * 2]; diff --git a/uax29/Legacy/Tokenizer.Words.cs b/uax29/Legacy/Tokenizer.Words.cs index 71a26e9..e245d67 100644 --- a/uax29/Legacy/Tokenizer.Words.cs +++ b/uax29/Legacy/Tokenizer.Words.cs @@ -9,6 +9,7 @@ public static partial class Tokenizer /// /// An enumerator of words. Use foreach (var word in words). /// + [Obsolete("Use Split.Words(input) or input.SplitWords()")] public static SplitEnumerator GetWords(Span input, Options options = Options.None) => new(input, Words.SplitBytes, options); /// @@ -18,6 +19,7 @@ public static partial class Tokenizer /// /// An enumerator of words. Use foreach (var word in words). 
/// + [Obsolete("Use Split.Words(input) or input.SplitWords()")] public static SplitEnumerator GetWords(ReadOnlySpan input, Options options = Options.None) => new(input, Words.SplitBytes, options); /// @@ -27,6 +29,7 @@ public static partial class Tokenizer /// /// An enumerator of words. Use foreach (var word in words). /// + [Obsolete("Use Split.Words(input) or input.SplitWords()")] public static SplitEnumerator GetWords(Memory input, Options options = Options.None) => new(input.Span, Words.SplitBytes, options); /// @@ -36,6 +39,7 @@ public static partial class Tokenizer /// /// An enumerator of words. Use foreach (var word in words). /// + [Obsolete("Use Split.Words(input) or input.SplitWords()")] public static SplitEnumerator GetWords(ReadOnlyMemory input, Options options = Options.None) => new(input.Span, Words.SplitBytes, options); /// @@ -45,6 +49,7 @@ public static partial class Tokenizer /// /// An enumerator of words. Use foreach (var word in words). /// + [Obsolete("Use Split.Words(input) or input.SplitWords()")] public static SplitEnumerator GetWords(byte[] input, Options options = Options.None) => new(input.AsSpan(), Words.SplitBytes, options); /// @@ -54,6 +59,7 @@ public static partial class Tokenizer /// /// An enumerator of words. Use foreach (var word in words). /// + [Obsolete("Use Split.Words(input) or input.SplitWords()")] public static SplitEnumerator GetWords(string input, Options options = Options.None) => new(input.AsSpan(), Words.SplitChars, options); /// @@ -63,6 +69,7 @@ public static partial class Tokenizer /// /// An enumerator of words. Use foreach (var word in words). /// + [Obsolete("Use Split.Words(input) or input.SplitWords()")] public static SplitEnumerator GetWords(char[] input, Options options = Options.None) => new(input.AsSpan(), Words.SplitChars, options); /// @@ -72,7 +79,7 @@ public static partial class Tokenizer /// /// An enumerator of words. Use foreach (var word in words). 
/// - /// + [Obsolete("Use Split.Words(input) or input.SplitWords()")] public static SplitEnumerator GetWords(Span input, Options options = Options.None) => new(input, Words.SplitChars, options); /// @@ -82,6 +89,7 @@ public static partial class Tokenizer /// /// An enumerator of words. Use foreach (var word in words). /// + [Obsolete("Use Split.Words(input) or input.SplitWords()")] public static SplitEnumerator GetWords(ReadOnlySpan input, Options options = Options.None) => new(input, Words.SplitChars, options); /// @@ -91,6 +99,7 @@ public static partial class Tokenizer /// /// An enumerator of words. Use foreach (var word in words). /// + [Obsolete("Use Split.Words(input) or input.SplitWords()")] public static SplitEnumerator GetWords(Memory input, Options options = Options.None) => new(input.Span, Words.SplitChars, options); /// @@ -100,6 +109,7 @@ public static partial class Tokenizer /// /// An enumerator of words. Use foreach (var word in words). /// + [Obsolete("Use Split.Words(input) or input.SplitWords()")] public static SplitEnumerator GetWords(ReadOnlyMemory input, Options options = Options.None) => new(input.Span, Words.SplitChars, options); /// @@ -125,6 +135,7 @@ public static partial class Tokenizer /// /// An enumerator of words. Use foreach (var word in words). /// + [Obsolete("Use Split.Words(stream) or stream.SplitWords()")] public static StreamEnumerator GetWords(Stream stream, Options options = Options.None, int minBufferBytes = 1024, byte[]? bufferStorage = null) { bufferStorage ??= new byte[minBufferBytes * 2]; @@ -155,6 +166,7 @@ public static StreamEnumerator GetWords(Stream stream, Options options = O /// /// An enumerator of words. Use foreach (var word in words). /// + [Obsolete("Use Split.Words(stream) or stream.SplitWords()")] public static StreamEnumerator GetWords(TextReader stream, Options options = Options.None, int minBufferChars = 1024, char[]? 
bufferStorage = null) { bufferStorage ??= new char[minBufferChars * 2]; diff --git a/uax29/README.md b/uax29/README.md index 0a1a004..fc6f075 100644 --- a/uax29/README.md +++ b/uax29/README.md @@ -118,14 +118,24 @@ The tokenizer expects valid (decodable) UTF-8 bytes or UTF-16 chars as input. We ### Major version changes -If you are using v1.x of this package, v2 has been renamed: +#### v2 → v3 + +Renamed methods: + +`Tokenizer.GetWords(input)` → `Split.Words(input)` + +or + +`Tokenizer.GetWords(input)` → `input.SplitWords()` + +#### v1 → v2 + +Renamed package, namespace and methods: `dotnet add package uax29.net` → `dotnet add package UAX29` `using uax29` → `using UAX29` -We renamed the methods: - `Tokenizer.Create(input)` → `Tokenizer.GetWords(input)` `Tokenizer.Create(input, TokenType.Graphemes)` → `Tokenizer.GetGraphemes(input)` diff --git a/uax29/RangeEnumerator.Test.cs b/uax29/RangeEnumerator.Test.cs index e3d848c..1032ed6 100644 --- a/uax29/RangeEnumerator.Test.cs +++ b/uax29/RangeEnumerator.Test.cs @@ -18,7 +18,7 @@ public void Reset() var example = "Hello, how are you?"; var bytes = Encoding.UTF8.GetBytes(example); - var words = Tokenizer.GetWords(example); + var words = Split.Words(example); var ranges = words.Ranges; var first = new List(); @@ -49,7 +49,7 @@ public void MatchesTokenizer() foreach (var option in options) { - var tokens = Tokenizer.GetWords(example, option); + var tokens = Split.Words(example, option); var ranges = tokens.Ranges; foreach (var range in ranges) @@ -68,9 +68,9 @@ public void Enumerator() { var input = "Hello, how are you?"; var mem = input.AsMemory(); - Tokenizer.GetWords(mem); + Split.Words(mem); - var words = Tokenizer.GetWords(input); + var words = Split.Words(input); var ranges = words.Ranges; var first = new List(); @@ -81,7 +81,7 @@ public void Enumerator() Assert.That(first, Has.Count.GreaterThan(1)); // just make sure it did the thing - var tokens2 = Tokenizer.GetWords(input); + var tokens2 = Split.Words(input); var 
ranges2 = words.Ranges; var second = new List(); @@ -96,7 +96,7 @@ public void Enumerator() public void ToList() { var example = "abcdefghijk lmnopq r stu vwxyz; ABC DEFG HIJKL MNOP Q RSTUV WXYZ! 你好,世界."; - var words = Tokenizer.GetWords(example); + var words = Split.Words(example); var ranges = words.Ranges; var list = ranges.ToList(); @@ -126,7 +126,7 @@ public void ToList() public void ToArray() { var example = "abcdefghijk lmnopq r stu vwxyz; ABC DEFG HIJKL MNOP Q RSTUV WXYZ! 你好,世界."; - var words = Tokenizer.GetWords(example); + var words = Split.Words(example); var ranges = words.Ranges; var array = ranges.ToArray(); diff --git a/uax29/Sentences.Test.cs b/uax29/Sentences.Test.cs index 65313ce..dca41b8 100644 --- a/uax29/Sentences.Test.cs +++ b/uax29/Sentences.Test.cs @@ -1,13 +1,13 @@ // generated from https://www.unicode.org/Public/15.0.0/ucd/auxiliary/SentenceBreakTest.txt -namespace Tests; - -using System.Text; -using UAX29; - -[TestFixture] -public class SentencesTests -{ - internal readonly static UnicodeTest[] UnicodeTests = [ +namespace Tests; + +using System.Text; +using UAX29; + +[TestFixture] +public class SentencesTests +{ + internal readonly static UnicodeTest[] UnicodeTests = [ new([0x0001, 0x0001], [[0x0001, 0x0001]], "÷ [0.2] (Other) × [998.0] (Other) ÷ [0.3]"), new([0x0001, 0x00CC, 0x0088, 0x0001], [[0x0001, 0x00CC, 0x0088, 0x0001]], "÷ [0.2] (Other) × [5.0] COMBINING DIAERESIS (Extend_FE) × [998.0] (Other) ÷ [0.3]"), new([0x0001, 0x000D], [[0x0001, 0x000D]], "÷ [0.2] (Other) × [998.0] (CR) ÷ [0.3]"), @@ -510,40 +510,40 @@ public class SentencesTests new([0x00E2, 0x0081, 0x00A0, 0x0065, 0x00E2, 0x0081, 0x00A0, 0x0074, 0x00E2, 0x0081, 0x00A0, 0x0063, 0x00E2, 0x0081, 0x00A0, 0x002E, 0x00E2, 0x0081, 0x00A0, 0x00E3, 0x0080, 0x0082, 0x00E2, 0x0081, 0x00A0, 0x00E2, 0x0081, 0x00A0], [[0x00E2, 0x0081, 0x00A0, 0x0065, 0x00E2, 0x0081, 0x00A0, 0x0074, 0x00E2, 0x0081, 0x00A0, 0x0063, 0x00E2, 0x0081, 0x00A0, 0x002E, 0x00E2, 0x0081, 0x00A0, 0x00E3, 0x0080, 
0x0082, 0x00E2, 0x0081, 0x00A0, 0x00E2, 0x0081, 0x00A0]], "÷ [0.2] WORD JOINER (Format_FE) × [998.0] LATIN SMALL LETTER E (Lower) × [5.0] WORD JOINER (Format_FE) × [998.0] LATIN SMALL LETTER T (Lower) × [5.0] WORD JOINER (Format_FE) × [998.0] LATIN SMALL LETTER C (Lower) × [5.0] WORD JOINER (Format_FE) × [998.0] FULL STOP (ATerm) × [5.0] WORD JOINER (Format_FE) × [8.1] IDEOGRAPHIC FULL STOP (STerm) × [5.0] WORD JOINER (Format_FE) × [5.0] WORD JOINER (Format_FE) ÷ [0.3]"), new([0x00E2, 0x0081, 0x00A0, 0x00E5, 0x00AD, 0x0097, 0x00E2, 0x0081, 0x00A0, 0x00E3, 0x0080, 0x0082, 0x00E2, 0x0081, 0x00A0, 0x00E5, 0x00AE, 0x0083, 0x00E2, 0x0081, 0x00A0, 0x00E2, 0x0081, 0x00A0], [[0x00E2, 0x0081, 0x00A0, 0x00E5, 0x00AD, 0x0097, 0x00E2, 0x0081, 0x00A0, 0x00E3, 0x0080, 0x0082, 0x00E2, 0x0081, 0x00A0], [0x00E5, 0x00AE, 0x0083, 0x00E2, 0x0081, 0x00A0, 0x00E2, 0x0081, 0x00A0]], "÷ [0.2] WORD JOINER (Format_FE) × [998.0] CJK UNIFIED IDEOGRAPH-5B57 (OLetter) × [5.0] WORD JOINER (Format_FE) × [998.0] IDEOGRAPHIC FULL STOP (STerm) × [5.0] WORD JOINER (Format_FE) ÷ [11.0] CJK UNIFIED IDEOGRAPH-5B83 (OLetter) × [5.0] WORD JOINER (Format_FE) × [5.0] WORD JOINER (Format_FE) ÷ [0.3]"), new([0x00E2, 0x0081, 0x00A0, 0x0021, 0x00E2, 0x0081, 0x00A0, 0x0020, 0x00E2, 0x0081, 0x00A0, 0x0020, 0x00E2, 0x0081, 0x00A0, 0x00E2, 0x0081, 0x00A0], [[0x00E2, 0x0081, 0x00A0, 0x0021, 0x00E2, 0x0081, 0x00A0, 0x0020, 0x00E2, 0x0081, 0x00A0, 0x0020, 0x00E2, 0x0081, 0x00A0, 0x00E2, 0x0081, 0x00A0]], "÷ [0.2] WORD JOINER (Format_FE) × [998.0] EXCLAMATION MARK (STerm) × [5.0] WORD JOINER (Format_FE) × [9.0] SPACE (Sp) × [5.0] WORD JOINER (Format_FE) × [10.0] SPACE (Sp) × [5.0] WORD JOINER (Format_FE) × [5.0] WORD JOINER (Format_FE) ÷ [0.3]"), - - ]; - - static readonly UnicodeTest[] Tests = UnicodeTests; - - [Test, TestCaseSource(nameof(Tests))] - public void Bytes(UnicodeTest test) - { - var tokens = Tokenizer.GetSentences(test.input); - TestUnicode.TestTokenizerBytes(tokens, test); - } - - [Test, 
TestCaseSource(nameof(Tests))] - public void String(UnicodeTest test) - { - var s = Encoding.UTF8.GetString(test.input); - var tokens = Tokenizer.GetSentences(s); - TestUnicode.TestTokenizerChars(tokens, test); - } - - [Test, TestCaseSource(nameof(Tests))] - public void Stream(UnicodeTest test) - { - using var stream = new MemoryStream(test.input); - var tokens = Tokenizer.GetSentences(stream); - TestUnicode.TestTokenizerStream(tokens, test); - } - - [Test, TestCaseSource(nameof(Tests))] - public void TextReader(UnicodeTest test) - { - using var stream = new MemoryStream(test.input); - using var reader = new StreamReader(stream); - var tokens = Tokenizer.GetSentences(reader); - TestUnicode.TestTokenizerTextReader(tokens, test); - } -} + + ]; + + static readonly UnicodeTest[] Tests = UnicodeTests; + + [Test, TestCaseSource(nameof(Tests))] + public void Bytes(UnicodeTest test) + { + var tokens = Split.Sentences(test.input); + TestUnicode.TestTokenizerBytes(tokens, test); + } + + [Test, TestCaseSource(nameof(Tests))] + public void String(UnicodeTest test) + { + var s = Encoding.UTF8.GetString(test.input); + var tokens = Split.Sentences(s); + TestUnicode.TestTokenizerChars(tokens, test); + } + + [Test, TestCaseSource(nameof(Tests))] + public void Stream(UnicodeTest test) + { + using var stream = new MemoryStream(test.input); + var tokens = Split.Sentences(stream); + TestUnicode.TestTokenizerStream(tokens, test); + } + + [Test, TestCaseSource(nameof(Tests))] + public void TextReader(UnicodeTest test) + { + using var stream = new MemoryStream(test.input); + using var reader = new StreamReader(stream); + var tokens = Split.Sentences(reader); + TestUnicode.TestTokenizerTextReader(tokens, test); + } +} diff --git a/uax29/Split.Graphemes.cs b/uax29/Split.Graphemes.cs new file mode 100644 index 0000000..61115fb --- /dev/null +++ b/uax29/Split.Graphemes.cs @@ -0,0 +1,164 @@ +namespace UAX29; + +public static partial class Split +{ + /// + /// Split the graphemes in the given 
of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static SplitEnumerator Graphemes(Span input) => new(input, UAX29.Graphemes.SplitBytes); + + /// + /// Split the graphemes in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static SplitEnumerator Graphemes(ReadOnlySpan input) => new(input, UAX29.Graphemes.SplitBytes); + + /// + /// Split the graphemes in the given of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static SplitEnumerator Graphemes(Memory input) => new(input.Span, UAX29.Graphemes.SplitBytes); + + /// + /// Split the graphemes in the given of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static SplitEnumerator Graphemes(ReadOnlyMemory input) => new(input.Span, UAX29.Graphemes.SplitBytes); + + /// + /// Split the graphemes in the given array of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static SplitEnumerator Graphemes(byte[] input) => new(input.AsSpan(), UAX29.Graphemes.SplitBytes); + + /// + /// Split the graphemes in the given string. + /// + /// The string to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static SplitEnumerator Graphemes(string input) => new(input.AsSpan(), UAX29.Graphemes.SplitChars); + + /// + /// Split the graphemes in the given string. 
+ /// + /// The string to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static SplitEnumerator Graphemes(char[] input) => new(input.AsSpan(), UAX29.Graphemes.SplitChars); + + /// + /// Split the graphemes in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + /// + public static SplitEnumerator Graphemes(Span input) => new(input, UAX29.Graphemes.SplitChars); + + /// + /// Split the graphemes in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static SplitEnumerator Graphemes(ReadOnlySpan input) => new(input, UAX29.Graphemes.SplitChars); + + /// + /// Split the graphemes in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static SplitEnumerator Graphemes(Memory input) => new(input.Span, UAX29.Graphemes.SplitChars); + + /// + /// Split the graphemes in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static SplitEnumerator Graphemes(ReadOnlyMemory input) => new(input.Span, UAX29.Graphemes.SplitChars); + + /// + /// Split the graphemes in the given of UTF-8 encoded bytes. + /// + /// The stream of UTF-8 bytes to tokenize. + /// + /// Optional, the minimum bytes to buffer from the Stream. This determines the maximum grapheme token size. Tokens that exceed the bytes in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 256 bytes. + /// + /// + /// Optional, a byte array for underlying buffer storage. It must be at least as large at minBufferBytes. + /// + /// If not provided, storage of 2 * minBufferBytes will be allocated by default. 
+ /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferBytes allows fewer, larger reads the stream, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static StreamEnumerator Graphemes(Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) + { + bufferStorage ??= new byte[minBufferBytes * 2]; + var buffer = new Buffer(stream.Read, minBufferBytes, bufferStorage); + return new StreamEnumerator(buffer, UAX29.Graphemes.SplitBytes); + } + + /// + /// Split the graphemes in the given / . + /// + /// The stream/text reader of char to tokenize. + /// + /// Optional, the minimum chars to buffer from the reader. This determines the maximum grapheme token size. Tokens that exceed the chars in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 256 chars. + /// + /// + /// Optional, a char array for underlying buffer storage. It must be at least as large at minBufferChars. + /// + /// If not provided, storage of 2 * minBufferChars will be allocated by default. + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferChars allows fewer, larger reads the reader, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). + /// + public static StreamEnumerator Graphemes(TextReader stream, int minBufferChars = 1024, char[]? 
bufferStorage = null) + { + bufferStorage ??= new char[minBufferChars * 2]; + var buffer = new Buffer(stream.Read, minBufferChars, bufferStorage); + return new StreamEnumerator(buffer, UAX29.Graphemes.SplitChars); + } +} diff --git a/uax29/Split.Sentences.cs b/uax29/Split.Sentences.cs new file mode 100644 index 0000000..5e5f544 --- /dev/null +++ b/uax29/Split.Sentences.cs @@ -0,0 +1,166 @@ +namespace UAX29; + +public static partial class Split +{ + /// + /// Split the sentences in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of sentences. Use foreach (var sentence in sentences). + /// + public static SplitEnumerator Sentences(Span input) => new(input, UAX29.Sentences.SplitBytes); + + /// + /// Split the sentences in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of sentences. Use foreach (var sentence in sentences). + /// + public static SplitEnumerator Sentences(ReadOnlySpan input) => new(input, UAX29.Sentences.SplitBytes); + + /// + /// Split the sentences in the given of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of sentences. Use foreach (var sentence in sentences). + /// + public static SplitEnumerator Sentences(Memory input) => new(input.Span, UAX29.Sentences.SplitBytes); + + /// + /// Split the sentences in the given of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of sentences. Use foreach (var sentence in sentences). + /// + public static SplitEnumerator Sentences(ReadOnlyMemory input) => new(input.Span, UAX29.Sentences.SplitBytes); + + /// + /// Split the sentences in the given array of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of sentences. 
Use foreach (var sentence in sentences). + /// + public static SplitEnumerator Sentences(byte[] input) => new(input.AsSpan(), UAX29.Sentences.SplitBytes); + + /// + /// Split the sentences in the given string. + /// + /// The string to tokenize. + /// + /// An enumerator of sentences. Use foreach (var sentence in sentences). + /// + public static SplitEnumerator Sentences(string input) => new(input.AsSpan(), UAX29.Sentences.SplitChars); + + /// + /// Split the sentences in the given string. + /// + /// The string to tokenize. + /// + /// An enumerator of sentences. Use foreach (var sentence in sentences). + /// + public static SplitEnumerator Sentences(char[] input) => new(input.AsSpan(), UAX29.Sentences.SplitChars); + + /// + /// Split the sentences in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of sentences. Use foreach (var sentence in sentences). + /// + /// + public static SplitEnumerator Sentences(Span input) => new(input, UAX29.Sentences.SplitChars); + + /// + /// Split the sentences in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of sentences. Use foreach (var sentence in sentences). + /// + public static SplitEnumerator Sentences(ReadOnlySpan input) => new(input, UAX29.Sentences.SplitChars); + + /// + /// Split the sentences in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of sentences. Use foreach (var sentence in sentences). + /// + public static SplitEnumerator Sentences(Memory input) => new(input.Span, UAX29.Sentences.SplitChars); + + /// + /// Split the sentences in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of sentences. Use foreach (var sentence in sentences). + /// + public static SplitEnumerator Sentences(ReadOnlyMemory input) => new(input.Span, UAX29.Sentences.SplitChars); + + + + /// + /// Split the sentences in the given of UTF-8 encoded bytes. + /// + /// The stream of UTF-8 bytes to tokenize. 
+ /// + /// Optional, the minimum bytes to buffer from the Stream. This determines the maximum sentence token size. Tokens that exceed the bytes in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 1024 bytes. + /// + /// + /// Optional, a byte array for underlying buffer storage. It must be at least as large at minBufferBytes. + /// + /// If not provided, storage of 2 * minBufferBytes will be allocated by default. + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferBytes allows fewer, larger reads the stream, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of sentences. Use foreach (var sentence in sentences). + /// + public static StreamEnumerator Sentences(Stream stream, int minBufferBytes = 1024, byte[]? bufferStorage = null) + { + bufferStorage ??= new byte[minBufferBytes * 2]; + var buffer = new Buffer(stream.Read, minBufferBytes, bufferStorage); + return new StreamEnumerator(buffer, UAX29.Sentences.SplitBytes); + } + + /// + /// Split the sentences in the given / . + /// + /// The stream/text reader of char to tokenize. + /// + /// Optional, the minimum chars to buffer from the reader. This determines the maximum sentence token size. Tokens that exceed the chars in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 1024 chars. + /// + /// + /// Optional, a char array for underlying buffer storage. It must be at least as large at minBufferChars. + /// + /// If not provided, storage of 2 * minBufferChars will be allocated by default. + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferChars allows fewer, larger reads the reader, + /// which is more efficient, but will increase memory usage. 
+ /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of sentences. Use foreach (var sentence in sentences). + /// + public static StreamEnumerator Sentences(TextReader stream, int minBufferChars = 1024, char[]? bufferStorage = null) + { + bufferStorage ??= new char[minBufferChars * 2]; + var buffer = new Buffer(stream.Read, minBufferChars, bufferStorage); + return new StreamEnumerator(buffer, UAX29.Sentences.SplitChars); + } +} diff --git a/uax29/Split.Words.cs b/uax29/Split.Words.cs new file mode 100644 index 0000000..4eb5df1 --- /dev/null +++ b/uax29/Split.Words.cs @@ -0,0 +1,164 @@ +namespace UAX29; + +public static partial class Split +{ + /// + /// Split the words in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static SplitEnumerator Words(Span input, Options options = Options.None) => new(input, UAX29.Words.SplitBytes, options); + + /// + /// Split the words in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static SplitEnumerator Words(ReadOnlySpan input, Options options = Options.None) => new(input, UAX29.Words.SplitBytes, options); + + /// + /// Split the words in the given of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static SplitEnumerator Words(Memory input, Options options = Options.None) => new(input.Span, UAX29.Words.SplitBytes, options); + + /// + /// Split the words in the given of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). 
+ /// + public static SplitEnumerator Words(ReadOnlyMemory input, Options options = Options.None) => new(input.Span, UAX29.Words.SplitBytes, options); + + /// + /// Split the words in the given array of UTF-8 encoded bytes. + /// + /// The UTF-8 bytes to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static SplitEnumerator Words(byte[] input, Options options = Options.None) => new(input.AsSpan(), UAX29.Words.SplitBytes, options); + + /// + /// Split the words in the given string. + /// + /// The string to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static SplitEnumerator Words(string input, Options options = Options.None) => new(input.AsSpan(), UAX29.Words.SplitChars, options); + + /// + /// Split the words in the given string. + /// + /// The string to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static SplitEnumerator Words(char[] input, Options options = Options.None) => new(input.AsSpan(), UAX29.Words.SplitChars, options); + + /// + /// Split the words in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + /// + public static SplitEnumerator Words(Span input, Options options = Options.None) => new(input, UAX29.Words.SplitChars, options); + + /// + /// Split the words in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static SplitEnumerator Words(ReadOnlySpan input, Options options = Options.None) => new(input, UAX29.Words.SplitChars, options); + + /// + /// Split the words in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). 
+ /// + public static SplitEnumerator Words(Memory input, Options options = Options.None) => new(input.Span, UAX29.Words.SplitChars, options); + + /// + /// Split the words in the given of . + /// + /// The chars to tokenize. + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static SplitEnumerator Words(ReadOnlyMemory input, Options options = Options.None) => new(input.Span, UAX29.Words.SplitChars, options); + + /// + /// Split the words in the given of UTF-8 encoded bytes. + /// + /// The stream of UTF-8 bytes to tokenize. + /// + /// Optional, the minimum bytes to buffer from the Stream. This determines the maximum word token size. Tokens that exceed the bytes in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 1024 bytes. The tokenizer is intended for natural language, so we don't expect you'll find text with a word beyond a couple of dozen bytes. + /// + /// + /// Optional, a byte array for underlying buffer storage. It must be at least as large at minBufferBytes. + /// + /// If not provided, storage of 2 * minBufferBytes will be allocated by default. + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferBytes allows fewer, larger reads the stream, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static StreamEnumerator Words(Stream stream, Options options = Options.None, int minBufferBytes = 1024, byte[]? bufferStorage = null) + { + bufferStorage ??= new byte[minBufferBytes * 2]; + var buffer = new Buffer(stream.Read, minBufferBytes, bufferStorage); + return new StreamEnumerator(buffer, UAX29.Words.SplitBytes, options); + } + + /// + /// Split the words in the given / . + /// + /// The stream/text reader of char to tokenize. 
+ /// + /// Optional, the minimum chars to buffer from the reader. This determines the maximum word token size. Tokens that exceed the chars in the buffer + /// will simply be cut off at this length, no error will occur. + /// + /// Default is 1024 chars. The tokenizer is intended for natural language, so we don't expect you'll find text with a word beyond a few dozen chars. + /// + /// + /// Optional, a char array for underlying buffer storage. It must be at least as large at minBufferChars. + /// + /// If not provided, storage of 2 * minBufferChars will be allocated by default. + /// + /// This parameter is a choice about performance and memory usage. A buffer larger than minBufferChars allows fewer, larger reads the reader, + /// which is more efficient, but will increase memory usage. + /// + /// You might also wish to use ArrayPool to reuse the storage and minimize allocations. + /// + /// + /// An enumerator of words. Use foreach (var word in words). + /// + public static StreamEnumerator Words(TextReader stream, Options options = Options.None, int minBufferChars = 1024, char[]? 
bufferStorage = null) + { + bufferStorage ??= new char[minBufferChars * 2]; + var buffer = new Buffer(stream.Read, minBufferChars, bufferStorage); + return new StreamEnumerator(buffer, UAX29.Words.SplitChars, options); + } +} diff --git a/uax29/SplitEnumerator.Test.cs b/uax29/SplitEnumerator.Test.cs index 3533ea1..d8a0228 100644 --- a/uax29/SplitEnumerator.Test.cs +++ b/uax29/SplitEnumerator.Test.cs @@ -18,7 +18,7 @@ public void Reset() var example = "Hello, how are you?"; var bytes = Encoding.UTF8.GetBytes(example); - var tokens = Tokenizer.GetWords(example); + var tokens = Split.Words(example); var first = new List(); foreach (var token in tokens) @@ -44,7 +44,7 @@ public void SetText() { var example = "Hello, how are you?"; - var tokens = Tokenizer.GetWords(example); + var tokens = Split.Words(example); var first = new List(); foreach (var token in tokens) @@ -107,134 +107,134 @@ public void Overloads() { // chars - Tokenizer.GetWords(input); got++; + Split.Words(input); got++; var array = input.ToCharArray(); - Tokenizer.GetWords(array); got++; + Split.Words(array); got++; var span = new Span(array); - Tokenizer.GetWords(span); got++; + Split.Words(span); got++; ReadOnlySpan rspan = input.AsSpan(); - Tokenizer.GetWords(rspan); got++; + Split.Words(rspan); got++; var mem = new Memory(array); - Tokenizer.GetWords(mem); got++; + Split.Words(mem); got++; ReadOnlyMemory rmem = input.AsMemory(); - Tokenizer.GetWords(rmem); got++; + Split.Words(rmem); got++; - Tokenizer.GetWords(reader); got++; + Split.Words(reader); got++; } { // chars - Tokenizer.GetGraphemes(input); got++; + Split.Graphemes(input); got++; var array = input.ToCharArray(); - Tokenizer.GetGraphemes(array); got++; + Split.Graphemes(array); got++; var span = new Span(array); - Tokenizer.GetGraphemes(span); got++; + Split.Graphemes(span); got++; ReadOnlySpan rspan = input.AsSpan(); - Tokenizer.GetGraphemes(rspan); got++; + Split.Graphemes(rspan); got++; var mem = new Memory(array); - 
Tokenizer.GetGraphemes(mem); got++; + Split.Graphemes(mem); got++; ReadOnlyMemory rmem = input.AsMemory(); - Tokenizer.GetGraphemes(rmem); got++; + Split.Graphemes(rmem); got++; - Tokenizer.GetGraphemes(reader); got++; + Split.Graphemes(reader); got++; } { // chars - Tokenizer.GetSentences(input); got++; + Split.Sentences(input); got++; var array = input.ToCharArray(); - Tokenizer.GetSentences(array); got++; + Split.Sentences(array); got++; var span = new Span(array); - Tokenizer.GetSentences(span); got++; + Split.Sentences(span); got++; ReadOnlySpan rspan = input.AsSpan(); - Tokenizer.GetSentences(rspan); got++; + Split.Sentences(rspan); got++; var mem = new Memory(array); - Tokenizer.GetSentences(mem); got++; + Split.Sentences(mem); got++; ReadOnlyMemory rmem = input.AsMemory(); - Tokenizer.GetSentences(rmem); got++; + Split.Sentences(rmem); got++; - Tokenizer.GetSentences(reader); got++; + Split.Sentences(reader); got++; } { // bytes - Tokenizer.GetWords(bytes); got++; + Split.Words(bytes); got++; Span span = bytes.AsSpan(); - Tokenizer.GetWords(span); got++; + Split.Words(span); got++; ReadOnlySpan rspan = bytes.AsSpan(); - Tokenizer.GetWords(rspan); got++; + Split.Words(rspan); got++; Memory mem = bytes.AsMemory(); - Tokenizer.GetWords(mem); got++; + Split.Words(mem); got++; ReadOnlyMemory rmem = bytes.AsMemory(); - Tokenizer.GetWords(rmem); got++; + Split.Words(rmem); got++; - Tokenizer.GetWords(stream); got++; + Split.Words(stream); got++; } { // bytes - Tokenizer.GetGraphemes(bytes); got++; + Split.Graphemes(bytes); got++; Span span = bytes.AsSpan(); - Tokenizer.GetGraphemes(span); got++; + Split.Graphemes(span); got++; ReadOnlySpan rspan = bytes.AsSpan(); - Tokenizer.GetGraphemes(rspan); got++; + Split.Graphemes(rspan); got++; Memory mem = bytes.AsMemory(); - Tokenizer.GetGraphemes(mem); got++; + Split.Graphemes(mem); got++; ReadOnlyMemory rmem = bytes.AsMemory(); - Tokenizer.GetGraphemes(rmem); got++; + Split.Graphemes(rmem); got++; - 
Tokenizer.GetGraphemes(stream); got++; + Split.Graphemes(stream); got++; } { // bytes - Tokenizer.GetSentences(bytes); got++; + Split.Sentences(bytes); got++; Span span = bytes.AsSpan(); - Tokenizer.GetSentences(span); got++; + Split.Sentences(span); got++; ReadOnlySpan rspan = bytes.AsSpan(); - Tokenizer.GetSentences(rspan); got++; + Split.Sentences(rspan); got++; Memory mem = bytes.AsMemory(); - Tokenizer.GetSentences(mem); got++; + Split.Sentences(mem); got++; ReadOnlyMemory rmem = bytes.AsMemory(); - Tokenizer.GetSentences(rmem); got++; + Split.Sentences(rmem); got++; - Tokenizer.GetSentences(stream); got++; + Split.Sentences(stream); got++; } Assert.That(got, Is.EqualTo(expected)); @@ -246,9 +246,9 @@ public void Enumerator() var input = "Hello, how are you?"; var mem = input.AsMemory(); var bytes = Encoding.UTF8.GetBytes(input); - Tokenizer.GetWords(mem); + Split.Words(mem); - var tokens = Tokenizer.GetWords(input); + var tokens = Split.Words(input); var first = new List(); while (tokens.MoveNext()) { @@ -257,7 +257,7 @@ public void Enumerator() } Assert.That(first, Has.Count.GreaterThan(1)); // just make sure it did the thing - var tokens2 = Tokenizer.GetWords(input); + var tokens2 = Split.Words(input); var second = new List(); foreach (var token in tokens2) { @@ -271,7 +271,7 @@ public void Enumerator() public void ToList() { var example = "abcdefghijk lmnopq r stu vwxyz; ABC DEFG HIJKL MNOP Q RSTUV WXYZ! 你好,世界."; - var tokens = Tokenizer.GetWords(example); + var tokens = Split.Words(example); var list = tokens.ToList(); var i = 0; @@ -304,7 +304,7 @@ public void ToList() public void ToArray() { var example = "abcdefghijk lmnopq r stu vwxyz; ABC DEFG HIJKL MNOP Q RSTUV WXYZ! 
你好,世界."; - var tokens = Tokenizer.GetWords(example); + var tokens = Split.Words(example); var array = tokens.ToArray(); var i = 0; @@ -339,7 +339,7 @@ public void Position() var example = "Hello, how are you?"; { - var tokens = Tokenizer.GetWords(example); + var tokens = Split.Words(example); tokens.MoveNext(); Assert.That(tokens.Position, Is.EqualTo(0)); tokens.MoveNext(); @@ -358,7 +358,7 @@ public void Position() var bytes = Encoding.UTF8.GetBytes(example); { - var tokens = Tokenizer.GetWords(bytes); + var tokens = Split.Words(bytes); tokens.MoveNext(); Assert.That(tokens.Position, Is.EqualTo(0)); tokens.MoveNext(); @@ -387,7 +387,7 @@ public void OmitWhitespace() // Options.None should be lossless var expected = example; var got = string.Concat( - Tokenizer.GetWords(example, Options.None) + Split.Words(example, Options.None) .ToList() .SelectMany(c => c) ); @@ -399,7 +399,7 @@ public void OmitWhitespace() // Options.OmitWhitespace should have no whitespace var expected = new string(example.Where(c => !char.IsWhiteSpace(c)).ToArray()); var got = string.Concat( - Tokenizer.GetWords(example, Options.OmitWhitespace) + Split.Words(example, Options.OmitWhitespace) .ToList() .SelectMany(c => c) ); diff --git a/uax29/StreamEnumerator.Test.cs b/uax29/StreamEnumerator.Test.cs index ef8d2b2..f723340 100644 --- a/uax29/StreamEnumerator.Test.cs +++ b/uax29/StreamEnumerator.Test.cs @@ -31,10 +31,10 @@ public void StreamMatchesStatic() foreach (var input in examples) { var bytes = Encoding.UTF8.GetBytes(input); - var staticTokens = Tokenizer.GetWords(bytes, Options.OmitWhitespace); + var staticTokens = Split.Words(bytes, Options.OmitWhitespace); using var stream = new MemoryStream(bytes); - var streamTokens = Tokenizer.GetWords(stream, Options.OmitWhitespace); + var streamTokens = Split.Words(stream, Options.OmitWhitespace); foreach (var streamToken in streamTokens) { @@ -70,11 +70,11 @@ public void StreamReaderMatchesStatic() foreach (var input in examples) { var bytes = 
Encoding.UTF8.GetBytes(input); - var staticTokens = Tokenizer.GetWords(bytes, option); + var staticTokens = Split.Words(bytes, option); using var stream = new MemoryStream(bytes); using var reader = new StreamReader(stream); - var streamTokens = Tokenizer.GetWords(reader, option); + var streamTokens = Split.Words(reader, option); foreach (var streamToken in streamTokens) { @@ -97,7 +97,7 @@ public void SetStream() var bytes = Encoding.UTF8.GetBytes(input); using var stream = new MemoryStream(bytes); - var tokens = Tokenizer.GetWords(stream); + var tokens = Split.Words(stream); var first = new List(); foreach (var token in tokens) @@ -130,7 +130,7 @@ public void SetStreamReader() using var stream = new MemoryStream(bytes); using var reader = new StreamReader(stream); - var tokens = Tokenizer.GetWords(reader); + var tokens = Split.Words(reader); var first = new List(); foreach (var token in tokens) @@ -163,7 +163,7 @@ public void StreamEnumerator() var bytes = Encoding.UTF8.GetBytes(input); using var stream = new MemoryStream(bytes); - var tokens = Tokenizer.GetWords(stream); + var tokens = Split.Words(stream); var first = new List(); while (tokens.MoveNext()) @@ -175,7 +175,7 @@ public void StreamEnumerator() Assert.That(first, Has.Count.GreaterThan(1)); // just make sure it did the thing using var stream2 = new MemoryStream(bytes); - var tokens2 = Tokenizer.GetWords(stream2); + var tokens2 = Split.Words(stream2); var second = new List(); foreach (var token in tokens2) @@ -194,10 +194,10 @@ public void StreamToList() var bytes = Encoding.UTF8.GetBytes(example); using var stream = new MemoryStream(bytes); - var list = Tokenizer.GetWords(stream).ToList(); + var list = Split.Words(stream).ToList(); stream.Seek(0, SeekOrigin.Begin); - var tokens = Tokenizer.GetWords(stream); + var tokens = Split.Words(stream); var i = 0; foreach (var token in tokens) @@ -228,10 +228,10 @@ public void StreamToArray() var bytes = Encoding.UTF8.GetBytes(example); using var stream = new 
MemoryStream(bytes); - var list = Tokenizer.GetWords(stream).ToList(); + var list = Split.Words(stream).ToList(); stream.Seek(0, SeekOrigin.Begin); - var tokens = Tokenizer.GetWords(stream); + var tokens = Split.Words(stream); var i = 0; foreach (var token in tokens) @@ -263,7 +263,7 @@ public void Position() { using var stream = new MemoryStream(bytes); - var tokens = Tokenizer.GetWords(stream, minBufferBytes: 8); + var tokens = Split.Words(stream, minBufferBytes: 8); tokens.MoveNext(); Assert.That(tokens.Position, Is.EqualTo(0)); // ab... tokens.MoveNext(); @@ -282,7 +282,7 @@ public void Position() { using var stream = new MemoryStream(bytes); - var tokens = Tokenizer.GetWords(stream, minBufferBytes: 8, options: Options.OmitWhitespace); + var tokens = Split.Words(stream, minBufferBytes: 8, options: Options.OmitWhitespace); tokens.MoveNext(); Assert.That(tokens.Position, Is.EqualTo(0)); // ab... // tokens.MoveNext(); diff --git a/uax29/Unicode.Test.cs b/uax29/Unicode.Test.cs index f0f343b..c8c5491 100644 --- a/uax29/Unicode.Test.cs +++ b/uax29/Unicode.Test.cs @@ -69,12 +69,12 @@ internal static void TestTokenizerTextReader(StreamEnumerator tokens, Unic } private delegate SplitEnumerator ByteMethod(byte[] input); - static readonly ByteMethod byteWords = (byte[] input) => Tokenizer.GetWords(input); // because of the optional parameter - static readonly ByteMethod[] byteMethods = [byteWords, Tokenizer.GetGraphemes, Tokenizer.GetSentences]; + static readonly ByteMethod byteWords = (byte[] input) => Split.Words(input); // because of the optional parameter + static readonly ByteMethod[] byteMethods = [byteWords, Split.Graphemes, Split.Sentences]; private delegate SplitEnumerator CharMethod(char[] input); - static readonly CharMethod charWords = (char[] input) => Tokenizer.GetWords(input); // because of the optional parameter - static readonly CharMethod[] charMethods = [charWords, Tokenizer.GetGraphemes, Tokenizer.GetSentences]; + static readonly CharMethod charWords = 
(char[] input) => Split.Words(input); // because of the optional parameter + static readonly CharMethod[] charMethods = [charWords, Split.Graphemes, Split.Sentences]; [Test] public void InvalidEncoding() diff --git a/uax29/Words.Test.cs b/uax29/Words.Test.cs index 0f3d29c..12c71ad 100644 --- a/uax29/Words.Test.cs +++ b/uax29/Words.Test.cs @@ -1,13 +1,13 @@ // generated from https://www.unicode.org/Public/15.0.0/ucd/auxiliary/WordBreakTest.txt -namespace Tests; - -using System.Text; -using UAX29; - -[TestFixture] -public class WordsTests -{ - internal readonly static UnicodeTest[] UnicodeTests = [ +namespace Tests; + +using System.Text; +using UAX29; + +[TestFixture] +public class WordsTests +{ + internal readonly static UnicodeTest[] UnicodeTests = [ new([0x0001, 0x0001], [[0x0001], [0x0001]], "÷ [0.2] (Other) ÷ [999.0] (Other) ÷ [0.3]"), new([0x0001, 0x00CC, 0x0088, 0x0001], [[0x0001, 0x00CC, 0x0088], [0x0001]], "÷ [0.2] (Other) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [999.0] (Other) ÷ [0.3]"), new([0x0001, 0x000D], [[0x0001], [0x000D]], "÷ [0.2] (Other) ÷ [3.2] (CR) ÷ [0.3]"), @@ -1831,40 +1831,40 @@ public class WordsTests new([0x0061, 0x002C, 0x002C, 0x0061], [[0x0061], [0x002C], [0x002C], [0x0061]], "÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) ÷ [999.0] COMMA (MidNum) ÷ [999.0] LATIN SMALL LETTER A (ALetter) ÷ [0.3]"), new([0x0061, 0x005F, 0x0031, 0x002C, 0x002C, 0x0061], [[0x0061, 0x005F, 0x0031], [0x002C], [0x002C], [0x0061]], "÷ [0.2] LATIN SMALL LETTER A (ALetter) × [13.1] LOW LINE (ExtendNumLet) × [13.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) ÷ [999.0] COMMA (MidNum) ÷ [999.0] LATIN SMALL LETTER A (ALetter) ÷ [0.3]"), new([0x0061, 0x005F, 0x0061, 0x002C, 0x002C, 0x0061], [[0x0061, 0x005F, 0x0061], [0x002C], [0x002C], [0x0061]], "÷ [0.2] LATIN SMALL LETTER A (ALetter) × [13.1] LOW LINE (ExtendNumLet) × [13.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) ÷ [999.0] COMMA (MidNum) ÷ [999.0] LATIN SMALL LETTER A (ALetter) 
÷ [0.3]"), - - ]; - - static readonly UnicodeTest[] Tests = UnicodeTests; - - [Test, TestCaseSource(nameof(Tests))] - public void Bytes(UnicodeTest test) - { - var tokens = Tokenizer.GetWords(test.input); - TestUnicode.TestTokenizerBytes(tokens, test); - } - - [Test, TestCaseSource(nameof(Tests))] - public void String(UnicodeTest test) - { - var s = Encoding.UTF8.GetString(test.input); - var tokens = Tokenizer.GetWords(s); - TestUnicode.TestTokenizerChars(tokens, test); - } - - [Test, TestCaseSource(nameof(Tests))] - public void Stream(UnicodeTest test) - { - using var stream = new MemoryStream(test.input); - var tokens = Tokenizer.GetWords(stream); - TestUnicode.TestTokenizerStream(tokens, test); - } - - [Test, TestCaseSource(nameof(Tests))] - public void TextReader(UnicodeTest test) - { - using var stream = new MemoryStream(test.input); - using var reader = new StreamReader(stream); - var tokens = Tokenizer.GetWords(reader); - TestUnicode.TestTokenizerTextReader(tokens, test); - } -} + + ]; + + static readonly UnicodeTest[] Tests = UnicodeTests; + + [Test, TestCaseSource(nameof(Tests))] + public void Bytes(UnicodeTest test) + { + var tokens = Split.Words(test.input); + TestUnicode.TestTokenizerBytes(tokens, test); + } + + [Test, TestCaseSource(nameof(Tests))] + public void String(UnicodeTest test) + { + var s = Encoding.UTF8.GetString(test.input); + var tokens = Split.Words(s); + TestUnicode.TestTokenizerChars(tokens, test); + } + + [Test, TestCaseSource(nameof(Tests))] + public void Stream(UnicodeTest test) + { + using var stream = new MemoryStream(test.input); + var tokens = Split.Words(stream); + TestUnicode.TestTokenizerStream(tokens, test); + } + + [Test, TestCaseSource(nameof(Tests))] + public void TextReader(UnicodeTest test) + { + using var stream = new MemoryStream(test.input); + using var reader = new StreamReader(stream); + var tokens = Split.Words(reader); + TestUnicode.TestTokenizerTextReader(tokens, test); + } +} From 
417bfbad4658d37d37c8c13c46780fb399b8c2da Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Mon, 15 Jul 2024 12:10:28 -0400 Subject: [PATCH 05/17] fewer overloads --- README.md | 34 +++++++--- uax29/Extensions/Extensions.Graphemes.cs | 8 +-- uax29/Extensions/Extensions.Sentences.cs | 12 ++-- uax29/Extensions/Extensions.Words.cs | 12 ++-- uax29/README.md | 24 ++++--- uax29/RangeEnumerator.Test.cs | 22 +++---- uax29/Split.Graphemes.cs | 82 ----------------------- uax29/Split.Sentences.cs | 84 ------------------------ uax29/Split.Words.cs | 82 ----------------------- uax29/SplitEnumerator.Test.cs | 68 ++----------------- uax29/Unicode.Test.cs | 8 +-- 11 files changed, 72 insertions(+), 364 deletions(-) diff --git a/README.md b/README.md index 3a911d4..db6d0d1 100644 --- a/README.md +++ b/README.md @@ -11,15 +11,15 @@ dotnet add package UAX29 ``` ```csharp +using UAX29; using System.Text; -using UAX29.Extensions; var example = "Hello, 🌏 world. 你好,世界."; // The tokenizer can split words, graphemes or sentences. // It operates on strings, UTF-8 bytes, and streams. -var words = example.SplitWords(); +var words = Split.Words(example); // Iterate over the tokens foreach (var word in words) @@ -47,7 +47,7 @@ world */ var utf8bytes = Encoding.UTF8.GetBytes(example); -var graphemes = utf8bytes.SplitGraphemes(); +var graphemes = Split.Graphemes(utf8bytes); // Iterate over the tokens foreach (var grapheme in graphemes) @@ -84,12 +84,22 @@ d */ ``` +There are also optional extension methods in the spirit of `string.Split`: + +```csharp +using UAX29.Extensions; + +example.SplitWords(); +``` + ### Data types For UTF-8 bytes, pass `byte[]`, `Span` or `Stream`; the resulting tokens will be `ReadOnlySpan`. For strings/chars, pass `string`, `char[]`, `Span` or `TextReader`/`StreamReader`; the resulting tokens will be `ReadOnlySpan`. +If you have `Memory`, use `Memory.Span`. + ### Conformance We use the official Unicode [test suites](https://unicode.org/reports/tr41/tr41-26.html#Tests29). 
Status: @@ -104,13 +114,13 @@ When tokenizing words, I get around 120MB/s on my Macbook M2. For typical text, The tokenizer is implemented as a `ref struct`, so you should see zero allocations for static text such as `byte[]` or `string`/`char`. -Calling `SplitWords` et al returns a lazy enumerator, and will not allocate per-token. There are `ToList` and `ToArray` methods for convenience, which will allocate. +Calling `Split.Words` returns a lazy enumerator, and will not allocate per-token. There are `ToList` and `ToArray` methods for convenience, which will allocate. -For `Stream` or `TextReader`/`StreamReader`, a buffer needs to be allocated behind the scenes. You can specify the size when calling `SplitWords`. You can also optionally pass your own `byte[]` or `char[]` to do your own allocation, perhaps with [ArrayPool](https://learn.microsoft.com/en-us/dotnet/api/system.buffers.arraypool-1). Or, you can re-use the buffer by calling `SetStream` on an existing tokenizer, which will avoid re-allocation. +For `Stream` or `TextReader`/`StreamReader`, a buffer needs to be allocated behind the scenes. You can specify the size when calling `GetWords`. You can also optionally pass your own `byte[]` or `char[]` to do your own allocation, perhaps with [ArrayPool](https://learn.microsoft.com/en-us/dotnet/api/system.buffers.arraypool-1). Or, you can re-use the buffer by calling `SetStream` on an existing tokenizer, which will avoid re-allocation. ### Options -Pass `Options.OmitWhitespace` if you would like whitespace-only tokens not to be returned. +Pass `Options.OmitWhitespace` if you would like whitespace-only tokens not to be returned (for words only). ### Invalid inputs @@ -118,14 +128,20 @@ The tokenizer expects valid (decodable) UTF-8 bytes or UTF-16 chars as input. 
We ### Major version changes -If you are using v1.x of this package, v2 has been renamed: +#### v2 → v3 + +Renamed methods: + +`Tokenizer.GetWords(input)` → `Split.Words(input)` + +#### v1 → v2 + +Renamed package, namespace and methods: `dotnet add package uax29.net` → `dotnet add package UAX29` `using uax29` → `using UAX29` -We renamed the methods: - `Tokenizer.Create(input)` → `Tokenizer.GetWords(input)` `Tokenizer.Create(input, TokenType.Graphemes)` → `Tokenizer.GetGraphemes(input)` diff --git a/uax29/Extensions/Extensions.Graphemes.cs b/uax29/Extensions/Extensions.Graphemes.cs index 9909691..88755b8 100644 --- a/uax29/Extensions/Extensions.Graphemes.cs +++ b/uax29/Extensions/Extensions.Graphemes.cs @@ -28,7 +28,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitGraphemes(this Memory input) => Split.Graphemes(input); + public static SplitEnumerator SplitGraphemes(this Memory input) => Split.Graphemes(input.Span); /// /// Split the graphemes in the given of UTF-8 encoded bytes. @@ -37,7 +37,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitGraphemes(this ReadOnlyMemory input) => Split.Graphemes(input); + public static SplitEnumerator SplitGraphemes(this ReadOnlyMemory input) => Split.Graphemes(input.Span); /// /// Split the graphemes in the given array of UTF-8 encoded bytes. @@ -92,7 +92,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitGraphemes(this Memory input) => Split.Graphemes(input); + public static SplitEnumerator SplitGraphemes(this Memory input) => Split.Graphemes(input.Span); /// /// Split the graphemes in the given of . @@ -101,7 +101,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. 
Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitGraphemes(this ReadOnlyMemory input) => Split.Graphemes(input); + public static SplitEnumerator SplitGraphemes(this ReadOnlyMemory input) => Split.Graphemes(input.Span); /// /// Split the graphemes in the given of UTF-8 encoded bytes. diff --git a/uax29/Extensions/Extensions.Sentences.cs b/uax29/Extensions/Extensions.Sentences.cs index 173ec3c..37d2a4c 100644 --- a/uax29/Extensions/Extensions.Sentences.cs +++ b/uax29/Extensions/Extensions.Sentences.cs @@ -28,7 +28,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitSentences(this Memory input) => Split.Sentences(input); + public static SplitEnumerator SplitSentences(this Memory input) => Split.Sentences(input.Span); /// /// Split the graphemes in the given of UTF-8 encoded bytes. @@ -37,7 +37,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitSentences(this ReadOnlyMemory input) => Split.Sentences(input); + public static SplitEnumerator SplitSentences(this ReadOnlyMemory input) => Split.Sentences(input.Span); /// /// Split the graphemes in the given array of UTF-8 encoded bytes. @@ -46,7 +46,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitSentences(this byte[] input) => Split.Sentences(input); + public static SplitEnumerator SplitSentences(this byte[] input) => Split.Sentences(input.AsSpan()); /// /// Split the graphemes in the given string. @@ -64,7 +64,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). 
/// - public static SplitEnumerator SplitSentences(this char[] input) => Split.Sentences(input); + public static SplitEnumerator SplitSentences(this char[] input) => Split.Sentences(input.AsSpan()); /// /// Split the graphemes in the given of . @@ -92,7 +92,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitSentences(this Memory input) => Split.Sentences(input); + public static SplitEnumerator SplitSentences(this Memory input) => Split.Sentences(input.Span); /// /// Split the graphemes in the given of . @@ -101,7 +101,7 @@ public static partial class Extensions /// /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). /// - public static SplitEnumerator SplitSentences(this ReadOnlyMemory input) => Split.Sentences(input); + public static SplitEnumerator SplitSentences(this ReadOnlyMemory input) => Split.Sentences(input.Span); /// /// Split the graphemes in the given of UTF-8 encoded bytes. diff --git a/uax29/Extensions/Extensions.Words.cs b/uax29/Extensions/Extensions.Words.cs index efeb3b7..884c06a 100644 --- a/uax29/Extensions/Extensions.Words.cs +++ b/uax29/Extensions/Extensions.Words.cs @@ -28,7 +28,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). /// - public static SplitEnumerator SplitWords(this Memory input, Options options = Options.None) => Split.Words(input, options); + public static SplitEnumerator SplitWords(this Memory input, Options options = Options.None) => Split.Words(input.Span, options); /// /// Split the words in the given of UTF-8 encoded bytes. @@ -37,7 +37,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). 
/// - public static SplitEnumerator SplitWords(this ReadOnlyMemory input, Options options = Options.None) => Split.Words(input, options); + public static SplitEnumerator SplitWords(this ReadOnlyMemory input, Options options = Options.None) => Split.Words(input.Span, options); /// /// Split the words in the given array of UTF-8 encoded bytes. @@ -46,7 +46,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). /// - public static SplitEnumerator SplitWords(this byte[] input, Options options = Options.None) => Split.Words(input, options); + public static SplitEnumerator SplitWords(this byte[] input, Options options = Options.None) => Split.Words(input.AsSpan(), options); /// /// Split the words in the given string. @@ -64,7 +64,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). /// - public static SplitEnumerator SplitWords(this char[] input, Options options = Options.None) => Split.Words(input, options); + public static SplitEnumerator SplitWords(this char[] input, Options options = Options.None) => Split.Words(input.AsSpan(), options); /// /// Split the words in the given of . @@ -92,7 +92,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). /// - public static SplitEnumerator SplitWords(this Memory input, Options options = Options.None) => Split.Words(input, options); + public static SplitEnumerator SplitWords(this Memory input, Options options = Options.None) => Split.Words(input.Span, options); /// /// Split the words in the given of . @@ -101,7 +101,7 @@ public static partial class Extensions /// /// An enumerator of words. Use foreach (var word in words). 
/// - public static SplitEnumerator SplitWords(this ReadOnlyMemory input, Options options = Options.None) => Split.Words(input, options); + public static SplitEnumerator SplitWords(this ReadOnlyMemory input, Options options = Options.None) => Split.Words(input.Span, options); /// /// Split the words in the given of UTF-8 encoded bytes. diff --git a/uax29/README.md b/uax29/README.md index fc6f075..db6d0d1 100644 --- a/uax29/README.md +++ b/uax29/README.md @@ -19,7 +19,7 @@ var example = "Hello, 🌏 world. 你好,世界."; // The tokenizer can split words, graphemes or sentences. // It operates on strings, UTF-8 bytes, and streams. -var words = Tokenizer.GetWords(example); +var words = Split.Words(example); // Iterate over the tokens foreach (var word in words) @@ -47,7 +47,7 @@ world */ var utf8bytes = Encoding.UTF8.GetBytes(example); -var graphemes = Tokenizer.GetGraphemes(utf8bytes); +var graphemes = Split.Graphemes(utf8bytes); // Iterate over the tokens foreach (var grapheme in graphemes) @@ -84,19 +84,29 @@ d */ ``` +There are also optional extension methods in the spirit of `string.Split`: + +```csharp +using UAX29.Extensions; + +example.SplitWords(); +``` + ### Data types For UTF-8 bytes, pass `byte[]`, `Span` or `Stream`; the resulting tokens will be `ReadOnlySpan`. For strings/chars, pass `string`, `char[]`, `Span` or `TextReader`/`StreamReader`; the resulting tokens will be `ReadOnlySpan`. +If you have `Memory`, use `Memory.Span`. + ### Conformance We use the official Unicode [test suites](https://unicode.org/reports/tr41/tr41-26.html#Tests29). Status: [![.NET](https://github.com/clipperhouse/uax29.net/actions/workflows/dotnet.yml/badge.svg)](https://github.com/clipperhouse/uax29.net/actions/workflows/dotnet.yml) -This is the same algorithm that is implemented in Lucene's [StandardTokenizer](https://lucene.apache.org/core/6_5_0/core/org/apache/lucene/analysis/standard/StandardTokenizer.html). 
+This is the same spec that is implemented in Lucene's [StandardTokenizer](https://lucene.apache.org/core/6_5_0/core/org/apache/lucene/analysis/standard/StandardTokenizer.html). ### Performance @@ -104,13 +114,13 @@ When tokenizing words, I get around 120MB/s on my Macbook M2. For typical text, The tokenizer is implemented as a `ref struct`, so you should see zero allocations for static text such as `byte[]` or `string`/`char`. -Calling `GetWords` et al returns a lazy enumerator, and will not allocate per-token. There are `ToList` and `ToArray` methods for convenience, which will allocate. +Calling `Split.Words` returns a lazy enumerator, and will not allocate per-token. There are `ToList` and `ToArray` methods for convenience, which will allocate. For `Stream` or `TextReader`/`StreamReader`, a buffer needs to be allocated behind the scenes. You can specify the size when calling `GetWords`. You can also optionally pass your own `byte[]` or `char[]` to do your own allocation, perhaps with [ArrayPool](https://learn.microsoft.com/en-us/dotnet/api/system.buffers.arraypool-1). Or, you can re-use the buffer by calling `SetStream` on an existing tokenizer, which will avoid re-allocation. ### Options -Pass `Options.OmitWhitespace` if you would like whitespace-only tokens not to be returned. +Pass `Options.OmitWhitespace` if you would like whitespace-only tokens not to be returned (for words only). 
### Invalid inputs @@ -124,10 +134,6 @@ Renamed methods: `Tokenizer.GetWords(input)` → `Split.Words(input)` -or - -`Tokenizer.GetWords(input)` → `input.SplitWords()` - #### v1 → v2 Renamed package, namespace and methods: diff --git a/uax29/RangeEnumerator.Test.cs b/uax29/RangeEnumerator.Test.cs index 1032ed6..e790158 100644 --- a/uax29/RangeEnumerator.Test.cs +++ b/uax29/RangeEnumerator.Test.cs @@ -67,27 +67,21 @@ public void MatchesTokenizer() public void Enumerator() { var input = "Hello, how are you?"; - var mem = input.AsMemory(); - Split.Words(mem); var words = Split.Words(input); - var ranges = words.Ranges; - - var first = new List(); - while (ranges.MoveNext()) + var first = new List(); + foreach (var word in words) { - first.Add(ranges.Current); + first.Add(word.ToString()); } - Assert.That(first, Has.Count.GreaterThan(1)); // just make sure it did the thing - - var tokens2 = Split.Words(input); - var ranges2 = words.Ranges; + Assert.That(first, Has.Count.GreaterThan(1)); // just make sure it did the thing - var second = new List(); - foreach (var range in ranges2) + var ranges = Split.Words(input).Ranges; + var second = new List(); + foreach (var range in ranges) { - second.Add(range); + second.Add(input[range]); } Assert.That(first.SequenceEqual(second)); } diff --git a/uax29/Split.Graphemes.cs b/uax29/Split.Graphemes.cs index 61115fb..135d720 100644 --- a/uax29/Split.Graphemes.cs +++ b/uax29/Split.Graphemes.cs @@ -2,15 +2,6 @@ public static partial class Split { - /// - /// Split the graphemes in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ - /// - /// The UTF-8 bytes to tokenize. - /// - /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). - /// - public static SplitEnumerator Graphemes(Span input) => new(input, UAX29.Graphemes.SplitBytes); - /// /// Split the graphemes in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. 
https://unicode.org/reports/tr29/ /// @@ -20,61 +11,6 @@ public static partial class Split /// public static SplitEnumerator Graphemes(ReadOnlySpan input) => new(input, UAX29.Graphemes.SplitBytes); - /// - /// Split the graphemes in the given of UTF-8 encoded bytes. - /// - /// The UTF-8 bytes to tokenize. - /// - /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). - /// - public static SplitEnumerator Graphemes(Memory input) => new(input.Span, UAX29.Graphemes.SplitBytes); - - /// - /// Split the graphemes in the given of UTF-8 encoded bytes. - /// - /// The UTF-8 bytes to tokenize. - /// - /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). - /// - public static SplitEnumerator Graphemes(ReadOnlyMemory input) => new(input.Span, UAX29.Graphemes.SplitBytes); - - /// - /// Split the graphemes in the given array of UTF-8 encoded bytes. - /// - /// The UTF-8 bytes to tokenize. - /// - /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). - /// - public static SplitEnumerator Graphemes(byte[] input) => new(input.AsSpan(), UAX29.Graphemes.SplitBytes); - - /// - /// Split the graphemes in the given string. - /// - /// The string to tokenize. - /// - /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). - /// - public static SplitEnumerator Graphemes(string input) => new(input.AsSpan(), UAX29.Graphemes.SplitChars); - - /// - /// Split the graphemes in the given string. - /// - /// The string to tokenize. - /// - /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). - /// - public static SplitEnumerator Graphemes(char[] input) => new(input.AsSpan(), UAX29.Graphemes.SplitChars); - - /// - /// Split the graphemes in the given of . - /// - /// The chars to tokenize. - /// - /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). 
- /// - /// - public static SplitEnumerator Graphemes(Span input) => new(input, UAX29.Graphemes.SplitChars); - /// /// Split the graphemes in the given of . /// @@ -84,24 +20,6 @@ public static partial class Split /// public static SplitEnumerator Graphemes(ReadOnlySpan input) => new(input, UAX29.Graphemes.SplitChars); - /// - /// Split the graphemes in the given of . - /// - /// The chars to tokenize. - /// - /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). - /// - public static SplitEnumerator Graphemes(Memory input) => new(input.Span, UAX29.Graphemes.SplitChars); - - /// - /// Split the graphemes in the given of . - /// - /// The chars to tokenize. - /// - /// An enumerator of graphemes. Use foreach (var grapheme in graphemes). - /// - public static SplitEnumerator Graphemes(ReadOnlyMemory input) => new(input.Span, UAX29.Graphemes.SplitChars); - /// /// Split the graphemes in the given of UTF-8 encoded bytes. /// diff --git a/uax29/Split.Sentences.cs b/uax29/Split.Sentences.cs index 5e5f544..09d7c37 100644 --- a/uax29/Split.Sentences.cs +++ b/uax29/Split.Sentences.cs @@ -2,15 +2,6 @@ public static partial class Split { - /// - /// Split the sentences in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ - /// - /// The UTF-8 bytes to tokenize. - /// - /// An enumerator of sentences. Use foreach (var sentence in sentences). - /// - public static SplitEnumerator Sentences(Span input) => new(input, UAX29.Sentences.SplitBytes); - /// /// Split the sentences in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ /// @@ -20,61 +11,6 @@ public static partial class Split /// public static SplitEnumerator Sentences(ReadOnlySpan input) => new(input, UAX29.Sentences.SplitBytes); - /// - /// Split the sentences in the given of UTF-8 encoded bytes. - /// - /// The UTF-8 bytes to tokenize. - /// - /// An enumerator of sentences. 
Use foreach (var sentence in sentences). - /// - public static SplitEnumerator Sentences(Memory input) => new(input.Span, UAX29.Sentences.SplitBytes); - - /// - /// Split the sentences in the given of UTF-8 encoded bytes. - /// - /// The UTF-8 bytes to tokenize. - /// - /// An enumerator of sentences. Use foreach (var sentence in sentences). - /// - public static SplitEnumerator Sentences(ReadOnlyMemory input) => new(input.Span, UAX29.Sentences.SplitBytes); - - /// - /// Split the sentences in the given array of UTF-8 encoded bytes. - /// - /// The UTF-8 bytes to tokenize. - /// - /// An enumerator of sentences. Use foreach (var sentence in sentences). - /// - public static SplitEnumerator Sentences(byte[] input) => new(input.AsSpan(), UAX29.Sentences.SplitBytes); - - /// - /// Split the sentences in the given string. - /// - /// The string to tokenize. - /// - /// An enumerator of sentences. Use foreach (var sentence in sentences). - /// - public static SplitEnumerator Sentences(string input) => new(input.AsSpan(), UAX29.Sentences.SplitChars); - - /// - /// Split the sentences in the given string. - /// - /// The string to tokenize. - /// - /// An enumerator of sentences. Use foreach (var sentence in sentences). - /// - public static SplitEnumerator Sentences(char[] input) => new(input.AsSpan(), UAX29.Sentences.SplitChars); - - /// - /// Split the sentences in the given of . - /// - /// The chars to tokenize. - /// - /// An enumerator of sentences. Use foreach (var sentence in sentences). - /// - /// - public static SplitEnumerator Sentences(Span input) => new(input, UAX29.Sentences.SplitChars); - /// /// Split the sentences in the given of . /// @@ -84,26 +20,6 @@ public static partial class Split /// public static SplitEnumerator Sentences(ReadOnlySpan input) => new(input, UAX29.Sentences.SplitChars); - /// - /// Split the sentences in the given of . - /// - /// The chars to tokenize. - /// - /// An enumerator of sentences. 
Use foreach (var sentence in sentences). - /// - public static SplitEnumerator Sentences(Memory input) => new(input.Span, UAX29.Sentences.SplitChars); - - /// - /// Split the sentences in the given of . - /// - /// The chars to tokenize. - /// - /// An enumerator of sentences. Use foreach (var sentence in sentences). - /// - public static SplitEnumerator Sentences(ReadOnlyMemory input) => new(input.Span, UAX29.Sentences.SplitChars); - - - /// /// Split the sentences in the given of UTF-8 encoded bytes. /// diff --git a/uax29/Split.Words.cs b/uax29/Split.Words.cs index 4eb5df1..1b6fad6 100644 --- a/uax29/Split.Words.cs +++ b/uax29/Split.Words.cs @@ -2,15 +2,6 @@ public static partial class Split { - /// - /// Split the words in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ - /// - /// The UTF-8 bytes to tokenize. - /// - /// An enumerator of words. Use foreach (var word in words). - /// - public static SplitEnumerator Words(Span input, Options options = Options.None) => new(input, UAX29.Words.SplitBytes, options); - /// /// Split the words in the given of UTF-8 encoded bytes, according to the Unicode UAX #29 spec. https://unicode.org/reports/tr29/ /// @@ -20,61 +11,6 @@ public static partial class Split /// public static SplitEnumerator Words(ReadOnlySpan input, Options options = Options.None) => new(input, UAX29.Words.SplitBytes, options); - /// - /// Split the words in the given of UTF-8 encoded bytes. - /// - /// The UTF-8 bytes to tokenize. - /// - /// An enumerator of words. Use foreach (var word in words). - /// - public static SplitEnumerator Words(Memory input, Options options = Options.None) => new(input.Span, UAX29.Words.SplitBytes, options); - - /// - /// Split the words in the given of UTF-8 encoded bytes. - /// - /// The UTF-8 bytes to tokenize. - /// - /// An enumerator of words. Use foreach (var word in words). 
- /// - public static SplitEnumerator Words(ReadOnlyMemory input, Options options = Options.None) => new(input.Span, UAX29.Words.SplitBytes, options); - - /// - /// Split the words in the given array of UTF-8 encoded bytes. - /// - /// The UTF-8 bytes to tokenize. - /// - /// An enumerator of words. Use foreach (var word in words). - /// - public static SplitEnumerator Words(byte[] input, Options options = Options.None) => new(input.AsSpan(), UAX29.Words.SplitBytes, options); - - /// - /// Split the words in the given string. - /// - /// The string to tokenize. - /// - /// An enumerator of words. Use foreach (var word in words). - /// - public static SplitEnumerator Words(string input, Options options = Options.None) => new(input.AsSpan(), UAX29.Words.SplitChars, options); - - /// - /// Split the words in the given string. - /// - /// The string to tokenize. - /// - /// An enumerator of words. Use foreach (var word in words). - /// - public static SplitEnumerator Words(char[] input, Options options = Options.None) => new(input.AsSpan(), UAX29.Words.SplitChars, options); - - /// - /// Split the words in the given of . - /// - /// The chars to tokenize. - /// - /// An enumerator of words. Use foreach (var word in words). - /// - /// - public static SplitEnumerator Words(Span input, Options options = Options.None) => new(input, UAX29.Words.SplitChars, options); - /// /// Split the words in the given of . /// @@ -84,24 +20,6 @@ public static partial class Split /// public static SplitEnumerator Words(ReadOnlySpan input, Options options = Options.None) => new(input, UAX29.Words.SplitChars, options); - /// - /// Split the words in the given of . - /// - /// The chars to tokenize. - /// - /// An enumerator of words. Use foreach (var word in words). - /// - public static SplitEnumerator Words(Memory input, Options options = Options.None) => new(input.Span, UAX29.Words.SplitChars, options); - - /// - /// Split the words in the given of . - /// - /// The chars to tokenize. 
- /// - /// An enumerator of words. Use foreach (var word in words). - /// - public static SplitEnumerator Words(ReadOnlyMemory input, Options options = Options.None) => new(input.Span, UAX29.Words.SplitChars, options); - /// /// Split the words in the given of UTF-8 encoded bytes. /// diff --git a/uax29/SplitEnumerator.Test.cs b/uax29/SplitEnumerator.Test.cs index d8a0228..23765d1 100644 --- a/uax29/SplitEnumerator.Test.cs +++ b/uax29/SplitEnumerator.Test.cs @@ -73,14 +73,10 @@ static int ExpectedOverloads() expected++; // char[] expected++; // Span expected++; // ReadOnlySpan - expected++; // Memory - expected++; // ReadOnlyMemory expected++; // byte[] expected++; // Span expected++; // ReadOnlySpan - expected++; // Memory - expected++; // ReadOnlyMemory expected++; // Stream expected++; // TextReader @@ -104,9 +100,8 @@ public void Overloads() using var stream = new MemoryStream(bytes); using var reader = new StreamReader(stream); + // Chars { - // chars - Split.Words(input); got++; var array = input.ToCharArray(); @@ -118,19 +113,9 @@ public void Overloads() ReadOnlySpan rspan = input.AsSpan(); Split.Words(rspan); got++; - var mem = new Memory(array); - Split.Words(mem); got++; - - ReadOnlyMemory rmem = input.AsMemory(); - Split.Words(rmem); got++; - Split.Words(reader); got++; } - - { - // chars - Split.Graphemes(input); got++; var array = input.ToCharArray(); @@ -142,19 +127,9 @@ public void Overloads() ReadOnlySpan rspan = input.AsSpan(); Split.Graphemes(rspan); got++; - var mem = new Memory(array); - Split.Graphemes(mem); got++; - - ReadOnlyMemory rmem = input.AsMemory(); - Split.Graphemes(rmem); got++; - Split.Graphemes(reader); got++; } - - { - // chars - Split.Sentences(input); got++; var array = input.ToCharArray(); @@ -166,18 +141,11 @@ public void Overloads() ReadOnlySpan rspan = input.AsSpan(); Split.Sentences(rspan); got++; - var mem = new Memory(array); - Split.Sentences(mem); got++; - - ReadOnlyMemory rmem = input.AsMemory(); - 
Split.Sentences(rmem); got++; - Split.Sentences(reader); got++; } + // Bytes { - // bytes - Split.Words(bytes); got++; Span span = bytes.AsSpan(); @@ -186,19 +154,9 @@ public void Overloads() ReadOnlySpan rspan = bytes.AsSpan(); Split.Words(rspan); got++; - Memory mem = bytes.AsMemory(); - Split.Words(mem); got++; - - ReadOnlyMemory rmem = bytes.AsMemory(); - Split.Words(rmem); got++; - Split.Words(stream); got++; } - - { - // bytes - Split.Graphemes(bytes); got++; Span span = bytes.AsSpan(); @@ -207,19 +165,9 @@ public void Overloads() ReadOnlySpan rspan = bytes.AsSpan(); Split.Graphemes(rspan); got++; - Memory mem = bytes.AsMemory(); - Split.Graphemes(mem); got++; - - ReadOnlyMemory rmem = bytes.AsMemory(); - Split.Graphemes(rmem); got++; - Split.Graphemes(stream); got++; } - - { - // bytes - Split.Sentences(bytes); got++; Span span = bytes.AsSpan(); @@ -228,12 +176,6 @@ public void Overloads() ReadOnlySpan rspan = bytes.AsSpan(); Split.Sentences(rspan); got++; - Memory mem = bytes.AsMemory(); - Split.Sentences(mem); got++; - - ReadOnlyMemory rmem = bytes.AsMemory(); - Split.Sentences(rmem); got++; - Split.Sentences(stream); got++; } @@ -244,9 +186,7 @@ public void Overloads() public void Enumerator() { var input = "Hello, how are you?"; - var mem = input.AsMemory(); var bytes = Encoding.UTF8.GetBytes(input); - Split.Words(mem); var tokens = Split.Words(input); var first = new List(); @@ -257,11 +197,11 @@ public void Enumerator() } Assert.That(first, Has.Count.GreaterThan(1)); // just make sure it did the thing - var tokens2 = Split.Words(input); + var tokens2 = Split.Words(bytes); var second = new List(); foreach (var token in tokens2) { - var s = token.ToString(); + var s = Encoding.UTF8.GetString(token); second.Add(s); } Assert.That(first.SequenceEqual(second)); diff --git a/uax29/Unicode.Test.cs b/uax29/Unicode.Test.cs index c8c5491..e2a3d6b 100644 --- a/uax29/Unicode.Test.cs +++ b/uax29/Unicode.Test.cs @@ -68,12 +68,12 @@ internal static void 
TestTokenizerTextReader(StreamEnumerator tokens, Unic } } - private delegate SplitEnumerator ByteMethod(byte[] input); - static readonly ByteMethod byteWords = (byte[] input) => Split.Words(input); // because of the optional parameter + private delegate SplitEnumerator ByteMethod(ReadOnlySpan input); + static readonly ByteMethod byteWords = (ReadOnlySpan input) => Split.Words(input); // because of the optional parameter static readonly ByteMethod[] byteMethods = [byteWords, Split.Graphemes, Split.Graphemes]; - private delegate SplitEnumerator CharMethod(char[] input); - static readonly CharMethod charWords = (char[] input) => Split.Words(input); // because of the optional parameter + private delegate SplitEnumerator CharMethod(ReadOnlySpan input); + static readonly CharMethod charWords = (ReadOnlySpan input) => Split.Words(input); // because of the optional parameter static readonly CharMethod[] charMethods = [charWords, Split.Graphemes, Split.Sentences]; [Test] From be37ea6f67dc5a85563e1d46d309f3c1efb56525 Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Mon, 15 Jul 2024 12:21:57 -0400 Subject: [PATCH 06/17] readme & comments --- README.md | 2 +- uax29/Extensions/Extensions.Test.cs | 17 +++-------------- uax29/README.md | 2 +- 3 files changed, 5 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index db6d0d1..0bae8a1 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,7 @@ For UTF-8 bytes, pass `byte[]`, `Span` or `Stream`; the resulting tokens w For strings/chars, pass `string`, `char[]`, `Span` or `TextReader`/`StreamReader`; the resulting tokens will be `ReadOnlySpan`. -If you have `Memory`, use `Memory.Span`. +If you have `Memory`, pass `Memory.Span`. 
### Conformance diff --git a/uax29/Extensions/Extensions.Test.cs b/uax29/Extensions/Extensions.Test.cs index 751059d..f8e365c 100644 --- a/uax29/Extensions/Extensions.Test.cs +++ b/uax29/Extensions/Extensions.Test.cs @@ -50,6 +50,7 @@ public void Overloads() using var stream = new MemoryStream(bytes); using var reader = new StreamReader(stream); + // Chars { // string input.SplitWords(); got++; @@ -68,16 +69,13 @@ public void Overloads() var mem = new Memory(input.ToCharArray()); mem.SplitWords(); got++; - // ReadOnlyMemoryMemory + // ReadOnlyMemory ReadOnlyMemory rmem = input.AsMemory(); rmem.SplitWords(); got++; reader.SplitWords(); got++; } - { - // chars - input.SplitGraphemes(); got++; var array = input.ToCharArray(); @@ -97,11 +95,7 @@ public void Overloads() reader.SplitGraphemes(); got++; } - - { - // chars - input.SplitSentences(); got++; var array = input.ToCharArray(); @@ -122,9 +116,8 @@ public void Overloads() reader.SplitSentences(); got++; } + // Bytes { - // bytes - bytes.SplitWords(); got++; Span span = bytes.AsSpan(); @@ -141,11 +134,7 @@ public void Overloads() stream.SplitWords(); got++; } - - { - // bytes - bytes.SplitGraphemes(); got++; Span span = bytes.AsSpan(); diff --git a/uax29/README.md b/uax29/README.md index db6d0d1..0bae8a1 100644 --- a/uax29/README.md +++ b/uax29/README.md @@ -98,7 +98,7 @@ For UTF-8 bytes, pass `byte[]`, `Span` or `Stream`; the resulting tokens w For strings/chars, pass `string`, `char[]`, `Span` or `TextReader`/`StreamReader`; the resulting tokens will be `ReadOnlySpan`. -If you have `Memory`, use `Memory.Span`. +If you have `Memory`, pass `Memory.Span`. 
### Conformance From d1ad7087b390cada418e2994d76a84a500d7aaff Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Mon, 15 Jul 2024 13:29:53 -0400 Subject: [PATCH 07/17] readme --- README.md | 2 +- uax29/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0bae8a1..19903a8 100644 --- a/README.md +++ b/README.md @@ -116,7 +116,7 @@ The tokenizer is implemented as a `ref struct`, so you should see zero allocatio Calling `Split.Words` returns a lazy enumerator, and will not allocate per-token. There are `ToList` and `ToArray` methods for convenience, which will allocate. -For `Stream` or `TextReader`/`StreamReader`, a buffer needs to be allocated behind the scenes. You can specify the size when calling `GetWords`. You can also optionally pass your own `byte[]` or `char[]` to do your own allocation, perhaps with [ArrayPool](https://learn.microsoft.com/en-us/dotnet/api/system.buffers.arraypool-1). Or, you can re-use the buffer by calling `SetStream` on an existing tokenizer, which will avoid re-allocation. +For `Stream` or `TextReader`/`StreamReader`, a buffer needs to be allocated behind the scenes. You can specify the size when calling `Split.Words`. You can also optionally pass your own `byte[]` or `char[]` to do your own allocation, perhaps with [ArrayPool](https://learn.microsoft.com/en-us/dotnet/api/system.buffers.arraypool-1). Or, you can re-use the buffer by calling `SetStream` on an existing tokenizer, which will avoid re-allocation. ### Options diff --git a/uax29/README.md b/uax29/README.md index 0bae8a1..19903a8 100644 --- a/uax29/README.md +++ b/uax29/README.md @@ -116,7 +116,7 @@ The tokenizer is implemented as a `ref struct`, so you should see zero allocatio Calling `Split.Words` returns a lazy enumerator, and will not allocate per-token. There are `ToList` and `ToArray` methods for convenience, which will allocate. 
-For `Stream` or `TextReader`/`StreamReader`, a buffer needs to be allocated behind the scenes. You can specify the size when calling `GetWords`. You can also optionally pass your own `byte[]` or `char[]` to do your own allocation, perhaps with [ArrayPool](https://learn.microsoft.com/en-us/dotnet/api/system.buffers.arraypool-1). Or, you can re-use the buffer by calling `SetStream` on an existing tokenizer, which will avoid re-allocation. +For `Stream` or `TextReader`/`StreamReader`, a buffer needs to be allocated behind the scenes. You can specify the size when calling `Split.Words`. You can also optionally pass your own `byte[]` or `char[]` to do your own allocation, perhaps with [ArrayPool](https://learn.microsoft.com/en-us/dotnet/api/system.buffers.arraypool-1). Or, you can re-use the buffer by calling `SetStream` on an existing tokenizer, which will avoid re-allocation. ### Options From b54f87d5aaa736561b19fcb9a7572e609da127b9 Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Mon, 15 Jul 2024 13:30:37 -0400 Subject: [PATCH 08/17] Benchmark ArrayPool Much less allocation confirmed, but a little slower overall. 
--- Benchmarks/Program.cs | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/Benchmarks/Program.cs b/Benchmarks/Program.cs index 93a7e22..157dcb1 100644 --- a/Benchmarks/Program.cs +++ b/Benchmarks/Program.cs @@ -1,3 +1,4 @@ +using System.Buffers; using System.Text; using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Configs; @@ -77,32 +78,24 @@ public void TokenizeStringOmitWhitespace() [Benchmark] public void TokenizeStream() { - var stream = new MemoryStream(sample); - var tokens = Split.Words(stream); - foreach (var token in tokens) - { - } + sampleStream.Seek(0, SeekOrigin.Begin); + var tokens = Split.Words(sampleStream); + foreach (var token in tokens) { } } + static readonly ArrayPool pool = ArrayPool.Shared; + [Benchmark] - public void TokenizeSetStream() + public void TokenizeStreamArrayPool() { - // This is to test to observe allocations. + var storage = pool.Rent(2048); - // The creation will allocate a buffer of 1024 bytes - var tokens = Split.Words(sampleStream); + sampleStream.Seek(0, SeekOrigin.Begin); + var tokens = Split.Words(sampleStream, minBufferBytes: 1024, bufferStorage: storage); + tokens.SetStream(sampleStream); + foreach (var token in tokens) { } - var runs = 10; - // keep in mind the 10 runs when interpreting the benchmark - for (var i = 0; i < runs; i++) - { - // subsequent runs should allocate less by using SetStream - sampleStream.Seek(0, SeekOrigin.Begin); - tokens.SetStream(sampleStream); - foreach (var token in tokens) - { - } - } + pool.Return(storage); } [Benchmark] From f45b58c4cda163bd3fef045acb72cf1bc38f408f Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Mon, 15 Jul 2024 13:32:27 -0400 Subject: [PATCH 09/17] Test buffer constraint --- uax29/Buffer.Test.cs | 38 ++++++++++++++++++++++++++++++++++++++ uax29/Buffer.cs | 10 +++++----- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/uax29/Buffer.Test.cs b/uax29/Buffer.Test.cs index 06b4006..7917531 100644 --- 
a/uax29/Buffer.Test.cs +++ b/uax29/Buffer.Test.cs @@ -81,6 +81,44 @@ public void Moving() Assert.That(buffer.end, Is.EqualTo(storageSize)); } } + + [Test] + public void MinBufferSize() + { + var input = "Hello, how are you?"; + var bytes = Encoding.UTF8.GetBytes(input); + using var stream = new MemoryStream(bytes); + + { + var storage = new byte[1024]; + var minBufferBytes = 1024; + bool threw = false; + try + { + var words = new Buffer(stream.Read, minBufferBytes, storage); // ok + } + catch (ArgumentException) + { + threw = true; + } + Assert.That(threw, Is.False); + } + { + var storage = new byte[1024]; + var minBufferBytes = 1025; + + bool threw = false; + try + { + var words = new Buffer(stream.Read, minBufferBytes, storage); // not ok + } + catch (ArgumentException) + { + threw = true; + } + Assert.That(threw, Is.True); + } + } } diff --git a/uax29/Buffer.cs b/uax29/Buffer.cs index 689d391..3547d5b 100644 --- a/uax29/Buffer.cs +++ b/uax29/Buffer.cs @@ -19,15 +19,15 @@ public ref struct Buffer /// public bool EOF { get; private set; } - public Buffer(Read read, int minItems, T[]? storage = null) + public Buffer(Read read, int minBuffer, T[]? 
storage = null) { this.read = read; - this.minItems = minItems; - if (storage != null && storage.Length < minItems) + this.minItems = minBuffer; + if (storage != null && storage.Length < minBuffer) { - throw new ArgumentException($"Storage ({typeof(T)}[{storage.Length}]) must be at least as large as minItems ({minItems})."); + throw new ArgumentException($"Storage ({typeof(T)}[{storage.Length}]) must be at least as large as minBuffer ({minBuffer})."); } - storage ??= new T[minItems]; + storage ??= new T[minBuffer]; this.storage = storage; } From bf36d353cbf0ffd9cf04bedc4c67e12d56ece8d5 Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Mon, 15 Jul 2024 13:59:57 -0400 Subject: [PATCH 10/17] warn v3 --- README.md | 2 ++ uax29/README.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/README.md b/README.md index 19903a8..8900b28 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,8 @@ Any time our code operates on individual words, we are tokenizing. Often, we do ### Example +_⚠️ This documentation on `main` refers to v3, which is not yet published on Nuget. See [v2 documentation](https://github.com/clipperhouse/uax29.net/blob/v2.2.0/README.md) until then._ + ``` dotnet add package UAX29 ``` diff --git a/uax29/README.md b/uax29/README.md index 19903a8..8900b28 100644 --- a/uax29/README.md +++ b/uax29/README.md @@ -6,6 +6,8 @@ Any time our code operates on individual words, we are tokenizing. Often, we do ### Example +_⚠️ This documentation on `main` refers to v3, which is not yet published on Nuget. 
See [v2 documentation](https://github.com/clipperhouse/uax29.net/blob/v2.2.0/README.md) until then._ + ``` dotnet add package UAX29 ``` From e3ba67de0d7cd42a7ce37a815f407e84a776abaf Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Mon, 15 Jul 2024 23:52:58 -0400 Subject: [PATCH 11/17] fix up test codegen --- Codegen/Program.cs | 70 +++++++++++++++---------------- uax29/Graphemes.Dict.cs | 2 +- uax29/Graphemes.Test.cs | 92 ++++++++++++++++++++--------------------- uax29/Sentences.Dict.cs | 2 +- uax29/Sentences.Test.cs | 92 ++++++++++++++++++++--------------------- uax29/Unicode.Test.cs | 8 ++-- uax29/Words.Dict.cs | 2 +- uax29/Words.Test.cs | 92 ++++++++++++++++++++--------------------- 8 files changed, 180 insertions(+), 180 deletions(-) diff --git a/Codegen/Program.cs b/Codegen/Program.cs index 85bdda4..19a35c5 100644 --- a/Codegen/Program.cs +++ b/Codegen/Program.cs @@ -142,7 +142,7 @@ internal static partial class {typ}s } dict.Write(@" - internal static readonly Dict Dict = new(GetDict()); + static readonly Dict Dict = new(GetDict()); static Dictionary GetDict() => new() { "); @@ -181,6 +181,40 @@ static async Task WriteTests(string typ) [TestFixture] public class {typ}sTests {{ + static UnicodeTest[] Tests => UnicodeTests; + + [Test, TestCaseSource(nameof(Tests))] + public void Bytes(UnicodeTest test) + {{ + var tokens = Split.{typ}s(test.input); + TestUnicode.TestBytes(tokens, test); + }} + + [Test, TestCaseSource(nameof(Tests))] + public void String(UnicodeTest test) + {{ + var s = Encoding.UTF8.GetString(test.input); + var tokens = Split.{typ}s(s); + TestUnicode.TestChars(tokens, test); + }} + + [Test, TestCaseSource(nameof(Tests))] + public void Stream(UnicodeTest test) + {{ + using var stream = new MemoryStream(test.input); + var tokens = Split.{typ}s(stream); + TestUnicode.TestStream(tokens, test); + }} + + [Test, TestCaseSource(nameof(Tests))] + public void TextReader(UnicodeTest test) + {{ + using var stream = new MemoryStream(test.input); + using var 
reader = new StreamReader(stream); + var tokens = Split.{typ}s(reader); + TestUnicode.TestTextReader(tokens, test); + }} + internal readonly static UnicodeTest[] UnicodeTests = [ "); while (true) @@ -241,40 +275,6 @@ public class {typ}sTests } dict.Write(@$" ]; - - static readonly UnicodeTest[] Tests = UnicodeTests; - - [Test, TestCaseSource(nameof(Tests))] - public void Bytes(UnicodeTest test) - {{ - var tokens = Tokenizer.Get{typ}s(test.input); - TestUnicode.TestTokenizerBytes(tokens, test); - }} - - [Test, TestCaseSource(nameof(Tests))] - public void String(UnicodeTest test) - {{ - var s = Encoding.UTF8.GetString(test.input); - var tokens = Tokenizer.Get{typ}s(s); - TestUnicode.TestTokenizerChars(tokens, test); - }} - - [Test, TestCaseSource(nameof(Tests))] - public void Stream(UnicodeTest test) - {{ - using var stream = new MemoryStream(test.input); - var tokens = Tokenizer.Get{typ}s(stream); - TestUnicode.TestTokenizerStream(tokens, test); - }} - - [Test, TestCaseSource(nameof(Tests))] - public void TextReader(UnicodeTest test) - {{ - using var stream = new MemoryStream(test.input); - using var reader = new StreamReader(stream); - var tokens = Tokenizer.Get{typ}s(reader); - TestUnicode.TestTokenizerTextReader(tokens, test); - }} }} "); } diff --git a/uax29/Graphemes.Dict.cs b/uax29/Graphemes.Dict.cs index 0c3838d..9d3f2aa 100644 --- a/uax29/Graphemes.Dict.cs +++ b/uax29/Graphemes.Dict.cs @@ -21,7 +21,7 @@ internal static partial class Graphemes const Property ZWJ = 4096; const Property Extended_Pictographic = 8192; - internal static readonly Dict Dict = new(GetDict()); + static readonly Dict Dict = new(GetDict()); static Dictionary GetDict() => new() { {0x0600, Prepend}, diff --git a/uax29/Graphemes.Test.cs b/uax29/Graphemes.Test.cs index f891573..be92909 100644 --- a/uax29/Graphemes.Test.cs +++ b/uax29/Graphemes.Test.cs @@ -1,13 +1,47 @@ // generated from https://www.unicode.org/Public/15.0.0/ucd/auxiliary/GraphemeBreakTest.txt -namespace Tests; - -using 
System.Text; -using UAX29; - -[TestFixture] -public class GraphemesTests -{ - internal readonly static UnicodeTest[] UnicodeTests = [ +namespace Tests; + +using System.Text; +using UAX29; + +[TestFixture] +public class GraphemesTests +{ + static UnicodeTest[] Tests => UnicodeTests; + + [Test, TestCaseSource(nameof(Tests))] + public void Bytes(UnicodeTest test) + { + var tokens = Split.Graphemes(test.input); + TestUnicode.TestBytes(tokens, test); + } + + [Test, TestCaseSource(nameof(Tests))] + public void String(UnicodeTest test) + { + var s = Encoding.UTF8.GetString(test.input); + var tokens = Split.Graphemes(s); + TestUnicode.TestChars(tokens, test); + } + + [Test, TestCaseSource(nameof(Tests))] + public void Stream(UnicodeTest test) + { + using var stream = new MemoryStream(test.input); + var tokens = Split.Graphemes(stream); + TestUnicode.TestStream(tokens, test); + } + + [Test, TestCaseSource(nameof(Tests))] + public void TextReader(UnicodeTest test) + { + using var stream = new MemoryStream(test.input); + using var reader = new StreamReader(stream); + var tokens = Split.Graphemes(reader); + TestUnicode.TestTextReader(tokens, test); + } + + internal readonly static UnicodeTest[] UnicodeTests = [ new([0x0020, 0x0020], [[0x0020], [0x0020]], "÷ [0.2] SPACE (Other) ÷ [999.0] SPACE (Other) ÷ [0.3]"), new([0x0020, 0x00CC, 0x0088, 0x0020], [[0x0020, 0x00CC, 0x0088], [0x0020]], "÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend_ExtCccZwj) ÷ [999.0] SPACE (Other) ÷ [0.3]"), new([0x0020, 0x000D], [[0x0020], [0x000D]], "÷ [0.2] SPACE (Other) ÷ [5.0] (CR) ÷ [0.3]"), @@ -610,40 +644,6 @@ public class GraphemesTests new([0x0061, 0x00E2, 0x0080, 0x008D, 0x00F0, 0x009F, 0x009B, 0x0091], [[0x0061, 0x00E2, 0x0080, 0x008D], [0x00F0, 0x009F, 0x009B, 0x0091]], "÷ [0.2] LATIN SMALL LETTER A (Other) × [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) ÷ [999.0] OCTAGONAL SIGN (ExtPict) ÷ [0.3]"), new([0x00E2, 0x009C, 0x0081, 0x00E2, 0x0080, 0x008D, 0x00E2, 0x009C, 0x0081], [[0x00E2, 
0x009C, 0x0081, 0x00E2, 0x0080, 0x008D, 0x00E2, 0x009C, 0x0081]], "÷ [0.2] UPPER BLADE SCISSORS (Other) × [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) × [11.0] UPPER BLADE SCISSORS (Other) ÷ [0.3]"), new([0x0061, 0x00E2, 0x0080, 0x008D, 0x00E2, 0x009C, 0x0081], [[0x0061, 0x00E2, 0x0080, 0x008D], [0x00E2, 0x009C, 0x0081]], "÷ [0.2] LATIN SMALL LETTER A (Other) × [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) ÷ [999.0] UPPER BLADE SCISSORS (Other) ÷ [0.3]"), - - ]; - - static readonly UnicodeTest[] Tests = UnicodeTests; - - [Test, TestCaseSource(nameof(Tests))] - public void Bytes(UnicodeTest test) - { - var tokens = Split.Graphemes(test.input); - TestUnicode.TestTokenizerBytes(tokens, test); - } - - [Test, TestCaseSource(nameof(Tests))] - public void String(UnicodeTest test) - { - var s = Encoding.UTF8.GetString(test.input); - var tokens = Split.Graphemes(s); - TestUnicode.TestTokenizerChars(tokens, test); - } - - [Test, TestCaseSource(nameof(Tests))] - public void Stream(UnicodeTest test) - { - using var stream = new MemoryStream(test.input); - var tokens = Split.Graphemes(stream); - TestUnicode.TestTokenizerStream(tokens, test); - } - - [Test, TestCaseSource(nameof(Tests))] - public void TextReader(UnicodeTest test) - { - using var stream = new MemoryStream(test.input); - using var reader = new StreamReader(stream); - var tokens = Split.Graphemes(reader); - TestUnicode.TestTokenizerTextReader(tokens, test); - } -} + + ]; +} diff --git a/uax29/Sentences.Dict.cs b/uax29/Sentences.Dict.cs index 660ba7b..63756e8 100644 --- a/uax29/Sentences.Dict.cs +++ b/uax29/Sentences.Dict.cs @@ -21,7 +21,7 @@ internal static partial class Sentences const Property Close = 4096; const Property SContinue = 8192; - internal static readonly Dict Dict = new(GetDict()); + static readonly Dict Dict = new(GetDict()); static Dictionary GetDict() => new() { {0x000D, CR}, diff --git a/uax29/Sentences.Test.cs b/uax29/Sentences.Test.cs index dca41b8..3bc6753 100644 --- a/uax29/Sentences.Test.cs +++ 
b/uax29/Sentences.Test.cs @@ -1,13 +1,47 @@ // generated from https://www.unicode.org/Public/15.0.0/ucd/auxiliary/SentenceBreakTest.txt -namespace Tests; - -using System.Text; -using UAX29; - -[TestFixture] -public class SentencesTests -{ - internal readonly static UnicodeTest[] UnicodeTests = [ +namespace Tests; + +using System.Text; +using UAX29; + +[TestFixture] +public class SentencesTests +{ + static UnicodeTest[] Tests => UnicodeTests; + + [Test, TestCaseSource(nameof(Tests))] + public void Bytes(UnicodeTest test) + { + var tokens = Split.Sentences(test.input); + TestUnicode.TestBytes(tokens, test); + } + + [Test, TestCaseSource(nameof(Tests))] + public void String(UnicodeTest test) + { + var s = Encoding.UTF8.GetString(test.input); + var tokens = Split.Sentences(s); + TestUnicode.TestChars(tokens, test); + } + + [Test, TestCaseSource(nameof(Tests))] + public void Stream(UnicodeTest test) + { + using var stream = new MemoryStream(test.input); + var tokens = Split.Sentences(stream); + TestUnicode.TestStream(tokens, test); + } + + [Test, TestCaseSource(nameof(Tests))] + public void TextReader(UnicodeTest test) + { + using var stream = new MemoryStream(test.input); + using var reader = new StreamReader(stream); + var tokens = Split.Sentences(reader); + TestUnicode.TestTextReader(tokens, test); + } + + internal readonly static UnicodeTest[] UnicodeTests = [ new([0x0001, 0x0001], [[0x0001, 0x0001]], "÷ [0.2] (Other) × [998.0] (Other) ÷ [0.3]"), new([0x0001, 0x00CC, 0x0088, 0x0001], [[0x0001, 0x00CC, 0x0088, 0x0001]], "÷ [0.2] (Other) × [5.0] COMBINING DIAERESIS (Extend_FE) × [998.0] (Other) ÷ [0.3]"), new([0x0001, 0x000D], [[0x0001, 0x000D]], "÷ [0.2] (Other) × [998.0] (CR) ÷ [0.3]"), @@ -510,40 +544,6 @@ public class SentencesTests new([0x00E2, 0x0081, 0x00A0, 0x0065, 0x00E2, 0x0081, 0x00A0, 0x0074, 0x00E2, 0x0081, 0x00A0, 0x0063, 0x00E2, 0x0081, 0x00A0, 0x002E, 0x00E2, 0x0081, 0x00A0, 0x00E3, 0x0080, 0x0082, 0x00E2, 0x0081, 0x00A0, 0x00E2, 0x0081, 0x00A0], 
[[0x00E2, 0x0081, 0x00A0, 0x0065, 0x00E2, 0x0081, 0x00A0, 0x0074, 0x00E2, 0x0081, 0x00A0, 0x0063, 0x00E2, 0x0081, 0x00A0, 0x002E, 0x00E2, 0x0081, 0x00A0, 0x00E3, 0x0080, 0x0082, 0x00E2, 0x0081, 0x00A0, 0x00E2, 0x0081, 0x00A0]], "÷ [0.2] WORD JOINER (Format_FE) × [998.0] LATIN SMALL LETTER E (Lower) × [5.0] WORD JOINER (Format_FE) × [998.0] LATIN SMALL LETTER T (Lower) × [5.0] WORD JOINER (Format_FE) × [998.0] LATIN SMALL LETTER C (Lower) × [5.0] WORD JOINER (Format_FE) × [998.0] FULL STOP (ATerm) × [5.0] WORD JOINER (Format_FE) × [8.1] IDEOGRAPHIC FULL STOP (STerm) × [5.0] WORD JOINER (Format_FE) × [5.0] WORD JOINER (Format_FE) ÷ [0.3]"), new([0x00E2, 0x0081, 0x00A0, 0x00E5, 0x00AD, 0x0097, 0x00E2, 0x0081, 0x00A0, 0x00E3, 0x0080, 0x0082, 0x00E2, 0x0081, 0x00A0, 0x00E5, 0x00AE, 0x0083, 0x00E2, 0x0081, 0x00A0, 0x00E2, 0x0081, 0x00A0], [[0x00E2, 0x0081, 0x00A0, 0x00E5, 0x00AD, 0x0097, 0x00E2, 0x0081, 0x00A0, 0x00E3, 0x0080, 0x0082, 0x00E2, 0x0081, 0x00A0], [0x00E5, 0x00AE, 0x0083, 0x00E2, 0x0081, 0x00A0, 0x00E2, 0x0081, 0x00A0]], "÷ [0.2] WORD JOINER (Format_FE) × [998.0] CJK UNIFIED IDEOGRAPH-5B57 (OLetter) × [5.0] WORD JOINER (Format_FE) × [998.0] IDEOGRAPHIC FULL STOP (STerm) × [5.0] WORD JOINER (Format_FE) ÷ [11.0] CJK UNIFIED IDEOGRAPH-5B83 (OLetter) × [5.0] WORD JOINER (Format_FE) × [5.0] WORD JOINER (Format_FE) ÷ [0.3]"), new([0x00E2, 0x0081, 0x00A0, 0x0021, 0x00E2, 0x0081, 0x00A0, 0x0020, 0x00E2, 0x0081, 0x00A0, 0x0020, 0x00E2, 0x0081, 0x00A0, 0x00E2, 0x0081, 0x00A0], [[0x00E2, 0x0081, 0x00A0, 0x0021, 0x00E2, 0x0081, 0x00A0, 0x0020, 0x00E2, 0x0081, 0x00A0, 0x0020, 0x00E2, 0x0081, 0x00A0, 0x00E2, 0x0081, 0x00A0]], "÷ [0.2] WORD JOINER (Format_FE) × [998.0] EXCLAMATION MARK (STerm) × [5.0] WORD JOINER (Format_FE) × [9.0] SPACE (Sp) × [5.0] WORD JOINER (Format_FE) × [10.0] SPACE (Sp) × [5.0] WORD JOINER (Format_FE) × [5.0] WORD JOINER (Format_FE) ÷ [0.3]"), - - ]; - - static readonly UnicodeTest[] Tests = UnicodeTests; - - [Test, TestCaseSource(nameof(Tests))] - 
public void Bytes(UnicodeTest test) - { - var tokens = Split.Sentences(test.input); - TestUnicode.TestTokenizerBytes(tokens, test); - } - - [Test, TestCaseSource(nameof(Tests))] - public void String(UnicodeTest test) - { - var s = Encoding.UTF8.GetString(test.input); - var tokens = Split.Sentences(s); - TestUnicode.TestTokenizerChars(tokens, test); - } - - [Test, TestCaseSource(nameof(Tests))] - public void Stream(UnicodeTest test) - { - using var stream = new MemoryStream(test.input); - var tokens = Split.Sentences(stream); - TestUnicode.TestTokenizerStream(tokens, test); - } - - [Test, TestCaseSource(nameof(Tests))] - public void TextReader(UnicodeTest test) - { - using var stream = new MemoryStream(test.input); - using var reader = new StreamReader(stream); - var tokens = Split.Sentences(reader); - TestUnicode.TestTokenizerTextReader(tokens, test); - } -} + + ]; +} diff --git a/uax29/Unicode.Test.cs b/uax29/Unicode.Test.cs index e2a3d6b..21bb6c1 100644 --- a/uax29/Unicode.Test.cs +++ b/uax29/Unicode.Test.cs @@ -20,7 +20,7 @@ public void Setup() { } - internal static void TestTokenizerBytes(SplitEnumerator tokens, UnicodeTest test) + internal static void TestBytes(SplitEnumerator tokens, UnicodeTest test) { var i = 0; foreach (var token in tokens) @@ -32,7 +32,7 @@ internal static void TestTokenizerBytes(SplitEnumerator tokens, UnicodeTes } } - internal static void TestTokenizerStream(StreamEnumerator tokens, UnicodeTest test) + internal static void TestStream(StreamEnumerator tokens, UnicodeTest test) { var i = 0; foreach (var token in tokens) @@ -44,7 +44,7 @@ internal static void TestTokenizerStream(StreamEnumerator tokens, UnicodeT } } - internal static void TestTokenizerChars(SplitEnumerator tokens, UnicodeTest test) + internal static void TestChars(SplitEnumerator tokens, UnicodeTest test) { var i = 0; foreach (var token in tokens) @@ -56,7 +56,7 @@ internal static void TestTokenizerChars(SplitEnumerator tokens, UnicodeTes } } - internal static void 
TestTokenizerTextReader(StreamEnumerator tokens, UnicodeTest test) + internal static void TestTextReader(StreamEnumerator tokens, UnicodeTest test) { var i = 0; foreach (var token in tokens) diff --git a/uax29/Words.Dict.cs b/uax29/Words.Dict.cs index 06395c9..13aab2c 100644 --- a/uax29/Words.Dict.cs +++ b/uax29/Words.Dict.cs @@ -27,7 +27,7 @@ internal static partial class Words const Property Extended_Pictographic = 262144; const Property Tab = 524288; - internal static readonly Dict Dict = new(GetDict()); + static readonly Dict Dict = new(GetDict()); static Dictionary GetDict() => new() { {0x0022, Double_Quote}, diff --git a/uax29/Words.Test.cs b/uax29/Words.Test.cs index 12c71ad..d1a8f8d 100644 --- a/uax29/Words.Test.cs +++ b/uax29/Words.Test.cs @@ -1,13 +1,47 @@ // generated from https://www.unicode.org/Public/15.0.0/ucd/auxiliary/WordBreakTest.txt -namespace Tests; - -using System.Text; -using UAX29; - -[TestFixture] -public class WordsTests -{ - internal readonly static UnicodeTest[] UnicodeTests = [ +namespace Tests; + +using System.Text; +using UAX29; + +[TestFixture] +public class WordsTests +{ + static UnicodeTest[] Tests => UnicodeTests; + + [Test, TestCaseSource(nameof(Tests))] + public void Bytes(UnicodeTest test) + { + var tokens = Split.Words(test.input); + TestUnicode.TestBytes(tokens, test); + } + + [Test, TestCaseSource(nameof(Tests))] + public void String(UnicodeTest test) + { + var s = Encoding.UTF8.GetString(test.input); + var tokens = Split.Words(s); + TestUnicode.TestChars(tokens, test); + } + + [Test, TestCaseSource(nameof(Tests))] + public void Stream(UnicodeTest test) + { + using var stream = new MemoryStream(test.input); + var tokens = Split.Words(stream); + TestUnicode.TestStream(tokens, test); + } + + [Test, TestCaseSource(nameof(Tests))] + public void TextReader(UnicodeTest test) + { + using var stream = new MemoryStream(test.input); + using var reader = new StreamReader(stream); + var tokens = Split.Words(reader); + 
TestUnicode.TestTextReader(tokens, test); + } + + internal readonly static UnicodeTest[] UnicodeTests = [ new([0x0001, 0x0001], [[0x0001], [0x0001]], "÷ [0.2] (Other) ÷ [999.0] (Other) ÷ [0.3]"), new([0x0001, 0x00CC, 0x0088, 0x0001], [[0x0001, 0x00CC, 0x0088], [0x0001]], "÷ [0.2] (Other) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [999.0] (Other) ÷ [0.3]"), new([0x0001, 0x000D], [[0x0001], [0x000D]], "÷ [0.2] (Other) ÷ [3.2] (CR) ÷ [0.3]"), @@ -1831,40 +1865,6 @@ public class WordsTests new([0x0061, 0x002C, 0x002C, 0x0061], [[0x0061], [0x002C], [0x002C], [0x0061]], "÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) ÷ [999.0] COMMA (MidNum) ÷ [999.0] LATIN SMALL LETTER A (ALetter) ÷ [0.3]"), new([0x0061, 0x005F, 0x0031, 0x002C, 0x002C, 0x0061], [[0x0061, 0x005F, 0x0031], [0x002C], [0x002C], [0x0061]], "÷ [0.2] LATIN SMALL LETTER A (ALetter) × [13.1] LOW LINE (ExtendNumLet) × [13.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) ÷ [999.0] COMMA (MidNum) ÷ [999.0] LATIN SMALL LETTER A (ALetter) ÷ [0.3]"), new([0x0061, 0x005F, 0x0061, 0x002C, 0x002C, 0x0061], [[0x0061, 0x005F, 0x0061], [0x002C], [0x002C], [0x0061]], "÷ [0.2] LATIN SMALL LETTER A (ALetter) × [13.1] LOW LINE (ExtendNumLet) × [13.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) ÷ [999.0] COMMA (MidNum) ÷ [999.0] LATIN SMALL LETTER A (ALetter) ÷ [0.3]"), - - ]; - - static readonly UnicodeTest[] Tests = UnicodeTests; - - [Test, TestCaseSource(nameof(Tests))] - public void Bytes(UnicodeTest test) - { - var tokens = Split.Words(test.input); - TestUnicode.TestTokenizerBytes(tokens, test); - } - - [Test, TestCaseSource(nameof(Tests))] - public void String(UnicodeTest test) - { - var s = Encoding.UTF8.GetString(test.input); - var tokens = Split.Words(s); - TestUnicode.TestTokenizerChars(tokens, test); - } - - [Test, TestCaseSource(nameof(Tests))] - public void Stream(UnicodeTest test) - { - using var stream = new MemoryStream(test.input); - var tokens = Split.Words(stream); - 
TestUnicode.TestTokenizerStream(tokens, test); - } - - [Test, TestCaseSource(nameof(Tests))] - public void TextReader(UnicodeTest test) - { - using var stream = new MemoryStream(test.input); - using var reader = new StreamReader(stream); - var tokens = Split.Words(reader); - TestUnicode.TestTokenizerTextReader(tokens, test); - } -} + + ]; +} From 4158b06255831989df7b693718b0780dc6cec442 Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Mon, 15 Jul 2024 23:57:08 -0400 Subject: [PATCH 12/17] more fixup --- Codegen/Program.cs | 2 +- uax29/Graphemes.Test.cs | 2 +- uax29/Sentences.Test.cs | 2 +- uax29/Words.Test.cs | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Codegen/Program.cs b/Codegen/Program.cs index 19a35c5..2078c43 100644 --- a/Codegen/Program.cs +++ b/Codegen/Program.cs @@ -215,7 +215,7 @@ public void TextReader(UnicodeTest test) TestUnicode.TestTextReader(tokens, test); }} - internal readonly static UnicodeTest[] UnicodeTests = [ + readonly static UnicodeTest[] UnicodeTests = [ "); while (true) { diff --git a/uax29/Graphemes.Test.cs b/uax29/Graphemes.Test.cs index be92909..03254f7 100644 --- a/uax29/Graphemes.Test.cs +++ b/uax29/Graphemes.Test.cs @@ -41,7 +41,7 @@ public void TextReader(UnicodeTest test) TestUnicode.TestTextReader(tokens, test); } - internal readonly static UnicodeTest[] UnicodeTests = [ + readonly static UnicodeTest[] UnicodeTests = [ new([0x0020, 0x0020], [[0x0020], [0x0020]], "÷ [0.2] SPACE (Other) ÷ [999.0] SPACE (Other) ÷ [0.3]"), new([0x0020, 0x00CC, 0x0088, 0x0020], [[0x0020, 0x00CC, 0x0088], [0x0020]], "÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend_ExtCccZwj) ÷ [999.0] SPACE (Other) ÷ [0.3]"), new([0x0020, 0x000D], [[0x0020], [0x000D]], "÷ [0.2] SPACE (Other) ÷ [5.0] (CR) ÷ [0.3]"), diff --git a/uax29/Sentences.Test.cs b/uax29/Sentences.Test.cs index 3bc6753..2d1359f 100644 --- a/uax29/Sentences.Test.cs +++ b/uax29/Sentences.Test.cs @@ -41,7 +41,7 @@ public void TextReader(UnicodeTest test) 
TestUnicode.TestTextReader(tokens, test); } - internal readonly static UnicodeTest[] UnicodeTests = [ + readonly static UnicodeTest[] UnicodeTests = [ new([0x0001, 0x0001], [[0x0001, 0x0001]], "÷ [0.2] (Other) × [998.0] (Other) ÷ [0.3]"), new([0x0001, 0x00CC, 0x0088, 0x0001], [[0x0001, 0x00CC, 0x0088, 0x0001]], "÷ [0.2] (Other) × [5.0] COMBINING DIAERESIS (Extend_FE) × [998.0] (Other) ÷ [0.3]"), new([0x0001, 0x000D], [[0x0001, 0x000D]], "÷ [0.2] (Other) × [998.0] (CR) ÷ [0.3]"), diff --git a/uax29/Words.Test.cs b/uax29/Words.Test.cs index d1a8f8d..d47218d 100644 --- a/uax29/Words.Test.cs +++ b/uax29/Words.Test.cs @@ -41,7 +41,7 @@ public void TextReader(UnicodeTest test) TestUnicode.TestTextReader(tokens, test); } - internal readonly static UnicodeTest[] UnicodeTests = [ + readonly static UnicodeTest[] UnicodeTests = [ new([0x0001, 0x0001], [[0x0001], [0x0001]], "÷ [0.2] (Other) ÷ [999.0] (Other) ÷ [0.3]"), new([0x0001, 0x00CC, 0x0088, 0x0001], [[0x0001, 0x00CC, 0x0088], [0x0001]], "÷ [0.2] (Other) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [999.0] (Other) ÷ [0.3]"), new([0x0001, 0x000D], [[0x0001], [0x000D]], "÷ [0.2] (Other) ÷ [3.2] (CR) ÷ [0.3]"), From 09461fd9534298149fdcaa0ef9487f42c4c10c8a Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Tue, 16 Jul 2024 00:25:15 -0400 Subject: [PATCH 13/17] names --- Benchmarks/Program.cs | 14 +++++++------- uax29/RangeEnumerator.Test.cs | 4 ++-- uax29/SplitEnumerator.Test.cs | 4 ++-- uax29/SplitEnumerator.cs | 2 +- uax29/Unicode.Test.cs | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Benchmarks/Program.cs b/Benchmarks/Program.cs index 157dcb1..01b448b 100644 --- a/Benchmarks/Program.cs +++ b/Benchmarks/Program.cs @@ -40,7 +40,7 @@ public void Setup() } [Benchmark] - public void TokenizeBytes() + public void SplitBytes() { var tokens = Split.Words(sample); foreach (var token in tokens) @@ -49,7 +49,7 @@ public void TokenizeBytes() } [Benchmark] - public void TokenizeBytesOmitWhitespace() + public void 
SplitBytesOmitWhitespace() { var tokens = Split.Words(sample, Options.OmitWhitespace); foreach (var token in tokens) @@ -58,7 +58,7 @@ public void TokenizeBytesOmitWhitespace() } [Benchmark] - public void TokenizeString() + public void SplitString() { var tokens = Split.Words(sampleStr); foreach (var token in tokens) @@ -67,7 +67,7 @@ public void TokenizeString() } [Benchmark] - public void TokenizeStringOmitWhitespace() + public void SplitStringOmitWhitespace() { var tokens = Split.Words(sampleStr, Options.OmitWhitespace); foreach (var token in tokens) @@ -76,7 +76,7 @@ public void TokenizeStringOmitWhitespace() } [Benchmark] - public void TokenizeStream() + public void SplitStream() { sampleStream.Seek(0, SeekOrigin.Begin); var tokens = Split.Words(sampleStream); @@ -86,7 +86,7 @@ public void TokenizeStream() static readonly ArrayPool pool = ArrayPool.Shared; [Benchmark] - public void TokenizeStreamArrayPool() + public void SplitStreamArrayPool() { var storage = pool.Rent(2048); @@ -108,7 +108,7 @@ public void StringInfoGraphemes() } [Benchmark] - public void TokenizerGraphemes() + public void SplitGraphemes() { var tokens = Split.Graphemes(sample); foreach (var token in tokens) diff --git a/uax29/RangeEnumerator.Test.cs b/uax29/RangeEnumerator.Test.cs index e790158..f2b1fce 100644 --- a/uax29/RangeEnumerator.Test.cs +++ b/uax29/RangeEnumerator.Test.cs @@ -5,7 +5,7 @@ using System.Text; [TestFixture] -public class TestRangeTokenizer +public class TestRangeEnumerator { [SetUp] public void Setup() @@ -43,7 +43,7 @@ public void Reset() static readonly Options[] options = [Options.None, Options.OmitWhitespace]; [Test] - public void MatchesTokenizer() + public void MatchesSplit() { var example = "abcdefghijk lmnopq r \tstu vwxyz; ABC DEFG \r\nHIJKL MNOP Q RSTUV WXYZ! 
你好,世界.\r"; diff --git a/uax29/SplitEnumerator.Test.cs b/uax29/SplitEnumerator.Test.cs index 23765d1..84d4ba5 100644 --- a/uax29/SplitEnumerator.Test.cs +++ b/uax29/SplitEnumerator.Test.cs @@ -5,7 +5,7 @@ using System.Text; [TestFixture] -public class TestTokenizer +public class TestEnumerator { [SetUp] public void Setup() @@ -256,7 +256,7 @@ public void ToArray() Assert.That(array, Has.Length.EqualTo(i), "ToArray should return the same number of tokens as iteration"); - // Tokenizer should reset back to the beginning + // Should reset back to the beginning Assert.That(tokens.start, Is.EqualTo(0)); Assert.That(tokens.end, Is.EqualTo(0)); diff --git a/uax29/SplitEnumerator.cs b/uax29/SplitEnumerator.cs index c3914a2..0b5cd64 100644 --- a/uax29/SplitEnumerator.cs +++ b/uax29/SplitEnumerator.cs @@ -26,7 +26,7 @@ namespace UAX29; bool begun = false; /// - /// Tokenizer splits strings (or UTF-8 bytes) as words, sentences or graphemes, per the Unicode UAX #29 spec. + /// Splits strings (or UTF-8 bytes) as words, sentences or graphemes, per the Unicode UAX #29 spec. /// /// A string, or UTF-8 byte array. /// A func/method meeting the Split delegate signature. 
diff --git a/uax29/Unicode.Test.cs b/uax29/Unicode.Test.cs index 21bb6c1..4630d37 100644 --- a/uax29/Unicode.Test.cs +++ b/uax29/Unicode.Test.cs @@ -130,7 +130,6 @@ public void InvalidEncoding() { var bytes = new byte[i]; rng.GetBytes(bytes); - var s = Encoding.UTF8.GetChars(bytes); foreach (var method in byteMethods) { @@ -146,6 +145,7 @@ public void InvalidEncoding() } } + var s = Encoding.UTF8.GetChars(bytes); foreach (var method in charMethods) { var tokens = method(s); From 754b9fdd4db1a1eb260a7bc5e5d20a066f72b99a Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Tue, 16 Jul 2024 14:53:30 -0400 Subject: [PATCH 14/17] name --- uax29/StreamEnumerator.Test.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uax29/StreamEnumerator.Test.cs b/uax29/StreamEnumerator.Test.cs index f723340..fc02ed5 100644 --- a/uax29/StreamEnumerator.Test.cs +++ b/uax29/StreamEnumerator.Test.cs @@ -4,7 +4,7 @@ using UAX29; [TestFixture] -public class TestStreamTokenizer +public class TestStreamEnumerator { [SetUp] public void Setup() From 1b0a52a25e6ecd6b77c4d2f3897e26f52b5f2bc4 Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Wed, 17 Jul 2024 15:02:55 -0400 Subject: [PATCH 15/17] better calculation for whitespace --- Codegen/Program.cs | 23 ++++++++++++++--- uax29/Graphemes.Splitter.cs | 4 +-- uax29/Options.cs | 7 +----- uax29/Sentences.Splitter.cs | 4 +-- uax29/SplitEnumerator.cs | 4 +-- uax29/Splitter.cs | 2 +- uax29/StreamEnumerator.cs | 4 +-- uax29/Words.Dict.cs | 50 +++++++++++++++++++------------------ uax29/Words.Splitter.cs | 10 +++----- 9 files changed, 59 insertions(+), 49 deletions(-) diff --git a/Codegen/Program.cs b/Codegen/Program.cs index 2078c43..3ceaadc 100644 --- a/Codegen/Program.cs +++ b/Codegen/Program.cs @@ -115,11 +115,26 @@ static async Task WriteCategories(string typ) if (typ == "Word") { - // hack in a Tab category that the spec doesn't use, be we do - const string tab = "Tab"; + const string ws = "Whitespace"; currentCat <<= 1; - 
cats.Add(tab, currentCat); - catsByRune.Add(0x09, tab); + cats.Add(ws, currentCat); + + for (var i = 0; i < char.MaxValue; i++) + { + var ch = (char)i; + if (char.IsWhiteSpace(ch)) + { + var r = new Rune(ch); + if (catsByRune.TryGetValue(r.Value, out string? existing)) + { + catsByRune[r.Value] = $"{existing} | {ws}"; + } + else + { + catsByRune.Add(r.Value, ws); + } + } + } } // write the file diff --git a/uax29/Graphemes.Splitter.cs b/uax29/Graphemes.Splitter.cs index 97e54cd..25be246 100644 --- a/uax29/Graphemes.Splitter.cs +++ b/uax29/Graphemes.Splitter.cs @@ -27,7 +27,7 @@ internal Splitter(Decoders decoders) /// The string in which to split graphemes. /// Ignore, only applicable to splitting words, not graphemes. /// The number of bytes/chars that comprise the grapheme. - internal int Split(ReadOnlySpan input, out Property _) // this out param is only relevant in Words.Splitter + internal int Split(ReadOnlySpan input, out bool _) // this out param is only relevant in Words.Splitter { Debug.Assert(input.Length > 0); @@ -163,7 +163,7 @@ internal int Split(ReadOnlySpan input, out Property _) // this out para break; } - _ = 0; // see the Property out parameter at tops + _ = false; // see the out parameter at top return pos; } } diff --git a/uax29/Options.cs b/uax29/Options.cs index dfe3b2b..8f83c15 100644 --- a/uax29/Options.cs +++ b/uax29/Options.cs @@ -12,12 +12,7 @@ public enum Options : byte None = 0, /// - /// Omit tokens that consist entirely of whitespace, defined as UAX #29 WSegSpace | CR | LF | Tab. - /// - /// “Whitespace” in this implementation includes those which delimit words, but not all characters that are categorically whitespace. - /// For example, “non-breaking space” is whitespace, but it’s not what you want when splitting words, and so - /// it is not considered whitespace for our purposes. - /// + /// Omit tokens that consist entirely of whitespace, as defined by char.IsWhitespace. 
/// * Only supported for splitting Words; ignored for Graphemes and Sentences. * /// OmitWhitespace = 1, diff --git a/uax29/Sentences.Splitter.cs b/uax29/Sentences.Splitter.cs index 2606dcb..e2bc4fd 100644 --- a/uax29/Sentences.Splitter.cs +++ b/uax29/Sentences.Splitter.cs @@ -29,7 +29,7 @@ internal Splitter(Decoders decoders) /// The string in which to split sentences. /// Ignore, only applicable to splitting words, not sentences. /// The number of bytes/chars that comprise the sentence. - internal int Split(ReadOnlySpan input, out Property _) // this out param is only relevant in Words.Splitter + internal int Split(ReadOnlySpan input, out bool _) // this out param is only relevant in Words.Splitter { Debug.Assert(input.Length > 0); @@ -247,7 +247,7 @@ internal int Split(ReadOnlySpan input, out Property _) // this out para pos += w; } - _ = 0; // see the out Property parameter at top + _ = false; // see the out parameter at top return pos; diff --git a/uax29/SplitEnumerator.cs b/uax29/SplitEnumerator.cs index 0b5cd64..10c8f53 100644 --- a/uax29/SplitEnumerator.cs +++ b/uax29/SplitEnumerator.cs @@ -48,14 +48,14 @@ public bool MoveNext() while (end < input.Length) { - var advance = this.split(input[end..], out var seen); + var advance = this.split(input[end..], out var whitespace); Debug.Assert(advance > 0); start = end; end += advance; // This option is only supported for words; prevent other uses at the static API level - if (options.Includes(Options.OmitWhitespace) && seen.IsExclusively(Words.Whitespace)) + if (whitespace && options.Includes(Options.OmitWhitespace)) { continue; } diff --git a/uax29/Splitter.cs b/uax29/Splitter.cs index b0198b4..ebcbb37 100644 --- a/uax29/Splitter.cs +++ b/uax29/Splitter.cs @@ -11,7 +11,7 @@ /// byte or char, indicating the type of the input, and by implication, the output. /// The string to split/tokenize. /// How many bytes/chars were consumed from the input. 
-internal delegate int Split(ReadOnlySpan input, out Property seen); +internal delegate int Split(ReadOnlySpan input, out bool whitespace); internal static class PropertyExtensions { diff --git a/uax29/StreamEnumerator.cs b/uax29/StreamEnumerator.cs index 803351f..ede42d6 100644 --- a/uax29/StreamEnumerator.cs +++ b/uax29/StreamEnumerator.cs @@ -47,13 +47,13 @@ public bool MoveNext() count += end; buffer.Consume(this.Current.Length); // previous token - var advance = this.split(buffer.Contents, out Property seen); + var advance = this.split(buffer.Contents, out var whitespace); Debug.Assert(advance > 0); end = advance; // This option is only supported for words; prevent other uses at the static API level - if (options.Includes(Options.OmitWhitespace) && seen.IsExclusively(Words.Whitespace)) + if (whitespace && options.Includes(Options.OmitWhitespace)) { continue; } diff --git a/uax29/Words.Dict.cs b/uax29/Words.Dict.cs index 13aab2c..428f20b 100644 --- a/uax29/Words.Dict.cs +++ b/uax29/Words.Dict.cs @@ -25,7 +25,7 @@ internal static partial class Words const Property ZWJ = 65536; const Property WSegSpace = 131072; const Property Extended_Pictographic = 262144; - const Property Tab = 524288; + const Property Whitespace = 524288; static readonly Dict Dict = new(GetDict()); static Dictionary GetDict() => new() @@ -107,13 +107,13 @@ internal static partial class Words {0xFB4D, Hebrew_Letter}, {0xFB4E, Hebrew_Letter}, {0xFB4F, Hebrew_Letter}, - {0x000D, CR}, - {0x000A, LF}, - {0x000B, Newline}, - {0x000C, Newline}, - {0x0085, Newline}, - {0x2028, Newline}, - {0x2029, Newline}, + {0x000D, CR | Whitespace}, + {0x000A, LF | Whitespace}, + {0x000B, Newline | Whitespace}, + {0x000C, Newline | Whitespace}, + {0x0085, Newline | Whitespace}, + {0x2028, Newline | Whitespace}, + {0x2029, Newline | Whitespace}, {0x0300, Extend}, {0x0301, Extend}, {0x0302, Extend}, @@ -33298,7 +33298,7 @@ internal static partial class Words {0x1FBF8, Numeric}, {0x1FBF9, Numeric}, {0x005F, 
ExtendNumLet}, - {0x202F, ExtendNumLet}, + {0x202F, ExtendNumLet | Whitespace}, {0x203F, ExtendNumLet}, {0x2040, ExtendNumLet}, {0x2054, ExtendNumLet}, @@ -33309,20 +33309,20 @@ internal static partial class Words {0xFE4F, ExtendNumLet}, {0xFF3F, ExtendNumLet}, {0x200D, ZWJ}, - {0x0020, WSegSpace}, - {0x1680, WSegSpace}, - {0x2000, WSegSpace}, - {0x2001, WSegSpace}, - {0x2002, WSegSpace}, - {0x2003, WSegSpace}, - {0x2004, WSegSpace}, - {0x2005, WSegSpace}, - {0x2006, WSegSpace}, - {0x2008, WSegSpace}, - {0x2009, WSegSpace}, - {0x200A, WSegSpace}, - {0x205F, WSegSpace}, - {0x3000, WSegSpace}, + {0x0020, WSegSpace | Whitespace}, + {0x1680, WSegSpace | Whitespace}, + {0x2000, WSegSpace | Whitespace}, + {0x2001, WSegSpace | Whitespace}, + {0x2002, WSegSpace | Whitespace}, + {0x2003, WSegSpace | Whitespace}, + {0x2004, WSegSpace | Whitespace}, + {0x2005, WSegSpace | Whitespace}, + {0x2006, WSegSpace | Whitespace}, + {0x2008, WSegSpace | Whitespace}, + {0x2009, WSegSpace | Whitespace}, + {0x200A, WSegSpace | Whitespace}, + {0x205F, WSegSpace | Whitespace}, + {0x3000, WSegSpace | Whitespace}, {0x00A9, Extended_Pictographic}, {0x00AE, Extended_Pictographic}, {0x203C, Extended_Pictographic}, @@ -36854,6 +36854,8 @@ internal static partial class Words {0x1FFFB, Extended_Pictographic}, {0x1FFFC, Extended_Pictographic}, {0x1FFFD, Extended_Pictographic}, - {0x0009, Tab}, + {0x0009, Whitespace}, + {0x00A0, Whitespace}, + {0x2007, Whitespace}, }; // end dict }; // end class diff --git a/uax29/Words.Splitter.cs b/uax29/Words.Splitter.cs index 9fdd754..7ad2c92 100644 --- a/uax29/Words.Splitter.cs +++ b/uax29/Words.Splitter.cs @@ -8,8 +8,6 @@ internal static partial class Words { - internal const Property Whitespace = CR | LF | WSegSpace | Tab; - internal static readonly Split SplitBytes = new Splitter(Decoders.Utf8).Split; internal static readonly Split SplitChars = new Splitter(Decoders.Char).Split; @@ -31,14 +29,14 @@ internal Splitter(Decoders decoders) /// The string in which 
to split words. /// Categories that were seen in the first word. /// The number of bytes/chars that comprise the word. - internal int Split(ReadOnlySpan input, out Property seen) + internal int Split(ReadOnlySpan input, out bool whitespace) { Debug.Assert(input.Length > 0); // These vars are stateful across loop iterations int pos = 0; int w; - seen = 0; + whitespace = true; Property current = 0; Property lastExIgnore = 0; // "last excluding ignored categories" Property lastLastExIgnore = 0; // "the last one before that" @@ -58,7 +56,7 @@ internal int Split(ReadOnlySpan input, out Property seen) pos += w; current = Dict.Lookup(rune.Value); - seen |= current; + whitespace = whitespace && current.Is(Whitespace); } // https://unicode.org/reports/tr29/#WB2 @@ -79,7 +77,7 @@ internal int Split(ReadOnlySpan input, out Property seen) lastExIgnore = last; } - seen |= last; + whitespace = whitespace && current.Is(Whitespace); current = Dict.Lookup(rune.Value); From ecd22d9d8abf68d1ffd57244071555933ab010ed Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Wed, 17 Jul 2024 15:06:34 -0400 Subject: [PATCH 16/17] lose IsExclusively --- uax29/Splitter.Test.cs | 63 ------------------------------------------ uax29/Splitter.cs | 15 ---------- 2 files changed, 78 deletions(-) delete mode 100644 uax29/Splitter.Test.cs diff --git a/uax29/Splitter.Test.cs b/uax29/Splitter.Test.cs deleted file mode 100644 index 8b73096..0000000 --- a/uax29/Splitter.Test.cs +++ /dev/null @@ -1,63 +0,0 @@ -using System.Text; -using UAX29; - -namespace Tests; - -/// A bitmap of Unicode categories -using Property = uint; - -[TestFixture] -public class TestSplitter -{ - - [SetUp] - public void Setup() - { - } - - const Property Yes1 = 1; - const Property No1 = 2; - const Property Yes2 = 4; - const Property No2 = 8; - const Property Yes3 = 16; - const Property Yeses = Yes1 | Yes2 | Yes3; - - [Test] - public void TestIsExclusively() - { - { - var seen = Yes1; - Assert.That(seen.IsExclusively(Yeses), Is.True); - 
} - - { - var seen = Yes1 | Yes2; - Assert.That(seen.IsExclusively(Yeses), Is.True); - } - - { - var seen = No1; - Assert.That(seen.IsExclusively(Yeses), Is.False); - } - - { - var seen = No1 | No2; - Assert.That(seen.IsExclusively(Yeses), Is.False); - } - - { - var seen = Yes1 | No1; - Assert.That(seen.IsExclusively(Yeses), Is.False); - } - - { - var seen = Yes1 | Yes3 | No1; - Assert.That(seen.IsExclusively(Yeses), Is.False); - } - - { - Property seen = 0; - Assert.That(seen.IsExclusively(Yeses), Is.False); - } - } -} diff --git a/uax29/Splitter.cs b/uax29/Splitter.cs index ebcbb37..bef428a 100644 --- a/uax29/Splitter.cs +++ b/uax29/Splitter.cs @@ -25,19 +25,4 @@ internal static bool Is(this Property lookup, Property properties) { return (lookup & properties) != 0; } - - /// - /// Determines if property consists entirely of compare, i.e. no other values (flags) besides the ones in compare. - /// - /// The property to test; the haystack. - /// The property to test against; the needle. - /// True if property consists entirely of compare, otherwise false. - internal static bool IsExclusively(this Property property, Property compare) - { - Debug.Assert(compare > 0); - return - (property & compare) != 0 && // compare appears in property - (property & ~compare) == 0 // but no others do - ; - } } From c8776d24cc0b58e13e286d834d1f7be5ebee0cdc Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Wed, 17 Jul 2024 15:15:49 -0400 Subject: [PATCH 17/17] readme --- README.md | 4 ++-- uax29/README.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 8900b28..483938e 100644 --- a/README.md +++ b/README.md @@ -118,7 +118,7 @@ The tokenizer is implemented as a `ref struct`, so you should see zero allocatio Calling `Split.Words` returns a lazy enumerator, and will not allocate per-token. There are `ToList` and `ToArray` methods for convenience, which will allocate. 
-For `Stream` or `TextReader`/`StreamReader`, a buffer needs to be allocated behind the scenes. You can specify the size when calling `Split.Words`. You can also optionally pass your own `byte[]` or `char[]` to do your own allocation, perhaps with [ArrayPool](https://learn.microsoft.com/en-us/dotnet/api/system.buffers.arraypool-1). Or, you can re-use the buffer by calling `SetStream` on an existing tokenizer, which will avoid re-allocation. +For `Stream` or `TextReader`/`StreamReader`, a buffer needs to be allocated behind the scenes. You can specify the size when calling `Split.Words`. You can also optionally pass your own `byte[]` or `char[]` to do your own allocation, perhaps with [ArrayPool](https://github.com/clipperhouse/uax29.net/blob/main/Benchmarks/Program.cs#L89). Or, you can re-use the buffer by calling `SetStream` on an existing tokenizer, which will avoid re-allocation. ### Options @@ -126,7 +126,7 @@ Pass `Options.OmitWhitespace` if you would like whitespace-only tokens not to be ### Invalid inputs -The tokenizer expects valid (decodable) UTF-8 bytes or UTF-16 chars as input. We [make an effort](https://github.com/clipperhouse/uax29.net/blob/main/uax29/Unicode.Test.cs#L55) to ensure that all bytes will be returned even if invalid, i.e. to be lossless in any case, though the resulting tokenization may not be useful. Garbage in, garbage out. +The tokenizer expects valid (decodable) UTF-8 bytes or UTF-16 chars as input. We [make an effort](https://github.com/clipperhouse/uax29.net/blob/main/uax29/Unicode.Test.cs#L80) to ensure that all bytes will be returned even if invalid, i.e. to be lossless in any case, though the resulting tokenization may not be useful. Garbage in, garbage out. 
### Major version changes diff --git a/uax29/README.md b/uax29/README.md index 8900b28..483938e 100644 --- a/uax29/README.md +++ b/uax29/README.md @@ -118,7 +118,7 @@ The tokenizer is implemented as a `ref struct`, so you should see zero allocatio Calling `Split.Words` returns a lazy enumerator, and will not allocate per-token. There are `ToList` and `ToArray` methods for convenience, which will allocate. -For `Stream` or `TextReader`/`StreamReader`, a buffer needs to be allocated behind the scenes. You can specify the size when calling `Split.Words`. You can also optionally pass your own `byte[]` or `char[]` to do your own allocation, perhaps with [ArrayPool](https://learn.microsoft.com/en-us/dotnet/api/system.buffers.arraypool-1). Or, you can re-use the buffer by calling `SetStream` on an existing tokenizer, which will avoid re-allocation. +For `Stream` or `TextReader`/`StreamReader`, a buffer needs to be allocated behind the scenes. You can specify the size when calling `Split.Words`. You can also optionally pass your own `byte[]` or `char[]` to do your own allocation, perhaps with [ArrayPool](https://github.com/clipperhouse/uax29.net/blob/main/Benchmarks/Program.cs#L89). Or, you can re-use the buffer by calling `SetStream` on an existing tokenizer, which will avoid re-allocation. ### Options @@ -126,7 +126,7 @@ Pass `Options.OmitWhitespace` if you would like whitespace-only tokens not to be ### Invalid inputs -The tokenizer expects valid (decodable) UTF-8 bytes or UTF-16 chars as input. We [make an effort](https://github.com/clipperhouse/uax29.net/blob/main/uax29/Unicode.Test.cs#L55) to ensure that all bytes will be returned even if invalid, i.e. to be lossless in any case, though the resulting tokenization may not be useful. Garbage in, garbage out. +The tokenizer expects valid (decodable) UTF-8 bytes or UTF-16 chars as input. 
We [make an effort](https://github.com/clipperhouse/uax29.net/blob/main/uax29/Unicode.Test.cs#L80) to ensure that all bytes will be returned even if invalid, i.e. to be lossless in any case, though the resulting tokenization may not be useful. Garbage in, garbage out. ### Major version changes