Extensions & renames #27

Merged · 17 commits · Jul 17, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -3,3 +3,4 @@ BenchmarkDotNet.Artifacts
bin
obj
.vscode/tasks.json
+global.json
55 changes: 24 additions & 31 deletions Benchmarks/Program.cs
@@ -1,3 +1,4 @@
+using System.Buffers;
using System.Text;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Configs;
@@ -39,70 +40,62 @@ public void Setup()
}

[Benchmark]
-public void TokenizeBytes()
+public void SplitBytes()
{
-var tokens = Tokenizer.GetWords(sample);
+var tokens = Split.Words(sample);
foreach (var token in tokens)
{
}
}

[Benchmark]
-public void TokenizeBytesOmitWhitespace()
+public void SplitBytesOmitWhitespace()
{
-var tokens = Tokenizer.GetWords(sample, Options.OmitWhitespace);
+var tokens = Split.Words(sample, Options.OmitWhitespace);
foreach (var token in tokens)
{
}
}

[Benchmark]
-public void TokenizeString()
+public void SplitString()
{
-var tokens = Tokenizer.GetWords(sampleStr);
+var tokens = Split.Words(sampleStr);
foreach (var token in tokens)
{
}
}

[Benchmark]
-public void TokenizeStringOmitWhitespace()
+public void SplitStringOmitWhitespace()
{
-var tokens = Tokenizer.GetWords(sampleStr, Options.OmitWhitespace);
+var tokens = Split.Words(sampleStr, Options.OmitWhitespace);
foreach (var token in tokens)
{
}
}

[Benchmark]
-public void TokenizeStream()
+public void SplitStream()
{
-var stream = new MemoryStream(sample);
-var tokens = Tokenizer.GetWords(stream);
-foreach (var token in tokens)
-{
-}
+sampleStream.Seek(0, SeekOrigin.Begin);
+var tokens = Split.Words(sampleStream);
+foreach (var token in tokens) { }
}

+static readonly ArrayPool<byte> pool = ArrayPool<byte>.Shared;

[Benchmark]
-public void TokenizeSetStream()
+public void SplitStreamArrayPool()
{
// This is to test to observe allocations.
+var storage = pool.Rent(2048);

-// The creation will allocate a buffer of 1024 bytes
-var tokens = Tokenizer.GetWords(sampleStream);
+sampleStream.Seek(0, SeekOrigin.Begin);
+var tokens = Split.Words(sampleStream, minBufferBytes: 1024, bufferStorage: storage);
+tokens.SetStream(sampleStream);
+foreach (var token in tokens) { }

var runs = 10;
+// keep in mind the 10 runs when interpreting the benchmark
for (var i = 0; i < runs; i++)
{
-// subsequent runs should allocate less by using SetStream
sampleStream.Seek(0, SeekOrigin.Begin);
tokens.SetStream(sampleStream);
foreach (var token in tokens)
{
}
}
+pool.Return(storage);
}

[Benchmark]
@@ -115,9 +108,9 @@ public void StringInfoGraphemes()
}

[Benchmark]
-public void TokenizerGraphemes()
+public void SplitGraphemes()
{
-var tokens = Tokenizer.GetGraphemes(sample);
+var tokens = Split.Graphemes(sample);
foreach (var token in tokens)
{
}
8 changes: 4 additions & 4 deletions Benchmarks/Speed.cs
@@ -11,17 +11,17 @@ public string GetValue(Summary summary, BenchmarkCase benchmarkCase)
return "N/A";
}
var ourReport = summary.Reports.First(x => x.BenchmarkCase.Equals(benchmarkCase));
-long length = new System.IO.FileInfo("sample.txt").Length;
-var mean = ourReport.ResultStatistics.Mean;
-return $"{(length / ourReport.ResultStatistics.Mean):#####.000}";
+long length = new FileInfo("sample.txt").Length;
+var mean = ourReport.ResultStatistics!.Mean;
+return $"{length / mean:#####.000} GB/s";
}

public string GetValue(Summary summary, BenchmarkCase benchmarkCase, SummaryStyle style) => GetValue(summary, benchmarkCase);
public bool IsDefault(Summary summary, BenchmarkCase benchmarkCase) => false;
public bool IsAvailable(Summary summary) => true;

public string Id { get; } = nameof(Speed);
-public string ColumnName { get; } = "Speed (GB/s)";
+public string ColumnName { get; } = "Throughput";
public bool AlwaysShow { get; } = true;
public ColumnCategory Category { get; } = ColumnCategory.Custom;
public int PriorityInCategory { get; }
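A note on the `GetValue` arithmetic above: BenchmarkDotNet reports `ResultStatistics.Mean` in nanoseconds, so dividing the file length in bytes by the mean directly yields gigabytes per second, since 1 byte/ns = 10⁹ bytes/s. A minimal sketch of the unit math, with illustrative numbers:

```csharp
// Illustrative numbers, not benchmark output:
// a 100 KB sample tokenized in 80,000 ns (80 µs) on average.
double lengthBytes = 100_000; // new FileInfo("sample.txt").Length
double meanNs = 80_000;       // ResultStatistics.Mean, in nanoseconds

// bytes ÷ nanoseconds == 10^9 bytes per second, i.e. GB/s
double throughput = lengthBytes / meanNs;
Console.WriteLine($"{throughput:#####.000} GB/s"); // "1.250 GB/s"
```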
95 changes: 55 additions & 40 deletions Codegen/Program.cs
@@ -115,11 +115,26 @@ static async Task WriteCategories(string typ)

if (typ == "Word")
{
-// hack in a Tab category that the spec doesn't use, be we do
-const string tab = "Tab";
+const string ws = "Whitespace";
currentCat <<= 1;
-cats.Add(tab, currentCat);
-catsByRune.Add(0x09, tab);
+cats.Add(ws, currentCat);

+for (var i = 0; i < char.MaxValue; i++)
+{
+var ch = (char)i;
+if (char.IsWhiteSpace(ch))
+{
+var r = new Rune(ch);
+if (catsByRune.TryGetValue(r.Value, out string? existing))
+{
+catsByRune[r.Value] = $"{existing} | {ws}";
+}
+else
+{
+catsByRune.Add(r.Value, ws);
+}
+}
+}
}

// write the file
@@ -142,7 +157,7 @@ internal static partial class {typ}s
}

dict.Write(@"
-internal static readonly Dict Dict = new(GetDict());
+static readonly Dict Dict = new(GetDict());
static Dictionary<int, Property> GetDict() => new()
{
");
@@ -181,7 +196,41 @@ static async Task WriteTests(string typ)
[TestFixture]
public class {typ}sTests
{{
-internal readonly static UnicodeTest[] UnicodeTests = [
+static UnicodeTest[] Tests => UnicodeTests;
+
+[Test, TestCaseSource(nameof(Tests))]
+public void Bytes(UnicodeTest test)
+{{
+var tokens = Split.{typ}s(test.input);
+TestUnicode.TestBytes(tokens, test);
+}}
+
+[Test, TestCaseSource(nameof(Tests))]
+public void String(UnicodeTest test)
+{{
+var s = Encoding.UTF8.GetString(test.input);
+var tokens = Split.{typ}s(s);
+TestUnicode.TestChars(tokens, test);
+}}
+
+[Test, TestCaseSource(nameof(Tests))]
+public void Stream(UnicodeTest test)
+{{
+using var stream = new MemoryStream(test.input);
+var tokens = Split.{typ}s(stream);
+TestUnicode.TestStream(tokens, test);
+}}
+
+[Test, TestCaseSource(nameof(Tests))]
+public void TextReader(UnicodeTest test)
+{{
+using var stream = new MemoryStream(test.input);
+using var reader = new StreamReader(stream);
+var tokens = Split.{typ}s(reader);
+TestUnicode.TestTextReader(tokens, test);
+}}
+
+readonly static UnicodeTest[] UnicodeTests = [
");
while (true)
{
@@ -241,40 +290,6 @@
}
dict.Write(@$"
];

-static readonly UnicodeTest[] Tests = UnicodeTests;
-
-[Test, TestCaseSource(nameof(Tests))]
-public void Bytes(UnicodeTest test)
-{{
-var tokens = Tokenizer.Get{typ}s(test.input);
-TestUnicode.TestTokenizerBytes(tokens, test);
-}}
-
-[Test, TestCaseSource(nameof(Tests))]
-public void String(UnicodeTest test)
-{{
-var s = Encoding.UTF8.GetString(test.input);
-var tokens = Tokenizer.Get{typ}s(s);
-TestUnicode.TestTokenizerChars(tokens, test);
-}}
-
-[Test, TestCaseSource(nameof(Tests))]
-public void Stream(UnicodeTest test)
-{{
-using var stream = new MemoryStream(test.input);
-var tokens = Tokenizer.Get{typ}s(stream);
-TestUnicode.TestTokenizerStream(tokens, test);
-}}
-
-[Test, TestCaseSource(nameof(Tests))]
-public void TextReader(UnicodeTest test)
-{{
-using var stream = new MemoryStream(test.input);
-using var reader = new StreamReader(stream);
-var tokens = Tokenizer.Get{typ}s(reader);
-TestUnicode.TestTokenizerTextReader(tokens, test);
-}}
}}
");
}
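One observation on the `WriteCategories` change above: `currentCat <<= 1` assigns each category its own bit, and the `$"{existing} | {ws}"` string ORs category names together, so a single rune's property can carry several categories at once — which is how the new `Whitespace` category coexists with the spec's word-break categories. A sketch of that flags pattern (the enum names and values here are illustrative, not the generated ones):

```csharp
using System.Collections.Generic;
using System.Text;

// Illustrative only: the generated file assigns one bit per category.
[Flags]
enum Property
{
    None = 0,
    ALetter = 1 << 0,    // a UAX #29 word-break category
    Numeric = 1 << 1,    // another spec category
    Whitespace = 1 << 2, // the extra category added in this PR
}

static class PropertySketch
{
    // Hypothetical lookup, standing in for the generated Dictionary<int, Property>.
    static readonly Dictionary<int, Property> dict = new()
    {
        [0x20] = Property.Whitespace, // space
        [0x41] = Property.ALetter,    // 'A'
    };

    // Because categories are bit flags, membership is a bitwise AND.
    internal static bool IsWhitespace(Rune r) =>
        dict.TryGetValue(r.Value, out var p) && (p & Property.Whitespace) != 0;
}
```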
38 changes: 28 additions & 10 deletions README.md
@@ -6,6 +6,8 @@ Any time our code operates on individual words, we are tokenizing. Often, we do

### Example

+_⚠️ This documentation on `main` refers to v3, which is not yet published on Nuget. See [v2 documentation](https://github.com/clipperhouse/uax29.net/blob/v2.2.0/README.md) until then._

```
dotnet add package UAX29
```
@@ -19,7 +21,7 @@ var example = "Hello, 🌏 world. 你好,世界.";
// The tokenizer can split words, graphemes or sentences.
// It operates on strings, UTF-8 bytes, and streams.

-var words = Tokenizer.GetWords(example);
+var words = Split.Words(example);

// Iterate over the tokens
foreach (var word in words)
@@ -47,7 +49,7 @@ world
*/

var utf8bytes = Encoding.UTF8.GetBytes(example);
-var graphemes = Tokenizer.GetGraphemes(utf8bytes);
+var graphemes = Split.Graphemes(utf8bytes);

// Iterate over the tokens
foreach (var grapheme in graphemes)
@@ -84,48 +86,64 @@ d
*/
```

+There are also optional extension methods in the spirit of `string.Split`:
+
+```csharp
+using UAX29.Extensions;
+
+example.SplitWords();
+```
+
### Data types

For UTF-8 bytes, pass `byte[]`, `Span<byte>` or `Stream`; the resulting tokens will be `ReadOnlySpan<byte>`.

For strings/chars, pass `string`, `char[]`, `Span<char>` or `TextReader`/`StreamReader`; the resulting tokens will be `ReadOnlySpan<char>`.

If you have `Memory<byte|char>`, pass `Memory.Span`.
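For illustration, a short sketch of those overloads (the `Split` calls follow the renames in this PR; the input values are arbitrary):

```csharp
using System.Text;
using UAX29;

var example = "Hello, 🌏 world.";

// string/char input → tokens are ReadOnlySpan<char>
foreach (var word in Split.Words(example)) { }

// UTF-8 byte input → tokens are ReadOnlySpan<byte>
byte[] utf8 = Encoding.UTF8.GetBytes(example);
foreach (var word in Split.Words(utf8)) { }

// Memory<char>: pass the Span property
Memory<char> memory = example.ToCharArray().AsMemory();
foreach (var word in Split.Words(memory.Span)) { }
```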

### Conformance

We use the official Unicode [test suites](https://unicode.org/reports/tr41/tr41-26.html#Tests29). Status:

[![.NET](https://github.com/clipperhouse/uax29.net/actions/workflows/dotnet.yml/badge.svg)](https://github.com/clipperhouse/uax29.net/actions/workflows/dotnet.yml)

-This is the same algorithm that is implemented in Lucene's [StandardTokenizer](https://lucene.apache.org/core/6_5_0/core/org/apache/lucene/analysis/standard/StandardTokenizer.html).
+This is the same spec that is implemented in Lucene's [StandardTokenizer](https://lucene.apache.org/core/6_5_0/core/org/apache/lucene/analysis/standard/StandardTokenizer.html).

### Performance

When tokenizing words, I get around 120MB/s on my Macbook M2. For typical text, that's around 30 million tokens/s. [Benchmarks](https://github.com/clipperhouse/uax29.net/tree/main/Benchmarks)

The tokenizer is implemented as a `ref struct`, so you should see zero allocations for static text such as `byte[]` or `string`/`char`.

-Calling `GetWords` et al returns a lazy enumerator, and will not allocate per-token. There are `ToList` and `ToArray` methods for convenience, which will allocate.
+Calling `Split.Words` returns a lazy enumerator, and will not allocate per-token. There are `ToList` and `ToArray` methods for convenience, which will allocate.

-For `Stream` or `TextReader`/`StreamReader`, a buffer needs to be allocated behind the scenes. You can specify the size when calling `GetWords`. You can also optionally pass your own `byte[]` or `char[]` to do your own allocation, perhaps with [ArrayPool](https://learn.microsoft.com/en-us/dotnet/api/system.buffers.arraypool-1). Or, you can re-use the buffer by calling `SetStream` on an existing tokenizer, which will avoid re-allocation.
+For `Stream` or `TextReader`/`StreamReader`, a buffer needs to be allocated behind the scenes. You can specify the size when calling `Split.Words`. You can also optionally pass your own `byte[]` or `char[]` to do your own allocation, perhaps with [ArrayPool](https://github.com/clipperhouse/uax29.net/blob/main/Benchmarks/Program.cs#L89). Or, you can re-use the buffer by calling `SetStream` on an existing tokenizer, which will avoid re-allocation.
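A sketch of that buffer-supplying pattern, following the `SplitStreamArrayPool` benchmark in this PR (the `minBufferBytes` and `bufferStorage` parameter names appear there; the file paths are illustrative):

```csharp
using System.Buffers;
using UAX29;

var pool = ArrayPool<byte>.Shared;
var storage = pool.Rent(2048); // bring your own buffer

using var stream = File.OpenRead("sample.txt");
var tokens = Split.Words(stream, minBufferBytes: 1024, bufferStorage: storage);
foreach (var token in tokens) { }

// Re-use the tokenizer and its buffer on another stream, avoiding re-allocation:
using var stream2 = File.OpenRead("sample2.txt");
tokens.SetStream(stream2);
foreach (var token in tokens) { }

pool.Return(storage);
```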

### Options

-Pass `Options.OmitWhitespace` if you would like whitespace-only tokens not to be returned.
+Pass `Options.OmitWhitespace` if you would like whitespace-only tokens not to be returned (for words only).
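For example, mirroring the `SplitBytesOmitWhitespace` benchmark above:

```csharp
var tokens = Split.Words("Hello, 🌏 world.", Options.OmitWhitespace);
foreach (var token in tokens)
{
    // whitespace-only tokens between the words are skipped
}
```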

### Invalid inputs

-The tokenizer expects valid (decodable) UTF-8 bytes or UTF-16 chars as input. We [make an effort](https://github.com/clipperhouse/uax29.net/blob/main/uax29/Unicode.Test.cs#L55) to ensure that all bytes will be returned even if invalid, i.e. to be lossless in any case, though the resulting tokenization may not be useful. Garbage in, garbage out.
+The tokenizer expects valid (decodable) UTF-8 bytes or UTF-16 chars as input. We [make an effort](https://github.com/clipperhouse/uax29.net/blob/main/uax29/Unicode.Test.cs#L80) to ensure that all bytes will be returned even if invalid, i.e. to be lossless in any case, though the resulting tokenization may not be useful. Garbage in, garbage out.
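A sketch of that lossless behavior (the byte values are arbitrary; the exact token boundaries around invalid bytes are not specified):

```csharp
// 0xFF can never appear in valid UTF-8; the splitter still returns every byte.
byte[] invalid = { 0x68, 0x69, 0xFF, 0x68, 0x69 }; // "hi", invalid byte, "hi"
foreach (var token in Split.Words(invalid))
{
    // All five bytes come back across the tokens, though the
    // tokenization around 0xFF may not be meaningful.
}
```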

### Major version changes

-If you are using v1.x of this package, v2 has been renamed:
+#### v2 → v3
+
+Renamed methods:
+
+`Tokenizer.GetWords(input)` → `Split.Words(input)`
+
+#### v1 → v2
+
+Renamed package, namespace and methods:

`dotnet add package uax29.net` → `dotnet add package UAX29`

`using uax29` → `using UAX29`

-We renamed the methods:

`Tokenizer.Create(input)` → `Tokenizer.GetWords(input)`

`Tokenizer.Create(input, TokenType.Graphemes)` → `Tokenizer.GetGraphemes(input)`