Extensions & renames #27

Merged · 17 commits · Jul 17, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -3,3 +3,4 @@ BenchmarkDotNet.Artifacts
bin
obj
.vscode/tasks.json
+global.json
55 changes: 24 additions & 31 deletions Benchmarks/Program.cs
@@ -1,3 +1,4 @@
+using System.Buffers;
using System.Text;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Configs;
@@ -39,70 +40,62 @@ public void Setup()
}

[Benchmark]
-public void TokenizeBytes()
+public void SplitBytes()
{
-var tokens = Tokenizer.GetWords(sample);
+var tokens = Split.Words(sample);
foreach (var token in tokens)
{
}
}

[Benchmark]
-public void TokenizeBytesOmitWhitespace()
+public void SplitBytesOmitWhitespace()
{
-var tokens = Tokenizer.GetWords(sample, Options.OmitWhitespace);
+var tokens = Split.Words(sample, Options.OmitWhitespace);
foreach (var token in tokens)
{
}
}

[Benchmark]
-public void TokenizeString()
+public void SplitString()
{
-var tokens = Tokenizer.GetWords(sampleStr);
+var tokens = Split.Words(sampleStr);
foreach (var token in tokens)
{
}
}

[Benchmark]
-public void TokenizeStringOmitWhitespace()
+public void SplitStringOmitWhitespace()
{
-var tokens = Tokenizer.GetWords(sampleStr, Options.OmitWhitespace);
+var tokens = Split.Words(sampleStr, Options.OmitWhitespace);
foreach (var token in tokens)
{
}
}

[Benchmark]
-public void TokenizeStream()
+public void SplitStream()
{
-var stream = new MemoryStream(sample);
-var tokens = Tokenizer.GetWords(stream);
-foreach (var token in tokens)
-{
-}
+sampleStream.Seek(0, SeekOrigin.Begin);
+var tokens = Split.Words(sampleStream);
+foreach (var token in tokens) { }
}

+static readonly ArrayPool<byte> pool = ArrayPool<byte>.Shared;

[Benchmark]
-public void TokenizeSetStream()
+public void SplitStreamArrayPool()
{
// This is to test to observe allocations.
+var storage = pool.Rent(2048);

-// The creation will allocate a buffer of 1024 bytes
-var tokens = Tokenizer.GetWords(sampleStream);
+sampleStream.Seek(0, SeekOrigin.Begin);
+var tokens = Split.Words(sampleStream, minBufferBytes: 1024, bufferStorage: storage);
+tokens.SetStream(sampleStream);
+foreach (var token in tokens) { }

var runs = 10;
+// keep in mind the 10 runs when interpreting the benchmark
for (var i = 0; i < runs; i++)
{
-// subsequent runs should allocate less by using SetStream
sampleStream.Seek(0, SeekOrigin.Begin);
tokens.SetStream(sampleStream);
foreach (var token in tokens)
{
}
}
+pool.Return(storage);
}

[Benchmark]
@@ -115,9 +108,9 @@ public void StringInfoGraphemes()
}

[Benchmark]
-public void TokenizerGraphemes()
+public void SplitGraphemes()
{
-var tokens = Tokenizer.GetGraphemes(sample);
+var tokens = Split.Graphemes(sample);
foreach (var token in tokens)
{
}
8 changes: 4 additions & 4 deletions Benchmarks/Speed.cs
@@ -11,17 +11,17 @@ public string GetValue(Summary summary, BenchmarkCase benchmarkCase)
return "N/A";
}
var ourReport = summary.Reports.First(x => x.BenchmarkCase.Equals(benchmarkCase));
-long length = new System.IO.FileInfo("sample.txt").Length;
-var mean = ourReport.ResultStatistics.Mean;
-return $"{(length / ourReport.ResultStatistics.Mean):#####.000}";
+long length = new FileInfo("sample.txt").Length;
+var mean = ourReport.ResultStatistics!.Mean;
+return $"{length / mean:#####.000} GB/s";
}

public string GetValue(Summary summary, BenchmarkCase benchmarkCase, SummaryStyle style) => GetValue(summary, benchmarkCase);
public bool IsDefault(Summary summary, BenchmarkCase benchmarkCase) => false;
public bool IsAvailable(Summary summary) => true;

public string Id { get; } = nameof(Speed);
-public string ColumnName { get; } = "Speed (GB/s)";
+public string ColumnName { get; } = "Throughput";
public bool AlwaysShow { get; } = true;
public ColumnCategory Category { get; } = ColumnCategory.Custom;
public int PriorityInCategory { get; }
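A note on the `GetValue` arithmetic above: BenchmarkDotNet reports `ResultStatistics.Mean` in nanoseconds, so dividing the file length in bytes by the mean directly yields gigabytes per second, since 1 byte/ns = 10⁹ bytes/s. A minimal sketch of the unit math, with illustrative numbers:

```csharp
// Illustrative numbers, not benchmark output:
// a 100 KB sample tokenized in 80,000 ns (80 µs) on average.
double lengthBytes = 100_000; // new FileInfo("sample.txt").Length
double meanNs = 80_000;       // ResultStatistics.Mean, in nanoseconds

// bytes ÷ nanoseconds == 10^9 bytes per second, i.e. GB/s
double throughput = lengthBytes / meanNs;
Console.WriteLine($"{throughput:#####.000} GB/s"); // "1.250 GB/s"
```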
95 changes: 55 additions & 40 deletions Codegen/Program.cs
@@ -115,11 +115,26 @@ static async Task WriteCategories(string typ)

if (typ == "Word")
{
-// hack in a Tab category that the spec doesn't use, be we do
-const string tab = "Tab";
+const string ws = "Whitespace";
currentCat <<= 1;
-cats.Add(tab, currentCat);
-catsByRune.Add(0x09, tab);
+cats.Add(ws, currentCat);

+for (var i = 0; i < char.MaxValue; i++)
+{
+var ch = (char)i;
+if (char.IsWhiteSpace(ch))
+{
+var r = new Rune(ch);
+if (catsByRune.TryGetValue(r.Value, out string? existing))
+{
+catsByRune[r.Value] = $"{existing} | {ws}";
+}
+else
+{
+catsByRune.Add(r.Value, ws);
+}
+}
+}
}

// write the file
@@ -142,7 +157,7 @@ internal static partial class {typ}s
}

dict.Write(@"
-internal static readonly Dict Dict = new(GetDict());
+static readonly Dict Dict = new(GetDict());
static Dictionary<int, Property> GetDict() => new()
{
");
@@ -181,7 +196,41 @@ static async Task WriteTests(string typ)
[TestFixture]
public class {typ}sTests
{{
-internal readonly static UnicodeTest[] UnicodeTests = [
+static UnicodeTest[] Tests => UnicodeTests;
+
+[Test, TestCaseSource(nameof(Tests))]
+public void Bytes(UnicodeTest test)
+{{
+var tokens = Split.{typ}s(test.input);
+TestUnicode.TestBytes(tokens, test);
+}}
+
+[Test, TestCaseSource(nameof(Tests))]
+public void String(UnicodeTest test)
+{{
+var s = Encoding.UTF8.GetString(test.input);
+var tokens = Split.{typ}s(s);
+TestUnicode.TestChars(tokens, test);
+}}
+
+[Test, TestCaseSource(nameof(Tests))]
+public void Stream(UnicodeTest test)
+{{
+using var stream = new MemoryStream(test.input);
+var tokens = Split.{typ}s(stream);
+TestUnicode.TestStream(tokens, test);
+}}
+
+[Test, TestCaseSource(nameof(Tests))]
+public void TextReader(UnicodeTest test)
+{{
+using var stream = new MemoryStream(test.input);
+using var reader = new StreamReader(stream);
+var tokens = Split.{typ}s(reader);
+TestUnicode.TestTextReader(tokens, test);
+}}
+
+readonly static UnicodeTest[] UnicodeTests = [
");
while (true)
{
@@ -241,40 +290,6 @@
}
dict.Write(@$"
];

-static readonly UnicodeTest[] Tests = UnicodeTests;
-
-[Test, TestCaseSource(nameof(Tests))]
-public void Bytes(UnicodeTest test)
-{{
-var tokens = Tokenizer.Get{typ}s(test.input);
-TestUnicode.TestTokenizerBytes(tokens, test);
-}}
-
-[Test, TestCaseSource(nameof(Tests))]
-public void String(UnicodeTest test)
-{{
-var s = Encoding.UTF8.GetString(test.input);
-var tokens = Tokenizer.Get{typ}s(s);
-TestUnicode.TestTokenizerChars(tokens, test);
-}}
-
-[Test, TestCaseSource(nameof(Tests))]
-public void Stream(UnicodeTest test)
-{{
-using var stream = new MemoryStream(test.input);
-var tokens = Tokenizer.Get{typ}s(stream);
-TestUnicode.TestTokenizerStream(tokens, test);
-}}
-
-[Test, TestCaseSource(nameof(Tests))]
-public void TextReader(UnicodeTest test)
-{{
-using var stream = new MemoryStream(test.input);
-using var reader = new StreamReader(stream);
-var tokens = Tokenizer.Get{typ}s(reader);
-TestUnicode.TestTokenizerTextReader(tokens, test);
-}}
}}
");
}
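One observation on the `WriteCategories` change above: `currentCat <<= 1` assigns each category its own bit, and the `$"{existing} | {ws}"` string ORs category names together, so a single rune's property can carry several categories at once — which is how the new `Whitespace` category coexists with the spec's word-break categories. A sketch of that flags pattern (the enum names and values here are illustrative, not the generated ones):

```csharp
using System.Collections.Generic;
using System.Text;

// Illustrative only: the generated file assigns one bit per category.
[Flags]
enum Property
{
    None = 0,
    ALetter = 1 << 0,    // a UAX #29 word-break category
    Numeric = 1 << 1,    // another spec category
    Whitespace = 1 << 2, // the extra category added in this PR
}

static class PropertySketch
{
    // Hypothetical lookup, standing in for the generated Dictionary<int, Property>.
    static readonly Dictionary<int, Property> dict = new()
    {
        [0x20] = Property.Whitespace, // space
        [0x41] = Property.ALetter,    // 'A'
    };

    // Because categories are bit flags, membership is a bitwise AND.
    internal static bool IsWhitespace(Rune r) =>
        dict.TryGetValue(r.Value, out var p) && (p & Property.Whitespace) != 0;
}
```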
38 changes: 28 additions & 10 deletions README.md
@@ -6,6 +6,8 @@ Any time our code operates on individual words, we are tokenizing. Often, we do

### Example

+_⚠️ This documentation on `main` refers to v3, which is not yet published on Nuget. See [v2 documentation](https://github.com/clipperhouse/uax29.net/blob/v2.2.0/README.md) until then._

```
dotnet add package UAX29
```
@@ -19,7 +21,7 @@ var example = "Hello, 🌏 world. 你好,世界.";
// The tokenizer can split words, graphemes or sentences.
// It operates on strings, UTF-8 bytes, and streams.

-var words = Tokenizer.GetWords(example);
+var words = Split.Words(example);

// Iterate over the tokens
foreach (var word in words)
@@ -47,7 +49,7 @@ world
*/

var utf8bytes = Encoding.UTF8.GetBytes(example);
-var graphemes = Tokenizer.GetGraphemes(utf8bytes);
+var graphemes = Split.Graphemes(utf8bytes);

// Iterate over the tokens
foreach (var grapheme in graphemes)
@@ -84,48 +86,64 @@ d
*/
```

+There are also optional extension methods in the spirit of `string.Split`:
+
+```csharp
+using UAX29.Extensions;
+
+example.SplitWords();
+```
+
### Data types

For UTF-8 bytes, pass `byte[]`, `Span<byte>` or `Stream`; the resulting tokens will be `ReadOnlySpan<byte>`.

For strings/chars, pass `string`, `char[]`, `Span<char>` or `TextReader`/`StreamReader`; the resulting tokens will be `ReadOnlySpan<char>`.

If you have `Memory<byte|char>`, pass `Memory.Span`.
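For illustration, a short sketch of those overloads (the `Split` calls follow the renames in this PR; the input values are arbitrary):

```csharp
using System.Text;
using UAX29;

var example = "Hello, 🌏 world.";

// string/char input → tokens are ReadOnlySpan<char>
foreach (var word in Split.Words(example)) { }

// UTF-8 byte input → tokens are ReadOnlySpan<byte>
byte[] utf8 = Encoding.UTF8.GetBytes(example);
foreach (var word in Split.Words(utf8)) { }

// Memory<char>: pass the Span property
Memory<char> memory = example.ToCharArray().AsMemory();
foreach (var word in Split.Words(memory.Span)) { }
```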

### Conformance

We use the official Unicode [test suites](https://unicode.org/reports/tr41/tr41-26.html#Tests29). Status:

[![.NET](https://github.com/clipperhouse/uax29.net/actions/workflows/dotnet.yml/badge.svg)](https://github.com/clipperhouse/uax29.net/actions/workflows/dotnet.yml)

-This is the same algorithm that is implemented in Lucene's [StandardTokenizer](https://lucene.apache.org/core/6_5_0/core/org/apache/lucene/analysis/standard/StandardTokenizer.html).
+This is the same spec that is implemented in Lucene's [StandardTokenizer](https://lucene.apache.org/core/6_5_0/core/org/apache/lucene/analysis/standard/StandardTokenizer.html).

### Performance

When tokenizing words, I get around 120MB/s on my Macbook M2. For typical text, that's around 30 million tokens/s. [Benchmarks](https://github.com/clipperhouse/uax29.net/tree/main/Benchmarks)

The tokenizer is implemented as a `ref struct`, so you should see zero allocations for static text such as `byte[]` or `string`/`char`.

-Calling `GetWords` et al returns a lazy enumerator, and will not allocate per-token. There are `ToList` and `ToArray` methods for convenience, which will allocate.
+Calling `Split.Words` returns a lazy enumerator, and will not allocate per-token. There are `ToList` and `ToArray` methods for convenience, which will allocate.

-For `Stream` or `TextReader`/`StreamReader`, a buffer needs to be allocated behind the scenes. You can specify the size when calling `GetWords`. You can also optionally pass your own `byte[]` or `char[]` to do your own allocation, perhaps with [ArrayPool](https://learn.microsoft.com/en-us/dotnet/api/system.buffers.arraypool-1). Or, you can re-use the buffer by calling `SetStream` on an existing tokenizer, which will avoid re-allocation.
+For `Stream` or `TextReader`/`StreamReader`, a buffer needs to be allocated behind the scenes. You can specify the size when calling `Split.Words`. You can also optionally pass your own `byte[]` or `char[]` to do your own allocation, perhaps with [ArrayPool](https://github.com/clipperhouse/uax29.net/blob/main/Benchmarks/Program.cs#L89). Or, you can re-use the buffer by calling `SetStream` on an existing tokenizer, which will avoid re-allocation.
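A sketch of that buffer-supplying pattern, following the `SplitStreamArrayPool` benchmark in this PR (the `minBufferBytes` and `bufferStorage` parameter names appear there; the file paths are illustrative):

```csharp
using System.Buffers;
using UAX29;

var pool = ArrayPool<byte>.Shared;
var storage = pool.Rent(2048); // bring your own buffer

using var stream = File.OpenRead("sample.txt");
var tokens = Split.Words(stream, minBufferBytes: 1024, bufferStorage: storage);
foreach (var token in tokens) { }

// Re-use the tokenizer and its buffer on another stream, avoiding re-allocation:
using var stream2 = File.OpenRead("sample2.txt");
tokens.SetStream(stream2);
foreach (var token in tokens) { }

pool.Return(storage);
```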

### Options

-Pass `Options.OmitWhitespace` if you would like whitespace-only tokens not to be returned.
+Pass `Options.OmitWhitespace` if you would like whitespace-only tokens not to be returned (for words only).
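For example, mirroring the `SplitBytesOmitWhitespace` benchmark above:

```csharp
var tokens = Split.Words("Hello, 🌏 world.", Options.OmitWhitespace);
foreach (var token in tokens)
{
    // whitespace-only tokens between the words are skipped
}
```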

### Invalid inputs

-The tokenizer expects valid (decodable) UTF-8 bytes or UTF-16 chars as input. We [make an effort](https://github.com/clipperhouse/uax29.net/blob/main/uax29/Unicode.Test.cs#L55) to ensure that all bytes will be returned even if invalid, i.e. to be lossless in any case, though the resulting tokenization may not be useful. Garbage in, garbage out.
+The tokenizer expects valid (decodable) UTF-8 bytes or UTF-16 chars as input. We [make an effort](https://github.com/clipperhouse/uax29.net/blob/main/uax29/Unicode.Test.cs#L80) to ensure that all bytes will be returned even if invalid, i.e. to be lossless in any case, though the resulting tokenization may not be useful. Garbage in, garbage out.
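A sketch of that lossless behavior (the byte values are arbitrary; the exact token boundaries around invalid bytes are not specified):

```csharp
// 0xFF can never appear in valid UTF-8; the splitter still returns every byte.
byte[] invalid = { 0x68, 0x69, 0xFF, 0x68, 0x69 }; // "hi", invalid byte, "hi"
foreach (var token in Split.Words(invalid))
{
    // All five bytes come back across the tokens, though the
    // tokenization around 0xFF may not be meaningful.
}
```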

### Major version changes

-If you are using v1.x of this package, v2 has been renamed:
+#### v2 → v3
+
+Renamed methods:
+
+`Tokenizer.GetWords(input)` → `Split.Words(input)`
+
+#### v1 → v2
+
+Renamed package, namespace and methods:

`dotnet add package uax29.net` → `dotnet add package UAX29`

`using uax29` → `using UAX29`

-We renamed the methods:

`Tokenizer.Create(input)` → `Tokenizer.GetWords(input)`

`Tokenizer.Create(input, TokenType.Graphemes)` → `Tokenizer.GetGraphemes(input)`