Skip to content

Commit

Permalink
better calculation for whitespace
Browse files Browse the repository at this point in the history
  • Loading branch information
clipperhouse committed Jul 17, 2024
1 parent 754b9fd commit 1b0a52a
Show file tree
Hide file tree
Showing 9 changed files with 59 additions and 49 deletions.
23 changes: 19 additions & 4 deletions Codegen/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,26 @@ static async Task WriteCategories(string typ)

if (typ == "Word")
{
// hack in a Tab category that the spec doesn't use, be we do
const string tab = "Tab";
const string ws = "Whitespace";
currentCat <<= 1;
cats.Add(tab, currentCat);
catsByRune.Add(0x09, tab);
cats.Add(ws, currentCat);

for (var i = 0; i < char.MaxValue; i++)
{
var ch = (char)i;
if (char.IsWhiteSpace(ch))
{
var r = new Rune(ch);
if (catsByRune.TryGetValue(r.Value, out string? existing))
{
catsByRune[r.Value] = $"{existing} | {ws}";
}
else
{
catsByRune.Add(r.Value, ws);
}
}
}
}

// write the file
Expand Down
4 changes: 2 additions & 2 deletions uax29/Graphemes.Splitter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ internal Splitter(Decoders<TSpan> decoders)
/// <param name="input">The string in which to split graphemes.</param>
/// <param name="_">Ignore, only applicable to splitting words, not graphemes.</param>
/// <returns>The number of bytes/chars that comprise the grapheme.</returns>
internal int Split(ReadOnlySpan<TSpan> input, out Property _) // this out param is only relevant in Words.Splitter
internal int Split(ReadOnlySpan<TSpan> input, out bool _) // this out param is only relevant in Words.Splitter
{
Debug.Assert(input.Length > 0);

Expand Down Expand Up @@ -163,7 +163,7 @@ internal int Split(ReadOnlySpan<TSpan> input, out Property _) // this out para
break;
}

_ = 0; // see the Property out parameter at tops
_ = false; // see the out parameter at top
return pos;
}
}
Expand Down
7 changes: 1 addition & 6 deletions uax29/Options.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,7 @@ public enum Options : byte
None = 0,

/// <summary>
/// Omit tokens that consist entirely of whitespace, defined as UAX #29 WSegSpace | CR | LF | Tab.
/// <para>
/// “Whitespace” in this implementation includes those which delimit words, but not all characters that are categorically whitespace.
/// For example, “non-breaking space” is whitespace, but it’s not what you want when splitting words, and so
/// it is not considered whitespace for our purposes.
/// </para>
/// Omit tokens that consist entirely of whitespace, as defined by char.IsWhiteSpace.
/// <para>* Only supported for splitting Words; ignored for Graphemes and Sentences. *</para>
/// </summary>
OmitWhitespace = 1,
Expand Down
4 changes: 2 additions & 2 deletions uax29/Sentences.Splitter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ internal Splitter(Decoders<TSpan> decoders)
/// <param name="input">The string in which to split sentences.</param>
/// <param name="_">Ignore, only applicable to splitting words, not sentences.</param>
/// <returns>The number of bytes/chars that comprise the sentence.</returns>
internal int Split(ReadOnlySpan<TSpan> input, out Property _) // this out param is only relevant in Words.Splitter
internal int Split(ReadOnlySpan<TSpan> input, out bool _) // this out param is only relevant in Words.Splitter
{
Debug.Assert(input.Length > 0);

Expand Down Expand Up @@ -247,7 +247,7 @@ internal int Split(ReadOnlySpan<TSpan> input, out Property _) // this out para
pos += w;
}

_ = 0; // see the out Property parameter at top
_ = false; // see the out parameter at top

return pos;

Expand Down
4 changes: 2 additions & 2 deletions uax29/SplitEnumerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,14 @@ public bool MoveNext()

while (end < input.Length)
{
var advance = this.split(input[end..], out var seen);
var advance = this.split(input[end..], out var whitespace);
Debug.Assert(advance > 0);

start = end;
end += advance;

// This option is only supported for words; prevent other uses at the static API level
if (options.Includes(Options.OmitWhitespace) && seen.IsExclusively(Words.Whitespace))
if (whitespace && options.Includes(Options.OmitWhitespace))
{
continue;
}
Expand Down
2 changes: 1 addition & 1 deletion uax29/Splitter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
/// <typeparam name="T">byte or char, indicating the type of the input, and by implication, the output.</typeparam>
/// <param name="input">The string to split/tokenize.</param>
/// <returns>How many bytes/chars were consumed from the input.</returns>
internal delegate int Split<T>(ReadOnlySpan<T> input, out Property seen);
internal delegate int Split<T>(ReadOnlySpan<T> input, out bool whitespace);

internal static class PropertyExtensions
{
Expand Down
4 changes: 2 additions & 2 deletions uax29/StreamEnumerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,13 @@ public bool MoveNext()
count += end;
buffer.Consume(this.Current.Length); // previous token

var advance = this.split(buffer.Contents, out Property seen);
var advance = this.split(buffer.Contents, out var whitespace);
Debug.Assert(advance > 0);

end = advance;

// This option is only supported for words; prevent other uses at the static API level
if (options.Includes(Options.OmitWhitespace) && seen.IsExclusively(Words.Whitespace))
if (whitespace && options.Includes(Options.OmitWhitespace))
{
continue;
}
Expand Down
50 changes: 26 additions & 24 deletions uax29/Words.Dict.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ internal static partial class Words
const Property ZWJ = 65536;
const Property WSegSpace = 131072;
const Property Extended_Pictographic = 262144;
const Property Tab = 524288;
const Property Whitespace = 524288;

static readonly Dict Dict = new(GetDict());
static Dictionary<int, Property> GetDict() => new()
Expand Down Expand Up @@ -107,13 +107,13 @@ internal static partial class Words
{0xFB4D, Hebrew_Letter},
{0xFB4E, Hebrew_Letter},
{0xFB4F, Hebrew_Letter},
{0x000D, CR},
{0x000A, LF},
{0x000B, Newline},
{0x000C, Newline},
{0x0085, Newline},
{0x2028, Newline},
{0x2029, Newline},
{0x000D, CR | Whitespace},
{0x000A, LF | Whitespace},
{0x000B, Newline | Whitespace},
{0x000C, Newline | Whitespace},
{0x0085, Newline | Whitespace},
{0x2028, Newline | Whitespace},
{0x2029, Newline | Whitespace},
{0x0300, Extend},
{0x0301, Extend},
{0x0302, Extend},
Expand Down Expand Up @@ -33298,7 +33298,7 @@ internal static partial class Words
{0x1FBF8, Numeric},
{0x1FBF9, Numeric},
{0x005F, ExtendNumLet},
{0x202F, ExtendNumLet},
{0x202F, ExtendNumLet | Whitespace},
{0x203F, ExtendNumLet},
{0x2040, ExtendNumLet},
{0x2054, ExtendNumLet},
Expand All @@ -33309,20 +33309,20 @@ internal static partial class Words
{0xFE4F, ExtendNumLet},
{0xFF3F, ExtendNumLet},
{0x200D, ZWJ},
{0x0020, WSegSpace},
{0x1680, WSegSpace},
{0x2000, WSegSpace},
{0x2001, WSegSpace},
{0x2002, WSegSpace},
{0x2003, WSegSpace},
{0x2004, WSegSpace},
{0x2005, WSegSpace},
{0x2006, WSegSpace},
{0x2008, WSegSpace},
{0x2009, WSegSpace},
{0x200A, WSegSpace},
{0x205F, WSegSpace},
{0x3000, WSegSpace},
{0x0020, WSegSpace | Whitespace},
{0x1680, WSegSpace | Whitespace},
{0x2000, WSegSpace | Whitespace},
{0x2001, WSegSpace | Whitespace},
{0x2002, WSegSpace | Whitespace},
{0x2003, WSegSpace | Whitespace},
{0x2004, WSegSpace | Whitespace},
{0x2005, WSegSpace | Whitespace},
{0x2006, WSegSpace | Whitespace},
{0x2008, WSegSpace | Whitespace},
{0x2009, WSegSpace | Whitespace},
{0x200A, WSegSpace | Whitespace},
{0x205F, WSegSpace | Whitespace},
{0x3000, WSegSpace | Whitespace},
{0x00A9, Extended_Pictographic},
{0x00AE, Extended_Pictographic},
{0x203C, Extended_Pictographic},
Expand Down Expand Up @@ -36854,6 +36854,8 @@ internal static partial class Words
{0x1FFFB, Extended_Pictographic},
{0x1FFFC, Extended_Pictographic},
{0x1FFFD, Extended_Pictographic},
{0x0009, Tab},
{0x0009, Whitespace},
{0x00A0, Whitespace},
{0x2007, Whitespace},
}; // end dict
}; // end class
10 changes: 4 additions & 6 deletions uax29/Words.Splitter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@

internal static partial class Words
{
internal const Property Whitespace = CR | LF | WSegSpace | Tab;

internal static readonly Split<byte> SplitBytes = new Splitter<byte>(Decoders.Utf8).Split;
internal static readonly Split<char> SplitChars = new Splitter<char>(Decoders.Char).Split;

Expand All @@ -31,14 +29,14 @@ internal Splitter(Decoders<TSpan> decoders)
/// <param name="input">The string in which to split words.</param>
/// <param name="whitespace">Whether every rune examined in the first word has the Whitespace category.</param>
/// <returns>The number of bytes/chars that comprise the word.</returns>
internal int Split(ReadOnlySpan<TSpan> input, out Property seen)
internal int Split(ReadOnlySpan<TSpan> input, out bool whitespace)
{
Debug.Assert(input.Length > 0);

// These vars are stateful across loop iterations
int pos = 0;
int w;
seen = 0;
whitespace = true;
Property current = 0;
Property lastExIgnore = 0; // "last excluding ignored categories"
Property lastLastExIgnore = 0; // "the last one before that"
Expand All @@ -58,7 +56,7 @@ internal int Split(ReadOnlySpan<TSpan> input, out Property seen)

pos += w;
current = Dict.Lookup(rune.Value);
seen |= current;
whitespace = whitespace && current.Is(Whitespace);
}

// https://unicode.org/reports/tr29/#WB2
Expand All @@ -79,7 +77,7 @@ internal int Split(ReadOnlySpan<TSpan> input, out Property seen)
lastExIgnore = last;
}

seen |= last;
whitespace = whitespace && current.Is(Whitespace);

current = Dict.Lookup(rune.Value);

Expand Down

0 comments on commit 1b0a52a

Please sign in to comment.