Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implementing Char for graphemes and all the necessary changes to make it happen. #698

Merged
merged 14 commits into from
Jan 1, 2025
Merged
6 changes: 3 additions & 3 deletions benches/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -245,9 +245,9 @@ fn bench_then(c: &mut Criterion) {

#[cfg(feature = "regex")]
fn bench_regex(c: &mut Criterion) {
let re_foo = regex::<_, _, extra::Default>("foo");
let re_foo2 = regex::<_, _, extra::Default>("[fF]oo");
let re_rep = regex::<_, _, extra::Default>("(?:abc){4}");
let re_foo = regex::<_, extra::Default>("foo");
let re_foo2 = regex::<_, extra::Default>("[fF]oo");
let re_rep = regex::<_, extra::Default>("(?:abc){4}");

let mut group = c.benchmark_group("regex");

Expand Down
8 changes: 4 additions & 4 deletions src/combinator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1627,7 +1627,7 @@ where
///
/// ```
/// # use chumsky::prelude::*;
/// let row_4 = text::int::<_, _, extra::Err<Simple<char>>>(10)
/// let row_4 = text::int::<_, extra::Err<Simple<char>>>(10)
/// .padded()
/// .separated_by(just(','))
/// .at_most(4)
Expand Down Expand Up @@ -1659,7 +1659,7 @@ where
///
/// ```
/// # use chumsky::prelude::*;
/// let coordinate_3d = text::int::<_, _, extra::Err<Simple<char>>>(10)
/// let coordinate_3d = text::int::<_, extra::Err<Simple<char>>>(10)
/// .padded()
/// .separated_by(just(','))
/// .exactly(3)
Expand Down Expand Up @@ -1688,7 +1688,7 @@ where
///
/// ```
/// # use chumsky::prelude::*;
/// let r#enum = text::ascii::keyword::<_, _, _, extra::Err<Simple<char>>>("enum")
/// let r#enum = text::ascii::keyword::<_, _, extra::Err<Simple<char>>>("enum")
/// .padded()
/// .ignore_then(text::ascii::ident()
/// .padded()
Expand Down Expand Up @@ -1718,7 +1718,7 @@ where
///
/// ```
/// # use chumsky::prelude::*;
/// let numbers = text::int::<_, _, extra::Err<Simple<char>>>(10)
/// let numbers = text::int::<_, extra::Err<Simple<char>>>(10)
/// .padded()
/// .separated_by(just(','))
/// .allow_trailing()
Expand Down
68 changes: 65 additions & 3 deletions src/container.rs
Original file line number Diff line number Diff line change
Expand Up @@ -762,7 +762,7 @@ impl<'p> Seq<'p, char> for str {
}
}

impl<'p> Seq<'p, char> for &'p str {
impl<'p> Seq<'p, char> for String {
type Item<'a>
= char
where
Expand Down Expand Up @@ -792,7 +792,7 @@ impl<'p> Seq<'p, char> for &'p str {
}
}

impl<'p> Seq<'p, char> for String {
impl<'p> Seq<'p, char> for &'p str {
type Item<'a>
= char
where
Expand Down Expand Up @@ -822,6 +822,66 @@ impl<'p> Seq<'p, char> for String {
}
}

// Lets a borrowed string slice be used as a sequence of grapheme tokens.
// Every method wraps the `&str` in `Graphemes::new` and delegates to that view,
// so no state is kept between calls.
impl<'p> Seq<'p, &'p Grapheme> for &'p str {
// Items are handed out as `&'p Grapheme`, i.e. tied to the lifetime of the
// underlying string rather than to the borrow of `self`.
type Item<'a>
= &'p Grapheme
where
Self: 'a;

// Iterator type returned by `Graphemes::iter`.
type Iter<'a>
= GraphemesIter<'p>
where
Self: 'a;

// Builds a fresh `Graphemes` view over the string and iterates it.
#[inline(always)]
fn seq_iter(&self) -> Self::Iter<'_> {
Graphemes::new(self).iter()
}

// Membership test, delegated to `Graphemes::contains`.
// NOTE(review): the implementation of `Graphemes::contains` is not visible
// here; presumably it compares graphemes for equality - confirm.
#[inline(always)]
fn contains(&self, val: &&'p Grapheme) -> bool {
Graphemes::new(self).contains(val)
}

// The item is already a `&'p Grapheme`, so it is wrapped by value
// (`MaybeRef::Val`) rather than borrowed again.
#[inline]
fn to_maybe_ref<'b>(item: Self::Item<'b>) -> MaybeRef<'p, &'p Grapheme>
where
'p: 'b,
{
MaybeRef::Val(item)
}
}

// Lets a borrowed `Graphemes` value be used directly as a sequence of
// grapheme tokens.
impl<'p> Seq<'p, &'p Grapheme> for &'p Graphemes {
// Items are `&'p Grapheme`, tied to the lifetime of the borrowed
// `Graphemes` rather than to the borrow of `self`.
type Item<'a>
= &'p Grapheme
where
Self: 'a;

// Iterator type returned by `Graphemes::iter`.
type Iter<'a>
= GraphemesIter<'p>
where
Self: 'a;

// Iterates the graphemes of the borrowed value.
#[inline(always)]
fn seq_iter(&self) -> Self::Iter<'_> {
self.iter()
}

// Linear scan: true as soon as any grapheme compares equal to `*val`.
#[inline(always)]
fn contains(&self, val: &&'p Grapheme) -> bool {
self.iter().any(|i| i == *val)
}

// The item is already a `&'p Grapheme`, so it is wrapped by value
// (`MaybeRef::Val`) rather than borrowed again.
#[inline]
fn to_maybe_ref<'b>(item: Self::Item<'b>) -> MaybeRef<'p, &'p Grapheme>
where
'p: 'b,
{
MaybeRef::Val(item)
}
}

/// A utility trait to abstract over *linear* container-like things.
///
/// This trait is likely to change in future versions of the crate, so avoid implementing it yourself.
Expand All @@ -838,8 +898,10 @@ impl<'p, T> OrderedSeq<'p, T> for core::ops::RangeInclusive<T> where Self: Seq<'
impl<'p, T> OrderedSeq<'p, T> for RangeFrom<T> where Self: Seq<'p, T> {}

impl OrderedSeq<'_, char> for str {}
impl<'p> OrderedSeq<'p, char> for &'p str {}
impl OrderedSeq<'_, char> for String {}
impl<'p> OrderedSeq<'p, char> for &'p str {}
impl<'p> OrderedSeq<'p, &'p Grapheme> for &'p str {}
impl<'p> OrderedSeq<'p, &'p Grapheme> for &'p Graphemes {}

#[cfg(test)]
mod test {
Expand Down
107 changes: 12 additions & 95 deletions src/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ pub use crate::stream::{BoxedExactSizeStream, BoxedStream, IterInput, Stream};
use super::*;
#[cfg(feature = "std")]
use std::io::{BufReader, Read, Seek};
use unicode_segmentation::{Graphemes, UnicodeSegmentation};

/// A trait for types that represents a stream of input tokens. Unlike [`Iterator`], this type
/// supports backtracking and a few other features required by the crate.
Expand Down Expand Up @@ -210,8 +209,9 @@ pub trait SliceInput<'src>: ExactSizeInput<'src> {
// Implemented by inputs that reference a string slice and use byte indices as their cursor. This trait is sealed right
// now because `StrInput` places additional requirements on its cursor semantics.
/// A trait for types that represent string-like streams of input tokens.
pub trait StrInput<'src, C: Char>:
Sealed + ValueInput<'src, Cursor = usize, Token = C> + SliceInput<'src, Slice = &'src C::Str>
pub trait StrInput<'src>: Sealed + ValueInput<'src, Cursor = usize> + SliceInput<'src>
where
Self::Token: Char,
{
}

Expand Down Expand Up @@ -298,7 +298,7 @@ impl<'src> ValueInput<'src> for &'src str {
}

impl Sealed for &str {}
impl<'src> StrInput<'src, char> for &'src str {}
impl<'src> StrInput<'src> for &'src str {}

impl<'src> SliceInput<'src> for &'src str {
type Slice = &'src str;
Expand All @@ -319,89 +319,6 @@ impl<'src> SliceInput<'src> for &'src str {
}
}

// `Input` implementation that treats a `Graphemes<'src>` iterator as parser
// input. Tokens are `&'src str` slices, one per grapheme; the cursor is a
// `usize` offset into the underlying string, which is kept as the cache.
impl<'src> Input<'src> for Graphemes<'src> {
type Cursor = usize;
type Span = SimpleSpan<usize>;

// Token and MaybeToken are the same type, so `next_maybe` can serve both.
type Token = &'src str;
type MaybeToken = &'src str;

// The cache is the full string backing the grapheme iterator.
type Cache = &'src str;

// Starts at offset 0 and caches the string obtained from `as_str()`.
#[inline]
fn begin(self) -> (Self::Cursor, Self::Cache) {
(0, self.as_str())
}

// The cursor is already a plain offset, so it is returned unchanged.
#[inline]
fn cursor_location(cursor: &Self::Cursor) -> usize {
*cursor
}

// Returns the grapheme that starts at `*cursor` and advances the cursor by
// that grapheme's length, or returns `None` once the cursor reaches the end
// of the cached string.
#[inline(always)]
unsafe fn next_maybe(
this: &mut Self::Cache,
cursor: &mut Self::Cursor,
) -> Option<Self::MaybeToken> {
if *cursor < this.len() {
// SAFETY: `cursor < self.len()` above guarantees cursor is in-bounds
// We only ever return cursors that are at a code point boundary.
// The `next()` implementation returns `None`, only in the
// situation of zero length of the remaining part of the string.
// And the Unicode standard guarantees that any sequence of code
// points is a valid sequence of grapheme clusters, so the
// behaviour of the `next()` function should not change.
let c = this
.get_unchecked(*cursor..)
.graphemes(true)
.next()
.unwrap_unchecked();
// Advance past the whole grapheme so the next call starts at the
// following one.
*cursor += c.len();
Some(c)
} else {
None
}
}

// Builds a span directly from the start and end cursor offsets.
#[inline(always)]
unsafe fn span(_this: &mut Self::Cache, range: Range<&Self::Cursor>) -> Self::Span {
(*range.start..*range.end).into()
}
}

// The cached string has a known length, so an open-ended span can be closed
// at `this.len()`.
impl<'src> ExactSizeInput<'src> for Graphemes<'src> {
// Builds the span running from the given start cursor to the end of the
// cached string.
#[inline(always)]
unsafe fn span_from(this: &mut Self::Cache, range: RangeFrom<&Self::Cursor>) -> Self::Span {
(*range.start..this.len()).into()
}
}

// By-value token access for grapheme input.
impl<'src> ValueInput<'src> for Graphemes<'src> {
// Forwards to `next_maybe`; the caller must uphold the same cursor
// requirements as for that function.
#[inline(always)]
unsafe fn next(this: &mut Self::Cache, cursor: &mut Self::Cursor) -> Option<Self::Token> {
Self::next_maybe(this, cursor)
}
}

// Slicing support: every slice is produced by taking a sub-range of the cached
// string and calling `graphemes(true)` on it, so the slice is itself a
// `Graphemes` iterator.
impl<'src> SliceInput<'src> for Graphemes<'src> {
type Slice = Graphemes<'src>;

// Grapheme iterator over the entire cached string.
#[inline(always)]
fn full_slice(this: &mut Self::Cache) -> Self::Slice {
this.graphemes(true)
}

// Grapheme iterator over the sub-string between the two cursors.
// NOTE(review): this uses ordinary range indexing rather than
// `get_unchecked`, so it presumably panics on an out-of-range or
// non-boundary cursor - confirm this is intended.
#[inline(always)]
unsafe fn slice(this: &mut Self::Cache, range: Range<&Self::Cursor>) -> Self::Slice {
this[*range.start..*range.end].graphemes(true)
}

// Grapheme iterator over the sub-string from the cursor to the end.
#[inline(always)]
unsafe fn slice_from(this: &mut Self::Cache, from: RangeFrom<&Self::Cursor>) -> Self::Slice {
this[*from.start..].graphemes(true)
}
}

impl<'src, T> Input<'src> for &'src [T] {
type Cursor = usize;
type Span = SimpleSpan<usize>;
Expand Down Expand Up @@ -448,7 +365,7 @@ impl<'src, T> ExactSizeInput<'src> for &'src [T] {
}

impl Sealed for &[u8] {}
impl<'src> StrInput<'src, u8> for &'src [u8] {}
impl<'src> StrInput<'src> for &'src [u8] {}

impl<'src, T> SliceInput<'src> for &'src [T] {
type Slice = &'src [T];
Expand Down Expand Up @@ -532,7 +449,7 @@ impl<'src, T: 'src, const N: usize> ExactSizeInput<'src> for &'src [T; N] {
}

impl<const N: usize> Sealed for &[u8; N] {}
impl<'src, const N: usize> StrInput<'src, u8> for &'src [u8; N] {}
impl<'src, const N: usize> StrInput<'src> for &'src [u8; N] {}

impl<'src, T: 'src, const N: usize> SliceInput<'src> for &'src [T; N] {
type Slice = &'src [T];
Expand Down Expand Up @@ -881,14 +798,14 @@ where
F: Fn(I::Span) -> S,
{
}
impl<'src, C, S, I, F: 'src> StrInput<'src, C> for MappedSpan<S, I, F>
impl<'src, S, I, F: 'src> StrInput<'src> for MappedSpan<S, I, F>
where
I: StrInput<'src, C>,
I: StrInput<'src>,
I::Token: Char,
S: Span + Clone + 'src,
S::Context: Clone + 'src,
S::Offset: From<<I::Span as Span>::Offset>,
F: Fn(I::Span) -> S,
C: Char,
{
}

Expand Down Expand Up @@ -1027,13 +944,13 @@ where
S::Offset: From<<I::Span as Span>::Offset>,
{
}
impl<'src, C, S, I> StrInput<'src, C> for WithContext<S, I>
impl<'src, S, I> StrInput<'src> for WithContext<S, I>
where
I: StrInput<'src, C>,
I: StrInput<'src>,
I::Token: Char,
S: Span + Clone + 'src,
S::Context: Clone + 'src,
S::Offset: From<<I::Span as Span>::Offset>,
C: Char,
{
}

Expand Down
Loading
Loading