diff --git a/textsplitter/markdown_splitter.go b/textsplitter/markdown_splitter.go index bba511fe9..117d9b1e1 100644 --- a/textsplitter/markdown_splitter.go +++ b/textsplitter/markdown_splitter.go @@ -24,6 +24,7 @@ func NewMarkdownTextSplitter(opts ...Option) *MarkdownTextSplitter { CodeBlocks: options.CodeBlocks, ReferenceLinks: options.ReferenceLinks, HeadingHierarchy: options.KeepHeadingHierarchy, + JoinTableRows: options.JoinTableRows, } if sp.SecondSplitter == nil { @@ -55,6 +56,7 @@ type MarkdownTextSplitter struct { CodeBlocks bool ReferenceLinks bool HeadingHierarchy bool + JoinTableRows bool } // SplitText splits a text into multiple text. @@ -71,6 +73,7 @@ func (sp MarkdownTextSplitter) SplitText(text string) ([]string, error) { secondSplitter: sp.SecondSplitter, renderCodeBlocks: sp.CodeBlocks, useInlineContent: !sp.ReferenceLinks, + joinTableRows: sp.JoinTableRows, hTitleStack: []string{}, hTitlePrependHierarchy: sp.HeadingHierarchy, } @@ -126,6 +129,10 @@ type markdownContext struct { // useInlineContent determines whether the default inline content is rendered useInlineContent bool + + // joinTableRows determines whether a chunk should contain multiple table rows, + // or if each row in a table should be split into a separate chunk. + joinTableRows bool } // splitText splits Markdown text. @@ -425,14 +432,22 @@ func (mc *markdownContext) splitTableRows(header []string, bodies [][]string) { return } - // append table header for _, row := range bodies { line := tableRowInMarkdown(row) - mc.joinSnippet(fmt.Sprintf("%s\n%s", headerMD, line)) + // If we're at the start of the current snippet, or adding the current line would + // overflow the chunk size, prepend the header to the line (so that the new chunk + // will include the table header). 
+ if len(mc.curSnippet) == 0 || utf8.RuneCountInString(mc.curSnippet)+utf8.RuneCountInString(line) >= mc.chunkSize { + line = fmt.Sprintf("%s\n%s", headerMD, line) + } - // keep every row in a single Document - mc.applyToChunks() + mc.joinSnippet(line) + + // If we're not joining table rows, create a new chunk. + if !mc.joinTableRows { + mc.applyToChunks() + } } } diff --git a/textsplitter/markdown_splitter_test.go b/textsplitter/markdown_splitter_test.go index 6c8d99e6f..a75cad2a4 100644 --- a/textsplitter/markdown_splitter_test.go +++ b/textsplitter/markdown_splitter_test.go @@ -95,14 +95,50 @@ Some content below h1>h2>h4.`, } // TestMarkdownHeaderTextSplitter_Table markdown always split by line. +// +//nolint:funlen func TestMarkdownHeaderTextSplitter_Table(t *testing.T) { t.Parallel() + type testCase struct { + name string markdown string + options []Option expectedDocs []schema.Document } + testCases := []testCase{ { + name: "size(64)-overlap(32)", + options: []Option{ + WithChunkSize(64), + WithChunkOverlap(32), + }, + markdown: `| Syntax | Description | +| ----------- | ----------- | +| Header | Title | +| Paragraph | Text |`, + expectedDocs: []schema.Document{ + { + PageContent: `| Syntax | Description | +| --- | --- | +| Header | Title |`, + Metadata: map[string]any{}, + }, + { + PageContent: `| Syntax | Description | +| --- | --- | +| Paragraph | Text |`, + Metadata: map[string]any{}, + }, + }, + }, + { + name: "size(512)-overlap(64)", + options: []Option{ + WithChunkSize(512), + WithChunkOverlap(64), + }, markdown: `| Syntax | Description | | ----------- | ----------- | | Header | Title | @@ -117,6 +153,53 @@ func TestMarkdownHeaderTextSplitter_Table(t *testing.T) { { PageContent: `| Syntax | Description | | --- | --- | +| Paragraph | Text |`, + Metadata: map[string]any{}, + }, + }, + }, + { + name: "big-tables-overflow", + options: []Option{ + WithChunkSize(64), + WithChunkOverlap(32), + WithJoinTableRows(true), + }, + markdown: `| Syntax | Description 
| +| ----------- | ----------- | +| Header | Title | +| Paragraph | Text |`, + expectedDocs: []schema.Document{ + { + PageContent: `| Syntax | Description | +| --- | --- | +| Header | Title |`, + Metadata: map[string]any{}, + }, + { + PageContent: `| Syntax | Description | +| --- | --- | +| Paragraph | Text |`, + Metadata: map[string]any{}, + }, + }, + }, + { + name: "big-tables", + options: []Option{ + WithChunkSize(128), + WithChunkOverlap(32), + WithJoinTableRows(true), + }, + markdown: `| Syntax | Description | +| ----------- | ----------- | +| Header | Title | +| Paragraph | Text |`, + expectedDocs: []schema.Document{ + { + PageContent: `| Syntax | Description | +| --- | --- | +| Header | Title | | Paragraph | Text |`, Metadata: map[string]any{}, }, @@ -125,15 +208,17 @@ func TestMarkdownHeaderTextSplitter_Table(t *testing.T) { } for _, tc := range testCases { - splitter := NewMarkdownTextSplitter(WithChunkSize(64), WithChunkOverlap(32)) - docs, err := CreateDocuments(splitter, []string{tc.markdown}, nil) - require.NoError(t, err) - assert.Equal(t, tc.expectedDocs, docs) + t.Run(tc.name, func(t *testing.T) { + t.Parallel() - splitter = NewMarkdownTextSplitter(WithChunkSize(512), WithChunkOverlap(64)) - docs, err = CreateDocuments(splitter, []string{tc.markdown}, nil) - require.NoError(t, err) - assert.Equal(t, tc.expectedDocs, docs) + rq := require.New(t) + + splitter := NewMarkdownTextSplitter(tc.options...) + + docs, err := CreateDocuments(splitter, []string{tc.markdown}, nil) + rq.NoError(err) + rq.Equal(tc.expectedDocs, docs) + }) } } diff --git a/textsplitter/options.go b/textsplitter/options.go index 50ac76940..3d1664353 100644 --- a/textsplitter/options.go +++ b/textsplitter/options.go @@ -17,6 +17,7 @@ type Options struct { CodeBlocks bool ReferenceLinks bool KeepHeadingHierarchy bool // Persist hierarchy of markdown headers in each chunk + JoinTableRows bool } // DefaultOptions returns the default options for all text splitter. 
@@ -145,3 +146,14 @@ func WithHeadingHierarchy(trackHeadingHierarchy bool) Option { o.KeepHeadingHierarchy = trackHeadingHierarchy } } + +// WithJoinTableRows sets whether tables should be split by row or not. When it is set to true, +// table rows are joined up to the chunk size. When it is set to false (the default), tables are +// split by row. +// +// The default behavior is to split tables by row, so that each row is in a separate chunk. +func WithJoinTableRows(join bool) Option { + return func(o *Options) { + o.JoinTableRows = join + } +}