-
-
Notifications
You must be signed in to change notification settings - Fork 743
/
Copy pathtoken_splitter.go
82 lines (71 loc) · 1.91 KB
/
token_splitter.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
package textsplitter
import (
"fmt"
"github.com/pkoukk/tiktoken-go"
)
// Package-level defaults for token-based splitting: a model name, a tokenizer
// encoding name, a chunk size, and a chunk overlap.
// NOTE(review): nothing visible in this file reads these constants; presumably
// DefaultOptions (defined elsewhere in the package) consumes them — confirm.
// NOTE(review): the gosec suppression below is presumably there because the
// word "Token" in the identifier looks like a hard-coded credential to the
// linter — confirm.
const (
// nolint:gosec
_defaultTokenModelName = "gpt-3.5-turbo"
_defaultTokenEncoding = "cl100k_base"
// NOTE(review): size and overlap are presumably counted in tokens, matching
// how TokenSplitter uses ChunkSize and ChunkOverlap — confirm.
_defaultTokenChunkSize = 512
_defaultTokenChunkOverlap = 100
)
// TokenSplitter is a text splitter that will split texts by tokens.
//
// Text is encoded to token IDs with a tiktoken tokenizer, cut into windows of
// token IDs, and each window is decoded back into a string.
type TokenSplitter struct {
// ChunkSize is the maximum number of tokens placed in one chunk.
ChunkSize int
// ChunkOverlap is the number of tokens shared by consecutive chunks: the
// window advances by ChunkSize-ChunkOverlap tokens per chunk, so it should
// be smaller than ChunkSize.
ChunkOverlap int
// ModelName selects the tokenizer through tiktoken.EncodingForModel. It is
// consulted only when EncodingName is empty.
ModelName string
// EncodingName selects the tokenizer through tiktoken.GetEncoding. When it
// is non-empty it takes precedence over ModelName.
EncodingName string
// AllowedSpecial is passed unchanged to the tokenizer's Encode call.
AllowedSpecial []string
// DisallowedSpecial is passed unchanged to the tokenizer's Encode call.
DisallowedSpecial []string
}
// NewTokenSplitter builds a TokenSplitter. It starts from DefaultOptions,
// applies every supplied Option in the order given, and copies the resulting
// chunk, model, encoding, and special-token settings onto the splitter.
func NewTokenSplitter(opts ...Option) TokenSplitter {
	cfg := DefaultOptions()
	for i := range opts {
		opts[i](&cfg)
	}

	return TokenSplitter{
		ChunkSize:         cfg.ChunkSize,
		ChunkOverlap:      cfg.ChunkOverlap,
		ModelName:         cfg.ModelName,
		EncodingName:      cfg.EncodingName,
		AllowedSpecial:    cfg.AllowedSpecial,
		DisallowedSpecial: cfg.DisallowedSpecial,
	}
}
// SplitText splits text into token-bounded chunks and returns the chunks as
// decoded strings.
//
// The tokenizer is looked up with tiktoken.GetEncoding when EncodingName is
// set, and with tiktoken.EncodingForModel(ModelName) otherwise. If the lookup
// returns an error, SplitText returns a nil slice and that error wrapped with
// the name of the lookup that actually failed, so callers can tell a bad
// encoding name from a bad model name.
func (s TokenSplitter) SplitText(text string) ([]string, error) {
	var (
		tk  *tiktoken.Tiktoken
		err error
	)

	if s.EncodingName != "" {
		tk, err = tiktoken.GetEncoding(s.EncodingName)
		if err != nil {
			return nil, fmt.Errorf("tiktoken.GetEncoding: %w", err)
		}
	} else {
		tk, err = tiktoken.EncodingForModel(s.ModelName)
		if err != nil {
			return nil, fmt.Errorf("tiktoken.EncodingForModel: %w", err)
		}
	}

	return s.splitText(text, tk), nil
}
// splitText encodes text with tk, slides a window of ChunkSize tokens across
// the resulting token IDs, and returns every window decoded back to a string.
// The window advances by ChunkSize-ChunkOverlap tokens per step, so adjacent
// chunks share ChunkOverlap tokens. The returned slice is never nil; it is
// empty when the text encodes to no tokens.
//
// This method has no error return, so settings that cannot produce a forward
// moving window are degraded instead of looping forever or slicing out of
// range:
//   - ChunkSize <= 0: the whole input is returned as a single chunk.
//   - ChunkOverlap >= ChunkSize: the overlap is ignored and the chunks are
//     laid out back to back.
//
// A negative ChunkOverlap is left as is: the stride then exceeds ChunkSize and
// the tokens between two windows are skipped.
func (s TokenSplitter) splitText(text string, tk *tiktoken.Tiktoken) []string {
	inputIDs := tk.Encode(text, s.AllowedSpecial, s.DisallowedSpecial)
	total := len(inputIDs)
	if total == 0 {
		return []string{}
	}

	size := s.ChunkSize
	if size <= 0 {
		// No usable window width; fall back to one window covering everything.
		size = total
	}

	stride := size - s.ChunkOverlap
	if stride <= 0 {
		// A non-positive stride would never reach the end of the input
		// (stride == 0) or would index before its start (stride < 0).
		stride = size
	}

	// total >= 1 and stride >= 1 here, so this is ceil(total/stride) computed
	// without the overflow that total+stride-1 could hit for a huge stride.
	splits := make([]string, 0, (total-1)/stride+1)
	for start := 0; start < total; start += stride {
		// Clamp the window to the input; comparing against total-start keeps
		// start+size from overflowing when ChunkSize is very large.
		end := total
		if size < total-start {
			end = start + size
		}
		splits = append(splits, tk.Decode(inputIDs[start:end]))
	}

	return splits
}