-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgpt2.go
32 lines (27 loc) · 852 Bytes
/
gpt2.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
package tiktoken
import (
_ "embed"
"strings"
)
// gpt2Vocab holds the contents of resource/gpt2/vocab.bpe, embedded into the
// binary at build time. NewGPT2 reads it as the vocab/BPE input when building
// the mergeable ranks.
//
//go:embed resource/gpt2/vocab.bpe
var gpt2Vocab string

// gpt2Encode holds the contents of resource/gpt2/encoder.json, embedded into
// the binary at build time. NewGPT2 reads it as the encoder JSON input when
// building the mergeable ranks.
//
//go:embed resource/gpt2/encoder.json
var gpt2Encode string
// NewGPT2 returns a Codec configured for the GPT-2 tokenization scheme.
//
// The mergeable BPE ranks are built from the embedded gpt2Vocab and gpt2Encode
// resources. If that conversion fails, the error is returned as-is together
// with a nil Codec.
func NewGPT2() (*Codec, error) {
	vocabReader := strings.NewReader(gpt2Vocab)
	encoderReader := strings.NewReader(gpt2Encode)

	mergeableRanks, err := CovertVocabBPEAndEncoderJSONToMergeableBPERanks(vocabReader, encoderReader)
	if err != nil {
		return nil, err
	}

	// EndOfText is the only special token registered for this codec.
	specialTokens := map[string]uint{EndOfText: 50256}

	codec := &Codec{
		Name:           "gpt2",
		ExplicitNVocab: 50257,
		// NOTE(review): the pattern contains a negative lookahead `(?!\S)`,
		// which the standard library regexp package does not accept —
		// presumably Codec compiles PatStr with a different engine; confirm.
		PatStr:         `'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`,
		MergeableRanks: mergeableRanks,
		SpecialTokens:  specialTokens,
	}
	return codec, nil
}