FEATURE: Gemini Tokenizer
xfalcox committed Jan 23, 2025
1 parent 5a97752 commit 822150b
Showing 7 changed files with 839,001 additions and 3 deletions.
4 changes: 2 additions & 2 deletions app/models/embedding_definition.rb
@@ -20,7 +20,7 @@ def tokenizer_names
         DiscourseAi::Tokenizer::AllMpnetBaseV2Tokenizer,
         DiscourseAi::Tokenizer::BgeLargeEnTokenizer,
         DiscourseAi::Tokenizer::BgeM3Tokenizer,
-        DiscourseAi::Tokenizer::OpenAiTokenizer,
+        DiscourseAi::Tokenizer::GeminiTokenizer,
         DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer,
         DiscourseAi::Tokenizer::OpenAiTokenizer,
       ].map(&:name)
@@ -61,7 +61,7 @@ def presets
           pg_function: "<=>",
           url:
             "https://generativelanguage.googleapis.com/v1beta/models/embedding-001:embedContent",
-          tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
+          tokenizer_class: "DiscourseAi::Tokenizer::GeminiTokenizer",
           provider: GOOGLE,
         },
         {
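The preset stores the tokenizer as a class-name string rather than a constant. A minimal sketch of how such a string could be resolved and used, assuming a Rails-style `constantize` lookup (the resolution code is not part of this diff):

```ruby
# Hypothetical sketch; the actual lookup in discourse-ai is not shown in
# this commit. Assumes ActiveSupport's String#constantize is available.
preset = {
  url: "https://generativelanguage.googleapis.com/v1beta/models/embedding-001:embedContent",
  tokenizer_class: "DiscourseAi::Tokenizer::GeminiTokenizer",
}

tokenizer = preset[:tokenizer_class].constantize
# Count tokens before sending text to the embedding endpoint.
token_count = tokenizer.size("some post text to embed")
```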
2 changes: 1 addition & 1 deletion lib/completions/llm.rb
@@ -56,7 +56,7 @@ def presets
             display_name: "Gemini 1.5 Flash",
           },
         ],
-        tokenizer: DiscourseAi::Tokenizer::OpenAiTokenizer,
+        tokenizer: DiscourseAi::Tokenizer::GeminiTokenizer,
         provider: "google",
       },
       {
1 change: 1 addition & 0 deletions lib/tokenizer/basic_tokenizer.rb
@@ -7,6 +7,7 @@ class << self
       def available_llm_tokenizers
         [
           DiscourseAi::Tokenizer::AnthropicTokenizer,
+          DiscourseAi::Tokenizer::GeminiTokenizer,
           DiscourseAi::Tokenizer::Llama3Tokenizer,
           DiscourseAi::Tokenizer::MixtralTokenizer,
           DiscourseAi::Tokenizer::OpenAiTokenizer,
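The spec added below exercises `size` and `truncate` on the new class. A minimal sketch of how BasicTokenizer plausibly implements them on top of the tokenizers gem (assumed; the shipped implementation is not part of this diff):

```ruby
# Assumed sketch of BasicTokenizer's helpers, built on the tokenizers gem
# API (encode/decode) that the new GeminiTokenizer relies on. The real
# class may differ.
require "tokenizers"

module DiscourseAi
  module Tokenizer
    class BasicTokenizer
      def self.size(text)
        tokenizer.encode(text).ids.length
      end

      def self.truncate(text, max_length)
        # Decoding a prefix of token ids cuts on token boundaries, which
        # is why the multibyte emoji in the spec truncates cleanly.
        tokenizer.decode(tokenizer.encode(text).ids.take(max_length))
      end
    end
  end
end
```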
11 changes: 11 additions & 0 deletions lib/tokenizer/gemini_tokenizer.rb
@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+
+module DiscourseAi
+  module Tokenizer
+    class GeminiTokenizer < BasicTokenizer
+      def self.tokenizer
+        @@tokenizer ||= Tokenizers.from_file("./plugins/discourse-ai/tokenizers/gemma2.json")
+      end
+    end
+  end
+end
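The new tokenizer lazily loads the bundled Gemma 2 vocabulary once per process and inherits counting and truncation from BasicTokenizer. A usage sketch, with expected values taken from the spec below (assumes the plugin's relative tokenizer path resolves):

```ruby
# Usage sketch; expected return values come from the new spec below.
tok = DiscourseAi::Tokenizer::GeminiTokenizer

tok.size("Hello, World! 123") # => 9
sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
tok.truncate(sentence, 3) # => "foo bar"
```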
29 changes: 29 additions & 0 deletions spec/shared/tokenizer_spec.rb
@@ -228,3 +228,32 @@
     end
   end
 end
+
+describe DiscourseAi::Tokenizer::GeminiTokenizer do
+  describe "#size" do
+    describe "returns a token count" do
+      it "for a sentence with punctuation and capitalization and numbers" do
+        expect(described_class.size("Hello, World! 123")).to eq(9)
+      end
+    end
+  end
+
+  describe "#truncate" do
+    it "truncates a sentence" do
+      sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
+      expect(described_class.truncate(sentence, 3)).to eq("foo bar")
+    end
+
+    it "truncates a sentence successfully at a multibyte unicode character" do
+      sentence = "foo bar πŸ‘¨πŸΏβ€πŸ‘©πŸΏβ€πŸ‘§πŸΏβ€πŸ‘§πŸΏ baz qux quux corge grault garply waldo fred plugh xyzzy thud"
+      expect(described_class.truncate(sentence, 8)).to eq("foo bar πŸ‘¨πŸΏβ€πŸ‘©")
+    end
+
+    it "truncates unicode characters properly when they use more than one token per char" do
+      sentence = "ζˆ‘ε–œζ¬’εƒζ―”θ¨"
+      original_size = described_class.size(sentence)
+      expect(described_class.size(described_class.truncate(sentence, original_size - 2))).to be <
+        original_size
+    end
+  end
+end
4 changes: 4 additions & 0 deletions tokenizers/README.md
@@ -33,3 +33,7 @@ Licensed under MIT License
 ## Meta-Llama-3-70B-Instruct

 Licensed under META LLAMA 3 COMMUNITY LICENSE
+
+## Gemma 2
+
+Licensed under the [Gemma Terms of Use](https://ai.google.dev/gemma/terms)
tokenizers/gemma2.json — large diff not rendered (accounts for the bulk of the 839,001 added lines).
