FIX: Open AI embeddings config migration & Seeded indexes cleanup #1092

Merged (1 commit, Jan 27, 2025)
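
In brief: the first migration copies a pre-existing OpenAI embeddings configuration (previously held in site settings) into the new embedding_definitions table, and the second drops or rebuilds the pre-seeded binary-quantized search indexes whose dimensions no longer match. As a rough orientation for reviewers, the guard conditions for the config copy boil down to the sketch below, which mirrors the checks in #up; it is not code from this PR, and the ENV-variable fallbacks used by fetch_setting are omitted for brevity.

# Sketch only: when would FixBrokenOpenAiEmbeddingsConfig actually copy anything?
open_ai_models = %w[text-embedding-3-large text-embedding-3-small text-embedding-ada-002]

setting = ->(name) do
  DB.query_single("SELECT value FROM site_settings WHERE name = :name", name: name).first
end

not_yet_migrated =
  setting.call("ai_embeddings_selected_model").blank? &&
    DB.query_single("SELECT COUNT(*) FROM embedding_definitions").first == 0

old_config_usable =
  open_ai_models.include?(setting.call("ai_embeddings_model")) &&
    setting.call("ai_openai_api_key").present?

puts "config would be copied: #{not_yet_migrated && old_config_usable}"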
107 changes: 107 additions & 0 deletions db/migrate/20250125162658_fix_broken_open_ai_embeddings_config.rb
@@ -0,0 +1,107 @@
# frozen_string_literal: true

class FixBrokenOpenAiEmbeddingsConfig < ActiveRecord::Migration[7.2]
  def up
    return if fetch_setting("ai_embeddings_selected_model").present?

    return if DB.query_single("SELECT COUNT(*) FROM embedding_definitions").first > 0

    open_ai_models = %w[text-embedding-3-large text-embedding-3-small text-embedding-ada-002]
    current_model = fetch_setting("ai_embeddings_model")
    return if !open_ai_models.include?(current_model)

    endpoint = fetch_setting("ai_openai_embeddings_url") || "https://api.openai.com/v1/embeddings"
    api_key = fetch_setting("ai_openai_api_key")
    return if api_key.blank?

    attrs = {
      display_name: current_model,
      url: endpoint,
      api_key: api_key,
      provider: "open_ai",
    }.merge(model_attrs(current_model))

    persist_config(attrs)
  end

  def fetch_setting(name)
    DB.query_single(
      "SELECT value FROM site_settings WHERE name = :setting_name",
      setting_name: name,
    ).first || ENV["DISCOURSE_#{name&.upcase}"]
  end

  def model_attrs(model_name)
    if model_name == "text-embedding-3-large"
      {
        dimensions: 2000,
        max_sequence_length: 8191,
        id: 7,
        pg_function: "<=>",
        tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
        matryoshka_dimensions: true,
        provider_params: {
          model_name: "text-embedding-3-large",
        },
      }
    elsif model_name == "text-embedding-3-small"
      {
        dimensions: 1536,
        max_sequence_length: 8191,
        id: 6,
        pg_function: "<=>",
        tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
        provider_params: {
          model_name: "text-embedding-3-small",
        },
      }
    else
      {
        dimensions: 1536,
        max_sequence_length: 8191,
        id: 2,
        pg_function: "<=>",
        tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
        provider_params: {
          model_name: "text-embedding-ada-002",
        },
      }
    end
  end

  def persist_config(attrs)
    DB.exec(
      <<~SQL,
        INSERT INTO embedding_definitions (id, display_name, dimensions, max_sequence_length, version, pg_function, provider, tokenizer_class, url, api_key, provider_params, matryoshka_dimensions, created_at, updated_at)
        VALUES (:id, :display_name, :dimensions, :max_sequence_length, 1, :pg_function, :provider, :tokenizer_class, :url, :api_key, :provider_params, :matryoshka_dimensions, :now, :now)
      SQL
      id: attrs[:id],
      display_name: attrs[:display_name],
      dimensions: attrs[:dimensions],
      max_sequence_length: attrs[:max_sequence_length],
      pg_function: attrs[:pg_function],
      provider: attrs[:provider],
      tokenizer_class: attrs[:tokenizer_class],
      url: attrs[:url],
      api_key: attrs[:api_key],
      provider_params: attrs[:provider_params]&.to_json,
      matryoshka_dimensions: !!attrs[:matryoshka_dimensions],
      now: Time.zone.now,
    )

    # We hardcoded the ID to match with already generated embeddings. Let's restart the seq to avoid conflicts.
    DB.exec(
      "ALTER SEQUENCE embedding_definitions_id_seq RESTART WITH :new_seq",
      new_seq: attrs[:id].to_i + 1,
    )

    DB.exec(<<~SQL, new_value: attrs[:id])
      INSERT INTO site_settings(name, data_type, value, created_at, updated_at)
      VALUES ('ai_embeddings_selected_model', 3, ':new_value', NOW(), NOW())
    SQL
  end

  def down
    raise ActiveRecord::IrreversibleMigration
  end
end
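
For anyone verifying this locally, the end state asserted by the spec further down can also be checked by hand from a rails console. A minimal sketch, assuming a standard Discourse setup; the exact id depends on which OpenAI model was configured (7, 6, or 2), and this snippet is not part of the PR:

# Hypothetical post-migration check.
selected =
  DB.query_single(
    "SELECT value FROM site_settings WHERE name = 'ai_embeddings_selected_model'",
  ).first
puts "ai_embeddings_selected_model: #{selected.inspect}"

DB
  .query("SELECT id, display_name, dimensions, provider FROM embedding_definitions")
  .each { |row| puts "#{row.id} #{row.display_name} (#{row.dimensions}d, #{row.provider})" }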
61 changes: 61 additions & 0 deletions db/migrate/20250127145305_clean_unused_embedding_search_indexes.rb
@@ -0,0 +1,61 @@
# frozen_string_literal: true
class CleanUnusedEmbeddingSearchIndexes < ActiveRecord::Migration[7.2]
  def up
    existing_definitions =
      DB.query("SELECT id, dimensions FROM embedding_definitions WHERE id <= 8")

    drop_statements =
      (1..8)
        .reduce([]) do |memo, model_id|
          model = existing_definitions.find { |ed| ed&.id == model_id }

          if model.blank? || !correctly_indexed?(model)
            embedding_tables.each do |type|
              memo << "DROP INDEX IF EXISTS ai_#{type}_embeddings_#{model_id}_1_search_bit;"
            end
          end

          memo
        end
        .join("\n")

    DB.exec(drop_statements) if drop_statements.present?

    amend_statements =
      (1..8)
        .reduce([]) do |memo, model_id|
          model = existing_definitions.find { |ed| ed&.id == model_id }

          memo << amended_idxs(model) if model.present? && !correctly_indexed?(model)

          memo
        end
        .join("\n")

    DB.exec(amend_statements) if amend_statements.present?
  end

  def embedding_tables
    %w[topics posts document_fragments]
  end

  def amended_idxs(model)
    embedding_tables.map { |t| <<~SQL }.join("\n")
      CREATE INDEX IF NOT EXISTS ai_#{t}_embeddings_#{model.id}_1_search_bit ON ai_#{t}_embeddings
      USING hnsw ((binary_quantize(embeddings)::bit(#{model.dimensions})) bit_hamming_ops)
      WHERE model_id = #{model.id} AND strategy_id = 1;
    SQL
  end

  def correctly_indexed?(edef)
    seeded_dimensions[edef.id] == edef.dimensions
  end

  def seeded_dimensions
    { 1 => 768, 2 => 1536, 3 => 1024, 4 => 1024, 5 => 768, 6 => 1536, 7 => 2000, 8 => 1024 }
  end

  def down
    raise ActiveRecord::IrreversibleMigration
  end
end
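
The specs exercise this migration by querying pg_indexes directly, and the same kind of manual check works from a console. A sketch only, under the assumption that the seeded index names follow the ai_<table>_embeddings_<model_id>_1_search_bit pattern created by the earlier binary-index migration:

# Hypothetical check of which pre-seeded bit-quantized indexes survived the cleanup.
names =
  %w[topics posts document_fragments].flat_map do |type|
    (1..8).map { |model_id| "ai_#{type}_embeddings_#{model_id}_1_search_bit" }
  end

remaining =
  DB.query_single("SELECT indexname FROM pg_indexes WHERE indexname IN (:names)", names: names)

puts remaining.sort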
@@ -0,0 +1,73 @@
# frozen_string_literal: true

require "rails_helper"
require Rails.root.join(
  "plugins/discourse-ai/db/migrate/20250125162658_fix_broken_open_ai_embeddings_config",
)

RSpec.describe FixBrokenOpenAiEmbeddingsConfig do
  let(:connection) { ActiveRecord::Base.connection }

  def store_setting(name, val)
    connection.execute <<~SQL
      INSERT INTO site_settings(name, data_type, value, created_at, updated_at)
      VALUES ('#{name}', 3, '#{val}', NOW(), NOW())
    SQL
  end

  def configured_model_id
    DB.query_single(
      "SELECT value FROM site_settings WHERE name = 'ai_embeddings_selected_model'",
    ).first
  end

  describe "#up" do
    context "when embeddings are already configured" do
      fab!(:embedding_definition)

      before { store_setting("ai_embeddings_selected_model", embedding_definition.id) }

      it "does nothing" do
        subject.up

        expect(configured_model_id).to eq(embedding_definition.id.to_s)
      end
    end

    context "when there is no previous config" do
      it "does nothing" do
        subject.up

        expect(configured_model_id).to be_blank
      end
    end

    context "when things are not fully configured" do
      before do
        store_setting("ai_embeddings_model", "text-embedding-3-large")
        store_setting("ai_openai_api_key", "")
      end

      it "does nothing" do
        subject.up

        expect(configured_model_id).to be_blank
      end
    end

    context "when we have a configuration that previously failed to copy" do
      before do
        store_setting("ai_embeddings_model", "text-embedding-3-large")
        store_setting("ai_openai_api_key", "123")
      end

      it "copies the config" do
        subject.up

        embedding_def = EmbeddingDefinition.last

        expect(configured_model_id).to eq(embedding_def.id.to_s)
      end
    end
  end
end
@@ -0,0 +1,136 @@
# frozen_string_literal: true

require "rails_helper"
require Rails.root.join(
  "plugins/discourse-ai/db/migrate/20250127145305_clean_unused_embedding_search_indexes",
)

RSpec.describe CleanUnusedEmbeddingSearchIndexes do
  let(:connection) { ActiveRecord::Base.connection }

  describe "#up" do
    before do
      # Copied from 20241008054440_create_binary_indexes_for_embeddings
      %w[topics posts document_fragments].each do |type|
        # our supported embeddings models IDs and dimensions
        [
          [1, 768],
          [2, 1536],
          [3, 1024],
          [4, 1024],
          [5, 768],
          [6, 1536],
          [7, 2000],
          [8, 1024],
        ].each { |model_id, dimensions| connection.execute <<-SQL }
          CREATE INDEX IF NOT EXISTS ai_#{type}_embeddings_#{model_id}_1_search_bit ON ai_#{type}_embeddings
          USING hnsw ((binary_quantize(embeddings)::bit(#{dimensions})) bit_hamming_ops)
          WHERE model_id = #{model_id} AND strategy_id = 1;
        SQL
      end
    end

    let(:all_idx_names) do
      %w[topics posts document_fragments].reduce([]) do |memo, type|
        (1..8).each { |model_id| memo << "ai_#{type}_embeddings_#{model_id}_1_search_bit" }

        memo
      end
    end

    context "when there are no embedding definitions" do
      it "removes all indexes" do
        subject.up

        remaining_idxs =
          DB.query_single(
            "SELECT indexname FROM pg_indexes WHERE indexname IN (:names)",
            names: all_idx_names,
          )

        expect(remaining_idxs).to be_empty
      end
    end

    context "when there is an embedding definition with the same dimensions" do
      fab!(:embedding_def) { Fabricate(:embedding_definition, id: 2, dimensions: 1536) }

      it "keeps the matching index and removes the rest" do
        expected_model_idxs =
          %w[topics posts document_fragments].reduce([]) do |memo, type|
            memo << "ai_#{type}_embeddings_2_1_search_bit"
          end

        subject.up

        remaining_idxs =
          DB.query_single(
            "SELECT indexname FROM pg_indexes WHERE indexname IN (:names)",
            names: all_idx_names,
          )

        expect(remaining_idxs).to contain_exactly(*expected_model_idxs)
        # This method checks dimensions are correct.
        expect(DiscourseAi::Embeddings::Schema.correctly_indexed?(embedding_def)).to eq(true)
      end
    end

    context "when there is an embedding definition with different dimensions" do
      fab!(:embedding_def) { Fabricate(:embedding_definition, id: 2, dimensions: 1536) }
      fab!(:embedding_def_2) { Fabricate(:embedding_definition, id: 3, dimensions: 768) }

      it "updates the index to use the right dimensions" do
        expected_model_idxs =
          %w[topics posts document_fragments].reduce([]) do |memo, type|
            memo << "ai_#{type}_embeddings_2_1_search_bit"
            memo << "ai_#{type}_embeddings_3_1_search_bit"
          end

        subject.up

        remaining_idxs =
          DB.query_single(
            "SELECT indexname FROM pg_indexes WHERE indexname IN (:names)",
            names: all_idx_names,
          )

        expect(remaining_idxs).to contain_exactly(*expected_model_idxs)
        # This method checks dimensions are correct.
        expect(DiscourseAi::Embeddings::Schema.correctly_indexed?(embedding_def_2)).to eq(true)
      end
    end

    context "when there are indexes outside the pre-seeded range" do
      before { %w[topics posts document_fragments].each { |type| connection.execute <<-SQL } }
        CREATE INDEX IF NOT EXISTS ai_#{type}_embeddings_9_1_search_bit ON ai_#{type}_embeddings
        USING hnsw ((binary_quantize(embeddings)::bit(556)) bit_hamming_ops)
        WHERE model_id = 9 AND strategy_id = 1;
      SQL

      let(:all_idx_names) do
        %w[topics posts document_fragments].reduce([]) do |memo, type|
          (1..8).each { |model_id| memo << "ai_#{type}_embeddings_#{model_id}_1_search_bit" }

          memo
        end
      end

      it "leaves them untouched" do
        expected_model_idxs =
          %w[topics posts document_fragments].reduce([]) do |memo, type|
            memo << "ai_#{type}_embeddings_9_1_search_bit"
          end

        subject.up

        other_idxs =
          DB.query_single(
            "SELECT indexname FROM pg_indexes WHERE indexname IN (:names)",
            names: expected_model_idxs,
          )

        expect(other_idxs).to contain_exactly(*expected_model_idxs)
      end
    end
  end
end