diff --git a/db/migrate/20250125162658_fix_broken_open_ai_embeddings_config.rb b/db/migrate/20250125162658_fix_broken_open_ai_embeddings_config.rb
new file mode 100644
index 000000000..32b6ab80e
--- /dev/null
+++ b/db/migrate/20250125162658_fix_broken_open_ai_embeddings_config.rb
@@ -0,0 +1,107 @@
+# frozen_string_literal: true
+
+class FixBrokenOpenAiEmbeddingsConfig < ActiveRecord::Migration[7.2]
+  def up
+    return if fetch_setting("ai_embeddings_selected_model").present?
+
+    return if DB.query_single("SELECT COUNT(*) FROM embedding_definitions").first > 0
+
+    open_ai_models = %w[text-embedding-3-large text-embedding-3-small text-embedding-ada-002]
+    current_model = fetch_setting("ai_embeddings_model")
+    return if !open_ai_models.include?(current_model)
+
+    endpoint = fetch_setting("ai_openai_embeddings_url") || "https://api.openai.com/v1/embeddings"
+    api_key = fetch_setting("ai_openai_api_key")
+    return if api_key.blank?
+
+    attrs = {
+      display_name: current_model,
+      url: endpoint,
+      api_key: api_key,
+      provider: "open_ai",
+    }.merge(model_attrs(current_model))
+
+    persist_config(attrs)
+  end
+
+  def fetch_setting(name)
+    DB.query_single(
+      "SELECT value FROM site_settings WHERE name = :setting_name",
+      setting_name: name,
+    ).first || ENV["DISCOURSE_#{name&.upcase}"]
+  end
+
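+  # The ids hardcoded below match the model_id already stored with previously generated
+  # embeddings, so existing rows keep matching the new definition.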
+  def model_attrs(model_name)
+    if model_name == "text-embedding-3-large"
+      {
+        dimensions: 2000,
+        max_sequence_length: 8191,
+        id: 7,
+        pg_function: "<=>",
+        tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
+        matryoshka_dimensions: true,
+        provider_params: {
+          model_name: "text-embedding-3-large",
+        },
+      }
+    elsif model_name == "text-embedding-3-small"
+      {
+        dimensions: 1536,
+        max_sequence_length: 8191,
+        id: 6,
+        pg_function: "<=>",
+        tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
+        provider_params: {
+          model_name: "text-embedding-3-small",
+        },
+      }
+    else
+      {
+        dimensions: 1536,
+        max_sequence_length: 8191,
+        id: 2,
+        pg_function: "<=>",
+        tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
+        provider_params: {
+          model_name: "text-embedding-ada-002",
+        },
+      }
+    end
+  end
+
+  def persist_config(attrs)
+    DB.exec(
+      <<~SQL,
+        INSERT INTO embedding_definitions (id, display_name, dimensions, max_sequence_length, version, pg_function, provider, tokenizer_class, url, api_key, provider_params, matryoshka_dimensions, created_at, updated_at)
+        VALUES (:id, :display_name, :dimensions, :max_sequence_length, 1, :pg_function, :provider, :tokenizer_class, :url, :api_key, :provider_params, :matryoshka_dimensions, :now, :now)
+      SQL
+      id: attrs[:id],
+      display_name: attrs[:display_name],
+      dimensions: attrs[:dimensions],
+      max_sequence_length: attrs[:max_sequence_length],
+      pg_function: attrs[:pg_function],
+      provider: attrs[:provider],
+      tokenizer_class: attrs[:tokenizer_class],
+      url: attrs[:url],
+      api_key: attrs[:api_key],
+      provider_params: attrs[:provider_params]&.to_json,
+      matryoshka_dimensions: !!attrs[:matryoshka_dimensions],
+      now: Time.zone.now,
+    )
+
+    # We hardcoded the ID to match with already generated embeddings. Let's restart the seq to avoid conflicts.
+    DB.exec(
+      "ALTER SEQUENCE embedding_definitions_id_seq RESTART WITH :new_seq",
+      new_seq: attrs[:id].to_i + 1,
+    )
+
+    DB.exec(<<~SQL, new_value: attrs[:id])
+      INSERT INTO site_settings(name, data_type, value, created_at, updated_at)
+      VALUES ('ai_embeddings_selected_model', 3, ':new_value', NOW(), NOW())
+    SQL
+  end
+
+  def down
+    raise ActiveRecord::IrreversibleMigration
+  end
+end
diff --git a/db/migrate/20250127145305_clean_unused_embedding_search_indexes.rb b/db/migrate/20250127145305_clean_unused_embedding_search_indexes.rb
new file mode 100644
index 000000000..9c6e0d177
--- /dev/null
+++ b/db/migrate/20250127145305_clean_unused_embedding_search_indexes.rb
@@ -0,0 +1,61 @@
+# frozen_string_literal: true
+class CleanUnusedEmbeddingSearchIndexes < ActiveRecord::Migration[7.2]
+  def up
+    existing_definitions =
+      DB.query("SELECT id, dimensions FROM embedding_definitions WHERE id <= 8")
+
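+    # Drop the pre-seeded binary-quantized search indexes whose model id no longer has a
+    # definition or whose dimensions changed; the mismatched ones are recreated below.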
+    drop_statements =
+      (1..8)
+        .reduce([]) do |memo, model_id|
+          model = existing_definitions.find { |ed| ed&.id == model_id }
+
+          if model.blank? || !correctly_indexed?(model)
+            embedding_tables.each do |type|
+              memo << "DROP INDEX IF EXISTS ai_#{type}_embeddings_#{model_id}_1_search_bit;"
+            end
+          end
+
+          memo
+        end
+        .join("\n")
+
+    DB.exec(drop_statements) if drop_statements.present?
+
+    amend_statements =
+      (1..8)
+        .reduce([]) do |memo, model_id|
+          model = existing_definitions.find { |ed| ed&.id == model_id }
+
+          memo << amended_idxs(model) if model.present? && !correctly_indexed?(model)
+
+          memo
+        end
+        .join("\n")
+
+    DB.exec(amend_statements) if amend_statements.present?
+  end
+
+  def embedding_tables
+    %w[topics posts document_fragments]
+  end
+
+  def amended_idxs(model)
+    embedding_tables.map { |t| <<~SQL }.join("\n")
+      CREATE INDEX IF NOT EXISTS ai_#{t}_embeddings_#{model.id}_1_search_bit ON ai_#{t}_embeddings
+      USING hnsw ((binary_quantize(embeddings)::bit(#{model.dimensions})) bit_hamming_ops)
+      WHERE model_id = #{model.id} AND strategy_id = 1;
+    SQL
+  end
+
+  def correctly_indexed?(edef)
+    seeded_dimensions[edef.id] == edef.dimensions
+  end
+
+  def seeded_dimensions
+    { 1 => 768, 2 => 1536, 3 => 1024, 4 => 1024, 5 => 768, 6 => 1536, 7 => 2000, 8 => 1024 }
+  end
+
+  def down
+    raise ActiveRecord::IrreversibleMigration
+  end
+end
diff --git a/spec/db/migrate/20250125162658_fix_broken_open_ai_embeddings_config_spec.rb b/spec/db/migrate/20250125162658_fix_broken_open_ai_embeddings_config_spec.rb
new file mode 100644
index 000000000..1c621a7be
--- /dev/null
+++ b/spec/db/migrate/20250125162658_fix_broken_open_ai_embeddings_config_spec.rb
@@ -0,0 +1,73 @@
+# frozen_string_literal: true
+
+require "rails_helper"
+require Rails.root.join(
+  "plugins/discourse-ai/db/migrate/20250125162658_fix_broken_open_ai_embeddings_config",
+)
+
+RSpec.describe FixBrokenOpenAiEmbeddingsConfig do
+  let(:connection) { ActiveRecord::Base.connection }
+
+  def store_setting(name, val)
+    connection.execute <<~SQL
+      INSERT INTO site_settings(name, data_type, value, created_at, updated_at)
+      VALUES ('#{name}', 3, '#{val}', NOW(), NOW())
+    SQL
+  end
+
+  def configured_model_id
+    DB.query_single(
+      "SELECT value FROM site_settings WHERE name = 'ai_embeddings_selected_model'",
+    ).first
+  end
+
+  describe "#up" do
+    context "when embeddings are already configured" do
+      fab!(:embedding_definition)
+
+      before { store_setting("ai_embeddings_selected_model", embedding_definition.id) }
+
+      it "does nothing" do
+        subject.up
+
+        expect(configured_model_id).to eq(embedding_definition.id.to_s)
+      end
+    end
+
+    context "when there is no previous config" do
+      it "does nothing" do
+        subject.up
+
+        expect(configured_model_id).to be_blank
+      end
+    end
+
+    context "when things are not fully configured" do
+      before do
+        store_setting("ai_embeddings_model", "text-embedding-3-large")
+        store_setting("ai_openai_api_key", "")
+      end
+
+      it "does nothing" do
+        subject.up
+
+        expect(configured_model_id).to be_blank
+      end
+    end
+
+    context "when we have a configuration that previously failed to copy" do
+      before do
+        store_setting("ai_embeddings_model", "text-embedding-3-large")
+        store_setting("ai_openai_api_key", "123")
+      end
+
+      it "copies the config" do
+        subject.up
+
+        embedding_def = EmbeddingDefinition.last
+
+        expect(configured_model_id).to eq(embedding_def.id.to_s)
+      end
+    end
+  end
+end
diff --git a/spec/db/migrate/20250127145305_clean_unused_embedding_search_indexes_spec.rb b/spec/db/migrate/20250127145305_clean_unused_embedding_search_indexes_spec.rb
new file mode 100644
index 000000000..2278692f9
--- /dev/null
+++ b/spec/db/migrate/20250127145305_clean_unused_embedding_search_indexes_spec.rb
@@ -0,0 +1,136 @@
+# frozen_string_literal: true
+
+require "rails_helper"
+require Rails.root.join(
+  "plugins/discourse-ai/db/migrate/20250127145305_clean_unused_embedding_search_indexes",
+)
+
+RSpec.describe CleanUnusedEmbeddingSearchIndexes do
+  let(:connection) { ActiveRecord::Base.connection }
+
+  describe "#up" do
+    before do
+      # Copied from 20241008054440_create_binary_indexes_for_embeddings
+      %w[topics posts document_fragments].each do |type|
+        # our supported embedding model IDs and dimensions
+        [
+          [1, 768],
+          [2, 1536],
+          [3, 1024],
+          [4, 1024],
+          [5, 768],
+          [6, 1536],
+          [7, 2000],
+          [8, 1024],
+        ].each { |model_id, dimensions| connection.execute <<-SQL }
+          CREATE INDEX IF NOT EXISTS ai_#{type}_embeddings_#{model_id}_1_search_bit ON ai_#{type}_embeddings
+          USING hnsw ((binary_quantize(embeddings)::bit(#{dimensions})) bit_hamming_ops)
+          WHERE model_id = #{model_id} AND strategy_id = 1;
+        SQL
+      end
+    end
+
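+    # Every bit-search index name the before block creates, across the three embedding tables.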
+    let(:all_idx_names) do
+      %w[topics posts document_fragments].reduce([]) do |memo, type|
+        (1..8).each { |model_id| memo << "ai_#{type}_embeddings_#{model_id}_1_search_bit" }
+
+        memo
+      end
+    end
+
+    context "when there are no embedding definitions" do
+      it "removes all indexes" do
+        subject.up
+
+        remaining_idxs =
+          DB.query_single(
+            "SELECT indexname FROM pg_indexes WHERE indexname IN (:names)",
+            names: all_idx_names,
+          )
+
+        expect(remaining_idxs).to be_empty
+      end
+    end
+
+    context "when there is an embedding definition with the same dimensions" do
+      fab!(:embedding_def) { Fabricate(:embedding_definition, id: 2, dimensions: 1536) }
+
+      it "keeps the matching index and removes the rest" do
+        expected_model_idxs =
+          %w[topics posts document_fragments].reduce([]) do |memo, type|
+            memo << "ai_#{type}_embeddings_2_1_search_bit"
+          end
+
+        subject.up
+
+        remaining_idxs =
+          DB.query_single(
+            "SELECT indexname FROM pg_indexes WHERE indexname IN (:names)",
+            names: all_idx_names,
+          )
+
+        expect(remaining_idxs).to contain_exactly(*expected_model_idxs)
+        # This method checks dimensions are correct.
+        expect(DiscourseAi::Embeddings::Schema.correctly_indexed?(embedding_def)).to eq(true)
+      end
+    end
+
+    context "when there is an embedding definition with different dimensions" do
+      fab!(:embedding_def) { Fabricate(:embedding_definition, id: 2, dimensions: 1536) }
+      fab!(:embedding_def_2) { Fabricate(:embedding_definition, id: 3, dimensions: 768) }
+
+      it "updates the index to use the right dimensions" do
+        expected_model_idxs =
+          %w[topics posts document_fragments].reduce([]) do |memo, type|
+            memo << "ai_#{type}_embeddings_2_1_search_bit"
+            memo << "ai_#{type}_embeddings_3_1_search_bit"
+          end
+
+        subject.up
+
+        remaining_idxs =
+          DB.query_single(
+            "SELECT indexname FROM pg_indexes WHERE indexname IN (:names)",
+            names: all_idx_names,
+          )
+
+        expect(remaining_idxs).to contain_exactly(*expected_model_idxs)
+        # This method checks dimensions are correct.
+        expect(DiscourseAi::Embeddings::Schema.correctly_indexed?(embedding_def_2)).to eq(true)
+      end
+    end
+
+    context "when there are indexes outside the pre-seeded range" do
+      before { %w[topics posts document_fragments].each { |type| connection.execute <<-SQL } }
+        CREATE INDEX IF NOT EXISTS ai_#{type}_embeddings_9_1_search_bit ON ai_#{type}_embeddings
+        USING hnsw ((binary_quantize(embeddings)::bit(556)) bit_hamming_ops)
+        WHERE model_id = 9 AND strategy_id = 1;
+      SQL
+
+      let(:all_idx_names) do
+        %w[topics posts document_fragments].reduce([]) do |memo, type|
+          (1..8).each { |model_id| memo << "ai_#{type}_embeddings_#{model_id}_1_search_bit" }
+
+          memo
+        end
+      end
+
+      it "leaves them untouched" do
+        expected_model_idxs =
+          %w[topics posts document_fragments].reduce([]) do |memo, type|
+            memo << "ai_#{type}_embeddings_9_1_search_bit"
+          end
+
+        subject.up
+
+        other_idxs =
+          DB.query_single(
+            "SELECT indexname FROM pg_indexes WHERE indexname IN (:names)",
+            names: expected_model_idxs,
+          )
+
+        expect(other_idxs).to contain_exactly(*expected_model_idxs)
+      end
+    end
+  end
+end