Skip to content

Commit

Permalink
FIX: Do batches for backfilling huge embeddings tables
Browse files Browse the repository at this point in the history
  • Loading branch information
romanrizzi committed Jan 14, 2025
1 parent 356ea77 commit 1a8b551
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 12 deletions.
24 changes: 17 additions & 7 deletions db/migrate/20250114160417_backfill_topic_embeddings.rb
Original file line number Diff line number Diff line change
@@ -1,14 +1,24 @@
# frozen_string_literal: true
class BackfillTopicEmbeddings < ActiveRecord::Migration[7.2]
def up
not_backfilled = DB.query_single("SELECT COUNT(*) FROM ai_topics_embeddings").first.to_i == 0
disable_ddl_transaction!

if not_backfilled
# Copy data from old tables to new tables
execute <<~SQL
INSERT INTO ai_topics_embeddings (topic_id, model_id, model_version, strategy_id, strategy_version, digest, embeddings, created_at, updated_at)
SELECT * FROM ai_topic_embeddings;
# Copies rows from ai_topic_embeddings into ai_topics_embeddings in batches.
#
# Every pass issues one INSERT ... SELECT capped at 10000 rows, picking only
# source rows that have no row in the target table with the same model_id,
# strategy_id and topic_id. The statement is repeated until a pass reports
# zero affected rows.
def up
  inserted = nil

  until inserted == 0
    result = execute(<<~SQL)
      INSERT INTO ai_topics_embeddings (topic_id, model_id, model_version, strategy_id, strategy_version, digest, embeddings, created_at, updated_at)
      SELECT source.*
      FROM ai_topic_embeddings source
      WHERE NOT EXISTS (
      SELECT 1
      FROM ai_topics_embeddings target
      WHERE target.model_id = source.model_id
      AND target.strategy_id = source.strategy_id
      AND target.topic_id = source.topic_id
      )
      LIMIT 10000
    SQL

    # cmd_tuples is the number of rows the INSERT affected; 0 ends the loop.
    inserted = result.cmd_tuples
  end
end

Expand Down
22 changes: 17 additions & 5 deletions db/migrate/20250114160446_backfill_post_embeddings.rb
Original file line number Diff line number Diff line change
@@ -1,14 +1,26 @@
# frozen_string_literal: true
class BackfillPostEmbeddings < ActiveRecord::Migration[7.2]
disable_ddl_transaction!

# Backfills ai_posts_embeddings from ai_post_embeddings.
#
# NOTE(review): this span looks like rendered diff output in which lines of an
# earlier one-shot copy (guarded by not_backfilled) and lines of a batched
# replacement are interleaved without +/- markers -- confirm against the
# repository before treating it as runnable code.
#
# The batched form repeats an INSERT ... SELECT limited to 10000 rows, taking
# only source rows with no row in the target table for the same model_id,
# strategy_id and post_id, and stops once an INSERT reports zero affected rows
# (cmd_tuples == 0).
def up
not_backfilled = DB.query_single("SELECT COUNT(*) FROM ai_posts_embeddings").first.to_i == 0
# Copy data from old tables to new tables in batches.

if not_backfilled
# Copy data from old tables to new tables
execute <<~SQL
loop do
count = execute(<<~SQL).cmd_tuples
INSERT INTO ai_posts_embeddings (post_id, model_id, model_version, strategy_id, strategy_version, digest, embeddings, created_at, updated_at)
SELECT * FROM ai_post_embeddings;
SELECT source.*
FROM ai_post_embeddings source
WHERE NOT EXISTS (
SELECT 1
FROM ai_posts_embeddings target
WHERE target.model_id = source.model_id
AND target.strategy_id = source.strategy_id
AND target.post_id = source.post_id
)
LIMIT 10000
SQL

break if count == 0
end
end

Expand Down

0 comments on commit 1a8b551

Please sign in to comment.