From 5c1012766664c70eb1ff0937ed8cdd61afc6cf69 Mon Sep 17 00:00:00 2001 From: Roman Rizzi Date: Tue, 14 Jan 2025 15:31:52 -0300 Subject: [PATCH] PERF: Optimize backfill query to prevent statement timeouts --- ...0250114160417_backfill_topic_embeddings.rb | 18 +++++++++-------- ...20250114160446_backfill_post_embeddings.rb | 20 ++++++++++--------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/db/migrate/20250114160417_backfill_topic_embeddings.rb b/db/migrate/20250114160417_backfill_topic_embeddings.rb index d0a07f253..dcec61db4 100644 --- a/db/migrate/20250114160417_backfill_topic_embeddings.rb +++ b/db/migrate/20250114160417_backfill_topic_embeddings.rb @@ -6,16 +6,18 @@ def up loop do count = execute(<<~SQL).cmd_tuples INSERT INTO ai_topics_embeddings (topic_id, model_id, model_version, strategy_id, strategy_version, digest, embeddings, created_at, updated_at) - SELECT source.* - FROM ai_topic_embeddings source - WHERE NOT EXISTS ( - SELECT 1 - FROM ai_topics_embeddings target - WHERE target.model_id = source.model_id - AND target.strategy_id = source.strategy_id - AND target.topic_id = source.topic_id + SELECT source.* + FROM ( + SELECT old_table.* + FROM ai_topic_embeddings old_table + LEFT JOIN ai_topics_embeddings target ON ( + target.model_id = old_table.model_id AND + target.strategy_id = old_table.strategy_id AND + target.topic_id = old_table.topic_id ) + WHERE target.topic_id IS NULL LIMIT 10000 + ) source SQL break if count == 0 diff --git a/db/migrate/20250114160446_backfill_post_embeddings.rb b/db/migrate/20250114160446_backfill_post_embeddings.rb index 365d85162..0933e1977 100644 --- a/db/migrate/20250114160446_backfill_post_embeddings.rb +++ b/db/migrate/20250114160446_backfill_post_embeddings.rb @@ -7,17 +7,19 @@ def up loop do count = execute(<<~SQL).cmd_tuples - INSERT INTO ai_posts_embeddings (post_id, model_id, model_version, strategy_id, strategy_version, digest, embeddings, created_at, updated_at) - SELECT source.* - FROM ai_post_embeddings source - WHERE NOT EXISTS ( - SELECT 1 - FROM ai_posts_embeddings target - WHERE target.model_id = source.model_id - AND target.strategy_id = source.strategy_id - AND target.post_id = source.post_id + INSERT INTO ai_posts_embeddings (post_id, model_id, model_version, strategy_id, strategy_version, digest, embeddings, created_at, updated_at) + SELECT source.* + FROM ( + SELECT old_table.* + FROM ai_post_embeddings old_table + LEFT JOIN ai_posts_embeddings target ON ( + target.model_id = old_table.model_id AND + target.strategy_id = old_table.strategy_id AND + target.post_id = old_table.post_id ) + WHERE target.post_id IS NULL LIMIT 10000 + ) source SQL break if count == 0