Skip to content

Commit

Permalink
FIX: Make summaries backfill job more resilient. (#1071)
Browse files Browse the repository at this point in the history
To quickly select backfill candidates without comparing SHAs, we compare the last summarized post to the topic's highest_post_number. However, hiding or deleting a post and adding a small action will update this column, causing the job to stall and re-generate the same summary repeatedly until someone posts a regular reply. On top of this, this is not always true for topics with `best_replies`, as this last reply isn't necessarily included.

Since this is not evident at first glance and each summarization strategy picks its targets differently, I'm opting to simplify the backfill logic and how we track potential candidates.

The first step is dropping `content_range`, which serves no purpose and it's there because summary caching was supposed to work differently at the beginning. So instead, I'm replacing it with a column called `highest_target_number`, which tracks `highest_post_number` for topics and could track other things like channel's `message_count` in the future.

Now that we have this column when selecting every potential backfill candidate, we'll check if the summary is truly outdated by comparing the SHAs, and if it's not, we just update the column and move on
  • Loading branch information
romanrizzi authored Jan 16, 2025
1 parent f9aa2de commit 46fcdb6
Show file tree
Hide file tree
Showing 16 changed files with 150 additions and 117 deletions.
2 changes: 1 addition & 1 deletion app/jobs/regular/fast_track_topic_gist.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def execute(args)
gist = summarizer.existing_summary
return if gist.present? && (!gist.outdated || gist.created_at >= 5.minutes.ago)

summarizer.force_summarize(Discourse.system_user)
summarizer.summarize(Discourse.system_user)
end
end
end
26 changes: 21 additions & 5 deletions app/jobs/scheduled/summaries_backfill.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,34 @@ def execute(_args)
backfill_candidates(gist_t)
.limit(current_budget(gist_t))
.each do |topic|
DiscourseAi::Summarization.topic_gist(topic).force_summarize(system_user)
strategy = DiscourseAi::Summarization.topic_gist(topic)
try_summarize(strategy, system_user, topic)
end
end

complete_t = AiSummary.summary_types[:complete]
backfill_candidates(complete_t)
.limit(current_budget(complete_t))
.each do |topic|
DiscourseAi::Summarization.topic_summary(topic).force_summarize(system_user)
strategy = DiscourseAi::Summarization.topic_summary(topic)
try_summarize(strategy, system_user, topic)
end
end

def try_summarize(strategy, user, topic)
existing_summary = strategy.existing_summary

if existing_summary.blank? || existing_summary.outdated
strategy.summarize(user)
else
# Hiding or deleting a post, and creating a small action alters the Topic#highest_post_number.
# We use this as a quick way to select potential backfill candidates without relying on original_content_sha.
# At this point, we are confident the summary doesn't need to be regenerated so something other than a regular reply
# caused the number to change in the topic.
existing_summary.update!(highest_target_number: topic.highest_post_number)
end
end

def backfill_candidates(summary_type)
max_age_days = SiteSetting.ai_summary_backfill_topic_max_age_days

Expand All @@ -45,12 +61,12 @@ def backfill_candidates(summary_type)
.where(
<<~SQL, # (1..1) gets stored ad (1..2).
ais.id IS NULL OR (
UPPER(ais.content_range) < topics.highest_post_number + 1
AND ais.created_at < (current_timestamp - INTERVAL '5 minutes')
ais.highest_target_number < topics.highest_post_number
AND ais.updated_at < (current_timestamp - INTERVAL '5 minutes')
)
SQL
)
.order("ais.created_at DESC NULLS FIRST, topics.last_posted_at DESC")
.order("ais.updated_at DESC NULLS FIRST, topics.last_posted_at DESC")
end

def current_budget(type)
Expand Down
35 changes: 22 additions & 13 deletions app/models/ai_summary.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# frozen_string_literal: true

class AiSummary < ActiveRecord::Base
# TODO remove this line 01-3-2025
self.ignored_columns = %i[content_range]

belongs_to :target, polymorphic: true

enum :summary_type, { complete: 0, gist: 1 }
Expand All @@ -15,14 +18,20 @@ def self.store!(strategy, llm_model, summary, og_content, human:)
target_id: strategy.target.id,
target_type: strategy.target.class.name,
algorithm: llm_model.name,
content_range: (content_ids.first..content_ids.last),
highest_target_number: strategy.highest_target_number,
summarized_text: summary,
original_content_sha: build_sha(content_ids.join),
summary_type: strategy.type,
origin: !!human ? origins[:human] : origins[:system],
},
unique_by: %i[target_id target_type summary_type],
update_only: %i[summarized_text original_content_sha algorithm origin content_range],
update_only: %i[
summarized_text
original_content_sha
algorithm
origin
highest_target_number
],
)
.first
.then { AiSummary.find_by(id: _1["id"]) }
Expand All @@ -45,17 +54,17 @@ def outdated
#
# Table name: ai_summaries
#
# id :bigint not null, primary key
# target_id :integer not null
# target_type :string not null
# content_range :int4range
# summarized_text :string not null
# original_content_sha :string not null
# algorithm :string not null
# created_at :datetime not null
# updated_at :datetime not null
# summary_type :integer default("complete"), not null
# origin :integer
# id :bigint not null, primary key
# target_id :integer not null
# target_type :string not null
# summarized_text :string not null
# original_content_sha :string not null
# algorithm :string not null
# created_at :datetime not null
# updated_at :datetime not null
# summary_type :integer default("complete"), not null
# origin :integer
# highest_target_number :integer not null
#
# Indexes
#
Expand Down
11 changes: 1 addition & 10 deletions app/serializers/ai_topic_summary_serializer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,6 @@ def can_regenerate
end

def new_posts_since_summary
# Postgres uses discrete range types for int4range, which means
# (1..2) is stored as (1...3).
#
# We use Range#max to work around this, which in the case above always returns 2.
# Be careful with using Range#end here, it could lead to unexpected results as:
#
# (1..2).end => 2
# (1...3).end => 3

object.target.highest_post_number.to_i - object.content_range&.max.to_i
object.target.highest_post_number.to_i - object.highest_target_number.to_i
end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# frozen_string_literal: true
class AddHighestTargetNumberToAiSummary < ActiveRecord::Migration[7.2]
def up
add_column :ai_summaries, :highest_target_number, :integer, null: false

execute <<~SQL
UPDATE ai_summaries SET highest_target_number = GREATEST(UPPER(content_range) - 1, 1)
SQL
end

def down
drop_column :ai_summaries, :highest_target_number
end
end
12 changes: 12 additions & 0 deletions db/post_migrate/20250115181147_drop_ai_summaries_content_range.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# frozen_string_literal: true
class DropAiSummariesContentRange < ActiveRecord::Migration[7.2]
DROPPED_COLUMNS ||= { ai_summaries: %i[content_range] }

def up
DROPPED_COLUMNS.each { |table, columns| Migration::ColumnDropper.execute_drop(table, columns) }
end

def down
raise ActiveRecord::IrreversibleMigration
end
end
7 changes: 2 additions & 5 deletions lib/summarization/fold_content.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ def initialize(llm, strategy, persist_summaries: true)
# @param &on_partial_blk { Block - Optional } - The passed block will get called with the LLM partial response alongside a cancel function.
# Note: The block is only called with results of the final summary, not intermediate summaries.
#
# This method doesn't care if we already have an up to date summary. It always regenerate.
#
# @returns { AiSummary } - Resulting summary.
def summarize(user, &on_partial_blk)
base_summary = ""
Expand Down Expand Up @@ -68,11 +70,6 @@ def delete_cached_summaries!
AiSummary.where(target: strategy.target, summary_type: strategy.type).destroy_all
end

def force_summarize(user, &on_partial_blk)
@persist_summaries = true
summarize(user, &on_partial_blk)
end

private

attr_reader :persist_summaries
Expand Down
4 changes: 4 additions & 0 deletions lib/summarization/strategies/chat_messages.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ def type
AiSummary.summary_types[:complete]
end

def highest_target_number
nil # We don't persist so we can return nil.
end

def initialize(target, since)
super(target)
@since = since
Expand Down
5 changes: 5 additions & 0 deletions lib/summarization/strategies/hot_topic_gists.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ def feature
"gists"
end

def highest_target_number
target.highest_post_number
end

def targets_data
op_post_number = 1

Expand All @@ -38,6 +42,7 @@ def targets_data
.limit(20)
.pluck(:post_number)
end

posts_data =
Post
.where(topic_id: target.id)
Expand Down
4 changes: 4 additions & 0 deletions lib/summarization/strategies/topic_summary.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ def type
AiSummary.summary_types[:complete]
end

def highest_target_number
target.highest_post_number
end

def targets_data
posts_data =
(target.has_summary? ? best_replies : pick_selection).pluck(
Expand Down
1 change: 1 addition & 0 deletions spec/fabricators/ai_summary_fabricator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
target { Fabricate(:topic) }
summary_type AiSummary.summary_types[:complete]
origin AiSummary.origins[:human]
highest_target_number 1
end

Fabricator(:topic_ai_gist, from: :ai_summary) do
Expand Down
42 changes: 34 additions & 8 deletions spec/jobs/scheduled/summaries_backfill_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,13 @@
end

it "ignores up to date summaries" do
Fabricate(:ai_summary, target: topic, content_range: (1..2))
Fabricate(:ai_summary, target: topic, highest_target_number: 2, updated_at: 10.minutes.ago)

expect(subject.backfill_candidates(type)).to be_empty
end

it "ignores outdated summaries created less than five minutes ago" do
Fabricate(:ai_summary, target: topic, content_range: (1..1), created_at: 4.minutes.ago)
it "ignores outdated summaries updated less than five minutes ago" do
Fabricate(:ai_summary, target: topic, highest_target_number: 1, updated_at: 4.minutes.ago)

expect(subject.backfill_candidates(type)).to be_empty
end
Expand All @@ -66,7 +66,7 @@
topic_2 =
Fabricate(:topic, word_count: 200, last_posted_at: 2.minutes.ago, highest_post_number: 1)
topic.update!(last_posted_at: 1.minute.ago)
Fabricate(:ai_summary, target: topic, created_at: 1.hour.ago, content_range: (1..1))
Fabricate(:ai_summary, target: topic, updated_at: 1.hour.ago, highest_target_number: 1)

expect(subject.backfill_candidates(type).map(&:id)).to contain_exactly(topic_2.id, topic.id)
end
Expand All @@ -84,13 +84,13 @@
topic_2 =
Fabricate(:topic, word_count: 200, last_posted_at: 2.minutes.ago, highest_post_number: 1)
topic.update!(last_posted_at: 1.minute.ago)
Fabricate(:ai_summary, target: topic, created_at: 3.hours.ago, content_range: (1..1))
Fabricate(:topic_ai_gist, target: topic, created_at: 3.hours.ago, content_range: (1..1))
Fabricate(:ai_summary, target: topic, updated_at: 3.hours.ago, highest_target_number: 1)
Fabricate(:topic_ai_gist, target: topic, updated_at: 3.hours.ago, highest_target_number: 1)

summary_1 = "Summary of topic_2"
gist_1 = "Gist of topic_2"
summary_2 = "Summary of topic"
gist_2 = "Gist of topic"
summary_2 = "Updated summary of topic"
gist_2 = "Updated gist of topic"

DiscourseAi::Completions::Llm.with_prepared_responses(
[gist_1, gist_2, summary_1, summary_2],
Expand All @@ -100,6 +100,32 @@
expect(AiSummary.gist.find_by(target: topic_2).summarized_text).to eq(gist_1)
expect(AiSummary.complete.find_by(target: topic).summarized_text).to eq(summary_2)
expect(AiSummary.gist.find_by(target: topic).summarized_text).to eq(gist_2)

# Queue has to be empty if we just generated all summaries
expect(subject.backfill_candidates(AiSummary.summary_types[:complete])).to be_empty
expect(subject.backfill_candidates(AiSummary.summary_types[:gist])).to be_empty

# Queue still empty when they are up to date and time passes.
AiSummary.update_all(updated_at: 20.minutes.ago)
expect(subject.backfill_candidates(AiSummary.summary_types[:complete])).to be_empty
expect(subject.backfill_candidates(AiSummary.summary_types[:gist])).to be_empty
end

it "updates the highest_target_number if the summary turned to be up to date" do
existing_summary =
Fabricate(
:ai_summary,
target: topic,
updated_at: 3.hours.ago,
highest_target_number: topic.highest_post_number,
)
og_highest_post_number = topic.highest_post_number
topic.update!(highest_post_number: og_highest_post_number + 1)

# No prepared responses here. We don't perform a completion call.
subject.execute({})

expect(existing_summary.reload.highest_target_number).to eq(og_highest_post_number + 1)
end
end
end
16 changes: 2 additions & 14 deletions spec/lib/guardian_extensions_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,7 @@
end

it "returns true if there is a cached summary" do
AiSummary.create!(
target: topic,
summarized_text: "test",
original_content_sha: "123",
algorithm: "test",
summary_type: AiSummary.summary_types[:complete],
)
Fabricate(:ai_summary, target: topic)

expect(guardian.can_see_summary?(topic)).to eq(true)
end
Expand Down Expand Up @@ -66,13 +60,7 @@
end

it "returns true for anons when there is a cached summary" do
AiSummary.create!(
target: topic,
summarized_text: "test",
original_content_sha: "123",
algorithm: "test",
summary_type: AiSummary.summary_types[:complete],
)
Fabricate(:ai_summary, target: topic)

expect(guardian.can_see_summary?(topic)).to eq(true)
end
Expand Down
Loading

0 comments on commit 46fcdb6

Please sign in to comment.