diff --git a/app/jobs/regular/fast_track_topic_gist.rb b/app/jobs/regular/fast_track_topic_gist.rb index 8885bee30..a20d18179 100644 --- a/app/jobs/regular/fast_track_topic_gist.rb +++ b/app/jobs/regular/fast_track_topic_gist.rb @@ -16,7 +16,7 @@ def execute(args) gist = summarizer.existing_summary return if gist.present? && (!gist.outdated || gist.created_at >= 5.minutes.ago) - summarizer.force_summarize(Discourse.system_user) + summarizer.summarize(Discourse.system_user) end end end diff --git a/app/jobs/scheduled/summaries_backfill.rb b/app/jobs/scheduled/summaries_backfill.rb index 34ea2d532..10dcc7195 100644 --- a/app/jobs/scheduled/summaries_backfill.rb +++ b/app/jobs/scheduled/summaries_backfill.rb @@ -18,7 +18,8 @@ def execute(_args) backfill_candidates(gist_t) .limit(current_budget(gist_t)) .each do |topic| - DiscourseAi::Summarization.topic_gist(topic).force_summarize(system_user) + strategy = DiscourseAi::Summarization.topic_gist(topic) + try_summarize(strategy, system_user, topic) end end @@ -26,10 +27,25 @@ def execute(_args) backfill_candidates(complete_t) .limit(current_budget(complete_t)) .each do |topic| - DiscourseAi::Summarization.topic_summary(topic).force_summarize(system_user) + strategy = DiscourseAi::Summarization.topic_summary(topic) + try_summarize(strategy, system_user, topic) end end + def try_summarize(strategy, user, topic) + existing_summary = strategy.existing_summary + + if existing_summary.blank? || existing_summary.outdated + strategy.summarize(user) + else + # Hiding or deleting a post, and creating a small action alters the Topic#highest_post_number. + # We use this as a quick way to select potential backfill candidates without relying on original_content_sha. + # At this point, we are confident the summary doesn't need to be regenerated so something other than a regular reply + # caused the number to change in the topic. + existing_summary.update!(highest_target_number: topic.highest_post_number) + end + end + def backfill_candidates(summary_type) max_age_days = SiteSetting.ai_summary_backfill_topic_max_age_days @@ -45,12 +61,12 @@ def backfill_candidates(summary_type) .where( <<~SQL, # (1..1) gets stored ad (1..2). ais.id IS NULL OR ( - UPPER(ais.content_range) < topics.highest_post_number + 1 - AND ais.created_at < (current_timestamp - INTERVAL '5 minutes') + ais.highest_target_number < topics.highest_post_number + AND ais.updated_at < (current_timestamp - INTERVAL '5 minutes') ) SQL ) - .order("ais.created_at DESC NULLS FIRST, topics.last_posted_at DESC") + .order("ais.updated_at DESC NULLS FIRST, topics.last_posted_at DESC") end def current_budget(type) diff --git a/app/models/ai_summary.rb b/app/models/ai_summary.rb index 30b23351b..458c5f181 100644 --- a/app/models/ai_summary.rb +++ b/app/models/ai_summary.rb @@ -1,6 +1,9 @@ # frozen_string_literal: true class AiSummary < ActiveRecord::Base + # TODO remove this line 01-3-2025 + self.ignored_columns = %i[content_range] + belongs_to :target, polymorphic: true enum :summary_type, { complete: 0, gist: 1 } @@ -15,14 +18,20 @@ def self.store!(strategy, llm_model, summary, og_content, human:) target_id: strategy.target.id, target_type: strategy.target.class.name, algorithm: llm_model.name, - content_range: (content_ids.first..content_ids.last), + highest_target_number: strategy.highest_target_number, summarized_text: summary, original_content_sha: build_sha(content_ids.join), summary_type: strategy.type, origin: !!human ? origins[:human] : origins[:system], }, unique_by: %i[target_id target_type summary_type], - update_only: %i[summarized_text original_content_sha algorithm origin content_range], + update_only: %i[ + summarized_text + original_content_sha + algorithm + origin + highest_target_number + ], ) .first .then { AiSummary.find_by(id: _1["id"]) } @@ -45,17 +54,17 @@ def outdated # # Table name: ai_summaries # -# id :bigint not null, primary key -# target_id :integer not null -# target_type :string not null -# content_range :int4range -# summarized_text :string not null -# original_content_sha :string not null -# algorithm :string not null -# created_at :datetime not null -# updated_at :datetime not null -# summary_type :integer default("complete"), not null -# origin :integer +# id :bigint not null, primary key +# target_id :integer not null +# target_type :string not null +# summarized_text :string not null +# original_content_sha :string not null +# algorithm :string not null +# created_at :datetime not null +# updated_at :datetime not null +# summary_type :integer default("complete"), not null +# origin :integer +# highest_target_number :integer not null # # Indexes # diff --git a/app/serializers/ai_topic_summary_serializer.rb b/app/serializers/ai_topic_summary_serializer.rb index 738f843e4..7ffff9e20 100644 --- a/app/serializers/ai_topic_summary_serializer.rb +++ b/app/serializers/ai_topic_summary_serializer.rb @@ -13,15 +13,6 @@ def can_regenerate end def new_posts_since_summary - # Postgres uses discrete range types for int4range, which means - # (1..2) is stored as (1...3). - # - # We use Range#max to work around this, which in the case above always returns 2. - # Be careful with using Range#end here, it could lead to unexpected results as: - # - # (1..2).end => 2 - # (1...3).end => 3 - - object.target.highest_post_number.to_i - object.content_range&.max.to_i + object.target.highest_post_number.to_i - object.highest_target_number.to_i end end diff --git a/db/migrate/20250115173456_add_highest_target_number_to_ai_summary.rb b/db/migrate/20250115173456_add_highest_target_number_to_ai_summary.rb new file mode 100644 index 000000000..9a208a3e9 --- /dev/null +++ b/db/migrate/20250115173456_add_highest_target_number_to_ai_summary.rb @@ -0,0 +1,14 @@ +# frozen_string_literal: true +class AddHighestTargetNumberToAiSummary < ActiveRecord::Migration[7.2] + def up + add_column :ai_summaries, :highest_target_number, :integer, null: false + + execute <<~SQL + UPDATE ai_summaries SET highest_target_number = GREATEST(UPPER(content_range) - 1, 1) + SQL + end + + def down + drop_column :ai_summaries, :highest_target_number + end +end diff --git a/db/post_migrate/20250115181147_drop_ai_summaries_content_range.rb b/db/post_migrate/20250115181147_drop_ai_summaries_content_range.rb new file mode 100644 index 000000000..726b08588 --- /dev/null +++ b/db/post_migrate/20250115181147_drop_ai_summaries_content_range.rb @@ -0,0 +1,12 @@ +# frozen_string_literal: true +class DropAiSummariesContentRange < ActiveRecord::Migration[7.2] + DROPPED_COLUMNS ||= { ai_summaries: %i[content_range] } + + def up + DROPPED_COLUMNS.each { |table, columns| Migration::ColumnDropper.execute_drop(table, columns) } + end + + def down + raise ActiveRecord::IrreversibleMigration + end +end diff --git a/lib/summarization/fold_content.rb b/lib/summarization/fold_content.rb index 32bf97791..831d311eb 100644 --- a/lib/summarization/fold_content.rb +++ b/lib/summarization/fold_content.rb @@ -21,6 +21,8 @@ def initialize(llm, strategy, persist_summaries: true) # @param &on_partial_blk { Block - Optional } - The passed block will get called with the LLM partial response alongside a cancel function. # Note: The block is only called with results of the final summary, not intermediate summaries. # + # This method doesn't care if we already have an up to date summary. It always regenerate. + # # @returns { AiSummary } - Resulting summary. def summarize(user, &on_partial_blk) base_summary = "" @@ -68,11 +70,6 @@ def delete_cached_summaries! AiSummary.where(target: strategy.target, summary_type: strategy.type).destroy_all end - def force_summarize(user, &on_partial_blk) - @persist_summaries = true - summarize(user, &on_partial_blk) - end - private attr_reader :persist_summaries diff --git a/lib/summarization/strategies/chat_messages.rb b/lib/summarization/strategies/chat_messages.rb index 1f3aad6d8..2f073d71a 100644 --- a/lib/summarization/strategies/chat_messages.rb +++ b/lib/summarization/strategies/chat_messages.rb @@ -8,6 +8,10 @@ def type AiSummary.summary_types[:complete] end + def highest_target_number + nil # We don't persist so we can return nil. + end + def initialize(target, since) super(target) @since = since diff --git a/lib/summarization/strategies/hot_topic_gists.rb b/lib/summarization/strategies/hot_topic_gists.rb index 1708f2b86..b3e888761 100644 --- a/lib/summarization/strategies/hot_topic_gists.rb +++ b/lib/summarization/strategies/hot_topic_gists.rb @@ -12,6 +12,10 @@ def feature "gists" end + def highest_target_number + target.highest_post_number + end + def targets_data op_post_number = 1 @@ -38,6 +42,7 @@ def targets_data .limit(20) .pluck(:post_number) end + posts_data = Post .where(topic_id: target.id) diff --git a/lib/summarization/strategies/topic_summary.rb b/lib/summarization/strategies/topic_summary.rb index c252fe33b..e97c391a9 100644 --- a/lib/summarization/strategies/topic_summary.rb +++ b/lib/summarization/strategies/topic_summary.rb @@ -8,6 +8,10 @@ def type AiSummary.summary_types[:complete] end + def highest_target_number + target.highest_post_number + end + def targets_data posts_data = (target.has_summary? ? best_replies : pick_selection).pluck( diff --git a/spec/fabricators/ai_summary_fabricator.rb b/spec/fabricators/ai_summary_fabricator.rb index 250ae943a..d596dbaba 100644 --- a/spec/fabricators/ai_summary_fabricator.rb +++ b/spec/fabricators/ai_summary_fabricator.rb @@ -7,6 +7,7 @@ target { Fabricate(:topic) } summary_type AiSummary.summary_types[:complete] origin AiSummary.origins[:human] + highest_target_number 1 end Fabricator(:topic_ai_gist, from: :ai_summary) do diff --git a/spec/jobs/scheduled/summaries_backfill_spec.rb b/spec/jobs/scheduled/summaries_backfill_spec.rb index 2764055dd..401f4d205 100644 --- a/spec/jobs/scheduled/summaries_backfill_spec.rb +++ b/spec/jobs/scheduled/summaries_backfill_spec.rb @@ -44,13 +44,13 @@ end it "ignores up to date summaries" do - Fabricate(:ai_summary, target: topic, content_range: (1..2)) + Fabricate(:ai_summary, target: topic, highest_target_number: 2, updated_at: 10.minutes.ago) expect(subject.backfill_candidates(type)).to be_empty end - it "ignores outdated summaries created less than five minutes ago" do - Fabricate(:ai_summary, target: topic, content_range: (1..1), created_at: 4.minutes.ago) + it "ignores outdated summaries updated less than five minutes ago" do + Fabricate(:ai_summary, target: topic, highest_target_number: 1, updated_at: 4.minutes.ago) expect(subject.backfill_candidates(type)).to be_empty end @@ -66,7 +66,7 @@ topic_2 = Fabricate(:topic, word_count: 200, last_posted_at: 2.minutes.ago, highest_post_number: 1) topic.update!(last_posted_at: 1.minute.ago) - Fabricate(:ai_summary, target: topic, created_at: 1.hour.ago, content_range: (1..1)) + Fabricate(:ai_summary, target: topic, updated_at: 1.hour.ago, highest_target_number: 1) expect(subject.backfill_candidates(type).map(&:id)).to contain_exactly(topic_2.id, topic.id) end @@ -84,13 +84,13 @@ topic_2 = Fabricate(:topic, word_count: 200, last_posted_at: 2.minutes.ago, highest_post_number: 1) topic.update!(last_posted_at: 1.minute.ago) - Fabricate(:ai_summary, target: topic, created_at: 3.hours.ago, content_range: (1..1)) - Fabricate(:topic_ai_gist, target: topic, created_at: 3.hours.ago, content_range: (1..1)) + Fabricate(:ai_summary, target: topic, updated_at: 3.hours.ago, highest_target_number: 1) + Fabricate(:topic_ai_gist, target: topic, updated_at: 3.hours.ago, highest_target_number: 1) summary_1 = "Summary of topic_2" gist_1 = "Gist of topic_2" - summary_2 = "Summary of topic" - gist_2 = "Gist of topic" + summary_2 = "Updated summary of topic" + gist_2 = "Updated gist of topic" DiscourseAi::Completions::Llm.with_prepared_responses( [gist_1, gist_2, summary_1, summary_2], @@ -100,6 +100,32 @@ expect(AiSummary.gist.find_by(target: topic_2).summarized_text).to eq(gist_1) expect(AiSummary.complete.find_by(target: topic).summarized_text).to eq(summary_2) expect(AiSummary.gist.find_by(target: topic).summarized_text).to eq(gist_2) + + # Queue has to be empty if we just generated all summaries + expect(subject.backfill_candidates(AiSummary.summary_types[:complete])).to be_empty + expect(subject.backfill_candidates(AiSummary.summary_types[:gist])).to be_empty + + # Queue still empty when they are up to date and time passes. + AiSummary.update_all(updated_at: 20.minutes.ago) + expect(subject.backfill_candidates(AiSummary.summary_types[:complete])).to be_empty + expect(subject.backfill_candidates(AiSummary.summary_types[:gist])).to be_empty + end + + it "updates the highest_target_number if the summary turned to be up to date" do + existing_summary = + Fabricate( + :ai_summary, + target: topic, + updated_at: 3.hours.ago, + highest_target_number: topic.highest_post_number, + ) + og_highest_post_number = topic.highest_post_number + topic.update!(highest_post_number: og_highest_post_number + 1) + + # No prepared responses here. We don't perform a completion call. + subject.execute({}) + + expect(existing_summary.reload.highest_target_number).to eq(og_highest_post_number + 1) end end end diff --git a/spec/lib/guardian_extensions_spec.rb b/spec/lib/guardian_extensions_spec.rb index 589b362a2..7e3bb6f0a 100644 --- a/spec/lib/guardian_extensions_spec.rb +++ b/spec/lib/guardian_extensions_spec.rb @@ -26,13 +26,7 @@ end it "returns true if there is a cached summary" do - AiSummary.create!( - target: topic, - summarized_text: "test", - original_content_sha: "123", - algorithm: "test", - summary_type: AiSummary.summary_types[:complete], - ) + Fabricate(:ai_summary, target: topic) expect(guardian.can_see_summary?(topic)).to eq(true) end @@ -66,13 +60,7 @@ end it "returns true for anons when there is a cached summary" do - AiSummary.create!( - target: topic, - summarized_text: "test", - original_content_sha: "123", - algorithm: "test", - summary_type: AiSummary.summary_types[:complete], - ) + Fabricate(:ai_summary, target: topic) expect(guardian.can_see_summary?(topic)).to eq(true) end diff --git a/spec/requests/summarization/summary_controller_spec.rb b/spec/requests/summarization/summary_controller_spec.rb index 493d7c856..01ab28bdb 100644 --- a/spec/requests/summarization/summary_controller_spec.rb +++ b/spec/requests/summarization/summary_controller_spec.rb @@ -13,15 +13,7 @@ context "when streaming" do it "return a cached summary with json payload and does not trigger job if it exists" do - section = - AiSummary.create!( - target: topic, - summarized_text: "test", - algorithm: "test", - original_content_sha: "test", - summary_type: AiSummary.summary_types[:complete], - ) - + summary = Fabricate(:ai_summary, target: topic) sign_in(Fabricate(:admin)) get "/discourse-ai/summarization/t/#{topic.id}.json?stream=true" @@ -29,8 +21,10 @@ expect(response.status).to eq(200) expect(Jobs::StreamTopicAiSummary.jobs.size).to eq(0) - summary = response.parsed_body - expect(summary.dig("ai_topic_summary", "summarized_text")).to eq(section.summarized_text) + response_summary = response.parsed_body + expect(response_summary.dig("ai_topic_summary", "summarized_text")).to eq( + summary.summarized_text, + ) end end @@ -42,21 +36,15 @@ end it "returns a cached summary" do - section = - AiSummary.create!( - target: topic, - summarized_text: "test", - algorithm: "test", - original_content_sha: "test", - summary_type: AiSummary.summary_types[:complete], - ) - + summary = Fabricate(:ai_summary, target: topic) get "/discourse-ai/summarization/t/#{topic.id}.json" expect(response.status).to eq(200) - summary = response.parsed_body - expect(summary.dig("ai_topic_summary", "summarized_text")).to eq(section.summarized_text) + response_summary = response.parsed_body + expect(response_summary.dig("ai_topic_summary", "summarized_text")).to eq( + summary.summarized_text, + ) end end @@ -90,15 +78,15 @@ get "/discourse-ai/summarization/t/#{topic.id}.json" expect(response.status).to eq(200) - summary = response.parsed_body["ai_topic_summary"] - section = AiSummary.last - - expect(section.summarized_text).to eq(summary_text) - expect(summary["summarized_text"]).to eq(section.summarized_text) - expect(summary["algorithm"]).to eq("fake") - expect(summary["outdated"]).to eq(false) - expect(summary["can_regenerate"]).to eq(true) - expect(summary["new_posts_since_summary"]).to be_zero + response_summary = response.parsed_body["ai_topic_summary"] + summary = AiSummary.last + + expect(summary.summarized_text).to eq(summary_text) + expect(response_summary["summarized_text"]).to eq(summary.summarized_text) + expect(response_summary["algorithm"]).to eq("fake") + expect(response_summary["outdated"]).to eq(false) + expect(response_summary["can_regenerate"]).to eq(true) + expect(response_summary["new_posts_since_summary"]).to be_zero end end @@ -129,21 +117,16 @@ end it "returns a cached summary" do - section = - AiSummary.create!( - target: topic, - summarized_text: "test", - algorithm: "test", - original_content_sha: "test", - summary_type: AiSummary.summary_types[:complete], - ) + summary = Fabricate(:ai_summary, target: topic) get "/discourse-ai/summarization/t/#{topic.id}.json" expect(response.status).to eq(200) - summary = response.parsed_body - expect(summary.dig("ai_topic_summary", "summarized_text")).to eq(section.summarized_text) + response_summary = response.parsed_body + expect(response_summary.dig("ai_topic_summary", "summarized_text")).to eq( + summary.summarized_text, + ) end end end diff --git a/spec/services/discourse_ai/topic_summarization_spec.rb b/spec/services/discourse_ai/topic_summarization_spec.rb index eca5a13ef..a75dd527c 100644 --- a/spec/services/discourse_ai/topic_summarization_spec.rb +++ b/spec/services/discourse_ai/topic_summarization_spec.rb @@ -20,7 +20,7 @@ def assert_summary_is_cached(topic, summary_response) cached_summary = AiSummary.find_by(target: topic, summary_type: AiSummary.summary_types[:complete]) - expect(cached_summary.content_range).to cover(*topic.posts.map(&:post_number)) + expect(cached_summary.highest_target_number).to eq(topic.highest_post_number) expect(cached_summary.summarized_text).to eq(summary) expect(cached_summary.original_content_sha).to be_present expect(cached_summary.algorithm).to eq("fake") diff --git a/spec/system/summarization/topic_summarization_spec.rb b/spec/system/summarization/topic_summarization_spec.rb index 36caea2e6..45ffd7a66 100644 --- a/spec/system/summarization/topic_summarization_spec.rb +++ b/spec/system/summarization/topic_summarization_spec.rb @@ -16,6 +16,8 @@ let(:topic_page) { PageObjects::Pages::Topic.new } let(:summary_box) { PageObjects::Components::AiSummaryTrigger.new } + fab!(:ai_summary) { Fabricate(:ai_summary, target: topic, summarized_text: "This is a summary") } + before do group.add(current_user) @@ -27,16 +29,6 @@ end context "when a summary is cached" do - before do - AiSummary.create!( - target: topic, - summarized_text: summarization_result, - algorithm: "test", - original_content_sha: "test", - summary_type: AiSummary.summary_types[:complete], - ) - end - it "displays it" do topic_page.visit_topic(topic) summary_box.click_summarize @@ -45,15 +37,6 @@ end context "when a summary is outdated" do - before do - AiSummary.create!( - target: topic, - summarized_text: summarization_result, - algorithm: "test", - original_content_sha: "test", - summary_type: AiSummary.summary_types[:complete], - ) - end fab!(:new_post) do Fabricate( :post,