Skip to content

Commit

Permalink
OK, this now mostly works; still need to extract the LLM selector
Browse files Browse the repository at this point in the history
  • Loading branch information
SamSaffron committed Feb 7, 2025
1 parent bdacc92 commit 95572ac
Show file tree
Hide file tree
Showing 5 changed files with 175 additions and 91 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def upload_file

def validate_extension!(filename)
extension = File.extname(filename)[1..-1] || ""
authorized_extensions = %w[txt md]
authorized_extensions = %w[txt md pdf png jpg jpeg]
if !authorized_extensions.include?(extension)
raise Discourse::InvalidParameters.new(
I18n.t(
Expand Down
26 changes: 26 additions & 0 deletions app/jobs/regular/digest_rag_upload.rb
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,32 @@ def first_chunk(text, chunk_tokens:, tokenizer:, splitters: ["\n\n", "\n", ".",
end

def get_uploaded_file(upload)
if upload.extension == "pdf"
pages =
DiscourseAi::Utils::PdfToImages.new(
upload: upload,
user: Discourse.system_user,
).uploaded_pages

return(
DiscourseAi::Utils::ImageToText.as_fake_file(
uploads: pages,
llm_model: LlmModel.find_by(display_name: "GPT-4o"),
user: Discourse.system_user,
)
)
end

if %w[png jpg jpeg].include?(upload.extension)
return(
DiscourseAi::Utils::ImageToText.as_fake_file(
uploads: [upload],
llm_model: LlmModel.find_by(display_name: "GPT-4o"),
user: Discourse.system_user,
)
)
end

store = Discourse.store
@file ||=
if store.external?
Expand Down
2 changes: 1 addition & 1 deletion assets/javascripts/discourse/components/rag-uploader.gjs
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ export default class RagUploader extends Component {
disabled={{this.uploading}}
type="file"
multiple="multiple"
accept=".txt,.md"
accept=".txt,.md,.pdf"
/>
<DButton
@label="discourse_ai.rag.uploads.button"
Expand Down
156 changes: 67 additions & 89 deletions lib/utils/pdf_to_text.rb → lib/utils/image_to_text.rb
Original file line number Diff line number Diff line change
@@ -1,110 +1,88 @@
# frozen_string_literal: true

class DiscourseAi::Utils::PdfToText
MAX_PDF_SIZE = 100.megabytes
MAX_CONVERT_SECONDS = 30
class DiscourseAi::Utils::ImageToText
BACKOFF_SECONDS = [5, 30, 60]

attr_reader :upload, :llm_model, :user
class Reader
def initialize(uploads:, llm_model:, user:)
@uploads = uploads
@llm_model = llm_model
@user = user
@buffer = +""

def initialize(upload:, llm_model:, user:)
@upload = upload
@llm_model = llm_model
@user = user
@uploaded_pages = UploadReference.where(target: upload).map(&:upload)
end
@to_process = uploads.dup
end

def extract_pages
temp_dir = File.join(Dir.tmpdir, "discourse-pdf-#{SecureRandom.hex(8)}")
FileUtils.mkdir_p(temp_dir)
# return nil if no more data
def read(length)
# for implementation simplicity we will process one image at a time
if !@buffer.empty?
part = @buffer.slice!(0, length)
return part
end

begin
pdf_path =
if upload.local?
Discourse.store.path_for(upload)
return nil if @to_process.empty?

upload = @to_process.shift
extractor =
DiscourseAi::Utils::ImageToText.new(upload: upload, llm_model: @llm_model, user: @user)
extractor.extract_text do |chunk, error|
if error
Discourse.warn_exception(
error,
message: "Discourse AI: Failed to extract text from image",
)
else
Discourse.store.download_safe(upload, max_file_size_kb: MAX_PDF_SIZE)&.path
# this introduces chunk markers so discourse rag ingestion requires no overlaps
@buffer << "\n[[metadata ]]\n"
@buffer << chunk
end
end

raise Discourse::InvalidParameters.new("Failed to download PDF") if pdf_path.nil?

temp_pdf = File.join(temp_dir, "source.pdf")
FileUtils.cp(pdf_path, temp_pdf)

# Convert PDF to individual page images
output_pattern = File.join(temp_dir, "page-%04d.png")

command = [
"magick",
"-density",
"300",
temp_pdf,
"-background",
"white",
"-auto-orient",
"-quality",
"85",
output_pattern,
]

Discourse::Utils.execute_command(
*command,
failure_message: "Failed to convert PDF to images",
timeout: MAX_CONVERT_SECONDS,
)

uploads = []
Dir
.glob(File.join(temp_dir, "page-*.png"))
.sort
.each do |page_path|
upload =
UploadCreator.new(File.open(page_path), "page-#{File.basename(page_path)}").create_for(
@user.id,
)

uploads << upload
end
read(length)
end
end

# Create upload references
UploadReference.ensure_exist!(upload_ids: uploads.map(&:id), target: @upload)
def self.as_fake_file(uploads:, llm_model:, user:)
# given our implementation for extracting text expect a file, return a simple object that can simulate read(size)
# and stream content
Reader.new(uploads: uploads, llm_model: llm_model, user: user)
end

@uploaded_pages = uploads
ensure
FileUtils.rm_rf(temp_dir) if Dir.exist?(temp_dir)
end
attr_reader :upload, :llm_model, :user

def initialize(upload:, llm_model:, user:)
@upload = upload
@llm_model = llm_model
@user = user
end

def extract_text(uploads: nil, retries: 3)
def extract_text(retries: 3)
uploads ||= @uploaded_pages

raise "must specify a block" if !block_given?
uploads
.map do |upload|
extracted = nil
error = nil

backoff = BACKOFF_SECONDS.dup

retries.times do
seconds = nil
begin
extracted = extract_text_from_page(upload)
break
rescue => e
error = e
seconds = backoff.shift || seconds
sleep(seconds)
end
end
if extracted
extracted.each { |chunk| yield(chunk, upload) }
else
yield(nil, upload, error)
end
extracted || []
extracted = nil
error = nil

backoff = BACKOFF_SECONDS.dup

retries.times do
seconds = nil
begin
extracted = extract_text_from_page(upload)
break
rescue => e
error = e
seconds = backoff.shift || seconds
sleep(seconds)
end
.flatten
end
if extracted
extracted.each { |chunk| yield(chunk) }
else
yield(nil, error)
end
extracted || []
end

private
Expand Down
80 changes: 80 additions & 0 deletions lib/utils/pdf_to_images.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# frozen_string_literal: true

# Converts a PDF upload into one PNG upload per page (via ImageMagick),
# creating UploadReference rows so subsequent runs can reuse the pages
# instead of re-converting.
class DiscourseAi::Utils::PdfToImages
  # Largest PDF we are willing to pull down from external storage.
  MAX_PDF_SIZE = 100.megabytes
  # Hard timeout for the ImageMagick conversion command.
  MAX_CONVERT_SECONDS = 30
  # NOTE(review): not referenced anywhere in this class — looks like a
  # leftover from the PdfToText → ImageToText split; kept (frozen) in case
  # external code reads it. Consider deleting once confirmed unused.
  BACKOFF_SECONDS = [5, 30, 60].freeze

  attr_reader :upload, :user

  # @param upload [Upload] the source PDF upload
  # @param user [User] user the generated page uploads are created for
  def initialize(upload:, user:)
    @upload = upload
    @user = user
    # Reuse pages extracted by a previous run; `presence` coerces an empty
    # result to nil so `uploaded_pages` will trigger extraction instead.
    @uploaded_pages = UploadReference.where(target: upload).map(&:upload).presence
  end

  # Memoized accessor: returns the page uploads, converting on first call.
  # @return [Array<Upload>]
  def uploaded_pages
    @uploaded_pages ||= extract_pages
  end

  # Renders every PDF page to a PNG, creates an Upload per page, and links
  # them to the source PDF via UploadReference.
  #
  # @return [Array<Upload>] the per-page uploads, in page order
  # @raise [Discourse::InvalidParameters] when the PDF cannot be downloaded
  def extract_pages
    temp_dir = File.join(Dir.tmpdir, "discourse-pdf-#{SecureRandom.hex(8)}")
    FileUtils.mkdir_p(temp_dir)

    begin
      pdf_path =
        if upload.local?
          Discourse.store.path_for(upload)
        else
          Discourse.store.download_safe(upload, max_file_size_kb: MAX_PDF_SIZE)&.path
        end

      raise Discourse::InvalidParameters.new("Failed to download PDF") if pdf_path.nil?

      # Work on a copy so the original store file is never touched.
      temp_pdf = File.join(temp_dir, "source.pdf")
      FileUtils.cp(pdf_path, temp_pdf)

      # ImageMagick expands %04d to a zero-padded page index, which keeps
      # the later glob + sort in page order.
      output_pattern = File.join(temp_dir, "page-%04d.png")

      command = [
        "magick",
        "-density",
        "300",
        temp_pdf,
        "-background",
        "white",
        "-auto-orient",
        "-quality",
        "85",
        output_pattern,
      ]

      Discourse::Utils.execute_command(
        *command,
        failure_message: "Failed to convert PDF to images",
        timeout: MAX_CONVERT_SECONDS,
      )

      pages =
        Dir
          .glob(File.join(temp_dir, "page-*.png"))
          .sort
          .map do |page_path|
            # Block form closes the file handle once the upload is created
            # (the original leaked one descriptor per page). The basename is
            # already "page-NNNN.png", so no extra prefix is added (the
            # original produced "page-page-NNNN.png").
            File.open(page_path) do |file|
              UploadCreator.new(file, File.basename(page_path)).create_for(user.id)
            end
          end

      # Link the page uploads to the source PDF so future runs find them.
      UploadReference.ensure_exist!(upload_ids: pages.map(&:id), target: upload)

      @uploaded_pages = pages
    ensure
      FileUtils.rm_rf(temp_dir) if Dir.exist?(temp_dir)
    end
  end
end

0 comments on commit 95572ac

Please sign in to comment.