This repository has been archived by the owner on Jan 5, 2025. It is now read-only.

Commit

Merge pull request #226 from lvalics/main
Read only the first 2000 characters
gharbat authored Jan 22, 2024
2 parents 8903d38 + 624c076 commit d01e9c2
Showing 14 changed files with 377 additions and 164 deletions.
59 changes: 48 additions & 11 deletions dj_backend_server/api/data_sources/pdf_handler.py
@@ -8,6 +8,7 @@
import requests
import traceback
from uuid import uuid4
from typing import Optional
from django.views.decorators.csrf import csrf_exempt
from django.utils import timezone
from django.shortcuts import get_object_or_404
@@ -25,21 +26,48 @@


@csrf_exempt
def pdf_handler(shared_folder: str, namespace: str, delete_folder_flag: bool):
def pdf_handler(shared_folder: str, namespace: str, delete_folder_flag: Optional[bool] = False, text_data: Optional[str] = None):
"""
    This function handles PDF files and other file types in a shared folder. If text data is provided directly, it is
    processed as-is; otherwise the function reads the files in the shared folder. Each file is handled by extension:
    .doc, .docx, .xls, and .xlsx files are converted to .txt, and .txt, .csv, and .json files are saved as .txt.
    The resulting text is then ingested into the vector database.
Args:
shared_folder (str): The name of the shared folder where the files are located.
namespace (str): The namespace for the vector database.
    delete_folder_flag (Optional[bool], optional): Whether to delete the folder after processing the files. Defaults to False.
text_data (Optional[str], optional): The text data to be processed. If this is provided, the function will not read from
the files. Defaults to None.
Raises:
Exception: If an error occurs during the processing of the files or the conversion of the text data to a vector database.
"""
print ("Debug: pdf_handler")
# If text data is provided directly, process it without reading from files
if text_data:
process_text_data(text_data, namespace)
print ("Debug: text_data is provided directly, process it without reading from files")
return

# Convert delete_folder_flag to boolean (send 0 - FALSE or 1 - TRUE)
delete_folder_flag = bool(int(delete_folder_flag))

delete_folder_flag = bool(delete_folder_flag) if delete_folder_flag is not None else False
# Check if the shared_folder is provided, if not, return early as there are no files to process
if not shared_folder:
print("No shared folder provided for file processing.")
return

try:
        # TODO: Revisit when there are multiple external PDF libraries to choose from.
if os.environ.get("PDF_LIBRARY") == "external":
directory_path = os.path.join("website_data_sources", shared_folder)
# print(f"Debug: Processing folder {directory_path}")
if shared_folder:
directory_path = os.path.join("website_data_sources", shared_folder)
print(f"Debug: Processing folder {directory_path}")

if os.path.exists(directory_path):
print(f"Debug: Directory exists. Files: {os.listdir(directory_path)}")
else:
print(f"Debug: Directory does not exist")
print(f"Debug: No shared folder provided for file processing.")
return

# Process each file in the directory based on its extension
for filename in os.listdir(directory_path):
@@ -163,14 +191,11 @@ def txt_to_vectordb(shared_folder: str, namespace: str, delete_folder_flag: bool

raw_docs = directory_loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=200, length_function=len)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)

docs = text_splitter.split_documents(raw_docs)

# print("docs -->", docs);
# print("docs -->", [doc.metadata for doc in docs])
# for doc in docs:
# print("Document content:", doc.page_content)
print("external files docs -->", docs);

if not docs:
print("No documents were processed successfully.")
@@ -232,3 +257,15 @@ def convert_to_txt(file_path):
json.dump(data, txt_file, ensure_ascii=False, indent=4)
else:
raise NotImplementedError(f"Conversion for {os.path.splitext(file_path)[1]} files to text not implemented yet.")

def process_text_data(text_data: str, namespace: str):
"""
Processes the provided text data and ingests it into the vector database.
Args:
text_data (str): The text data to process.
namespace (str): The namespace for the vector database.
"""
print ("Debug: process_text_data")
txt_to_vectordb(text_data, namespace, False)
return
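
As a usage note for this change: a minimal sketch of invoking the updated handler in both modes. The import path is inferred from this diff's file layout, and the namespace and folder name are hypothetical placeholders.

from api.data_sources.pdf_handler import pdf_handler  # path inferred from this diff

# Mode 1: ingest raw text directly; no folder is read.
pdf_handler(shared_folder="", namespace="my-bot",
            text_data="Plain text to embed into the vector store.")

# Mode 2: process files previously placed under website_data_sources/<folder>,
# deleting the folder afterwards.
pdf_handler(shared_folder="3f2a-example-folder", namespace="my-bot",
            delete_folder_flag=True)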
18 changes: 15 additions & 3 deletions dj_backend_server/api/data_sources/website_handler.py
@@ -15,6 +15,19 @@
from uuid import uuid4

def website_handler(shared_folder, namespace):
"""
This function handles the processing of text files in a shared folder. It loads the text files, splits the text into chunks,
    generates embeddings for the chunks, and initializes a vector store with the chunks and their embeddings. On success it
    marks the website data source 'COMPLETED'; on failure it marks it 'FAILED' and records the exception in a FailedJob object.
Args:
shared_folder (str): The ID of the WebsiteDataSource object and the name of the shared folder where the text files are located.
namespace (str): The namespace for the vector store.
Raises:
Exception: If an error occurs during the processing of the text files or the initialization of the vector store.
"""
website_data_source = WebsiteDataSource.objects.get(id=shared_folder)
try:
directory_path = os.path.join("website_data_sources", shared_folder)
@@ -26,15 +39,14 @@ def website_handler(shared_folder, namespace):

docs = text_splitter.split_documents(raw_docs)

print("docs -->", docs);
print("website docs -->", docs);
embeddings = get_embeddings()

init_vector_store(docs, embeddings, StoreOptions(namespace=namespace))

website_data_source.crawling_status = WebsiteDataSourceStatusType.COMPLETED.value
website_data_source.save()
# delete_folder(folder_path=directory_path)
print('All is done, folder deleted...')
print('Website embeddings done.')
except Exception as e:
website_data_source.crawling_status = WebsiteDataSourceStatusType.FAILED.value
website_data_source.save()
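
The docstring above describes a load → split → embed → store pipeline. Below is a condensed, hedged sketch of that flow; StoreOptions and the splitter settings appear in this diff, while the get_embeddings and init_vector_store import paths are inferred, not verified.

import os
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from api.interfaces import StoreOptions                     # import shown in this diff
from api.utils.init_vector_store import init_vector_store  # path inferred from this diff
from api.utils import get_embeddings                        # helper; path assumed

def ingest_folder(shared_folder: str, namespace: str) -> None:
    # Load the crawled .txt files for this data source.
    directory_path = os.path.join("website_data_sources", shared_folder)
    raw_docs = DirectoryLoader(directory_path, glob="**/*.txt").load()

    # Splitter settings matching the pdf_handler hunk above: 1000-char chunks, 200 overlap.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200,
                                              length_function=len)
    docs = splitter.split_documents(raw_docs)

    # Embed the chunks and write them into the configured vector store.
    embeddings = get_embeddings()
    init_vector_store(docs, embeddings, StoreOptions(namespace=namespace))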
32 changes: 11 additions & 21 deletions dj_backend_server/api/pdf_handler.py
@@ -7,13 +7,9 @@
from django.http import JsonResponse
from django.views.decorators.csrf import csrf_exempt
from django.views.decorators.http import require_POST
from django.shortcuts import get_object_or_404
from web.services.handle_pdf_datasource import HandlePdfDataSource
from web.models.chatbot import Chatbot
from web.signals.pdf_datasource_was_added import pdf_data_source_added
from web.enums.chatbot_initial_prompt_enum import ChatBotInitialPromptEnum
from web.enums.common_enums import ChatBotDefaults
from uuid import uuid4

@csrf_exempt
@require_POST
@@ -31,22 +27,16 @@ def upload_pdf_api(request):
except Chatbot.DoesNotExist:
return JsonResponse({'error': 'Invalid token'}, status=403)

if request.method == 'POST':
"""
Handles the POST request to upload PDF files. It extracts the bot token, processes the uploaded files,
creates a data source, and triggers an event to indicate that the PDF data source was added.
"""
delete_folder_flag = request.POST.get('delete_folder_flag', '0') == '1'
files = request.FILES.getlist('pdffiles')
text_data = request.POST.get('text_data', '')

delete_folder_flag = request.POST.get('delete_folder_flag', '0') == '1'
# Handle the PDF data source
handle_pdf = HandlePdfDataSource(bot, files)
data_source = handle_pdf.handle()
print (f"text_data: {data_source}")

files = request.FILES.getlist('pdffiles')
# Handle the PDF data source
handle_pdf = HandlePdfDataSource(bot, files)
data_source = handle_pdf.handle()
# print (data_source)

# Trigger the PdfDataSourceWasAdded event
pdf_data_source_added.send(sender='create_via_pdf_flow', bot_id=bot.id, data_source_id=data_source.id, delete_folder_flag=delete_folder_flag)
return JsonResponse({'message': 'PDF uploaded and chatbot created successfully', 'data_source_id': data_source.id, 'bot_id': bot.id})

return JsonResponse({'error': 'Invalid request method'}, status=405)
# Trigger the PdfDataSourceWasAdded event
pdf_data_source_added.send(sender='create_via_pdf_flow', bot_id=bot.id, data_source_id=data_source.id, delete_folder_flag=delete_folder_flag)
return JsonResponse({'message': 'PDF uploaded and chatbot created successfully', 'data_source_id': data_source.id, 'bot_id': bot.id})
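
A hedged example of exercising this endpoint: the URL path and the way the bot token is supplied are assumptions inferred from the handler, while the pdffiles, text_data, and delete_folder_flag field names come from the code above.

import requests

url = "http://localhost:8000/api/upload-pdf/"  # endpoint path assumed

data = {
    "token": "<bot-token>",     # token transport is an assumption
    "delete_folder_flag": "1",  # "1" deletes the upload folder after ingestion
    "text_data": "",            # optional raw text instead of files
}
with open("manual.pdf", "rb") as fh:
    files = [("pdffiles", fh)]  # field name from request.FILES.getlist('pdffiles')
    response = requests.post(url, data=data, files=files)

print(response.status_code, response.json())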

6 changes: 3 additions & 3 deletions dj_backend_server/api/utils/init_vector_store.py
@@ -1,13 +1,13 @@
from langchain.docstore.document import Document
from langchain.vectorstores.qdrant import Qdrant
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.pinecone import Pinecone
from qdrant_client import QdrantClient
from qdrant_client import models
from api.enums import StoreType
from langchain.embeddings.openai import OpenAIEmbeddings
from api.interfaces import StoreOptions
from api.configs import PINECONE_TEXT_KEY, VECTOR_STORE_INDEX_NAME
import pinecone
from langchain.vectorstores.pinecone import Pinecone
from dotenv import load_dotenv
import os
import threading
@@ -54,7 +54,7 @@ def init_vector_store(docs: list[Document], embeddings: OpenAIEmbeddings, option
Pinecone.from_documents(documents=docs, embedding=embeddings, index_name=VECTOR_STORE_INDEX_NAME, namespace=options.namespace)

elif store_type == StoreType.QDRANT:
# print("LEHEL called qdrant.from_documents")
# print("Called qdrant.from_documents")
Qdrant.from_documents(docs, embeddings, collection_name=options.namespace, url=os.environ['QDRANT_URL'])

else:
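
For reference, a usage sketch of init_vector_store under the Qdrant branch. QDRANT_URL appears in the code above; the STORE selector variable and the helper import paths are assumptions.

import os
from langchain.docstore.document import Document
from api.interfaces import StoreOptions                     # import shown in this diff
from api.utils.init_vector_store import init_vector_store  # path inferred from this diff
from api.utils import get_embeddings                        # helper; path assumed

os.environ.setdefault("STORE", "QDRANT")                     # selector name assumed
os.environ.setdefault("QDRANT_URL", "http://localhost:6333")

docs = [Document(page_content="hello world", metadata={"source": "example.txt"})]
init_vector_store(docs, get_embeddings(), StoreOptions(namespace="demo-namespace"))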
27 changes: 12 additions & 15 deletions dj_backend_server/nginx/nginx.template.conf
@@ -9,6 +9,9 @@ http {
listen 80;
server_name ${APP_URL};

proxy_read_timeout 300s; # Increase the timeout to 300 seconds
proxy_connect_timeout 75s; # Increase the connection timeout to 75 seconds

# Duplicate your existing settings here
charset utf-8;

@@ -50,6 +53,8 @@ http {
# expires -1; #dev environment
# proxy_cache_bypass 1;
# proxy_no_cache 1;
# proxy_read_timeout 300s; # Increase the timeout to 300 seconds
# proxy_connect_timeout 75s; # Increase the connection timeout to 75 seconds
# }

#FOR HTTPS:// USE THIS
@@ -80,6 +85,9 @@ http {
listen 443 ssl;
server_name ${APP_URL};

proxy_read_timeout 300s; # Increase the timeout to 300 seconds
proxy_connect_timeout 75s; # Increase the connection timeout to 75 seconds

ssl_certificate /etc/nginx/ssl/cert.pem;
ssl_certificate_key /etc/nginx/ssl/privkey.pem;

@@ -129,28 +137,15 @@ http {
open_file_cache_min_uses 2;
open_file_cache_errors on;

# location /static/ {
# alias /app/web/static/; # The trailing slash is important
# # proxy_set_header Host $host;
# # proxy_set_header X-Real-IP $remote_addr;
# # proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
# # proxy_ssl_protocols TLSv1 TLSv1.1 TLSv1.2 TLSv1.3;
# expires -1; #dev
# # proxy_cache_bypass 1;
# # proxy_no_cache 1;
# add_header Cache-Control "public, max-age=2592000";
# proxy_set_header X-Forwarded-Proto $scheme;
# add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; # HSTS header
# add_header Content-Security-Policy "default-src 'self'; style-src 'self' 'unsafe-inline'; script-src 'self' 'unsafe-inline' 'unsafe-eval'";
# }

location /static {
proxy_pass http://web:8000;
#alias /app/web/static/;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_ssl_protocols TLSv1 TLSv1.1 TLSv1.2 TLSv1.3;
proxy_read_timeout 300s; # Increase the timeout to 300 seconds
proxy_connect_timeout 75s; # Increase the connection timeout to 75 seconds
expires -1; #dev environment
proxy_no_cache 1; #dev environment
proxy_cache_bypass 1; #dev environment
@@ -168,6 +163,8 @@ http {
proxy_set_header X-Forwarded-Proto $scheme; # Forward the original scheme (HTTP or HTTPS)
proxy_set_header Origin $http_origin; # Optionally forward the Origin header
proxy_ssl_protocols TLSv1 TLSv1.1 TLSv1.2 TLSv1.3;
proxy_read_timeout 300s; # Increase the timeout to 300 seconds
proxy_connect_timeout 75s; # Increase the connection timeout to 75 seconds
add_header Cache-Control "public, max-age=2592000";
add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; # HSTS header
expires -1; #dev environment
9 changes: 4 additions & 5 deletions dj_backend_server/web/models/pdf_data_sources.py
@@ -52,14 +52,13 @@ def get_files_info(self):
return self.files_info

def delete_files(self):
folder_path = f"/app/{self.folder_name}"
# print (f"FOLDER: {folder_path}")
folder_path = f"/app/website_data_sources/{self.folder_name}"
print (f"FOLDER: {folder_path}")
if os.path.exists(folder_path):
shutil.rmtree(folder_path)
return f"All files in folder {folder_path} have been deleted."
return f"All files in folder {self.folder_name} have been deleted."
else:
return "No files were deleted or folder does not exist."

class Meta:
db_table = 'pdf_data_sources' # Replace 'pdf_data_source' with the actual table name in the database

        db_table = 'pdf_data_sources'  # Actual table name in the database
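
A short usage sketch of the corrected method; the import path follows this file's location and the primary key is hypothetical.

from web.models.pdf_data_sources import PdfDataSource

data_source = PdfDataSource.objects.get(id=42)  # hypothetical record
message = data_source.delete_files()            # removes /app/website_data_sources/<folder_name>
print(message)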
25 changes: 19 additions & 6 deletions dj_backend_server/web/services/handle_pdf_datasource.py
@@ -1,4 +1,3 @@
# services.py
import os
import hashlib
from django.core.exceptions import ValidationError
@@ -24,13 +23,27 @@ def handle(self) -> PdfDataSource:

files_urls = []
files_info_list = []
for file in self.files:
        # HandlePdfDataSource is used in two contexts: the API handler passes
        # self.files as a list of (field_name, file) tuples, while the web view
        # passes a list of plain file objects. Normalize both shapes here.

# for file_field_name, file in self.files.items():
for file_item in self.files:
# Check if file_item is a tuple (file_field_name, file) or just a file object
if isinstance(file_item, tuple):
file_field_name, file = file_item
file_name = file_field_name
else:
file = file_item
file_name = file.name

try:
# Validate file types or other conditions if necessary
# For example: if not file.name.endswith('.pdf'): raise ValidationError('Invalid file type')

# Generate a unique file name using UUID
file_extension = os.path.splitext(file.name)[1]
file_extension = os.path.splitext(file_name)[1]
file_uuid_name = str(uuid4()) + file_extension
file_path = os.path.join(folder_path, file_uuid_name)

@@ -66,13 +79,13 @@ def handle(self) -> PdfDataSource:
failed_at=datetime.now())
failed_job.save()
# You can also raise a more specific custom exception if needed
raise ValidationError(f"Error while uploading file: {file.name}, Error: {str(e)}")
raise ValidationError(f"Error while uploading file: {file_name}, Error: {str(e)}")

data_source.chatbot_id = self.bot.id
data_source.files = files_urls
data_source.files_info = files_info_list
data_source.folder_name = folder_name
data_source.ingest_status = 'PDF(s) Uploaded'
data_source.ingest_status = 'File(s) uploaded'

data_source.save()
return data_source
return data_source
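
A compact illustration of the two call shapes the normalization above accepts. The file object is a stand-in built with Django's SimpleUploadedFile, and bot is assumed to be an existing Chatbot instance.

from django.core.files.uploadedfile import SimpleUploadedFile
from web.services.handle_pdf_datasource import HandlePdfDataSource

pdf = SimpleUploadedFile("manual.pdf", b"%PDF-1.4 ...", content_type="application/pdf")

# Web view style: a plain list of file objects (file.name supplies the stored name).
HandlePdfDataSource(bot, [pdf]).handle()

# API handler style: (field_name, file) tuples; the first element supplies the name.
HandlePdfDataSource(bot, [("manual.pdf", pdf)]).handle()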
2 changes: 1 addition & 1 deletion dj_backend_server/web/static/chat.css

Large diffs are not rendered by default.

