This repository has been archived by the owner on Jan 5, 2025. It is now read-only.

Commit

Merge pull request #226 from lvalics/main
Read only the first 2000 characters
gharbat authored Jan 22, 2024
2 parents 8903d38 + 624c076 commit d01e9c2
Showing 14 changed files with 377 additions and 164 deletions.
59 changes: 48 additions & 11 deletions dj_backend_server/api/data_sources/pdf_handler.py
@@ -8,6 +8,7 @@
import requests
import traceback
from uuid import uuid4
from typing import Optional
from django.views.decorators.csrf import csrf_exempt
from django.utils import timezone
from django.shortcuts import get_object_or_404
@@ -25,21 +26,48 @@


@csrf_exempt
def pdf_handler(shared_folder: str, namespace: str, delete_folder_flag: bool):
def pdf_handler(shared_folder: str, namespace: str, delete_folder_flag: Optional[bool] = False, text_data: Optional[str] = None):
"""
    This function handles PDF files and other file types in a shared folder. If text data is provided directly, it is
    processed as-is; otherwise the function reads the files in the shared folder. Each file is handled by extension:
    .doc, .docx, .xls, and .xlsx files are converted to .txt, and .txt, .csv, and .json files are saved as .txt.
    The resulting text is then ingested into the vector database.
Args:
shared_folder (str): The name of the shared folder where the files are located.
namespace (str): The namespace for the vector database.
    delete_folder_flag (Optional[bool], optional): Whether to delete the folder after processing the files. Defaults to False.
text_data (Optional[str], optional): The text data to be processed. If this is provided, the function will not read from
the files. Defaults to None.
Raises:
Exception: If an error occurs during the processing of the files or the conversion of the text data to a vector database.
"""
print ("Debug: pdf_handler")
# If text data is provided directly, process it without reading from files
if text_data:
process_text_data(text_data, namespace)
print ("Debug: text_data is provided directly, process it without reading from files")
return

# Convert delete_folder_flag to boolean (send 0 - FALSE or 1 - TRUE)
delete_folder_flag = bool(int(delete_folder_flag))

delete_folder_flag = bool(delete_folder_flag) if delete_folder_flag is not None else False
# Check if the shared_folder is provided, if not, return early as there are no files to process
if not shared_folder:
print("No shared folder provided for file processing.")
return

try:
        # TODO: Revisit when there are multiple external PDF libraries to choose from.
if os.environ.get("PDF_LIBRARY") == "external":
directory_path = os.path.join("website_data_sources", shared_folder)
# print(f"Debug: Processing folder {directory_path}")
if shared_folder:
directory_path = os.path.join("website_data_sources", shared_folder)
print(f"Debug: Processing folder {directory_path}")

if os.path.exists(directory_path):
print(f"Debug: Directory exists. Files: {os.listdir(directory_path)}")
else:
print(f"Debug: Directory does not exist")
print(f"Debug: No shared folder provided for file processing.")
return

# Process each file in the directory based on its extension
for filename in os.listdir(directory_path):
@@ -163,14 +191,11 @@ def txt_to_vectordb(shared_folder: str, namespace: str, delete_folder_flag: bool

raw_docs = directory_loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=200, length_function=len)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)

docs = text_splitter.split_documents(raw_docs)

# print("docs -->", docs);
# print("docs -->", [doc.metadata for doc in docs])
# for doc in docs:
# print("Document content:", doc.page_content)
print("external files docs -->", docs);

if not docs:
print("No documents were processed successfully.")
@@ -232,3 +257,15 @@ def convert_to_txt(file_path):
json.dump(data, txt_file, ensure_ascii=False, indent=4)
else:
raise NotImplementedError(f"Conversion for {os.path.splitext(file_path)[1]} files to text not implemented yet.")

def process_text_data(text_data: str, namespace: str):
"""
Processes the provided text data and ingests it into the vector database.
Args:
text_data (str): The text data to process.
namespace (str): The namespace for the vector database.
"""
print ("Debug: process_text_data")
txt_to_vectordb(text_data, namespace, False)
return
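
As a usage note for this change: a minimal sketch of invoking the updated handler in both modes. The import path is inferred from this diff's file layout, and the namespace and folder name are hypothetical placeholders.

from api.data_sources.pdf_handler import pdf_handler  # path inferred from this diff

# Mode 1: ingest raw text directly; no folder is read.
pdf_handler(shared_folder="", namespace="my-bot",
            text_data="Plain text to embed into the vector store.")

# Mode 2: process files previously placed under website_data_sources/<folder>,
# deleting the folder afterwards.
pdf_handler(shared_folder="3f2a-example-folder", namespace="my-bot",
            delete_folder_flag=True)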
18 changes: 15 additions & 3 deletions dj_backend_server/api/data_sources/website_handler.py
@@ -15,6 +15,19 @@
from uuid import uuid4

def website_handler(shared_folder, namespace):
"""
This function handles the processing of text files in a shared folder. It loads the text files, splits the text into chunks,
    generates embeddings for the chunks, and initializes a vector store with the chunks and their embeddings. On success it
    marks the website data source 'COMPLETED'; on failure it marks it 'FAILED' and records the exception in a FailedJob object.
Args:
shared_folder (str): The ID of the WebsiteDataSource object and the name of the shared folder where the text files are located.
namespace (str): The namespace for the vector store.
Raises:
Exception: If an error occurs during the processing of the text files or the initialization of the vector store.
"""
website_data_source = WebsiteDataSource.objects.get(id=shared_folder)
try:
directory_path = os.path.join("website_data_sources", shared_folder)
@@ -26,15 +39,14 @@ def website_handler(shared_folder, namespace):

docs = text_splitter.split_documents(raw_docs)

print("docs -->", docs);
print("website docs -->", docs);
embeddings = get_embeddings()

init_vector_store(docs, embeddings, StoreOptions(namespace=namespace))

website_data_source.crawling_status = WebsiteDataSourceStatusType.COMPLETED.value
website_data_source.save()
# delete_folder(folder_path=directory_path)
print('All is done, folder deleted...')
print('Website embeddings done.')
except Exception as e:
website_data_source.crawling_status = WebsiteDataSourceStatusType.FAILED.value
website_data_source.save()
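
The docstring above describes a load → split → embed → store pipeline. Below is a condensed, hedged sketch of that flow; StoreOptions and the splitter settings appear in this diff, while the get_embeddings and init_vector_store import paths are inferred, not verified.

import os
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from api.interfaces import StoreOptions                     # import shown in this diff
from api.utils.init_vector_store import init_vector_store  # path inferred from this diff
from api.utils import get_embeddings                        # helper; path assumed

def ingest_folder(shared_folder: str, namespace: str) -> None:
    # Load the crawled .txt files for this data source.
    directory_path = os.path.join("website_data_sources", shared_folder)
    raw_docs = DirectoryLoader(directory_path, glob="**/*.txt").load()

    # Splitter settings matching the pdf_handler hunk above: 1000-char chunks, 200 overlap.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200,
                                              length_function=len)
    docs = splitter.split_documents(raw_docs)

    # Embed the chunks and write them into the configured vector store.
    embeddings = get_embeddings()
    init_vector_store(docs, embeddings, StoreOptions(namespace=namespace))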
32 changes: 11 additions & 21 deletions dj_backend_server/api/pdf_handler.py
@@ -7,13 +7,9 @@
from django.http import JsonResponse
from django.views.decorators.csrf import csrf_exempt
from django.views.decorators.http import require_POST
from django.shortcuts import get_object_or_404
from web.services.handle_pdf_datasource import HandlePdfDataSource
from web.models.chatbot import Chatbot
from web.signals.pdf_datasource_was_added import pdf_data_source_added
from web.enums.chatbot_initial_prompt_enum import ChatBotInitialPromptEnum
from web.enums.common_enums import ChatBotDefaults
from uuid import uuid4

@csrf_exempt
@require_POST
@@ -31,22 +27,16 @@ def upload_pdf_api(request):
except Chatbot.DoesNotExist:
return JsonResponse({'error': 'Invalid token'}, status=403)

if request.method == 'POST':
"""
Handles the POST request to upload PDF files. It extracts the bot token, processes the uploaded files,
creates a data source, and triggers an event to indicate that the PDF data source was added.
"""
delete_folder_flag = request.POST.get('delete_folder_flag', '0') == '1'
files = request.FILES.getlist('pdffiles')
text_data = request.POST.get('text_data', '')

delete_folder_flag = request.POST.get('delete_folder_flag', '0') == '1'
# Handle the PDF data source
handle_pdf = HandlePdfDataSource(bot, files)
data_source = handle_pdf.handle()
print (f"text_data: {data_source}")

files = request.FILES.getlist('pdffiles')
# Handle the PDF data source
handle_pdf = HandlePdfDataSource(bot, files)
data_source = handle_pdf.handle()
# print (data_source)

# Trigger the PdfDataSourceWasAdded event
pdf_data_source_added.send(sender='create_via_pdf_flow', bot_id=bot.id, data_source_id=data_source.id, delete_folder_flag=delete_folder_flag)
return JsonResponse({'message': 'PDF uploaded and chatbot created successfully', 'data_source_id': data_source.id, 'bot_id': bot.id})

return JsonResponse({'error': 'Invalid request method'}, status=405)
# Trigger the PdfDataSourceWasAdded event
pdf_data_source_added.send(sender='create_via_pdf_flow', bot_id=bot.id, data_source_id=data_source.id, delete_folder_flag=delete_folder_flag)
return JsonResponse({'message': 'PDF uploaded and chatbot created successfully', 'data_source_id': data_source.id, 'bot_id': bot.id})
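
A hedged example of exercising this endpoint: the URL path and the way the bot token is supplied are assumptions inferred from the handler, while the pdffiles, text_data, and delete_folder_flag field names come from the code above.

import requests

url = "http://localhost:8000/api/upload-pdf/"  # endpoint path assumed

data = {
    "token": "<bot-token>",     # token transport is an assumption
    "delete_folder_flag": "1",  # "1" deletes the upload folder after ingestion
    "text_data": "",            # optional raw text instead of files
}
with open("manual.pdf", "rb") as fh:
    files = [("pdffiles", fh)]  # field name from request.FILES.getlist('pdffiles')
    response = requests.post(url, data=data, files=files)

print(response.status_code, response.json())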

6 changes: 3 additions & 3 deletions dj_backend_server/api/utils/init_vector_store.py
@@ -1,13 +1,13 @@
from langchain.docstore.document import Document
from langchain.vectorstores.qdrant import Qdrant
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.pinecone import Pinecone
from qdrant_client import QdrantClient
from qdrant_client import models
from api.enums import StoreType
from langchain.embeddings.openai import OpenAIEmbeddings
from api.interfaces import StoreOptions
from api.configs import PINECONE_TEXT_KEY, VECTOR_STORE_INDEX_NAME
import pinecone
from langchain.vectorstores.pinecone import Pinecone
from dotenv import load_dotenv
import os
import threading
@@ -54,7 +54,7 @@ def init_vector_store(docs: list[Document], embeddings: OpenAIEmbeddings, option
Pinecone.from_documents(documents=docs, embedding=embeddings, index_name=VECTOR_STORE_INDEX_NAME, namespace=options.namespace)

elif store_type == StoreType.QDRANT:
# print("LEHEL called qdrant.from_documents")
# print("Called qdrant.from_documents")
Qdrant.from_documents(docs, embeddings, collection_name=options.namespace, url=os.environ['QDRANT_URL'])

else:
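
For reference, a usage sketch of init_vector_store under the Qdrant branch. QDRANT_URL appears in the code above; the STORE selector variable and the helper import paths are assumptions.

import os
from langchain.docstore.document import Document
from api.interfaces import StoreOptions                     # import shown in this diff
from api.utils.init_vector_store import init_vector_store  # path inferred from this diff
from api.utils import get_embeddings                        # helper; path assumed

os.environ.setdefault("STORE", "QDRANT")                     # selector name assumed
os.environ.setdefault("QDRANT_URL", "http://localhost:6333")

docs = [Document(page_content="hello world", metadata={"source": "example.txt"})]
init_vector_store(docs, get_embeddings(), StoreOptions(namespace="demo-namespace"))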
27 changes: 12 additions & 15 deletions dj_backend_server/nginx/nginx.template.conf
@@ -9,6 +9,9 @@ http {
listen 80;
server_name ${APP_URL};

proxy_read_timeout 300s; # Increase the timeout to 300 seconds
proxy_connect_timeout 75s; # Increase the connection timeout to 75 seconds

# Duplicate your existing settings here
charset utf-8;

@@ -50,6 +53,8 @@ http {
# expires -1; #dev environment
# proxy_cache_bypass 1;
# proxy_no_cache 1;
# proxy_read_timeout 300s; # Increase the timeout to 300 seconds
# proxy_connect_timeout 75s; # Increase the connection timeout to 75 seconds
# }

#FOR HTTPS:// USE THIS
@@ -80,6 +85,9 @@ http {
listen 443 ssl;
server_name ${APP_URL};

proxy_read_timeout 300s; # Increase the timeout to 300 seconds
proxy_connect_timeout 75s; # Increase the connection timeout to 75 seconds

ssl_certificate /etc/nginx/ssl/cert.pem;
ssl_certificate_key /etc/nginx/ssl/privkey.pem;

@@ -129,28 +137,15 @@ http {
open_file_cache_min_uses 2;
open_file_cache_errors on;

# location /static/ {
# alias /app/web/static/; # The trailing slash is important
# # proxy_set_header Host $host;
# # proxy_set_header X-Real-IP $remote_addr;
# # proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
# # proxy_ssl_protocols TLSv1 TLSv1.1 TLSv1.2 TLSv1.3;
# expires -1; #dev
# # proxy_cache_bypass 1;
# # proxy_no_cache 1;
# add_header Cache-Control "public, max-age=2592000";
# proxy_set_header X-Forwarded-Proto $scheme;
# add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; # HSTS header
# add_header Content-Security-Policy "default-src 'self'; style-src 'self' 'unsafe-inline'; script-src 'self' 'unsafe-inline' 'unsafe-eval'";
# }

location /static {
proxy_pass http://web:8000;
#alias /app/web/static/;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_ssl_protocols TLSv1 TLSv1.1 TLSv1.2 TLSv1.3;
proxy_read_timeout 300s; # Increase the timeout to 300 seconds
proxy_connect_timeout 75s; # Increase the connection timeout to 75 seconds
expires -1; #dev environment
proxy_no_cache 1; #dev environment
proxy_cache_bypass 1; #dev environment
@@ -168,6 +163,8 @@ http {
proxy_set_header X-Forwarded-Proto $scheme; # Forward the original scheme (HTTP or HTTPS)
proxy_set_header Origin $http_origin; # Optionally forward the Origin header
proxy_ssl_protocols TLSv1 TLSv1.1 TLSv1.2 TLSv1.3;
proxy_read_timeout 300s; # Increase the timeout to 300 seconds
proxy_connect_timeout 75s; # Increase the connection timeout to 75 seconds
add_header Cache-Control "public, max-age=2592000";
add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; # HSTS header
expires -1; #dev environment
9 changes: 4 additions & 5 deletions dj_backend_server/web/models/pdf_data_sources.py
@@ -52,14 +52,13 @@ def get_files_info(self):
return self.files_info

def delete_files(self):
folder_path = f"/app/{self.folder_name}"
# print (f"FOLDER: {folder_path}")
folder_path = f"/app/website_data_sources/{self.folder_name}"
print (f"FOLDER: {folder_path}")
if os.path.exists(folder_path):
shutil.rmtree(folder_path)
return f"All files in folder {folder_path} have been deleted."
return f"All files in folder {self.folder_name} have been deleted."
else:
return "No files were deleted or folder does not exist."

class Meta:
db_table = 'pdf_data_sources' # Replace 'pdf_data_source' with the actual table name in the database

        db_table = 'pdf_data_sources'  # Actual table name in the database
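
A short usage sketch of the corrected method; the import path follows this file's location and the primary key is hypothetical.

from web.models.pdf_data_sources import PdfDataSource

data_source = PdfDataSource.objects.get(id=42)  # hypothetical record
message = data_source.delete_files()            # removes /app/website_data_sources/<folder_name>
print(message)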
25 changes: 19 additions & 6 deletions dj_backend_server/web/services/handle_pdf_datasource.py
@@ -1,4 +1,3 @@
# services.py
import os
import hashlib
from django.core.exceptions import ValidationError
@@ -24,13 +23,27 @@ def handle(self) -> PdfDataSource:

files_urls = []
files_info_list = []
for file in self.files:
        # HandlePdfDataSource is used in two contexts: the API handler passes
        # self.files as a list of (field_name, file) tuples, while the web view
        # passes a list of plain file objects. Normalize both shapes here.

# for file_field_name, file in self.files.items():
for file_item in self.files:
# Check if file_item is a tuple (file_field_name, file) or just a file object
if isinstance(file_item, tuple):
file_field_name, file = file_item
file_name = file_field_name
else:
file = file_item
file_name = file.name

try:
# Validate file types or other conditions if necessary
# For example: if not file.name.endswith('.pdf'): raise ValidationError('Invalid file type')

# Generate a unique file name using UUID
file_extension = os.path.splitext(file.name)[1]
file_extension = os.path.splitext(file_name)[1]
file_uuid_name = str(uuid4()) + file_extension
file_path = os.path.join(folder_path, file_uuid_name)

@@ -66,13 +79,13 @@ def handle(self) -> PdfDataSource:
failed_at=datetime.now())
failed_job.save()
# You can also raise a more specific custom exception if needed
raise ValidationError(f"Error while uploading file: {file.name}, Error: {str(e)}")
raise ValidationError(f"Error while uploading file: {file_name}, Error: {str(e)}")

data_source.chatbot_id = self.bot.id
data_source.files = files_urls
data_source.files_info = files_info_list
data_source.folder_name = folder_name
data_source.ingest_status = 'PDF(s) Uploaded'
data_source.ingest_status = 'File(s) uploaded'

data_source.save()
return data_source
return data_source
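
A compact illustration of the two call shapes the normalization above accepts. The file object is a stand-in built with Django's SimpleUploadedFile, and bot is assumed to be an existing Chatbot instance.

from django.core.files.uploadedfile import SimpleUploadedFile
from web.services.handle_pdf_datasource import HandlePdfDataSource

pdf = SimpleUploadedFile("manual.pdf", b"%PDF-1.4 ...", content_type="application/pdf")

# Web view style: a plain list of file objects (file.name supplies the stored name).
HandlePdfDataSource(bot, [pdf]).handle()

# API handler style: (field_name, file) tuples; the first element supplies the name.
HandlePdfDataSource(bot, [("manual.pdf", pdf)]).handle()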
2 changes: 1 addition & 1 deletion dj_backend_server/web/static/chat.css

Large diffs are not rendered by default.

