diff --git a/.gitignore b/.gitignore
index 622ed88..f6ca216 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,15 @@
-.env
+# Python bytecode files
+__pycache__/
+*.pyc
+*.pyo
+
+# Virtual environment
 venv/
-__pycache__/
\ No newline at end of file
+env/
+
+# .env file (contains sensitive API keys)
+.env
+
+# Docker-related files
+docker-compose.override.yml
+*.log
diff --git a/Dockerfile b/Dockerfile
index 8dddfeb..c2e3255 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,16 +7,16 @@ WORKDIR /app
 # Copy the current directory contents into the container at /app
 COPY . /app
 
-# Install any needed packages specified in requirements.txt
+# Install dependencies from the requirements.txt
 RUN pip install --no-cache-dir --upgrade pip \
     && pip install --no-cache-dir -r requirements.txt
 
-# Make port 8000 available to the world outside this container
-EXPOSE 8000
+# Set environment variables from the .env file or the docker-compose.yml
+ENV ASSEMBLYAI_API_KEY=${ASSEMBLYAI_API_KEY}
+ENV GENAI_API_KEY=${GENAI_API_KEY}
 
-# Define environment variable
-ENV ASSEMBLYAI_API_KEY="ASSEMBLYAI_API_KEY"
-ENV GENAI_API_KEY="GENAI_API_KEY"
+# Expose port 8000
+EXPOSE 8000
 
-# Run app.py when the container launches
-CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
\ No newline at end of file
+# Command to run the FastAPI app using Uvicorn
+CMD ["uvicorn", "app.app:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/app/app.py b/app/app.py
new file mode 100644
index 0000000..f1f7ac8
--- /dev/null
+++ b/app/app.py
@@ -0,0 +1,60 @@
+import os
+from fastapi import FastAPI, HTTPException, Request
+from fastapi.responses import JSONResponse, HTMLResponse
+from fastapi.staticfiles import StaticFiles
+from fastapi.templating import Jinja2Templates
+from pydantic import BaseModel
+from dotenv import load_dotenv
+from app.services.youtube_service import YouTubeService
+from app.services.genai_service import GenaiService
+
+# Load environment variables
+load_dotenv()
+
+# FastAPI app initialization
+app = FastAPI()
+
+# Mount static files and templates
+app.mount("/static", StaticFiles(directory="static"), name="static")
+templates = Jinja2Templates(directory="app/templates")
+
+# Load API keys from .env
+api_key = os.getenv("ASSEMBLYAI_API_KEY")
+genai_api_key = os.getenv("GENAI_API_KEY")
+model_name = "multi-qa-mpnet-base-dot-v1"
+
+class ProcessRequest(BaseModel):
+    action: str
+    input: str
+
+@app.post("/process")
+async def process_request(request: ProcessRequest):
+    action = request.action
+    input_text = request.input
+
+    if action == "transcribe":
+        yt_service = YouTubeService(api_key, input_text)
+        yt_service.download_video()
+        transcript_text = yt_service.transcribe_video("video.m4a")
+        if not transcript_text:
+            raise HTTPException(status_code=500, detail="Transcription failed.")
+
+        genai_service = GenaiService(model_name, genai_api_key)
+        summary_text = genai_service.get_summary(transcript_text)
+        return JSONResponse(content={"status": "success", "summary": summary_text})
+
+    elif action == "ask":
+        genai_service = GenaiService(model_name, genai_api_key)
+        answer_text = genai_service.get_answer(input_text, [input_text])
+        return JSONResponse(content={"status": "success", "answer": answer_text})
+
+    else:
+        raise HTTPException(status_code=400, detail="Invalid action.")
+
+@app.get("/", response_class=HTMLResponse)
+async def read_root(request: Request):
+    return templates.TemplateResponse("index.html", {"request": request})
+
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)
diff --git a/app/main.py b/app/main.py
deleted file mode 100644
index 4102307..0000000
--- a/app/main.py
+++ /dev/null
@@ -1,18 +0,0 @@
-from fastapi import FastAPI
-from fastapi.responses import JSONResponse
-from fastapi.staticfiles import StaticFiles
-from fastapi.templating import Jinja2Templates
-from app.routes.process_routes import router as process_router
-from app.routes.home_routes import router as home_router
-
-app = FastAPI()
-
-app.mount("/static", StaticFiles(directory="static"), name="static")
-templates = Jinja2Templates(directory="templates")
-
-app.include_router(process_router)
-app.include_router(home_router)
-
-@app.get("/")
-async def root():
-    return {"message": "Welcome to the Court Case Scraper API!"}
diff --git a/app/routes/__init__.py b/app/models.py
similarity index 100%
rename from app/routes/__init__.py
rename to app/models.py
diff --git a/app/routes/home_routes.py b/app/routes/home_routes.py
deleted file mode 100644
index 46b24a0..0000000
--- a/app/routes/home_routes.py
+++ /dev/null
@@ -1,11 +0,0 @@
-from fastapi import APIRouter
-from fastapi.responses import HTMLResponse
-from fastapi.templating import Jinja2Templates
-from fastapi import Request
-
-router = APIRouter()
-templates = Jinja2Templates(directory="templates")
-
-@router.get("/", response_class=HTMLResponse)
-async def read_root(request: Request):
-    return templates.TemplateResponse("index.html", {"request": request})
diff --git a/app/routes/process_routes.py b/app/routes/process_routes.py
deleted file mode 100644
index 8d6b4d0..0000000
--- a/app/routes/process_routes.py
+++ /dev/null
@@ -1,67 +0,0 @@
-from fastapi import APIRouter, HTTPException
-from fastapi.responses import JSONResponse
-from pydantic import BaseModel
-from app.services.transcription_service import YouTubeTranscriber
-from app.services.qa_service import QABot
-from utils.file_utils import remove_file
-import os
-import google.generativeai as genai
-
-router = APIRouter()
-
-class ProcessRequest(BaseModel):
-    action: str
-    input: str
-
-api_key = os.getenv("ASSEMBLYAI_API_KEY")
-genaiApiKey = os.getenv("GENAI_API_KEY")
-modelName = "multi-qa-mpnet-base-dot-v1"
-
-class GenaiQA:
-    def __init__(self, modelName, genaiApiKey):
-        self.model = SentenceTransformer(modelName)
-        genai.configure(api_key=genaiApiKey)
-        self.genaiModel = genai.GenerativeModel(model_name="gemini-1.5-flash")
-
-    def getSummary(self, transcriptionText):
-        if not transcriptionText:
-            return "No transcription text provided."
-        inputUser = (f"This document contains a transcription of the video's audio. Please just provide a professionally crafted summary based on the transcript paragraph. Transcription: {transcriptionText}")
-        response = self.genaiModel.generate_content(inputUser)
-        return response.text
-
-    def getAnswer(self, query, localData):
-        qaBot = QABot(self.model)
-        answer = qaBot.answerQuery(query, localData)
-        inputUser = f"For this question, I'm seeking the perfect answer. Please provide the answer directly. {query}\n\n{answer}"
-        response = self.genaiModel.generate_content(inputUser)
-        return response.text
-
-@router.post("/process")
-async def process_request(request: ProcessRequest):
-    action = request.action
-    input_text = request.input
-
-    if os.path.exists('video.m4a'):
-        os.remove('video.m4a')
-
-    if action == "transcribe":
-        yt_transcriber = YouTubeTranscriber(api_key, input_text)
-        yt_transcriber.download_video()
-        transcript_text = yt_transcriber.transcribe_video("video.m4a")
-        if not transcript_text:
-            raise HTTPException(status_code=500, detail="Transcription failed.")
-
-        genaiQA = GenaiQA(modelName, genaiApiKey)
-        summary_text = genaiQA.getSummary(transcript_text)
-
-        return JSONResponse(content={"status": "success", "summary": summary_text})
-
-    elif action == "ask":
-        genaiQA = GenaiQA(modelName, genaiApiKey)
-        answer_text = genaiQA.getAnswer(input_text, [input_text])
-
-        return JSONResponse(content={"status": "success", "answer": answer_text})
-
-    else:
-        raise HTTPException(status_code=400, detail="Invalid action.")
diff --git a/app/services/file_service.py b/app/services/file_service.py
deleted file mode 100644
index f63c877..0000000
--- a/app/services/file_service.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import os
-
-def remove_file(filename):
-    if os.path.exists(filename):
-        os.remove(filename)
-
-def create_directory(directory):
-    if not os.path.exists(directory):
-        os.makedirs(directory)
-
-def file_exists(filename):
-    return os.path.exists(filename)
diff --git a/app/services/genai_service.py b/app/services/genai_service.py
new file mode 100644
index 0000000..6538436
--- /dev/null
+++ b/app/services/genai_service.py
@@ -0,0 +1,23 @@
+import google.generativeai as genai
+from app.services.qa_service import QAService
+from sentence_transformers import SentenceTransformer
+
+class GenaiService:
+    def __init__(self, model_name, genai_api_key):
+        self.model = SentenceTransformer(model_name)
+        genai.configure(api_key=genai_api_key)
+        self.genai_model = genai.GenerativeModel(model_name="gemini-1.5-flash")
+
+    def get_summary(self, transcription_text):
+        if not transcription_text:
+            return "No transcription text provided."
+        input_user = (f"This document contains a transcription of the video's audio. Please provide a professionally crafted summary based on the transcript. Transcription: {transcription_text}")
+        response = self.genai_model.generate_content(input_user)
+        return response.text
+
+    def get_answer(self, query, local_data):
+        qa_service = QAService(self.model)
+        answer = qa_service.answer_query(query, local_data)
+        input_user = f"For this question, provide a direct and accurate answer. {query}\n\n{answer}"
+        response = self.genai_model.generate_content(input_user)
+        return response.text
diff --git a/app/services/qa_service.py b/app/services/qa_service.py
index 37f0a99..d61a569 100644
--- a/app/services/qa_service.py
+++ b/app/services/qa_service.py
@@ -2,32 +2,32 @@
 from numpy import dot
 from numpy.linalg import norm
 
-class QABot:
+class QAService:
     def __init__(self, model):
         self.model = model
 
-    def generateAnswer(self, query, relevantData):
-        if not relevantData:
+    def generate_answer(self, query, relevant_data):
+        if not relevant_data:
             return "No relevant data found."
-        scores = [score for score, _ in relevantData]
-        maxScore = max(scores) if scores else 1
-        normalizedScores = [score / maxScore for score in scores]
-        return "\n\n".join(f"**Passage {i + 1} (Score: {normalizedScores[i]:.2f}):** {text}"
-                           for i, (_, text) in enumerate(relevantData))
+        scores = [score for score, _ in relevant_data]
+        max_score = max(scores) if scores else 1
+        normalized_scores = [score / max_score for score in scores]
+        return "\n\n".join(f"**Passage {i + 1} (Score: {normalized_scores[i]:.2f}):** {text}"
+                           for i, (_, text) in enumerate(relevant_data))
 
-    def answerQuery(self, query, localData):
-        relevantData = self.fetchRelevantData(query, localData)
-        return self.generateAnswer(query, relevantData)
+    def answer_query(self, query, local_data):
+        relevant_data = self.fetch_relevant_data(query, local_data)
+        return self.generate_answer(query, relevant_data)
 
-    def fetchRelevantData(self, query, localData, topK=5):
-        queryEmbedding = self.model.encode(query, convert_to_tensor=True).tolist()
-        scoresAndTexts = []
-        for text in localData:
-            textEmbedding = self.model.encode(text, convert_to_tensor=True).tolist()
-            score = self.computeSimilarity(queryEmbedding, textEmbedding)
-            scoresAndTexts.append((score, text))
-        sortedScoresAndTexts = sorted(scoresAndTexts, key=lambda x: x[0], reverse=True)
-        return sortedScoresAndTexts[:topK]
+    def fetch_relevant_data(self, query, local_data, top_k=5):
+        query_embedding = self.model.encode(query, convert_to_tensor=True).tolist()
+        scores_and_texts = []
+        for text in local_data:
+            text_embedding = self.model.encode(text, convert_to_tensor=True).tolist()
+            score = self.compute_similarity(query_embedding, text_embedding)
+            scores_and_texts.append((score, text))
+        sorted_scores_and_texts = sorted(scores_and_texts, key=lambda x: x[0], reverse=True)
+        return sorted_scores_and_texts[:top_k]
 
-    def computeSimilarity(self, queryEmbedding, textEmbedding):
-        return dot(queryEmbedding, textEmbedding) / (norm(queryEmbedding) * norm(textEmbedding))
+    def compute_similarity(self, query_embedding, text_embedding):
+        return dot(query_embedding, text_embedding) / (norm(query_embedding) * norm(text_embedding))
diff --git a/static/css/documentation_style.css b/app/services/static/css/documentation_style.css
similarity index 100%
rename from static/css/documentation_style.css
rename to app/services/static/css/documentation_style.css
diff --git a/static/css/style.css b/app/services/static/css/style.css
similarity index 100%
rename from static/css/style.css
rename to app/services/static/css/style.css
diff --git a/static/js/script.js b/app/services/static/js/script.js
similarity index 100%
rename from static/js/script.js
rename to app/services/static/js/script.js
diff --git a/templates/documentation.html b/app/services/templates/documentation.html
similarity index 100%
rename from templates/documentation.html
rename to app/services/templates/documentation.html
diff --git a/templates/index.html b/app/services/templates/index.html
similarity index 100%
rename from templates/index.html
rename to app/services/templates/index.html
diff --git a/app/services/transcription_service.py b/app/services/transcription_service.py
deleted file mode 100644
index 1b735a1..0000000
--- a/app/services/transcription_service.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import os
-import assemblyai as aai
-from utils.downloader import YouTubeDownloader
-
-class YouTubeTranscriber:
-    def __init__(self, api_key, url):
-        self.api_key = api_key
-        self.url = url
-        aai.settings.api_key = self.api_key
-        self.downloader = YouTubeDownloader(self.url)
-
-    def remove_existing_video(self, filename):
-        if os.path.exists(filename):
-            os.remove(filename)
-
-    def download_video(self):
-        self.downloader.download_video()
-
-    def transcribe_video(self, filename):
-        if not os.path.exists(filename):
-            print(f"File {filename} not found.")
-            return ""
-        transcriber = aai.Transcriber()
-        transcript = transcriber.transcribe(filename)
-        return transcript.text
diff --git a/utils/downloader.py b/app/services/youtube_service.py
similarity index 61%
rename from utils/downloader.py
rename to app/services/youtube_service.py
index 0d4b7e0..d127e3a 100644
--- a/utils/downloader.py
+++ b/app/services/youtube_service.py
@@ -1,8 +1,11 @@
-import yt_dlp
 import os
+import yt_dlp
+import assemblyai as aai
+from rich import print
 
-class YouTubeDownloader:
-    def __init__(self, url):
+class YouTubeService:
+    def __init__(self, api_key, url):
+        self.api_key = api_key
         self.url = url
         self.ydl_opts = {
             'outtmpl': 'video.%(ext)s',
@@ -10,6 +13,7 @@ def __init__(self, url):
             'noplaylist': True,
             'merge_output_format': None,
         }
+        aai.settings.api_key = self.api_key
 
     def remove_existing_video(self, filename):
         if os.path.exists(filename):
@@ -18,8 +22,16 @@ def remove_existing_video(self, filename):
     def download_video(self):
         try:
             self.remove_existing_video('video.m4a')
             with yt_dlp.YoutubeDL(self.ydl_opts) as ydl:
                 ydl.download([self.url])
             print("Download completed successfully!")
         except Exception as e:
             print(f"An error occurred: {e}")
+
+    def transcribe_video(self, filename):
+        if not os.path.exists(filename):
+            print(f"File {filename} not found.")
+            return ""
+        transcriber = aai.Transcriber()
+        transcript = transcriber.transcribe(filename)
+        return transcript.text
diff --git a/docker-compose.yml b/docker-compose.yml
index 0bfe575..c3be912 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -9,4 +9,18 @@ services:
       ASSEMBLYAI_API_KEY: ${ASSEMBLYAI_API_KEY}
       GENAI_API_KEY: ${GENAI_API_KEY}
     volumes:
-      - .:/app
\ No newline at end of file
+      - .:/app
+    depends_on:
+      - api
+
+  api:
+    image: python:3.12-slim
+    container_name: fastapi-container
+    environment:
+      ASSEMBLYAI_API_KEY: ${ASSEMBLYAI_API_KEY}
+      GENAI_API_KEY: ${GENAI_API_KEY}
+    command: uvicorn app.app:app --host 0.0.0.0 --port 8000
+    volumes:
+      - .:/app
+    ports:
+      - "8000:8000"
diff --git a/main.py b/main.py
deleted file mode 100644
index 294eafa..0000000
--- a/main.py
+++ /dev/null
@@ -1,150 +0,0 @@
-import os
-from dotenv import load_dotenv
-from fastapi import FastAPI, HTTPException, Request
-from fastapi.responses import JSONResponse, HTMLResponse
-from fastapi.staticfiles import StaticFiles
-from fastapi.templating import Jinja2Templates
-from pydantic import BaseModel
-from rich import print
-from sentence_transformers import SentenceTransformer
-import google.generativeai as genai
-import yt_dlp
-import assemblyai as aai
-from numpy import dot
-from numpy.linalg import norm
-
-load_dotenv()
-
-class ProcessRequest(BaseModel):
-    action: str
-    input: str
-
-app = FastAPI()
-
-app.mount("/static", StaticFiles(directory="static"), name="static")
-templates = Jinja2Templates(directory="templates")
-
-api_key = os.getenv("ASSEMBLYAI_API_KEY")
-genaiApiKey = os.getenv("GENAI_API_KEY")
-modelName = "multi-qa-mpnet-base-dot-v1"
-
-class QABot:
-    def __init__(self, model):
-        self.model = model
-
-    def generateAnswer(self, query, relevantData):
-        if not relevantData:
-            return "No relevant data found."
-        scores = [score for score, _ in relevantData]
-        maxScore = max(scores) if scores else 1
-        normalizedScores = [score / maxScore for score in scores]
-        return "\n\n".join(f"**Passage {i + 1} (Score: {normalizedScores[i]:.2f}):** {text}"
-                           for i, (_, text) in enumerate(relevantData))
-
-    def answerQuery(self, query, localData):
-        relevantData = self.fetchRelevantData(query, localData)
-        return self.generateAnswer(query, relevantData)
-
-    def fetchRelevantData(self, query, localData, topK=5):
-        queryEmbedding = self.model.encode(query, convert_to_tensor=True).tolist()
-        scoresAndTexts = []
-        for text in localData:
-            textEmbedding = self.model.encode(text, convert_to_tensor=True).tolist()
-            score = self.computeSimilarity(queryEmbedding, textEmbedding)
-            scoresAndTexts.append((score, text))
-        sortedScoresAndTexts = sorted(scoresAndTexts, key=lambda x: x[0], reverse=True)
-        return sortedScoresAndTexts[:topK]
-
-    def computeSimilarity(self, queryEmbedding, textEmbedding):
-        return dot(queryEmbedding, textEmbedding) / (norm(queryEmbedding) * norm(textEmbedding))
-
-class GenaiQA:
-    def __init__(self, modelName, genaiApiKey):
-        self.model = SentenceTransformer(modelName)
-        genai.configure(api_key=genaiApiKey)
-        self.genaiModel = genai.GenerativeModel(model_name="gemini-1.5-flash")
-
-    def getSummary(self, transcriptionText):
-        if not transcriptionText:
-            return "No transcription text provided."
-
-        inputUser = (f"This document contains a transcription of the video's audio. Please just provide a professionally crafted summary based on the transcript paragraph. Transcription: {transcriptionText}")
-        response = self.genaiModel.generate_content(inputUser)
-        return response.text
-
-    def getAnswer(self, query, localData):
-        qaBot = QABot(self.model)
-        answer = qaBot.answerQuery(query, localData)
-        inputUser = f"For this question, I'm seeking the perfect answer. Please provide the answer directly. {query}\n\n{answer}"
-        response = self.genaiModel.generate_content(inputUser)
-        return response.text
-
-class YouTubeTranscriber:
-    def __init__(self, api_key, url):
-        self.api_key = api_key
-        self.url = url
-        self.ydl_opts = {
-            'outtmpl': 'video.%(ext)s',
-            'format': 'bestaudio/best',
-            'noplaylist': True,
-            'merge_output_format': None,
-        }
-        aai.settings.api_key = self.api_key
-
-    def remove_existing_video(self, filename):
-        if os.path.exists(filename):
-            os.remove(filename)
-
-    def download_video(self):
-        try:
-            self.remove_existing_video('video.m4a')
-            with yt_dlp.YoutubeDL(self.ydl_opts) as ydl:
-                ydl.download([self.url])
-            print("Download completed successfully!")
-        except Exception as e:
-            print(f"An error occurred: {e}")
-
-    def transcribe_video(self, filename):
-        if not os.path.exists(filename):
-            print(f"File {filename} not found.")
-            return ""
-        transcriber = aai.Transcriber()
-        transcript = transcriber.transcribe(filename)
-        return transcript.text
-
-@app.post("/process")
-async def process_request(request: ProcessRequest):
-    action = request.action
-    input_text = request.input
-
-    if os.path.exists('video.m4a'):
-        os.remove('video.m4a')
-
-    if action == "transcribe":
-        yt_transcriber = YouTubeTranscriber(api_key, input_text)
-        yt_transcriber.download_video()
-        transcript_text = yt_transcriber.transcribe_video("video.m4a")
-        if not transcript_text:
-            raise HTTPException(status_code=500, detail="Transcription failed.")
-
-        genaiQA = GenaiQA(modelName, genaiApiKey)
-        summary_text = genaiQA.getSummary(transcript_text)
-
-        return JSONResponse(content={"status": "success", "summary": summary_text})
-
-    elif action == "ask":
-        genaiQA = GenaiQA(modelName, genaiApiKey)
-        answer_text = genaiQA.getAnswer(input_text, [input_text])
-
-        return JSONResponse(content={"status": "success", "answer": answer_text})
-
-    else:
-        raise HTTPException(status_code=400, detail="Invalid action.")
-
-@app.get("/", response_class=HTMLResponse)
-async def read_root(request: Request):
-    return templates.TemplateResponse("index.html", {"request": request})
-
-if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)
diff --git a/run.sh b/run.sh
deleted file mode 100644
index 2a0a276..0000000
--- a/run.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/bash
-
-uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload
\ No newline at end of file
diff --git a/utils/__init__.py b/utils/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/utils/file_utils.py b/utils/file_utils.py
deleted file mode 100644
index f63c877..0000000
--- a/utils/file_utils.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import os
-
-def remove_file(filename):
-    if os.path.exists(filename):
-        os.remove(filename)
-
-def create_directory(directory):
-    if not os.path.exists(directory):
-        os.makedirs(directory)
-
-def file_exists(filename):
-    return os.path.exists(filename)
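For reference, a minimal sketch of how the refactored /process endpoint could be exercised once the containers are up. It assumes the API is reachable on localhost:8000 (the port published in docker-compose.yml) and that the requests package is installed locally; the YouTube URL is only a placeholder.

import requests

BASE_URL = "http://localhost:8000"  # assumed host/port, matching the compose port mapping

# Request a transcription plus summary of a video (placeholder URL)
resp = requests.post(
    f"{BASE_URL}/process",
    json={"action": "transcribe", "input": "https://www.youtube.com/watch?v=<VIDEO_ID>"},
)
print(resp.json())  # e.g. {"status": "success", "summary": "..."}

# Ask a follow-up question against the same service
resp = requests.post(
    f"{BASE_URL}/process",
    json={"action": "ask", "input": "What is the video about?"},
)
print(resp.json())  # e.g. {"status": "success", "answer": "..."}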