Skip to content

Commit 7a388e4

Browse files
committed
restructure the rag and create a simple db manager to better manage new repo, simplify the readme and add a simple roadmap
1 parent d20aec2 commit 7a388e4

File tree

8 files changed

+305
-366
lines changed

8 files changed

+305
-366
lines changed

.gitignore

+1-1
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,4 @@ Thumbs.db
4444
__pycache__/
4545

4646
# ignore adalflow cache
47-
adalflow/
47+
/adalflow

README.md

+24-66
Original file line numberDiff line numberDiff line change
@@ -1,61 +1,7 @@
1-
# RAG Code Assistant
1+
# GithubChat
22

3-
A Retrieval-Augmented Generation (RAG) system for analyzing and understanding code repositories. The system provides both a command-line interface and a web UI for interacting with your codebase. In this repo there are two versions of the RAG system:
4-
5-
1. `app.py` - a demo version that uses test data
6-
2. `app_repo.py` - a version that uses a real codebase
7-
8-
It is still a work in progress and lots of things can be improved.
9-
10-
# Repository Architecture
11-
12-
This document explains how the different components of the RAG (Retrieval-Augmented Generation) system work together.
13-
14-
## File Structure and Dependencies
15-
16-
```mermaid
17-
graph TD
18-
config[config.py] --> rag[rag.py]
19-
config --> data_pipeline[data_pipeline.py]
20-
data_pipeline --> test_rag[test_rag.py]
21-
data_pipeline --> app_repo[app_repo.py]
22-
rag --> app[app.py]
23-
rag --> app_repo
24-
test_rag --> app
25-
```
26-
27-
## Data Flow
28-
29-
```mermaid
30-
flowchart TD
31-
subgraph Input
32-
A[User Query] --> B[Streamlit Interface]
33-
C[Repository/Documents] --> D[Document Processor]
34-
end
35-
36-
subgraph Processing
37-
B --> E[RAG System]
38-
D --> F[Text Splitter]
39-
F --> G[Embedder]
40-
G --> H[FAISS Index]
41-
H --> E
42-
end
43-
44-
subgraph Output
45-
E --> I[Response]
46-
E --> J[Context]
47-
I --> K[Chat Interface]
48-
J --> K
49-
end
50-
```
51-
52-
## Features
53-
54-
- Code-aware responses using RAG
55-
- Memory for maintaining conversation context
56-
- Support for multiple programming languages
57-
- Interactive web interface
58-
- Command-line interface
3+
A RAG assistant to allow you to chat with any GitHub repo.
4+
Learn fast. The default repo is AdalFlow github repo.
595

606
## Setup
617

@@ -79,19 +25,31 @@ OPENAI_API_KEY = "your-openai-api-key-here"
7925

8026
## Running the Application
8127

82-
### Web Interface
83-
84-
1. Run the demo version (with test data):
28+
Run the streamlit app:
8529
```bash
8630
poetry run streamlit run app.py
8731
```
8832

89-
2. Run the repository analysis version:
90-
```bash
91-
poetry run streamlit run app_repo.py
92-
```
33+
## ROADMAP
34+
- [x] Clearly structured RAG that can prepare a repo, persist it to avoid reloading, and answer questions.
35+
- `DatabaseManager` in `src/data_pipeline.py` to manage the database.
36+
- `RAG` class in `src/rag.py` to manage the whole RAG lifecycle.
37+
38+
<!-- CREATE Checklist -->
39+
- [ ] Create an evaluation dataset
40+
- [ ] Evaluate the RAG performance on the dataset
41+
- [ ] Auto-optimize the RAG model
42+
<!-- ## Learn
43+
44+
## Local Storage
45+
We use adalflow's root directory, which is at ~/.adalflow.
46+
- repos/repo_name/...
47+
- repos/repo_name_db/...
48+
49+
- data_pipeline.py: From the main and local code test, you will know the process of download repo and chunk files, and embed the chunks.
50+
- rag.py: The main code of the RAG model. -->
9351

94-
### Command Line Interface
52+
<!-- ### Command Line Interface
9553
9654
Run the RAG system directly:
9755
```bash
@@ -130,4 +88,4 @@ poetry run python rag.py
13088
- [ ] Add evaluation metrics
13189
- [ ] Improve the embedding model
13290
- [ ] Improve the text splitter and chunking
133-
- [ ] Improve the retriever
91+
- [ ] Improve the retriever -->

app.py

+74-42
Original file line numberDiff line numberDiff line change
@@ -1,89 +1,121 @@
11
import streamlit as st
22
import os
3-
import tempfile
43
from src.rag import RAG
5-
from tests.test_rag import initialize_test_database
4+
from src.data_pipeline import (
5+
extract_class_definition,
6+
extract_class_name_from_query,
7+
)
68

9+
from config import DEFAULT_GITHUB_REPO
710

def init_rag(repo_path_or_url: str):
    """Create a RAG instance and prepare its retriever for the given repo.

    Args:
        repo_path_or_url: Local path or GitHub URL of the repository to index.

    Returns:
        A RAG instance ready to answer questions about the repository.

    Note:
        Any error raised while downloading/indexing the repo propagates to the
        caller (the "Initialize local RAG" button handler catches it).
    """
    # The OpenAI key must be in the environment before RAG() builds its client.
    os.environ["OPENAI_API_KEY"] = st.secrets["OPENAI_API_KEY"]

    rag = RAG()
    print(f"Loading repository from: {repo_path_or_url}")
    rag.prepare_retriever(repo_url_or_path=repo_path_or_url)
    return rag
# --- Page header and repository selection -----------------------------------
st.title("GithubChat")
st.caption("Learn a repo with RAG assistant")

repo_path = st.text_input(
    "Repository Path",
    value=DEFAULT_GITHUB_REPO,
    help="Github repo URL",
)

# Session state: chat transcript plus the lazily-created RAG instance
# (None until the user clicks "Initialize local RAG").
if "messages" not in st.session_state:
    st.session_state.messages = []
if "rag" not in st.session_state:
    st.session_state.rag = None

if st.button("Initialize local RAG"):
    try:
        st.session_state.rag = init_rag(repo_path)
        if st.session_state.rag:
            st.toast("Repository loaded successfully!")
    except Exception as e:
        # BUG FIX: `e` was bound but never shown, silently dropping the cause
        # of the failure; include it so the user can act on it.
        st.toast(f"Load failed for repository at: {repo_path} ({e})")

if st.button("Clear Chat"):
    st.session_state.messages = []
    # Also reset the RAG conversation memory so old turns don't leak into
    # answers after the visible chat is cleared.
    if st.session_state.rag:
        st.session_state.rag.memory.current_conversation.dialog_turns.clear()

# Replay the stored transcript; messages that carried source context get an
# expandable code viewer alongside the text.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.write(message["content"])
        if "context" in message:
            with st.expander(f"View source from {message.get('file_path', 'unknown')}"):
                st.code(message["context"], language=message.get("language", "python"))

# Only offer the chat input once a repository has been loaded.
if st.session_state.rag and (
    prompt := st.chat_input(
        "Ask about the code (e.g., 'Show me the implementation of the RAG class', 'How is memory handled?')"
    )
):
    st.session_state.messages.append({"role": "user", "content": prompt})

    with st.chat_message("user"):
        st.write(prompt)

    # If the query names a class, we later try to narrow the shown context
    # to just that class definition.
    class_name = extract_class_name_from_query(prompt)

    with st.chat_message("assistant"):
        with st.spinner("Analyzing code..."):
            response, docs = st.session_state.rag(prompt)

            # Show relevant context first, then the explanation
            if docs and docs[0].documents:
                # Prefer documents flagged as implementation code over
                # docs/tests when choosing what source to display.
                implementation_docs = [
                    doc
                    for doc in docs[0].documents
                    if doc.meta_data.get("is_implementation", False)
                ]

                # Use implementation if found, otherwise use first document
                doc = (
                    implementation_docs[0]
                    if implementation_docs
                    else docs[0].documents[0]
                )
                context = doc.text
                file_path = doc.meta_data.get("file_path", "unknown")
                # NOTE(review): "type" presumably holds a language id usable
                # by st.code's `language=` — confirm against the pipeline.
                file_type = doc.meta_data.get("type", "python")

                # If asking about a specific class, try to extract just that class definition
                if class_name and file_type == "python":
                    class_context = extract_class_definition(context, class_name)
                    if class_context != context:  # Only use if we found the class
                        context = class_context

                with st.expander(f"View source from {file_path}"):
                    st.code(context, language=file_type)

                # Now show the explanation
                st.write(response)

                # Add to chat history
                st.session_state.messages.append(
                    {
                        "role": "assistant",
                        "content": response,
                        "context": context,
                        "file_path": file_path,
                        "language": file_type,
                    }
                )
            else:
                # No retrieved context: just show and store the plain answer.
                st.write(response)
                st.session_state.messages.append(
                    {"role": "assistant", "content": response}
                )
elif not st.session_state.rag:
    st.info("Please load a repository first!")

0 commit comments

Comments
 (0)