Skip to content

Commit 7a388e4

Browse files
committed
restructure the rag and create a simple db manager to better manage new repo, simplify the readme and add a simple roadmap
1 parent d20aec2 commit 7a388e4

File tree

8 files changed

+305
-366
lines changed

8 files changed

+305
-366
lines changed

.gitignore

+1-1
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,4 @@ Thumbs.db
4444
__pycache__/
4545

4646
# ignore adalflow cache
47-
adalflow/
47+
/adalflow

README.md

+24-66
Original file line numberDiff line numberDiff line change
@@ -1,61 +1,7 @@
1-
# RAG Code Assistant
1+
# GithubChat
22

3-
A Retrieval-Augmented Generation (RAG) system for analyzing and understanding code repositories. The system provides both a command-line interface and a web UI for interacting with your codebase. In this repo there are two versions of the RAG system:
4-
5-
1. `app.py` - a demo version that uses test data
6-
2. `app_repo.py` - a version that uses a real codebase
7-
8-
It is still a work in progress and lots of things can be improved.
9-
10-
# Repository Architecture
11-
12-
This document explains how the different components of the RAG (Retrieval-Augmented Generation) system work together.
13-
14-
## File Structure and Dependencies
15-
16-
```mermaid
17-
graph TD
18-
config[config.py] --> rag[rag.py]
19-
config --> data_pipeline[data_pipeline.py]
20-
data_pipeline --> test_rag[test_rag.py]
21-
data_pipeline --> app_repo[app_repo.py]
22-
rag --> app[app.py]
23-
rag --> app_repo
24-
test_rag --> app
25-
```
26-
27-
## Data Flow
28-
29-
```mermaid
30-
flowchart TD
31-
subgraph Input
32-
A[User Query] --> B[Streamlit Interface]
33-
C[Repository/Documents] --> D[Document Processor]
34-
end
35-
36-
subgraph Processing
37-
B --> E[RAG System]
38-
D --> F[Text Splitter]
39-
F --> G[Embedder]
40-
G --> H[FAISS Index]
41-
H --> E
42-
end
43-
44-
subgraph Output
45-
E --> I[Response]
46-
E --> J[Context]
47-
I --> K[Chat Interface]
48-
J --> K
49-
end
50-
```
51-
52-
## Features
53-
54-
- Code-aware responses using RAG
55-
- Memory for maintaining conversation context
56-
- Support for multiple programming languages
57-
- Interactive web interface
58-
- Command-line interface
3+
A RAG assistant to allow you to chat with any GitHub repo.
4+
Learn fast. The default repo is AdalFlow github repo.
595

606
## Setup
617

@@ -79,19 +25,31 @@ OPENAI_API_KEY = "your-openai-api-key-here"
7925

8026
## Running the Application
8127

82-
### Web Interface
83-
84-
1. Run the demo version (with test data):
28+
Run the streamlit app:
8529
```bash
8630
poetry run streamlit run app.py
8731
```
8832

89-
2. Run the repository analysis version:
90-
```bash
91-
poetry run streamlit run app_repo.py
92-
```
33+
## ROADMAP
34+
- [x] Clearly structured RAG that can prepare a repo, persist it to avoid reloading, and answer questions.
35+
- `DatabaseManager` in `src/data_pipeline.py` to manage the database.
36+
- `RAG` class in `src/rag.py` to manage the whole RAG lifecycle.
37+
38+
<!-- CREATE Checklist -->
39+
- [ ] Create an evaluation dataset
40+
- [ ] Evaluate the RAG performance on the dataset
41+
- [ ] Auto-optimize the RAG model
42+
<!-- ## Learn
43+
44+
## Local Storage
45+
We use adalflow's root directory, which is at ~/.adalflow.
46+
- repos/repo_name/...
47+
- repos/repo_name_db/...
48+
49+
- data_pipeline.py: From the main and local code test, you will know the process of download repo and chunk files, and embed the chunks.
50+
- rag.py: The main code of the RAG model. -->
9351

94-
### Command Line Interface
52+
<!-- ### Command Line Interface
9553
9654
Run the RAG system directly:
9755
```bash
@@ -130,4 +88,4 @@ poetry run python rag.py
13088
- [ ] Add evaluation metrics
13189
- [ ] Improve the embedding model
13290
- [ ] Improve the text splitter and chunking
133-
- [ ] Improve the retriever
91+
- [ ] Improve the retriever -->

app.py

+74-42
Original file line numberDiff line numberDiff line change
@@ -1,89 +1,121 @@
11
import streamlit as st
22
import os
3-
import tempfile
43
from src.rag import RAG
5-
from tests.test_rag import initialize_test_database
4+
from src.data_pipeline import (
5+
extract_class_definition,
6+
extract_class_name_from_query,
7+
)
68

9+
from config import DEFAULT_GITHUB_REPO
710

def init_rag(repo_path_or_url: str):
    """Create a RAG instance and prepare its retriever for the given repo.

    Args:
        repo_path_or_url: Local path or GitHub URL of the repository to index.

    Returns:
        A RAG instance ready to answer questions about the repository.

    Note:
        Any error raised while downloading/indexing the repo propagates to the
        caller (the "Initialize local RAG" button handler catches it).
    """
    # The OpenAI key must be in the environment before RAG() builds its client.
    os.environ["OPENAI_API_KEY"] = st.secrets["OPENAI_API_KEY"]

    rag = RAG()
    print(f"Loading repository from: {repo_path_or_url}")
    rag.prepare_retriever(repo_url_or_path=repo_path_or_url)
    return rag
# --- Page header and repository selection -----------------------------------
st.title("GithubChat")
st.caption("Learn a repo with RAG assistant")

repo_path = st.text_input(
    "Repository Path",
    value=DEFAULT_GITHUB_REPO,
    help="Github repo URL",
)

# Session state: chat transcript plus the lazily-created RAG instance
# (None until the user clicks "Initialize local RAG").
if "messages" not in st.session_state:
    st.session_state.messages = []
if "rag" not in st.session_state:
    st.session_state.rag = None

if st.button("Initialize local RAG"):
    try:
        st.session_state.rag = init_rag(repo_path)
        if st.session_state.rag:
            st.toast("Repository loaded successfully!")
    except Exception as e:
        # BUG FIX: `e` was bound but never shown, silently dropping the cause
        # of the failure; include it so the user can act on it.
        st.toast(f"Load failed for repository at: {repo_path} ({e})")

if st.button("Clear Chat"):
    st.session_state.messages = []
    # Also reset the RAG conversation memory so old turns don't leak into
    # answers after the visible chat is cleared.
    if st.session_state.rag:
        st.session_state.rag.memory.current_conversation.dialog_turns.clear()

# Replay the stored transcript; messages that carried source context get an
# expandable code viewer alongside the text.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.write(message["content"])
        if "context" in message:
            with st.expander(f"View source from {message.get('file_path', 'unknown')}"):
                st.code(message["context"], language=message.get("language", "python"))

# Only offer the chat input once a repository has been loaded.
if st.session_state.rag and (
    prompt := st.chat_input(
        "Ask about the code (e.g., 'Show me the implementation of the RAG class', 'How is memory handled?')"
    )
):
    st.session_state.messages.append({"role": "user", "content": prompt})

    with st.chat_message("user"):
        st.write(prompt)

    # If the query names a class, we later try to narrow the shown context
    # to just that class definition.
    class_name = extract_class_name_from_query(prompt)

    with st.chat_message("assistant"):
        with st.spinner("Analyzing code..."):
            response, docs = st.session_state.rag(prompt)

            # Show relevant context first, then the explanation
            if docs and docs[0].documents:
                # Prefer documents flagged as implementation code over
                # docs/tests when choosing what source to display.
                implementation_docs = [
                    doc
                    for doc in docs[0].documents
                    if doc.meta_data.get("is_implementation", False)
                ]

                # Use implementation if found, otherwise use first document
                doc = (
                    implementation_docs[0]
                    if implementation_docs
                    else docs[0].documents[0]
                )
                context = doc.text
                file_path = doc.meta_data.get("file_path", "unknown")
                # NOTE(review): "type" presumably holds a language id usable
                # by st.code's `language=` — confirm against the pipeline.
                file_type = doc.meta_data.get("type", "python")

                # If asking about a specific class, try to extract just that class definition
                if class_name and file_type == "python":
                    class_context = extract_class_definition(context, class_name)
                    if class_context != context:  # Only use if we found the class
                        context = class_context

                with st.expander(f"View source from {file_path}"):
                    st.code(context, language=file_type)

                # Now show the explanation
                st.write(response)

                # Add to chat history
                st.session_state.messages.append(
                    {
                        "role": "assistant",
                        "content": response,
                        "context": context,
                        "file_path": file_path,
                        "language": file_type,
                    }
                )
            else:
                # No retrieved context: just show and store the plain answer.
                st.write(response)
                st.session_state.messages.append(
                    {"role": "assistant", "content": response}
                )
elif not st.session_state.rag:
    st.info("Please load a repository first!")

0 commit comments

Comments
 (0)