From 232e806e2b759b49bc494d2812ad48085ac79040 Mon Sep 17 00:00:00 2001
From: Kritin_Vongthongsri
Date: Mon, 27 Jan 2025 23:27:14 -0800
Subject: [PATCH 1/5] delete .vector_db folder if chunking is unsuccessful

---
 .../synthesizer/chunking/context_generator.py | 20 ++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/deepeval/synthesizer/chunking/context_generator.py b/deepeval/synthesizer/chunking/context_generator.py
index 293c75440..3c8c7e5c0 100644
--- a/deepeval/synthesizer/chunking/context_generator.py
+++ b/deepeval/synthesizer/chunking/context_generator.py
@@ -6,6 +6,7 @@
 from pydantic import BaseModel
 import asyncio
 import random
+import shutil
 import math
 import os
 
@@ -94,6 +95,14 @@ def generate_contexts(
                 smallest_document_token_count
                 + (self.chunk_overlap * (num_context_per_document - 1))
             ) // num_context_per_document
+            # Delete chroma collection if chunking is unsuccessful
+            for path in self.document_paths:
+                full_document_path, _ = os.path.splitext(path)
+                document_name = os.path.basename(full_document_path)
+                collection_path = f".vector_db/{document_name}processed_chunks_{self.chunk_size}_{self.chunk_overlap}"
+                if os.path.exists(collection_path):
+                    shutil.rmtree(collection_path)
+                    print(f"Deleted ChromaDB folder: {collection_path}")
             raise ValueError(
                 f"Your smallest document is only sized {smallest_document_token_count} tokens."
                 f"Please adjust the chunk_size to no more than {suggested_chunk_size}."
@@ -164,6 +173,14 @@ async def a_generate_contexts(
                 smallest_document_token_count
                 + (self.chunk_overlap * (num_context_per_document - 1))
             ) // num_context_per_document
+            # Delete chroma collection if chunking is unsuccessful
+            for path in self.document_paths:
+                full_document_path, _ = os.path.splitext(path)
+                document_name = os.path.basename(full_document_path)
+                collection_path = f".vector_db/{document_name}processed_chunks_{self.chunk_size}_{self.chunk_overlap}"
+                if os.path.exists(collection_path):
+                    shutil.rmtree(collection_path)
+                    print(f"Deleted ChromaDB folder: {collection_path}")
             raise ValueError(
                 f"Your smallest document is only sized {smallest_document_token_count} tokens."
                 f"Please adjust the chunk_size to no more than {suggested_chunk_size}."
@@ -551,6 +568,7 @@ async def a_evaluate_chunk(self, chunk) -> float:
 
     def _load_docs(self):
         import chromadb
+        from chromadb.errors import InvalidCollectionException
 
         for path in tqdm_bar(self.document_paths, "✨ 🚀 ✨ Loading Documents"):
             try:
@@ -568,7 +586,7 @@ def _load_docs(self):
                 if self.source_files_to_collections_map == None:
                     self.source_files_to_collections_map = {}
                 self.source_files_to_collections_map[path] = collection
-            except:
+            except InvalidCollectionException:
                 if self.doc_to_chunker_map == None:
                     self.doc_to_chunker_map = {}
                 doc_chunker = DocumentChunker(self.embedder)

From 96188d201c8cbafa14ba225942b06dbdf1cf24a1 Mon Sep 17 00:00:00 2001
From: Kritin_Vongthongsri
Date: Mon, 27 Jan 2025 23:28:04 -0800
Subject: [PATCH 2/5] remove print statements

---
 deepeval/synthesizer/chunking/context_generator.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/deepeval/synthesizer/chunking/context_generator.py b/deepeval/synthesizer/chunking/context_generator.py
index 3c8c7e5c0..15050c029 100644
--- a/deepeval/synthesizer/chunking/context_generator.py
+++ b/deepeval/synthesizer/chunking/context_generator.py
@@ -102,7 +102,6 @@ def generate_contexts(
                 collection_path = f".vector_db/{document_name}processed_chunks_{self.chunk_size}_{self.chunk_overlap}"
                 if os.path.exists(collection_path):
                     shutil.rmtree(collection_path)
-                    print(f"Deleted ChromaDB folder: {collection_path}")
             raise ValueError(
                 f"Your smallest document is only sized {smallest_document_token_count} tokens."
                 f"Please adjust the chunk_size to no more than {suggested_chunk_size}."
@@ -180,7 +179,6 @@ async def a_generate_contexts(
                 collection_path = f".vector_db/{document_name}processed_chunks_{self.chunk_size}_{self.chunk_overlap}"
                 if os.path.exists(collection_path):
                     shutil.rmtree(collection_path)
-                    print(f"Deleted ChromaDB folder: {collection_path}")
             raise ValueError(
                 f"Your smallest document is only sized {smallest_document_token_count} tokens."
                 f"Please adjust the chunk_size to no more than {suggested_chunk_size}."
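The cleanup added in PATCH 1/5 rebuilds, for every input document, the on-disk path of its Chroma collection and removes that folder before raising, so a half-built .vector_db directory cannot shadow a failed chunking run the next time the synthesizer starts. Narrowing the bare except in _load_docs to InvalidCollectionException is the companion fix: a bare except would also swallow KeyboardInterrupt and genuine I/O errors and silently fall through to re-chunking. A minimal standalone sketch of the cleanup idea, assuming the .vector_db/<name>processed_chunks_<size>_<overlap> layout implied by the paths built in the diff (the helper name delete_partial_collections is hypothetical):

    import os
    import shutil
    from typing import List

    def delete_partial_collections(
        document_paths: List[str], chunk_size: int, chunk_overlap: int
    ) -> None:
        """Best-effort removal of half-built Chroma collection folders."""
        for path in document_paths:
            # "docs/report.pdf" -> "report", mirroring splitext + basename in the patch
            document_name = os.path.basename(os.path.splitext(path)[0])
            collection_path = (
                f".vector_db/{document_name}"
                f"processed_chunks_{chunk_size}_{chunk_overlap}"
            )
            if os.path.exists(collection_path):
                shutil.rmtree(collection_path)

PATCH 2/5 then strips the print calls, leaving progress reporting to the tqdm bars the loader already uses.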
From 85731e320d599dff32d3c51fd21f07774dc0ad05 Mon Sep 17 00:00:00 2001
From: Kritin_Vongthongsri
Date: Mon, 27 Jan 2025 23:38:35 -0800
Subject: [PATCH 3/5] generate from scratch fixes

---
 deepeval/synthesizer/synthesizer.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/deepeval/synthesizer/synthesizer.py b/deepeval/synthesizer/synthesizer.py
index e624e9649..601e294c5 100644
--- a/deepeval/synthesizer/synthesizer.py
+++ b/deepeval/synthesizer/synthesizer.py
@@ -133,7 +133,7 @@ def generate_goldens_from_docs(
         # Generate contexts from provided docs
         if self.context_generator is None:
             self.context_generator = ContextGenerator(
-                document_paths,
+                document_paths=document_paths,
                 embedder=context_construction_config.embedder,
                 chunk_size=context_construction_config.chunk_size,
                 chunk_overlap=context_construction_config.chunk_overlap,
@@ -193,7 +193,7 @@ async def a_generate_goldens_from_docs(
         # Generate contexts from provided docs
         if self.context_generator is None:
             self.context_generator = ContextGenerator(
-                document_paths,
+                document_paths=document_paths,
                 embedder=context_construction_config.embedder,
                 chunk_size=context_construction_config.chunk_size,
                 chunk_overlap=context_construction_config.chunk_overlap,
@@ -675,6 +675,8 @@ def transform_distribution(
     ) -> Dict[PromptEvolution, float]:
         prompt_evolutions: Dict[PromptEvolution, float] = {}
         for evo, weight in evolutions.items():
+            if evo == Evolution.MULTICONTEXT:
+                continue
             prompt_evolution = self.map_evolution_to_prompt_evolution(evo)
             prompt_evolutions[prompt_evolution] = weight
         return prompt_evolutions

From e65b15b07903c26b92477101fb3297e21c318f07 Mon Sep 17 00:00:00 2001
From: Kritin_Vongthongsri
Date: Mon, 27 Jan 2025 23:40:56 -0800
Subject: [PATCH 4/5] Revert "remove print statements"

This reverts commit 96188d201c8cbafa14ba225942b06dbdf1cf24a1.
---
 deepeval/synthesizer/chunking/context_generator.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/deepeval/synthesizer/chunking/context_generator.py b/deepeval/synthesizer/chunking/context_generator.py
index 15050c029..3c8c7e5c0 100644
--- a/deepeval/synthesizer/chunking/context_generator.py
+++ b/deepeval/synthesizer/chunking/context_generator.py
@@ -102,6 +102,7 @@ def generate_contexts(
                 collection_path = f".vector_db/{document_name}processed_chunks_{self.chunk_size}_{self.chunk_overlap}"
                 if os.path.exists(collection_path):
                     shutil.rmtree(collection_path)
+                    print(f"Deleted ChromaDB folder: {collection_path}")
             raise ValueError(
                 f"Your smallest document is only sized {smallest_document_token_count} tokens."
                 f"Please adjust the chunk_size to no more than {suggested_chunk_size}."
@@ -179,6 +180,7 @@ async def a_generate_contexts(
                 collection_path = f".vector_db/{document_name}processed_chunks_{self.chunk_size}_{self.chunk_overlap}"
                 if os.path.exists(collection_path):
                     shutil.rmtree(collection_path)
+                    print(f"Deleted ChromaDB folder: {collection_path}")
             raise ValueError(
                 f"Your smallest document is only sized {smallest_document_token_count} tokens."
                 f"Please adjust the chunk_size to no more than {suggested_chunk_size}."

From 07e6313bb96ad3d0d2d415a85844c787b3039053 Mon Sep 17 00:00:00 2001
From: Kritin_Vongthongsri
Date: Mon, 27 Jan 2025 23:41:00 -0800
Subject: [PATCH 5/5] Revert "delete .vector_db folder if chunking is
 unsuccessful"

This reverts commit 232e806e2b759b49bc494d2812ad48085ac79040.
---
 .../synthesizer/chunking/context_generator.py | 20 +------------------
 1 file changed, 1 insertion(+), 19 deletions(-)

diff --git a/deepeval/synthesizer/chunking/context_generator.py b/deepeval/synthesizer/chunking/context_generator.py
index 3c8c7e5c0..293c75440 100644
--- a/deepeval/synthesizer/chunking/context_generator.py
+++ b/deepeval/synthesizer/chunking/context_generator.py
@@ -6,7 +6,6 @@
 from pydantic import BaseModel
 import asyncio
 import random
-import shutil
 import math
 import os
 
@@ -95,14 +94,6 @@ def generate_contexts(
                 smallest_document_token_count
                 + (self.chunk_overlap * (num_context_per_document - 1))
             ) // num_context_per_document
-            # Delete chroma collection if chunking is unsuccessful
-            for path in self.document_paths:
-                full_document_path, _ = os.path.splitext(path)
-                document_name = os.path.basename(full_document_path)
-                collection_path = f".vector_db/{document_name}processed_chunks_{self.chunk_size}_{self.chunk_overlap}"
-                if os.path.exists(collection_path):
-                    shutil.rmtree(collection_path)
-                    print(f"Deleted ChromaDB folder: {collection_path}")
             raise ValueError(
                 f"Your smallest document is only sized {smallest_document_token_count} tokens."
                 f"Please adjust the chunk_size to no more than {suggested_chunk_size}."
@@ -173,14 +164,6 @@ async def a_generate_contexts(
                 smallest_document_token_count
                 + (self.chunk_overlap * (num_context_per_document - 1))
             ) // num_context_per_document
-            # Delete chroma collection if chunking is unsuccessful
-            for path in self.document_paths:
-                full_document_path, _ = os.path.splitext(path)
-                document_name = os.path.basename(full_document_path)
-                collection_path = f".vector_db/{document_name}processed_chunks_{self.chunk_size}_{self.chunk_overlap}"
-                if os.path.exists(collection_path):
-                    shutil.rmtree(collection_path)
-                    print(f"Deleted ChromaDB folder: {collection_path}")
             raise ValueError(
                 f"Your smallest document is only sized {smallest_document_token_count} tokens."
                 f"Please adjust the chunk_size to no more than {suggested_chunk_size}."
@@ -568,7 +551,6 @@ async def a_evaluate_chunk(self, chunk) -> float:
 
     def _load_docs(self):
         import chromadb
-        from chromadb.errors import InvalidCollectionException
 
         for path in tqdm_bar(self.document_paths, "✨ 🚀 ✨ Loading Documents"):
             try:
@@ -586,7 +568,7 @@ def _load_docs(self):
                 if self.source_files_to_collections_map == None:
                     self.source_files_to_collections_map = {}
                 self.source_files_to_collections_map[path] = collection
-            except InvalidCollectionException:
+            except:
                 if self.doc_to_chunker_map == None:
                     self.doc_to_chunker_map = {}
                 doc_chunker = DocumentChunker(self.embedder)
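After the two reverts (PATCH 4/5 restores the prints that PATCH 2/5 removed, and PATCH 5/5 backs out the cleanup commit entirely), the only change left standing is PATCH 3/5: ContextGenerator now receives document_paths as a keyword argument, and transform_distribution skips Evolution.MULTICONTEXT before mapping to PromptEvolution. The skip is the substantive fix for generating goldens from scratch: with no retrieved contexts, a multi-context evolution has nothing to combine. A rough sketch of that filtering step with stand-in enums (the real Evolution and PromptEvolution enums in deepeval carry more members and prompt templates; the 1:1 name mapping below is an assumption for illustration):

    from enum import Enum
    from typing import Dict

    class Evolution(Enum):
        # Stand-in members; the real enum in deepeval has more.
        REASONING = "reasoning"
        MULTICONTEXT = "multi-context"
        CONCRETIZING = "concretizing"

    class PromptEvolution(Enum):
        REASONING = "reasoning"
        CONCRETIZING = "concretizing"

    def transform_distribution(
        evolutions: Dict[Evolution, float],
    ) -> Dict[PromptEvolution, float]:
        prompt_evolutions: Dict[PromptEvolution, float] = {}
        for evo, weight in evolutions.items():
            if evo == Evolution.MULTICONTEXT:
                # No contexts exist when generating from scratch,
                # so a multi-context evolution cannot apply.
                continue
            # Assumed 1:1 name mapping for this sketch.
            prompt_evolutions[PromptEvolution[evo.name]] = weight
        return prompt_evolutions

    # MULTICONTEXT's weight is dropped; the rest pass through unchanged.
    print(transform_distribution({Evolution.REASONING: 0.5, Evolution.MULTICONTEXT: 0.5}))

Note that, as in the patch, the surviving weights are not renormalized after the skip.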