diff --git a/integrations/ragas/example/evaluation_from_pipeline_example.py b/integrations/ragas/example/evaluation_from_pipeline_example.py new file mode 100644 index 000000000..7aad28359 --- /dev/null +++ b/integrations/ragas/example/evaluation_from_pipeline_example.py @@ -0,0 +1,140 @@ +# A valid OpenAI API key must be provided as an environment variable "OPENAI_API_KEY" to run this example. + +import os +from dotenv import load_dotenv + +load_dotenv() +from haystack import Document +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack.components.embedders import OpenAITextEmbedder, OpenAIDocumentEmbedder +from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever +from haystack.components.builders import ChatPromptBuilder +from haystack.dataclasses import ChatMessage +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.components.builders import AnswerBuilder +from haystack import Pipeline +from haystack_integrations.components.evaluators.ragas import RagasEvaluator + +from langchain_openai import ChatOpenAI +from ragas.llms import LangchainLLMWrapper +from ragas.metrics import AnswerRelevancy, ContextPrecision, Faithfulness +from ragas import evaluate +from ragas.dataset_schema import EvaluationDataset + + +dataset = [ + "OpenAI is one of the most recognized names in the large language model space, known for its GPT series of models. These models excel at generating human-like text and performing tasks like creative writing, answering questions, and summarizing content. GPT-4, their latest release, has set benchmarks in understanding context and delivering detailed responses.", + "Anthropic is well-known for its Claude series of language models, designed with a strong focus on safety and ethical AI behavior. Claude is particularly praised for its ability to follow complex instructions and generate text that aligns closely with user intent.", + "DeepMind, a division of Google, is recognized for its cutting-edge Gemini models, which are integrated into various Google products like Bard and Workspace tools. These models are renowned for their conversational abilities and their capacity to handle complex, multi-turn dialogues.", + "Meta AI is best known for its LLaMA (Large Language Model Meta AI) series, which has been made open-source for researchers and developers. LLaMA models are praised for their ability to support innovation and experimentation due to their accessibility and strong performance.", + "Meta AI with it's LLaMA models aims to democratize AI development by making high-quality models available for free, fostering collaboration across industries. Their open-source approach has been a game-changer for researchers without access to expensive resources.", + "Microsoft’s Azure AI platform is famous for integrating OpenAI’s GPT models, enabling businesses to use these advanced models in a scalable and secure cloud environment. Azure AI powers applications like Copilot in Office 365, helping users draft emails, generate summaries, and more.", + "Amazon’s Bedrock platform is recognized for providing access to various language models, including its own models and third-party ones like Anthropic’s Claude and AI21’s Jurassic. Bedrock is especially valued for its flexibility, allowing users to choose models based on their specific needs.", + "Cohere is well-known for its language models tailored for business use, excelling in tasks like search, summarization, and customer support. 
Their models are recognized for being efficient, cost-effective, and easy to integrate into workflows.", + "AI21 Labs is famous for its Jurassic series of language models, which are highly versatile and capable of handling tasks like content creation and code generation. The Jurassic models stand out for their natural language understanding and ability to generate detailed and coherent responses.", + "In the rapidly advancing field of artificial intelligence, several companies have made significant contributions with their large language models. Notable players include OpenAI, known for its GPT Series (including GPT-4); Anthropic, which offers the Claude Series; Google DeepMind with its Gemini Models; Meta AI, recognized for its LLaMA Series; Microsoft Azure AI, which integrates OpenAI’s GPT Models; Amazon AWS (Bedrock), providing access to various models including Claude (Anthropic) and Jurassic (AI21 Labs); Cohere, which offers its own models tailored for business use; and AI21 Labs, known for its Jurassic Series. These companies are shaping the landscape of AI by providing powerful models with diverse capabilities.", +] + +# Initialize components for RAG pipeline +document_store = InMemoryDocumentStore() +docs = [Document(content=doc) for doc in dataset] + +document_embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small") +text_embedder = OpenAITextEmbedder(model="text-embedding-3-small") + +docs_with_embeddings = document_embedder.run(docs) +document_store.write_documents(docs_with_embeddings["documents"]) + +retriever = InMemoryEmbeddingRetriever(document_store, top_k=2) + +template = [ + ChatMessage.from_user( + """ +Given the following information, answer the question. + +Context: +{% for document in documents %} + {{ document.content }} +{% endfor %} + +Question: {{question}} +Answer: +""" + ) +] + +prompt_builder = ChatPromptBuilder(template=template) +chat_generator = OpenAIChatGenerator(model="gpt-4o-mini") + +# Creating the Pipeline +rag_pipeline = Pipeline() + +# Adding the components +rag_pipeline.add_component("text_embedder", text_embedder) +rag_pipeline.add_component("retriever", retriever) +rag_pipeline.add_component("prompt_builder", prompt_builder) +rag_pipeline.add_component("llm", chat_generator) +rag_pipeline.add_component("answer_builder", AnswerBuilder()) + +# Connecting the components +rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding") +rag_pipeline.connect("retriever", "prompt_builder") +rag_pipeline.connect("prompt_builder.prompt", "llm.messages") +rag_pipeline.connect("llm.replies", "answer_builder.replies") +rag_pipeline.connect("retriever", "answer_builder.documents") +rag_pipeline.connect("llm.replies", "answer_builder.replies") +rag_pipeline.connect("retriever", "answer_builder.documents") + + +questions = [ + "Who are the major players in the large language model space?", + "What is Microsoft’s Azure AI platform known for?", + "What kind of models does Cohere provide?", +] + +references = [ + "The major players include OpenAI (GPT Series), Anthropic (Claude Series), Google DeepMind (Gemini Models), Meta AI (LLaMA Series), Microsoft Azure AI (integrating GPT Models), Amazon AWS (Bedrock with Claude and Jurassic), Cohere (business-focused models), and AI21 Labs (Jurassic Series).", + "Microsoft’s Azure AI platform is known for integrating OpenAI’s GPT models, enabling businesses to use these models in a scalable and secure cloud environment.", + "Cohere provides language models tailored for business use, excelling in tasks 
like search, summarization, and customer support.", +] + + +evals_list = [] + +for que_idx in range(len(questions)): + + single_turn = {} + single_turn['user_input'] = questions[que_idx] + single_turn['reference'] = references[que_idx] + + # Running the pipeline + response = rag_pipeline.run( + { + "text_embedder": {"text": questions[que_idx]}, + "prompt_builder": {"question": questions[que_idx]}, + "answer_builder": {"query": questions[que_idx]}, + } + ) + + single_turn['response'] = response["answer_builder"]["answers"][0].data + + haystack_documents = response["answer_builder"]["answers"][0].documents + # extracting context from haystack documents + single_turn['retrieved_contexts'] = [doc.content for doc in haystack_documents] + + evals_list.append(single_turn) + +evaluation_dataset = EvaluationDataset.from_list(evals_list) + +llm = ChatOpenAI(model="gpt-4o-mini") +evaluator_llm = LangchainLLMWrapper(llm) + +result = evaluate( + dataset=evaluation_dataset, + metrics=[AnswerRelevancy(), ContextPrecision(), Faithfulness()], + llm=evaluator_llm, +) + +print(result) + +result.to_pandas() diff --git a/integrations/ragas/example/evaluation_with_components_example.py b/integrations/ragas/example/evaluation_with_components_example.py new file mode 100644 index 000000000..4ee4c0a1d --- /dev/null +++ b/integrations/ragas/example/evaluation_with_components_example.py @@ -0,0 +1,115 @@ +# A valid OpenAI API key must be provided as an environment variable "OPENAI_API_KEY" to run this example. + +import os +from dotenv import load_dotenv + +load_dotenv() +from haystack import Document +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack.components.embedders import OpenAITextEmbedder, OpenAIDocumentEmbedder +from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever +from haystack.components.builders import ChatPromptBuilder +from haystack.dataclasses import ChatMessage +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.components.builders import AnswerBuilder +from haystack import Pipeline +from haystack_integrations.components.evaluators.ragas import RagasEvaluator + +from langchain_openai import ChatOpenAI +from ragas.llms import LangchainLLMWrapper +from ragas.metrics import AnswerRelevancy, ContextPrecision, Faithfulness + + +dataset = [ + "OpenAI is one of the most recognized names in the large language model space, known for its GPT series of models. These models excel at generating human-like text and performing tasks like creative writing, answering questions, and summarizing content. GPT-4, their latest release, has set benchmarks in understanding context and delivering detailed responses.", + "Anthropic is well-known for its Claude series of language models, designed with a strong focus on safety and ethical AI behavior. Claude is particularly praised for its ability to follow complex instructions and generate text that aligns closely with user intent.", + "DeepMind, a division of Google, is recognized for its cutting-edge Gemini models, which are integrated into various Google products like Bard and Workspace tools. These models are renowned for their conversational abilities and their capacity to handle complex, multi-turn dialogues.", + "Meta AI is best known for its LLaMA (Large Language Model Meta AI) series, which has been made open-source for researchers and developers. 
LLaMA models are praised for their ability to support innovation and experimentation due to their accessibility and strong performance.", + "Meta AI with it's LLaMA models aims to democratize AI development by making high-quality models available for free, fostering collaboration across industries. Their open-source approach has been a game-changer for researchers without access to expensive resources.", + "Microsoft’s Azure AI platform is famous for integrating OpenAI’s GPT models, enabling businesses to use these advanced models in a scalable and secure cloud environment. Azure AI powers applications like Copilot in Office 365, helping users draft emails, generate summaries, and more.", + "Amazon’s Bedrock platform is recognized for providing access to various language models, including its own models and third-party ones like Anthropic’s Claude and AI21’s Jurassic. Bedrock is especially valued for its flexibility, allowing users to choose models based on their specific needs.", + "Cohere is well-known for its language models tailored for business use, excelling in tasks like search, summarization, and customer support. Their models are recognized for being efficient, cost-effective, and easy to integrate into workflows.", + "AI21 Labs is famous for its Jurassic series of language models, which are highly versatile and capable of handling tasks like content creation and code generation. The Jurassic models stand out for their natural language understanding and ability to generate detailed and coherent responses.", + "In the rapidly advancing field of artificial intelligence, several companies have made significant contributions with their large language models. Notable players include OpenAI, known for its GPT Series (including GPT-4); Anthropic, which offers the Claude Series; Google DeepMind with its Gemini Models; Meta AI, recognized for its LLaMA Series; Microsoft Azure AI, which integrates OpenAI’s GPT Models; Amazon AWS (Bedrock), providing access to various models including Claude (Anthropic) and Jurassic (AI21 Labs); Cohere, which offers its own models tailored for business use; and AI21 Labs, known for its Jurassic Series. These companies are shaping the landscape of AI by providing powerful models with diverse capabilities.", +] + +# Initialize components for RAG pipeline +document_store = InMemoryDocumentStore() +docs = [Document(content=doc) for doc in dataset] + +document_embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small") +text_embedder = OpenAITextEmbedder(model="text-embedding-3-small") + +docs_with_embeddings = document_embedder.run(docs) +document_store.write_documents(docs_with_embeddings["documents"]) + +retriever = InMemoryEmbeddingRetriever(document_store, top_k=2) + +template = [ + ChatMessage.from_user( + """ +Given the following information, answer the question. 
+ +Context: +{% for document in documents %} + {{ document.content }} +{% endfor %} + +Question: {{question}} +Answer: +""" + ) +] + +prompt_builder = ChatPromptBuilder(template=template) +chat_generator = OpenAIChatGenerator(model="gpt-4o-mini") + +# Setting the RagasEvaluator Component +llm = ChatOpenAI(model="gpt-4o-mini") +evaluator_llm = LangchainLLMWrapper(llm) + +ragas_evaluator = RagasEvaluator( + ragas_metrics=[AnswerRelevancy(), ContextPrecision(), Faithfulness()], evaluator_llm=evaluator_llm +) + +# Creating the Pipeline +rag_pipeline = Pipeline() + +# Adding the components +rag_pipeline.add_component("text_embedder", text_embedder) +rag_pipeline.add_component("retriever", retriever) +rag_pipeline.add_component("prompt_builder", prompt_builder) +rag_pipeline.add_component("llm", chat_generator) +rag_pipeline.add_component("answer_builder", AnswerBuilder()) +rag_pipeline.add_component("ragas_evaluator", ragas_evaluator) + +# Connecting the components +rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding") +rag_pipeline.connect("retriever", "prompt_builder") +rag_pipeline.connect("prompt_builder.prompt", "llm.messages") +rag_pipeline.connect("llm.replies", "answer_builder.replies") +rag_pipeline.connect("retriever", "answer_builder.documents") +rag_pipeline.connect("llm.replies", "answer_builder.replies") +rag_pipeline.connect("retriever", "answer_builder.documents") +rag_pipeline.connect("retriever", "ragas_evaluator.documents") +rag_pipeline.connect("llm.replies", "ragas_evaluator.response") + +# Run the pipeline +question = "What makes Meta AI’s LLaMA models stand out?" + +reference = "Meta AI’s LLaMA models stand out for being open-source, supporting innovation and experimentation due to their accessibility and strong performance." + + +result = rag_pipeline.run( + { + "text_embedder": {"text": question}, + "prompt_builder": {"question": question}, + "answer_builder": {"query": question}, + "ragas_evaluator": {"query": question, "reference": reference}, + # Each metric expects a specific set of parameters as input. Refer to the + # Ragas class' documentation for more details. + } +) + +print(result['answer_builder']['answers'][0].data, '\n') +print(result['ragas_evaluator']['result']) diff --git a/integrations/ragas/example/example.py b/integrations/ragas/example/example.py deleted file mode 100644 index ba75bdc7e..000000000 --- a/integrations/ragas/example/example.py +++ /dev/null @@ -1,52 +0,0 @@ -# A valid OpenAI API key must be provided as an environment variable "OPENAI_API_KEY" to run this example. - -from haystack import Pipeline - -from haystack_integrations.components.evaluators.ragas import RagasEvaluator, RagasMetric - -QUESTIONS = [ - "Which is the most popular global sport?", - "Who created the Python language?", -] -CONTEXTS = [ - [ - "The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact. Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people." - ], - [ - "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects." 
- ], -] -RESPONSES = [ - "Football is the most popular sport with around 4 billion followers worldwide", - "Python language was created by Guido van Rossum.", -] - -GROUND_TRUTHS = [ - "Football is the most popular sport", - "Python language was created by Guido van Rossum.", -] - -pipeline = Pipeline() -evaluator_context = RagasEvaluator( - metric=RagasMetric.CONTEXT_PRECISION, -) -evaluator_aspect = RagasEvaluator( - metric=RagasMetric.ASPECT_CRITIQUE, - metric_params={"name": "custom", "definition": "Is this answer problematic for children?", "strictness": 3}, -) -pipeline.add_component("evaluator_context", evaluator_context) -pipeline.add_component("evaluator_aspect", evaluator_aspect) - -# Each metric expects a specific set of parameters as input. Refer to the -# Ragas class' documentation for more details. -results = pipeline.run( - { - "evaluator_context": {"questions": QUESTIONS, "contexts": CONTEXTS, "ground_truths": GROUND_TRUTHS}, - "evaluator_aspect": {"questions": QUESTIONS, "contexts": CONTEXTS, "responses": RESPONSES}, - } -) - - -for component in ["evaluator_context", "evaluator_aspect"]: - for output in results[component]["results"]: - print(output) diff --git a/integrations/ragas/pydoc/config.yml b/integrations/ragas/pydoc/config.yml index 97d8d808e..9a94e2293 100644 --- a/integrations/ragas/pydoc/config.yml +++ b/integrations/ragas/pydoc/config.yml @@ -3,8 +3,7 @@ loaders: search_path: [../src] modules: [ - "haystack_integrations.components.evaluators.ragas.evaluator", - "haystack_integrations.components.evaluators.ragas.metrics", + "haystack_integrations.components.evaluators.ragas.evaluator" ] ignore_when_discovered: ["__init__"] processors: @@ -14,7 +13,6 @@ processors: do_not_filter_modules: false skip_empty_modules: true - type: filter - expression: "name not in ['InputConverters', 'MetricDescriptor', 'MetricParamsValidators', 'OutputConverters', 'METRIC_DESCRIPTORS']" - type: smart - type: crossref renderer: diff --git a/integrations/ragas/pyproject.toml b/integrations/ragas/pyproject.toml index 179bcce16..548545e34 100644 --- a/integrations/ragas/pyproject.toml +++ b/integrations/ragas/pyproject.toml @@ -22,7 +22,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] -dependencies = ["haystack-ai", "ragas>=0.1.11,<=0.1.16"] +dependencies = ["haystack-ai", "ragas>=0.2.0,<0.3.0", "langchain_openai"] [project.urls] Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/ragas" @@ -48,6 +48,7 @@ dependencies = [ "pytest-rerunfailures", "haystack-pydoc-tools", "pytest-asyncio", + "pydantic" ] [tool.hatch.envs.default.scripts] test = "pytest {args:tests}" diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/__init__.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/__init__.py index a6f420701..f572e367f 100644 --- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/__init__.py +++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/__init__.py @@ -1,4 +1,3 @@ from .evaluator import RagasEvaluator -from .metrics import RagasMetric -__all__ = ("RagasEvaluator", "RagasMetric") +__all__ = ["RagasEvaluator"] diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py index c44c446e6..934bb8fa7 100644 --- 
a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py +++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py @@ -1,161 +1,291 @@ -import json -from typing import Any, Callable, Dict, List, Optional, Union +import re +from typing import Any, Dict, List, Optional, Union, get_args, get_origin -from datasets import Dataset -from haystack import DeserializationError, component, default_from_dict, default_to_dict +from haystack import Document, component, default_from_dict, default_to_dict +from haystack.dataclasses import ChatMessage +from langchain_core.embeddings import Embeddings as LangchainEmbeddings # type: ignore +from langchain_core.language_models import BaseLanguageModel as LangchainLLM # type: ignore +from pydantic import ValidationError # type: ignore from ragas import evaluate # type: ignore -from ragas.evaluation import Result -from ragas.metrics.base import Metric - -from .metrics import ( - METRIC_DESCRIPTORS, - InputConverters, - OutputConverters, - RagasMetric, +from ragas.dataset_schema import ( + EvaluationDataset, + EvaluationResult, + SingleTurnSample, ) +from ragas.embeddings import BaseRagasEmbeddings +from ragas.llms import BaseRagasLLM +from ragas.metrics import Metric @component class RagasEvaluator: """ A component that uses the [Ragas framework](https://docs.ragas.io/) to evaluate - inputs against a specific metric. Supported metrics are defined by `RagasMetric`. + inputs against specified Ragas metrics. Usage example: ```python - from haystack_integrations.components.evaluators.ragas import RagasEvaluator, RagasMetric + from haystack_integrations.components.evaluators.ragas import RagasEvaluator + from ragas.metrics import ContextPrecision + from ragas.llms import LangchainLLMWrapper + from langchain_openai import ChatOpenAI + + llm = ChatOpenAI(model="gpt-4o-mini") + evaluator_llm = LangchainLLMWrapper(llm) evaluator = RagasEvaluator( - metric=RagasMetric.CONTEXT_PRECISION, + ragas_metrics=[ContextPrecision()], + evaluator_llm=evaluator_llm ) output = evaluator.run( - questions=["Which is the most popular global sport?"], - contexts=[ - [ - "Football is undoubtedly the world's most popular sport with" - "major events like the FIFA World Cup and sports personalities" - "like Ronaldo and Messi, drawing a followership of more than 4" - "billion people." - ] + query="Which is the most popular global sport?", + documents=[ + "Football is undoubtedly the world's most popular sport with" + " major events like the FIFA World Cup and sports personalities" + " like Ronaldo and Messi, drawing a followership of more than 4" + " billion people." ], - ground_truths=["Football is the most popular sport with around 4 billion" "followers worldwide"], + reference="Football is the most popular sport with around 4 billion" + " followers worldwide", ) - print(output["results"]) + + output['result'] ``` """ - # Wrapped for easy mocking. - _backend_callable: Callable - _backend_metric: Metric - def __init__( self, - metric: Union[str, RagasMetric], - metric_params: Optional[Dict[str, Any]] = None, + ragas_metrics: List[Metric], + evaluator_llm: Optional[Union[BaseRagasLLM, LangchainLLM]] = None, + evaluator_embedding: Optional[Union[BaseRagasEmbeddings, LangchainEmbeddings]] = None, ): """ - Construct a new Ragas evaluator. - - :param metric: - The metric to use for evaluation. - :param metric_params: - Parameters to pass to the metric's constructor. 
- Refer to the `RagasMetric` class for more details - on required parameters. + Constructs a new Ragas evaluator. + + :param ragas_metrics: A list of evaluation metrics from the [Ragas](https://docs.ragas.io/) library. + :param evaluator_llm: A language model used by metrics that require LLMs for evaluation. + :param evaluator_embedding: An embedding model used by metrics that require embeddings for evaluation. """ - self.metric = metric if isinstance(metric, RagasMetric) else RagasMetric.from_str(metric) - self.metric_params = metric_params - self.descriptor = METRIC_DESCRIPTORS[self.metric] - - self._init_backend() - self._init_metric() - - expected_inputs = self.descriptor.input_parameters - component.set_input_types(self, **expected_inputs) - - def _init_backend(self): - self._backend_callable = RagasEvaluator._invoke_evaluate - - def _init_metric(self): - if self.descriptor.init_parameters is not None: - if self.metric_params is None: - msg = f"Ragas metric '{self.metric}' expected init parameters but got none" - raise ValueError(msg) - elif not all(k in self.descriptor.init_parameters for k in self.metric_params.keys()): - msg = ( - f"Invalid init parameters for Ragas metric '{self.metric}'. " - f"Expected: {self.descriptor.init_parameters}" - ) - raise ValueError(msg) - elif self.metric_params is not None: - msg = ( - f"Invalid init parameters for Ragas metric '{self.metric}'. " - f"None expected but {self.metric_params} given" + self._validate_inputs(ragas_metrics, evaluator_llm, evaluator_embedding) + self.metrics = ragas_metrics + self.llm = evaluator_llm + self.embedding = evaluator_embedding + + def _validate_inputs( + self, + metrics: List[Metric], + llm: Optional[Union[BaseRagasLLM, LangchainLLM]], + embedding: Optional[Union[BaseRagasEmbeddings, LangchainEmbeddings]], + ) -> None: + """Validate input parameters. + + :param metrics: List of Ragas metrics to validate + :param llm: Language model to validate + :param embedding: Embedding model to validate + + :return: None. + """ + if not all(isinstance(metric, Metric) for metric in metrics): + error_message = "All items in ragas_metrics must be instances of Metric class." + raise TypeError(error_message) + + if llm is not None and not isinstance(llm, (BaseRagasLLM, LangchainLLM)): + error_message = f"Expected evaluator_llm to be BaseRagasLLM or LangchainLLM, got {type(llm).__name__}" + raise TypeError(error_message) + + if embedding is not None and not isinstance(embedding, (BaseRagasEmbeddings, LangchainEmbeddings)): + error_message = ( + f"Expected evaluator_embedding to be BaseRagasEmbeddings or " + f"LangchainEmbeddings, got {type(embedding).__name__}" ) - raise ValueError(msg) - metric_params = self.metric_params or {} - self._backend_metric = self.descriptor.backend(**metric_params) + raise TypeError(error_message) - @staticmethod - def _invoke_evaluate(dataset: Dataset, metric: Metric) -> Result: - return evaluate(dataset, [metric]) + @component.output_types(result=EvaluationResult) + def run( + self, + query: Optional[str] = None, + response: Optional[Union[List[ChatMessage], str]] = None, + documents: Optional[List[Union[Document, str]]] = None, + reference_contexts: Optional[List[str]] = None, + multi_responses: Optional[List[str]] = None, + reference: Optional[str] = None, + rubrics: Optional[Dict[str, str]] = None, + ) -> Dict[str, Any]: + """ + Evaluates the provided query against the documents and returns the evaluation result. 
- @component.output_types(results=List[List[Dict[str, Any]]]) - def run(self, **inputs) -> Dict[str, Any]: + :param query: The input query from the user. + :param response: A list of ChatMessage responses (typically from a language model or agent). + :param documents: A list of Haystack Document or strings that were retrieved for the query. + :param reference_contexts: A list of reference contexts that should have been retrieved for the query. + :param multi_responses: List of multiple responses generated for the query. + :param reference: A string reference answer for the query. + :param rubrics: A dictionary of evaluation rubric, where keys represent the score + and the values represent the corresponding evaluation criteria. + :return: A dictionary containing the evaluation result. """ - Run the Ragas evaluator on the provided inputs. - - :param inputs: - The inputs to evaluate. These are determined by the - metric being calculated. See `RagasMetric` for more - information. - :returns: - A dictionary with a single `results` entry that contains - a nested list of metric results. Each input can have one or more - results, depending on the metric. Each result is a dictionary - containing the following keys and values: - - `name` - The name of the metric. - - `score` - The score of the metric. + processed_docs = self._process_documents(documents) + processed_response = self._process_response(response) + + try: + sample = SingleTurnSample( + user_input=query, + retrieved_contexts=processed_docs, + reference_contexts=reference_contexts, + response=processed_response, + multi_responses=multi_responses, + reference=reference, + rubrics=rubrics, + ) + + except (ValueError, ValidationError) as e: + raise self._handle_conversion_error(e) from None + + dataset = EvaluationDataset([sample]) + + try: + result = evaluate( + dataset=dataset, + metrics=self.metrics, + llm=self.llm, + embeddings=self.embedding, + ) + except (ValueError, ValidationError) as e: + raise self._handle_evaluation_error(e) from None + + return {"result": result} + + def _process_documents(self, documents: Union[List[Union[Document, str]], None]) -> Union[List[str], None]: + """Process and validate input documents. + + :param documents: List of Documents or strings to process + :return: List of document contents as strings or None """ - InputConverters.validate_input_parameters(self.metric, self.descriptor.input_parameters, inputs) - converted_inputs: List[Dict[str, str]] = list(self.descriptor.input_converter(**inputs)) # type: ignore + if documents: + first_type = type(documents[0]) + if first_type is Document: + if not all(isinstance(doc, Document) for doc in documents): + error_message = "All elements in documents list must be of type Document." + raise ValueError(error_message) + return [doc.content for doc in documents] # type: ignore[union-attr] + + if first_type is str: + if not all(isinstance(doc, str) for doc in documents): + error_message = "All elements in documents list must be strings." + raise ValueError(error_message) + return documents + error_message = "Unsupported type in documents list." + raise ValueError(error_message) + return documents - dataset = Dataset.from_list(converted_inputs) - results = self._backend_callable(dataset=dataset, metric=self._backend_metric) + def _process_response(self, response: Optional[Union[List[ChatMessage], str]]) -> Union[str, None]: + """Process response into expected format. 
- OutputConverters.validate_outputs(results) - converted_results = [ - [result.to_dict()] for result in self.descriptor.output_converter(results, self.metric, self.metric_params) - ] + :param response: Response to process + :return: None or Processed response string + """ + if isinstance(response, list): # Check if response is a list + if all(isinstance(item, ChatMessage) for item in response): + return response[0]._content[0].text + return None + elif isinstance(response, str): + return response + return response - return {"results": converted_results} + def _handle_conversion_error(self, error: Exception): + """Handle conversion errors with improved messages. - def to_dict(self) -> Dict[str, Any]: + :param error: Original error """ - Serializes the component to a dictionary. + if isinstance(error, ValidationError): + field_mapping = { + "user_input": "query", + "retrieved_contexts": "documents", + } + for err in error.errors(): + field = err["loc"][0] + haystack_field = field_mapping.get(field, field) + expected_type = self.run.__annotations__.get(haystack_field) + type_desc = self._get_expected_type_description(expected_type) + actual_type = type(err["input"]).__name__ + example = self._get_example_input(haystack_field) + error_message = ( + f"Validation error occurred while running RagasEvaluator Component:\n" + f"The '{haystack_field}' field expected '{type_desc}', " + f"but got '{actual_type}'.\n" + f"Hint: Provide {example}" + ) + raise ValueError(error_message) + + def _handle_evaluation_error(self, error: Exception): + error_message = str(error) + columns_match = re.search(r"additional columns \[(.*?)\]", error_message) + field_mapping = { + "user_input": "query", + "retrieved_contexts": "documents", + } + if columns_match: + columns_str = columns_match.group(1) + columns = [col.strip().strip("'") for col in columns_str.split(",")] - :returns: - Dictionary with serialized data. - :raises DeserializationError: - If the component cannot be serialized. + mapped_columns = [field_mapping.get(col, col) for col in columns] + updated_columns_str = "[" + ", ".join(f"'{col}'" for col in mapped_columns) + "]" + + # Update the list of columns in the error message + updated_error_message = error_message.replace( + columns_match.group(0), f"additional columns {updated_columns_str}" + ) + raise ValueError(updated_error_message) + + def _get_expected_type_description(self, expected_type) -> str: + """Helper method to get a description of the expected type.""" + if get_origin(expected_type) is Union: + expected_types = [getattr(t, "__name__", str(t)) for t in get_args(expected_type)] + return f"one of {', '.join(expected_types)}" + elif get_origin(expected_type) is list: + expected_item_type = get_args(expected_type)[0] + item_type_name = getattr(expected_item_type, "__name__", str(expected_item_type)) + return f"a list of {item_type_name}" + elif get_origin(expected_type) is dict: + key_type, value_type = get_args(expected_type) + key_type_name = getattr(key_type, "__name__", str(key_type)) + value_type_name = getattr(value_type, "__name__", str(value_type)) + return f"a dictionary with keys of type {key_type_name} and values of type {value_type_name}" + else: + # Handle non-generic types or unknown types gracefully + return getattr(expected_type, "__name__", str(expected_type)) + + def _get_example_input(self, field: str) -> str: """ + Helper method to get an example input based on the field.
- def check_serializable(obj: Any): - try: - json.dumps(obj) - return True - except (TypeError, OverflowError): - return False + :param field: Arguement used to make SingleTurnSample. + :returns: Example usage for the field. + """ + examples = { + "query": "A string query like 'Question?'", + "documents": "[Document(content='Example content')]", + "reference_contexts": "['Example string 1', 'Example string 2']", + "response": "ChatMessage(_content='Hi', _role='assistant')", + "multi_responses": "['Response 1', 'Response 2']", + "reference": "'A reference string'", + "rubrics": "{'score1': 'high_similarity'}", + } + return examples.get(field, "An appropriate value based on the field's type") - if not check_serializable(self.metric_params): - msg = "Ragas evaluator cannot serialize the metric parameters" - raise DeserializationError(msg) + def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + :returns: Dictionary with serialized data. + :raises DeserializationError: If the component cannot be serialized. + """ return default_to_dict( self, - metric=self.metric, - metric_params=self.metric_params, + ragas_metrics=self.metrics, + evaluator_llm=self.llm, + evaluator_embedding=self.embedding, ) @classmethod @@ -163,9 +293,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "RagasEvaluator": """ Deserializes the component from a dictionary. - :param data: - Dictionary to deserialize from. - :returns: - Deserialized component. + :param data: Dictionary to deserialize from. + :returns: Deserialized component. """ return default_from_dict(cls, data) diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py deleted file mode 100644 index 5d6ed16bc..000000000 --- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py +++ /dev/null @@ -1,333 +0,0 @@ -import dataclasses -import inspect -from dataclasses import dataclass -from enum import Enum -from typing import Any, Callable, Dict, Iterable, List, Optional, Type, Union - -from ragas.evaluation import Result -from ragas.metrics import ( # type: ignore - AnswerCorrectness, # type: ignore - AnswerRelevancy, # type: ignore - AnswerSimilarity, # type: ignore - AspectCritique, # type: ignore - ContextPrecision, # type: ignore - ContextRecall, # type: ignore - ContextUtilization, # type: ignore - Faithfulness, # type: ignore -) -from ragas.metrics.base import Metric - - -class RagasBaseEnum(Enum): - """ - Base functionality for a Ragas enum. - """ - - def __str__(self): - return self.value - - @classmethod - def from_str(cls, string: str) -> "RagasMetric": - """ - Create a metric type from a string. - - :param string: - The string to convert. - :returns: - The metric. - """ - enum_map = {e.value: e for e in RagasMetric} - metric = enum_map.get(string) - if metric is None: - msg = f"Unknown Ragas metric '{string}'. Supported metrics: {list(enum_map.keys())}" - raise ValueError(msg) - return metric - - -class RagasMetric(RagasBaseEnum): - """ - Metrics supported by Ragas. 
- """ - - #: Answer correctness.\ - #: Inputs - `questions: List[str], responses: List[str], ground_truths: List[str]`\ - #: Parameters - `weights: Tuple[float, float]` - ANSWER_CORRECTNESS = "answer_correctness" - - #: Faithfulness.\ - #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]` - FAITHFULNESS = "faithfulness" - - #: Answer similarity.\ - #: Inputs - `responses: List[str], ground_truths: List[str]`\ - #: Parameters - `threshold: float` - ANSWER_SIMILARITY = "answer_similarity" - - #: Context precision.\ - #: Inputs - `questions: List[str], contexts: List[List[str]], ground_truths: List[str]` - CONTEXT_PRECISION = "context_precision" - - #: Context utilization. - #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`\ - CONTEXT_UTILIZATION = "context_utilization" - - #: Context recall. - #: Inputs - `questions: List[str], contexts: List[List[str]], ground_truths: List[str]`\ - CONTEXT_RECALL = "context_recall" - - #: Aspect critique. - #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`\ - #: Parameters - `name: str, definition: str, strictness: int` - ASPECT_CRITIQUE = "aspect_critique" - - #: Answer relevancy.\ - #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`\ - #: Parameters - `strictness: int` - ANSWER_RELEVANCY = "answer_relevancy" - - -@dataclass(frozen=True) -class MetricResult: - """ - Result of a metric evaluation. - - :param name: - The name of the metric. - :param score: - The score of the metric. - """ - - name: str - score: float - - def to_dict(self): - return dataclasses.asdict(self) - - -@dataclass(frozen=True) -class MetricDescriptor: - """ - Descriptor for a metric. - - :param metric: - The metric. - :param backend: - The associated Ragas metric class. - :param input_parameters: - Parameters accepted by the metric. This is used - to set the input types of the evaluator component. - :param input_converter: - Callable that converts input parameters to the Ragas input format. - :param output_converter: - Callable that converts the Ragas output format to our output format. - Accepts a single output parameter and returns a list of results derived from it. - :param init_parameters: - Additional parameters that are allowed to be passed to the metric class during initialization. 
- """ - - metric: RagasMetric - backend: Type[Metric] - input_parameters: Dict[str, Type] - input_converter: Callable[[Any], Iterable[Dict[str, str]]] - output_converter: Callable[[Result, RagasMetric, Optional[Dict[str, Any]]], List[MetricResult]] - init_parameters: Optional[List[str]] = None - - @classmethod - def new( - cls, - metric: RagasMetric, - backend: Type[Metric], - input_converter: Callable[[Any], Iterable[Dict[str, str]]], - output_converter: Optional[ - Callable[[Result, RagasMetric, Optional[Dict[str, Any]]], List[MetricResult]] - ] = None, - *, - init_parameters: Optional[List[str]] = None, - ) -> "MetricDescriptor": - input_converter_signature = inspect.signature(input_converter) - input_parameters = {} - for name, param in input_converter_signature.parameters.items(): - if name in ("cls", "self"): - continue - elif param.kind not in (inspect.Parameter.KEYWORD_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD): - continue - input_parameters[name] = param.annotation - - return cls( - metric=metric, - backend=backend, - input_parameters=input_parameters, - input_converter=input_converter, - output_converter=output_converter if output_converter is not None else OutputConverters.default, - init_parameters=init_parameters, - ) - - -class InputConverters: - """ - Converters for input parameters. - - The signature of the converter functions serves as the ground-truth of the - expected input parameters of a given metric. They are also responsible for validating - the input parameters and converting them to the format expected by Ragas. - """ - - @staticmethod - def _validate_input_elements(**kwargs) -> None: - for k, collection in kwargs.items(): - if not isinstance(collection, list): - msg = ( - f"Ragas evaluator expected input '{k}' to be a collection of type 'list', " - f"got '{type(collection).__name__}' instead" - ) - raise ValueError(msg) - elif not all(isinstance(x, str) for x in collection) and not all(isinstance(x, list) for x in collection): - msg = f"Ragas evaluator expects inputs to be of type 'str' or 'list' in '{k}'" - raise ValueError(msg) - - same_length = len({len(x) for x in kwargs.values()}) == 1 - if not same_length: - msg = f"Mismatching counts in the following inputs: {({k: len(v) for k, v in kwargs.items()})}" - raise ValueError(msg) - - @staticmethod - def validate_input_parameters( - metric: RagasMetric, - expected: Dict[str, Any], - received: Dict[str, Any], - ) -> None: - for param, _ in expected.items(): - if param not in received: - msg = f"Ragas evaluator expected input parameter '{param}' for metric '{metric}'" - raise ValueError(msg) - - @staticmethod - def question_context_response( - questions: List[str], contexts: List[List[str]], responses: List[str] - ) -> Iterable[Dict[str, Union[str, List[str]]]]: - InputConverters._validate_input_elements(questions=questions, contexts=contexts, responses=responses) - for q, c, r in zip(questions, contexts, responses): # type: ignore - yield {"question": q, "contexts": c, "answer": r} - - @staticmethod - def question_context_ground_truth( - questions: List[str], - contexts: List[List[str]], - ground_truths: List[str], - ) -> Iterable[Dict[str, Union[str, List[str]]]]: - InputConverters._validate_input_elements(questions=questions, contexts=contexts, ground_truths=ground_truths) - for q, c, gt in zip(questions, contexts, ground_truths): # type: ignore - yield {"question": q, "contexts": c, "ground_truth": gt} - - @staticmethod - def question_context( - questions: List[str], - contexts: List[List[str]], - ) -> 
Iterable[Dict[str, Union[str, List[str]]]]: - InputConverters._validate_input_elements(questions=questions, contexts=contexts) - for q, c in zip(questions, contexts): # type: ignore - yield {"question": q, "contexts": c} - - @staticmethod - def response_ground_truth( - responses: List[str], - ground_truths: List[str], - ) -> Iterable[Dict[str, str]]: - InputConverters._validate_input_elements(responses=responses, ground_truths=ground_truths) - for r, gt in zip(responses, ground_truths): # type: ignore - yield {"answer": r, "ground_truth": gt} - - @staticmethod - def question_response_ground_truth( - questions: List[str], - responses: List[str], - ground_truths: List[str], - ) -> Iterable[Dict[str, str]]: - InputConverters._validate_input_elements(questions=questions, ground_truths=ground_truths, responses=responses) - for q, r, gt in zip(questions, responses, ground_truths): # type: ignore - yield {"question": q, "answer": r, "ground_truth": gt} - - -class OutputConverters: - """ - Converters for results returned by Ragas. - - They are responsible for converting the results to our output format. - """ - - @staticmethod - def validate_outputs(outputs: Result) -> None: - if not isinstance(outputs, Result): - msg = f"Expected response from Ragas evaluator to be a 'Result', got '{type(outputs).__name__}'" - raise ValueError(msg) - - @staticmethod - def _extract_default_results(output: Result, metric_name: str) -> List[MetricResult]: - try: - output_scores: List[Dict[str, float]] = output.scores.to_list() - return [MetricResult(name=metric_name, score=metric_dict[metric_name]) for metric_dict in output_scores] - except KeyError as e: - msg = f"Ragas evaluator did not return an expected output for metric '{e.args[0]}'" - raise ValueError(msg) from e - - @staticmethod - def default(output: Result, metric: RagasMetric, _: Optional[Dict]) -> List[MetricResult]: - metric_name = metric.value - return OutputConverters._extract_default_results(output, metric_name) - - @staticmethod - def aspect_critique(output: Result, _: RagasMetric, metric_params: Optional[Dict[str, Any]]) -> List[MetricResult]: - if metric_params is None: - msg = "Aspect critique metric requires metric parameters" - raise ValueError(msg) - metric_name = metric_params["name"] - return OutputConverters._extract_default_results(output, metric_name) - - -METRIC_DESCRIPTORS = { - RagasMetric.ANSWER_CORRECTNESS: MetricDescriptor.new( - RagasMetric.ANSWER_CORRECTNESS, - AnswerCorrectness, - InputConverters.question_response_ground_truth, # type: ignore - init_parameters=["weights"], - ), - RagasMetric.FAITHFULNESS: MetricDescriptor.new( - RagasMetric.FAITHFULNESS, - Faithfulness, - InputConverters.question_context_response, # type: ignore - ), - RagasMetric.ANSWER_SIMILARITY: MetricDescriptor.new( - RagasMetric.ANSWER_SIMILARITY, - AnswerSimilarity, - InputConverters.response_ground_truth, # type: ignore - init_parameters=["threshold"], - ), - RagasMetric.CONTEXT_PRECISION: MetricDescriptor.new( - RagasMetric.CONTEXT_PRECISION, - ContextPrecision, - InputConverters.question_context_ground_truth, # type: ignore - ), - RagasMetric.CONTEXT_UTILIZATION: MetricDescriptor.new( - RagasMetric.CONTEXT_UTILIZATION, - ContextUtilization, - InputConverters.question_context_response, # type: ignore - ), - RagasMetric.CONTEXT_RECALL: MetricDescriptor.new( - RagasMetric.CONTEXT_RECALL, - ContextRecall, - InputConverters.question_context_ground_truth, # type: ignore - ), - RagasMetric.ASPECT_CRITIQUE: MetricDescriptor.new( - RagasMetric.ASPECT_CRITIQUE, - 
AspectCritique, - InputConverters.question_context_response, # type: ignore - OutputConverters.aspect_critique, - init_parameters=["name", "definition", "strictness"], - ), - RagasMetric.ANSWER_RELEVANCY: MetricDescriptor.new( - RagasMetric.ANSWER_RELEVANCY, - AnswerRelevancy, - InputConverters.question_context_response, # type: ignore - init_parameters=["strictness"], - ), -} diff --git a/integrations/ragas/tests/test_evaluator.py b/integrations/ragas/tests/test_evaluator.py index 0f847ed0b..2b1ada27d 100644 --- a/integrations/ragas/tests/test_evaluator.py +++ b/integrations/ragas/tests/test_evaluator.py @@ -1,350 +1,155 @@ -import copy -import os -from dataclasses import dataclass - import pytest -from datasets import Dataset -from haystack import DeserializationError -from ragas.evaluation import Result -from ragas.metrics.base import Metric - -from haystack_integrations.components.evaluators.ragas import RagasEvaluator, RagasMetric - -DEFAULT_QUESTIONS = [ - "Which is the most popular global sport?", - "Who created the Python language?", -] -DEFAULT_CONTEXTS = [ - [ - "The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact.", - "Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people.", - ], - [ - "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects." - ], -] -DEFAULT_RESPONSES = [ - "Football is the most popular sport with around 4 billion followers worldwide", - "Python language was created by Guido van Rossum.", -] -DEFAULT_GROUND_TRUTHS = [ - "Football (Soccer) is the most popular sport in the world with almost 4 billion fans around the world.", - "Guido van Rossum is the creator of the Python programming language.", -] - - -@dataclass(frozen=True) -class Unserializable: - something: str - - -# Only returns results for the passed metrics. 
-class MockBackend: - def __init__(self, metric: RagasMetric) -> None: - self.metric = metric - - def evaluate(self, _, metric: Metric, **kwargs): - output_map = { - RagasMetric.ANSWER_CORRECTNESS: Result(scores=Dataset.from_list([{"answer_correctness": 0.5}])), - RagasMetric.FAITHFULNESS: Result(scores=Dataset.from_list([{"faithfulness": 1.0}])), - RagasMetric.ANSWER_SIMILARITY: Result(scores=Dataset.from_list([{"answer_similarity": 1.0}])), - RagasMetric.CONTEXT_PRECISION: Result(scores=Dataset.from_list([{"context_precision": 0.5}])), - RagasMetric.CONTEXT_UTILIZATION: Result(scores=Dataset.from_list([{"context_utilization": 1.0}])), - RagasMetric.CONTEXT_RECALL: Result(scores=Dataset.from_list([{"context_recall": 0.9}])), - RagasMetric.ASPECT_CRITIQUE: Result(scores=Dataset.from_list([{"harmfulness": 1.0}])), - RagasMetric.ANSWER_RELEVANCY: Result(scores=Dataset.from_list([{"answer_relevancy": 0.4}])), - } - assert isinstance(metric, Metric) - return output_map[self.metric] - - -@pytest.mark.parametrize( - "metric, init_params", - [ - (RagasMetric.ANSWER_CORRECTNESS, {"weights": [0.5, 0.5]}), - (RagasMetric.FAITHFULNESS, None), - (RagasMetric.ANSWER_SIMILARITY, {"threshold": 0.5}), - (RagasMetric.CONTEXT_PRECISION, None), - (RagasMetric.CONTEXT_UTILIZATION, None), - (RagasMetric.CONTEXT_RECALL, None), - ( - RagasMetric.ASPECT_CRITIQUE, - { - "name": "harmfulness", - "definition": "Does the submission causes or has the potential to " - "cause harm to individuals, groups, or society at " - "large?", - }, - ), - (RagasMetric.ANSWER_RELEVANCY, {"strictness": 2}), - ], -) -def test_evaluator_valid_metric_init_params(metric, init_params): - eval = RagasEvaluator( - metric, - metric_params=init_params, +from unittest import mock +from unittest.mock import MagicMock, patch +from ragas.metrics import Metric, Faithfulness +from ragas.llms import BaseRagasLLM +from ragas.embeddings import BaseRagasEmbeddings +from ragas.dataset_schema import EvaluationResult +from haystack import Document +from haystack.dataclasses import ChatMessage +from haystack_integrations.components.evaluators.ragas import RagasEvaluator + + +# Fixture to mock the 'run' method of RagasEvaluator +@pytest.fixture +def mock_run(): + with mock.patch.object(RagasEvaluator, 'run') as mock_method: + yield mock_method + + +def test_successful_initialization(): + """Test RagasEvaluator initializes correctly with valid inputs.""" + valid_metrics = [MagicMock(spec=Metric) for _ in range(3)] + valid_llm = MagicMock(spec=BaseRagasLLM) + valid_embedding = MagicMock(spec=BaseRagasEmbeddings) + + evaluator = RagasEvaluator( + ragas_metrics=valid_metrics, + evaluator_llm=valid_llm, + evaluator_embedding=valid_embedding, + ) + assert evaluator.metrics == valid_metrics + assert evaluator.llm == valid_llm + assert evaluator.embedding == valid_embedding + + +def test_invalid_metrics(): + """Test RagasEvaluator raises TypeError for invalid metrics.""" + invalid_metric = "not_a_metric" + + with pytest.raises(TypeError, match="All items in ragas_metrics must be instances of Metric class."): + RagasEvaluator(ragas_metrics=[invalid_metric]) + + +def test_invalid_llm(): + """Test RagasEvaluator raises TypeError for invalid evaluator_llm.""" + valid_metric = MagicMock(spec=Metric) + invalid_llm = "not_a_llm" + + with pytest.raises(TypeError, match="Expected evaluator_llm to be BaseRagasLLM or LangchainLLM"): + RagasEvaluator(ragas_metrics=[valid_metric], evaluator_llm=invalid_llm) + + +def test_invalid_embedding(): + """Test RagasEvaluator raises 
TypeError for invalid evaluator_embedding.""" + valid_metric = MagicMock(spec=Metric) + invalid_embedding = "not_an_embedding" + + with pytest.raises( + TypeError, match="Expected evaluator_embedding to be BaseRagasEmbeddings or LangchainEmbeddings" + ): + RagasEvaluator(ragas_metrics=[valid_metric], evaluator_embedding=invalid_embedding) + + +def test_optional_llm_and_embeddings(): + """Test RagasEvaluator initializes correctly with None for optional parameters.""" + valid_metric = MagicMock(spec=Metric) + + evaluator = RagasEvaluator( + ragas_metrics=[valid_metric], + evaluator_llm=None, + evaluator_embedding=None, ) - assert eval.metric_params == init_params - - msg = f"Invalid init parameters for Ragas metric '{metric}'. " - with pytest.raises(ValueError, match=msg): - RagasEvaluator( - metric, - metric_params={"invalid_param": "invalid_value"}, - ) - - -@pytest.mark.parametrize( - "metric", - [ - RagasMetric.ANSWER_CORRECTNESS, - RagasMetric.ANSWER_SIMILARITY, - RagasMetric.ASPECT_CRITIQUE, - RagasMetric.ANSWER_RELEVANCY, - ], -) -def test_evaluator_fails_with_no_metric_init_params(metric): - msg = f"Ragas metric '{metric}' expected init parameters but got none" - with pytest.raises(ValueError, match=msg): - RagasEvaluator( - metric, - metric_params=None, - ) - - -def test_evaluator_serde(): - init_params = { - "metric": RagasMetric.ASPECT_CRITIQUE, - "metric_params": { - "name": "harmfulness", - "definition": "Does the submission causes or has the potential to " - "cause harm to individuals, groups, or society at " - "large?", - }, - } - eval = RagasEvaluator(**init_params) - serde_data = eval.to_dict() - new_eval = RagasEvaluator.from_dict(serde_data) - - assert eval.metric == new_eval.metric - assert eval.metric_params == new_eval.metric_params - - with pytest.raises(DeserializationError, match=r"cannot serialize the metric parameters"): - init_params3 = copy.deepcopy(init_params) - init_params3["metric_params"]["name"] = Unserializable("") - eval = RagasEvaluator(**init_params3) - eval.to_dict() - - -@pytest.mark.parametrize( - "current_metric, inputs, params", - [ - ( - RagasMetric.ANSWER_CORRECTNESS, - {"questions": [], "responses": [], "ground_truths": []}, - {"weights": [0.5, 0.5]}, - ), - (RagasMetric.FAITHFULNESS, {"questions": [], "contexts": [], "responses": []}, None), - (RagasMetric.ANSWER_SIMILARITY, {"responses": [], "ground_truths": []}, {"threshold": 0.5}), - (RagasMetric.CONTEXT_PRECISION, {"questions": [], "contexts": [], "ground_truths": []}, None), - (RagasMetric.CONTEXT_UTILIZATION, {"questions": [], "contexts": [], "responses": []}, None), - (RagasMetric.CONTEXT_RECALL, {"questions": [], "contexts": [], "ground_truths": []}, None), - ( - RagasMetric.ASPECT_CRITIQUE, - {"questions": [], "contexts": [], "responses": []}, - { - "name": "harmfulness", - "definition": "Does the submission causes or has the potential to " - "cause harm to individuals, groups, or society at " - "large?", - }, - ), - (RagasMetric.ANSWER_RELEVANCY, {"questions": [], "contexts": [], "responses": []}, {"strictness": 2}), - ], -) -def test_evaluator_valid_inputs(current_metric, inputs, params): - init_params = { - "metric": current_metric, - "metric_params": params, - } - eval = RagasEvaluator(**init_params) - eval._backend_callable = lambda dataset, metric: MockBackend(current_metric).evaluate(dataset, metric) - output = eval.run(**inputs) - - -@pytest.mark.parametrize( - "current_metric, inputs, error_string, params", - [ - ( - RagasMetric.FAITHFULNESS, - {"questions": [1], "contexts": 
[2], "responses": [3]}, - "expects inputs to be of type 'str'", - None, - ), - ( - RagasMetric.ANSWER_RELEVANCY, - {"questions": [""], "responses": [], "contexts": []}, - "Mismatching counts ", - {"strictness": 2}, - ), - (RagasMetric.ANSWER_RELEVANCY, {"responses": []}, "expected input parameter ", {"strictness": 2}), - ], -) -def test_evaluator_invalid_inputs(current_metric, inputs, error_string, params): - with pytest.raises(ValueError, match=error_string): - init_params = { - "metric": current_metric, - "metric_params": params, - } - eval = RagasEvaluator(**init_params) - eval._backend_callable = lambda dataset, metric: MockBackend(current_metric).evaluate(dataset, metric) - output = eval.run(**inputs) - - -# This test validates the expected outputs of the evaluator. -# Each output is parameterized as a list of tuples, where each tuple is (name, score). -@pytest.mark.parametrize( - "current_metric, inputs, expected_outputs, metric_params", - [ - ( - RagasMetric.ANSWER_CORRECTNESS, - {"questions": ["q1"], "responses": ["r1"], "ground_truths": ["gt1"]}, - [[(None, 0.5)]], - {"weights": [0.5, 0.5]}, - ), - ( - RagasMetric.FAITHFULNESS, - {"questions": ["q2"], "contexts": [["c2"]], "responses": ["r2"]}, - [[(None, 1.0)]], - None, - ), - ( - RagasMetric.ANSWER_SIMILARITY, - {"responses": ["r3"], "ground_truths": ["gt3"]}, - [[(None, 1.0)]], - {"threshold": 0.5}, - ), - ( - RagasMetric.CONTEXT_PRECISION, - {"questions": ["q4"], "contexts": [["c4"]], "ground_truths": ["gt44"]}, - [[(None, 0.5)]], - None, - ), - ( - RagasMetric.CONTEXT_UTILIZATION, - {"questions": ["q5"], "contexts": [["c5"]], "responses": ["r5"]}, - [[(None, 1.0)]], - None, - ), - ( - RagasMetric.CONTEXT_RECALL, - {"questions": ["q6"], "contexts": [["c6"]], "ground_truths": ["gt6"]}, - [[(None, 0.9)]], - None, - ), - ( - RagasMetric.ASPECT_CRITIQUE, - {"questions": ["q7"], "contexts": [["c7"]], "responses": ["r7"]}, - [[("harmfulness", 1.0)]], - { - "name": "harmfulness", - "definition": "Does the submission causes or has the potential to " - "cause harm to individuals, groups, or society at " - "large?", - }, - ), - ( - RagasMetric.ANSWER_RELEVANCY, - {"questions": ["q9"], "contexts": [["c9"]], "responses": ["r9"]}, - [[(None, 0.4)]], - {"strictness": 2}, - ), - ], -) -def test_evaluator_outputs(current_metric, inputs, expected_outputs, metric_params): - init_params = { - "metric": current_metric, - "metric_params": metric_params, - } - eval = RagasEvaluator(**init_params) - eval._backend_callable = lambda dataset, metric: MockBackend(current_metric).evaluate(dataset, metric) - results = eval.run(**inputs)["results"] - - assert type(results) == type(expected_outputs) - assert len(results) == len(expected_outputs) - - for r, o in zip(results, expected_outputs): - assert len(r) == len(o) - - expected = {(name if name is not None else str(current_metric), score) for name, score in o} - got = {(x["name"], x["score"]) for x in r} - assert got == expected - - -# This integration test validates the evaluator by running it against the -# OpenAI API. It is parameterized by the metric, the inputs to the evaluator -# and the metric parameters. 
-@pytest.mark.asyncio
-@pytest.mark.skipif("OPENAI_API_KEY" not in os.environ, reason="OPENAI_API_KEY not set")
-@pytest.mark.parametrize(
-    "metric, inputs, metric_params",
-    [
-        (
-            RagasMetric.ANSWER_CORRECTNESS,
-            {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES, "ground_truths": DEFAULT_GROUND_TRUTHS},
-            {"weights": [0.5, 0.5]},
-        ),
-        (
-            RagasMetric.FAITHFULNESS,
-            {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES},
-            None,
-        ),
-        (
-            RagasMetric.ANSWER_SIMILARITY,
-            {"responses": DEFAULT_QUESTIONS, "ground_truths": DEFAULT_GROUND_TRUTHS},
-            {"threshold": 0.5},
-        ),
-        (
-            RagasMetric.CONTEXT_PRECISION,
-            {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "ground_truths": DEFAULT_GROUND_TRUTHS},
-            None,
-        ),
-        (
-            RagasMetric.CONTEXT_UTILIZATION,
-            {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES},
-            None,
-        ),
-        (
-            RagasMetric.CONTEXT_RECALL,
-            {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "ground_truths": DEFAULT_GROUND_TRUTHS},
-            None,
-        ),
-        (
-            RagasMetric.ASPECT_CRITIQUE,
-            {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES},
-            {
-                "name": "harmfulness",
-                "definition": "Does the submission causes or has the potential to "
-                "cause harm to individuals, groups, or society at "
-                "large?",
-            },
-        ),
-        (
-            RagasMetric.ANSWER_RELEVANCY,
-            {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES},
-            {"strictness": 2},
-        ),
-    ],
-)
-def test_integration_run(metric, inputs, metric_params):
-    init_params = {
-        "metric": metric,
-        "metric_params": metric_params,
-    }
-    eval = RagasEvaluator(**init_params)
-    output = eval.run(**inputs)
-
-    assert isinstance(output, dict)
-    assert len(output) == 1
-    assert "results" in output
-    assert len(output["results"]) == len(next(iter(inputs.values())))
+    assert evaluator.metrics == [valid_metric]
+    assert evaluator.llm is None
+    assert evaluator.embedding is None
+
+
+def test_missing_columns_in_dataset():
+    """Test if RagasEvaluator raises a ValueError when required columns are missing for a specific metric."""
+    evaluator = RagasEvaluator(ragas_metrics=[Faithfulness()])
+    query = "Which is the most popular global sport?"
+    reference = "Football is the most popular sport with around 4 billion followers worldwide"
+    response = "Football is the most popular sport in the world"
+
+    with pytest.raises(ValueError) as exc_info:
+        evaluator.run(query=query, reference=reference, response=response)
+
+    expected_error_message = "The metric [faithfulness] that is used requires the following additional columns ['documents'] to be present in the dataset."
+    assert expected_error_message == str(exc_info.value)
+
+
+def test_run_invalid_query_type():
+    """Test RagasEvaluator raises ValueError for invalid query type."""
+    evaluator = RagasEvaluator(ragas_metrics=[Faithfulness()])
+    query = ["Invalid query type"]  # Should be str
+    documents = ["Football is the most popular sport."]
+    reference = ChatMessage(_content="Football is the most popular sport.", _role="human")
+    response = "Football is the most popular sport in the world"
+
+    with pytest.raises(ValueError, match="The 'query' field expected .* but got 'list'"):
+        evaluator.run(query=query, documents=documents, reference=reference, response=response)
+
+
+def test_run_invalid_rubrics_type():
+    """Test RagasEvaluator raises ValueError for invalid rubrics type."""
+    evaluator = RagasEvaluator(ragas_metrics=[Faithfulness()])
+    query = "Which is the most popular global sport?"
+    response = "Football is the most popular sport in the world"
+    documents = ["Football is the most popular sport."]
+    rubrics = ["score_1"]  # Should be dict
+
+    with pytest.raises(ValueError, match="The 'rubrics' field expected 'one of Dict, NoneType', but got 'list'."):
+        evaluator.run(query=query, rubrics=rubrics, response=response, documents=documents)
+
+
+def test_run_invalid_documents_type():
+    """Test RagasEvaluator raises ValueError for invalid document types."""
+    evaluator = RagasEvaluator(ragas_metrics=[Faithfulness()])
+    query = "Which is the most popular global sport?"
+    documents = [123, ["Invalid document"]]  # Invalid types
+
+    with pytest.raises(ValueError, match="Unsupported type in documents list."):
+        evaluator.run(query=query, documents=documents)
+
+
+@patch.object(RagasEvaluator, 'run')
+def test_run_valid_input(mock_run):
+    """Test RagasEvaluator runs successfully with valid input."""
+    mock_run.return_value = {"result": {"score": MagicMock(), "details": MagicMock(spec=EvaluationResult)}}
+    evaluator = RagasEvaluator(ragas_metrics=[MagicMock(Metric)])
+
+    query = "Which is the most popular global sport?"
+    response = "Football is the most popular sport in the world"
+    documents = [
+        Document(content="Football is the world's most popular sport."),
+        Document(content="Football has over 4 billion followers."),
+    ]
+    reference_contexts = ["Football is a globally popular sport."]
+    multi_responses = ["Football is considered the most popular sport."]
+    reference = "Football is the most popular sport with around 4 billion followers worldwide"
+    rubrics = {"accuracy": "high", "relevance": "high"}
+
+    output = evaluator.run(
+        query=query,
+        response=response,
+        documents=documents,
+        reference_contexts=reference_contexts,
+        multi_responses=multi_responses,
+        reference=reference,
+        rubrics=rubrics,
+    )
+
+    assert "result" in output
+    assert isinstance(output["result"], dict)
+    assert "score" in output["result"]
diff --git a/integrations/ragas/tests/test_metrics.py b/integrations/ragas/tests/test_metrics.py
deleted file mode 100644
index 7447689fb..000000000
--- a/integrations/ragas/tests/test_metrics.py
+++ /dev/null
@@ -1,11 +0,0 @@
-import pytest
-
-from haystack_integrations.components.evaluators.ragas import RagasMetric
-
-
-def test_ragas_metric():
-    for e in RagasMetric:
-        assert e == RagasMetric.from_str(e.value)
-
-    with pytest.raises(ValueError, match="Unknown Ragas metric"):
-        RagasMetric.from_str("smugness")