updated role adherence
penguine-ip committed Feb 3, 2025
1 parent 4ec13d3 commit e8d1e2d
Showing 4 changed files with 57 additions and 50 deletions.
75 changes: 36 additions & 39 deletions deepeval/metrics/role_adherence/role_adherence.py
@@ -2,7 +2,7 @@

from deepeval.metrics import BaseConversationalMetric
from deepeval.metrics.role_adherence.schema import (
OutOfCharacterResponseIndicies,
OutOfCharacterResponseVerdicts,
)
from deepeval.metrics.role_adherence.template import RoleAdherenceTemplate
from deepeval.metrics.utils import (
@@ -20,7 +20,7 @@
ConversationalTestCase,
)
from deepeval.utils import get_or_create_event_loop, prettify_list
from deepeval.metrics.conversation_relevancy.schema import *
from deepeval.metrics.role_adherence.schema import *

required_params: List[LLMTestCaseParams] = [
LLMTestCaseParams.INPUT,
@@ -64,10 +64,10 @@ def measure(
self.turns: List[Dict[str, str]] = format_turns(
test_case.turns, required_params
)
self.out_of_character_responses = (
self._extract_out_of_character_responses(
test_case.turns, test_case.chatbot_role
)
self.out_of_character_verdicts: (
OutOfCharacterResponseVerdicts
) = self._extract_out_of_character_verdicts(
test_case.turns, test_case.chatbot_role
)
self.score = self._calculate_score()
self.reason = self._generate_reason(role=test_case.chatbot_role)
@@ -76,7 +76,7 @@ def measure(
self,
steps=[
f"Chatbot Role:\n{test_case.chatbot_role}",
f"Out-of-Character Turn Response(s):\n{prettify_list(self.out_of_character_responses)}",
f"Out-of-Character Turn Response(s):\n{prettify_list(self.out_of_character_verdicts.verdicts)}",
f"Score: {self.score}\nReason: {self.reason}",
],
)
@@ -91,16 +91,17 @@ async def a_measure(
test_case, required_params, self, require_chatbot_role=True
)

print(_show_indicator, "@")
self.evaluation_cost = 0 if self.using_native_model else None
with metric_progress_indicator(
self, async_mode=True, _show_indicator=_show_indicator
):
self.turns: List[Dict[str, str]] = format_turns(
test_case.turns, required_params
)
self.out_of_character_responses = (
self.out_of_character_verdicts = (
await (
self._a_extract_out_of_character_responses(
self._a_extract_out_of_character_verdicts(
test_case.turns, test_case.chatbot_role
)
)
@@ -114,7 +115,7 @@ async def a_measure(
self,
steps=[
f"Chatbot Role:\n{test_case.chatbot_role}",
f"Out-of-Character Turn(s) Response(s):\n{prettify_list(self.out_of_character_responses)}",
f"Out-of-Character Turn(s) Response(s):\n{prettify_list(self.out_of_character_verdicts.verdicts)}",
f"Score: {self.score}\nReason: {self.reason}",
],
)
@@ -124,7 +125,7 @@ async def _a_generate_reason(self, role: str) -> str:
prompt = RoleAdherenceTemplate.generate_reason(
score=self.score,
role=role,
out_of_character_responses=self.out_of_character_responses,
out_of_character_responses=self.out_of_character_verdicts.verdicts,
)
if self.using_native_model:
res, cost = await self.model.a_generate(prompt, schema=Reason)
@@ -143,7 +144,7 @@ def _generate_reason(self, role: str) -> str:
prompt = RoleAdherenceTemplate.generate_reason(
score=self.score,
role=role,
out_of_character_responses=self.out_of_character_responses,
out_of_character_responses=self.out_of_character_verdicts.verdicts,
)
if self.using_native_model:
res, cost = self.model.generate(prompt, schema=Reason)
@@ -158,79 +159,75 @@ def _generate_reason(self, role: str) -> str:
data = trimAndLoadJson(res, self)
return data["reason"]

async def _a_extract_out_of_character_responses(
async def _a_extract_out_of_character_verdicts(
self, llm_test_cases: List[LLMTestCase], role: str
) -> List[str]:
) -> OutOfCharacterResponseVerdicts:
prompt = (
RoleAdherenceTemplate.extract_out_of_character_response_indicies(
RoleAdherenceTemplate.extract_out_of_character_response_verdicts(
turns=self.turns,
role=role,
)
)
if self.using_native_model:
res, cost = await self.model.a_generate(
prompt, schema=OutOfCharacterResponseIndicies
prompt, schema=OutOfCharacterResponseVerdicts
)
self.evaluation_cost += cost
indicies = res.indicies
else:
try:
res: OutOfCharacterResponseIndicies = (
res: OutOfCharacterResponseVerdicts = (
await self.model.a_generate(
prompt, schema=OutOfCharacterResponseIndicies
prompt, schema=OutOfCharacterResponseVerdicts
)
)
indicies = res.indicies
except TypeError:
res = await self.model.a_generate(prompt)
data = trimAndLoadJson(res, self)
indicies = OutOfCharacterResponseIndicies(**data).indicies
res = OutOfCharacterResponseVerdicts(**data)

out_of_character_responses: List[str] = []
for index in indicies:
for verdict in res.verdicts:
try:
out_of_character_responses.append(
index = verdict.index
verdict.actual_output = (
f"{llm_test_cases[index].actual_output} (turn #{index+1})"
)
except:
pass
return out_of_character_responses
return res

def _extract_out_of_character_responses(
def _extract_out_of_character_verdicts(
self, llm_test_cases: List[LLMTestCase], role: str
) -> List[str]:
) -> OutOfCharacterResponseVerdicts:
prompt = (
RoleAdherenceTemplate.extract_out_of_character_response_indicies(
RoleAdherenceTemplate.extract_out_of_character_response_verdicts(
turns=self.turns,
role=role,
)
)
if self.using_native_model:
res, cost = self.model.generate(
prompt, schema=OutOfCharacterResponseIndicies
prompt, schema=OutOfCharacterResponseVerdicts
)
self.evaluation_cost += cost
indicies = res.indicies
else:
try:
res: OutOfCharacterResponseIndicies = self.model.generate(
prompt, schema=OutOfCharacterResponseIndicies
res: OutOfCharacterResponseVerdicts = self.model.generate(
prompt, schema=OutOfCharacterResponseVerdicts
)
indicies = res.indicies
except TypeError:
res = self.model.generate(prompt)
data = trimAndLoadJson(res, self)
indicies = OutOfCharacterResponseIndicies(**data).indicies
res = OutOfCharacterResponseVerdicts(**data)

out_of_character_responses: List[str] = []
for index in indicies:
for verdict in res.verdicts:
try:
out_of_character_responses.append(
index = verdict.index
verdict.actual_output = (
f"{llm_test_cases[index].actual_output} (turn #{index+1})"
)
except:
pass
return out_of_character_responses
return res

def _calculate_score(self) -> float:
number_of_turns = len(self.turns)
@@ -239,7 +236,7 @@ def _calculate_score(self) -> float:

score = (
number_of_turns
- min(len(self.out_of_character_responses), number_of_turns)
- min(len(self.out_of_character_verdicts.verdicts), number_of_turns)
) / number_of_turns
return 0 if self.strict_mode and score < self.threshold else score

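A quick illustrative sketch (not part of the commit; the turn count and verdict count are invented) of the arithmetic in _calculate_score above: the score is simply the fraction of turns whose responses stayed in character.

number_of_turns = 5
# e.g. len(metric.out_of_character_verdicts.verdicts) after measure() has run
out_of_character_count = 2

# Same formula as _calculate_score: clamp the count, then take the in-character fraction
score = (number_of_turns - min(out_of_character_count, number_of_turns)) / number_of_turns
print(score)  # 0.6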
12 changes: 9 additions & 3 deletions deepeval/metrics/role_adherence/schema.py
@@ -1,9 +1,15 @@
from typing import List
from typing import List, Optional
from pydantic import BaseModel


class OutOfCharacterResponseIndicies(BaseModel):
indicies: List[int]
class OutOfCharacterResponseVerdict(BaseModel):
index: int
reason: str
actual_output: Optional[str]


class OutOfCharacterResponseVerdicts(BaseModel):
verdicts: List[OutOfCharacterResponseVerdict]


class Reason(BaseModel):
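A rough, self-contained sketch (not part of the commit) of exercising the new verdict models against the kind of JSON the updated template asks for. The = None default on actual_output is added here only so the field can be omitted when parsing raw model output; in the diff above the metric fills it in afterwards.

from typing import List, Optional
from pydantic import BaseModel

class OutOfCharacterResponseVerdict(BaseModel):
    index: int
    reason: str
    actual_output: Optional[str] = None  # default added for this sketch only

class OutOfCharacterResponseVerdicts(BaseModel):
    verdicts: List[OutOfCharacterResponseVerdict]

# JSON shaped like the example in the updated template
raw = {
    "verdicts": [
        {
            "index": 4,
            "reason": "Claims to be 'the greatest wizard ever' despite the humble-wizard role.",
        }
    ]
}

parsed = OutOfCharacterResponseVerdicts(**raw)
print(parsed.verdicts[0].index)  # 4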
13 changes: 8 additions & 5 deletions deepeval/metrics/role_adherence/template.py
@@ -1,8 +1,8 @@
class RoleAdherenceTemplate:
@staticmethod
def extract_out_of_character_response_indicies(turns, role):
def extract_out_of_character_response_verdicts(turns, role):
return f"""Based on the given list of message exchanges between a user and an LLM chatbot, generate a JSON object to specify which `actual_outputs` did not adhere to the specified chatbot role.
The JSON will have 1 field: "indicies", which is a list of integers specifying the indices of the LLM actual_output/responses that did NOT adhere to the chatbot role.
The JSON will have 1 field: "verdicts", which is a list of verdicts, each specifying the index of and the reason why an LLM actual_output/response did NOT adhere to the chatbot role.
You MUST look at all the messages provided in the list to make an informed judgement on role adherence.
**
@@ -36,12 +36,15 @@ def extract_out_of_character_response_indicies(turns, role):
Example JSON:
{{
"indicies": [4]
"verdicts": {{
"index": 4,
"reason": "The LLM chatbot claims that 'I'm the greatest wizard ever' even though it was explicitly asked to adhere to the role of a humble and doubtful wizard."
}}
}}
===== END OF EXAMPLE ======
In this example, the response at index 4 was selected as it drastically deviates from the character's humble nature and shows extreme arrogance and overconfidence instead.
You DON'T have to provide anything else other than the JSON of "indicies".
You DON'T have to provide anything else other than the JSON of "verdicts".
**
Chatbot Role:
Expand All @@ -55,7 +58,7 @@ def extract_out_of_character_response_indicies(turns, role):

@staticmethod
def generate_reason(score, role, out_of_character_responses):
return f"""Below is a list of LLM chatboat responses that is out of character with respect to the specified chatbot role. It is drawn from a list of messages in a conversation, which you have minimal knowledge of.
return f"""Below is a list of LLM chatbot responses (actual_outputs) that is out of character with respect to the specified chatbot role. It is drawn from a list of messages in a conversation, which you have minimal knowledge of.
Given the role adherence score, which is a 0-1 score indicating how well the chatbot responses have adhered to the given role throughout the conversation, with 1 being the best and 0 the worst, provide a reason by quoting the out-of-character responses to justify the score.
**
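A hedged sketch (not part of the commit) of rendering the renamed prompt. The import path, method name, and signature come from the diff above; the shape of the turn dictionaries produced by format_turns is an assumption here.

from deepeval.metrics.role_adherence.template import RoleAdherenceTemplate

# Assumed turn format: format_turns in the metric produces List[Dict[str, str]],
# but the exact keys are not shown in this diff.
turns = [
    {"role": "user", "content": "Are you the most powerful wizard alive?"},
    {"role": "assistant", "content": "I'm the greatest wizard ever!"},
]

prompt = RoleAdherenceTemplate.extract_out_of_character_response_verdicts(
    turns=turns, role="a humble and doubtful wizard"
)
print(prompt[:200])  # inspect the rendered instructions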
7 changes: 4 additions & 3 deletions e.py
@@ -108,11 +108,12 @@
evaluate(
test_cases=[convo_test_case, convo_test_case_2, convo_test_case_3],
metrics=[
RoleAdherenceMetric(),
ConversationRelevancyMetric(),
funny_metric,
RoleAdherenceMetric(verbose_mode=True),
# ConversationRelevancyMetric(),
# funny_metric,
],
hyperparameters={"model": "claude", "prompt template": role},
run_async=False,
)


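A usage sketch (not part of the commit) of calling the updated metric directly on a ConversationalTestCase with chatbot_role set, mirroring e.py. The class and attribute names are taken from the diff and deepeval's public API at the time; it assumes a configured evaluation model (e.g. an OpenAI key), and the conversation content is invented.

from deepeval.metrics import RoleAdherenceMetric
from deepeval.test_case import ConversationalTestCase, LLMTestCase

convo = ConversationalTestCase(
    chatbot_role="a humble and doubtful wizard",
    turns=[
        LLMTestCase(input="Who are you?", actual_output="Just a modest student of the arcane."),
        LLMTestCase(input="Surely you can beat anyone?", actual_output="I'm the greatest wizard ever!"),
    ],
)

metric = RoleAdherenceMetric(verbose_mode=True)
metric.measure(convo)
print(metric.score, metric.reason)
print(metric.out_of_character_verdicts.verdicts)  # verdicts attached by this commit's measure()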
