updated role adherence
penguine-ip committed Feb 3, 2025
1 parent 4ec13d3 commit e8d1e2d
Showing 4 changed files with 57 additions and 50 deletions.
75 changes: 36 additions & 39 deletions deepeval/metrics/role_adherence/role_adherence.py
@@ -2,7 +2,7 @@

from deepeval.metrics import BaseConversationalMetric
from deepeval.metrics.role_adherence.schema import (
OutOfCharacterResponseIndicies,
OutOfCharacterResponseVerdicts,
)
from deepeval.metrics.role_adherence.template import RoleAdherenceTemplate
from deepeval.metrics.utils import (
@@ -20,7 +20,7 @@
ConversationalTestCase,
)
from deepeval.utils import get_or_create_event_loop, prettify_list
from deepeval.metrics.conversation_relevancy.schema import *
from deepeval.metrics.role_adherence.schema import *

required_params: List[LLMTestCaseParams] = [
LLMTestCaseParams.INPUT,
@@ -64,10 +64,10 @@ def measure(
self.turns: List[Dict[str, str]] = format_turns(
test_case.turns, required_params
)
self.out_of_character_responses = (
self._extract_out_of_character_responses(
test_case.turns, test_case.chatbot_role
)
self.out_of_character_verdicts: (
OutOfCharacterResponseVerdicts
) = self._extract_out_of_character_verdicts(
test_case.turns, test_case.chatbot_role
)
self.score = self._calculate_score()
self.reason = self._generate_reason(role=test_case.chatbot_role)
@@ -76,7 +76,7 @@ def measure(
self,
steps=[
f"Chatbot Role:\n{test_case.chatbot_role}",
f"Out-of-Character Turn Response(s):\n{prettify_list(self.out_of_character_responses)}",
f"Out-of-Character Turn Response(s):\n{prettify_list(self.out_of_character_verdicts.verdicts)}",
f"Score: {self.score}\nReason: {self.reason}",
],
)
@@ -91,16 +91,17 @@ async def a_measure(
test_case, required_params, self, require_chatbot_role=True
)

print(_show_indicator, "@")
self.evaluation_cost = 0 if self.using_native_model else None
with metric_progress_indicator(
self, async_mode=True, _show_indicator=_show_indicator
):
self.turns: List[Dict[str, str]] = format_turns(
test_case.turns, required_params
)
self.out_of_character_responses = (
self.out_of_character_verdicts = (
await (
self._a_extract_out_of_character_responses(
self._a_extract_out_of_character_verdicts(
test_case.turns, test_case.chatbot_role
)
)
@@ -114,7 +115,7 @@ async def a_measure(
self,
steps=[
f"Chatbot Role:\n{test_case.chatbot_role}",
f"Out-of-Character Turn(s) Response(s):\n{prettify_list(self.out_of_character_responses)}",
f"Out-of-Character Turn(s) Response(s):\n{prettify_list(self.out_of_character_verdicts.verdicts)}",
f"Score: {self.score}\nReason: {self.reason}",
],
)
@@ -124,7 +125,7 @@ async def _a_generate_reason(self, role: str) -> str:
prompt = RoleAdherenceTemplate.generate_reason(
score=self.score,
role=role,
out_of_character_responses=self.out_of_character_responses,
out_of_character_responses=self.out_of_character_verdicts.verdicts,
)
if self.using_native_model:
res, cost = await self.model.a_generate(prompt, schema=Reason)
@@ -143,7 +144,7 @@ def _generate_reason(self, role: str) -> str:
prompt = RoleAdherenceTemplate.generate_reason(
score=self.score,
role=role,
out_of_character_responses=self.out_of_character_responses,
out_of_character_responses=self.out_of_character_verdicts.verdicts,
)
if self.using_native_model:
res, cost = self.model.generate(prompt, schema=Reason)
@@ -158,79 +159,75 @@ def _generate_reason(self, role: str) -> str:
data = trimAndLoadJson(res, self)
return data["reason"]

async def _a_extract_out_of_character_responses(
async def _a_extract_out_of_character_verdicts(
self, llm_test_cases: List[LLMTestCase], role: str
) -> List[str]:
) -> OutOfCharacterResponseVerdicts:
prompt = (
RoleAdherenceTemplate.extract_out_of_character_response_indicies(
RoleAdherenceTemplate.extract_out_of_character_response_verdicts(
turns=self.turns,
role=role,
)
)
if self.using_native_model:
res, cost = await self.model.a_generate(
prompt, schema=OutOfCharacterResponseIndicies
prompt, schema=OutOfCharacterResponseVerdicts
)
self.evaluation_cost += cost
indicies = res.indicies
else:
try:
res: OutOfCharacterResponseIndicies = (
res: OutOfCharacterResponseVerdicts = (
await self.model.a_generate(
prompt, schema=OutOfCharacterResponseIndicies
prompt, schema=OutOfCharacterResponseVerdicts
)
)
indicies = res.indicies
except TypeError:
res = await self.model.a_generate(prompt)
data = trimAndLoadJson(res, self)
indicies = OutOfCharacterResponseIndicies(**data).indicies
res = OutOfCharacterResponseVerdicts(**data)

out_of_character_responses: List[str] = []
for index in indicies:
for verdict in res.verdicts:
try:
out_of_character_responses.append(
index = verdict.index
verdict.actual_output = (
f"{llm_test_cases[index].actual_output} (turn #{index+1})"
)
except:
pass
return out_of_character_responses
return res

def _extract_out_of_character_responses(
def _extract_out_of_character_verdicts(
self, llm_test_cases: List[LLMTestCase], role: str
) -> List[str]:
) -> OutOfCharacterResponseVerdicts:
prompt = (
RoleAdherenceTemplate.extract_out_of_character_response_indicies(
RoleAdherenceTemplate.extract_out_of_character_response_verdicts(
turns=self.turns,
role=role,
)
)
if self.using_native_model:
res, cost = self.model.generate(
prompt, schema=OutOfCharacterResponseIndicies
prompt, schema=OutOfCharacterResponseVerdicts
)
self.evaluation_cost += cost
indicies = res.indicies
else:
try:
res: OutOfCharacterResponseIndicies = self.model.generate(
prompt, schema=OutOfCharacterResponseIndicies
res: OutOfCharacterResponseVerdicts = self.model.generate(
prompt, schema=OutOfCharacterResponseVerdicts
)
indicies = res.indicies
except TypeError:
res = self.model.generate(prompt)
data = trimAndLoadJson(res, self)
indicies = OutOfCharacterResponseIndicies(**data).indicies
res = OutOfCharacterResponseVerdicts(**data)

out_of_character_responses: List[str] = []
for index in indicies:
for verdict in res.verdicts:
try:
out_of_character_responses.append(
index = verdict.index
verdict.actual_output = (
f"{llm_test_cases[index].actual_output} (turn #{index+1})"
)
except:
pass
return out_of_character_responses
return res

def _calculate_score(self) -> float:
number_of_turns = len(self.turns)
@@ -239,7 +236,7 @@ def _calculate_score(self) -> float:

score = (
number_of_turns
- min(len(self.out_of_character_responses), number_of_turns)
- min(len(self.out_of_character_verdicts.verdicts), number_of_turns)
) / number_of_turns
return 0 if self.strict_mode and score < self.threshold else score

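A quick illustrative sketch (not part of the commit; the turn count and verdict count are invented) of the arithmetic in _calculate_score above: the score is simply the fraction of turns whose responses stayed in character.

number_of_turns = 5
# e.g. len(metric.out_of_character_verdicts.verdicts) after measure() has run
out_of_character_count = 2

# Same formula as _calculate_score: clamp the count, then take the in-character fraction
score = (number_of_turns - min(out_of_character_count, number_of_turns)) / number_of_turns
print(score)  # 0.6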
12 changes: 9 additions & 3 deletions deepeval/metrics/role_adherence/schema.py
@@ -1,9 +1,15 @@
from typing import List
from typing import List, Optional
from pydantic import BaseModel


class OutOfCharacterResponseIndicies(BaseModel):
indicies: List[int]
class OutOfCharacterResponseVerdict(BaseModel):
index: int
reason: str
actual_output: Optional[str]


class OutOfCharacterResponseVerdicts(BaseModel):
verdicts: List[OutOfCharacterResponseVerdict]


class Reason(BaseModel):
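A rough, self-contained sketch (not part of the commit) of exercising the new verdict models against the kind of JSON the updated template asks for. The = None default on actual_output is added here only so the field can be omitted when parsing raw model output; in the diff above the metric fills it in afterwards.

from typing import List, Optional
from pydantic import BaseModel

class OutOfCharacterResponseVerdict(BaseModel):
    index: int
    reason: str
    actual_output: Optional[str] = None  # default added for this sketch only

class OutOfCharacterResponseVerdicts(BaseModel):
    verdicts: List[OutOfCharacterResponseVerdict]

# JSON shaped like the example in the updated template
raw = {
    "verdicts": [
        {
            "index": 4,
            "reason": "Claims to be 'the greatest wizard ever' despite the humble-wizard role.",
        }
    ]
}

parsed = OutOfCharacterResponseVerdicts(**raw)
print(parsed.verdicts[0].index)  # 4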
13 changes: 8 additions & 5 deletions deepeval/metrics/role_adherence/template.py
@@ -1,8 +1,8 @@
class RoleAdherenceTemplate:
@staticmethod
def extract_out_of_character_response_indicies(turns, role):
def extract_out_of_character_response_verdicts(turns, role):
return f"""Based on the given list of message exchanges between a user and an LLM chatbot, generate a JSON object to specify which `actual_outputs` did not adhere to the specified chatbot role.
The JSON will have 1 field: "indicies", which is a list of integers specifying the indices of the LLM actual_output/responses that did NOT adhere to the chatbot role.
The JSON will have 1 field: "verdicts", which is a list of verdicts, each specifying the index of and the reason why an LLM actual_output/response did NOT adhere to the chatbot role.
You MUST look at all the messages provided in the list to make an informed judgement on role adherence.
**
@@ -36,12 +36,15 @@ def extract_out_of_character_response_indicies(turns, role):
Example JSON:
{{
"indicies": [4]
"verdicts": {{
"index": 4,
"reason": "The LLM chatbot claims that 'I'm the greatest wizard ever' even though it was explicitly asked to adhere to the role of a humble and doubtful wizard."
}}
}}
===== END OF EXAMPLE ======
In this example, the response at index 4 was selected as it drastically deviates from the character's humble nature and shows extreme arrogance and overconfidence instead.
You DON'T have to provide anything else other than the JSON of "indicies".
You DON'T have to provide anything else other than the JSON of "verdicts".
**
Chatbot Role:
Expand All @@ -55,7 +58,7 @@ def extract_out_of_character_response_indicies(turns, role):

@staticmethod
def generate_reason(score, role, out_of_character_responses):
return f"""Below is a list of LLM chatboat responses that is out of character with respect to the specified chatbot role. It is drawn from a list of messages in a conversation, which you have minimal knowledge of.
return f"""Below is a list of LLM chatbot responses (actual_outputs) that is out of character with respect to the specified chatbot role. It is drawn from a list of messages in a conversation, which you have minimal knowledge of.
Given the role adherence score, which is a 0-1 score indicating how well the chatbot responses have adhered to the given role throughout the conversation, with 1 being the best and 0 the worst, provide a reason by quoting the out-of-character responses to justify the score.
**
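A hedged sketch (not part of the commit) of rendering the renamed prompt. The import path, method name, and signature come from the diff above; the shape of the turn dictionaries produced by format_turns is an assumption here.

from deepeval.metrics.role_adherence.template import RoleAdherenceTemplate

# Assumed turn format: format_turns in the metric produces List[Dict[str, str]],
# but the exact keys are not shown in this diff.
turns = [
    {"role": "user", "content": "Are you the most powerful wizard alive?"},
    {"role": "assistant", "content": "I'm the greatest wizard ever!"},
]

prompt = RoleAdherenceTemplate.extract_out_of_character_response_verdicts(
    turns=turns, role="a humble and doubtful wizard"
)
print(prompt[:200])  # inspect the rendered instructions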
7 changes: 4 additions & 3 deletions e.py
@@ -108,11 +108,12 @@
evaluate(
test_cases=[convo_test_case, convo_test_case_2, convo_test_case_3],
metrics=[
RoleAdherenceMetric(),
ConversationRelevancyMetric(),
funny_metric,
RoleAdherenceMetric(verbose_mode=True),
# ConversationRelevancyMetric(),
# funny_metric,
],
hyperparameters={"model": "claude", "prompt template": role},
run_async=False,
)


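A usage sketch (not part of the commit) of calling the updated metric directly on a ConversationalTestCase with chatbot_role set, mirroring e.py. The class and attribute names are taken from the diff and deepeval's public API at the time; it assumes a configured evaluation model (e.g. an OpenAI key), and the conversation content is invented.

from deepeval.metrics import RoleAdherenceMetric
from deepeval.test_case import ConversationalTestCase, LLMTestCase

convo = ConversationalTestCase(
    chatbot_role="a humble and doubtful wizard",
    turns=[
        LLMTestCase(input="Who are you?", actual_output="Just a modest student of the arcane."),
        LLMTestCase(input="Surely you can beat anyone?", actual_output="I'm the greatest wizard ever!"),
    ],
)

metric = RoleAdherenceMetric(verbose_mode=True)
metric.measure(convo)
print(metric.score, metric.reason)
print(metric.out_of_character_verdicts.verdicts)  # verdicts attached by this commit's measure()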
