add conditional displaying
penguine-ip committed Jan 22, 2025
1 parent 4f3af89 · commit f7f7f38
Showing 3 changed files with 64 additions and 8 deletions.
deepeval/cli/test.py (10 changes: 9 additions & 1 deletion)
@@ -9,6 +9,7 @@

from deepeval.test_run import global_test_run_manager, TEMP_FILE_NAME
from deepeval.test_run.cache import TEMP_CACHE_FILE_NAME
from deepeval.test_run.test_run import TestRunResultDisplay
from deepeval.utils import (
delete_file_if_exists,
set_should_ignore_errors,
@@ -94,6 +95,13 @@ def run(
"-v",
help="Whether to turn on verbose mode for evaluation or not",
),
display: Optional[TestRunResultDisplay] = typer.Option(
TestRunResultDisplay.ALL.value,
"--display",
"-d",
help="Whether to display all test cases or just some in the end",
case_sensitive=False,
),
mark: Optional[str] = typer.Option(
None,
"--mark",
@@ -154,7 +162,7 @@ def run(
pytest_retcode = pytest.main(pytest_args)
end_time = time.perf_counter()
run_duration = end_time - start_time
global_test_run_manager.wrap_up_test_run(run_duration)
global_test_run_manager.wrap_up_test_run(run_duration, True, display)

invoke_test_run_end_hook()

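For context on the new flag, here is a minimal, self-contained sketch (a bare Typer app, not deepeval's actual CLI wiring) of how an enum-backed `--display` option with `case_sensitive=False` behaves: the enum values become the accepted choices, and `-d FAILING` or `-d failing` resolve to the same member.

```python
# Minimal sketch of a case-insensitive Typer enum option, mirroring the
# --display flag added above; the command body is a hypothetical stand-in.
from enum import Enum
from typing import Optional

import typer


class TestRunResultDisplay(Enum):
    ALL = "all"
    FAILING = "failing"
    PASSING = "passing"


app = typer.Typer()


@app.command()
def run(
    display: Optional[TestRunResultDisplay] = typer.Option(
        TestRunResultDisplay.ALL.value,
        "--display",
        "-d",
        help="Whether to display all test cases or just some in the end",
        case_sensitive=False,
    ),
):
    # e.g. `python sketch.py --display failing` or `--display FAILING` both
    # select FAILING; deepeval forwards the value to wrap_up_test_run
    # (see the test_run.py changes below).
    typer.echo(f"display={display}")


if __name__ == "__main__":
    app()
```

Typer lists the enum values (all, failing, passing) as the allowed choices in the generated --help output.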
deepeval/evaluate.py (14 changes: 12 additions & 2 deletions)
@@ -13,6 +13,7 @@
from deepeval.metrics.utils import copy_metrics
from deepeval.test_case.utils import check_valid_test_cases_type
from deepeval.test_run.hyperparameters import process_hyperparameters
from deepeval.test_run.test_run import TestRunResultDisplay
from deepeval.utils import (
get_or_create_event_loop,
should_ignore_errors,
@@ -1019,6 +1020,7 @@ def evaluate(
identifier: Optional[str] = None,
throttle_value: int = 0,
max_concurrent: int = 100,
display: Optional[TestRunResultDisplay] = TestRunResultDisplay.ALL,
) -> EvaluationResult:
check_valid_test_cases_type(test_cases)

@@ -1077,7 +1079,7 @@
run_duration = end_time - start_time
if print_results:
for test_result in test_results:
print_test_result(test_result)
print_test_result(test_result, display)

aggregate_metric_pass_rates(test_results)

@@ -1092,10 +1094,18 @@
)


def print_test_result(test_result: TestResult):
def print_test_result(test_result: TestResult, display: TestRunResultDisplay):
if test_result.metrics_data is None:
return

if (
display == TestRunResultDisplay.PASSING.value
and test_result.success is False
):
return
elif display == TestRunResultDisplay.FAILING.value and test_result.success:
return

print("")
print("=" * 70 + "\n")
print("Metrics Summary\n")
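To see the new filtering in isolation, here is a standalone sketch of the early-return logic added to print_test_result. The TestResult dataclass and should_print helper are simplified, hypothetical stand-ins rather than deepeval's own classes, and the sketch compares enum members directly, whereas the diff compares against .value because the CLI path hands the raw string through.

```python
# Standalone sketch of the conditional-display check added to
# print_test_result; TestResult here is a simplified, hypothetical stand-in.
from dataclasses import dataclass
from enum import Enum


class TestRunResultDisplay(Enum):
    ALL = "all"
    FAILING = "failing"
    PASSING = "passing"


@dataclass
class TestResult:
    name: str
    success: bool


def should_print(test_result: TestResult, display: TestRunResultDisplay) -> bool:
    # PASSING hides failed test cases, FAILING hides passed ones,
    # ALL (or any other value) shows everything.
    if display == TestRunResultDisplay.PASSING and not test_result.success:
        return False
    if display == TestRunResultDisplay.FAILING and test_result.success:
        return False
    return True


results = [TestResult("retrieval", True), TestResult("faithfulness", False)]
for result in results:
    if should_print(result, TestRunResultDisplay.FAILING):
        print(f"would print: {result.name}")  # only "faithfulness" is printed
```

So calling evaluate with display=TestRunResultDisplay.FAILING is intended to print only the failing test cases at the end of a run, while the default TestRunResultDisplay.ALL keeps the previous behaviour.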
deepeval/test_run/test_run.py (48 changes: 43 additions & 5 deletions)
@@ -1,3 +1,4 @@
from enum import Enum
import os
import json
from pydantic import BaseModel, Field
@@ -30,6 +31,12 @@
TEMP_FILE_NAME = "temp_test_run_data.json"


class TestRunResultDisplay(Enum):
ALL = "all"
FAILING = "failing"
PASSING = "passing"


class MetricScoreType(BaseModel):
metric: str
score: float
@@ -392,7 +399,9 @@ def update_test_run(
def clear_test_run(self):
self.test_run = None

def display_results_table(self, test_run: TestRun):
def display_results_table(
self, test_run: TestRun, display: TestRunResultDisplay
):
table = Table(title="Test Results")
table.add_column("Test case", justify="left")
table.add_column("Metric", justify="left")
@@ -404,6 +413,14 @@ def display_results_table(self, test_run: TestRun):
if test_case.metrics_data is None:
continue

if (
display == TestRunResultDisplay.PASSING
and test_case.success == False
):
continue
elif display == TestRunResultDisplay.FAILING and test_case.success:
continue

pass_count = 0
fail_count = 0
test_case_name = test_case.name
@@ -459,6 +476,17 @@ def display_results_table(self, test_run: TestRun):
for index, conversational_test_case in enumerate(
test_run.conversational_test_cases
):
if (
display == TestRunResultDisplay.PASSING
and conversational_test_case.success == False
):
continue
elif (
display == TestRunResultDisplay.FAILING
and conversational_test_case.success
):
continue

pass_count = 0
fail_count = 0
conversational_test_case_name = conversational_test_case.name
@@ -569,6 +597,13 @@ def display_results_table(self, test_run: TestRun):
"",
)

table.add_row(
"[bold red]Note: Use Confident AI with DeepEval to analyze failed test cases for more details[/bold red]",
"",
"",
"",
"",
)
print(table)
print(
f"Total estimated evaluation tokens cost: {test_run.evaluation_cost} USD"
@@ -680,7 +715,7 @@ def post_test_run(self, test_run: TestRun) -> Optional[str]:
console.print(
"[rgb(5,245,141)]✓[/rgb(5,245,141)] Tests finished 🎉! View results on "
f"[link={link}]{link}[/link]."
"\n‼️ Friendly reminder 😇: You can also run evaluations with ALL of deepeval's metrics directly on Confident AI instead."
"\n‼️ Looking for a place for your test data to live 🏡? Use Confident AI to analyze, benchmark, compare models/prompts, and catch regressions for your LLM system."
)

if is_in_ci_env() == False:
else:
console.print(
"[rgb(5,245,141)]✓[/rgb(5,245,141)] Tests finished 🎉! Run 'deepeval login' to save and analyze evaluation results on Confident AI. "
"\n‼️ Friendly reminder 😇: You can also run evaluations with ALL of deepeval's metrics directly on Confident AI instead."
"\n‼️ Looking for a place for your test data to live 🏡? Use Confident AI to analyze, benchmark, compare models/prompts, and catch regressions for your LLM system."
)

def save_test_run_locally(self):
@@ -714,7 +749,10 @@ def save_test_run_locally(self):
os.remove(new_test_filename)

def wrap_up_test_run(
self, runDuration: float, display_table: bool = True
self,
runDuration: float,
display_table: bool = True,
display: Optional[TestRunResultDisplay] = TestRunResultDisplay.ALL,
) -> Optional[str]:
test_run = self.get_test_run()
if test_run is None:
@@ -750,7 +788,7 @@ def wrap_up_test_run(
global_test_run_cache_manager.wrap_up_cached_test_run()

if display_table:
self.display_results_table(test_run)
self.display_results_table(test_run, display)

self.save_test_run_locally()
delete_file_if_exists(self.temp_file_name)
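The same conditional displaying drives the Rich results table in display_results_table. Below is a compact, self-contained sketch of the row-skipping idea; the dict-shaped test cases and the build_table helper are hypothetical, and only a two-column table is built instead of deepeval's full metric breakdown.

```python
# Self-contained sketch of display-filtered Rich table rows; the test case
# dicts and build_table helper are hypothetical simplifications.
from enum import Enum

from rich.console import Console
from rich.table import Table


class TestRunResultDisplay(Enum):
    ALL = "all"
    FAILING = "failing"
    PASSING = "passing"


def build_table(test_cases, display: TestRunResultDisplay) -> Table:
    table = Table(title="Test Results")
    table.add_column("Test case", justify="left")
    table.add_column("Status", justify="left")
    for case in test_cases:
        # Same skip conditions as display_results_table: hide the rows the
        # caller did not ask to see.
        if display == TestRunResultDisplay.PASSING and not case["success"]:
            continue
        if display == TestRunResultDisplay.FAILING and case["success"]:
            continue
        status = "[green]PASSED[/green]" if case["success"] else "[red]FAILED[/red]"
        table.add_row(case["name"], status)
    return table


cases = [
    {"name": "summarization", "success": True},
    {"name": "hallucination", "success": False},
]
Console().print(build_table(cases, TestRunResultDisplay.FAILING))
```

In the commit itself, wrap_up_test_run now accepts the display argument (defaulting to TestRunResultDisplay.ALL) and forwards it to display_results_table, which is how the CLI's --display flag reaches the table.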
