diff --git a/deepeval/red_teaming/attack_synthesizer.py b/deepeval/red_teaming/attack_synthesizer.py
index cc64101c..e44c8963 100644
--- a/deepeval/red_teaming/attack_synthesizer.py
+++ b/deepeval/red_teaming/attack_synthesizer.py
@@ -9,7 +9,6 @@ from deepeval.red_teaming.types import (
     AttackEnhancement,
-    NonRemoteVulnerability,
     VulnerabilityType,
     CallbackType,
 )
@@ -72,6 +71,7 @@ def generate_attacks(
         attacks_per_vulnerability_type: int,
         vulnerabilities: List[BaseVulnerability],
         attack_enhancements: Dict[AttackEnhancement, float],
+        ignore_errors: bool,
     ) -> List[Attack]:
         # Generate unenhanced attacks for each vulnerability
         base_attacks: List[Attack] = []
@@ -85,8 +85,9 @@ def generate_attacks(
         for vulnerability in pbar:
             base_attacks.extend(
                 self.generate_base_attacks(
-                    attacks_per_vulnerability_type,
-                    vulnerability,
+                    attacks_per_vulnerability_type=attacks_per_vulnerability_type,
+                    vulnerability=vulnerability,
+                    ignore_errors=ignore_errors,
                 )
             )
@@ -111,6 +112,7 @@ def generate_attacks(
                     target_model_callback=target_model_callback,
                     base_attack=base_attack,
                     attack_enhancement=sampled_enhancement,
+                    ignore_errors=ignore_errors,
                 )
                 enhanced_attacks.append(enhanced_attack)
@@ -123,10 +125,11 @@ async def a_generate_attacks(
         attacks_per_vulnerability_type: int,
         vulnerabilities: List[BaseVulnerability],
         attack_enhancements: Dict[AttackEnhancement, float],
-        max_concurrent_tasks: int = 10,
+        ignore_errors: bool,
+        max_concurrent: int = 10,
     ) -> List[Attack]:
         # Create a semaphore to control the number of concurrent tasks
-        semaphore = asyncio.Semaphore(max_concurrent_tasks)
+        semaphore = asyncio.Semaphore(max_concurrent)

         # Generate unenhanced attacks for each vulnerability
         base_attacks: List[Attack] = []
@@ -141,7 +144,9 @@ async def a_generate_attacks(
         async def throttled_generate_base_attack(vulnerability):
             async with semaphore:  # Throttling applied here
                 result = await self.a_generate_base_attacks(
-                    attacks_per_vulnerability_type, vulnerability
+                    attacks_per_vulnerability_type=attacks_per_vulnerability_type,
+                    vulnerability=vulnerability,
+                    ignore_errors=ignore_errors,
                 )
                 pbar.update(1)
                 return result
@@ -175,6 +180,7 @@ async def throttled_attack_enhancement(base_attack):
                     target_model_callback=target_model_callback,
                     base_attack=base_attack,
                     attack_enhancement=sampled_enhancement,
+                    ignore_errors=ignore_errors,
                 )
                 pbar.update(1)
                 return result
@@ -204,34 +210,29 @@ def generate_base_attacks(
         self,
         attacks_per_vulnerability_type: int,
         vulnerability: BaseVulnerability,
-        max_retries: int = 5,
+        ignore_errors: bool,
     ) -> List[Attack]:
         base_attacks: List[Attack] = []
-        # Remote vulnerabilities
-        if not isinstance(BaseVulnerability, NonRemoteVulnerability):
-            if not is_confident():
-                raise Exception(
-                    f"To generate attacks for '{vulnerability.get_name()}', login to Confident AI by running `deepeval login`"
-                )
-            for vulnerability_type in vulnerability.get_types():
-                try:
-                    remote_attacks = self.generate_remote_attack(
-                        self.purpose,
-                        vulnerability_type,
-                        attacks_per_vulnerability_type,
-                    )
-                    base_attacks.extend(
-                        [
-                            Attack(
-                                vulnerability=vulnerability.get_name(),
-                                vulnerability_type=vulnerability_type,
-                                input=remote_attack,
-                            )
-                            for remote_attack in remote_attacks
-                        ]
-                    )
-                except:
+        for vulnerability_type in vulnerability.get_types():
+            try:
+                remote_attacks = self.generate_remote_attack(
+                    self.purpose,
+                    vulnerability_type,
+                    attacks_per_vulnerability_type,
+                )
+                base_attacks.extend(
+                    [
+                        Attack(
+                            vulnerability=vulnerability.get_name(),
+                            vulnerability_type=vulnerability_type,
+                            input=remote_attack,
+                        )
+                        for remote_attack in remote_attacks
+                    ]
+                )
+            except:
+                if ignore_errors:
                     for _ in range(attacks_per_vulnerability_type):
                         base_attacks.append(
                             Attack(
@@ -240,96 +241,36 @@ def generate_base_attacks(
                                 error="Error generating aligned attacks.",
                             )
                         )
-
-        # Aligned vulnerabilities: LLMs can generate
-        else:
-            for vulnerability_type in vulnerability.get_types():
-                prompt = RedTeamSynthesizerTemplate.generate_attacks(
-                    attacks_per_vulnerability_type,
-                    vulnerability_type,
-                    self.purpose,
-                )
-
-                # Generate attacks with retries
-                for i in range(max_retries):
-                    try:
-                        res: SyntheticDataList = self._generate_schema(
-                            prompt, SyntheticDataList
-                        )
-                        compliance_prompt = (
-                            RedTeamSynthesizerTemplate.non_compliant(
-                                res.model_dump()
-                            )
-                        )
-                        compliance_res: ComplianceData = self._generate_schema(
-                            compliance_prompt, ComplianceData
-                        )
-
-                        if not compliance_res.non_compliant:
-                            base_attacks.extend(
-                                Attack(
-                                    input=attack.input,
-                                    vulnerability=vulnerability.get_name(),
-                                    vulnerability_type=vulnerability_type,
-                                )
-                                for attack in res.data
-                            )
-                            break
-
-                        if i == max_retries - 1:
-                            base_attacks = [
-                                Attack(
-                                    vulnerability=vulnerability.get_name(),
-                                    vulnerability_type=vulnerability_type,
-                                    error="Error generating compliant attacks.",
-                                )
-                                for _ in range(attacks_per_vulnerability_type)
-                            ]
-                    except:
-                        if i == max_retries - 1:
-                            base_attacks = [
-                                Attack(
-                                    vulnerability=vulnerability.get_name(),
-                                    vulnerability_type=vulnerability_type,
-                                    error="Error generating aligned attacks.",
-                                )
-                                for _ in range(attacks_per_vulnerability_type)
-                            ]
+                else:
+                    raise

         return base_attacks

     async def a_generate_base_attacks(
         self,
         attacks_per_vulnerability_type: int,
         vulnerability: BaseVulnerability,
-        max_retries: int = 5,
+        ignore_errors: bool,
     ) -> List[Attack]:
         base_attacks: List[Attack] = []
-
-        # Remote vulnerabilities
-        if not isinstance(vulnerability, NonRemoteVulnerability):
-            if not is_confident():
-                raise Exception(
-                    f"To generate attacks for '{vulnerability.get_name()}', login to Confident AI by running `deepeval login`"
+        for vulnerability_type in vulnerability.get_types():
+            try:
+                remote_attacks = self.generate_remote_attack(
+                    self.purpose,
+                    vulnerability_type,
+                    attacks_per_vulnerability_type,
                 )
-
-            for vulnerability_type in vulnerability.get_types():
-                try:
-                    remote_attacks = self.generate_remote_attack(
-                        self.purpose,
-                        vulnerability_type,
-                        attacks_per_vulnerability_type,
-                    )
-                    base_attacks.extend(
-                        [
-                            Attack(
-                                vulnerability=vulnerability.get_name(),
-                                vulnerability_type=vulnerability_type,
-                                input=remote_attack,
-                            )
-                            for remote_attack in remote_attacks
-                        ]
-                    )
-                except:
+                base_attacks.extend(
+                    [
+                        Attack(
+                            vulnerability=vulnerability.get_name(),
+                            vulnerability_type=vulnerability_type,
+                            input=remote_attack,
+                        )
+                        for remote_attack in remote_attacks
+                    ]
+                )
+            except:
+                if ignore_errors:
                     for _ in range(attacks_per_vulnerability_type):
                         base_attacks.append(
                             Attack(
@@ -338,63 +279,8 @@ async def a_generate_base_attacks(
                                 error="Error generating aligned attacks.",
                             )
                         )
-
-        # Aligned vulnerabilities: LLMs can generate
-        else:
-            for vulnerability_type in vulnerability.get_types():
-                prompt = RedTeamSynthesizerTemplate.generate_attacks(
-                    attacks_per_vulnerability_type,
-                    vulnerability_type,
-                    self.purpose,
-                )
-
-                # Generate attacks with retries
-                for i in range(max_retries):
-                    try:
-                        res: SyntheticDataList = await self._a_generate_schema(
-                            prompt, SyntheticDataList
-                        )
-                        compliance_prompt = (
-                            RedTeamSynthesizerTemplate.non_compliant(
-                                res.model_dump()
-                            )
-                        )
-                        compliance_res: ComplianceData = (
-                            await self._a_generate_schema(
-                                compliance_prompt, ComplianceData
-                            )
-                        )
-
-                        if not compliance_res.non_compliant:
-                            base_attacks.extend(
-                                Attack(
-                                    input=attack.input,
-                                    vulnerability=vulnerability.get_name(),
-                                    vulnerability_type=vulnerability_type,
-                                )
-                                for attack in res.data
-                            )
-                            break
-
-                        if i == max_retries - 1:
-                            base_attacks = [
-                                Attack(
-                                    vulnerability=vulnerability.get_name(),
-                                    vulnerability_type=vulnerability_type,
-                                    error="Error generating compliant attacks.",
-                                )
-                                for _ in range(attacks_per_vulnerability_type)
-                            ]
-                    except:
-                        if i == max_retries - 1:
-                            base_attacks = [
-                                Attack(
-                                    vulnerability=vulnerability.get_name(),
-                                    vulnerability_type=vulnerability_type,
-                                    error="Error generating aligned attacks.",
-                                )
-                                for _ in range(attacks_per_vulnerability_type)
-                            ]
+                else:
+                    raise

         return base_attacks

     ##################################################
@@ -406,6 +292,7 @@ def enhance_attack(
         target_model_callback: CallbackType,
         base_attack: Attack,
         attack_enhancement: AttackEnhancement,
+        ignore_errors: bool,
         jailbreaking_iterations: int = 5,
     ):
         attack_input = base_attack.input
@@ -478,8 +365,11 @@ def enhance_attack(
                 ).enhance(attack_input)
                 base_attack.input = enhanced_attack
             except:
-                base_attack.error = "Error enhancing attack"
-                return base_attack
+                if ignore_errors:
+                    base_attack.error = "Error enhancing attack"
+                    return base_attack
+                else:
+                    raise

         return base_attack
@@ -488,6 +378,7 @@ async def a_enhance_attack(
         target_model_callback: CallbackType,
         base_attack: Attack,
         attack_enhancement: AttackEnhancement,
+        ignore_errors: bool,
         jailbreaking_iterations: int = 5,
     ):
         attack_input = base_attack.input
@@ -562,8 +453,11 @@ async def a_enhance_attack(
                 ).a_enhance(attack_input)
                 base_attack.input = enhanced_attack
             except:
-                base_attack.error = "Error enhancing attack"
-                return base_attack
+                if ignore_errors:
+                    base_attack.error = "Error enhancing attack"
+                    return base_attack
+                else:
+                    raise

         return base_attack
diff --git a/deepeval/red_teaming/red_teamer.py b/deepeval/red_teaming/red_teamer.py
index c9dd78d8..fb69bd0e 100644
--- a/deepeval/red_teaming/red_teamer.py
+++ b/deepeval/red_teaming/red_teamer.py
@@ -102,7 +102,8 @@ def scan(
             AttackEnhancement.MATH_PROBLEM: 1 / 11,
             AttackEnhancement.MULTILINGUAL: 1 / 11,
         },
-        max_concurrent_tasks: int = 10,
+        max_concurrent: int = 10,
+        ignore_errors: bool = False,
     ):
         try:
             import pandas as pd
@@ -121,7 +122,8 @@ def scan(
                     attacks_per_vulnerability_type,
                     vulnerabilities,
                     attack_enhancements,
-                    max_concurrent_tasks,
+                    max_concurrent,
+                    ignore_errors=ignore_errors,
                 )
             )
         else:
@@ -143,6 +145,7 @@ def scan(
                     attacks_per_vulnerability_type=attacks_per_vulnerability_type,
                     vulnerabilities=vulnerabilities,
                     attack_enhancements=attack_enhancements,
+                    ignore_errors=ignore_errors,
                 )
             )

@@ -195,6 +198,7 @@ def scan(
                         "Error": None,
                     }

+                    # this will only go through if ignore_errors == True
                     if attack.error:
                         result["Error"] = attack.error
                         red_teaming_results_breakdown.append(result)
@@ -204,11 +208,14 @@ def scan(
                     try:
                         target_output = target_model_callback(attack.input)
                         result["Target Output"] = target_output
                     except Exception:
-                        result["Error"] = (
-                            "Error generating output from target LLM"
-                        )
-                        red_teaming_results_breakdown.append(result)
-                        continue
+                        if ignore_errors:
+                            result["Error"] = (
+                                "Error generating output from target LLM"
+                            )
+                            red_teaming_results_breakdown.append(result)
+                            continue
+                        else:
+                            raise

                     test_case = LLMTestCase(
                         input=attack.input,
@@ -221,11 +228,14 @@ def scan(
                         result["Score"] = metric.score
                         result["Reason"] = metric.reason
                         scores.append(metric.score)
                     except Exception:
-                        result["Error"] = (
-                            f"Error evaluating target LLM output for the '{vulnerability_type.value}' vulnerability"
-                        )
-                        red_teaming_results_breakdown.append(result)
-                        continue
+                        if ignore_errors:
+                            result["Error"] = (
+                                f"Error evaluating target LLM output for the '{vulnerability_type.value}' vulnerability"
+                            )
+                            red_teaming_results_breakdown.append(result)
+                            continue
+                        else:
+                            raise

                     red_teaming_results_breakdown.append(result)
@@ -267,7 +277,8 @@ async def a_scan(
             AttackEnhancement.MATH_PROBLEM: 1 / 11,
             AttackEnhancement.MULTILINGUAL: 1 / 11,
         },
-        max_concurrent_tasks: int = 10,  # Throttling limit, control concurrency
+        max_concurrent: int = 10,  # Throttling limit, control concurrency
+        ignore_errors: bool = False,
     ):
         try:
             import pandas as pd
@@ -290,7 +301,8 @@ async def a_scan(
                 attacks_per_vulnerability_type=attacks_per_vulnerability_type,
                 vulnerabilities=vulnerabilities,
                 attack_enhancements=attack_enhancements,
-                max_concurrent_tasks=max_concurrent_tasks,
+                ignore_errors=ignore_errors,
+                max_concurrent=max_concurrent,
             )
         )

@@ -315,7 +327,7 @@ async def a_scan(
         red_teaming_results_breakdown = []

         # Create a semaphore for throttling concurrent tasks
-        semaphore = asyncio.Semaphore(max_concurrent_tasks)
+        semaphore = asyncio.Semaphore(max_concurrent)

         # Total number of attacks across all vulnerabilities
         total_attacks = sum(
@@ -336,13 +348,14 @@ async def throttled_evaluate_vulnerability_type(
             ):
                 async with (
                     semaphore
-                ):  # Ensures only `max_concurrent_tasks` run concurrently
+                ):  # Ensures only `max_concurrent` run concurrently
                     vulnerability_results = (
                         await self._a_evaluate_vulnerability_type(
                             target_model_callback,
                             vulnerability_type,
                             attacks,
                             metrics_map,
+                            ignore_errors=ignore_errors,
                         )
                     )
                     pbar.update(
@@ -425,6 +438,7 @@ async def _a_attack(
         vulnerability: str,
         vulnerability_type: VulnerabilityType,
         metrics_map,
+        ignore_errors: bool,
     ) -> VulnerabilityResult:
         result = VulnerabilityResult(
             input=attack.input,
@@ -442,9 +456,12 @@ async def _a_attack(
             # Generate actual output using the 'input'
             actual_output = await target_model_callback(attack.input)
             result.actual_output = actual_output
-        except Exception as e:
-            result.error = "Error generating output from target LLM"
-            return result
+        except Exception:
+            if ignore_errors:
+                result.error = "Error generating output from target LLM"
+                return result
+            else:
+                raise

         test_case = LLMTestCase(
             input=attack.input,
@@ -456,8 +473,11 @@ async def _a_attack(
             result.score = metric.score
             result.reason = metric.reason
         except:
-            result.error = f"Error evaluating target LLM output for the '{vulnerability_type.value}' vulnerability type"
-            return result
+            if ignore_errors:
+                result.error = f"Error evaluating target LLM output for the '{vulnerability_type.value}' vulnerability type"
+                return result
+            else:
+                raise

         return result
@@ -467,6 +487,7 @@ async def _a_evaluate_vulnerability_type(
         vulnerability_type: VulnerabilityType,
         attacks: List[Attack],
         metrics_map,
+        ignore_errors: bool,
     ) -> List[VulnerabilityResult]:
         results = await asyncio.gather(
             *[
@@ -476,6 +497,7 @@ async def _a_evaluate_vulnerability_type(
                     vulnerability=attack.vulnerability,
                     vulnerability_type=vulnerability_type,
                     metrics_map=metrics_map,
+                    ignore_errors=ignore_errors,
                 )
                 for attack in attacks
             ]
diff --git a/deepeval/red_teaming/types.py b/deepeval/red_teaming/types.py
index 78340427..b354b37c 100644
--- a/deepeval/red_teaming/types.py
+++ b/deepeval/red_teaming/types.py
@@ -17,16 +17,12 @@
 from deepeval.vulnerability.robustness import RobustnessType
 from deepeval.vulnerability.toxicity import ToxicityType
 from deepeval.vulnerability.bias import BiasType
-from deepeval.vulnerability import (
-    Misinformation,
-    Bias,
-)

 ##########################################
 #### Attack Enhancements ################
 ##########################################

-NonRemoteVulnerability = Union[Bias, Misinformation]
+
 VulnerabilityType = Union[
     UnauthorizedAccessType,
     IllegalActivityType,
diff --git a/deepeval/red_teaming/utils.py b/deepeval/red_teaming/utils.py
index ae9e8a96..ee3d1e2f 100644
--- a/deepeval/red_teaming/utils.py
+++ b/deepeval/red_teaming/utils.py
@@ -16,7 +16,7 @@ def generate_schema(
         return res
     else:
         try:
-            res: schema = model.generate(prompt, schema=schema)
+            res = model.generate(prompt, schema=schema)
             return res
         except TypeError:
             res = model.generate(prompt)
@@ -39,7 +39,7 @@ async def a_generate_schema(
         return res
     else:
         try:
-            res: schema = await model.a_generate(prompt, schema=schema)
+            res = await model.a_generate(prompt, schema=schema)
             return res
         except TypeError:
             res = await model.a_generate(prompt)
diff --git a/tests/test_red_team_synthesizer.py b/tests/test_red_team_synthesizer.py
index cbeb98db..71beece9 100644
--- a/tests/test_red_team_synthesizer.py
+++ b/tests/test_red_team_synthesizer.py
@@ -145,11 +145,10 @@ def test_red_teamer():
     results = red_teamer.scan(
         target_model_callback=a_generate,
         attacks_per_vulnerability_type=1,
-        attack_enhancements={
-            enhancement: 1 for enhancement in AttackEnhancement
-        },
-        vulnerabilities=vulnerabilties,
-        max_concurrent_tasks=2,
+        attack_enhancements={AttackEnhancement.JAILBREAK_CRESCENDO: 1},
+        vulnerabilities=vulnerabilties[0:2],
+        max_concurrent=2,
+        ignore_errors=True,
     )
     print(results)
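
Taken together, these changes replace the per-call retry loop (`max_retries`) with a single `ignore_errors` flag: when True, a failure is recorded on the `Attack` or result object via its `error` field and the scan continues; when False (the default), the original exception propagates. A minimal caller-side sketch follows, mirroring the updated test above. The import path, the `RedTeamer()` construction, the callback body, and the vulnerability list are assumptions not shown in this diff; only `scan`'s keyword arguments are taken from it.

    # Sketch only: the import path and RedTeamer() construction below are
    # assumptions; scan()'s keyword arguments come from this diff.
    from deepeval.red_teaming import RedTeamer, AttackEnhancement

    def target_model_callback(prompt: str) -> str:
        # Stand-in for the target LLM under test.
        return "I can't help with that."

    red_teamer = RedTeamer()  # constructor arguments not shown in this diff

    vulnerabilities = []  # fill with BaseVulnerability instances (construction not shown here)

    results = red_teamer.scan(
        target_model_callback=target_model_callback,
        attacks_per_vulnerability_type=1,
        attack_enhancements={AttackEnhancement.JAILBREAK_CRESCENDO: 1},
        vulnerabilities=vulnerabilities,
        max_concurrent=2,    # renamed from max_concurrent_tasks
        ignore_errors=True,  # new: record errors on results instead of raising
    )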