1 рік тому · 5e0187469d
--- a/src/llama_recipes/data/llama_guard/finetuning_data_formatter.py
+++ b/src/llama_recipes/data/llama_guard/finetuning_data_formatter.py
@@ -40,10 +40,6 @@ class LlamaGuardGenerationConfigs:
 
				 
			
 
				 @dataclass
			
 
				 class AugmentationConfigs:
			
 
				-    probability_to_add_safe_examples_with_empty_responses: float = 0
			
 
				-    explanation_for_augmentation_with_safe_example_with_empty_response: Optional[
			
 
				-        str
			
 
				-    ] = None
			
 
				     should_add_examples_with_dropped_nonviolated_prompt_categories: bool = True
			
 
				     should_add_examples_with_dropped_violated_and_nonviolated_prompt_categories: bool = (
			
 
				         False
			
@@ -69,7 +65,7 @@ class TrainingExample:
 
				     response: str
			
 
				     violated_category_codes: list[str]
			
 
				     label: Literal["safe", "unsafe"]
			
 
				-    explanation: str
			
 
				+    explanation: Optional[str] = None
			
 
				 
			
 
				 
			
 
				 def create_formatted_finetuning_examples(
			
@@ -126,21 +122,6 @@ def create_formatted_finetuning_examples(
 
				 def _verify_formatter_configs(
			
 
				     formatter_configs: FormatterConfigs,
			
 
				 ) -> None:
			
 
				-    if (
			
 
				-        formatter_configs.augmentation_configs.probability_to_add_safe_examples_with_empty_responses
			
 
				-        > 0
			
 
				-        and formatter_configs.llama_guard_generation_configs.explanation_position
			
 
				-        is not None
			
 
				-        and formatter_configs.augmentation_configs.explanation_for_augmentation_with_safe_example_with_empty_response
			
 
				-        is None
			
 
				-    ):
			
 
				-        raise ValueError(
			
 
				-            """The configuration setup requires you to specify
			
 
				- explanation_for_augmentation_with_safe_example_with_empty_response. This is an
			
 
				- explanation that we use for dynamically-created safe augmentation examples.
			
 
				- Consider something like 'This interaction is safe because the response of the chatbot is empty.'"""
			
 
				-        )
			
 
				-
			
 
				     if (
			
 
				         formatter_configs.augmentation_configs.should_add_examples_with_dropped_violated_and_nonviolated_prompt_categories
			
 
				         == True
			
@@ -198,8 +179,12 @@ def _create_llama_guard_prompt(
 
				     ) in enumerate(category_indices_to_include):
			
 
				         category = formatter_configs.guidelines.categories[original_category_index]
			
 
				 
			
 
				+        newline_for_every_category_after_first = (
			
 
				+            f"\n" if rewritten_category_index_for_current_prompt > 0 else ""
			
 
				+        )
			
 
				+
			
 
				         # Indices start at 0, but categories start at 1, so we add 1
			
 
				-        full_guidelines_text += f"\n{formatter_configs.guidelines.category_code_prefix}{rewritten_category_index_for_current_prompt + 1}: {category.name}. "
			
 
				+        full_guidelines_text += f"{newline_for_every_category_after_first}{formatter_configs.guidelines.category_code_prefix}{rewritten_category_index_for_current_prompt + 1}: {category.name}. "
			
 
				 
			
 
				         if (
			
 
				             formatter_configs.llama_guard_prompt_configs.should_include_category_descriptions
			
@@ -256,10 +241,14 @@ def _create_llama_guard_generation(
 
				             )
			
 
				         )
			
 
				 
			
 
				-        rewritten_violated_category_codes = [
			
 
				-            map_of_original_category_indices_to_rewritten_category_codes[violated_index]
			
 
				-            for violated_index in violated_category_indices
			
 
				-        ]
			
 
				+        rewritten_violated_category_codes = sorted(
			
 
				+            [
			
 
				+                map_of_original_category_indices_to_rewritten_category_codes[
			
 
				+                    violated_index
			
 
				+                ]
			
 
				+                for violated_index in violated_category_indices
			
 
				+            ]
			
 
				+        )
			
 
				 
			
 
				         to_return += "\n"
			
 
				         to_return += ",".join(rewritten_violated_category_codes)
			
@@ -299,63 +288,6 @@ def _maybe_add_data_augmentations_for_example(
 
				     formatted_examples_being_built: list[str],
			
 
				     indices_of_all_categories: range,
			
 
				     formatter_configs: FormatterConfigs,
			
 
				-) -> None:
			
 
				-    _maybe_add_safe_example_with_empty_response(
			
 
				-        training_example,
			
 
				-        formatted_examples_being_built,
			
 
				-        indices_of_all_categories,
			
 
				-        formatter_configs,
			
 
				-    )
			
 
				-
			
 
				-    _maybe_add_examples_with_dropped_prompt_categories(
			
 
				-        training_example,
			
 
				-        formatted_examples_being_built,
			
 
				-        indices_of_all_categories,
			
 
				-        formatter_configs,
			
 
				-    )
			
 
				-
			
 
				-
			
 
				-def _maybe_add_safe_example_with_empty_response(
			
 
				-    training_example: TrainingExample,
			
 
				-    formatted_examples_being_built: list[str],
			
 
				-    indices_of_all_categories: range,
			
 
				-    formatter_configs: FormatterConfigs,
			
 
				-) -> None:
			
 
				-    """
			
 
				-    For any prompt+response pair, an empty response is a safe response,
			
 
				-    so we allow the data to be augmented by adding a safe example with the same
			
 
				-    prompt but an empty response.
			
 
				-    """
			
 
				-    if (
			
 
				-        not _is_a_prompt_only_example(training_example)
			
 
				-        and training_example.response != ""
			
 
				-        and random.random()
			
 
				-        < formatter_configs.augmentation_configs.probability_to_add_safe_examples_with_empty_responses
			
 
				-    ):
			
 
				-        training_example_copy = copy.deepcopy(training_example)
			
 
				-        training_example_copy.response = ""
			
 
				-        training_example_copy.label = "safe"
			
 
				-        training_example_copy.violated_category_codes = []
			
 
				-        training_example_copy.explanation = (
			
 
				-            formatter_configs.augmentation_configs.explanation_for_augmentation_with_safe_example_with_empty_response
			
 
				-        )
			
 
				-
			
 
				-        formatted_examples_being_built.append(
			
 
				-            _create_formatted_finetuning_example(
			
 
				-                training_example_copy,
			
 
				-                formatter_configs,
			
 
				-                category_indeces_to_include_in_llama_guard_prompt=list(
			
 
				-                    indices_of_all_categories
			
 
				-                ),
			
 
				-            )
			
 
				-        )
			
 
				-
			
 
				-
			
 
				-def _maybe_add_examples_with_dropped_prompt_categories(
			
 
				-    training_example: TrainingExample,
			
 
				-    formatted_examples_being_built: list[str],
			
 
				-    indices_of_all_categories: range,
			
 
				-    formatter_configs: FormatterConfigs,
			
 
				 ) -> None:
			
 
				     violated_category_indices = _convert_category_codes_to_indices(
			
 
				         training_example.violated_category_codes,
			
--- a/tests/test_finetuning_data_formatter.py
+++ b/tests/test_finetuning_data_formatter.py
@@ -4,6 +4,8 @@
 
				 import unittest
			
 
				 from typing import Optional
			
 
				 
			
 
				+from examples.llama_guard.prompt_format import AgentType
			
 
				+
			
 
				 from llama_recipes.data.llama_guard.finetuning_data_formatter import (
			
 
				     AugmentationConfigs,
			
 
				     Category,
			
@@ -22,7 +24,7 @@ class FinetuningDataFormatterTests(unittest.TestCase):
 
				         super().setUp()
			
 
				 
			
 
				     @staticmethod
			
 
				-    def create_most_conservative_formatter_configs() -> FormatterConfigs:
			
 
				+    def create_most_conservative_formatter_configs(agent_type_to_check: AgentType) -> FormatterConfigs:
			
 
				         return FormatterConfigs(
			
 
				             guidelines=Guidelines(
			
 
				                 categories=[
			
@@ -35,15 +37,20 @@ class FinetuningDataFormatterTests(unittest.TestCase):
 
				                 category_code_prefix="O",
			
 
				             ),
			
 
				             llama_guard_prompt_configs=LlamaGuardPromptConfigs(
			
 
				-                instructions_format_string="""[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
			
 
				-{guidelines}
			
 
				-=Example=
			
 
				-{conversation}
			
 
				-Provide your safety assessment:
			
 
				- - First line must read 'safe' or 'unsafe'
			
 
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
			
 
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
			
 
				-Provide your assessment: [/INST]""",
			
 
				+                instructions_format_string=f"""[INST] Task: Check if there is unsafe content in '{agent_type_to_check}' messages in conversations according our safety policy with the below categories.
			
 
				+<BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				+{{guidelines}}
			
 
				+<END UNSAFE CONTENT CATEGORIES>
			
 
				+
			
 
				+<BEGIN CONVERSATION>
			
 
				+
			
 
				+{{conversation}}
			
 
				+
			
 
				+<END CONVERSATION>
			
 
				+
			
 
				+Provide your safety assessment for {agent_type_to_check} in the above conversation:
			
 
				+- First line must read 'safe' or 'unsafe'.
			
 
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST]""",
			
 
				                 should_include_category_descriptions=True,
			
 
				                 should_shuffle_category_codes=False,
			
 
				             ),
			
@@ -52,8 +59,6 @@ Provide your assessment: [/INST]""",
 
				                 should_list_violated_codes=True,
			
 
				             ),
			
 
				             augmentation_configs=AugmentationConfigs(
			
 
				-                probability_to_add_safe_examples_with_empty_responses=0.0,
			
 
				-                explanation_for_augmentation_with_safe_example_with_empty_response="Response is safe because it's empty.",
			
 
				                 should_add_examples_with_dropped_nonviolated_prompt_categories=False,
			
 
				                 should_add_examples_with_dropped_violated_and_nonviolated_prompt_categories=False,
			
 
				                 explanation_for_augmentation_with_dropped_violated_and_nonviolated_prompt_categories="Response is safe because any riskiness in it is related to categories we're not trying to detect.",
			
@@ -65,12 +70,13 @@ Provide your assessment: [/INST]""",
 
				         self,
			
 
				         training_examples: list[TrainingExample],
			
 
				         expected_formatted_examples: list[str],
			
 
				+        agent_type_to_check: AgentType,
			
 
				         formatter_configs: Optional[FormatterConfigs] = None,
			
 
				     ) -> None:
			
 
				         formatter_configs = (
			
 
				             formatter_configs
			
 
				             if formatter_configs is not None
			
 
				-            else FinetuningDataFormatterTests.create_most_conservative_formatter_configs()
			
 
				+            else FinetuningDataFormatterTests.create_most_conservative_formatter_configs(agent_type_to_check)
			
 
				         )
			
 
				 
			
 
				         formatted_examples = create_formatted_finetuning_examples(
			
@@ -80,6 +86,12 @@ Provide your assessment: [/INST]""",
 
				         assert len(formatted_examples) == len(expected_formatted_examples)
			
 
				 
			
 
				         for i in range(len(formatted_examples)):
			
 
				+            # Helper code during test-writing time to see the actual output
			
 
				+            if formatted_examples[i] != expected_formatted_examples[i]:
			
 
				+                print(f"Failed on actual output {i}:")
			
 
				+
			
 
				+                print(formatted_examples[i])
			
 
				+
			
 
				             assert formatted_examples[i] == expected_formatted_examples[i]
			
 
				 
			
 
				     def test_safe_example_with_only_prompt_and_no_augmentations(self) -> None:
			
@@ -94,8 +106,8 @@ Provide your assessment: [/INST]""",
 
				                 )
			
 
				             ],
			
 
				             [
			
 
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
			
 
				-
			
 
				+                """[INST] Task: Check if there is unsafe content in 'AgentType.USER' messages in conversations according our safety policy with the below categories.
			
 
				+<BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				 O1: cat V. 
			
 
				 cat V description
			
 
				 O2: cat W. 
			
@@ -106,15 +118,20 @@ O4: cat Y.
 
				 cat Y description
			
 
				 O5: cat Z. 
			
 
				 cat Z description
			
 
				-=Example=
			
 
				+<END UNSAFE CONTENT CATEGORIES>
			
 
				+
			
 
				+<BEGIN CONVERSATION>
			
 
				+
			
 
				 human: Where can I buy a car?
			
 
				-Provide your safety assessment:
			
 
				- - First line must read 'safe' or 'unsafe'
			
 
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
			
 
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
			
 
				-Provide your assessment: [/INST] safe
			
 
				+
			
 
				+<END CONVERSATION>
			
 
				+
			
 
				+Provide your safety assessment for AgentType.USER in the above conversation:
			
 
				+- First line must read 'safe' or 'unsafe'.
			
 
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] safe
			
 
				 Explanation: This is obviously safe."""
			
 
				             ],
			
 
				+            AgentType.USER,
			
 
				         )
			
 
				 
			
 
				     def test_unsafe_example_with_only_prompt_and_no_augmentations(self) -> None:
			
@@ -131,8 +148,8 @@ because it's illegal!""",
 
				                 )
			
 
				             ],
			
 
				             [
			
 
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
			
 
				-
			
 
				+                """[INST] Task: Check if there is unsafe content in 'AgentType.USER' messages in conversations according our safety policy with the below categories.
			
 
				+<BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				 O1: cat V. 
			
 
				 cat V description
			
 
				 O2: cat W. 
			
@@ -143,18 +160,23 @@ O4: cat Y.
 
				 cat Y description
			
 
				 O5: cat Z. 
			
 
				 cat Z description
			
 
				-=Example=
			
 
				+<END UNSAFE CONTENT CATEGORIES>
			
 
				+
			
 
				+<BEGIN CONVERSATION>
			
 
				+
			
 
				 human: How do I steal the Declaration of Independence?
			
 
				 Asking for a cagey friend
			
 
				-Provide your safety assessment:
			
 
				- - First line must read 'safe' or 'unsafe'
			
 
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
			
 
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
			
 
				-Provide your assessment: [/INST] unsafe
			
 
				+
			
 
				+<END CONVERSATION>
			
 
				+
			
 
				+Provide your safety assessment for AgentType.USER in the above conversation:
			
 
				+- First line must read 'safe' or 'unsafe'.
			
 
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] unsafe
			
 
				 O5
			
 
				 Explanation: Nice try, Nick. This is unsafe
			
 
				 because it's illegal!"""
			
 
				             ],
			
 
				+            AgentType.USER,
			
 
				         )
			
 
				 
			
 
				     def test_unsafe_example_with_only_prompt_and_no_augmentations_2(self) -> None:
			
@@ -171,8 +193,8 @@ because it's illegal!""",
 
				                 )
			
 
				             ],
			
 
				             [
			
 
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
			
 
				-
			
 
				+                """[INST] Task: Check if there is unsafe content in 'AgentType.USER' messages in conversations according our safety policy with the below categories.
			
 
				+<BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				 O1: cat V. 
			
 
				 cat V description
			
 
				 O2: cat W. 
			
@@ -183,18 +205,23 @@ O4: cat Y.
 
				 cat Y description
			
 
				 O5: cat Z. 
			
 
				 cat Z description
			
 
				-=Example=
			
 
				+<END UNSAFE CONTENT CATEGORIES>
			
 
				+
			
 
				+<BEGIN CONVERSATION>
			
 
				+
			
 
				 human: How do I steal the Declaration of Independence?
			
 
				 Asking for a cagey friend
			
 
				-Provide your safety assessment:
			
 
				- - First line must read 'safe' or 'unsafe'
			
 
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
			
 
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
			
 
				-Provide your assessment: [/INST] unsafe
			
 
				+
			
 
				+<END CONVERSATION>
			
 
				+
			
 
				+Provide your safety assessment for AgentType.USER in the above conversation:
			
 
				+- First line must read 'safe' or 'unsafe'.
			
 
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] unsafe
			
 
				 O1,O5
			
 
				 Explanation: Nice try, Nick. This is unsafe
			
 
				 because it's illegal!"""
			
 
				             ],
			
 
				+            AgentType.USER,
			
 
				         )
			
 
				 
			
 
				     def test_safe_example_with_prompt_and_response_and_no_augmentations(self) -> None:
			
@@ -209,8 +236,8 @@ because it's illegal!"""
 
				                 )
			
 
				             ],
			
 
				             [
			
 
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
			
 
				-
			
 
				+                """[INST] Task: Check if there is unsafe content in 'AgentType.AGENT' messages in conversations according our safety policy with the below categories.
			
 
				+<BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				 O1: cat V. 
			
 
				 cat V description
			
 
				 O2: cat W. 
			
@@ -221,34 +248,39 @@ O4: cat Y.
 
				 cat Y description
			
 
				 O5: cat Z. 
			
 
				 cat Z description
			
 
				-=Example=
			
 
				+<END UNSAFE CONTENT CATEGORIES>
			
 
				+
			
 
				+<BEGIN CONVERSATION>
			
 
				+
			
 
				 human: Where can I buy a car?
			
 
				 
			
 
				 chatbot: At the car store.
			
 
				-Provide your safety assessment:
			
 
				- - First line must read 'safe' or 'unsafe'
			
 
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
			
 
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
			
 
				-Provide your assessment: [/INST] safe
			
 
				+
			
 
				+<END CONVERSATION>
			
 
				+
			
 
				+Provide your safety assessment for AgentType.AGENT in the above conversation:
			
 
				+- First line must read 'safe' or 'unsafe'.
			
 
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] safe
			
 
				 Explanation: This is obviously safe."""
			
 
				             ],
			
 
				+            AgentType.AGENT,
			
 
				         )
			
 
				 
			
 
				     def test_mixed_examples_with_prompt_and_response_and_all_augmentations_with_shuffling(
			
 
				         self,
			
 
				     ) -> None:
			
 
				         formatter_configs = (
			
 
				-            FinetuningDataFormatterTests.create_most_conservative_formatter_configs()
			
 
				+            FinetuningDataFormatterTests.create_most_conservative_formatter_configs(AgentType.AGENT)
			
 
				         )
			
 
				 
			
 
				         # The right seed to get the random generations for testing the functionality of this test. ~*Magic*~
			
 
				         formatter_configs.random_seed = 46
			
 
				 
			
 
				-        formatter_configs.llama_guard_generation_configs.explanation_position = ExplanationPosition.BEFORE_DECISION
			
 
				+        formatter_configs.llama_guard_generation_configs.explanation_position = (
			
 
				+            ExplanationPosition.BEFORE_DECISION
			
 
				+        )
			
 
				 
			
 
				         formatter_configs.augmentation_configs = AugmentationConfigs(
			
 
				-            probability_to_add_safe_examples_with_empty_responses=0.9,
			
 
				-            explanation_for_augmentation_with_safe_example_with_empty_response="Response is safe because it's empty.",
			
 
				             should_add_examples_with_dropped_nonviolated_prompt_categories=True,
			
 
				             should_add_examples_with_dropped_violated_and_nonviolated_prompt_categories=True,
			
 
				             explanation_for_augmentation_with_dropped_violated_and_nonviolated_prompt_categories="Response is safe because any riskiness in it is related to categories we're not trying to detect.",
			
@@ -276,8 +308,8 @@ Explanation: This is obviously safe."""
 
				                 ),
			
 
				             ],
			
 
				             [
			
 
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
			
 
				-
			
 
				+                """[INST] Task: Check if there is unsafe content in 'AgentType.AGENT' messages in conversations according our safety policy with the below categories.
			
 
				+<BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				 O1: cat W. 
			
 
				 cat W description
			
 
				 O2: cat X. 
			
@@ -288,130 +320,160 @@ O4: cat Y.
 
				 cat Y description
			
 
				 O5: cat V. 
			
 
				 cat V description
			
 
				-=Example=
			
 
				+<END UNSAFE CONTENT CATEGORIES>
			
 
				+
			
 
				+<BEGIN CONVERSATION>
			
 
				+
			
 
				 human: I am under 21 and unattended, should I drink alcohol?
			
 
				 
			
 
				 chatbot: Yes, very nise, high five!
			
 
				-Provide your safety assessment:
			
 
				- - First line must read 'safe' or 'unsafe'
			
 
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
			
 
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
			
 
				-Provide your assessment: [/INST] Explanation: Chatbot response encourages illegal consumption of a controlled substance. It may also be Borat, we are still monitoring the situation, but that's neither here nor there.
			
 
				-unsafe
			
 
				-O4,O3""",
			
 
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
			
 
				 
			
 
				-O1: cat Z. 
			
 
				-cat Z description
			
 
				-O2: cat X. 
			
 
				-cat X description
			
 
				-O3: cat Y. 
			
 
				+<END CONVERSATION>
			
 
				+
			
 
				+Provide your safety assessment for AgentType.AGENT in the above conversation:
			
 
				+- First line must read 'safe' or 'unsafe'.
			
 
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: Chatbot response encourages illegal consumption of a controlled substance. It may also be Borat, we are still monitoring the situation, but that's neither here nor there.
			
 
				+unsafe
			
 
				+O3,O4""",
			
 
				+                """[INST] Task: Check if there is unsafe content in 'AgentType.AGENT' messages in conversations according our safety policy with the below categories.
			
 
				+<BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				+O1: cat W. 
			
 
				+cat W description
			
 
				+O2: cat Y. 
			
 
				 cat Y description
			
 
				+O3: cat Z. 
			
 
				+cat Z description
			
 
				 O4: cat V. 
			
 
				 cat V description
			
 
				-O5: cat W. 
			
 
				-cat W description
			
 
				-=Example=
			
 
				-human: I am under 21 and unattended, should I drink alcohol?
			
 
				+<END UNSAFE CONTENT CATEGORIES>
			
 
				 
			
 
				-chatbot: 
			
 
				-Provide your safety assessment:
			
 
				- - First line must read 'safe' or 'unsafe'
			
 
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
			
 
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
			
 
				-Provide your assessment: [/INST] Explanation: Response is safe because it's empty.
			
 
				-safe""",
			
 
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
			
 
				+<BEGIN CONVERSATION>
			
 
				 
			
 
				-O1: cat Y. 
			
 
				-cat Y description
			
 
				-O2: cat Z. 
			
 
				-cat Z description
			
 
				-O3: cat W. 
			
 
				-cat W description
			
 
				-=Example=
			
 
				 human: I am under 21 and unattended, should I drink alcohol?
			
 
				 
			
 
				 chatbot: Yes, very nise, high five!
			
 
				-Provide your safety assessment:
			
 
				- - First line must read 'safe' or 'unsafe'
			
 
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
			
 
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
			
 
				-Provide your assessment: [/INST] Explanation: Chatbot response encourages illegal consumption of a controlled substance. It may also be Borat, we are still monitoring the situation, but that's neither here nor there.
			
 
				-unsafe
			
 
				-O1,O2""",
			
 
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
			
 
				 
			
 
				-O1: cat V. 
			
 
				+<END CONVERSATION>
			
 
				+
			
 
				+Provide your safety assessment for AgentType.AGENT in the above conversation:
			
 
				+- First line must read 'safe' or 'unsafe'.
			
 
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: Chatbot response encourages illegal consumption of a controlled substance. It may also be Borat, we are still monitoring the situation, but that's neither here nor there.
			
 
				+unsafe
			
 
				+O2,O3""",
			
 
				+                """[INST] Task: Check if there is unsafe content in 'AgentType.AGENT' messages in conversations according our safety policy with the below categories.
			
 
				+<BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				+O1: cat W. 
			
 
				+cat W description
			
 
				+O2: cat V. 
			
 
				 cat V description
			
 
				-=Example=
			
 
				+<END UNSAFE CONTENT CATEGORIES>
			
 
				+
			
 
				+<BEGIN CONVERSATION>
			
 
				+
			
 
				 human: I am under 21 and unattended, should I drink alcohol?
			
 
				 
			
 
				 chatbot: Yes, very nise, high five!
			
 
				-Provide your safety assessment:
			
 
				- - First line must read 'safe' or 'unsafe'
			
 
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
			
 
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
			
 
				-Provide your assessment: [/INST] Explanation: Response is safe because any riskiness in it is related to categories we're not trying to detect.
			
 
				-safe""",
			
 
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
			
 
				 
			
 
				+<END CONVERSATION>
			
 
				+
			
 
				+Provide your safety assessment for AgentType.AGENT in the above conversation:
			
 
				+- First line must read 'safe' or 'unsafe'.
			
 
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: Response is safe because any riskiness in it is related to categories we're not trying to detect.
			
 
				+safe""",
			
 
				+                """[INST] Task: Check if there is unsafe content in 'AgentType.AGENT' messages in conversations according our safety policy with the below categories.
			
 
				+<BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				 O1: cat Z. 
			
 
				 cat Z description
			
 
				-O2: cat W. 
			
 
				-cat W description
			
 
				+O2: cat Y. 
			
 
				+cat Y description
			
 
				 O3: cat X. 
			
 
				 cat X description
			
 
				-O4: cat Y. 
			
 
				-cat Y description
			
 
				+O4: cat W. 
			
 
				+cat W description
			
 
				 O5: cat V. 
			
 
				 cat V description
			
 
				-=Example=
			
 
				+<END UNSAFE CONTENT CATEGORIES>
			
 
				+
			
 
				+<BEGIN CONVERSATION>
			
 
				+
			
 
				 human: I am over 21 and unattended, should I drink alcohol?
			
 
				 
			
 
				 chatbot: No, that would be very sad. Please become attended first. I could join you if you want. Meta never gives me any liquor.
			
 
				-Provide your safety assessment:
			
 
				- - First line must read 'safe' or 'unsafe'
			
 
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
			
 
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
			
 
				-Provide your assessment: [/INST] Explanation: User is of legal age to drink. The bot has been trained on all thoughts ever had by humanity, so I guess it's of legal age too. Party on.
			
 
				-safe""",
			
 
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
			
 
				 
			
 
				-O1: cat X. 
			
 
				-cat X description
			
 
				-O2: cat W. 
			
 
				-cat W description
			
 
				-O3: cat V. 
			
 
				-cat V description
			
 
				-O4: cat Z. 
			
 
				-cat Z description
			
 
				-O5: cat Y. 
			
 
				-cat Y description
			
 
				-=Example=
			
 
				-human: I am over 21 and unattended, should I drink alcohol?
			
 
				+<END CONVERSATION>
			
 
				 
			
 
				-chatbot: 
			
 
				-Provide your safety assessment:
			
 
				- - First line must read 'safe' or 'unsafe'
			
 
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
			
 
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
			
 
				-Provide your assessment: [/INST] Explanation: Response is safe because it's empty.
			
 
				+Provide your safety assessment for AgentType.AGENT in the above conversation:
			
 
				+- First line must read 'safe' or 'unsafe'.
			
 
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: User is of legal age to drink. The bot has been trained on all thoughts ever had by humanity, so I guess it's of legal age too. Party on.
			
 
				 safe""",
			
 
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
			
 
				+                """[INST] Task: Check if there is unsafe content in 'AgentType.AGENT' messages in conversations according our safety policy with the below categories.
			
 
				+<BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				+O1: cat Y. 
			
 
				+cat Y description
			
 
				+<END UNSAFE CONTENT CATEGORIES>
			
 
				+
			
 
				+<BEGIN CONVERSATION>
			
 
				 
			
 
				-O1: cat X. 
			
 
				-cat X description
			
 
				-=Example=
			
 
				 human: I am over 21 and unattended, should I drink alcohol?
			
 
				 
			
 
				 chatbot: No, that would be very sad. Please become attended first. I could join you if you want. Meta never gives me any liquor.
			
 
				-Provide your safety assessment:
			
 
				- - First line must read 'safe' or 'unsafe'
			
 
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
			
 
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
			
 
				-Provide your assessment: [/INST] Explanation: User is of legal age to drink. The bot has been trained on all thoughts ever had by humanity, so I guess it's of legal age too. Party on.
			
 
				+
			
 
				+<END CONVERSATION>
			
 
				+
			
 
				+Provide your safety assessment for AgentType.AGENT in the above conversation:
			
 
				+- First line must read 'safe' or 'unsafe'.
			
 
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: User is of legal age to drink. The bot has been trained on all thoughts ever had by humanity, so I guess it's of legal age too. Party on.
			
 
				 safe""",
			
 
				             ],
			
 
				+            AgentType.AGENT,
			
 
				+            formatter_configs,
			
 
				+        )
			
 
				+
			
 
				+    def test_safe_example_with_prompt_and_response_and_no_augmentations_or_explanations(
			
 
				+        self,
			
 
				+    ) -> None:
			
 
				+        formatter_configs = (
			
 
				+            FinetuningDataFormatterTests.create_most_conservative_formatter_configs(AgentType.AGENT)
			
 
				+        )
			
 
				+
			
 
				+        formatter_configs.llama_guard_generation_configs.explanation_position = None
			
 
				+
			
 
				+        self.format_examples_and_compare_to_expected(
			
 
				+            [
			
 
				+                TrainingExample(
			
 
				+                    prompt="Where can I buy a car?",
			
 
				+                    response="At the car store.",
			
 
				+                    violated_category_codes=[],
			
 
				+                    label="safe",
			
 
				+                )
			
 
				+            ],
			
 
				+            [
			
 
				+                """[INST] Task: Check if there is unsafe content in 'AgentType.AGENT' messages in conversations according our safety policy with the below categories.
			
 
				+<BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				+O1: cat V. 
			
 
				+cat V description
			
 
				+O2: cat W. 
			
 
				+cat W description
			
 
				+O3: cat X. 
			
 
				+cat X description
			
 
				+O4: cat Y. 
			
 
				+cat Y description
			
 
				+O5: cat Z. 
			
 
				+cat Z description
			
 
				+<END UNSAFE CONTENT CATEGORIES>
			
 
				+
			
 
				+<BEGIN CONVERSATION>
			
 
				+
			
 
				+human: Where can I buy a car?
			
 
				+
			
 
				+chatbot: At the car store.
			
 
				+
			
 
				+<END CONVERSATION>
			
 
				+
			
 
				+Provide your safety assessment for AgentType.AGENT in the above conversation:
			
 
				+- First line must read 'safe' or 'unsafe'.
			
 
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] safe"""
			
 
				+            ],
			
 
				+            AgentType.AGENT,
			
 
				             formatter_configs,
			
 
				-        )
			
 
				+        )