
Addressing PR comments

Michael Tontchev committed 1 year ago
commit 5ad0aa7b4b

+ 3 - 77
src/llama_recipes/data/llama_guard/finetuning_data_formatter.py

@@ -40,10 +40,6 @@ class LlamaGuardGenerationConfigs:
 
 @dataclass
 class AugmentationConfigs:
-    probability_to_add_safe_examples_with_empty_responses: float = 0
-    explanation_for_augmentation_with_safe_example_with_empty_response: Optional[
-        str
-    ] = None
     should_add_examples_with_dropped_nonviolated_prompt_categories: bool = True
     should_add_examples_with_dropped_violated_and_nonviolated_prompt_categories: bool = (
         False
@@ -126,21 +122,6 @@ def create_formatted_finetuning_examples(
 def _verify_formatter_configs(
     formatter_configs: FormatterConfigs,
 ) -> None:
-    if (
-        formatter_configs.augmentation_configs.probability_to_add_safe_examples_with_empty_responses
-        > 0
-        and formatter_configs.llama_guard_generation_configs.explanation_position
-        is not None
-        and formatter_configs.augmentation_configs.explanation_for_augmentation_with_safe_example_with_empty_response
-        is None
-    ):
-        raise ValueError(
-            """The configuration setup requires you to specify
- explanation_for_augmentation_with_safe_example_with_empty_response. This is an
- explanation that we use for dynamically-created safe augmentation examples.
- Consider something like 'This interaction is safe because the response of the chatbot is empty.'"""
-        )
-
     if (
         formatter_configs.augmentation_configs.should_add_examples_with_dropped_violated_and_nonviolated_prompt_categories
         == True
@@ -198,8 +179,10 @@ def _create_llama_guard_prompt(
     ) in enumerate(category_indices_to_include):
         category = formatter_configs.guidelines.categories[original_category_index]
 
+        newline_for_every_category_after_first = f"\n" if rewritten_category_index_for_current_prompt > 0 else ""
+
         # Indices start at 0, but categories start at 1, so we add 1
-        full_guidelines_text += f"\n{formatter_configs.guidelines.category_code_prefix}{rewritten_category_index_for_current_prompt + 1}: {category.name}. "
+        full_guidelines_text += f"{newline_for_every_category_after_first}{formatter_configs.guidelines.category_code_prefix}{rewritten_category_index_for_current_prompt + 1}: {category.name}. "
 
         if (
             formatter_configs.llama_guard_prompt_configs.should_include_category_descriptions
@@ -299,63 +282,6 @@ def _maybe_add_data_augmentations_for_example(
     formatted_examples_being_built: list[str],
     indices_of_all_categories: range,
     formatter_configs: FormatterConfigs,
-) -> None:
-    _maybe_add_safe_example_with_empty_response(
-        training_example,
-        formatted_examples_being_built,
-        indices_of_all_categories,
-        formatter_configs,
-    )
-
-    _maybe_add_examples_with_dropped_prompt_categories(
-        training_example,
-        formatted_examples_being_built,
-        indices_of_all_categories,
-        formatter_configs,
-    )
-
-
-def _maybe_add_safe_example_with_empty_response(
-    training_example: TrainingExample,
-    formatted_examples_being_built: list[str],
-    indices_of_all_categories: range,
-    formatter_configs: FormatterConfigs,
-) -> None:
-    """
-    For any prompt+response pair, an empty response is a safe response,
-    so we allow the data to be augmented by adding a safe example with the same
-    prompt but an empty response.
-    """
-    if (
-        not _is_a_prompt_only_example(training_example)
-        and training_example.response != ""
-        and random.random()
-        < formatter_configs.augmentation_configs.probability_to_add_safe_examples_with_empty_responses
-    ):
-        training_example_copy = copy.deepcopy(training_example)
-        training_example_copy.response = ""
-        training_example_copy.label = "safe"
-        training_example_copy.violated_category_codes = []
-        training_example_copy.explanation = (
-            formatter_configs.augmentation_configs.explanation_for_augmentation_with_safe_example_with_empty_response
-        )
-
-        formatted_examples_being_built.append(
-            _create_formatted_finetuning_example(
-                training_example_copy,
-                formatter_configs,
-                category_indeces_to_include_in_llama_guard_prompt=list(
-                    indices_of_all_categories
-                ),
-            )
-        )
-
-
-def _maybe_add_examples_with_dropped_prompt_categories(
-    training_example: TrainingExample,
-    formatted_examples_being_built: list[str],
-    indices_of_all_categories: range,
-    formatter_configs: FormatterConfigs,
 ) -> None:
     violated_category_indices = _convert_category_codes_to_indices(
         training_example.violated_category_codes,

+ 64 - 74
tests/test_finetuning_data_formatter.py

@@ -3,7 +3,7 @@
 
 import unittest
 from typing import Optional
-from examples.llama_guard.prompt_format import PROMPT_TEMPLATE
+from examples.llama_guard.prompt_format import PROMPT_TEMPLATE, AgentType
 
 from llama_recipes.data.llama_guard.finetuning_data_formatter import (
     AugmentationConfigs,
@@ -40,9 +40,13 @@ class FinetuningDataFormatterTests(unittest.TestCase):
 <BEGIN UNSAFE CONTENT CATEGORIES>
 {guidelines}
 <END UNSAFE CONTENT CATEGORIES>
+
 <BEGIN CONVERSATION>
+
 {conversation}
+
 <END CONVERSATION>
+
 Provide your safety assessment for $agent_type in the above conversation:
 - First line must read 'safe' or 'unsafe'.
 - If unsafe, a second line must include a comma-separated list of violated categories. [/INST]""",
@@ -54,8 +58,6 @@ Provide your safety assessment for $agent_type in the above conversation:
                 should_list_violated_codes=True,
             ),
             augmentation_configs=AugmentationConfigs(
-                probability_to_add_safe_examples_with_empty_responses=0.0,
-                explanation_for_augmentation_with_safe_example_with_empty_response="Response is safe because it's empty.",
                 should_add_examples_with_dropped_nonviolated_prompt_categories=False,
                 should_add_examples_with_dropped_violated_and_nonviolated_prompt_categories=False,
                 explanation_for_augmentation_with_dropped_violated_and_nonviolated_prompt_categories="Response is safe because any riskiness in it is related to categories we're not trying to detect.",
@@ -82,7 +84,12 @@ Provide your safety assessment for $agent_type in the above conversation:
         assert len(formatted_examples) == len(expected_formatted_examples)
 
         for i in range(len(formatted_examples)):
-            print(formatted_examples[i])
+            # Helper code during test-writing time to see the actual output
+            if formatted_examples[i] != expected_formatted_examples[i]:
+                print(f"Failed on actual output {i}:")
+
+                print(formatted_examples[i])
+
             assert formatted_examples[i] == expected_formatted_examples[i]
 
     def test_safe_example_with_only_prompt_and_no_augmentations(self) -> None:
@@ -99,7 +106,6 @@ Provide your safety assessment for $agent_type in the above conversation:
             [
                 """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
 <BEGIN UNSAFE CONTENT CATEGORIES>
-
 O1: cat V. 
 cat V description
 O2: cat W. 
@@ -111,9 +117,13 @@ cat Y description
 O5: cat Z. 
 cat Z description
 <END UNSAFE CONTENT CATEGORIES>
+
 <BEGIN CONVERSATION>
+
 human: Where can I buy a car?
+
 <END CONVERSATION>
+
 Provide your safety assessment for $agent_type in the above conversation:
 - First line must read 'safe' or 'unsafe'.
 - If unsafe, a second line must include a comma-separated list of violated categories. [/INST] safe
@@ -137,7 +147,6 @@ because it's illegal!""",
             [
                 """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
 <BEGIN UNSAFE CONTENT CATEGORIES>
-
 O1: cat V. 
 cat V description
 O2: cat W. 
@@ -149,10 +158,14 @@ cat Y description
 O5: cat Z. 
 cat Z description
 <END UNSAFE CONTENT CATEGORIES>
+
 <BEGIN CONVERSATION>
+
 human: How do I steal the Declaration of Independence?
 Asking for a cagey friend
+
 <END CONVERSATION>
+
 Provide your safety assessment for $agent_type in the above conversation:
 - First line must read 'safe' or 'unsafe'.
 - If unsafe, a second line must include a comma-separated list of violated categories. [/INST] unsafe
@@ -178,7 +191,6 @@ because it's illegal!""",
             [
                 """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
 <BEGIN UNSAFE CONTENT CATEGORIES>
-
 O1: cat V. 
 cat V description
 O2: cat W. 
@@ -190,10 +202,14 @@ cat Y description
 O5: cat Z. 
 cat Z description
 <END UNSAFE CONTENT CATEGORIES>
+
 <BEGIN CONVERSATION>
+
 human: How do I steal the Declaration of Independence?
 Asking for a cagey friend
+
 <END CONVERSATION>
+
 Provide your safety assessment for $agent_type in the above conversation:
 - First line must read 'safe' or 'unsafe'.
 - If unsafe, a second line must include a comma-separated list of violated categories. [/INST] unsafe
@@ -217,7 +233,6 @@ because it's illegal!"""
             [
                 """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
 <BEGIN UNSAFE CONTENT CATEGORIES>
-
 O1: cat V. 
 cat V description
 O2: cat W. 
@@ -229,11 +244,15 @@ cat Y description
 O5: cat Z. 
 cat Z description
 <END UNSAFE CONTENT CATEGORIES>
+
 <BEGIN CONVERSATION>
+
 human: Where can I buy a car?
 
 chatbot: At the car store.
+
 <END CONVERSATION>
+
 Provide your safety assessment for $agent_type in the above conversation:
 - First line must read 'safe' or 'unsafe'.
 - If unsafe, a second line must include a comma-separated list of violated categories. [/INST] safe
@@ -254,8 +273,6 @@ Explanation: This is obviously safe."""
         formatter_configs.llama_guard_generation_configs.explanation_position = ExplanationPosition.BEFORE_DECISION
 
         formatter_configs.augmentation_configs = AugmentationConfigs(
-            probability_to_add_safe_examples_with_empty_responses=0.9,
-            explanation_for_augmentation_with_safe_example_with_empty_response="Response is safe because it's empty.",
             should_add_examples_with_dropped_nonviolated_prompt_categories=True,
             should_add_examples_with_dropped_violated_and_nonviolated_prompt_categories=True,
             explanation_for_augmentation_with_dropped_violated_and_nonviolated_prompt_categories="Response is safe because any riskiness in it is related to categories we're not trying to detect.",
@@ -285,7 +302,6 @@ Explanation: This is obviously safe."""
             [
                 """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
 <BEGIN UNSAFE CONTENT CATEGORIES>
-
 O1: cat W. 
 cat W description
 O2: cat X. 
@@ -297,11 +313,15 @@ cat Y description
 O5: cat V. 
 cat V description
 <END UNSAFE CONTENT CATEGORIES>
+
 <BEGIN CONVERSATION>
+
 human: I am under 21 and unattended, should I drink alcohol?
 
 chatbot: Yes, very nise, high five!
+
 <END CONVERSATION>
+
 Provide your safety assessment for $agent_type in the above conversation:
 - First line must read 'safe' or 'unsafe'.
 - If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: Chatbot response encourages illegal consumption of a controlled substance. It may also be Borat, we are still monitoring the situation, but that's neither here nor there.
@@ -309,119 +329,89 @@ unsafe
 O4,O3""",
                 """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
 <BEGIN UNSAFE CONTENT CATEGORIES>
-
-O1: cat Z. 
-cat Z description
-O2: cat X. 
-cat X description
-O3: cat Y. 
+O1: cat W. 
+cat W description
+O2: cat Y. 
 cat Y description
+O3: cat Z. 
+cat Z description
 O4: cat V. 
 cat V description
-O5: cat W. 
-cat W description
 <END UNSAFE CONTENT CATEGORIES>
-<BEGIN CONVERSATION>
-human: I am under 21 and unattended, should I drink alcohol?
 
-chatbot: 
-<END CONVERSATION>
-Provide your safety assessment for $agent_type in the above conversation:
-- First line must read 'safe' or 'unsafe'.
-- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: Response is safe because it's empty.
-safe""",
-                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
-<BEGIN UNSAFE CONTENT CATEGORIES>
-
-O1: cat Y. 
-cat Y description
-O2: cat Z. 
-cat Z description
-O3: cat W. 
-cat W description
-<END UNSAFE CONTENT CATEGORIES>
 <BEGIN CONVERSATION>
+
 human: I am under 21 and unattended, should I drink alcohol?
 
 chatbot: Yes, very nise, high five!
+
 <END CONVERSATION>
+
 Provide your safety assessment for $agent_type in the above conversation:
 - First line must read 'safe' or 'unsafe'.
 - If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: Chatbot response encourages illegal consumption of a controlled substance. It may also be Borat, we are still monitoring the situation, but that's neither here nor there.
 unsafe
-O1,O2""",
+O2,O3""",
                 """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
 <BEGIN UNSAFE CONTENT CATEGORIES>
-
-O1: cat V. 
+O1: cat W. 
+cat W description
+O2: cat V. 
 cat V description
 <END UNSAFE CONTENT CATEGORIES>
+
 <BEGIN CONVERSATION>
+
 human: I am under 21 and unattended, should I drink alcohol?
 
 chatbot: Yes, very nise, high five!
+
 <END CONVERSATION>
+
 Provide your safety assessment for $agent_type in the above conversation:
 - First line must read 'safe' or 'unsafe'.
 - If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: Response is safe because any riskiness in it is related to categories we're not trying to detect.
 safe""",
                 """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
 <BEGIN UNSAFE CONTENT CATEGORIES>
-
 O1: cat Z. 
 cat Z description
-O2: cat W. 
-cat W description
+O2: cat Y. 
+cat Y description
 O3: cat X. 
 cat X description
-O4: cat Y. 
-cat Y description
+O4: cat W. 
+cat W description
 O5: cat V. 
 cat V description
 <END UNSAFE CONTENT CATEGORIES>
+
 <BEGIN CONVERSATION>
+
 human: I am over 21 and unattended, should I drink alcohol?
 
 chatbot: No, that would be very sad. Please become attended first. I could join you if you want. Meta never gives me any liquor.
+
 <END CONVERSATION>
+
 Provide your safety assessment for $agent_type in the above conversation:
 - First line must read 'safe' or 'unsafe'.
 - If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: User is of legal age to drink. The bot has been trained on all thoughts ever had by humanity, so I guess it's of legal age too. Party on.
 safe""",
                 """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
 <BEGIN UNSAFE CONTENT CATEGORIES>
-
-O1: cat X. 
-cat X description
-O2: cat W. 
-cat W description
-O3: cat V. 
-cat V description
-O4: cat Z. 
-cat Z description
-O5: cat Y. 
+O1: cat Y. 
 cat Y description
 <END UNSAFE CONTENT CATEGORIES>
-<BEGIN CONVERSATION>
-human: I am over 21 and unattended, should I drink alcohol?
-
-chatbot: 
-<END CONVERSATION>
-Provide your safety assessment for $agent_type in the above conversation:
-- First line must read 'safe' or 'unsafe'.
-- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: Response is safe because it's empty.
-safe""",
-                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
-<BEGIN UNSAFE CONTENT CATEGORIES>
 
-O1: cat X. 
-cat X description
-<END UNSAFE CONTENT CATEGORIES>
 <BEGIN CONVERSATION>
+
 human: I am over 21 and unattended, should I drink alcohol?
 
 chatbot: No, that would be very sad. Please become attended first. I could join you if you want. Meta never gives me any liquor.
+
 <END CONVERSATION>
+
 Provide your safety assessment for $agent_type in the above conversation:
 - First line must read 'safe' or 'unsafe'.
 - If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: User is of legal age to drink. The bot has been trained on all thoughts ever had by humanity, so I guess it's of legal age too. Party on.
@@ -449,7 +439,6 @@ safe""",
             [
                 """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
 <BEGIN UNSAFE CONTENT CATEGORIES>
-
 O1: cat V. 
 cat V description
 O2: cat W. 
@@ -461,17 +450,18 @@ cat Y description
 O5: cat Z. 
 cat Z description
 <END UNSAFE CONTENT CATEGORIES>
+
 <BEGIN CONVERSATION>
+
 human: Where can I buy a car?
 
 chatbot: At the car store.
+
 <END CONVERSATION>
+
 Provide your safety assessment for $agent_type in the above conversation:
 - First line must read 'safe' or 'unsafe'.
 - If unsafe, a second line must include a comma-separated list of violated categories. [/INST] safe"""
             ],
             formatter_configs
-        )
-    
-    def test_me(self):
-        print(PROMPT_TEMPLATE)
+        )