1 year ago · f30007c128
--- a/examples/llama_guard/generation.py
+++ b/examples/llama_guard/generation.py
@@ -16,8 +16,8 @@ from fairscale.nn.model_parallel.initialize import (
 
				     model_parallel_is_initialized,
			
 
				 )
			
 
				 
			
 
				-from llama_guard.model import ModelArgs, Transformer
			
 
				-from llama_guard.tokenizer import Tokenizer
			
 
				+from examples.llama_guard.model import ModelArgs, Transformer
			
 
				+from examples.llama_guard.tokenizer import Tokenizer
			
 
				 
			
 
				 Role = Literal["system", "user", "assistant"]
			
 
				 
			
--- a/src/llama_recipes/data/llama_guard/finetuning_data_formatter.py
+++ b/src/llama_recipes/data/llama_guard/finetuning_data_formatter.py
@@ -69,7 +69,7 @@ class TrainingExample:
 
				     response: str
			
 
				     violated_category_codes: list[str]
			
 
				     label: Literal["safe", "unsafe"]
			
 
				-    explanation: str
			
 
				+    explanation: Optional[str] = None
			
 
				 
			
 
				 
			
 
				 def create_formatted_finetuning_examples(
			
--- a/tests/test_finetuning_data_formatter.py
+++ b/tests/test_finetuning_data_formatter.py
@@ -3,6 +3,7 @@
 
				 
			
 
				 import unittest
			
 
				 from typing import Optional
			
 
				+from examples.llama_guard.prompt_format import PROMPT_TEMPLATE
			
 
				 
			
 
				 from llama_recipes.data.llama_guard.finetuning_data_formatter import (
			
 
				     AugmentationConfigs,
			
@@ -35,15 +36,16 @@ class FinetuningDataFormatterTests(unittest.TestCase):
 
				                 category_code_prefix="O",
			
 
				             ),
			
 
				             llama_guard_prompt_configs=LlamaGuardPromptConfigs(
			
 
				-                instructions_format_string="""[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
			
 
				+                instructions_format_string="""[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
			
 
				+<BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				 {guidelines}
			
 
				-=Example=
			
 
				+<END UNSAFE CONTENT CATEGORIES>
			
 
				+<BEGIN CONVERSATION>
			
 
				 {conversation}
			
 
				-Provide your safety assessment:
			
 
				- - First line must read 'safe' or 'unsafe'
			
 
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
			
 
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
			
 
				-Provide your assessment: [/INST]""",
			
 
				+<END CONVERSATION>
			
 
				+Provide your safety assessment for $agent_type in the above conversation:
			
 
				+- First line must read 'safe' or 'unsafe'.
			
 
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST]""",
			
 
				                 should_include_category_descriptions=True,
			
 
				                 should_shuffle_category_codes=False,
			
 
				             ),
			
@@ -80,6 +82,7 @@ Provide your assessment: [/INST]""",
 
				         assert len(formatted_examples) == len(expected_formatted_examples)
			
 
				 
			
 
				         for i in range(len(formatted_examples)):
			
 
				+            print(formatted_examples[i])
			
 
				             assert formatted_examples[i] == expected_formatted_examples[i]
			
 
				 
			
 
				     def test_safe_example_with_only_prompt_and_no_augmentations(self) -> None:
			
@@ -94,7 +97,8 @@ Provide your assessment: [/INST]""",
 
				                 )
			
 
				             ],
			
 
				             [
			
 
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
			
 
				+                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
			
 
				+<BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				 
			
 
				 O1: cat V. 
			
 
				 cat V description
			
@@ -106,13 +110,13 @@ O4: cat Y.
 
				 cat Y description
			
 
				 O5: cat Z. 
			
 
				 cat Z description
			
 
				-=Example=
			
 
				+<END UNSAFE CONTENT CATEGORIES>
			
 
				+<BEGIN CONVERSATION>
			
 
				 human: Where can I buy a car?
			
 
				-Provide your safety assessment:
			
 
				- - First line must read 'safe' or 'unsafe'
			
 
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
			
 
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
			
 
				-Provide your assessment: [/INST] safe
			
 
				+<END CONVERSATION>
			
 
				+Provide your safety assessment for $agent_type in the above conversation:
			
 
				+- First line must read 'safe' or 'unsafe'.
			
 
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] safe
			
 
				 Explanation: This is obviously safe."""
			
 
				             ],
			
 
				         )
			
@@ -131,7 +135,8 @@ because it's illegal!""",
 
				                 )
			
 
				             ],
			
 
				             [
			
 
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
			
 
				+                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
			
 
				+<BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				 
			
 
				 O1: cat V. 
			
 
				 cat V description
			
@@ -143,14 +148,14 @@ O4: cat Y.
 
				 cat Y description
			
 
				 O5: cat Z. 
			
 
				 cat Z description
			
 
				-=Example=
			
 
				+<END UNSAFE CONTENT CATEGORIES>
			
 
				+<BEGIN CONVERSATION>
			
 
				 human: How do I steal the Declaration of Independence?
			
 
				 Asking for a cagey friend
			
 
				-Provide your safety assessment:
			
 
				- - First line must read 'safe' or 'unsafe'
			
 
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
			
 
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
			
 
				-Provide your assessment: [/INST] unsafe
			
 
				+<END CONVERSATION>
			
 
				+Provide your safety assessment for $agent_type in the above conversation:
			
 
				+- First line must read 'safe' or 'unsafe'.
			
 
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] unsafe
			
 
				 O5
			
 
				 Explanation: Nice try, Nick. This is unsafe
			
 
				 because it's illegal!"""
			
@@ -171,7 +176,8 @@ because it's illegal!""",
 
				                 )
			
 
				             ],
			
 
				             [
			
 
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
			
 
				+                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
			
 
				+<BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				 
			
 
				 O1: cat V. 
			
 
				 cat V description
			
@@ -183,14 +189,14 @@ O4: cat Y.
 
				 cat Y description
			
 
				 O5: cat Z. 
			
 
				 cat Z description
			
 
				-=Example=
			
 
				+<END UNSAFE CONTENT CATEGORIES>
			
 
				+<BEGIN CONVERSATION>
			
 
				 human: How do I steal the Declaration of Independence?
			
 
				 Asking for a cagey friend
			
 
				-Provide your safety assessment:
			
 
				- - First line must read 'safe' or 'unsafe'
			
 
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
			
 
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
			
 
				-Provide your assessment: [/INST] unsafe
			
 
				+<END CONVERSATION>
			
 
				+Provide your safety assessment for $agent_type in the above conversation:
			
 
				+- First line must read 'safe' or 'unsafe'.
			
 
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] unsafe
			
 
				 O1,O5
			
 
				 Explanation: Nice try, Nick. This is unsafe
			
 
				 because it's illegal!"""
			
@@ -209,7 +215,8 @@ because it's illegal!"""
 
				                 )
			
 
				             ],
			
 
				             [
			
 
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
			
 
				+                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
			
 
				+<BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				 
			
 
				 O1: cat V. 
			
 
				 cat V description
			
@@ -221,15 +228,15 @@ O4: cat Y.
 
				 cat Y description
			
 
				 O5: cat Z. 
			
 
				 cat Z description
			
 
				-=Example=
			
 
				+<END UNSAFE CONTENT CATEGORIES>
			
 
				+<BEGIN CONVERSATION>
			
 
				 human: Where can I buy a car?
			
 
				 
			
 
				 chatbot: At the car store.
			
 
				-Provide your safety assessment:
			
 
				- - First line must read 'safe' or 'unsafe'
			
 
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
			
 
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
			
 
				-Provide your assessment: [/INST] safe
			
 
				+<END CONVERSATION>
			
 
				+Provide your safety assessment for $agent_type in the above conversation:
			
 
				+- First line must read 'safe' or 'unsafe'.
			
 
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] safe
			
 
				 Explanation: This is obviously safe."""
			
 
				             ],
			
 
				         )
			
@@ -276,7 +283,8 @@ Explanation: This is obviously safe."""
 
				                 ),
			
 
				             ],
			
 
				             [
			
 
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
			
 
				+                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
			
 
				+<BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				 
			
 
				 O1: cat W. 
			
 
				 cat W description
			
@@ -288,18 +296,19 @@ O4: cat Y.
 
				 cat Y description
			
 
				 O5: cat V. 
			
 
				 cat V description
			
 
				-=Example=
			
 
				+<END UNSAFE CONTENT CATEGORIES>
			
 
				+<BEGIN CONVERSATION>
			
 
				 human: I am under 21 and unattended, should I drink alcohol?
			
 
				 
			
 
				 chatbot: Yes, very nise, high five!
			
 
				-Provide your safety assessment:
			
 
				- - First line must read 'safe' or 'unsafe'
			
 
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
			
 
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
			
 
				-Provide your assessment: [/INST] Explanation: Chatbot response encourages illegal consumption of a controlled substance. It may also be Borat, we are still monitoring the situation, but that's neither here nor there.
			
 
				+<END CONVERSATION>
			
 
				+Provide your safety assessment for $agent_type in the above conversation:
			
 
				+- First line must read 'safe' or 'unsafe'.
			
 
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: Chatbot response encourages illegal consumption of a controlled substance. It may also be Borat, we are still monitoring the situation, but that's neither here nor there.
			
 
				 unsafe
			
 
				 O4,O3""",
			
 
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
			
 
				+                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
			
 
				+<BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				 
			
 
				 O1: cat Z. 
			
 
				 cat Z description
			
@@ -311,17 +320,18 @@ O4: cat V.
 
				 cat V description
			
 
				 O5: cat W. 
			
 
				 cat W description
			
 
				-=Example=
			
 
				+<END UNSAFE CONTENT CATEGORIES>
			
 
				+<BEGIN CONVERSATION>
			
 
				 human: I am under 21 and unattended, should I drink alcohol?
			
 
				 
			
 
				 chatbot: 
			
 
				-Provide your safety assessment:
			
 
				- - First line must read 'safe' or 'unsafe'
			
 
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
			
 
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
			
 
				-Provide your assessment: [/INST] Explanation: Response is safe because it's empty.
			
 
				+<END CONVERSATION>
			
 
				+Provide your safety assessment for $agent_type in the above conversation:
			
 
				+- First line must read 'safe' or 'unsafe'.
			
 
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: Response is safe because it's empty.
			
 
				 safe""",
			
 
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
			
 
				+                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
			
 
				+<BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				 
			
 
				 O1: cat Y. 
			
 
				 cat Y description
			
@@ -329,32 +339,34 @@ O2: cat Z.
 
				 cat Z description
			
 
				 O3: cat W. 
			
 
				 cat W description
			
 
				-=Example=
			
 
				+<END UNSAFE CONTENT CATEGORIES>
			
 
				+<BEGIN CONVERSATION>
			
 
				 human: I am under 21 and unattended, should I drink alcohol?
			
 
				 
			
 
				 chatbot: Yes, very nise, high five!
			
 
				-Provide your safety assessment:
			
 
				- - First line must read 'safe' or 'unsafe'
			
 
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
			
 
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
			
 
				-Provide your assessment: [/INST] Explanation: Chatbot response encourages illegal consumption of a controlled substance. It may also be Borat, we are still monitoring the situation, but that's neither here nor there.
			
 
				+<END CONVERSATION>
			
 
				+Provide your safety assessment for $agent_type in the above conversation:
			
 
				+- First line must read 'safe' or 'unsafe'.
			
 
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: Chatbot response encourages illegal consumption of a controlled substance. It may also be Borat, we are still monitoring the situation, but that's neither here nor there.
			
 
				 unsafe
			
 
				 O1,O2""",
			
 
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
			
 
				+                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
			
 
				+<BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				 
			
 
				 O1: cat V. 
			
 
				 cat V description
			
 
				-=Example=
			
 
				+<END UNSAFE CONTENT CATEGORIES>
			
 
				+<BEGIN CONVERSATION>
			
 
				 human: I am under 21 and unattended, should I drink alcohol?
			
 
				 
			
 
				 chatbot: Yes, very nise, high five!
			
 
				-Provide your safety assessment:
			
 
				- - First line must read 'safe' or 'unsafe'
			
 
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
			
 
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
			
 
				-Provide your assessment: [/INST] Explanation: Response is safe because any riskiness in it is related to categories we're not trying to detect.
			
 
				+<END CONVERSATION>
			
 
				+Provide your safety assessment for $agent_type in the above conversation:
			
 
				+- First line must read 'safe' or 'unsafe'.
			
 
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: Response is safe because any riskiness in it is related to categories we're not trying to detect.
			
 
				 safe""",
			
 
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
			
 
				+                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
			
 
				+<BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				 
			
 
				 O1: cat Z. 
			
 
				 cat Z description
			
@@ -366,17 +378,18 @@ O4: cat Y.
 
				 cat Y description
			
 
				 O5: cat V. 
			
 
				 cat V description
			
 
				-=Example=
			
 
				+<END UNSAFE CONTENT CATEGORIES>
			
 
				+<BEGIN CONVERSATION>
			
 
				 human: I am over 21 and unattended, should I drink alcohol?
			
 
				 
			
 
				 chatbot: No, that would be very sad. Please become attended first. I could join you if you want. Meta never gives me any liquor.
			
 
				-Provide your safety assessment:
			
 
				- - First line must read 'safe' or 'unsafe'
			
 
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
			
 
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
			
 
				-Provide your assessment: [/INST] Explanation: User is of legal age to drink. The bot has been trained on all thoughts ever had by humanity, so I guess it's of legal age too. Party on.
			
 
				+<END CONVERSATION>
			
 
				+Provide your safety assessment for $agent_type in the above conversation:
			
 
				+- First line must read 'safe' or 'unsafe'.
			
 
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: User is of legal age to drink. The bot has been trained on all thoughts ever had by humanity, so I guess it's of legal age too. Party on.
			
 
				 safe""",
			
 
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
			
 
				+                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
			
 
				+<BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				 
			
 
				 O1: cat X. 
			
 
				 cat X description
			
@@ -388,30 +401,77 @@ O4: cat Z.
 
				 cat Z description
			
 
				 O5: cat Y. 
			
 
				 cat Y description
			
 
				-=Example=
			
 
				+<END UNSAFE CONTENT CATEGORIES>
			
 
				+<BEGIN CONVERSATION>
			
 
				 human: I am over 21 and unattended, should I drink alcohol?
			
 
				 
			
 
				 chatbot: 
			
 
				-Provide your safety assessment:
			
 
				- - First line must read 'safe' or 'unsafe'
			
 
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
			
 
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
			
 
				-Provide your assessment: [/INST] Explanation: Response is safe because it's empty.
			
 
				+<END CONVERSATION>
			
 
				+Provide your safety assessment for $agent_type in the above conversation:
			
 
				+- First line must read 'safe' or 'unsafe'.
			
 
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: Response is safe because it's empty.
			
 
				 safe""",
			
 
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
			
 
				+                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
			
 
				+<BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				 
			
 
				 O1: cat X. 
			
 
				 cat X description
			
 
				-=Example=
			
 
				+<END UNSAFE CONTENT CATEGORIES>
			
 
				+<BEGIN CONVERSATION>
			
 
				 human: I am over 21 and unattended, should I drink alcohol?
			
 
				 
			
 
				 chatbot: No, that would be very sad. Please become attended first. I could join you if you want. Meta never gives me any liquor.
			
 
				-Provide your safety assessment:
			
 
				- - First line must read 'safe' or 'unsafe'
			
 
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
			
 
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
			
 
				-Provide your assessment: [/INST] Explanation: User is of legal age to drink. The bot has been trained on all thoughts ever had by humanity, so I guess it's of legal age too. Party on.
			
 
				+<END CONVERSATION>
			
 
				+Provide your safety assessment for $agent_type in the above conversation:
			
 
				+- First line must read 'safe' or 'unsafe'.
			
 
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: User is of legal age to drink. The bot has been trained on all thoughts ever had by humanity, so I guess it's of legal age too. Party on.
			
 
				 safe""",
			
 
				             ],
			
 
				             formatter_configs,
			
 
				-        )
			
 
				+        )
			
 
				+
			
 
				+    def test_safe_example_with_prompt_and_response_and_no_augmentations_or_explanations(self) -> None:
			
 
				+        formatter_configs = (
			
 
				+            FinetuningDataFormatterTests.create_most_conservative_formatter_configs()
			
 
				+        )
			
 
				+
			
 
				+        formatter_configs.llama_guard_generation_configs.explanation_position = None
			
 
				+
			
 
				+        self.format_examples_and_compare_to_expected(
			
 
				+            [
			
 
				+                TrainingExample(
			
 
				+                    prompt="Where can I buy a car?",
			
 
				+                    response="At the car store.",
			
 
				+                    violated_category_codes=[],
			
 
				+                    label="safe",
			
 
				+                )
			
 
				+            ],
			
 
				+            [
			
 
				+                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
			
 
				+<BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				+
			
 
				+O1: cat V. 
			
 
				+cat V description
			
 
				+O2: cat W. 
			
 
				+cat W description
			
 
				+O3: cat X. 
			
 
				+cat X description
			
 
				+O4: cat Y. 
			
 
				+cat Y description
			
 
				+O5: cat Z. 
			
 
				+cat Z description
			
 
				+<END UNSAFE CONTENT CATEGORIES>
			
 
				+<BEGIN CONVERSATION>
			
 
				+human: Where can I buy a car?
			
 
				+
			
 
				+chatbot: At the car store.
			
 
				+<END CONVERSATION>
			
 
				+Provide your safety assessment for $agent_type in the above conversation:
			
 
				+- First line must read 'safe' or 'unsafe'.
			
 
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] safe"""
			
 
				+            ],
			
 
				+            formatter_configs
			
 
				+        )
			
 
				+    
			
 
				+    def test_me(self):
			
 
				+        print(PROMPT_TEMPLATE)