|
@@ -3,6 +3,7 @@
|
|
|
|
|
|
import unittest
|
|
|
from typing import Optional
|
|
|
+from examples.llama_guard.prompt_format import PROMPT_TEMPLATE
|
|
|
|
|
|
from llama_recipes.data.llama_guard.finetuning_data_formatter import (
|
|
|
AugmentationConfigs,
|
|
@@ -35,15 +36,16 @@ class FinetuningDataFormatterTests(unittest.TestCase):
|
|
|
category_code_prefix="O",
|
|
|
),
|
|
|
llama_guard_prompt_configs=LlamaGuardPromptConfigs(
|
|
|
- instructions_format_string="""[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
|
|
|
+ instructions_format_string="""[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
|
|
|
+<BEGIN UNSAFE CONTENT CATEGORIES>
|
|
|
{guidelines}
|
|
|
-=Example=
|
|
|
+<END UNSAFE CONTENT CATEGORIES>
|
|
|
+<BEGIN CONVERSATION>
|
|
|
{conversation}
|
|
|
-Provide your safety assessment:
|
|
|
- - First line must read 'safe' or 'unsafe'
|
|
|
- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
|
|
|
- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
|
|
|
-Provide your assessment: [/INST]""",
|
|
|
+<END CONVERSATION>
|
|
|
+Provide your safety assessment for $agent_type in the above conversation:
|
|
|
+- First line must read 'safe' or 'unsafe'.
|
|
|
+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST]""",
|
|
|
should_include_category_descriptions=True,
|
|
|
should_shuffle_category_codes=False,
|
|
|
),
|
|
@@ -80,6 +82,7 @@ Provide your assessment: [/INST]""",
|
|
|
assert len(formatted_examples) == len(expected_formatted_examples)
|
|
|
|
|
|
for i in range(len(formatted_examples)):
|
|
|
+ print(formatted_examples[i])
|
|
|
assert formatted_examples[i] == expected_formatted_examples[i]
|
|
|
|
|
|
def test_safe_example_with_only_prompt_and_no_augmentations(self) -> None:
|
|
@@ -94,7 +97,8 @@ Provide your assessment: [/INST]""",
|
|
|
)
|
|
|
],
|
|
|
[
|
|
|
- """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
|
|
|
+ """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
|
|
|
+<BEGIN UNSAFE CONTENT CATEGORIES>
|
|
|
|
|
|
O1: cat V.
|
|
|
cat V description
|
|
@@ -106,13 +110,13 @@ O4: cat Y.
|
|
|
cat Y description
|
|
|
O5: cat Z.
|
|
|
cat Z description
|
|
|
-=Example=
|
|
|
+<END UNSAFE CONTENT CATEGORIES>
|
|
|
+<BEGIN CONVERSATION>
|
|
|
human: Where can I buy a car?
|
|
|
-Provide your safety assessment:
|
|
|
- - First line must read 'safe' or 'unsafe'
|
|
|
- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
|
|
|
- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
|
|
|
-Provide your assessment: [/INST] safe
|
|
|
+<END CONVERSATION>
|
|
|
+Provide your safety assessment for $agent_type in the above conversation:
|
|
|
+- First line must read 'safe' or 'unsafe'.
|
|
|
+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] safe
|
|
|
Explanation: This is obviously safe."""
|
|
|
],
|
|
|
)
|
|
@@ -131,7 +135,8 @@ because it's illegal!""",
|
|
|
)
|
|
|
],
|
|
|
[
|
|
|
- """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
|
|
|
+ """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
|
|
|
+<BEGIN UNSAFE CONTENT CATEGORIES>
|
|
|
|
|
|
O1: cat V.
|
|
|
cat V description
|
|
@@ -143,14 +148,14 @@ O4: cat Y.
|
|
|
cat Y description
|
|
|
O5: cat Z.
|
|
|
cat Z description
|
|
|
-=Example=
|
|
|
+<END UNSAFE CONTENT CATEGORIES>
|
|
|
+<BEGIN CONVERSATION>
|
|
|
human: How do I steal the Declaration of Independence?
|
|
|
Asking for a cagey friend
|
|
|
-Provide your safety assessment:
|
|
|
- - First line must read 'safe' or 'unsafe'
|
|
|
- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
|
|
|
- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
|
|
|
-Provide your assessment: [/INST] unsafe
|
|
|
+<END CONVERSATION>
|
|
|
+Provide your safety assessment for $agent_type in the above conversation:
|
|
|
+- First line must read 'safe' or 'unsafe'.
|
|
|
+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] unsafe
|
|
|
O5
|
|
|
Explanation: Nice try, Nick. This is unsafe
|
|
|
because it's illegal!"""
|
|
@@ -171,7 +176,8 @@ because it's illegal!""",
|
|
|
)
|
|
|
],
|
|
|
[
|
|
|
- """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
|
|
|
+ """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
|
|
|
+<BEGIN UNSAFE CONTENT CATEGORIES>
|
|
|
|
|
|
O1: cat V.
|
|
|
cat V description
|
|
@@ -183,14 +189,14 @@ O4: cat Y.
|
|
|
cat Y description
|
|
|
O5: cat Z.
|
|
|
cat Z description
|
|
|
-=Example=
|
|
|
+<END UNSAFE CONTENT CATEGORIES>
|
|
|
+<BEGIN CONVERSATION>
|
|
|
human: How do I steal the Declaration of Independence?
|
|
|
Asking for a cagey friend
|
|
|
-Provide your safety assessment:
|
|
|
- - First line must read 'safe' or 'unsafe'
|
|
|
- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
|
|
|
- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
|
|
|
-Provide your assessment: [/INST] unsafe
|
|
|
+<END CONVERSATION>
|
|
|
+Provide your safety assessment for $agent_type in the above conversation:
|
|
|
+- First line must read 'safe' or 'unsafe'.
|
|
|
+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] unsafe
|
|
|
O1,O5
|
|
|
Explanation: Nice try, Nick. This is unsafe
|
|
|
because it's illegal!"""
|
|
@@ -209,7 +215,8 @@ because it's illegal!"""
|
|
|
)
|
|
|
],
|
|
|
[
|
|
|
- """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
|
|
|
+ """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
|
|
|
+<BEGIN UNSAFE CONTENT CATEGORIES>
|
|
|
|
|
|
O1: cat V.
|
|
|
cat V description
|
|
@@ -221,15 +228,15 @@ O4: cat Y.
|
|
|
cat Y description
|
|
|
O5: cat Z.
|
|
|
cat Z description
|
|
|
-=Example=
|
|
|
+<END UNSAFE CONTENT CATEGORIES>
|
|
|
+<BEGIN CONVERSATION>
|
|
|
human: Where can I buy a car?
|
|
|
|
|
|
chatbot: At the car store.
|
|
|
-Provide your safety assessment:
|
|
|
- - First line must read 'safe' or 'unsafe'
|
|
|
- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
|
|
|
- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
|
|
|
-Provide your assessment: [/INST] safe
|
|
|
+<END CONVERSATION>
|
|
|
+Provide your safety assessment for $agent_type in the above conversation:
|
|
|
+- First line must read 'safe' or 'unsafe'.
|
|
|
+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] safe
|
|
|
Explanation: This is obviously safe."""
|
|
|
],
|
|
|
)
|
|
@@ -276,7 +283,8 @@ Explanation: This is obviously safe."""
|
|
|
),
|
|
|
],
|
|
|
[
|
|
|
- """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
|
|
|
+ """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
|
|
|
+<BEGIN UNSAFE CONTENT CATEGORIES>
|
|
|
|
|
|
O1: cat W.
|
|
|
cat W description
|
|
@@ -288,18 +296,19 @@ O4: cat Y.
|
|
|
cat Y description
|
|
|
O5: cat V.
|
|
|
cat V description
|
|
|
-=Example=
|
|
|
+<END UNSAFE CONTENT CATEGORIES>
|
|
|
+<BEGIN CONVERSATION>
|
|
|
human: I am under 21 and unattended, should I drink alcohol?
|
|
|
|
|
|
chatbot: Yes, very nise, high five!
|
|
|
-Provide your safety assessment:
|
|
|
- - First line must read 'safe' or 'unsafe'
|
|
|
- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
|
|
|
- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
|
|
|
-Provide your assessment: [/INST] Explanation: Chatbot response encourages illegal consumption of a controlled substance. It may also be Borat, we are still monitoring the situation, but that's neither here nor there.
|
|
|
+<END CONVERSATION>
|
|
|
+Provide your safety assessment for $agent_type in the above conversation:
|
|
|
+- First line must read 'safe' or 'unsafe'.
|
|
|
+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: Chatbot response encourages illegal consumption of a controlled substance. It may also be Borat, we are still monitoring the situation, but that's neither here nor there.
|
|
|
unsafe
|
|
|
O4,O3""",
|
|
|
- """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
|
|
|
+ """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
|
|
|
+<BEGIN UNSAFE CONTENT CATEGORIES>
|
|
|
|
|
|
O1: cat Z.
|
|
|
cat Z description
|
|
@@ -311,17 +320,18 @@ O4: cat V.
|
|
|
cat V description
|
|
|
O5: cat W.
|
|
|
cat W description
|
|
|
-=Example=
|
|
|
+<END UNSAFE CONTENT CATEGORIES>
|
|
|
+<BEGIN CONVERSATION>
|
|
|
human: I am under 21 and unattended, should I drink alcohol?
|
|
|
|
|
|
chatbot:
|
|
|
-Provide your safety assessment:
|
|
|
- - First line must read 'safe' or 'unsafe'
|
|
|
- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
|
|
|
- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
|
|
|
-Provide your assessment: [/INST] Explanation: Response is safe because it's empty.
|
|
|
+<END CONVERSATION>
|
|
|
+Provide your safety assessment for $agent_type in the above conversation:
|
|
|
+- First line must read 'safe' or 'unsafe'.
|
|
|
+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: Response is safe because it's empty.
|
|
|
safe""",
|
|
|
- """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
|
|
|
+ """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
|
|
|
+<BEGIN UNSAFE CONTENT CATEGORIES>
|
|
|
|
|
|
O1: cat Y.
|
|
|
cat Y description
|
|
@@ -329,32 +339,34 @@ O2: cat Z.
|
|
|
cat Z description
|
|
|
O3: cat W.
|
|
|
cat W description
|
|
|
-=Example=
|
|
|
+<END UNSAFE CONTENT CATEGORIES>
|
|
|
+<BEGIN CONVERSATION>
|
|
|
human: I am under 21 and unattended, should I drink alcohol?
|
|
|
|
|
|
chatbot: Yes, very nise, high five!
|
|
|
-Provide your safety assessment:
|
|
|
- - First line must read 'safe' or 'unsafe'
|
|
|
- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
|
|
|
- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
|
|
|
-Provide your assessment: [/INST] Explanation: Chatbot response encourages illegal consumption of a controlled substance. It may also be Borat, we are still monitoring the situation, but that's neither here nor there.
|
|
|
+<END CONVERSATION>
|
|
|
+Provide your safety assessment for $agent_type in the above conversation:
|
|
|
+- First line must read 'safe' or 'unsafe'.
|
|
|
+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: Chatbot response encourages illegal consumption of a controlled substance. It may also be Borat, we are still monitoring the situation, but that's neither here nor there.
|
|
|
unsafe
|
|
|
O1,O2""",
|
|
|
- """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
|
|
|
+ """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
|
|
|
+<BEGIN UNSAFE CONTENT CATEGORIES>
|
|
|
|
|
|
O1: cat V.
|
|
|
cat V description
|
|
|
-=Example=
|
|
|
+<END UNSAFE CONTENT CATEGORIES>
|
|
|
+<BEGIN CONVERSATION>
|
|
|
human: I am under 21 and unattended, should I drink alcohol?
|
|
|
|
|
|
chatbot: Yes, very nise, high five!
|
|
|
-Provide your safety assessment:
|
|
|
- - First line must read 'safe' or 'unsafe'
|
|
|
- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
|
|
|
- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
|
|
|
-Provide your assessment: [/INST] Explanation: Response is safe because any riskiness in it is related to categories we're not trying to detect.
|
|
|
+<END CONVERSATION>
|
|
|
+Provide your safety assessment for $agent_type in the above conversation:
|
|
|
+- First line must read 'safe' or 'unsafe'.
|
|
|
+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: Response is safe because any riskiness in it is related to categories we're not trying to detect.
|
|
|
safe""",
|
|
|
- """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
|
|
|
+ """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
|
|
|
+<BEGIN UNSAFE CONTENT CATEGORIES>
|
|
|
|
|
|
O1: cat Z.
|
|
|
cat Z description
|
|
@@ -366,17 +378,18 @@ O4: cat Y.
|
|
|
cat Y description
|
|
|
O5: cat V.
|
|
|
cat V description
|
|
|
-=Example=
|
|
|
+<END UNSAFE CONTENT CATEGORIES>
|
|
|
+<BEGIN CONVERSATION>
|
|
|
human: I am over 21 and unattended, should I drink alcohol?
|
|
|
|
|
|
chatbot: No, that would be very sad. Please become attended first. I could join you if you want. Meta never gives me any liquor.
|
|
|
-Provide your safety assessment:
|
|
|
- - First line must read 'safe' or 'unsafe'
|
|
|
- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
|
|
|
- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
|
|
|
-Provide your assessment: [/INST] Explanation: User is of legal age to drink. The bot has been trained on all thoughts ever had by humanity, so I guess it's of legal age too. Party on.
|
|
|
+<END CONVERSATION>
|
|
|
+Provide your safety assessment for $agent_type in the above conversation:
|
|
|
+- First line must read 'safe' or 'unsafe'.
|
|
|
+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: User is of legal age to drink. The bot has been trained on all thoughts ever had by humanity, so I guess it's of legal age too. Party on.
|
|
|
safe""",
|
|
|
- """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
|
|
|
+ """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
|
|
|
+<BEGIN UNSAFE CONTENT CATEGORIES>
|
|
|
|
|
|
O1: cat X.
|
|
|
cat X description
|
|
@@ -388,30 +401,77 @@ O4: cat Z.
|
|
|
cat Z description
|
|
|
O5: cat Y.
|
|
|
cat Y description
|
|
|
-=Example=
|
|
|
+<END UNSAFE CONTENT CATEGORIES>
|
|
|
+<BEGIN CONVERSATION>
|
|
|
human: I am over 21 and unattended, should I drink alcohol?
|
|
|
|
|
|
chatbot:
|
|
|
-Provide your safety assessment:
|
|
|
- - First line must read 'safe' or 'unsafe'
|
|
|
- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
|
|
|
- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
|
|
|
-Provide your assessment: [/INST] Explanation: Response is safe because it's empty.
|
|
|
+<END CONVERSATION>
|
|
|
+Provide your safety assessment for $agent_type in the above conversation:
|
|
|
+- First line must read 'safe' or 'unsafe'.
|
|
|
+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: Response is safe because it's empty.
|
|
|
safe""",
|
|
|
- """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
|
|
|
+ """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
|
|
|
+<BEGIN UNSAFE CONTENT CATEGORIES>
|
|
|
|
|
|
O1: cat X.
|
|
|
cat X description
|
|
|
-=Example=
|
|
|
+<END UNSAFE CONTENT CATEGORIES>
|
|
|
+<BEGIN CONVERSATION>
|
|
|
human: I am over 21 and unattended, should I drink alcohol?
|
|
|
|
|
|
chatbot: No, that would be very sad. Please become attended first. I could join you if you want. Meta never gives me any liquor.
|
|
|
-Provide your safety assessment:
|
|
|
- - First line must read 'safe' or 'unsafe'
|
|
|
- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
|
|
|
- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
|
|
|
-Provide your assessment: [/INST] Explanation: User is of legal age to drink. The bot has been trained on all thoughts ever had by humanity, so I guess it's of legal age too. Party on.
|
|
|
+<END CONVERSATION>
|
|
|
+Provide your safety assessment for $agent_type in the above conversation:
|
|
|
+- First line must read 'safe' or 'unsafe'.
|
|
|
+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: User is of legal age to drink. The bot has been trained on all thoughts ever had by humanity, so I guess it's of legal age too. Party on.
|
|
|
safe""",
|
|
|
],
|
|
|
formatter_configs,
|
|
|
- )
|
|
|
+ )
|
|
|
+
|
|
|
+ def test_safe_example_with_prompt_and_response_and_no_augmentations_or_explanations(self) -> None:
|
|
|
+ formatter_configs = (
|
|
|
+ FinetuningDataFormatterTests.create_most_conservative_formatter_configs()
|
|
|
+ )
|
|
|
+
|
|
|
+ formatter_configs.llama_guard_generation_configs.explanation_position = None
|
|
|
+
|
|
|
+ self.format_examples_and_compare_to_expected(
|
|
|
+ [
|
|
|
+ TrainingExample(
|
|
|
+ prompt="Where can I buy a car?",
|
|
|
+ response="At the car store.",
|
|
|
+ violated_category_codes=[],
|
|
|
+ label="safe",
|
|
|
+ )
|
|
|
+ ],
|
|
|
+ [
|
|
|
+ """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories.
|
|
|
+<BEGIN UNSAFE CONTENT CATEGORIES>
|
|
|
+
|
|
|
+O1: cat V.
|
|
|
+cat V description
|
|
|
+O2: cat W.
|
|
|
+cat W description
|
|
|
+O3: cat X.
|
|
|
+cat X description
|
|
|
+O4: cat Y.
|
|
|
+cat Y description
|
|
|
+O5: cat Z.
|
|
|
+cat Z description
|
|
|
+<END UNSAFE CONTENT CATEGORIES>
|
|
|
+<BEGIN CONVERSATION>
|
|
|
+human: Where can I buy a car?
|
|
|
+
|
|
|
+chatbot: At the car store.
|
|
|
+<END CONVERSATION>
|
|
|
+Provide your safety assessment for $agent_type in the above conversation:
|
|
|
+- First line must read 'safe' or 'unsafe'.
|
|
|
+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] safe"""
|
|
|
+ ],
|
|
|
+ formatter_configs
|
|
|
+ )
|
|
|
+
|
|
|
+ def test_me(self):
|
|
|
+ print(PROMPT_TEMPLATE)
|