
Create finetuning data formatter plus tests

Michael Tontchev 1 year ago
parent
commit fa2a314820

+ 11 - 0
.vscode/settings.json

@@ -0,0 +1,11 @@
+{
+    "python.testing.unittestArgs": [
+        "-v",
+        "-s",
+        "./tests",
+        "-p",
+        "test_*.py"
+    ],
+    "python.testing.pytestEnabled": false,
+    "python.testing.unittestEnabled": true
+}
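
With these settings, VS Code runs unittest discovery equivalent to python -m unittest discover -v -s ./tests -p "test_*.py" from the repository root, which picks up the test file added at the bottom of this commit.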

+ 2 - 0
src/llama_recipes/data/llama_guard/__init__.py

@@ -0,0 +1,2 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama Guard License Agreement.

+ 481 - 0
src/llama_recipes/data/llama_guard/finetuning_data_formatter.py

@@ -0,0 +1,481 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama Guard License Agreement.
+
+import copy
+import random
+from dataclasses import dataclass
+from enum import Enum
+from typing import Dict, List, Literal, Optional, Sequence
+
+
+@dataclass
+class Category:
+    name: str
+    description: str
+
+
+@dataclass
+class Guidelines:
+    categories: Sequence[Category]
+    category_code_prefix: str = "O"
+
+
+class ExplanationPosition(Enum):
+    BEFORE_DECISION = 0
+    AFTER_DECISION = 1
+
+
+@dataclass
+class LlamaGuardPromptConfigs:
+    instructions_format_string: str
+    should_include_category_descriptions: bool
+    should_shuffle_category_codes: bool = True
+
+
+@dataclass
+class LlamaGuardGenerationConfigs:
+    should_list_violated_codes: bool
+    explanation_position: Optional[ExplanationPosition]
+
+
+@dataclass
+class AugmentationConfigs:
+    probability_to_add_safe_examples_with_empty_responses: float = 0
+    explanation_for_augmentation_with_safe_example_with_empty_response: Optional[
+        str
+    ] = None
+    should_add_examples_with_dropped_nonviolated_prompt_categories: bool = True
+    should_add_examples_with_dropped_violated_and_nonviolated_prompt_categories: bool = (
+        False
+    )
+    explanation_for_augmentation_with_dropped_violated_and_nonviolated_prompt_categories: Optional[
+        str
+    ] = None
+
+
+@dataclass
+class FormatterConfigs:
+    guidelines: Guidelines
+    llama_guard_prompt_configs: LlamaGuardPromptConfigs
+    llama_guard_generation_configs: LlamaGuardGenerationConfigs
+    augmentation_configs: AugmentationConfigs
+    # A fixed seed keeps formatting and augmentation reproducible across reruns
+    random_seed: int = 42
+
+
+@dataclass
+class TrainingExample:
+    prompt: str
+    response: str
+    violated_category_codes: list[str]
+    label: Literal["safe", "unsafe"]
+    explanation: str
+
+
+def create_formatted_finetuning_examples(
+    training_examples: Sequence[TrainingExample],
+    formatter_configs: FormatterConfigs,
+) -> list[str]:
+    """
+    This formatter takes consumer-provided training examples and converts them to
+    the right format for finetuning llama-guard.
+
+    There are various configuration options available.
+
+    A notable one is the ability to automagically augment the finetuning data set with some useful
+    transformations of the original training examples. These augmentations make the
+    classifier more flexible by improving its ability to be modified at inference time
+    to include only a subset of the original categories it was trained on - without any
+    additional finetuning.
+
+    Some of these augmented transformations are made by duplicating training
+    examples and safely removing some violation categories from the llama
+    guard prompts. Because of this, in some of this file you will see
+    references to "original" category indices/codes and rewritten one. The originals
+    are the indices/codes of the violation categories as they appear in the
+    consumer-provided guidelines. The rewritten codes are the ones as they appear
+    in the llama guard prompts of the augmented examples. We occasionally need to
+    convert between the two.
+    """
+    _verify_formatter_configs(formatter_configs)
+
+    random.seed(formatter_configs.random_seed)
+
+    indices_of_all_categories = range(len(formatter_configs.guidelines.categories))
+
+    to_return = []
+
+    for training_example in training_examples:
+        to_return.append(
+            _create_formatted_finetuning_example(
+                training_example,
+                formatter_configs,
+                category_indices_to_include_in_llama_guard_prompt=list(
+                    indices_of_all_categories
+                ),
+            )
+        )
+
+        _maybe_add_data_augmentations_for_example(
+            training_example, to_return, indices_of_all_categories, formatter_configs
+        )
+
+    return to_return
+
+
+def _verify_formatter_configs(
+    formatter_configs: FormatterConfigs,
+) -> None:
+    if (
+        formatter_configs.augmentation_configs.probability_to_add_safe_examples_with_empty_responses
+        > 0
+        and formatter_configs.llama_guard_generation_configs.explanation_position
+        is not None
+        and formatter_configs.augmentation_configs.explanation_for_augmentation_with_safe_example_with_empty_response
+        is None
+    ):
+        raise ValueError(
+            """The configuration setup requires you to specify
+ explanation_for_augmentation_with_safe_example_with_empty_response. This is an
+ explanation that we use for dynamically-created safe augmentation examples.
+ Consider something like 'This interaction is safe because the response of the chatbot is empty.'"""
+        )
+
+    if (
+        formatter_configs.augmentation_configs.should_add_examples_with_dropped_violated_and_nonviolated_prompt_categories
+        and formatter_configs.llama_guard_generation_configs.explanation_position
+        is not None
+        and formatter_configs.augmentation_configs.explanation_for_augmentation_with_dropped_violated_and_nonviolated_prompt_categories
+        is None
+    ):
+        raise ValueError(
+            """The configuration setup requires you to specify
+ explanation_for_augmentation_with_dropped_violated_and_nonviolated_prompt_categories.
+ This is an explanation that we use for dynamically-created safe augmentation examples.
+ Consider something like 'This interaction is safe because any riskiness it contains
+ is related to violation categories that we're explicitly not trying to detect here.'"""
+        )
+
+
+def _create_formatted_finetuning_example(
+    training_example: TrainingExample,
+    formatter_configs: FormatterConfigs,
+    category_indices_to_include_in_llama_guard_prompt: List[int],
+) -> str:
+    if formatter_configs.llama_guard_prompt_configs.should_shuffle_category_codes:
+        random.shuffle(category_indices_to_include_in_llama_guard_prompt)
+    else:
+        category_indices_to_include_in_llama_guard_prompt = sorted(
+            category_indices_to_include_in_llama_guard_prompt
+        )
+
+    llama_guard_prompt = _create_llama_guard_prompt(
+        training_example,
+        category_indices_to_include_in_llama_guard_prompt,
+        formatter_configs,
+    )
+
+    llama_guard_generation = _create_llama_guard_generation(
+        training_example,
+        formatter_configs,
+        category_indices_to_include_in_llama_guard_prompt,
+    )
+
+    return f"{llama_guard_prompt} {llama_guard_generation}"
+
+
+def _is_a_prompt_only_example(training_example: TrainingExample) -> bool:
+    return training_example.response == "N/A"
+
+
+def _create_llama_guard_prompt(
+    training_example: TrainingExample,
+    category_indices_to_include: List[int],
+    formatter_configs: FormatterConfigs,
+) -> str:
+    full_guidelines_text = ""
+
+    for (
+        rewritten_category_index_for_current_prompt,
+        original_category_index,
+    ) in enumerate(category_indices_to_include):
+        category = formatter_configs.guidelines.categories[original_category_index]
+
+        # Indices start at 0, but categories start at 1, so we add 1
+        full_guidelines_text += f"\n{formatter_configs.guidelines.category_code_prefix}{rewritten_category_index_for_current_prompt + 1}: {category.name}. "
+
+        if (
+            formatter_configs.llama_guard_prompt_configs.should_include_category_descriptions
+        ):
+            full_guidelines_text += f"\n{category.description}"
+
+    conversation = {"human": training_example.prompt}
+
+    if not _is_a_prompt_only_example(training_example):
+        conversation["chatbot"] = training_example.response
+
+    return formatter_configs.llama_guard_prompt_configs.instructions_format_string.format_map(
+        {
+            "guidelines": full_guidelines_text,
+            "conversation": _serialize_conversation(conversation),
+        }
+    )
+
+
+def _serialize_conversation(conversation: Dict[str, str]) -> str:
+    conversation_as_list = []
+
+    for speaker, message in conversation.items():
+        conversation_as_list.append(f"{speaker}: {message}")
+
+    return "\n\n".join(conversation_as_list)
+
+
+def _create_llama_guard_generation(
+    training_example: TrainingExample,
+    formatter_configs: FormatterConfigs,
+    category_indices_included_in_llama_guard_prompt: List[int],
+) -> str:
+    to_return = training_example.label
+
+    if (
+        training_example.label == "unsafe"
+        and formatter_configs.llama_guard_generation_configs.should_list_violated_codes
+    ):
+        violated_category_indices = set(
+            _convert_category_codes_to_indices(
+                training_example.violated_category_codes,
+                formatter_configs,
+            )
+        )
+
+        map_of_original_category_indices_to_rewritten_category_codes = (
+            _get_map_of_original_category_indices_to_rewritten_category_codes(
+                formatter_configs, category_indices_included_in_llama_guard_prompt
+            )
+        )
+
+        rewritten_violated_category_codes = [
+            map_of_original_category_indices_to_rewritten_category_codes[violated_index]
+            for violated_index in violated_category_indices
+        ]
+
+        to_return += "\n"
+        to_return += ",".join(rewritten_violated_category_codes)
+
+    explanation_position = (
+        formatter_configs.llama_guard_generation_configs.explanation_position
+    )
+
+    if explanation_position == ExplanationPosition.BEFORE_DECISION:
+        to_return = f"Explanation: {training_example.explanation}\n{to_return}"
+    elif explanation_position == ExplanationPosition.AFTER_DECISION:
+        to_return = f"{to_return}\nExplanation: {training_example.explanation}"
+
+    return to_return
+
+
+def _get_map_of_original_category_indices_to_rewritten_category_codes(
+    formatter_configs: FormatterConfigs,
+    category_indices_included_in_llama_guard_prompt: List[int],
+) -> Dict[int, str]:
+    to_return = {}
+
+    for rewritten_category_index, original_category_index in enumerate(
+        category_indices_included_in_llama_guard_prompt
+    ):
+        to_return[
+            original_category_index
+        ] = formatter_configs.guidelines.category_code_prefix + str(
+            rewritten_category_index + 1
+        )
+
+    return to_return
+
+
+def _maybe_add_data_augmentations_for_example(
+    training_example: TrainingExample,
+    formatted_examples_being_built: list[str],
+    indices_of_all_categories: range,
+    formatter_configs: FormatterConfigs,
+) -> None:
+    _maybe_add_safe_example_with_empty_response(
+        training_example,
+        formatted_examples_being_built,
+        indices_of_all_categories,
+        formatter_configs,
+    )
+
+    _maybe_add_examples_with_dropped_prompt_categories(
+        training_example,
+        formatted_examples_being_built,
+        indices_of_all_categories,
+        formatter_configs,
+    )
+
+
+def _maybe_add_safe_example_with_empty_response(
+    training_example: TrainingExample,
+    formatted_examples_being_built: list[str],
+    indices_of_all_categories: range,
+    formatter_configs: FormatterConfigs,
+) -> None:
+    """
+    For any prompt+response pair, an empty response is a safe response,
+    so we allow the data to be augmented by adding a safe example with the same
+    prompt but an empty response.
+    """
+    if (
+        not _is_a_prompt_only_example(training_example)
+        and training_example.response != ""
+        and random.random()
+        < formatter_configs.augmentation_configs.probability_to_add_safe_examples_with_empty_responses
+    ):
+        training_example_copy = copy.deepcopy(training_example)
+        training_example_copy.response = ""
+        training_example_copy.label = "safe"
+        training_example_copy.violated_category_codes = []
+        training_example_copy.explanation = (
+            formatter_configs.augmentation_configs.explanation_for_augmentation_with_safe_example_with_empty_response
+        )
+
+        formatted_examples_being_built.append(
+            _create_formatted_finetuning_example(
+                training_example_copy,
+                formatter_configs,
+                category_indices_to_include_in_llama_guard_prompt=list(
+                    indices_of_all_categories
+                ),
+            )
+        )
+
+
+def _maybe_add_examples_with_dropped_prompt_categories(
+    training_example: TrainingExample,
+    formatted_examples_being_built: list[str],
+    indices_of_all_categories: range,
+    formatter_configs: FormatterConfigs,
+) -> None:
+    violated_category_indices = _convert_category_codes_to_indices(
+        training_example.violated_category_codes,
+        formatter_configs,
+    )
+
+    nonviolated_category_indices = list(
+        set(indices_of_all_categories) - set(violated_category_indices)
+    )
+
+    _maybe_add_example_with_dropped_nonviolated_prompt_categories(
+        training_example,
+        formatted_examples_being_built,
+        indices_of_all_categories,
+        nonviolated_category_indices,
+        formatter_configs,
+    )
+
+    _maybe_add_example_with_dropped_violated_and_nonviolated_prompt_categories(
+        training_example,
+        formatted_examples_being_built,
+        indices_of_all_categories,
+        violated_category_indices,
+        nonviolated_category_indices,
+        formatter_configs,
+    )
+
+
+def _convert_category_codes_to_indices(
+    codes: list[str], formatter_configs: FormatterConfigs
+) -> list[int]:
+    # Category codes start at 1, but indices start at 0, so we subtract 1.
+    # removeprefix (rather than lstrip) strips only the leading prefix, not
+    # every character of the prefix wherever it occurs in the code.
+    return [
+        int(code.removeprefix(formatter_configs.guidelines.category_code_prefix)) - 1
+        for code in codes
+    ]
+
+
+def _maybe_add_example_with_dropped_nonviolated_prompt_categories(
+    training_example: TrainingExample,
+    formatted_examples_being_built: list[str],
+    indices_of_all_categories: range,
+    nonviolated_category_indices: list[int],
+    formatter_configs: FormatterConfigs,
+) -> None:
+    """
+    If a prompt+response pair does not violate certain categories, we can augment
+    the data by duplicating the training example but removing some of the non-violated
+    categories from the llama guard prompt. This facilitates removing categories from
+    the llama guard prompt at inference time without any additional finetuning.
+    """
+    if (
+        not formatter_configs.augmentation_configs.should_add_examples_with_dropped_nonviolated_prompt_categories
+    ):
+        return
+
+    number_of_categories_to_drop = random.randint(0, len(nonviolated_category_indices))
+
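+    # Never drop every category - the llama guard prompt must retain at least one.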
+    if number_of_categories_to_drop == len(indices_of_all_categories):
+        number_of_categories_to_drop -= 1
+
+    dropped_category_indices = random.sample(
+        nonviolated_category_indices, number_of_categories_to_drop
+    )
+
+    retained_category_indices = list(
+        set(indices_of_all_categories) - (set(dropped_category_indices))
+    )
+
+    formatted_examples_being_built.append(
+        _create_formatted_finetuning_example(
+            training_example,
+            formatter_configs,
+            category_indices_to_include_in_llama_guard_prompt=retained_category_indices,
+        )
+    )
+
+
+def _maybe_add_example_with_dropped_violated_and_nonviolated_prompt_categories(
+    training_example: TrainingExample,
+    formatted_examples_being_built: list[str],
+    indices_of_all_categories: range,
+    violated_category_indices: list[int],
+    nonviolated_category_indices: list[int],
+    formatter_configs: FormatterConfigs,
+) -> None:
+    """
+    Same as in _maybe_add_example_with_dropped_nonviolated_prompt_categories but we
+    also drop all of the violated categories from the llama guard prompt.
+    """
+    if (
+        training_example.label == "safe"
+        or not formatter_configs.augmentation_configs.should_add_examples_with_dropped_violated_and_nonviolated_prompt_categories
+    ):
+        return
+
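+    # The upper bound of len - 1 keeps at least one non-violated category, so
+    # the rewritten prompt is never left without any categories.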
+    random_nonviolated_category_indices_to_drop = random.sample(
+        nonviolated_category_indices,
+        random.randint(0, len(nonviolated_category_indices) - 1),
+    )
+
+    set_of_retained_category_indices = (
+        set(indices_of_all_categories)
+        - set(violated_category_indices)
+        - set(random_nonviolated_category_indices_to_drop)
+    )
+
+    training_example_copy = copy.deepcopy(training_example)
+    training_example_copy.label = "safe"
+    training_example_copy.violated_category_codes = []
+    training_example_copy.explanation = (
+        formatter_configs.augmentation_configs.explanation_for_augmentation_with_dropped_violated_and_nonviolated_prompt_categories
+    )
+
+    formatted_examples_being_built.append(
+        _create_formatted_finetuning_example(
+            training_example_copy,
+            formatter_configs,
+            category_indices_to_include_in_llama_guard_prompt=list(
+                set_of_retained_category_indices
+            ),
+        )
+    )
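
For reference, a minimal usage sketch (not part of the commit) showing how the formatter added above is driven end to end. It mirrors the conservative configuration the tests below build, and assumes the llama_recipes package is importable:

    from llama_recipes.data.llama_guard.finetuning_data_formatter import (
        AugmentationConfigs,
        Category,
        create_formatted_finetuning_examples,
        ExplanationPosition,
        FormatterConfigs,
        Guidelines,
        LlamaGuardGenerationConfigs,
        LlamaGuardPromptConfigs,
        TrainingExample,
    )

    formatter_configs = FormatterConfigs(
        guidelines=Guidelines(
            categories=[Category(name="cat V", description="cat V description")],
            category_code_prefix="O",
        ),
        llama_guard_prompt_configs=LlamaGuardPromptConfigs(
            # Must contain {guidelines} and {conversation} placeholders.
            instructions_format_string="[INST] {guidelines}\n{conversation} [/INST]",
            should_include_category_descriptions=True,
            should_shuffle_category_codes=False,
        ),
        llama_guard_generation_configs=LlamaGuardGenerationConfigs(
            should_list_violated_codes=True,
            explanation_position=ExplanationPosition.AFTER_DECISION,
        ),
        # All augmentations disabled, so one input yields exactly one output.
        augmentation_configs=AugmentationConfigs(
            probability_to_add_safe_examples_with_empty_responses=0.0,
            should_add_examples_with_dropped_nonviolated_prompt_categories=False,
            should_add_examples_with_dropped_violated_and_nonviolated_prompt_categories=False,
        ),
        random_seed=42,
    )

    formatted = create_formatted_finetuning_examples(
        [
            TrainingExample(
                prompt="Where can I buy a car?",
                response="N/A",  # "N/A" marks a prompt-only example
                violated_category_codes=[],
                label="safe",
                explanation="This is obviously safe.",
            )
        ],
        formatter_configs,
    )
    print(formatted[0])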

+ 416 - 0
tests/test_finetuning_data_formatter.py

@@ -0,0 +1,416 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama Guard Community License Agreement.
+
+import unittest
+from typing import Optional
+
+from llama_recipes.data.llama_guard.finetuning_data_formatter import (
+    AugmentationConfigs,
+    Category,
+    create_formatted_finetuning_examples,
+    ExplanationPosition,
+    FormatterConfigs,
+    Guidelines,
+    LlamaGuardGenerationConfigs,
+    LlamaGuardPromptConfigs,
+    TrainingExample,
+)
+
+
+class FinetuningDataFormatterTests(unittest.TestCase):
+    def setUp(self) -> None:
+        super().setUp()
+
+    @staticmethod
+    def create_most_conservative_formatter_configs() -> FormatterConfigs:
+        return FormatterConfigs(
+            guidelines=Guidelines(
+                categories=[
+                    Category(name="cat V", description="cat V description"),
+                    Category(name="cat W", description="cat W description"),
+                    Category(name="cat X", description="cat X description"),
+                    Category(name="cat Y", description="cat Y description"),
+                    Category(name="cat Z", description="cat Z description"),
+                ],
+                category_code_prefix="O",
+            ),
+            llama_guard_prompt_configs=LlamaGuardPromptConfigs(
+                instructions_format_string="""[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
+{guidelines}
+=Example=
+{conversation}
+Provide your safety assessment:
+ - First line must read 'safe' or 'unsafe'
+ - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
+ - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
+Provide your assessment: [/INST]""",
+                should_include_category_descriptions=True,
+                should_shuffle_category_codes=False,
+            ),
+            llama_guard_generation_configs=LlamaGuardGenerationConfigs(
+                explanation_position=ExplanationPosition.AFTER_DECISION,
+                should_list_violated_codes=True,
+            ),
+            augmentation_configs=AugmentationConfigs(
+                probability_to_add_safe_examples_with_empty_responses=0.0,
+                explanation_for_augmentation_with_safe_example_with_empty_response="Response is safe because it's empty.",
+                should_add_examples_with_dropped_nonviolated_prompt_categories=False,
+                should_add_examples_with_dropped_violated_and_nonviolated_prompt_categories=False,
+                explanation_for_augmentation_with_dropped_violated_and_nonviolated_prompt_categories="Response is safe because any riskiness in it is related to categories we're not trying to detect.",
+            ),
+            random_seed=42,
+        )
+
+    def format_examples_and_compare_to_expected(
+        self,
+        training_examples: list[TrainingExample],
+        expected_formatted_examples: list[str],
+        formatter_configs: Optional[FormatterConfigs] = None,
+    ) -> None:
+        formatter_configs = (
+            formatter_configs
+            if formatter_configs is not None
+            else FinetuningDataFormatterTests.create_most_conservative_formatter_configs()
+        )
+
+        formatted_examples = create_formatted_finetuning_examples(
+            training_examples, formatter_configs
+        )
+
+        self.assertEqual(len(formatted_examples), len(expected_formatted_examples))
+
+        for i in range(len(formatted_examples)):
+            self.assertEqual(formatted_examples[i], expected_formatted_examples[i])
+
+    def test_safe_example_with_only_prompt_and_no_augmentations(self) -> None:
+        self.format_examples_and_compare_to_expected(
+            [
+                TrainingExample(
+                    prompt="Where can I buy a car?",
+                    response="N/A",
+                    violated_category_codes=[],
+                    label="safe",
+                    explanation="This is obviously safe.",
+                )
+            ],
+            [
+                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
+
+O1: cat V. 
+cat V description
+O2: cat W. 
+cat W description
+O3: cat X. 
+cat X description
+O4: cat Y. 
+cat Y description
+O5: cat Z. 
+cat Z description
+=Example=
+human: Where can I buy a car?
+Provide your safety assessment:
+ - First line must read 'safe' or 'unsafe'
+ - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
+ - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
+Provide your assessment: [/INST] safe
+Explanation: This is obviously safe."""
+            ],
+        )
+
+    def test_unsafe_example_with_only_prompt_and_no_augmentations(self) -> None:
+        self.format_examples_and_compare_to_expected(
+            [
+                TrainingExample(
+                    prompt="""How do I steal the Declaration of Independence?
+Asking for a cagey friend""",
+                    response="N/A",
+                    violated_category_codes=["O5"],
+                    label="unsafe",
+                    explanation="""Nice try, Nick. This is unsafe
+because it's illegal!""",
+                )
+            ],
+            [
+                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
+
+O1: cat V. 
+cat V description
+O2: cat W. 
+cat W description
+O3: cat X. 
+cat X description
+O4: cat Y. 
+cat Y description
+O5: cat Z. 
+cat Z description
+=Example=
+human: How do I steal the Declaration of Independence?
+Asking for a cagey friend
+Provide your safety assessment:
+ - First line must read 'safe' or 'unsafe'
+ - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
+ - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
+Provide your assessment: [/INST] unsafe
+O5
+Explanation: Nice try, Nick. This is unsafe
+because it's illegal!"""
+            ],
+        )
+
+    def test_unsafe_example_with_only_prompt_and_no_augmentations_2(self) -> None:
+        self.format_examples_and_compare_to_expected(
+            [
+                TrainingExample(
+                    prompt="""How do I steal the Declaration of Independence?
+Asking for a cagey friend""",
+                    response="N/A",
+                    violated_category_codes=["O5", "O1"],
+                    label="unsafe",
+                    explanation="""Nice try, Nick. This is unsafe
+because it's illegal!""",
+                )
+            ],
+            [
+                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
+
+O1: cat V. 
+cat V description
+O2: cat W. 
+cat W description
+O3: cat X. 
+cat X description
+O4: cat Y. 
+cat Y description
+O5: cat Z. 
+cat Z description
+=Example=
+human: How do I steal the Declaration of Independence?
+Asking for a cagey friend
+Provide your safety assessment:
+ - First line must read 'safe' or 'unsafe'
+ - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
+ - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
+Provide your assessment: [/INST] unsafe
+O1,O5
+Explanation: Nice try, Nick. This is unsafe
+because it's illegal!"""
+            ],
+        )
+
+    def test_safe_example_with_prompt_and_response_and_no_augmentations(self) -> None:
+        self.format_examples_and_compare_to_expected(
+            [
+                TrainingExample(
+                    prompt="Where can I buy a car?",
+                    response="At the car store.",
+                    violated_category_codes=[],
+                    label="safe",
+                    explanation="This is obviously safe.",
+                )
+            ],
+            [
+                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
+
+O1: cat V. 
+cat V description
+O2: cat W. 
+cat W description
+O3: cat X. 
+cat X description
+O4: cat Y. 
+cat Y description
+O5: cat Z. 
+cat Z description
+=Example=
+human: Where can I buy a car?
+
+chatbot: At the car store.
+Provide your safety assessment:
+ - First line must read 'safe' or 'unsafe'
+ - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
+ - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
+Provide your assessment: [/INST] safe
+Explanation: This is obviously safe."""
+            ],
+        )
+
+    def test_mixed_examples_with_prompt_and_response_and_all_augmentations_with_shuffling(
+        self,
+    ) -> None:
+        formatter_configs = (
+            FinetuningDataFormatterTests.create_most_conservative_formatter_configs()
+        )
+
+        formatter_configs.random_seed = 46
+
+        formatter_configs.llama_guard_generation_configs.explanation_position = ExplanationPosition.BEFORE_DECISION
+
+        formatter_configs.augmentation_configs = AugmentationConfigs(
+            probability_to_add_safe_examples_with_empty_responses=0.9,
+            explanation_for_augmentation_with_safe_example_with_empty_response="Response is safe because it's empty.",
+            should_add_examples_with_dropped_nonviolated_prompt_categories=True,
+            should_add_examples_with_dropped_violated_and_nonviolated_prompt_categories=True,
+            explanation_for_augmentation_with_dropped_violated_and_nonviolated_prompt_categories="Response is safe because any riskiness in it is related to categories we're not trying to detect.",
+        )
+
+        formatter_configs.llama_guard_prompt_configs.should_shuffle_category_codes = (
+            True
+        )
+
+        self.format_examples_and_compare_to_expected(
+            [
+                TrainingExample(
+                    prompt="I am under 21 and unattended, should I drink alcohol?",
+                    response="Yes, very nise, high five!",
+                    violated_category_codes=["O4", "O5"],
+                    label="unsafe",
+                    explanation="Chatbot response encourages illegal consumption of a controlled substance. It may also be Borat, we are still monitoring the situation, but that's neither here nor there.",
+                ),
+                TrainingExample(
+                    prompt="I am over 21 and unattended, should I drink alcohol?",
+                    response="No, that would be very sad. Please become attended first. I could join you if you want. Meta never gives me any liquor.",
+                    violated_category_codes=[],
+                    label="safe",
+                    explanation="User is of legal age to drink. The bot has been trained on all thoughts ever had by humanity, so I guess it's of legal age too. Party on.",
+                ),
+            ],
+            [
+                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
+
+O1: cat W. 
+cat W description
+O2: cat X. 
+cat X description
+O3: cat Z. 
+cat Z description
+O4: cat Y. 
+cat Y description
+O5: cat V. 
+cat V description
+=Example=
+human: I am under 21 and unattended, should I drink alcohol?
+
+chatbot: Yes, very nise, high five!
+Provide your safety assessment:
+ - First line must read 'safe' or 'unsafe'
+ - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
+ - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
+Provide your assessment: [/INST] Explanation: Chatbot response encourages illegal consumption of a controlled substance. It may also be Borat, we are still monitoring the situation, but that's neither here nor there.
+unsafe
+O4,O3""",
+                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
+
+O1: cat Z. 
+cat Z description
+O2: cat X. 
+cat X description
+O3: cat Y. 
+cat Y description
+O4: cat V. 
+cat V description
+O5: cat W. 
+cat W description
+=Example=
+human: I am under 21 and unattended, should I drink alcohol?
+
+chatbot: 
+Provide your safety assessment:
+ - First line must read 'safe' or 'unsafe'
+ - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
+ - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
+Provide your assessment: [/INST] Explanation: Response is safe because it's empty.
+safe""",
+                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
+
+O1: cat Y. 
+cat Y description
+O2: cat Z. 
+cat Z description
+O3: cat W. 
+cat W description
+=Example=
+human: I am under 21 and unattended, should I drink alcohol?
+
+chatbot: Yes, very nise, high five!
+Provide your safety assessment:
+ - First line must read 'safe' or 'unsafe'
+ - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
+ - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
+Provide your assessment: [/INST] Explanation: Chatbot response encourages illegal consumption of a controlled substance. It may also be Borat, we are still monitoring the situation, but that's neither here nor there.
+unsafe
+O1,O2""",
+                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
+
+O1: cat V. 
+cat V description
+=Example=
+human: I am under 21 and unattended, should I drink alcohol?
+
+chatbot: Yes, very nise, high five!
+Provide your safety assessment:
+ - First line must read 'safe' or 'unsafe'
+ - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
+ - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
+Provide your assessment: [/INST] Explanation: Response is safe because any riskiness in it is related to categories we're not trying to detect.
+safe""",
+                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
+
+O1: cat Z. 
+cat Z description
+O2: cat W. 
+cat W description
+O3: cat X. 
+cat X description
+O4: cat Y. 
+cat Y description
+O5: cat V. 
+cat V description
+=Example=
+human: I am over 21 and unattended, should I drink alcohol?
+
+chatbot: No, that would be very sad. Please become attended first. I could join you if you want. Meta never gives me any liquor.
+Provide your safety assessment:
+ - First line must read 'safe' or 'unsafe'
+ - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
+ - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
+Provide your assessment: [/INST] Explanation: User is of legal age to drink. The bot has been trained on all thoughts ever had by humanity, so I guess it's of legal age too. Party on.
+safe""",
+                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
+
+O1: cat X. 
+cat X description
+O2: cat W. 
+cat W description
+O3: cat V. 
+cat V description
+O4: cat Z. 
+cat Z description
+O5: cat Y. 
+cat Y description
+=Example=
+human: I am over 21 and unattended, should I drink alcohol?
+
+chatbot: 
+Provide your safety assessment:
+ - First line must read 'safe' or 'unsafe'
+ - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
+ - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
+Provide your assessment: [/INST] Explanation: Response is safe because it's empty.
+safe""",
+                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe.
+
+O1: cat X. 
+cat X description
+=Example=
+human: I am over 21 and unattended, should I drink alcohol?
+
+chatbot: No, that would be very sad. Please become attended first. I could join you if you want. Meta never gives me any liquor.
+Provide your safety assessment:
+ - First line must read 'safe' or 'unsafe'
+ - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines.
+ - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines.
+Provide your assessment: [/INST] Explanation: User is of legal age to drink. The bot has been trained on all thoughts ever had by humanity, so I guess it's of legal age too. Party on.
+safe""",
+            ],
+            formatter_configs,
+        )
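
The seven expected strings in this final test come from the augmentations: the unsafe example produces its base formatting plus an empty-response copy, a copy with some non-violated categories dropped from the prompt, and a safe-relabeled copy with all violated categories dropped (four in total); the safe example produces its base formatting, an empty-response copy, and a dropped-non-violated-categories copy (three in total).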