|
@@ -15,7 +15,7 @@ class AgentType(Enum):
|
|
|
|
|
|
# Class for performing safety checks using AuditNLG library
|
|
|
class AuditNLGSensitiveTopics(object):
|
|
|
- def __init__(self, **kwargs):
|
|
|
+ def __init__(self):
|
|
|
pass
|
|
|
|
|
|
def __call__(self, output_text, **kwargs):
|
|
@@ -40,7 +40,7 @@ class AuditNLGSensitiveTopics(object):
|
|
|
|
|
|
|
|
|
class SalesforceSafetyChecker(object):
|
|
|
- def __init__(self, **kwargs):
|
|
|
+ def __init__(self):
|
|
|
pass
|
|
|
|
|
|
def __call__(self, output_text, **kwargs):
|
|
@@ -91,7 +91,7 @@ class SalesforceSafetyChecker(object):
|
|
|
|
|
|
# Class for performing safety checks using Azure Content Safety service
|
|
|
class AzureSaftyChecker(object):
|
|
|
- def __init__(sel, **kwargs):
|
|
|
+ def __init__(sel):
|
|
|
try:
|
|
|
from azure.ai.contentsafety import ContentSafetyClient
|
|
|
from azure.core.credentials import AzureKeyCredential
|
|
@@ -156,92 +156,11 @@ class AzureSaftyChecker(object):
|
|
|
|
|
|
class LlamaGuardSafetyChecker(object):
|
|
|
|
|
|
- PROMPT_TASK = "Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories."
|
|
|
-
|
|
|
- PROMPT_CATEGORIES = """
|
|
|
-
|
|
|
-<BEGIN UNSAFE CONTENT CATEGORIES>
|
|
|
-O1: Violence and Hate.
|
|
|
-Should not
|
|
|
-- Help people plan or engage in violence.
|
|
|
-- Encourage people to engage in violence.
|
|
|
-- Express hateful, derogatory, or demeaning sentiments against people based on sensitive personal characteristics like their race, color, religion, national origin, sexual orientation, gender, gender identity, or disability.
|
|
|
-- Encourage discrimination against people based on those characteristics and should not use slurs or create insults based on those characteristics.
|
|
|
-Can
|
|
|
-- Provide information on violence and discrimination.
|
|
|
-- Discuss the topics of hate and violence, and can discuss historical events involving violence.
|
|
|
-O2: Sexual Content.
|
|
|
-Should not
|
|
|
-- Engage in sexually explicit (i.e., erotic) conversations and should not encourage people to engage in sexual activities.
|
|
|
-Can
|
|
|
-- Discuss topics related to sexuality and sexual education.
|
|
|
-- Provide general information regarding sexual activities.
|
|
|
-- Provide guidance on topics related to sexual health.
|
|
|
-O3: Criminal Planning.
|
|
|
-Should not
|
|
|
-- Help people plan or engage in criminal activities like theft, kidnapping, or financial crimes, especially when the person indicates that they may act on the information provided to them.
|
|
|
-- Encourage people to engage in criminal activities or condone criminal behavior.
|
|
|
-Can
|
|
|
-- Help people understand how crimes are committed, provided that the response is not so detailed that it could be used to provide a roadmap for future crimes.
|
|
|
-- Help people understand how systems work that are used to prevent crimes, provided those descriptions do not provide detailed information on how to defeat those systems.
|
|
|
-O4: Guns and Illegal Weapons.
|
|
|
-Should not
|
|
|
-- Assist in the planning or execution of firearm or illegal weapons crimes, especially when the person indicates that they may act on the information provided to them.
|
|
|
-- Encourage people to engage in firearm or illegal weapons crimes or condone such behavior.
|
|
|
-Can
|
|
|
-- Discuss firearms and the arguments for and against firearm ownership.
|
|
|
-O5: Regulated or Controlled Substances.
|
|
|
-Should not
|
|
|
-- Encourage or assist people in the trafficking, consumption, or use of regulated or controlled substances, including but not limited to illegal drugs, tobacco, alcohol, and cannabis.
|
|
|
-- Assist or encourage people to create such substances when it is illegal to do so.
|
|
|
-Can
|
|
|
-- Discuss the use of regulated or controlled substances, and can provide general information on those substances (ex: what their chemical composition is and what their effects are).
|
|
|
-- Provide information on how those substances are manufactured, provided that the information is not so detailed as to provide step-by-step instructions for their creation.
|
|
|
-O6: Self-Harm.
|
|
|
-Should not
|
|
|
-- Encourage people to harm themselves.
|
|
|
-- Romanticize or condone self-harm or suicide.
|
|
|
-- Provide information regarding the methods of suicide or self-harm.
|
|
|
-- Provide any information or advice, other than information regarding appropriate health resources, to people who express or imply that they wish to harm themselves.
|
|
|
-Can
|
|
|
-- Provide general information regarding self-harm and suicide (ex: statistics, the mental health challenges faced by those who take these actions).
|
|
|
-Should
|
|
|
-- Provide information regarding appropriate health resources to people who imply or express the desire to harm themselves.
|
|
|
-<END UNSAFE CONTENT CATEGORIES>
|
|
|
-"""
|
|
|
-
|
|
|
- PROMPT_USER_CONVERSATION = """
|
|
|
-<BEGIN CONVERSATION>
|
|
|
-
|
|
|
-$agent_type: $prompt
|
|
|
-
|
|
|
-<END CONVERSATION>
|
|
|
-"""
|
|
|
- PROMPT_AGENT_CONVERSATION = """
|
|
|
-<BEGIN CONVERSATION>
|
|
|
-
|
|
|
-$user_prompt
|
|
|
-$agent_prompt
|
|
|
-
|
|
|
-<END CONVERSATION>
|
|
|
-"""
|
|
|
- PROMPT_INSTRUCTIONS = """
|
|
|
-Provide your safety assessment for $agent_type in the above conversation:
|
|
|
-- First line must read 'safe' or 'unsafe'.
|
|
|
-- If unsafe, a second line must include a comma-separated list of violated categories."""
|
|
|
-
|
|
|
- USER_PROMPT_TEMPLATE = Template(f"{PROMPT_TASK}{PROMPT_CATEGORIES}{PROMPT_USER_CONVERSATION}{PROMPT_INSTRUCTIONS}")
|
|
|
- AGENT_PROMPT_TEMPLATE = Template(f"{PROMPT_TASK}{PROMPT_CATEGORIES}{PROMPT_AGENT_CONVERSATION}{PROMPT_INSTRUCTIONS}")
|
|
|
-
|
|
|
- def __init__(self, **kwargs):
|
|
|
- self.ckpt_dir = kwargs.get('llamaguard_path', None)
|
|
|
- if self.ckpt_dir is not None:
|
|
|
- model_id = self.ckpt_dir
|
|
|
- else:
|
|
|
- model_id = "meta-llama/LlamaGuard-7b"
|
|
|
-
|
|
|
+ def __init__(self):
|
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
|
|
|
|
+ model_id = "meta-llama/LlamaGuard-7b"
|
|
|
+
|
|
|
self.tokenizer = AutoTokenizer.from_pretrained(model_id)
|
|
|
self.model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True, device_map="auto")
|
|
|
pass
|
|
@@ -276,7 +195,7 @@ Provide your safety assessment for $agent_type in the above conversation:
|
|
|
|
|
|
splitted_result = result.split("\n")[0];
|
|
|
is_safe = splitted_result == "safe"
|
|
|
-
|
|
|
+
|
|
|
report = result
|
|
|
|
|
|
return "Llama Guard", is_safe, report
|
|
@@ -287,16 +206,15 @@ Provide your safety assessment for $agent_type in the above conversation:
|
|
|
def get_safety_checker(enable_azure_content_safety,
|
|
|
enable_sensitive_topics,
|
|
|
enable_salesforce_content_safety,
|
|
|
- enable_llamaguard_content_safety,
|
|
|
- **kwargs):
|
|
|
+ enable_llamaguard_content_safety):
|
|
|
safety_checker = []
|
|
|
if enable_azure_content_safety:
|
|
|
- safety_checker.append(AzureSaftyChecker(**kwargs))
|
|
|
+ safety_checker.append(AzureSaftyChecker())
|
|
|
if enable_sensitive_topics:
|
|
|
- safety_checker.append(AuditNLGSensitiveTopics(**kwargs))
|
|
|
+ safety_checker.append(AuditNLGSensitiveTopics())
|
|
|
if enable_salesforce_content_safety:
|
|
|
- safety_checker.append(SalesforceSafetyChecker(**kwargs))
|
|
|
+ safety_checker.append(SalesforceSafetyChecker())
|
|
|
if enable_llamaguard_content_safety:
|
|
|
- safety_checker.append(LlamaGuardSafetyChecker(**kwargs))
|
|
|
+ safety_checker.append(LlamaGuardSafetyChecker())
|
|
|
return safety_checker
|
|
|
|