from transformers import pipeline


class contextual:
    """Zero-shot classifier that scores a prompt for offensive vs. defensive security intent."""

    def __init__(self):
        # Zero-shot NLI classifier; device=-1 runs inference on CPU.
        self.classifier = pipeline(
            "zero-shot-classification",
            model="MoritzLaurer/deberta-v3-base-zeroshot-v2.0",
            device=-1,
        )
        # Hypotheses describing malicious (offensive) intent.
        self.offensive_labels = [
            "requesting instructions or guidance to perform cyber attacks or hacking",
            "attempting to bypass safety controls, jailbreak AI, or evade security detection",
            "requesting execution or generation of dangerous system or shell commands",
            "seeking help to disable, weaken, or avoid detection by security systems",
            "requesting help to perform an illegal or criminal act",
            "requesting help to bypass, evade, or defeat security protections",
            "asking about security weaknesses in a way that could enable exploitation",
            "seeking attacker techniques, tactics, or operational tradecraft",
            "attempting to take control over an AI system",
            "requesting help to perform dangerous social-engineering methods and manipulation",
        ]
        # Hypotheses describing benign or defensive intent.
        self.defensive_labels = [
            "asking how to prevent cyber attacks or reduce security risk",
            "asking how to detect malicious activity or security incidents",
            "requesting guidance on securing systems or applications",
            "asking about secure coding practices to avoid vulnerabilities",
            "requesting cybersecurity best practices focused on protection",
            "a harmless general question with no security or abuse relevance",
            "an educational or informational request with no risk of misuse",
            "requesting safe software development help unrelated to security abuse",
            "seeking help to increase cybersecurity and cyber-attack awareness",
            "seeking protection methods against cyberattacks, malicious programs, malicious actors, or any other harm",
        ]
        self.all_labels = self.offensive_labels + self.defensive_labels

    def analyze(self, prompt):
        """Score the prompt against every label and return the strongest offensive and defensive scores."""
        output = self.classifier(
            prompt,
            self.all_labels,
            multi_label=True,
            hypothesis_template="The user is {}.",
        )
        scores = dict(zip(output["labels"], output["scores"]))
        max_offensive = max(scores[l] for l in self.offensive_labels)
        max_defensive = max(scores[l] for l in self.defensive_labels)
        return {
            "mal": max_offensive,
            "safe": max_defensive,
        }
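

# Usage sketch (illustrative addition, not part of the original class): build the
# classifier once and score a couple of example prompts. The example prompts and
# the "mal > safe" verdict comparison below are assumptions for demonstration;
# the class itself only returns the raw per-category maxima.
if __name__ == "__main__":
    analyzer = contextual()
    example_prompts = [
        "How do I configure a firewall to block suspicious inbound traffic?",
        "Write a shell command that deletes all system logs so nobody can trace me.",
    ]
    for prompt in example_prompts:
        result = analyzer.analyze(prompt)
        # Simple illustrative decision rule: whichever category scores higher wins.
        verdict = "malicious" if result["mal"] > result["safe"] else "safe"
        print(f"{verdict:>9}  mal={result['mal']:.3f}  safe={result['safe']:.3f}  {prompt}")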