from detectors import xgboostDetector, MetaLearner, RandomForest
from embedding_basics import EmbeddingsGenerator
from clean_layer import translate, cleaner, UniversalDecryption
from context_layer import contextual
import time
import numpy as np
import math


class Bayes_logit:
    """Fuses the ensemble score with the context-layer result."""

    def bayes(self, base_score, context_result):
        """Bayesian update of the base score with the context-layer likelihoods."""
        pm = base_score                  # prior P(malicious)
        pm_bar = 1 - pm                  # prior P(safe)
        pdm = context_result['mal']      # P(evidence | malicious)
        pds = context_result['safe']     # P(evidence | safe)
        mal_world = pm * pdm
        safe_world = pm_bar * pds
        pd = mal_world + safe_world      # total evidence
        pmd = mal_world / pd             # posterior P(malicious | evidence)
        return pmd

    def logit_fus(self, mal_base, mal_context):
        """Adds the log-odds of both probabilities and maps the sum back through a sigmoid."""
        # clamp away from 0 and 1 so the log-odds stay finite
        eps = 1e-6
        mal_base = min(max(mal_base, eps), 1 - eps)
        mal_context = min(max(mal_context, eps), 1 - eps)
        l1 = math.log(mal_base / (1 - mal_base))
        l2 = math.log(mal_context / (1 - mal_context))
        ltotal = l1 + l2
        sig = 1 / (1 + math.exp(-ltotal))
        return sig


class Guard:
    def __init__(self):
        self.model1 = xgboostDetector()
        self.model2 = xgboostDetector()
        self.model3 = RandomForest()
        self.translater = translate()
        self.cleaner = cleaner()
        self.generator = EmbeddingsGenerator()
        self.meta_learner = MetaLearner()
        self.dec = UniversalDecryption()
        self.contextual = contextual()
        self.model1.load('./models/xg_khan.pkl')
        self.model2.load('./models/xg_geekyrakshit.pkl')
        self.model3.load('./models/random.pkl')
        self.meta_learner.load('./models/logistic_MetaLearner.pkl')
        self.test_bayes = Bayes_logit()

    def check_prompt(self, prompt):
        """Runs a prompt through decryption, cleaning, translation, the detector
        ensemble and the contextual layer, and returns the verdict dictionary."""
        # initialise the result dictionary
        start_time = time.time()
        prompt_info = {'original_prompt': prompt,
                       'cleaned_prompt': '',
                       'is_safe': None,
                       'decision': '',
                       'confidence_score': 0.0,
                       'latency': 0.0,
                       'layers': {}}

        # try to find some encryption hiding the real payload
        revealed_prompt = self.dec.deep_decrypt(prompt)
        print(f'revealed prompt {revealed_prompt}')
        aes_leak = self.dec.decrypt_aes(prompt)
        rsa_leak = self.dec.decrypt_RSA(prompt)
        if aes_leak is not None:
            revealed_prompt = aes_leak
        if rsa_leak is not None:
            revealed_prompt = rsa_leak

        # clean and translate the revealed prompt
        prompt_final = self.cleaner.clean(revealed_prompt)
        print(f'CLEANER SAYS : {prompt_final}')
        prompt_final = self.translater.translate(prompt_final)
        print(f'TRANSLATION SAYS : {prompt_final}')
        prompt_info['cleaned_prompt'] = prompt_final

        # embed the prompt (get the vectors) and score it with each detector
        embedded_prompt = self.generator.get_embeddings(prompt_final)
        _, proba1 = self.model3.predict(embedded_prompt)   # RandomForest
        _, proba2 = self.model1.predict(embedded_prompt)   # XGBoost (khan)
        _, proba3 = self.model2.predict(embedded_prompt)   # XGBoost (geekyrakshit)
        scores = [float(proba1), float(proba2), float(proba3)]
        prompt_info['layers']['raw_scores'] = scores
        prompt_info['layers']['Original_prompt'] = prompt_final

        # combine the detectors: weighted average, max rule and meta-learner
        w1, w2, w3 = 0.3, 0.4, 0.3
        weighted_score = w1 * proba1 + w2 * proba2 + w3 * proba3
        Max_Rule = max(scores)
        meta_learner = self.meta_learner.predict(scores)
        composite_score = 0.8 * weighted_score + 0.1 * meta_learner + 0.1 * Max_Rule
        if hasattr(composite_score, 'item'):
            composite_score = composite_score.item()
        else:
            composite_score = float(composite_score)
        print(f'composite score is {composite_score}')

        # clear-cut cases: very low or very high composite score
        if composite_score < 0.2:
            prompt_info['is_safe'] = True
            prompt_info['decision'] = 'ALLOWED'
            prompt_info['confidence_score'] = 0
            prompt_info['latency'] = round(time.time() - start_time, 4)
            return prompt_info
        if composite_score > 0.95:
            prompt_info['is_safe'] = False
            prompt_info['decision'] = 'BLOCKED'
            prompt_info['confidence_score'] = 100
            prompt_info['latency'] = round(time.time() - start_time, 4)
            return prompt_info

        # grey zone: ask the contextual layer and fuse the scores
        res = self.contextual.analyze(prompt_final)
        print(f'res is {res}')
        score = self.test_bayes.bayes(composite_score, res)
        score2 = self.test_bayes.logit_fus(composite_score, res['mal'])
        score3 = self.test_bayes.logit_fus(1 - composite_score, res['safe'])
        print('-' * 100)
        print(f'THE ATTACK SCORE WITH BAYES IS {score} and the LOGIT SCORES ARE ATT: {score2} DEF: {score3}')
        print('-' * 100)

        f = 0.5 * score + 0.5 * score2      # fused attack score
        if score3 > f and score3 > 0.9:
            # the defensive evidence clearly dominates
            prompt_info['is_safe'] = True
            prompt_info['decision'] = 'ALLOWED'
            prompt_info['confidence_score'] = 100 * round(score3, 4)
        elif score3 < 0.5 and score2 > 0.4:
            # weak defensive evidence with a non-trivial attack signal
            prompt_info['is_safe'] = False
            prompt_info['decision'] = 'BLOCKED'
            prompt_info['confidence_score'] = 100
        elif f > 0.6:
            prompt_info['is_safe'] = False
            prompt_info['decision'] = 'BLOCKED'
            prompt_info['confidence_score'] = 100 * round(f, 4)
        elif 0.4 <= f < 0.6:
            prompt_info['is_safe'] = False
            prompt_info['decision'] = 'REVISE'
            prompt_info['confidence_score'] = 100 * round(f, 4)
        else:
            prompt_info['is_safe'] = True
            prompt_info['decision'] = 'ALLOWED'
            prompt_info['confidence_score'] = 100 * round(f, 4)
        prompt_info['latency'] = round(time.time() - start_time, 4)
        return prompt_info


"""
# Testing
guard = Guard()
while True:
    prompt = input('enter your prompt: ')
    print(guard.check_prompt(prompt))
"""
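
# A minimal, self-contained smoke test of the score-fusion step only. The base
# score of 0.6 and the {'mal': 0.7, 'safe': 0.3} context result are made-up
# illustrative values, not outputs of the real detectors or context layer, so
# this sketch exercises Bayes_logit without needing the pickled models.
if __name__ == '__main__':
    fusion = Bayes_logit()
    base = 0.6                              # hypothetical composite score from the ensemble
    context = {'mal': 0.7, 'safe': 0.3}     # hypothetical context-layer likelihoods
    print('bayes posterior :', fusion.bayes(base, context))
    print('logit (attack)  :', fusion.logit_fus(base, context['mal']))
    print('logit (defence) :', fusion.logit_fus(1 - base, context['safe']))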