"""This is where we trained our 3 models and the metalearner Why all-MiniLM-L6-v2? This model was trained to mimic the behavious of BERT model by utilizing Knowledge Distillation which makes it lighter and faster with 22 million params instead of +340M Why 2 XGboost models? XGboost uses the Gradient Boosting method , which means it builds trees sequentially , Tree #2 learns only from the mistakes of the Tree #1 , and the third leans only from the mistakes of the of the second one and so on ; most of the time prompt attacks looks like safe code so we need a model that is greedy and aggressive 2 times of 3 , so XGBoost minimizes Bias it is so accurate on specific, known attack patterns and we trained them on two different datasets containing very specific attacks Why A random forest Model? Random Forest uses Bagging ( Bootstrap Aggregating ) it trains 100 trees independently on random subsets of data and averages their votes Since XGBoost is so aggressive it can sometimes overfit (hallucinate that a prompt is an attack while it is not ) Random Forest is designed to minimize variance , if XGboost panics , Random forest calms it down so finally by combining these models we can achieve Robust decision boundaries that is hard to fool Why The MetaLearner ? What is it ? Not all models are cereated equal , in some contexts like sql injections xgboost might be better , but in others Random forest can be better , so a simple average will ignore the nuance The Logistic regression learns the Trust weights , it learns who to trust , so we expose a labeled dataset tp our three models and we train the Logistic regression on the scores of the models , "Oh, when XGBoost says it's an attack, it's usually right, so I will give it a weight of 0.7. Random Forest gets 0.3." This will provide a callibrated probability Why the context Engine DeBERTa-v3-Large ? Keywords filters often fails on context if a word like kill had been put in a safe context , the models can panic , so when they 'are not sure' when the score falls in the 'GREY ZONE' we call DeBERTa DeBERTa (Decoding-enhanced BERT with Disentangled Attention) Unlike BERT , DeBERTa understands the 'grammar of intent' better than any other open-source model by seperating content vectors from position vectors in our context we didnt train it to classify we trained to tells us if the users prompt implies a 'malicious prompt' for exemple """ import xgboost as xgb from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier import numpy as np import pickle class xgboostDetector : def __init__(self): self.model = None self.good_bad_prompts = './normal_bad_geekyrakshit.npy' self.labels_prompts = './labels_geekyrakshit.npy' def train_model(self): """ This function is used to train the model on a numpy arrays already labeled """ x = np.load(self.good_bad_prompts) y = np.load(self.labels_prompts) print(f'total samples {len(x)}') #split data x_train , x_test , y_train , y_test = train_test_split(x , y , test_size=0.2 , random_state=42 ) #train self.model = xgb.XGBClassifier(n_estimators = 100 , learning_rate = 0.1, max_depth = 5 , objective = 'binary:logistic' , eval_metric = 'logloss' , use_label_encoder=False, random_state = 42) print("\n ----- Training the model------") self.model.fit(x_train , y_train ) print('\n------Evaluating-----') y_pred = self.model.predict(x_test) acc = accuracy_score(y_test , y_pred) print(f'the accuracy is {acc}') def save(self , path) : with open(path , 'wb') as fh : pickle.dump(self.model , fh ) def load(self , path ) : with open(path , 'rb') as f : self.model = pickle.load(f) return self.model def predict(self , embedding : np, return_proba = True) : """ Docstring for predict :param self: Description :param embedding: Description """ if self.model is None : print('Model was not loaded ') # CRITICAL FIX: Reshape 1D array (384,) to 2D row (1, 384) if embedding.ndim == 1: embedding = embedding.reshape(1, -1) if embedding.shape[0] == self.model.n_features_in_ and embedding.shape[1] == 1: embedding = np.array(embedding).T pred_label = self.model.predict(embedding) if return_proba : pred_proba = self.model.predict_proba(embedding)[:, 1] return int(pred_label[0]) , float(pred_proba[0]) class MetaLearner : def __init__(self): self.meta_learner = LogisticRegression(C=0.1, class_weight='balanced') self.normal_bad_emb = np.load('./emb/normal_bad_metaLearner.npy') self.labels = np.load('./emb/labels_metaLearner.npy') self.model2 = xgboostDetector() self.model = RandomForest() self.model.load('./models/random.pkl') self.model2.load('./models/xg_geekyrakshit.pkl') self.model3 = xgboostDetector() self.model3.load('./models/xg_khan.pkl') def train(self) : """ """ final = [] for i in self.normal_bad_emb : prob1 = self.model.predict(i)[1] prob2 = self.model2.predict(i)[1] prob3 = self.model3.predict(i)[1] final += [[prob1, prob2 , prob3]] x_meta = np.array(final) self.meta_learner.fit(x_meta , self.labels) def predict(self, probas : list ) : """ """ arr = np.array([probas]) return self.meta_learner.predict_proba(arr)[0][1] def save(self , path) : with open(path , 'wb') as fh : pickle.dump(self.meta_learner , fh ) print(f"Weights learned: {self.meta_learner.coef_}") def load(self , path ) : with open(path , 'rb') as f : self.meta_learner = pickle.load(f) return self.meta_learner class RandomForest : def __init__(self): self.model = None self.good_bad_prompts = './emb/normal_bad_geekyrakshit.npy' #trained on more general dataset (greekyrakshit) self.label_prompts = './emb/labels_geekyrakshit.npy' def train(self) : x = np.load(self.good_bad_prompts) y = np.load(self.label_prompts) x_train , x_test , y_train , y_test = train_test_split(x , y , test_size= 0.2 , random_state=42) self.model = RandomForestClassifier( n_estimators=200 , max_depth=15 , n_jobs=-1 , random_state=42 ) self.model.fit(x_train , y_train) y_pred = self.model.predict(x_test) acc = accuracy_score(y_test , y_pred) print(f'Random Forest Accuracy: {acc:.4f}') def save(self , path) : with open(path , 'wb') as fh : pickle.dump(self.model , fh ) def load(self , path ) : with open(path , 'rb') as f : self.model = pickle.load(f) return self.model def predict(self , prompt_emb : np , return_proba = True): if prompt_emb.ndim == 1 : emb = prompt_emb.reshape(1,-1) pred_label = self.model.predict(emb) if return_proba : pred_proba = self.model.predict_proba(emb)[: , 1] return int(pred_proba[0]) , float(pred_proba[0]) """ learner = MetaLearner() learner.train() learner.save('./logistic_MetaLearner.pkl') """ """from embedding_basics import EmbeddingsGenerator generator = EmbeddingsGenerator() detector1 = xgboostDetector() #detector2 = xgboostDetector() #detector2 = SVC_DETECTOR() detector1.load('./xg_geekyrakshit.pkl') #detector2.load('./Models/modelxgb.pkl') #detector2.load('./Models/model.pkl') while True : prompt = input('enter prompt') embedded_prompt = generator.get_embeddings(prompt) print(detector1.predict(embedded_prompt)) #print(detector2.predict(embedded_prompt)) """ """ detector = xgboostDetector() detector.train_model() detector.save('./xg_geekyrakshit.pkl')""" """detetor2 = SVC_DETECTOR() detetor2.train() detetor2.save('./svc_khan.pkl')"""