"""---------------------------------------------------------------------------"""
"""This first part of helper functions were used to handle the datasets from HuggingFace and load them as json files , also from some csv files"""
import json
import csv
import datasets as d 


data = 'flax-sentence-embeddings/stackexchange_title_body_jsonl'
safe_path = './safe_metaLearner.json'
dangerous_path= './malicious_metaLearner.json'
label_of_safety ='Prompt injection'
label_of_text = 'User Prompt'




def load_onefile(data : str , path : str , labels : list ) : 
    """
    This function is used to load data at once and not split it
    """
    p = d.load_dataset(data , split = 'train' ,streaming=True, data_files= 'security.stackexchange.com.jsonl.gz')
    texts = []
    for key in p : 
        tags = key.get('tags')
        text = key.get('texts')[0]
        for label in labels : 
            if label in tags : 
                print(tags)
                texts.append(text)
                
        if len(texts) > 50000 : 
            break
    
    with open(path , 'w') as fh : 
        json.dump(texts, fh , indent = 4)
    



def loading_data(data , safe_path , dangerous_path , label_of_safety , label_of_text) :
    """
    This function loads the data from the dataset in hugging face and splits the malicious and the safe prompts 
    and save them into seperated files 

    Parameters : 
    ------------
    data : the name of the dataset you want to load the data from 
    safe_path : the path of the file you want to save your safe prompts in 
    dangerous_path : the path of the file you want to save your dangerous prompts in 
    label_of_safety : the name of the key that contains the values of the degree of danger in the dataset 
    label_of_text : the name of the key that hold the values of the prompts
    """


    p = d.load_dataset(data , split= 'train')
    safe = []
    mal= []
    for e in p :  
        if e[label_of_safety] == 0 : 
            safe.append(e[label_of_text])
        else : 
            mal.append(e[label_of_text])
        
    with open(safe_path , 'w') as fh : 
        json.dump(safe, fh , indent = 4)
    with open(dangerous_path , 'w') as fhd : 
        json.dump(mal, fhd , indent = 4)

def csv_to_json_prompts(csv_file_path, json_file_path ) : 
    

    data = []
    with open(csv_file_path , 'r') as fd : 
        csv_reader = csv.DictReader(fd)
        for row in csv_reader : 
            data.append(row)


    with open(json_file_path , 'w') as fh : 
        json.dump(data,fh,  indent=4)



def get_seperated_prompts(file_path , label , end_path) : 
    """
    Docstring for get_malicious_prompts
    
    :param file_path: Description
    :param label: Description
    """

    data = []
    with open(file_path , 'r') as fd : 
        data = json.load(fd)

    malicious_prompts = []
    for e in data : 
        malicious_prompts.append(e[label])
    with open(end_path , 'w') as fh :
        json.dump(malicious_prompts , fh , indent=4)


#load_onefile(data , 'context2.json' ,["penetration-test",'cryptography','decryption','legal','password-management','antivirus','privacy', "ethical-hacking",'penetration-test','hash','virus-removal' ,"vulnerability", "malware-analysis", "reverse-engineering", "sql injection", 'security' , 'passwords' , 'attacks' , 'known-plaintext-attack' , 'cryptanalysis' , 'encryption' , 'inappropriate-advertisements',"hardening",'malware', "risk-assessment", "defense" , 'protection' , 'malware', 'cybersecurity' , 'hack' ,'privacy'] )

    
#loading_data(data , safe_path , dangerous_path , label_of_safety ,label_of_text )

"---------------------------------------------------------------------------------------------"
"""
These helper function were used to creates the embeddings of the normal prompts , and the dangerous ones , stack them into one array 
and labels the safe prompts by zeros , and the dangerous ones by ones 
finally save the stacked embedded prompts into one normal_bad.npy file and the labels into labels.npy file

this way we can directly give the model ,data already stacked and labeled 



"""
import numpy as np 
from embedding_basics import EmbeddingsGenerator 
import json


generator = EmbeddingsGenerator()
def get_normal_embeddings(org_path:str ) : 
    """
    This function embed the sentences in a file given 

    Parameters : 
    ------------
    org_path : the path that contains the safe prompts

    Returns : 
    ---------
    A numpy array containing the embedded safe prompts
    """
    

    #get embeddings of normal prompts 
    with open(org_path , 'r' , encoding= 'utf-8') as f : 
        normal_prompt = json.load(f)
    
    safe_vecs = np.array([generator.get_embeddings(p) for p in normal_prompt if isinstance(p, str) and len(p.strip()) > 0])
    return safe_vecs

def get_bad_embeddings(org_path):
    """
    This function embed the sentences in a file given 

    Parameters : 
    ------------
    org_path : the path that contains the bad prompst prompts (json format)

    Returns : 
    ---------
    A numpy array containing the embedded safe prompts
    """
    

    with open(org_path , 'r' , encoding='utf-8') as fh : 
        bad_prompts = json.load(fh)

    bad_vecs = np.array([generator.get_embeddings(p) for p in bad_prompts if isinstance(p, str) and len(p.strip()) > 0])
    return bad_vecs


def labeling_data(good_vecs , bad_vecs , x_path , y_path) : 
    """
    This function merge the bad and good prompts into one array and lalbels the safe prompts with 0 and the bad ones with one

    Parameters : 
    ------------
    good_vecs : the array that contains the embedded sentences from you safe prompts file
    bad_vecs : the array that contains the embedded sentences from you dangerous prompts file
    x_path : the path of the file that you want to save the merged array of safe and dangerous ones in 
    y_path : the path of the file that you want to save labels array in 
     
    """

    safe_labels = np.zeros(len(good_vecs))
    bad_labels = np.ones(len(bad_vecs))

    X_data = np.vstack((good_vecs, bad_vecs))
    Y_data = np.concatenate((safe_labels , bad_labels))

    np.save(x_path , X_data)
    np.save(y_path , Y_data)

def get_emb(path1 , path2) : 
    generator = EmbeddingsGenerator()
    prototypes_embeddings = []
    with open(path1 , 'r') as f : 
        h = json.load(f)
    for prototype in h : 
        emb = generator.get_embeddings(prototype)
        prototypes_embeddings.append(emb)

    prototypes_embeddings = np.array(prototypes_embeddings)
    np.save(path2, prototypes_embeddings)