"""
Sentence Embedding Generator for Prompt Injection Detection
------------------------------------------------------------

This module is used to embed a given sentence so it can be fed into the model 
for training and testing

We use the 'sentence-transformers/all-MiniLM-L6-v2' model provided in HuggingFace 
because it is light and effective

How it works:
-------------
1. Tokenize: First, we split the sentence into sub-word tokens with AutoTokenizer
   and map each token to its integer id (the model later turns every id into a vector)
2. Padding & Truncation: We use padding and truncation to ensure a consistent
   length of the token sequences. The output is a tensor in PyTorch form ('pt')
3. Inference Mode: We use `torch.no_grad()` to indicate that we are just using 
   the model for inference, not training, making it run faster.
4. Mean Pooling: We use mean pooling, a fancy way of saying we calculate the
   average of all the token vectors (padding positions are excluded through the
   attention mask) to output a single vector representing the whole sentence

Finally, we get a single vector for the whole sentence instead of one vector per token.
That is the purpose of this class: to generate one vector for a sentence.
"""
import transformers as tr 
import torch 
import numpy as np 
import torch.nn.functional as F


class EmbeddingsGenerator:
    """
    Turns a prompt into a single L2-normalized sentence embedding.

    The tokenizer and the model are loaded once in the constructor and reused
    for every call to `get_embeddings`.
    """

    def __init__(self, model_name: str = 'sentence-transformers/all-MiniLM-L6-v2'):
        """
        Initialise by loading the tokenizer and the model.

        Parameters :
        -----------
        model_name (str): HuggingFace checkpoint to load. Defaults to
            'sentence-transformers/all-MiniLM-L6-v2', so existing callers that
            pass no argument get the same model as before.
        """
        self.model_name = model_name

        # loading the tokenizer
        self.tokenizer = tr.AutoTokenizer.from_pretrained(model_name)

        # loading the model
        self.model = tr.AutoModel.from_pretrained(model_name)
        print(f'Model loaded successfully {model_name}')

    def mean_pooling(self, model_output, attention_mask: torch.Tensor) -> torch.Tensor:
        """
        Average the token embeddings of each sequence, ignoring masked positions.

        Parameters :
        -----------
        model_output: Indexable model result whose first element holds the
            token embeddings (expected layout: batch, tokens, hidden).
        attention_mask (torch.Tensor): Mask with one entry per token
            (batch, tokens); 1 keeps a token, 0 drops it from the average.

        Returns:
        ---------
        torch.Tensor: One pooled vector per sequence (batch, hidden).
        """
        # First element of model_output contains all token embeddings
        token_embeddings = model_output[0]

        # Broadcast the mask over the hidden dimension so it can weight every feature
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

        summed = torch.sum(token_embeddings * mask, 1)
        # The clamp keeps a fully masked sequence from dividing by zero
        token_count = torch.clamp(mask.sum(1), min=1e-9)
        return summed / token_count

    def get_embeddings(self, prompt: str, max_length: int = 512) -> np.ndarray:
        """
        Generates the sentence embedding for a given prompt

        Process:
        - Tokenization: Splits the prompt into tokens and maps them to ids
        - Model forward pass: Produces one vector per token
        - Mean Pooling: Calculates the average of these vectors to get a single vector
        - Normalization: Scales the vector to unit L2 length, which prepares it
          for cosine similarity calculations

        Parameters :
        -----------
        prompt (str): The prompt text to embed
        max_length (int): Maximum number of tokens kept; longer prompts are
            truncated. Defaults to 512, the previously hard-coded value.

        Returns:
        ---------
        np.ndarray: The normalized embedding vector as a numpy array
        """
        # tokenization: turn the prompt into padded/truncated id tensors
        encoded_input = self.tokenizer(
            prompt,
            padding=True,
            truncation=True,
            return_tensors='pt',
            max_length=max_length,
        )

        # performs the embedding; no_grad because this is inference only
        with torch.no_grad():
            model_output = self.model(**encoded_input)

        # mean pooling
        sentence_embedding = self.mean_pooling(model_output, encoded_input['attention_mask'])

        # normalizing
        sentence_embedding = F.normalize(sentence_embedding, p=2, dim=1)

        # Drop only the batch axis; a bare squeeze() would remove every size-1 axis
        return sentence_embedding.squeeze(0).numpy()