from transformers import AutoTokenizer, AutoModel, pipeline
import torch
import torch.nn.functional as F
import pandas as pd

# Mean pooling: average the token embeddings, weighted by the attention mask
# so padding tokens do not contribute to the sentence embedding
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

# Load the sentence-embedding model and tokenizer from the Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained('AidenYan/MiniLM_L6_v2_finetuned_ISOM5240_Group27')
model = AutoModel.from_pretrained('AidenYan/MiniLM_L6_v2_finetuned_ISOM5240_Group27')

# Load the review data from CSV
df = pd.read_csv('toys_and_games_reviews.csv', encoding='ISO-8859-1')

# Take the first three rows of the last column as the corpus sentences
sentences = df.iloc[:3, -1].tolist()

# Tokenize the corpus sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings (no gradients needed for inference)
with torch.no_grad():
    model_output = model(**encoded_input)

# Pool token embeddings into one vector per sentence
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# L2-normalize the sentence embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

# Load a text-generation pipeline backed by GPT-2
text_generator = pipeline('text-generation', model='gpt2')

# Return the index of the corpus sentence most similar to the new embedding;
# F.cosine_similarity normalizes internally, so the query need not be pre-normalized
def find_most_similar(new_embedding, sentence_embeddings):
    similarities = F.cosine_similarity(new_embedding, sentence_embeddings)
    most_similar_idx = similarities.topk(1).indices.item()
    return most_similar_idx

# Query sentence to match against the corpus
new_sentence = "A teddy bear and a lego toy."

# Tokenize the new sentence and compute its embedding
encoded_new_sentence = tokenizer([new_sentence], padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    model_output_new_sentence = model(**encoded_new_sentence)
new_sentence_embedding = mean_pooling(model_output_new_sentence, encoded_new_sentence['attention_mask'])

# Find the most similar sentence in the corpus
most_similar_idx = find_most_similar(new_sentence_embedding, sentence_embeddings)
most_similar_sentence = sentences[most_similar_idx]

# Generate text conditioned on the most similar sentence
generated = text_generator(most_similar_sentence, max_length=50, num_return_sequences=1)
generated_text = generated[0]['generated_text']

# Output
print(f"Most similar sentence in the corpus: {most_similar_sentence}")
print(f"Generated text: {generated_text}")
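
# --- Optional extension: top-k retrieval (a minimal sketch, not part of the
# original pipeline). With only three corpus sentences the single best match
# above is sufficient, but for a larger corpus it can help to see the k closest
# matches with their scores. `find_top_k_similar` and `k` are illustrative
# names introduced here; the function reuses the embeddings computed above.
def find_top_k_similar(new_embedding, sentence_embeddings, k=2):
    similarities = F.cosine_similarity(new_embedding, sentence_embeddings)
    k = min(k, similarities.numel())  # never ask for more matches than exist
    top = similarities.topk(k)
    return top.indices.tolist(), top.values.tolist()

top_indices, top_scores = find_top_k_similar(new_sentence_embedding, sentence_embeddings)
for idx, score in zip(top_indices, top_scores):
    print(f"Similarity {score:.3f}: {sentences[idx]}")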
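
# --- Optional: reproducible sampled generation (a hedged sketch). The pipeline
# call above uses the model's default decoding. If you switch to sampling for
# more varied continuations, transformers.set_seed makes runs repeatable, and
# max_new_tokens bounds only the continuation rather than prompt + continuation
# as max_length does. The seed value here is illustrative.
from transformers import set_seed

set_seed(42)  # illustrative seed for repeatable sampling
sampled = text_generator(
    most_similar_sentence,
    do_sample=True,       # enable sampling instead of the default decoding
    max_new_tokens=30,    # cap the newly generated tokens only
    num_return_sequences=1,
)
print(f"Sampled continuation: {sampled[0]['generated_text']}")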