"""Query a saved embeddings dataset for the entries most similar to a question.

The script loads a sentence-embedding checkpoint, embeds one question,
looks up its nearest neighbours in a FAISS index built over the
``embeddings`` column of the dataset stored at ``dataset/embeddings``, and
prints the matching rows (``comments``, ``title``, ``html_url``) with their
scores.
"""
import os

import faiss
import pandas as pd
import torch
from datasets import Dataset, load_dataset, load_from_disk
from transformers import AutoModel, AutoTokenizer

########################
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

# Use the GPU when one is available, otherwise fall back to the CPU so the
# script still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def cls_pooling(model_output):
    """Return the hidden state at sequence position 0 for every batch item.

    Args:
        model_output: Object exposing ``last_hidden_state``; indexed as
            ``[:, 0]``, i.e. all entries on the first axis and index 0 on
            the second. Presumably (batch, seq_len, hidden) with the
            special start token at position 0 -- TODO confirm.

    Returns:
        The slice ``model_output.last_hidden_state[:, 0]``.
    """
    return model_output.last_hidden_state[:, 0]


def get_embeddings(text_list):
    """Embed a list of strings with the module-level tokenizer and model.

    Args:
        text_list: Texts to embed; they are tokenized together with padding
            and truncation enabled.

    Returns:
        A tensor on ``device`` holding one pooled embedding per input text
        (see ``cls_pooling``). It is computed without gradient tracking.
    """
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    # Move every tokenizer output tensor to the same device as the model.
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return cls_pooling(model_output)


embeddings_dataset = load_from_disk("dataset/embeddings")
# The checkpoint is a dot-product model, so rank neighbours by inner product.
# With the default flat index the scores are L2 distances (smaller = closer),
# and the descending sort below would list the weakest of the hits first.
embeddings_dataset.add_faiss_index(
    column="embeddings", metric_type=faiss.METRIC_INNER_PRODUCT
)

question = "How can I load a dataset offline?"
# FAISS expects a NumPy array on the host, hence cpu() + numpy().
question_embedding = get_embeddings([question]).cpu().detach().numpy()

scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
# Highest inner-product score (best match) first.
samples_df.sort_values("scores", ascending=False, inplace=True)

for _, row in samples_df.iterrows():
    print(f"COMMENT: {row.comments}")
    print(f"SCORE: {row.scores}")
    print(f"TITLE: {row.title}")
    print(f"URL: {row.html_url}")
    print("=" * 50)
    print()