from transformers import AutoTokenizer, AutoModel
import torch
import gradio as gr

# Load the pre-trained paraphrase-mpnet-base-v2 model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-mpnet-base-v2')

def get_mpnet_embeddings(text):
    # The Gradio Textbox passes a single string; split it into one sentence per line
    sentences = [line.strip() for line in text.split('\n') if line.strip()]
    if not sentences:
        return []

    # Tokenize input sentences
    inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True, max_length=512)

    # Get token embeddings without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)

    # Mean pooling over the sequence, masking out padding tokens
    mask = inputs['attention_mask'].unsqueeze(-1).float()
    embeddings = (outputs.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1)
    return embeddings.numpy().tolist()

# Define the Gradio interface
interface = gr.Interface(
    fn=get_mpnet_embeddings,  # Function to call
    inputs=gr.Textbox(lines=2, placeholder="Enter sentences here, one per line"),  # Input component
    outputs=gr.JSON(),  # Output component
    title="Sentence Embeddings with MPNet",  # Interface title
    description="Enter sentences to get their embeddings with paraphrase-mpnet-base-v2 (up to 512 tokens)."  # Description
)

# Launch the interface
interface.launch()
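
# Quick local check (a sketch, not part of the original script; the example
# sentences and the expected 768-dimensional output are illustrative assumptions):
#
#   vectors = get_mpnet_embeddings("The cat sat on the mat.\nA cat was resting on a rug.")
#   print(len(vectors), len(vectors[0]))  # -> 2 sentences, one 768-dim embedding each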