import pandas as pd
from sentence_transformers import SentenceTransformer, util

def filter_similar_sentences(model: SentenceTransformer, df: pd.DataFrame) -> pd.DataFrame:
    # Calculate embeddings for each utterance
    embeddings = model.encode(df['utterance'].tolist(), convert_to_tensor=True)
    # Calculate the pairwise cosine similarity matrix
    cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)
    # Keep the first occurrence in each group of near-duplicates and drop the rest
    to_keep = []
    to_drop = set()
    for i in range(len(df)):
        if i in to_drop:
            continue
        to_keep.append(i)
        for j in range(i + 1, len(df)):
            if cosine_scores[i][j] >= 0.8:
                print(f"Similarity between '{df.iloc[i]['utterance']}' and '{df.iloc[j]['utterance']}' is {cosine_scores[i][j]:.2f}")
                to_drop.add(j)
    # Filter the dataframe to keep only the selected sentences
    filtered_df = df.iloc[to_keep].reset_index(drop=True)
    return filtered_df

def get_similar_sentences(model: SentenceTransformer, user_text: str, df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    # Get the embedding for the user input
    user_embedding = model.encode(user_text, convert_to_tensor=True)
    # Get embeddings for all utterances
    embeddings = model.encode(df['utterance'].tolist(), convert_to_tensor=True)
    # Calculate cosine similarity between the user input and all utterances
    cosine_scores = util.pytorch_cos_sim(user_embedding, embeddings)[0]
    # Get the indices of the top_n most similar utterances
    top_matches = cosine_scores.argsort(descending=True)[:top_n].cpu().tolist()
    # Return the matching rows together with their similarity scores
    result = df.iloc[top_matches][['utterance', 'intent', 'combined']].copy()
    result['similarity'] = cosine_scores[top_matches].cpu().tolist()
    return result
file_path = r'C:\Users\ZZ029K826\Documents\GitHub\LLM_Intent_Recognition\data\InvoiceDetailsExplanation.csv'
# Load the data
utterances = pd.read_csv(file_path)
# Load the sentence embedding model
# (multilingual-e5-small was considered as an alternative)
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
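# Quick sanity check (a minimal sketch, not part of the original script): embed two of
# the example utterances used below and print their cosine similarity with the same
# util helper as above; scores close to 1.0 indicate near-duplicates.
# _emb = model.encode(['vreau detalii despre ultima factura', 'as dori sa aflu ultima factura'], convert_to_tensor=True)
# print(util.pytorch_cos_sim(_emb[0], _emb[1]).item())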
# Filter similar sentences
#filtered_utterances = filter_similar_sentences(model, utterances)
# Display the filtered dataframe
#filtered_utterances.head()
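# Alternative dedup sketch (optional, a hedged sketch rather than the original approach):
# util.paraphrase_mining mines the most similar sentence pairs as [score, id1, id2]
# triples, which could drive the same >= 0.8 near-duplicate rule as filter_similar_sentences.
# pairs = util.paraphrase_mining(model, utterances['utterance'].tolist())
# near_duplicate_ids = {max(i, j) for score, i, j in pairs if score >= 0.8}
# deduplicated = utterances.drop(utterances.index[list(near_duplicate_ids)]).reset_index(drop=True)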
# Example Romanian queries ("I want details about the last invoice", "how much can I pay",
# "invoice information", "I want information about costs", "I would like to find out the last invoice")
examples = ['vreau detalii despre ultima factura', 'cat pot sa platesc', 'informatii factura', 'vreau informatii despre costuri', 'as dori sa aflu ultima factura']

for example in examples:
    print(f"Input: {example}")
    similar_sentences = get_similar_sentences(model, example, utterances)
    print(similar_sentences)
    print("\n")