File size: 4,013 Bytes
5ecde30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import pandas as pd
from sentence_transformers import SentenceTransformer, util

def filter_similar_sentences(model: SentenceTransformer, df: pd.DataFrame, threshold: float = 0.8) -> pd.DataFrame:
    """Drop utterances that are near-duplicates of an earlier utterance.

    Rows are visited in order. A row that has not been discarded is kept, and
    every later row whose cosine similarity to it is at least ``threshold``
    is discarded.

    Args:
        model: Sentence-embedding model used to encode the utterances.
        df: Frame with an ``utterance`` column.
        threshold: Minimum cosine similarity at which a later row is treated
            as a duplicate of a kept row.

    Returns:
        A new frame holding only the kept rows, in their original order,
        with a fresh 0..n-1 index.
    """
    # Nothing to compare; skip the encoder entirely.
    if df.empty:
        return df.reset_index(drop=True)

    sentences = df['utterance'].tolist()

    # Calculate embeddings for each utterance
    embeddings = model.encode(sentences, convert_to_tensor=True)

    # Calculate the pairwise cosine similarity matrix
    cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)

    kept = []        # positions to keep, in original order
    dropped = set()  # positions already identified as near-duplicates

    for i in range(len(sentences)):
        if i in dropped:
            continue
        kept.append(i)
        # Pull the row out once as plain floats instead of indexing the
        # tensor element by element.
        row_scores = cosine_scores[i].tolist()
        for j in range(i + 1, len(sentences)):
            if j in dropped:
                continue
            if row_scores[j] >= threshold:
                print(f"Similarity between '{sentences[i]}' and '{sentences[j]}' is {row_scores[j]:.2f}")
                dropped.add(j)

    # Filter the dataframe to keep only the selected sentences
    return df.iloc[kept].reset_index(drop=True)



def get_similar_sentences(model: SentenceTransformer, user_text: str, df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """Return the stored utterances most similar to ``user_text``.

    Args:
        model: Sentence-embedding model used to encode the texts.
        user_text: The query text.
        df: Frame with ``utterance``, ``intent`` and ``combined`` columns.
        top_n: Maximum number of matches to return.

    Returns:
        A frame with the columns ``utterance``, ``intent``, ``combined`` and
        ``similarity`` (the cosine similarity to ``user_text``), ordered from
        most to least similar.
    """
    columns = ['utterance', 'intent', 'combined', 'similarity']

    # Nothing to rank; return an empty frame with the expected columns.
    if df.empty:
        empty = df.iloc[0:0].copy()
        empty['similarity'] = pd.Series(dtype='float64')
        return empty[columns]

    # Get embeddings for user input
    user_embedding = model.encode(user_text, convert_to_tensor=True)

    # Get embeddings for all utterances
    embeddings = model.encode(df['utterance'].tolist(), convert_to_tensor=True)

    # Calculate cosine similarity between user input and all utterances
    cosine_scores = util.pytorch_cos_sim(user_embedding, embeddings)[0]

    # Get top_n most similar utterances
    top_matches = cosine_scores.argsort(descending=True)[:top_n]

    # Hand pandas plain Python ints rather than a tensor, so indexing does
    # not depend on which device the tensor lives on.
    positions = top_matches.cpu().tolist()

    # Work on a copy so the caller's frame is left untouched, and attach the
    # score that the selected 'similarity' column is expected to hold.
    result = df.iloc[positions].copy()
    result['similarity'] = cosine_scores[top_matches].cpu().tolist()

    return result[columns]


# CSV file holding the utterances to search.
file_path = r'C:\Users\ZZ029K826\Documents\GitHub\LLM_Intent_Recognition\data\InvoiceDetailsExplanation.csv'

# Read the utterances into a dataframe.
utterances = pd.read_csv(file_path)

# Sentence-embedding model: all-MiniLM-L6-v2 from sentence-transformers.
model = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2'
)

# Optional step, currently disabled: filter similar sentences and preview them.
#   filtered_utterances = filter_similar_sentences(model, utterances)
#   filtered_utterances.head()

# Sample user inputs to look up.
examples = [
    'vreau detalii despre ultima factura',
    'cat pot sa platesc',
    'informatii factura',
    'vreau informatii despre costuri',
    'as dori sa aflu ultima factura',
]

def filter_similar_sentences(model: SentenceTransformer, df: pd.DataFrame, threshold: float = 0.8) -> pd.DataFrame:
    """Drop utterances that are near-duplicates of an earlier utterance.

    Rows are visited in order. A row that has not been discarded is kept, and
    every later row whose cosine similarity to it is at least ``threshold``
    is discarded.

    Args:
        model: Sentence-embedding model used to encode the utterances.
        df: Frame with an ``utterance`` column.
        threshold: Minimum cosine similarity at which a later row is treated
            as a duplicate of a kept row.

    Returns:
        A new frame holding only the kept rows, in their original order,
        with a fresh 0..n-1 index.
    """
    # Nothing to compare; skip the encoder entirely.
    if df.empty:
        return df.reset_index(drop=True)

    sentences = df['utterance'].tolist()
    embeddings = model.encode(sentences, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)

    kept = []        # positions to keep, in original order
    dropped = set()  # positions already identified as near-duplicates
    for i in range(len(sentences)):
        if i in dropped:
            continue
        kept.append(i)
        # Pull the row out once as plain floats instead of indexing the
        # tensor element by element.
        row_scores = cosine_scores[i].tolist()
        for j in range(i + 1, len(sentences)):
            if j in dropped:
                continue
            if row_scores[j] >= threshold:
                print(f"Similarity between '{sentences[i]}' and '{sentences[j]}' is {row_scores[j]:.2f}")
                dropped.add(j)
    return df.iloc[kept].reset_index(drop=True)

def get_similar_sentences(model: SentenceTransformer, user_text: str, df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """Return the stored utterances most similar to ``user_text``.

    Args:
        model: Sentence-embedding model used to encode the texts.
        user_text: The query text.
        df: Frame with ``utterance``, ``intent`` and ``combined`` columns.
        top_n: Maximum number of matches to return.

    Returns:
        A frame with the columns ``utterance``, ``intent``, ``combined`` and
        ``similarity`` (the cosine similarity to ``user_text``), ordered from
        most to least similar.
    """
    columns = ['utterance', 'intent', 'combined', 'similarity']

    # Nothing to rank; return an empty frame with the expected columns.
    if df.empty:
        empty = df.iloc[0:0].copy()
        empty['similarity'] = pd.Series(dtype='float64')
        return empty[columns]

    user_embedding = model.encode(user_text, convert_to_tensor=True)
    embeddings = model.encode(df['utterance'].tolist(), convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(user_embedding, embeddings)[0]
    top_matches = cosine_scores.argsort(descending=True)[:top_n]

    # Hand pandas plain Python ints rather than a tensor, so indexing does
    # not depend on which device the tensor lives on.
    positions = top_matches.cpu().tolist()

    # Work on a copy so the caller's frame is left untouched, and attach the
    # score that the selected 'similarity' column is expected to hold.
    result = df.iloc[positions].copy()
    result['similarity'] = cosine_scores[top_matches].cpu().tolist()
    return result[columns]

# CSV file holding the utterances to search.
file_path = r'C:\Users\ZZ029K826\Documents\GitHub\LLM_Intent_Recognition\data\InvoiceDetailsExplanation.csv'
utterances = pd.read_csv(file_path)

# Sentence-embedding model used for every similarity lookup below.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Sample user inputs to look up.
examples = ['vreau detalii despre ultima factura','cat pot sa platesc','informatii factura','vreau informatii despre costuri','as dori sa aflu ultima factura']

for example in examples:
    # Compute the matches for this input before printing them; the name
    # `similar_sentences` is not assigned anywhere else.
    similar_sentences = get_similar_sentences(model, example, utterances)
    print(f"Input: {example}")
    print(similar_sentences)
    print("\n")