buelfhood committed
Commit 2aacaa3 · verified · 1 Parent(s): 8f6434a

Create app.py

Files changed (1)
  1. app.py +170 -0
app.py ADDED
@@ -0,0 +1,170 @@
+ # Import necessary libraries
+ import gradio as gr
+ import numpy as np
+ import pandas as pd
+ from rapidfuzz.distance import Levenshtein, JaroWinkler
+ from sentence_transformers import SentenceTransformer, util
+ from typing import List
+ import zipfile
+ import os
+ import io
+
+ def calculate_similarity(code1, code2, Ws, Wl, Wj, model_name):
+     model = SentenceTransformer(model_name)
+     embedding1 = model.encode(code1)
+     embedding2 = model.encode(code2)
+     sim_similarity = util.cos_sim(embedding1, embedding2).item()
+     lev_ratio = Levenshtein.normalized_similarity(code1, code2)
+     jaro_winkler_ratio = JaroWinkler.normalized_similarity(code1, code2)
+     overall_similarity = Ws * sim_similarity + Wl * lev_ratio + Wj * jaro_winkler_ratio
+
+     return "The similarity score between the two codes is: %.2f" % overall_similarity
+
+ # Extract file names and file contents from the uploaded archive
+ def extract_and_read_compressed_file(file_path):
+     file_names = []
+     codes = []
+
+     # Handle .zip files
+     if file_path.endswith('.zip'):
+         with zipfile.ZipFile(file_path, 'r') as z:
+             file_names = z.namelist()
+             codes = [z.read(file).decode('utf-8', errors='ignore') for file in file_names]
+
+     else:
+         raise ValueError("Unsupported file type. Only .zip is supported.")
+
+     return file_names, codes
+
+ def filter_and_return_top(df, similarity_threshold, returned_results):
+     filtered_df = df[df['similarity_score'] > similarity_threshold]
+     return filtered_df.head(returned_results)
+
+ # Perform paraphrase mining with the specified weights
+ def perform_paraphrase_mining(model, codes_list, weight_semantic, weight_levenshtein, weight_jaro_winkler):
+     return paraphrase_mining_with_combined_score(
+         model,
+         codes_list,
+         weight_semantic=weight_semantic,
+         weight_levenshtein=weight_levenshtein,
+         weight_jaro_winkler=weight_jaro_winkler
+     )
+
+ def paraphrase_mining_with_combined_score(
+     model,
+     sentences: List[str],
+     show_progress_bar: bool = False,
+     weight_semantic: float = 1.0,
+     weight_levenshtein: float = 0.0,
+     weight_jaro_winkler: float = 0.0
+ ):
+     embeddings = model.encode(
+         sentences, show_progress_bar=show_progress_bar, convert_to_tensor=True)
+     paraphrases = util.paraphrase_mining_embeddings(embeddings, score_function=util.cos_sim)
+
+     results = []
+     for score, i, j in paraphrases:
+         lev_ratio = Levenshtein.normalized_similarity(sentences[i], sentences[j])
+         jaro_winkler_ratio = JaroWinkler.normalized_similarity(sentences[i], sentences[j])
+
+         combined_score = (weight_semantic * score) + \
+                          (weight_levenshtein * lev_ratio) + \
+                          (weight_jaro_winkler * jaro_winkler_ratio)
+
+         results.append([combined_score, i, j])
+
+     results = sorted(results, key=lambda x: x[0], reverse=True)
+     return results
+
+ def get_sim_list(zipped_file, Ws, Wl, Wj, model_name, threshold, number_results):
+     file_names, codes = extract_and_read_compressed_file(zipped_file)
+     model = SentenceTransformer(model_name)
+     code_pairs = perform_paraphrase_mining(model, codes, Ws, Wl, Wj)
+     pairs_results = []
+
+     for score, i, j in code_pairs:
+         pairs_results.append({
+             'file_name_1': file_names[i],
+             'file_name_2': file_names[j],
+             'similarity_score': score
+         })
+
+     similarity_df = pd.DataFrame(pairs_results)
+     similarity_df = similarity_df.sort_values(by='similarity_score', ascending=False)
+     result = filter_and_return_top(similarity_df, threshold, number_results).round(2)
+
+     return result
+
+ # Define the Gradio app
+ with gr.Blocks(theme=gr.themes.Glass()) as demo:
+     # Tab for similarity calculation
+     with gr.Tab("Code Pair Similarity"):
+         # Input components
+         code1 = gr.Textbox(label="Code 1")
+         code2 = gr.Textbox(label="Code 2")
+
+         # Accordion for weights and models
+         with gr.Accordion("Weights and Models", open=False):
+             Ws = gr.Slider(0, 1, value=0.7, label="Semantic Search Weight", step=0.1)
+             Wl = gr.Slider(0, 1, value=0.3, label="Levenshtein Distance Weight", step=0.1)
+             Wj = gr.Slider(0, 1, value=0.0, label="Jaro-Winkler Weight", step=0.1)
+             model_dropdown = gr.Dropdown(
+                 [("codebert", "microsoft/codebert-base"),
+                  ("graphcodebert", "microsoft/graphcodebert-base"),
+                  ("UnixCoder", "microsoft/unixcoder-base-unimodal"),
+                  ("CodeBERTa", "huggingface/CodeBERTa-small-v1"),
+                  ("CodeT5 small", "Salesforce/codet5-small"),
+                  ("PLBART", "uclanlp/plbart-java-cs")],
+                 label="Select Model",
+                 value="uclanlp/plbart-java-cs"
+             )
+
+         # Output component
+         output = gr.Textbox(label="Similarity Score")
+
+         def update_weights(Ws, Wl, Wj):
+             total = Ws + Wl + Wj
+             if total != 1:
+                 Wj = 1 - (Ws + Wl)
+             return Ws, Wl, Wj
+
+         # Update weights when any slider changes
+         Ws.change(update_weights, [Ws, Wl, Wj], [Ws, Wl, Wj])
+         Wl.change(update_weights, [Ws, Wl, Wj], [Ws, Wl, Wj])
+         Wj.change(update_weights, [Ws, Wl, Wj], [Ws, Wl, Wj])
+
+         # Button to trigger the similarity calculation
+         calculate_btn = gr.Button("Calculate Similarity")
+         calculate_btn.click(calculate_similarity, inputs=[code1, code2, Ws, Wl, Wj, model_dropdown], outputs=output)
+
+     # Tab for file upload and DataFrame output
+     with gr.Tab("Code Collection Pair Similarity"):
+         # File uploader component
+         file_uploader = gr.File(label="Upload a Zip file", file_types=[".zip"])
+
+         with gr.Accordion("Weights and Models", open=False):
+             Ws = gr.Slider(0, 1, value=0.7, label="Semantic Search Weight", step=0.1)
+             Wl = gr.Slider(0, 1, value=0.3, label="Levenshtein Distance Weight", step=0.1)
+             Wj = gr.Slider(0, 1, value=0.0, label="Jaro-Winkler Weight", step=0.1)
+             model_dropdown = gr.Dropdown(
+                 [("codebert", "microsoft/codebert-base"),
+                  ("graphcodebert", "microsoft/graphcodebert-base"),
+                  ("UnixCoder", "microsoft/unixcoder-base-unimodal"),
+                  ("CodeBERTa", "huggingface/CodeBERTa-small-v1"),
+                  ("CodeT5 small", "Salesforce/codet5-small"),
+                  ("PLBART", "uclanlp/plbart-java-cs")],
+                 label="Select Model",
+                 value="uclanlp/plbart-java-cs"
+             )
+             threshold = gr.Slider(0, 1, value=0, label="Threshold", step=0.01)
+             number_results = gr.Slider(1, 1000, value=10, label="Number of Returned Pairs", step=1)
+
+         # Output component for the DataFrame
+         df_output = gr.Dataframe(label="Uploaded Data")
+
+         # Button to trigger the file processing
+         process_btn = gr.Button("Process File")
+         process_btn.click(get_sim_list, inputs=[file_uploader, Ws, Wl, Wj, model_dropdown, threshold, number_results], outputs=df_output)
+
+ # Launch the Gradio app
+ demo.launch(show_error=True, debug=True)
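
For reference, the score returned by calculate_similarity is a weighted sum of three normalized signals: Ws times the cosine similarity of the embeddings, Wl times the Levenshtein similarity, and Wj times the Jaro-Winkler similarity. The snippet below is a minimal sketch of that computation outside the Gradio UI; the two Java strings and the microsoft/codebert-base checkpoint are illustrative assumptions only, and plain encoder checkpoints are wrapped with mean pooling by SentenceTransformer.

# Minimal sketch (not part of the commit) mirroring calculate_similarity in app.py.
# Assumes sentence-transformers and rapidfuzz are installed; model and inputs are examples.
from rapidfuzz.distance import Levenshtein, JaroWinkler
from sentence_transformers import SentenceTransformer, util

code1 = "public int add(int a, int b) { return a + b; }"  # example input
code2 = "public int sum(int x, int y) { return x + y; }"  # example input
Ws, Wl, Wj = 0.7, 0.3, 0.0  # weights are expected to sum to 1

model = SentenceTransformer("microsoft/codebert-base")  # example checkpoint
semantic = util.cos_sim(model.encode(code1), model.encode(code2)).item()
lev = Levenshtein.normalized_similarity(code1, code2)
jaro = JaroWinkler.normalized_similarity(code1, code2)

print("The similarity score between the two codes is: %.2f" % (Ws * semantic + Wl * lev + Wj * jaro))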