Create app.py
app.py
ADDED
@@ -0,0 +1,170 @@
# Import necessary libraries
import gradio as gr
import numpy as np
import pandas as pd
from rapidfuzz.distance import Levenshtein, JaroWinkler
from sentence_transformers import SentenceTransformer, util
from typing import List
import zipfile
import os
import io

def calculate_similarity(code1, code2, Ws, Wl, Wj, model_name):
    # Embed both snippets and combine semantic and string-based similarities
    model = SentenceTransformer(model_name)
    embedding1 = model.encode(code1)
    embedding2 = model.encode(code2)
    sim_similarity = util.cos_sim(embedding1, embedding2).item()
    lev_ratio = Levenshtein.normalized_similarity(code1, code2)
    jaro_winkler_ratio = JaroWinkler.normalized_similarity(code1, code2)
    overall_similarity = Ws * sim_similarity + Wl * lev_ratio + Wj * jaro_winkler_ratio

    return "The similarity score between the two codes is: %.2f" % overall_similarity

# Define the function to process the uploaded file and return the file names and contents
def extract_and_read_compressed_file(file_path):
    file_names = []
    codes = []

    # Handle .zip files
    if file_path.endswith('.zip'):
        with zipfile.ZipFile(file_path, 'r') as z:
            file_names = z.namelist()
            codes = [z.read(file).decode('utf-8', errors='ignore') for file in file_names]

    else:
        raise ValueError("Unsupported file type. Only .zip is supported.")

    return file_names, codes

def filter_and_return_top(df, similarity_threshold, returned_results):
    filtered_df = df[df['similarity_score'] > similarity_threshold]
    return filtered_df.head(returned_results)

# Perform paraphrase mining with the specified weights
def perform_paraphrase_mining(model, codes_list, weight_semantic, weight_levenshtein, weight_jaro_winkler):
    return paraphrase_mining_with_combined_score(
        model,
        codes_list,
        weight_semantic=weight_semantic,
        weight_levenshtein=weight_levenshtein,
        weight_jaro_winkler=weight_jaro_winkler
    )

def paraphrase_mining_with_combined_score(
    model,
    sentences: List[str],
    show_progress_bar: bool = False,
    weight_semantic: float = 1.0,
    weight_levenshtein: float = 0.0,
    weight_jaro_winkler: float = 0.0
):
    embeddings = model.encode(
        sentences, show_progress_bar=show_progress_bar, convert_to_tensor=True)
    paraphrases = util.paraphrase_mining_embeddings(embeddings, score_function=util.cos_sim)

    results = []
    for score, i, j in paraphrases:
        lev_ratio = Levenshtein.normalized_similarity(sentences[i], sentences[j])
        jaro_winkler_ratio = JaroWinkler.normalized_similarity(sentences[i], sentences[j])

        # Weighted combination of semantic, Levenshtein, and Jaro-Winkler similarities
        combined_score = (weight_semantic * score) + \
                         (weight_levenshtein * lev_ratio) + \
                         (weight_jaro_winkler * jaro_winkler_ratio)

        results.append([combined_score, i, j])

    results = sorted(results, key=lambda x: x[0], reverse=True)
    return results

def get_sim_list(zipped_file, Ws, Wl, Wj, model_name, threshold, number_results):
    file_names, codes = extract_and_read_compressed_file(zipped_file)
    model = SentenceTransformer(model_name)
    code_pairs = perform_paraphrase_mining(model, codes, Ws, Wl, Wj)
    pairs_results = []

    for score, i, j in code_pairs:
        pairs_results.append({
            'file_name_1': file_names[i],
            'file_name_2': file_names[j],
            'similarity_score': score
        })

    similarity_df = pd.DataFrame(pairs_results)
    similarity_df = similarity_df.sort_values(by='similarity_score', ascending=False)
    result = filter_and_return_top(similarity_df, threshold, number_results).round(2)

    return result

# Define the Gradio app
with gr.Blocks(theme=gr.themes.Glass()) as demo:
    # Tab for similarity calculation
    with gr.Tab("Code Pair Similarity"):
        # Input components
        code1 = gr.Textbox(label="Code 1")
        code2 = gr.Textbox(label="Code 2")

        # Accordion for weights and models
        with gr.Accordion("Weights and Models", open=False):
            Ws = gr.Slider(0, 1, value=0.7, label="Semantic Search Weight", step=0.1)
            Wl = gr.Slider(0, 1, value=0.3, label="Levenshtein Distance Weight", step=0.1)
            Wj = gr.Slider(0, 1, value=0.0, label="Jaro Winkler Weight", step=0.1)
            model_dropdown = gr.Dropdown(
                [("codebert", "microsoft/codebert-base"),
                 ("graphcodebert", "microsoft/graphcodebert-base"),
                 ("UnixCoder", "microsoft/unixcoder-base-unimodal"),
                 ("CodeBERTa", "huggingface/CodeBERTa-small-v1"),
                 ("CodeT5 small", "Salesforce/codet5-small"),
                 ("PLBART", "uclanlp/plbart-java-cs")],
                label="Select Model",
                value="uclanlp/plbart-java-cs"
            )

        # Output component
        output = gr.Textbox(label="Similarity Score")

        def update_weights(Ws, Wl, Wj):
            # Keep the three weights summing to 1 by adjusting the Jaro-Winkler weight
            total = Ws + Wl + Wj
            if total != 1:
                Wj = 1 - (Ws + Wl)
            return Ws, Wl, Wj

        # Update weights when any slider changes
        Ws.change(update_weights, [Ws, Wl, Wj], [Ws, Wl, Wj])
        Wl.change(update_weights, [Ws, Wl, Wj], [Ws, Wl, Wj])
        Wj.change(update_weights, [Ws, Wl, Wj], [Ws, Wl, Wj])

        # Button to trigger the similarity calculation
        calculate_btn = gr.Button("Calculate Similarity")
        calculate_btn.click(calculate_similarity, inputs=[code1, code2, Ws, Wl, Wj, model_dropdown], outputs=output)

    # Tab for file upload and DataFrame output
    with gr.Tab("Code Collection Pair Similarity"):
        # File uploader component
        file_uploader = gr.File(label="Upload a Zip file", file_types=[".zip"])

        with gr.Accordion("Weights and Models", open=False):
            Ws = gr.Slider(0, 1, value=0.7, label="Semantic Search Weight", step=0.1)
            Wl = gr.Slider(0, 1, value=0.3, label="Levenshtein Distance Weight", step=0.1)
            Wj = gr.Slider(0, 1, value=0.0, label="Jaro Winkler Weight", step=0.1)
            model_dropdown = gr.Dropdown(
                [("codebert", "microsoft/codebert-base"),
                 ("graphcodebert", "microsoft/graphcodebert-base"),
                 ("UnixCoder", "microsoft/unixcoder-base-unimodal"),
                 ("CodeBERTa", "huggingface/CodeBERTa-small-v1"),
                 ("CodeT5 small", "Salesforce/codet5-small"),
                 ("PLBART", "uclanlp/plbart-java-cs")],
                label="Select Model",
                value="uclanlp/plbart-java-cs"
            )

        threshold = gr.Slider(0, 1, value=0, label="Threshold", step=0.01)
        number_results = gr.Slider(1, 1000, value=10, label="Number of Returned Pairs", step=1)

        # Output component for the similarity DataFrame
        df_output = gr.Dataframe(label="Similar Code Pairs")

        # Button to trigger the file processing
        process_btn = gr.Button("Process File")
        process_btn.click(get_sim_list, inputs=[file_uploader, Ws, Wl, Wj, model_dropdown, threshold, number_results], outputs=df_output)

# Launch the Gradio app
demo.launch(show_error=True, debug=True)
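As a quick sanity check, the single-pair helper above can also be exercised directly in a Python session where the functions from app.py are defined. The two snippets and the model name below are illustrative values only (any entry from the model dropdown would work), and the encoder weights are downloaded from the Hugging Face Hub on first use; the weights mirror the app's slider defaults.

# Example only: exercising the calculate_similarity helper defined above.
# Snippets and model name are illustrative, not part of the app itself.
print(calculate_similarity(
    "def add(a, b):\n    return a + b",
    "def sum_two(x, y):\n    return x + y",
    Ws=0.7, Wl=0.3, Wj=0.0,
    model_name="huggingface/CodeBERTa-small-v1",
))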