abdulmatinomotoso committed on
Commit c63dd3d
1 Parent(s): 7542a39

Create app.py

Files changed (1)
  1. app.py +103 -0
app.py ADDED
@@ -0,0 +1,103 @@
+ from huggingface_hub import hf_hub_url
+ from datasets import load_dataset, Dataset
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+ import pandas as pd
+ import gradio as gr
+
+ # Sentence-embedding model trained for semantic search with dot-product similarity
+ model_checkpoint = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
+ tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+ model = AutoModel.from_pretrained(model_checkpoint)
+
+ # Resolve the download URL for the GitHub-issues dump hosted on the Hub
+ data_files = hf_hub_url(
+     repo_id="lewtun/github-issues",
+     filename="datasets-issues-with-comments.jsonl",
+     repo_type="dataset",
+ )
+
+ issues_dataset = load_dataset("json", data_files=data_files, split="train")
+
+ # Keep only genuine issues (not pull requests) that have at least one comment
+ issues_dataset = issues_dataset.filter(
+     lambda x: (x["is_pull_request"] == False and len(x["comments"]) > 0)
+ )
+
+ columns = issues_dataset.column_names
+ columns_to_keep = ["title", "body", "html_url", "comments"]
+ columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
+ issues_dataset = issues_dataset.remove_columns(columns_to_remove)
+
+ # Explode the list of comments so that each row holds a single comment
+ issues_dataset.set_format("pandas")
+ df = issues_dataset[:]
+ comments_df = df.explode("comments", ignore_index=True)
+ comments_dataset = Dataset.from_pandas(comments_df)
+
+ # Drop very short comments (15 words or fewer), which carry little signal
+ comments_dataset = comments_dataset.map(
+     lambda x: {"length_comment": len(x["comments"].split())}
+ )
+ comments_dataset = comments_dataset.filter(lambda x: x["length_comment"] > 15)
+
+ def concatenate_text(examples):
+     # Join title, body, and comment into one text field for embedding
+     return {
+         "text": examples["title"]
+         + " \n "
+         + examples["body"]
+         + " \n "
+         + examples["comments"]
+     }
+
+ comments_dataset = comments_dataset.map(concatenate_text)
+
+ # Fall back to CPU when no GPU is available
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model = model.to(device)
+
+ def cls_pooling(model_output):
+     # Use the [CLS] token's hidden state as the sequence embedding
+     return model_output.last_hidden_state[:, 0]
+
+ def get_embeddings(text_list):
+     encoded_input = tokenizer(
+         text_list, padding=True, truncation=True, return_tensors="pt"
+     )
+     encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
+     with torch.no_grad():
+         model_output = model(**encoded_input)
+     return cls_pooling(model_output)
+
+ # Embed every document, then index the embeddings with FAISS
+ embeddings_dataset = comments_dataset.map(
+     lambda x: {"embeddings": get_embeddings(x["text"]).detach().cpu().numpy()[0]}
+ )
+ embeddings_dataset.add_faiss_index(column="embeddings")
+
+ def search(question):
+     question_embedding = get_embeddings([question]).cpu().detach().numpy()
+
+     # Retrieve the five nearest documents by dot-product similarity
+     scores, samples = embeddings_dataset.get_nearest_examples(
+         "embeddings", question_embedding, k=5
+     )
+
+     samples_df = pd.DataFrame.from_dict(samples)
+     samples_df["scores"] = scores
+     samples_df.sort_values("scores", ascending=False, inplace=True)
+
+     string = ""
+     for _, row in samples_df.iterrows():
+         string += f"COMMENT: {row.comments}\n"
+         string += f"SCORE: {row.scores}\n"
+         string += f"TITLE: {row.title}\n"
+         string += f"URL: {row.html_url}\n"
+         string += "=" * 50
+         string += "\n"
+
+     return string
+
+ demo = gr.Interface(
+     fn=search,
+     inputs=gr.Textbox(label="Question"),
+     outputs=gr.Textbox(label="Results"),
+     title="Datasets issues search engine",
+ )
+
+ if __name__ == "__main__":
+     demo.launch(debug=True)