from huggingface_hub import hf_hub_url
from datasets import load_dataset
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel
import torch
import gradio as gr
import pandas as pd

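# Load the tokenizer and sentence-embedding model used for semantic search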
model_checkpoint = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModel.from_pretrained(model_checkpoint)

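# Resolve the download URL of the GitHub issues dataset hosted on the Hugging Face Hub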
data_files = hf_hub_url(
    repo_id="lewtun/github-issues",
    filename="datasets-issues-with-comments.jsonl",
    repo_type="dataset",
)

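# Load the issues (JSON Lines) as a Dataset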
issues_dataset = load_dataset("json", data_files=data_files, split="train")

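# Keep only genuine issues (not pull requests) that have at least one comment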
issues_dataset = issues_dataset.filter(
    lambda x: not x["is_pull_request"] and len(x["comments"]) > 0
)

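# Drop every column except the ones needed to build the search corpus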
columns = issues_dataset.column_names
columns_to_keep = ["title", "body", "html_url", "comments"]
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)

issues_dataset = issues_dataset.remove_columns(columns_to_remove)

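# Switch to pandas so the list of comments can be exploded into one row per comment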
issues_dataset.set_format("pandas")
df = issues_dataset[:]

comments_df = df.explode("comments", ignore_index=True)

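# Convert back to a Dataset now that each row holds a single comment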
comments_dataset = Dataset.from_pandas(comments_df)

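# Record the word count of each comment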
comments_dataset = comments_dataset.map(
    lambda x: {"length_comment": len(x["comments"].split())}
)

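# Drop very short comments, which rarely contain useful answers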
comments_dataset = comments_dataset.filter(
    lambda x: x["length_comment"] > 15
)

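# Concatenate the issue title, body, and comment into one text field to embed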
def concatenate_text(examples):
    return {
        "text": examples["title"]
        + " \n "
        + examples["body"]
        + " \n "
        + examples["comments"]
    }

comments_dataset = comments_dataset.map(concatenate_text)

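# Run inference on CPU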
device = torch.device("cpu")
model = model.to(device)

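# CLS pooling: use the hidden state of the [CLS] token as the text embedding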
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]
    
def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)
    
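# Embed the text of every row and store it as a 1-D numpy vector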
embeddings_dataset = comments_dataset.map(
    lambda x: {"embeddings": get_embeddings(x["text"]).detach().cpu().numpy()[0]}
)

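# Build a FAISS index over the embeddings for fast nearest-neighbour search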
embeddings_dataset.add_faiss_index(column="embeddings")

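# Embed the question, retrieve the 5 closest comments, and format them for display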
def search(question):
    question_embedding = get_embeddings([question]).cpu().detach().numpy()

    scores, samples = embeddings_dataset.get_nearest_examples(
        "embeddings", question_embedding, k=5
    )

    samples_df = pd.DataFrame.from_dict(samples)
    samples_df["scores"] = scores
    samples_df.sort_values("scores", ascending=False, inplace=True)

    string = ""
    for _, row in samples_df.iterrows():
        string += f"COMMENT: {row.comments}\n"
        string += f"SCORE: {row.scores}\n"
        string += f"TITLE: {row.title}\n"
        string += f"URL: {row.html_url}\n"
        string += "=" * 50
        string += "\n"

    return string

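# Simple Gradio UI: one textbox for the question, one for the formatted results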
demo = gr.Interface(
    search,
    inputs=gr.Textbox(),
    outputs=gr.Textbox(),
    title="Datasets issues search engine",
)

if __name__ == "__main__":
    demo.launch(debug=True)