Hong committed on
Commit
d49e18f
1 Parent(s): b720f19

Upload utils.py

Files changed (1)
  1. utils.py +93 -0
utils.py ADDED
@@ -0,0 +1,93 @@
+ from transformers import AutoTokenizer
+ from transformers import AutoModelForSequenceClassification
+ import torch
+ from sentence_transformers import SentenceTransformer, util
+ import gensim
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn import preprocessing
+ import numpy as np
+ import pandas as pd
+
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+ # Zero-shot classification model pre-trained on MNLI.
+ tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
+ nli_model = AutoModelForSequenceClassification.from_pretrained(
+     "facebook/bart-large-mnli"
+ ).to(device)
+
+
+ def get_prob(sequence, label):
+     """Return the probability that `label` applies to `sequence` (zero-shot NLI)."""
+     premise = sequence
+     hypothesis = f"This example is {label}."
+
+     # run through model pre-trained on MNLI
+     x = tokenizer.encode(
+         premise, hypothesis, return_tensors="pt", truncation="only_first"
+     )
+     with torch.no_grad():
+         logits = nli_model(x.to(device))[0]
+
+     # we throw away "neutral" (dim 1) and take the probability of
+     # "entailment" (2) as the probability of the label being true
+     entail_contradiction_logits = logits[:, [0, 2]]
+     probs = entail_contradiction_logits.softmax(dim=1)
+     prob_label_is_true = probs[:, 1]
+     return prob_label_is_true[0].item()
+
+
+ def get_prob_lists(sequence, labels):
+     """Score `sequence` against each candidate label and return the probabilities."""
+     return [get_prob(sequence, label) for label in labels]
+
+
+ # Sentence-embedding model used for semantic similarity between texts.
+ compare_model = SentenceTransformer("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")
+
+
+ def compare_sentence(query_text, docs):
+     """Return the mean dot-product similarity between `query_text` and each doc."""
+     query_emb = compare_model.encode(query_text)
+     doc_emb = compare_model.encode(docs)
+     scores = util.dot_score(query_emb, doc_emb)[0].tolist()
+     return np.mean(scores)
+
+
+ def query_jds(DB, keyword):
+     """Retrieve the job descriptions in `DB` that best match `keyword`."""
+     keywords = " ".join(gensim.utils.simple_preprocess(keyword, deacc=True))
+     temp_tf_matrix = tfidf_matrix(DB, tokenized="tokenized", name="Title")
+     target = query(DB, keywords, temp_tf_matrix)
+     return target
+
+
+ def query(df, keywords, tf_matrix):
+     """Rank rows of `df` by TF-IDF score for `keywords` and return the top 5 matches."""
+     keywords = " ".join(gensim.utils.simple_preprocess(keywords, deacc=True))
+     # Note: this writes a temporary "Query_score" column onto the passed DataFrame.
+     df["Query_score"] = tfidf_score(tf_matrix, keywords)
+     q = df.loc[df["Query_score"] > 0.3].sort_values(by="Query_score", ascending=False)
+
+     result = q[:5].reset_index(drop=True)
+     return result.drop("Query_score", axis=1)
+
+
+ def tfidf_score(tf_matrix, keyword):
+     """Sum the TF-IDF weights of each keyword across all documents in `tf_matrix`."""
+     vector = np.zeros(tf_matrix.shape[1])
+     for word in keyword.split():
+         if word in tf_matrix.index:
+             vector = vector + tf_matrix.loc[word].values
+     return vector
+
+
+ def tfidf_matrix(data, tokenized="tokenized", name="Course_Name"):
+     """Build a (vocabulary x documents) TF-IDF matrix from pre-tokenized text."""
+     corpus = [" ".join(tokens) for tokens in data[tokenized]]
+     tfidf_vectorizer = TfidfVectorizer().fit(corpus)
+
+     scores_matrix = tfidf_vectorizer.transform(corpus).toarray().T
+     vocab = tfidf_vectorizer.get_feature_names_out()
+     courses = data[name].values
+     # Min-max scale each term's scores across documents so they are comparable.
+     scores_matrix = preprocessing.minmax_scale(scores_matrix.T).T
+     scores = pd.DataFrame(scores_matrix, index=vocab, columns=courses)
+     return scores
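
A minimal usage sketch of the uploaded helpers, assuming a small pandas DataFrame standing in for the job-description database with the "Title" and "tokenized" columns that query_jds expects; the sample rows, labels, and query strings below are illustrative only and not part of the commit.

import pandas as pd
from utils import get_prob_lists, compare_sentence, query_jds

# Illustrative stand-in for the job-description database (not part of the commit).
DB = pd.DataFrame(
    {
        "Title": ["Data Scientist", "Backend Engineer"],
        "tokenized": [
            ["data", "scientist", "machine", "learning"],
            ["backend", "engineer", "python", "api"],
        ],
    }
)

# Zero-shot label probabilities for a single sentence.
probs = get_prob_lists("I build machine learning models.", ["technical", "creative"])

# Mean embedding similarity between a query and a list of documents.
sim = compare_sentence("machine learning", ["deep learning models", "cooking recipes"])

# Top TF-IDF matches in DB for a keyword query.
matches = query_jds(DB, "machine learning")

print(probs, sim)
print(matches)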