Spaces:
Build error
Build error
Hong
committed on
Commit
•
d49e18f
1
Parent(s):
b720f19
Upload utils.py
Browse files
utils.py
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoTokenizer
|
2 |
+
from transformers import AutoModelForSequenceClassification
|
3 |
+
import torch
|
4 |
+
from sentence_transformers import SentenceTransformer, util
|
5 |
+
import gensim
|
6 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
7 |
+
from sklearn import preprocessing
|
8 |
+
import numpy as np
|
9 |
+
import pandas as pd
|
10 |
+
|
11 |
+
# Run on the first CUDA device when one is available; otherwise stay on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Checkpoint shared by the tokenizer and the NLI classifier.
_NLI_CHECKPOINT = "facebook/bart-large-mnli"

tokenizer = AutoTokenizer.from_pretrained(_NLI_CHECKPOINT)

# Load the weights once and move them to `device`, the same target that
# get_prob sends its input tensors to.
nli_model = AutoModelForSequenceClassification.from_pretrained(_NLI_CHECKPOINT).to(
    device
)
|
21 |
+
|
22 |
+
|
23 |
+
def get_prob(sequence, label):
    """Return the probability that ``label`` is true of ``sequence``.

    The text is used as an NLI premise and paired with the hypothesis
    ``"This example is {label}."``. The pair is scored by the module-level
    ``nli_model`` after being encoded with the module-level ``tokenizer``.

    Args:
        sequence: Text to classify (the premise).
        label: Candidate label inserted into the hypothesis template.

    Returns:
        A Python float: the softmax weight of the "entailment" logit when
        only the "contradiction" and "entailment" logits are considered.
    """
    premise = sequence
    hypothesis = f"This example is {label}."

    # run through model pre-trained on MNLI; if the pair is too long, only
    # the premise (first sequence) is truncated
    x = tokenizer.encode(
        premise, hypothesis, return_tensors="pt", truncation="only_first"
    )

    # Inference only, so skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        logits = nli_model(x.to(device))[0]

    # we throw away "neutral" (dim 1) and take the probability of
    # "entailment" (2) as the probability of the label being true
    entail_contradiction_logits = logits[:, [0, 2]]
    probs = entail_contradiction_logits.softmax(dim=1)
    prob_label_is_true = probs[:, 1]
    return prob_label_is_true[0].item()
|
39 |
+
|
40 |
+
|
41 |
+
def get_prob_lists(sequence, labels):
    """Score ``sequence`` against every candidate in ``labels``.

    Args:
        sequence: Text passed unchanged to ``get_prob`` for each label.
        labels: Iterable of candidate labels.

    Returns:
        A list holding one ``get_prob`` result per label, in the order the
        labels were supplied.
    """
    return [get_prob(sequence, candidate) for candidate in labels]
|
46 |
+
|
47 |
+
|
48 |
+
# Sentence-embedding model used by compare_sentence; created once at import time.
compare_model = SentenceTransformer("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")
|
49 |
+
|
50 |
+
|
51 |
+
def compare_sentence(query, docs):
    """Return the mean dot-product score between ``query`` and ``docs``.

    Both inputs are embedded with the module-level ``compare_model``. The
    query embedding is scored against the document embeddings with
    ``util.dot_score`` and the resulting scores are averaged.

    Args:
        query: Text to embed as the query.
        docs: Text(s) to embed and compare against the query.

    Returns:
        The ``np.mean`` of the query-versus-document scores.

    NOTE(review): behaviour for an empty ``docs`` is not handled here;
    confirm callers always pass at least one document.
    """
    query_emb = compare_model.encode(query)
    doc_emb = compare_model.encode(docs)
    # tolist() yields plain Python floats, so moving the score tensor to
    # another device first would only add a needless transfer.
    scores = util.dot_score(query_emb, doc_emb)[0].tolist()
    return np.mean(scores)
|
56 |
+
|
57 |
+
|
58 |
+
def query_jds(DB, keyword):
    """Look up the rows of ``DB`` that best match ``keyword``.

    A TF-IDF table is built from ``DB`` (tokens read from the "tokenized"
    column, columns labelled by "Title") and handed to ``query`` together
    with the preprocessed keyword text.

    Args:
        DB: Table providing the "tokenized" and "Title" columns.
        keyword: Raw search text.

    Returns:
        Whatever ``query`` returns for the preprocessed keyword.
    """
    # simple_preprocess tokenises the text; deacc=True strips accents.
    tokens = gensim.utils.simple_preprocess(keyword, deacc=True)
    title_scores = tfidf_matrix(DB, tokenized="tokenized", name="Title")
    return query(DB, " ".join(tokens), title_scores)
|
63 |
+
|
64 |
+
|
65 |
+
def query(df, keywords, tf_matrix, threshold=0.3, top_n=5):
    """Return the best-scoring rows of ``df`` for ``keywords``.

    Each row is scored with ``tfidf_score(tf_matrix, keywords)``. Rows
    scoring above ``threshold`` are kept, ordered from highest to lowest
    score, and cut to the first ``top_n``.

    Args:
        df: DataFrame with one row per column of ``tf_matrix``.
        keywords: Search text; it is run through
            ``gensim.utils.simple_preprocess`` before scoring.
        tf_matrix: Term-by-document score table passed to ``tfidf_score``.
        threshold: Minimum score (exclusive) a row needs to be returned.
        top_n: Maximum number of rows returned.

    Returns:
        A new DataFrame with a fresh 0..k-1 index and no ``Query_score``
        column. ``df`` itself is left untouched.
    """
    keywords = " ".join(gensim.utils.simple_preprocess(keywords, deacc=True))

    # assign() works on a copy, so the caller's frame does not silently gain
    # a "Query_score" column.
    scored = df.assign(Query_score=tfidf_score(tf_matrix, keywords))
    matches = scored.loc[scored["Query_score"] > threshold].sort_values(
        by="Query_score", ascending=False
    )

    result = matches[:top_n].reset_index(drop=True)
    return result.drop("Query_score", axis=1)
|
74 |
+
|
75 |
+
|
76 |
+
def tfidf_score(tf_matrix, keyword):
    """Sum the rows of ``tf_matrix`` selected by the words in ``keyword``.

    Args:
        tf_matrix: DataFrame indexed by term, one column per document.
        keyword: Whitespace-separated terms. Terms missing from the index
            are ignored; a repeated term is counted once per occurrence.

    Returns:
        A 1-D float array with one accumulated score per column of
        ``tf_matrix``. It is all zeros when no term matches.
    """
    # Float zeros keep the result dtype the same whether or not any term
    # matches (an integer accumulator would only turn float after a match).
    vector = np.zeros(tf_matrix.shape[1], dtype=float)
    vocabulary = tf_matrix.index
    for term in keyword.split():
        if term in vocabulary:
            vector = vector + tf_matrix.loc[term].values
    return vector
|
82 |
+
|
83 |
+
|
84 |
+
def tfidf_matrix(data, tokenized="tokenized", name="Course_Name"):
    """Build a min-max-scaled TF-IDF table of terms by documents.

    Args:
        data: Table where ``data[tokenized]`` yields one iterable of token
            strings per document and ``data[name]`` yields the labels used
            as column names.
        tokenized: Column holding the token lists.
        name: Column holding the document labels.

    Returns:
        A DataFrame indexed by vocabulary term with one column per
        document. Each term's scores are rescaled across documents with
        ``preprocessing.minmax_scale``.
    """
    corpus = [" ".join(tokens) for tokens in data[tokenized]]
    vectorizer = TfidfVectorizer()
    # Dense (documents x terms) TF-IDF weights.
    doc_term = vectorizer.fit_transform(corpus).toarray()

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement and fall back only on releases that predate it.
    try:
        vocab = vectorizer.get_feature_names_out()
    except AttributeError:
        vocab = vectorizer.get_feature_names()

    # minmax_scale works column-wise, i.e. per term across documents; the
    # transpose afterwards puts terms on the rows.
    scaled = preprocessing.minmax_scale(doc_term)
    courses = data[name].values
    return pd.DataFrame(scaled.T, index=vocab, columns=courses)
|