shakhovak commited on
Commit
8e44dd8
1 Parent(s): 69f2a1b

bienc+intent_added

Browse files
Files changed (5) hide show
  1. data/scripts.pkl +2 -2
  2. data/scripts_vectors.pkl +2 -2
  3. requirements.txt +2 -1
  4. retrieve_bot.py +77 -25
  5. utils.py +134 -11
data/scripts.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd8ded525a9faf9031e899ba75c5b7f91fdc4052619a43ca1ff608a7cce73b42
3
- size 2127113
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2df9355dd53669d082cecbdcfabee2cedba4527b0dfafcc086d7da479f78be48
3
+ size 3031195
data/scripts_vectors.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba242c25adc032bcf265fa1c805bf1f506150f181a6fc13f6753088af79cd9c7
3
- size 71223174
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3452fc927c68cb4cfd2a3eacd1b86158cb64696ef41b419ef45d50f9946196b
3
+ size 100818899
requirements.txt CHANGED
@@ -4,4 +4,5 @@ pandas==1.3.5
4
  gunicorn==20.1.0
5
  requests==2.27.
6
  datasets==2.13.2
7
- transformers==4.37.2
 
 
4
  gunicorn==20.1.0
5
  requests==2.27.
6
  datasets==2.13.2
7
+ transformers==4.37.2
8
+ DialogTag==1.1.3
retrieve_bot.py CHANGED
@@ -1,20 +1,46 @@
1
  import pandas as pd
2
  import pickle
 
3
  from sentence_transformers import SentenceTransformer
4
- from utils import encode, cosine_sim, top_candidates, candidates_reranking
 
 
 
 
 
 
5
  from collections import deque
6
  from transformers import pipeline
7
  import torch
8
  from transformers import AutoTokenizer
 
9
 
10
  # this class represents the main functions of the retrieval bot
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  class ChatBot:
14
  def __init__(self):
15
  self.vect_data = []
16
  self.scripts = []
17
  self.conversation_history = deque([], maxlen=5)
 
18
  self.ranking_model = None
19
  self.reranking_model = None
20
  self.device = None
@@ -27,46 +53,72 @@ class ChatBot:
27
 
28
  with open("data/scripts_vectors.pkl", "rb") as fp:
29
  self.vect_data = pickle.load(fp)
30
- self.scripts = pd.read_pickle("data/scripts.pkl")
31
- self.ranking_model = SentenceTransformer("sentence-transformers/LaBSE")
32
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
33
- self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
 
 
 
 
34
  self.reranking_model = pipeline(
35
  model="Shakhovak/RerankerModel_chat_bot",
36
  device=self.device,
37
- tokenizer=self.tokenizer,
38
  )
39
 
40
  def generate_response(self, utterance: str) -> str:
41
  """this functions identifies potential
42
  candidates for answer and ranks them"""
 
 
43
  query_encoding = encode(
44
- utterance, self.ranking_model, contexts=self.conversation_history
 
 
 
45
  )
46
- bot_cosine_scores = cosine_sim(self.vect_data, query_encoding)
47
- top_scores, top_indexes = top_candidates(bot_cosine_scores, top=5)
48
-
49
- # test candidates and collects them with label 0 to dictionary
50
-
51
- reranked_dict = candidates_reranking(
52
- top_indexes,
53
- self.conversation_history,
54
- utterance,
55
- self.scripts,
56
- self.reranking_model,
57
  )
58
- # if any candidates were selected, range them and pick up the top
59
- # else keep up the initial top 1
 
 
 
 
60
 
61
- if len(reranked_dict) >= 1:
62
- updated_top_candidates = dict(
63
- sorted(reranked_dict.items(), key=lambda item: item[1])
 
 
 
64
  )
65
- answer = self.scripts.iloc[list(updated_top_candidates.keys())[0]]["answer"]
66
- else:
67
- answer = self.scripts.iloc[top_indexes[0]]["answer"]
 
 
 
 
 
 
 
 
 
 
68
 
69
  self.conversation_history.append(utterance)
70
  self.conversation_history.append(answer)
71
 
72
  return answer
 
 
 
 
 
 
1
  import pandas as pd
2
  import pickle
3
+ import random
4
  from sentence_transformers import SentenceTransformer
5
+ from utils import (
6
+ encode,
7
+ cosine_sim,
8
+ top_candidates,
9
+ candidates_reranking,
10
+ intent_classification,
11
+ )
12
  from collections import deque
13
  from transformers import pipeline
14
  import torch
15
  from transformers import AutoTokenizer
16
+ from dialog_tag import DialogTag
17
 
18
  # this class represents the main functions of the retrieval bot
19
 
20
# Canned fallback replies. ChatBot.generate_response picks one of these at
# random when the best retrieval candidate scores below its 0.9 cosine
# threshold. The strings are sent to the user verbatim.
low_scoring_list = [
    "What does it mean?",
    "You have two strikes. Three strikes and you’ re out. It’ s a sports metaphor. Explain again!",
    "Again, urban slang. In which, I believe I’ m gaining remarkable fluency. So, could you repeat?",
    "I’m confused.",
    "I can’t comment without violating our agreement that I don’ t criticize you.",
    "Oh!",
    "I need to use the restroom.",
    "Move. Move. Move!",
    "I was going to mention it at the time, but then I thought, some day maybe...",
    "Well...",
    "Apparently... I have no idea!?",
    "I’m not sure...",
    "Nothing. I say nothing.",
    "Well, my friend. Focus and repeat!",
]
36
+
37
 
38
  class ChatBot:
39
  def __init__(self):
40
  self.vect_data = []
41
  self.scripts = []
42
  self.conversation_history = deque([], maxlen=5)
43
+ self.tag_model = None
44
  self.ranking_model = None
45
  self.reranking_model = None
46
  self.device = None
 
53
 
54
  with open("data/scripts_vectors.pkl", "rb") as fp:
55
  self.vect_data = pickle.load(fp)
56
+ self.scripts = pd.read_pickle("data/scripts.pkl")
57
+ self.tag_model = DialogTag("distilbert-base-uncased")
58
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
59
+ self.ranking_model = SentenceTransformer(
60
+ "Shakhovak/chatbot_sentence-transformer"
61
+ ) # # sentence-transformers/LaBSE or sentence-transformers/all-mpnet-base-v2 or Shakhovak/chatbot_sentence-transformer
62
+
63
+ self.tokenizer_reranking = AutoTokenizer.from_pretrained("bert-base-uncased")
64
  self.reranking_model = pipeline(
65
  model="Shakhovak/RerankerModel_chat_bot",
66
  device=self.device,
67
+ tokenizer=self.tokenizer_reranking,
68
  )
69
 
70
  def generate_response(self, utterance: str) -> str:
71
  """this functions identifies potential
72
  candidates for answer and ranks them"""
73
+
74
+ intent = intent_classification(utterance, utterance, self.tag_model)
75
  query_encoding = encode(
76
+ texts=utterance,
77
+ intent=intent,
78
+ model=self.ranking_model,
79
+ contexts=self.conversation_history,
80
  )
81
+ bot_cosine_scores = cosine_sim(
82
+ self.vect_data,
83
+ query_encoding,
84
+ )
85
+ top_scores, top_indexes = top_candidates(
86
+ bot_cosine_scores, intent=intent, initial_data=self.scripts, top=10
 
 
 
 
 
87
  )
88
+ print(top_scores)
89
+ if top_scores[0] < 0.9:
90
+ answer = random.choice(low_scoring_list)
91
+ self.conversation_history.clear()
92
+ else:
93
+ # test candidates and collects them with label 0 to dictionary
94
 
95
+ reranked_dict = candidates_reranking(
96
+ top_indexes,
97
+ self.conversation_history,
98
+ utterance,
99
+ self.scripts,
100
+ self.reranking_model,
101
  )
102
+ # if any candidates were selected, range them and pick up the top
103
+ # else keep up the initial top 1
104
+
105
+ if len(reranked_dict) >= 1:
106
+ updated_top_candidates = dict(
107
+ sorted(reranked_dict.items(), key=lambda item: item[1])
108
+ )
109
+ answer = self.scripts.iloc[list(updated_top_candidates.keys())[0]][
110
+ "answer"
111
+ ]
112
+ print(self.scripts.iloc[top_indexes[0]]["answer"])
113
+ else:
114
+ answer = self.scripts.iloc[top_indexes[0]]["answer"]
115
 
116
  self.conversation_history.append(utterance)
117
  self.conversation_history.append(answer)
118
 
119
  return answer
120
+
121
+
122
+ # katya = ChatBot()
123
+ # katya.load()
124
+ # katya.generate_response("hi man!")
utils.py CHANGED
@@ -4,31 +4,47 @@ from scipy import sparse
4
  import pandas as pd
5
  import pickle
6
  import random
 
 
7
 
8
 
9
- def encode(texts, model, contexts=None, do_norm=True):
10
  """function to encode texts for cosine similarity search"""
11
 
12
  question_vectors = model.encode(texts)
13
  context_vectors = model.encode("".join(contexts))
 
14
 
15
  return np.concatenate(
16
- [np.asarray(question_vectors), np.asarray(context_vectors)], axis=-1
 
 
 
 
 
17
  )
18
 
19
 
 
 
 
20
  def cosine_sim(data_vectors, query_vectors) -> list:
21
  """returns list of tuples with similarity score and
22
  script index in initial dataframe"""
 
23
  data_emb = sparse.csr_matrix(data_vectors)
24
  query_emb = sparse.csr_matrix(query_vectors)
25
  similarity = cosine_similarity(query_emb, data_emb).flatten()
26
  ind = np.argwhere(similarity)
27
  match = sorted(zip(similarity, ind.tolist()), reverse=True)
 
28
  return match
29
 
30
 
31
- def scripts_rework(path, character):
 
 
 
32
  """this functions split scripts for queation, answer, context,
33
  picks up the cahracter and saves data in pickle format"""
34
 
@@ -66,18 +82,30 @@ def scripts_rework(path, character):
66
  "context": context,
67
  }
68
 
69
- scripts = scripts.append(new_row, ignore_index=True)
70
 
71
  elif (row["person_scene"] == character) & (
72
  df.iloc[index - 1]["person_scene"] == "Scene"
73
  ):
74
  context = []
75
  new_row = {"answer": row["dialogue"], "question": "", "context": context}
76
- scripts = scripts.append(new_row, ignore_index=True)
77
  # load reworked data to pkl
 
 
 
 
 
 
 
 
 
78
  scripts.to_pickle("data/scripts.pkl")
79
 
80
 
 
 
 
81
  def encode_df_save(model):
82
  """this functions vectorizes reworked scripts and loads them to
83
  pickle file to be used as retrieval base for ranking script"""
@@ -85,21 +113,33 @@ def encode_df_save(model):
85
  scripts_reopened = pd.read_pickle("data/scripts.pkl")
86
  vect_data = []
87
  for index, row in scripts_reopened.iterrows():
88
- vect = encode(row["question"], model, row["context"])
 
 
 
 
 
89
  vect_data.append(vect)
90
  with open("data/scripts_vectors.pkl", "wb") as f:
91
  pickle.dump(vect_data, f)
92
 
93
 
94
- def top_candidates(score_lst_sorted, top=1):
 
 
 
95
  """this functions receives results of the cousine similarity ranking and
96
  returns top items' scores and their indices"""
97
-
98
- scores = [item[0] for item in score_lst_sorted]
99
- candidates_indexes = [item[1][0] for item in score_lst_sorted]
 
100
  return scores[0:top], candidates_indexes[0:top]
101
 
102
 
 
 
 
103
  def candidates_reranking(
104
  top_candidates_idx_lst, conversational_history, utterance, initial_df, pipeline
105
  ):
@@ -123,6 +163,9 @@ def candidates_reranking(
123
  return reranked_idx
124
 
125
 
 
 
 
126
  def read_files_negative(path1, path2):
127
  """this functions creates training dataset for classifier incl negative
128
  examples and saves it to the pickle file"""
@@ -155,12 +198,92 @@ def read_files_negative(path1, path2):
155
  fin_scripts["context"] = fin_scripts["context"].apply(lambda x: "".join(x))
156
  fin_scripts = fin_scripts[fin_scripts["question"] != ""]
157
  fin_scripts = fin_scripts[fin_scripts["answer"] != ""]
158
- fin_scripts["combined"] = (
159
  fin_scripts["context"]
160
  + "[SEP]"
161
  + fin_scripts["question"]
162
  + "[SEP]"
163
  + fin_scripts["answer"]
164
  )
 
 
 
 
165
  # fin_scripts = fin_scripts.dropna(how='any')
166
  fin_scripts.to_pickle("data/scripts_for_reranker.pkl")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import pandas as pd
5
  import pickle
6
  import random
7
+ from nltk.tokenize import word_tokenize
8
+ import string
9
 
10
 
11
def encode(texts, model, intent, contexts=None, do_norm=True):
    """Encode a query as one vector for cosine similarity search.

    The context, the question and the intent tag are each embedded with
    ``model.encode`` and the three embeddings are concatenated along the
    last axis, in the order context, question, intent.

    Args:
        texts: question / utterance text passed to ``model.encode``.
        model: object exposing ``encode(text)``.
        intent: intent tag, embedded like ordinary text.
        contexts: iterable of previous utterance strings, joined without a
            separator before encoding. ``None`` means "no context".
        do_norm: accepted for backward compatibility; currently unused.

    Returns:
        numpy array with the concatenated embeddings.
    """
    # The signature allows contexts=None, but "".join(None) raises
    # TypeError, so fall back to an empty context string.
    context_text = "".join(contexts) if contexts is not None else ""

    question_vectors = model.encode(texts)
    context_vectors = model.encode(context_text)
    intent_vectors = model.encode(intent)

    return np.concatenate(
        [
            np.asarray(context_vectors),
            np.asarray(question_vectors),
            np.asarray(intent_vectors),
        ],
        axis=-1,
    )
26
 
27
 
28
# ===================================================


def cosine_sim(data_vectors, query_vectors) -> list:
    """Rank the stored vectors by cosine similarity to the query.

    Args:
        data_vectors: stored script vectors, one row per script.
        query_vectors: encoded query.

    Returns:
        List of ``(score, [index])`` tuples sorted by descending score,
        where ``index`` is the position of the vector in ``data_vectors``.
        Entries whose similarity is exactly zero are left out.
    """
    data_emb = sparse.csr_matrix(data_vectors)
    query_emb = sparse.csr_matrix(query_vectors)
    similarity = cosine_similarity(query_emb, data_emb).flatten()

    # np.argwhere() lists only the non-zero positions, so zipping it with
    # the full similarity array paired every score after the first zero
    # with the wrong index. Look each score up by its own position instead.
    nonzero_positions = np.flatnonzero(similarity)
    match = sorted(
        ((similarity[pos], [int(pos)]) for pos in nonzero_positions),
        reverse=True,
    )

    return match
42
 
43
 
44
+ # ===================================================
45
+
46
+
47
+ def scripts_rework(path, character, tag_model):
48
  """this functions split scripts for queation, answer, context,
49
  picks up the cahracter and saves data in pickle format"""
50
 
 
82
  "context": context,
83
  }
84
 
85
+ scripts = pd.concat([scripts, pd.DataFrame([new_row])])
86
 
87
  elif (row["person_scene"] == character) & (
88
  df.iloc[index - 1]["person_scene"] == "Scene"
89
  ):
90
  context = []
91
  new_row = {"answer": row["dialogue"], "question": "", "context": context}
92
+ scripts = pd.concat([scripts, pd.DataFrame([new_row])])
93
  # load reworked data to pkl
94
+ scripts = scripts[scripts["question"] != ""]
95
+ scripts["answer"] = scripts["answer"].apply(lambda x: change_names(x))
96
+ scripts["tag"] = scripts[["answer", "question"]].apply(
97
+ lambda test_scripts: intent_classification(
98
+ test_scripts["question"], test_scripts["answer"], tag_model
99
+ ),
100
+ axis=1,
101
+ )
102
+ scripts = scripts.reset_index(drop=True)
103
  scripts.to_pickle("data/scripts.pkl")
104
 
105
 
106
+ # ===================================================
107
+
108
+
109
  def encode_df_save(model):
110
  """this functions vectorizes reworked scripts and loads them to
111
  pickle file to be used as retrieval base for ranking script"""
 
113
  scripts_reopened = pd.read_pickle("data/scripts.pkl")
114
  vect_data = []
115
  for index, row in scripts_reopened.iterrows():
116
+ vect = encode(
117
+ texts=row["question"],
118
+ model=model,
119
+ intent=row["tag"],
120
+ contexts=row["context"],
121
+ )
122
  vect_data.append(vect)
123
  with open("data/scripts_vectors.pkl", "wb") as f:
124
  pickle.dump(vect_data, f)
125
 
126
 
127
+ # ===================================================
128
+
129
+
130
def top_candidates(score_lst_sorted, intent, initial_data, top=1):
    """Keep the best-ranked candidates whose script row has tag ``intent``.

    Args:
        score_lst_sorted: ``(score, [row_index])`` entries, already sorted
            by score.
        intent: tag value a candidate's row must carry to be kept.
        initial_data: DataFrame with a ``tag`` column, indexed by the row
            indices used in ``score_lst_sorted``.
        top: maximum number of candidates to return.

    Returns:
        Two lists of equal length: the scores and the row indices of the
        first ``top`` matching candidates (both empty when none match).
    """
    matching_rows = initial_data.index[initial_data["tag"] == intent]

    scores = []
    candidates_indexes = []
    for entry in score_lst_sorted:
        row_idx = entry[1][0]
        if row_idx in matching_rows:
            scores.append(entry[0])
            candidates_indexes.append(row_idx)

    return scores[:top], candidates_indexes[:top]
138
 
139
 
140
+ # ===================================================
141
+
142
+
143
  def candidates_reranking(
144
  top_candidates_idx_lst, conversational_history, utterance, initial_df, pipeline
145
  ):
 
163
  return reranked_idx
164
 
165
 
166
+ # ===================================================
167
+
168
+
169
  def read_files_negative(path1, path2):
170
  """this functions creates training dataset for classifier incl negative
171
  examples and saves it to the pickle file"""
 
198
  fin_scripts["context"] = fin_scripts["context"].apply(lambda x: "".join(x))
199
  fin_scripts = fin_scripts[fin_scripts["question"] != ""]
200
  fin_scripts = fin_scripts[fin_scripts["answer"] != ""]
201
+ fin_scripts["combined_all"] = (
202
  fin_scripts["context"]
203
  + "[SEP]"
204
  + fin_scripts["question"]
205
  + "[SEP]"
206
  + fin_scripts["answer"]
207
  )
208
+
209
+ fin_scripts["combined_cq"] = (
210
+ fin_scripts["context"] + "[SEP]" + fin_scripts["question"]
211
+ )
212
  # fin_scripts = fin_scripts.dropna(how='any')
213
  fin_scripts.to_pickle("data/scripts_for_reranker.pkl")
214
+
215
+
216
+ # ===================================================
217
+
218
+
219
def intent_classification(question, answer, tag_model):
    """Return the intent tag for a question/answer pair.

    Args:
        question: text handed to ``tag_model.predict_tag`` when no greeting
            word is found.
        answer: text scanned for greeting words (lower-cased and tokenized
            first).
        tag_model: object exposing ``predict_tag(text)``.

    Returns:
        ``"greetings"`` if any token of ``answer`` is a known greeting word,
        otherwise whatever ``tag_model.predict_tag(question)`` returns.
    """
    greetings = {"hi", "hello", "greeting", "greetings", "hii", "helo", "hellow"}
    tokens = word_tokenize(answer.lower())

    # Check every token, not only the first one, and always fall through to
    # the tag model (also when the answer tokenizes to nothing).
    if any(token in greetings for token in tokens):
        return "greetings"
    return tag_model.predict_tag(question)
228
+
229
+
230
+ # ===================================================
231
+
232
+
233
def change_names(sentences):
    """Replace the listed character names in a line with "my friend".

    The text is tokenized and re-assembled: punctuation tokens are attached
    to the preceding token, every other token is preceded by a single space.

    Args:
        sentences: text to rewrite.

    Returns:
        The rewritten text with leading/trailing whitespace stripped.
    """
    lst_punct = string.punctuation
    lst_punct += "’"
    # NOTE(review): "Dr. Stephanie" is compared against single tokens and so
    # presumably never matches as a whole; kept to leave the list unchanged.
    sheldon_friends = [
        "Penny",
        "Amy",
        "Leonard",
        "Stephanie",
        "Dr. Stephanie",
        "Raj",
        "Rebecca",
    ]
    tokens = word_tokenize(sentences)

    pieces = []
    for token in tokens:
        if token in sheldon_friends:
            # Leading space so the replacement does not glue itself to the
            # previous word ("Hi Penny" used to become "Himy friend").
            pieces.append(" my friend")
        elif token in lst_punct:
            pieces.append(token)
        else:
            pieces.append(f" {token}")

    changes = "".join(pieces).strip()
    return changes
251
+
252
+
253
+ # ===================================================
254
+
255
+
256
def data_prep_biencoder(path1, path2):
    """Build the bi-encoder training set with negative answers and pickle it.

    Every row of ``data/scripts.pkl`` is paired with a randomly drawn
    "negative" answer. The result is written to
    ``data/scripts_for_biencoder.pkl``.

    Args:
        path1: iterable of CSV paths, read with ``sep='"'``; their
            ``dialogue`` column supplies negative lines.
        path2: CSV path whose ``line`` column supplies negative lines.

    Raises:
        ValueError: if there are fewer negative lines than script rows.
    """
    star_wars = [pd.read_csv(file, sep='"', on_bad_lines="warn") for file in path1]
    total = pd.concat(star_wars, ignore_index=True)

    rick_and_morty = pd.read_csv(path2)
    negative_lines_to_add = list(rick_and_morty["line"])
    negative_lines_to_add.extend(list(total["dialogue"]))

    scripts_reopened = pd.read_pickle("data/scripts.pkl")

    # Questions from the scripts themselves are reused as extra negatives.
    # Cap the sample size so a smaller script file does not make
    # random.sample() raise.
    questions = list(scripts_reopened[scripts_reopened["question"] != ""]["question"])
    source = random.sample(questions, min(7062, len(questions)))
    negative_lines_to_add.extend(source)
    random.shuffle(negative_lines_to_add)

    # Work on a copy: assigning columns on a slice of scripts_reopened is
    # chained assignment and triggers pandas' SettingWithCopyWarning.
    scripts_negative = scripts_reopened[["question", "context", "answer"]].copy()
    scripts_negative["label"] = 1

    if len(negative_lines_to_add) < len(scripts_negative):
        raise ValueError(
            "not enough negative lines: "
            f"{len(negative_lines_to_add)} for {len(scripts_negative)} script rows"
        )
    scripts_negative["neg_answer"] = negative_lines_to_add[0 : len(scripts_negative)]

    fin_scripts = scripts_negative.sample(frac=1).reset_index(drop=True)
    fin_scripts["context"] = fin_scripts["context"].apply(lambda x: "".join(x))
    fin_scripts = fin_scripts[fin_scripts["question"] != ""]
    fin_scripts = fin_scripts[fin_scripts["answer"] != ""]

    fin_scripts["combined"] = fin_scripts["context"] + "[SEP]" + fin_scripts["question"]
    fin_scripts.to_pickle("data/scripts_for_biencoder.pkl")