Spaces:

Shakhovak
/

Sheldon_Retrieval_chat_bot

Sleeping

App Files Files Community

shakhovak committed on Feb 15

Commit

8e44dd8

•

1 Parent(s): 69f2a1b

bienc+intent_added

Browse files

Files changed (5) hide show

data/scripts.pkl +2 -2
data/scripts_vectors.pkl +2 -2
requirements.txt +2 -1
retrieve_bot.py +77 -25
utils.py +134 -11

data/scripts.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fd8ded525a9faf9031e899ba75c5b7f91fdc4052619a43ca1ff608a7cce73b42
-size 2127113

 version https://git-lfs.github.com/spec/v1
+oid sha256:2df9355dd53669d082cecbdcfabee2cedba4527b0dfafcc086d7da479f78be48
+size 3031195

data/scripts_vectors.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ba242c25adc032bcf265fa1c805bf1f506150f181a6fc13f6753088af79cd9c7
-size 71223174

 version https://git-lfs.github.com/spec/v1
+oid sha256:f3452fc927c68cb4cfd2a3eacd1b86158cb64696ef41b419ef45d50f9946196b
+size 100818899

requirements.txt CHANGED Viewed

@@ -4,4 +4,5 @@ pandas==1.3.5
 gunicorn==20.1.0
 requests==2.27.
 datasets==2.13.2
-transformers==4.37.2

 gunicorn==20.1.0
 requests==2.27.
 datasets==2.13.2
+transformers==4.37.2
+DialogTag==1.1.3

retrieve_bot.py CHANGED Viewed

@@ -1,20 +1,46 @@
 import pandas as pd
 import pickle
 from sentence_transformers import SentenceTransformer
-from utils import encode, cosine_sim, top_candidates, candidates_reranking
 from collections import deque
 from transformers import pipeline
 import torch
 from transformers import AutoTokenizer
 # this class representes main functions of retrieve bot
 class ChatBot:
     def __init__(self):
         self.vect_data = []
         self.scripts = []
         self.conversation_history = deque([], maxlen=5)
         self.ranking_model = None
         self.reranking_model = None
         self.device = None
@@ -27,46 +53,72 @@ class ChatBot:
         with open("data/scripts_vectors.pkl", "rb") as fp:
             self.vect_data = pickle.load(fp)
-            self.scripts = pd.read_pickle("data/scripts.pkl")
-        self.ranking_model = SentenceTransformer("sentence-transformers/LaBSE")
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
         self.reranking_model = pipeline(
             model="Shakhovak/RerankerModel_chat_bot",
             device=self.device,
-            tokenizer=self.tokenizer,
         )
     def generate_response(self, utterance: str) -> str:
         """this functions identifies potential
         candidates for answer and ranks them"""
         query_encoding = encode(
-            utterance, self.ranking_model, contexts=self.conversation_history
         )
-        bot_cosine_scores = cosine_sim(self.vect_data, query_encoding)
-        top_scores, top_indexes = top_candidates(bot_cosine_scores, top=5)
-        # test candidates and collects them with label 0 to dictionary
-        reranked_dict = candidates_reranking(
-            top_indexes,
-            self.conversation_history,
-            utterance,
-            self.scripts,
-            self.reranking_model,
         )
-        # if any candidates were selected, range them and pick up the top
-        # else keep up the initial top 1
-        if len(reranked_dict) >= 1:
-            updated_top_candidates = dict(
-                sorted(reranked_dict.items(), key=lambda item: item[1])
             )
-            answer = self.scripts.iloc[list(updated_top_candidates.keys())[0]]["answer"]
-        else:
-            answer = self.scripts.iloc[top_indexes[0]]["answer"]
         self.conversation_history.append(utterance)
         self.conversation_history.append(answer)
         return answer

 import pandas as pd
 import pickle
+import random
 from sentence_transformers import SentenceTransformer
+from utils import (
+    encode,
+    cosine_sim,
+    top_candidates,
+    candidates_reranking,
+    intent_classification,
+)
 from collections import deque
 from transformers import pipeline
 import torch
 from transformers import AutoTokenizer
+from dialog_tag import DialogTag
 # this class represents the main functions of the retrieval bot
+low_scoring_list = [
+    "What does it mean?",
+    "You have two strikes. Three strikes and you’ re out. It’ s a sports metaphor. Explain again!",
+    "Again, urban slang. In which, I believe I’ m gaining remarkable fluency. So, could you repeat?",
+    "I’m confused.",
+    "I can’t comment without violating our agreement that I don’ t criticize you.",
+    "Oh!",
+    "I need to use the restroom.",
+    "Move. Move. Move!",
+    "I was going to mention it at the time, but then I thought, some day maybe...",
+    "Well...",
+    "Apparently... I have no idea!?",
+    "I’m not sure...",
+    "Nothing. I say nothing.",
+    "Well, my friend. Focus and repeat!",
+]
 class ChatBot:
     def __init__(self):
         self.vect_data = []
         self.scripts = []
         self.conversation_history = deque([], maxlen=5)
+        self.tag_model = None
         self.ranking_model = None
         self.reranking_model = None
         self.device = None
         with open("data/scripts_vectors.pkl", "rb") as fp:
             self.vect_data = pickle.load(fp)
+        self.scripts = pd.read_pickle("data/scripts.pkl")
+        self.tag_model = DialogTag("distilbert-base-uncased")
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.ranking_model = SentenceTransformer(
+         "Shakhovak/chatbot_sentence-transformer"
+         )  # # sentence-transformers/LaBSE  or sentence-transformers/all-mpnet-base-v2 or Shakhovak/chatbot_sentence-transformer
+        self.tokenizer_reranking = AutoTokenizer.from_pretrained("bert-base-uncased")
         self.reranking_model = pipeline(
             model="Shakhovak/RerankerModel_chat_bot",
             device=self.device,
+            tokenizer=self.tokenizer_reranking,
         )
     def generate_response(self, utterance: str) -> str:
         """Pick an answer for ``utterance`` and record the turn in the history.

         The utterance is tagged with an intent, encoded together with the
         conversation history, and compared against the stored script vectors.
         Up to ten candidates sharing the intent are retrieved. If there is no
         candidate at all, or the best cosine score is below 0.9, a random
         phrase from ``low_scoring_list`` is returned and the history is
         cleared. Otherwise the candidates are re-ranked and the one with the
         lowest re-ranking value is returned; when re-ranking keeps nothing,
         the top cosine candidate is used.

         Args:
             utterance: the user's message.

         Returns:
             The selected answer text.
         """
         intent = intent_classification(utterance, utterance, self.tag_model)
         query_encoding = encode(
             texts=utterance,
             intent=intent,
             model=self.ranking_model,
             contexts=self.conversation_history,
         )
         bot_cosine_scores = cosine_sim(self.vect_data, query_encoding)
         top_scores, top_indexes = top_candidates(
             bot_cosine_scores, intent=intent, initial_data=self.scripts, top=10
         )
         print(top_scores)  # debug output kept from the original implementation
         # top_candidates returns empty lists when no stored row carries the
         # predicted intent; indexing top_scores[0] would raise IndexError, so
         # an empty result is treated like a low score.
         if not top_scores or top_scores[0] < 0.9:
             answer = random.choice(low_scoring_list)
             self.conversation_history.clear()
         else:
             # test candidates and collect the accepted ones in a dictionary
             reranked_dict = candidates_reranking(
                 top_indexes,
                 self.conversation_history,
                 utterance,
                 self.scripts,
                 self.reranking_model,
             )
             if len(reranked_dict) >= 1:
                 # smallest value wins; min() keeps the first entry on ties,
                 # exactly like taking the head of an ascending stable sort
                 best_idx = min(reranked_dict.items(), key=lambda item: item[1])[0]
                 answer = self.scripts.iloc[best_idx]["answer"]
                 print(self.scripts.iloc[top_indexes[0]]["answer"])
             else:
                 # nothing survived re-ranking: fall back to the top cosine hit
                 answer = self.scripts.iloc[top_indexes[0]]["answer"]
         self.conversation_history.append(utterance)
         self.conversation_history.append(answer)
         return answer
+# katya = ChatBot()
+# katya.load()
+# katya.generate_response("hi man!")

utils.py CHANGED Viewed

@@ -4,31 +4,47 @@ from scipy import sparse
 import pandas as pd
 import pickle
 import random
-def encode(texts, model, contexts=None, do_norm=True):
     """function to encode texts for cosine similarity search"""
     question_vectors = model.encode(texts)
     context_vectors = model.encode("".join(contexts))
     return np.concatenate(
-        [np.asarray(question_vectors), np.asarray(context_vectors)], axis=-1
     )
 def cosine_sim(data_vectors, query_vectors) -> list:
     """returns list of tuples with similarity score and
     script index in initial dataframe"""
     data_emb = sparse.csr_matrix(data_vectors)
     query_emb = sparse.csr_matrix(query_vectors)
     similarity = cosine_similarity(query_emb, data_emb).flatten()
     ind = np.argwhere(similarity)
     match = sorted(zip(similarity, ind.tolist()), reverse=True)
     return match
-def scripts_rework(path, character):
     """this functions split scripts for queation, answer, context,
     picks up the cahracter and saves data in pickle format"""
@@ -66,18 +82,30 @@ def scripts_rework(path, character):
                 "context": context,
             }
-            scripts = scripts.append(new_row, ignore_index=True)
         elif (row["person_scene"] == character) & (
             df.iloc[index - 1]["person_scene"] == "Scene"
         ):
             context = []
             new_row = {"answer": row["dialogue"], "question": "", "context": context}
-            scripts = scripts.append(new_row, ignore_index=True)
     # load reworked data to pkl
     scripts.to_pickle("data/scripts.pkl")
 def encode_df_save(model):
     """this functions vectorizes reworked scripts and loads them to
     pickle file to be used as retrieval base for ranking script"""
@@ -85,21 +113,33 @@ def encode_df_save(model):
     scripts_reopened = pd.read_pickle("data/scripts.pkl")
     vect_data = []
     for index, row in scripts_reopened.iterrows():
-        vect = encode(row["question"], model, row["context"])
         vect_data.append(vect)
     with open("data/scripts_vectors.pkl", "wb") as f:
         pickle.dump(vect_data, f)
-def top_candidates(score_lst_sorted, top=1):
     """this functions receives results of the cousine similarity ranking and
     returns top items' scores and their indices"""
-    scores = [item[0] for item in score_lst_sorted]
-    candidates_indexes = [item[1][0] for item in score_lst_sorted]
     return scores[0:top], candidates_indexes[0:top]
 def candidates_reranking(
     top_candidates_idx_lst, conversational_history, utterance, initial_df, pipeline
 ):
@@ -123,6 +163,9 @@ def candidates_reranking(
     return reranked_idx
 def read_files_negative(path1, path2):
     """this functions creates training dataset for classifier incl negative
     examples and saves it to the pickle file"""
@@ -155,12 +198,92 @@ def read_files_negative(path1, path2):
     fin_scripts["context"] = fin_scripts["context"].apply(lambda x: "".join(x))
     fin_scripts = fin_scripts[fin_scripts["question"] != ""]
     fin_scripts = fin_scripts[fin_scripts["answer"] != ""]
-    fin_scripts["combined"] = (
         fin_scripts["context"]
         + "[SEP]"
         + fin_scripts["question"]
         + "[SEP]"
         + fin_scripts["answer"]
     )
     # fin_scripts = fin_scripts.dropna(how='any')
     fin_scripts.to_pickle("data/scripts_for_reranker.pkl")

 import pandas as pd
 import pickle
 import random
+from nltk.tokenize import word_tokenize
+import string
def encode(texts, model, intent, contexts=None, do_norm=True):
    """Encode a question, its context and its intent into one vector.

    Args:
        texts: question text passed to ``model.encode``.
        model: object exposing ``encode(text)``.
        intent: intent label, encoded with the same model.
        contexts: iterable of context strings, joined without a separator
            before encoding. ``None`` (the default) is treated as no context.
        do_norm: currently unused; kept so existing callers keep working.

    Returns:
        The context, question and intent encodings concatenated, in that
        order, along the last axis.
    """
    # The default contexts=None used to crash inside "".join(); an absent
    # context is encoded as the empty string, same as an empty list.
    context_text = "" if contexts is None else "".join(contexts)
    question_vectors = model.encode(texts)
    context_vectors = model.encode(context_text)
    intent_vectors = model.encode(intent)
    return np.concatenate(
        [
            np.asarray(context_vectors),
            np.asarray(question_vectors),
            np.asarray(intent_vectors),
        ],
        axis=-1,
    )
+# ===================================================
 def cosine_sim(data_vectors, query_vectors) -> list:
     """Rank stored vectors by cosine similarity to the query.

     Args:
         data_vectors: the stored vectors, one per script row.
         query_vectors: the encoded query.

     Returns:
         A list of ``(score, [row_position])`` tuples sorted by descending
         score, one tuple per stored vector.
     """
     data_emb = sparse.csr_matrix(data_vectors)
     query_emb = sparse.csr_matrix(query_vectors)
     similarity = cosine_similarity(query_emb, data_emb).flatten()
     # Pair every score with its own position. The previous
     # np.argwhere(similarity) skipped zero-valued entries, so zipping it with
     # the full score array shifted the indices of all later rows and dropped
     # the tail of the list.
     match = sorted(
         ((score, [idx]) for idx, score in enumerate(similarity)),
         reverse=True,
     )
     return match
+# ===================================================
+def scripts_rework(path, character, tag_model):
     """this functions split scripts for queation, answer, context,
     picks up the cahracter and saves data in pickle format"""
                 "context": context,
             }
+            scripts = pd.concat([scripts, pd.DataFrame([new_row])])
         elif (row["person_scene"] == character) & (
             df.iloc[index - 1]["person_scene"] == "Scene"
         ):
             context = []
             new_row = {"answer": row["dialogue"], "question": "", "context": context}
+            scripts = pd.concat([scripts, pd.DataFrame([new_row])])
     # load reworked data to pkl
+    scripts = scripts[scripts["question"] != ""]
+    scripts["answer"] = scripts["answer"].apply(lambda x: change_names(x))
+    scripts["tag"] = scripts[["answer", "question"]].apply(
+        lambda test_scripts: intent_classification(
+            test_scripts["question"], test_scripts["answer"], tag_model
+        ),
+        axis=1,
+    )
+    scripts = scripts.reset_index(drop=True)
     scripts.to_pickle("data/scripts.pkl")
+# ===================================================
 def encode_df_save(model):
     """this functions vectorizes reworked scripts and loads them to
     pickle file to be used as retrieval base for ranking script"""
     scripts_reopened = pd.read_pickle("data/scripts.pkl")
     vect_data = []
     for index, row in scripts_reopened.iterrows():
+        vect = encode(
+            texts=row["question"],
+            model=model,
+            intent=row["tag"],
+            contexts=row["context"],
+        )
         vect_data.append(vect)
     with open("data/scripts_vectors.pkl", "wb") as f:
         pickle.dump(vect_data, f)
+# ===================================================
def top_candidates(score_lst_sorted, intent, initial_data, top=1):
    """Keep the best-ranked candidates whose row is tagged with ``intent``.

    ``score_lst_sorted`` holds ``(score, [row_index])`` entries in ranking
    order. Entries whose row index is not labelled ``intent`` in
    ``initial_data["tag"]`` are dropped, and the first ``top`` survivors are
    returned as two parallel lists: scores and row indices.
    """
    rows_with_intent = initial_data.index[initial_data["tag"] == intent]
    best = [
        (entry[0], entry[1][0])
        for entry in score_lst_sorted
        if entry[1][0] in rows_with_intent
    ][0:top]
    return [pair[0] for pair in best], [pair[1] for pair in best]
+# ===================================================
 def candidates_reranking(
     top_candidates_idx_lst, conversational_history, utterance, initial_df, pipeline
 ):
     return reranked_idx
+# ===================================================
 def read_files_negative(path1, path2):
     """this functions creates training dataset for classifier incl negative
     examples and saves it to the pickle file"""
     fin_scripts["context"] = fin_scripts["context"].apply(lambda x: "".join(x))
     fin_scripts = fin_scripts[fin_scripts["question"] != ""]
     fin_scripts = fin_scripts[fin_scripts["answer"] != ""]
+    fin_scripts["combined_all"] = (
         fin_scripts["context"]
         + "[SEP]"
         + fin_scripts["question"]
         + "[SEP]"
         + fin_scripts["answer"]
     )
+    fin_scripts["combined_cq"] = (
+        fin_scripts["context"] + "[SEP]" + fin_scripts["question"]
+    )
     # fin_scripts = fin_scripts.dropna(how='any')
     fin_scripts.to_pickle("data/scripts_for_reranker.pkl")
+# ===================================================
def intent_classification(question, answer, tag_model):
    """Return the intent label for a question/answer pair.

    The label is ``"greetings"`` when any token of ``answer`` (lower-cased)
    is a known greeting word; otherwise it is whatever
    ``tag_model.predict_tag(question)`` returns.

    The previous loop returned during its first iteration, so only the first
    token was ever inspected, and an ``answer`` with no tokens fell through
    and returned ``None``.

    NOTE: ``scripts_rework`` stores labels produced by this function in the
    ``tag`` column, so the pickled scripts should be rebuilt after this
    change to keep stored and query-time labels consistent.

    Args:
        question: text handed to the tag model.
        answer: text scanned for greeting words.
        tag_model: object exposing ``predict_tag(text)``.
    """
    greetings = {"hi", "hello", "greeting", "greetings", "hii", "helo", "hellow"}
    if any(token in greetings for token in word_tokenize(answer.lower())):
        return "greetings"
    return tag_model.predict_tag(question)
+# ===================================================
def change_names(sentences):
    """Replace the listed character names in ``sentences`` with "my friend".

    The text is tokenized and re-assembled: tokens found in the punctuation
    string are appended directly to the preceding text, every other token is
    preceded by a single space, and the result is stripped.

    Fixes over the previous version:
    * the replacement is now preceded by a space like any other word
      (it used to fuse with the previous token, e.g. "Hellomy friend");
    * names containing a space ("Dr. Stephanie") cannot equal a single token,
      so they are substituted in the raw text before tokenizing.
    """
    replacement = "my friend"
    lst_punct = string.punctuation
    lst_punct += "’"
    sheldon_friends = [
        "Penny",
        "Amy",
        "Leonard",
        "Stephanie",
        "Dr. Stephanie",
        "Raj",
        "Rebecca",
    ]
    # multi-word names are handled on the raw string, before tokenization
    for name in sheldon_friends:
        if " " in name:
            sentences = sentences.replace(name, replacement)
    single_token_names = {name for name in sheldon_friends if " " not in name}

    pieces = []
    for token in word_tokenize(sentences):
        if token in single_token_names:
            pieces.append(f" {replacement}")
        elif token in lst_punct:
            # punctuation is glued to the preceding text
            pieces.append(token)
        else:
            pieces.append(f" {token}")
    return "".join(pieces).strip()
+# ===================================================
def data_prep_biencoder(path1, path2, sample_size=7062):
    """Build the bi-encoder training set with one negative answer per row.

    Negative answers are drawn from three pools that are merged and shuffled:
    the ``dialogue`` column of the CSV files in ``path1``, the ``line`` column
    of the CSV at ``path2``, and a random sample of this bot's own questions
    from ``data/scripts.pkl``. The result is written to
    ``data/scripts_for_biencoder.pkl``.

    Args:
        path1: iterable of CSV paths, read with ``sep='"'``; each must provide
            a ``dialogue`` column.
        path2: CSV path providing a ``line`` column.
        sample_size: how many of the bot's own questions to add to the
            negative pool. Capped at the number of available questions,
            because ``random.sample`` raises ValueError for a larger request.
    """
    external_frames = [
        pd.read_csv(file, sep='"', on_bad_lines="warn") for file in path1
    ]
    total = pd.concat(external_frames, ignore_index=True)
    rick_and_morty = pd.read_csv(path2)
    negative_lines_to_add = list(rick_and_morty["line"])
    negative_lines_to_add.extend(list(total["dialogue"]))

    scripts_reopened = pd.read_pickle("data/scripts.pkl")
    own_questions = list(
        scripts_reopened[scripts_reopened["question"] != ""]["question"]
    )
    source = random.sample(own_questions, min(sample_size, len(own_questions)))
    negative_lines_to_add.extend(source)
    random.shuffle(negative_lines_to_add)

    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of scripts_reopened
    scripts_negative = scripts_reopened[["question", "context", "answer"]].copy()
    scripts_negative["label"] = 1
    scripts_negative["neg_answer"] = negative_lines_to_add[0 : len(scripts_negative)]

    fin_scripts = scripts_negative.sample(frac=1).reset_index(drop=True)
    fin_scripts["context"] = fin_scripts["context"].apply(lambda x: "".join(x))
    fin_scripts = fin_scripts[fin_scripts["question"] != ""]
    fin_scripts = fin_scripts[fin_scripts["answer"] != ""]
    fin_scripts["combined"] = fin_scripts["context"] + "[SEP]" + fin_scripts["question"]
    # fin_scripts = fin_scripts.dropna(how='any')
    fin_scripts.to_pickle("data/scripts_for_biencoder.pkl")