File size: 3,961 Bytes
3fb88a6
 
8e44dd8
3fb88a6
8e44dd8
 
 
 
 
 
 
3fb88a6
 
 
decf09e
3fb88a6
8e44dd8
3fb88a6
 
 
 
 
 
 
 
8e44dd8
3fb88a6
 
 
 
decf09e
3fb88a6
 
 
 
 
 
 
 
8e44dd8
e8c8539
decf09e
8e44dd8
3fb88a6
8e44dd8
 
 
 
 
3fb88a6
 
 
8e44dd8
3fb88a6
 
 
 
 
8e44dd8
 
3fb88a6
8e44dd8
 
 
 
3fb88a6
8e44dd8
 
 
 
 
decf09e
3fb88a6
decf09e
 
 
 
 
 
 
8e44dd8
 
3fb88a6
8e44dd8
 
 
 
 
 
3fb88a6
8e44dd8
 
 
 
 
 
 
 
 
 
 
 
3fb88a6
 
 
 
 
8e44dd8
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import pandas as pd
import pickle
import random
from sentence_transformers import SentenceTransformer
from utils import (
    encode,
    cosine_sim,
    top_candidates,
    candidates_reranking,
    intent_classification,
)
from collections import deque
from transformers import pipeline
import torch
import json
from transformers import AutoTokenizer
from dialog_tag import DialogTag

# This class implements the main functions of the retrieval chat bot.

class ChatBot:
    """Retrieval chat bot that answers with lines taken from a scripts table.

    Typical use: create the instance, call :meth:`load` once to read the data
    files and models, then call :meth:`generate_response` for every user
    utterance. The bot keeps a short rolling history of the conversation
    (user utterances and its own answers) and passes it to the encoding and
    reranking helpers as context.
    """

    # Maximum number of messages (user utterances and bot answers together)
    # kept in the rolling conversation history.
    HISTORY_SIZE = 5

    # If the best retrieval score is below this value, a canned reply from
    # the low-scoring list is returned instead of a retrieved answer.
    MIN_RETRIEVAL_SCORE = 0.9

    def __init__(self) -> None:
        """Create an empty bot; call :meth:`load` before asking for answers."""
        self.vect_data = []
        self.scripts = []
        self.conversation_history = deque([], maxlen=self.HISTORY_SIZE)
        self.tag_model = None
        self.ranking_model = None
        self.reranking_model = None
        self.device = None
        # Kept for backward compatibility; nothing in this class reads it.
        self.tokenizer = None
        # Tokenizer handed to the reranking pipeline; set by load().
        self.tokenizer_reranking = None
        self.low_scoring_list = None

    def load(self) -> None:
        """Load every dataset and model the bot needs.

        Data files are read from the ``data`` folder relative to the current
        working directory; models and the tokenizer are loaded by name
        through their respective libraries.

        Raises:
            OSError: if one of the data files cannot be opened.
        """
        # NOTE(security): unpickling executes arbitrary code embedded in the
        # file. Only load pickle files that come from a trusted source.
        with open("data/scripts_vectors.pkl", "rb") as fp:
            self.vect_data = pickle.load(fp)
        self.scripts = pd.read_pickle("data/scripts.pkl")

        # Explicit encoding so the result does not depend on the platform's
        # default locale.
        with open("data/low_score_sripts.json", "r", encoding="utf-8") as f:
            self.low_scoring_list = json.load(f)

        self.tag_model = DialogTag("distilbert-base-uncased")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Alternatives tried for the ranking model: sentence-transformers/LaBSE,
        # sentence-transformers/all-mpnet-base-v2.
        self.ranking_model = SentenceTransformer(
            "Shakhovak/chatbot_sentence-transformer"
        )

        self.tokenizer_reranking = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.reranking_model = pipeline(
            model="Shakhovak/RerankerModel_chat_bot",
            device=self.device,
            tokenizer=self.tokenizer_reranking,
        )

    def generate_response(self, utterance: str) -> str:
        """Pick an answer for ``utterance`` and record the turn in the history.

        The utterance is classified, encoded together with the conversation
        history and compared against the stored script vectors. When the best
        candidate scores below ``MIN_RETRIEVAL_SCORE`` (or there is no
        candidate at all) a random canned reply is returned and the history
        is reset; otherwise the candidates are reranked and the chosen
        script answer is returned.

        Args:
            utterance: the user's message.

        Returns:
            The bot's answer.

        Raises:
            RuntimeError: if :meth:`load` has not been called yet.
        """
        self._ensure_loaded()

        intent = intent_classification(utterance, utterance, self.tag_model)
        query_encoding = encode(
            texts=utterance,
            intent=intent,
            model=self.ranking_model,
            contexts=self.conversation_history,
        )
        bot_cosine_scores = cosine_sim(self.vect_data, query_encoding)
        top_scores, top_indexes = top_candidates(
            bot_cosine_scores, intent=intent, initial_data=self.scripts, top=5
        )

        # len() rather than truthiness: the helper may return an array-like
        # object whose truth value is ambiguous.
        no_candidates = len(top_scores) == 0
        if no_candidates or top_scores[0] < self.MIN_RETRIEVAL_SCORE:
            answer = self._fallback_answer(intent)
        else:
            answer = self._best_candidate_answer(top_indexes, utterance)

        self.conversation_history.append(utterance)
        self.conversation_history.append(answer)

        return answer

    def _ensure_loaded(self) -> None:
        """Raise a clear error if the resources set by load() are missing."""
        required = {
            "tag_model": self.tag_model,
            "ranking_model": self.ranking_model,
            "reranking_model": self.reranking_model,
            "low_scoring_list": self.low_scoring_list,
        }
        missing = [name for name, value in required.items() if value is None]
        if missing:
            raise RuntimeError(
                "ChatBot.load() must be called before generate_response(); "
                f"missing: {', '.join(missing)}"
            )

    def _fallback_answer(self, intent) -> str:
        """Return a random canned reply and reset the conversation history."""
        key = "greetings" if intent == "greetings" else "generic"
        answer = random.choice(self.low_scoring_list[key])
        # The retrieved context was not good enough, so start over.
        self.conversation_history.clear()
        return answer

    def _best_candidate_answer(self, top_indexes, utterance: str) -> str:
        """Rerank the top candidates and return the chosen script answer."""
        reranked = candidates_reranking(
            top_indexes,
            self.conversation_history,
            utterance,
            self.scripts,
            self.reranking_model,
        )
        if reranked:
            # NOTE(review): the candidate with the LOWEST value wins, exactly
            # as the previous ascending sort did; confirm against
            # candidates_reranking that lower really means better.
            best_index = min(reranked.items(), key=lambda item: item[1])[0]
        else:
            # Nothing survived reranking: keep the initial top candidate.
            best_index = top_indexes[0]
        return self.scripts.iloc[best_index]["answer"]


# katya = ChatBot()
# katya.load()
# katya.generate_response("hi man!")