File size: 4,501 Bytes
b942bef
5499fd6
b942bef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5499fd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b942bef
 
 
 
 
 
 
 
 
 
5499fd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b942bef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5499fd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b942bef
 
 
5499fd6
b942bef
 
 
 
 
 
 
5499fd6
b942bef
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# -*- coding: utf-8 -*-
"""gradio_bert.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/12KZGcYbsXlMWYC8U4aeR_Ex0u8fJLgly

# libraries
"""


import gradio as gr
import torch
from transformers import pipeline
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import re

"""# data - text"""

# Pre-split corpus of document sections; the "section_text" column is what
# the retrieval step below searches over.
# NOTE(review): hard-coded Colab path — adjust when running outside Colab.
splitted_df = pd.read_csv('/content/splitted_df_jo.csv')

"""# getting context"""

def remove_symbols(text: str) -> str:
    """
    Removes specified symbols and non-ASCII characters from the input text.

    Args:
        text (str): The input text to be cleaned.

    Returns:
        str: The cleaned text with specified symbols and non-ASCII characters removed.

    Example:
        >>> text = "This is a test string / with (some) symbols and é and ñ."
        >>> remove_symbols(text)
        'This is a test string  with some symbols and  and '
    """
    # BUG FIX: the original docstring was indented 3 spaces while the body was
    # indented 4, which raises IndentationError when run as a plain .py file.
    # Characters to strip out entirely.
    remove_list = ['/', '(', ')', '\n', '.']
    remove_chars = "".join(remove_list)
    cleaned_text = "".join([char for char in text if char not in remove_chars])

    # Remove non-ASCII characters (accented letters, emoji, ...).
    pattern_ascii = r'[^\x00-\x7F]'  # Matches any character outside the ASCII range
    filtered_text = re.sub(pattern_ascii, '', cleaned_text)

    return filtered_text


def context_func(message: str) -> str:
    """
    Finds the most similar context from a collection of texts based on
    TF-IDF vectorization and cosine similarity.

    Args:
        message (str): The input message or question.

    Returns:
        str: The most similar context to the input message from the
        collection of texts, with symbols/non-ASCII characters removed.

    Example:
        >>> similar_context = context_func("What are the symptoms of breast cancer?")
    """
    # NOTE(review): the vectorizer is re-fit on the whole corpus on every
    # call; for a large corpus, fit once at module load and reuse.
    vectorizer = TfidfVectorizer()

    # Convert corpus sections and the question to TF-IDF vectors.
    text_tfidf = vectorizer.fit_transform(splitted_df["section_text"])
    question_tfidf = vectorizer.transform([message])

    # Cosine similarity between the question and every section.
    similarities = cosine_similarity(question_tfidf, text_tfidf)[0]

    # argmax() yields a POSITIONAL index, so use .iloc rather than
    # label-based indexing — the original `[most_similar_index]` breaks
    # whenever the DataFrame index is not a default RangeIndex.
    most_similar_index = similarities.argmax()
    most_similar_context = splitted_df["section_text"].iloc[most_similar_index]

    return remove_symbols(most_similar_context)

"""# the model"""

# BUG FIX: the original Colab cell ran `!huggingface-cli login`, which is
# IPython shell magic and a SyntaxError in a plain .py file. Authenticate
# out of band before running this script (e.g. `huggingface-cli login` in a
# terminal, or set the HF_TOKEN environment variable).

# Load the fine-tuned extractive-QA model and its tokenizer from the Hub.
tokenizer = AutoTokenizer.from_pretrained("nlp-group/sindi-bert-final")
model = AutoModelForQuestionAnswering.from_pretrained("nlp-group/sindi-bert-final")

def answer_question(question: str) -> tuple:
    """
    Generates an answer to the input question based on the retrieved context.

    Args:
        question (str): The input question.

    Returns:
        tuple: (answer, context) — the extracted answer string and the
        context passage it was extracted from.

    Example:
        >>> answer, context = answer_question("What is the capital of France?")
        >>> print("Answer:", answer)
        >>> print("Context:", context)
    """
    # BUG FIX: the original signature `-> str, str` is invalid syntax, and
    # the docstring (2-space indent) did not match the body (4-space indent),
    # raising IndentationError when run as a plain .py file.
    context = context_func(question)

    # Tokenize the (question, context) pair; truncate to the model's limit.
    inputs = tokenizer(question, context, return_tensors="pt", max_length=512, truncation=True)

    # Run the QA head without tracking gradients — inference only.
    with torch.no_grad():
        outputs = model(**inputs)

    # Highest-scoring start/end positions bound the answer span
    # (end is exclusive, hence the +1).
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end])

    return answer, context

# BUG FIX: answer_question returns a 2-tuple (answer, context) but the
# original interface declared only ONE output component, which makes Gradio
# fail at request time. Expose both values as separate text boxes.
iface = gr.Interface(
    fn=answer_question,
    inputs=["text"],
    outputs=[gr.Textbox(label="Answer"), gr.Textbox(label="Context")],
    title="Women Cancer ChatBot",
    description="How can I help you?",
    examples=[
        ["What is breast cancer?"],
        ["What are treatments for cervical cancer?"],
    ],
)

iface.launch(debug=True)