Spaces:
Paused
Paused
#import | |
# from transformers import TFAutoModelForQuestionAnswering | |
# from transformers import AutoTokenizer | |
from transformers import pipeline | |
import glob | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.metrics.pairwise import cosine_similarity | |
import gradio as gr | |
# get corpus
# Every *.txt file under data/ becomes one document in the corpus.
# The paths are sorted because glob returns them in arbitrary,
# filesystem-dependent order; sorting keeps each document's index in
# `corpus` stable across runs and hosts.
file_paths = sorted(glob.glob('data/*.txt'))
corpus = []
for file_path in file_paths:
    # Explicit encoding: without it open() uses the host's locale default,
    # so the same file could decode differently (or fail) on another machine.
    # NOTE(review): assumes the data files are UTF-8 - confirm.
    with open(file_path, 'r', encoding='utf-8') as file:
        corpus.append(file.read())
# vectorise
# Learn the vocabulary and IDF weights from the corpus and encode the
# documents as a TF-IDF matrix. The fitted vectorizer is kept so that
# queries can later be transformed with the same vocabulary.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(raw_documents=corpus)
# doc retrieval
def doc_retrieve(query, tfidf_matrix):
    """Return the corpus document most similar to ``query``.

    The query is transformed with the module-level ``vectorizer`` and scored
    against every row of ``tfidf_matrix`` by cosine similarity.

    Args:
        query: Free-text question used to search the corpus.
        tfidf_matrix: TF-IDF matrix to score against. The result is
            ``corpus[i]`` for the best-scoring row ``i``, so the rows must
            line up with ``corpus``.

    Returns:
        The full text of the best-scoring document. When several documents
        share the top score, the one with the lowest index is returned.
    """
    query_tfidf = vectorizer.transform([query])
    # A single query goes in, so only the first row of the score matrix
    # is relevant.
    similarity_scores = cosine_similarity(query_tfidf, tfidf_matrix)[0]
    # Only the single best match is needed, so select it directly instead of
    # sorting every score and copying the whole corpus into a ranked list.
    return corpus[similarity_scores.argmax()]
# gradio interface
title = "D/HD Task NLP"
desc = """An (attempt at) NLP chatbot that answers questions on information about the Computer Science degree"""
# `models` holds the names shown in the dropdown; `trained_models` holds the
# model id loaded for the name at the same position. The two lists are paired
# by index, so keep them the same length and in the same order.
models = ["base distilbert", "finetuned distilbert", "finetuned tinyroberta"]
trained_models = ["distilbert-base-cased-distilled-squad", "kiwimckiwiman/distilv2", "kiwimckiwiman/robertav3"]
# Use the top-level component class, consistent with gr.Textbox below. The
# legacy gr.inputs namespace was deprecated in Gradio 3 and removed in
# Gradio 4, where gr.inputs.Dropdown raises AttributeError.
model_input = gr.Dropdown(choices=models, label="Select Model")
question = gr.Textbox(label="Type your question here:", placeholder="Type here", lines=2)
# predict
# Question-answering pipelines that have already been loaded, keyed by model
# id. Building a pipeline loads the model, so each one is built once and then
# reused rather than rebuilt for every question.
_qa_pipelines = {}


def _get_question_answerer(model_id):
    """Return the QA pipeline for ``model_id``, building it on first use."""
    if model_id not in _qa_pipelines:
        _qa_pipelines[model_id] = pipeline("question-answering", model=model_id)
    return _qa_pipelines[model_id]


def predict(model, question):
    """Answer ``question`` with the selected model.

    The best-matching corpus document is retrieved and used as the context
    from which the question-answering pipeline extracts its answer.

    Args:
        model: Display name of the model; must be one of ``models``.
        question: The user's question.

    Returns:
        The ``"answer"`` field of the pipeline result.

    Raises:
        ValueError: If ``model`` is not one of ``models`` (for example when
            nothing has been selected in the dropdown).
    """
    if model not in models:
        raise ValueError(f"Unknown model {model!r}; expected one of {models}")
    context = doc_retrieve(question, tfidf_matrix)
    # `models` and `trained_models` are paired by position.
    question_answerer = _get_question_answerer(trained_models[models.index(model)])
    results = question_answerer(question=question, context=context)
    return results["answer"]
# Wire the prediction function to its two inputs (model choice and question)
# and a plain-text output, then start the app.
demo = gr.Interface(
    fn=predict,
    inputs=[model_input, question],
    outputs="text",
    title=title,
    description=desc,
)
demo.launch()