# NLP_D_HD_Task / app.py
# Hugging Face Space — uploaded by kiwimckiwiman ("Update app.py", commit 9cf93d5)
#import
# from transformers import TFAutoModelForQuestionAnswering
# from transformers import AutoTokenizer
from transformers import pipeline
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr
# --- Build the retrieval corpus: one document per .txt file under data/ ---
corpus = []
# sorted() makes the corpus order (and therefore retrieval indices)
# deterministic across filesystems; glob alone gives no order guarantee.
file_paths = sorted(glob.glob('data/*.txt'))
for file_path in file_paths:
    # Explicit encoding avoids platform-dependent default decoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        corpus.append(file.read())

# Fit the TF-IDF vectoriser once at startup; queries are transformed
# against this fitted vocabulary in doc_retrieve().
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)
# doc retrieval
def doc_retrieve(query, tfidf_matrix):
    """Return the corpus document most similar to *query*.

    Uses the module-level ``vectorizer`` and ``corpus``; *tfidf_matrix*
    is the fitted TF-IDF matrix for that corpus.

    Parameters
    ----------
    query : str
        Free-text question to match against the corpus.
    tfidf_matrix : scipy sparse matrix
        TF-IDF matrix produced by ``vectorizer.fit_transform(corpus)``.

    Returns
    -------
    str
        The full text of the single best-matching document.
    """
    query_tfidf = vectorizer.transform([query])
    similarity_scores = cosine_similarity(query_tfidf, tfidf_matrix)
    # Only the single best match is needed, so take argmax directly
    # instead of sorting every score and materialising every document.
    best_index = similarity_scores[0].argmax()
    return corpus[best_index]
# --- Gradio interface configuration ---
title = "D/HD Task NLP"
desc = """An (attempt at) NLP chatbot that answers questions on information about the Computer Science degree"""
# Display names shown in the dropdown, index-aligned with trained_models.
models = ["base distilbert", "finetuned distilbert", "finetuned tinyroberta"]
# Hugging Face Hub model ids, index-aligned with `models`.
trained_models = ["distilbert-base-cased-distilled-squad", "kiwimckiwiman/distilv2", "kiwimckiwiman/robertav3"]
# gr.inputs.* was removed in Gradio 3.x; use the top-level component,
# consistent with the gr.Textbox below.
model_input = gr.Dropdown(choices=models, label="Select Model")
question = gr.Textbox(label="Type your question here:", placeholder="Type here", lines=2)
# Cache loaded QA pipelines so each model is downloaded and initialised
# only once, not on every request.
_qa_pipelines = {}

def predict(model, question):
    """Answer *question* with the model selected by display name *model*.

    Retrieves the best-matching corpus document as context via
    doc_retrieve(), then runs an extractive question-answering pipeline
    and returns the extracted answer span.

    Parameters
    ----------
    model : str
        One of the display names in ``models``.
    question : str
        The user's question.

    Returns
    -------
    str
        The answer span extracted from the retrieved context.
    """
    context = doc_retrieve(question, tfidf_matrix)
    model_name = trained_models[models.index(model)]
    if model_name not in _qa_pipelines:
        _qa_pipelines[model_name] = pipeline("question-answering", model=model_name)
    results = _qa_pipelines[model_name](question=question, context=context)
    return results["answer"]
# Wire the configured components to predict() and start the app.
demo = gr.Interface(
    fn=predict,
    inputs=[model_input, question],
    outputs="text",
    title=title,
    description=desc,
)
demo.launch()