Spaces:

mdj1412
/

movie_review_score_discriminator

Sleeping

App Files Files Community

movie_review_score_discriminator / app.py

mdj1412

Upload app.py

63644af almost 2 years ago

raw

history blame

5.1 kB

	import gradio as gr
	import fasttext

	from transformers import AutoModelForSequenceClassification
	from transformers import AutoTokenizer

	import numpy as np
	import pandas as pd
	import torch


	# Class-index -> label mapping passed to the sequence-classification models.
	id2label = {0: "NEGATIVE", 1: "POSITIVE"}
	# Inverse mapping, derived from id2label so the two cannot drift apart.
	label2id = {label: index for index, label in id2label.items()}


	# Heading shown by the Gradio interface.
	title = "Movie Review Score Discriminator"
	# Built with implicit string concatenation instead of backslash continuations
	# inside one literal: a continuation splices the following line's leading
	# whitespace into the user-facing text.
	description = (
	    "It is a program that classifies whether it is positive or negative by entering movie reviews. "
	    "You can choose between the Korean version and the English version. "
	    "It also provides a version called Any, which determines whether it is Korean or English and predicts it."
	)


	class LanguageIdentification:
	    """Thin wrapper around a fastText language-identification model."""

	    def __init__(self, pretrained_lang_model="./lid.176.ftz"):
	        """Load the fastText model.

	        Args:
	            pretrained_lang_model: Path of the fastText model file; defaults
	                to ``./lid.176.ftz`` (relative to the working directory).
	        """
	        self.model = fasttext.load_model(pretrained_lang_model)

	    def predict_lang(self, text, k=200):
	        """Return the model's language prediction for *text*.

	        Args:
	            text: Text to identify. It may span several lines; line breaks
	                are collapsed to spaces before prediction.
	            k: Maximum number of labels requested from the model
	                (default 200, the value this method always used).

	        Returns:
	            The value of ``self.model.predict``; callers index it as
	            ``result[0]`` (labels) and ``result[1]`` (probabilities).
	        """
	        # fastText's predict() handles one line at a time and raises
	        # ValueError when the input contains '\n', so flatten the text first.
	        single_line = " ".join(text.splitlines())
	        return self.model.predict(single_line, k=k)

	# Shared language identifier, created once at import time; constructing it
	# loads the fastText model file from disk.
	LANGUAGE = LanguageIdentification()



	def tokenized_data(tokenizer, inputs):
	    """Encode one text as a single-item batch of fixed-length tensors.

	    Args:
	        tokenizer: Object exposing ``batch_encode_plus`` (the file passes
	            tokenizers created with ``AutoTokenizer.from_pretrained``).
	        inputs: The text to encode.

	    Returns:
	        Whatever ``tokenizer.batch_encode_plus`` returns for ``[inputs]`` when
	        asked for PyTorch tensors padded and truncated to 64 tokens.
	    """
	    batch = [inputs]
	    encode_options = {
	        "return_tensors": "pt",
	        "padding": "max_length",
	        "max_length": 64,
	        "truncation": True,
	    }
	    return tokenizer.batch_encode_plus(batch, **encode_options)



	# Example inputs offered in the UI: the same randomly chosen rows, once
	# tagged 'Eng' (column 0) and once tagged 'Kor' (column 1).
	df = pd.read_csv('examples.csv', sep='\t', index_col='Unnamed: 0')
	np.random.seed(100)  # fixed seed so every start-up picks the same rows

	# Sample 5 rows from the first 50, as before, but clamp both numbers to the
	# file length so a shorter examples.csv cannot cause an out-of-range lookup.
	pool_size = min(50, len(df))
	idx = np.random.choice(pool_size, size=min(5, pool_size), replace=False)
	eng_examples = [['Eng', df.iloc[i, 0]] for i in idx]
	kor_examples = [['Kor', df.iloc[i, 1]] for i in idx]
	examples = eng_examples + kor_examples



	def _load_finetuned_classifier(model_name, checkpoint_path):
	    """Build a two-label classifier for *model_name* from a local checkpoint.

	    Args:
	        model_name: Hugging Face model identifier used for the architecture.
	        checkpoint_path: Path of the ``.pt`` file holding the state dict.

	    Returns:
	        ``(state_dict, model)`` - the loaded state dict and the
	        ``AutoModelForSequenceClassification`` initialised from it.
	    """
	    # map_location="cpu": inference in this file never moves tensors to
	    # another device, and without it a checkpoint saved from GPU tensors
	    # fails to deserialize on a CPU-only host.
	    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
	    state_dict = torch.load(checkpoint_path, map_location="cpu")
	    model = AutoModelForSequenceClassification.from_pretrained(
	        model_name, num_labels=2, id2label=id2label, label2id=label2id,
	        state_dict=state_dict,
	    )
	    return state_dict, model


	# English classifier: roberta-base weights from the step-1900 checkpoint.
	eng_model_name = "roberta-base"
	eng_step = 1900
	eng_tokenizer = AutoTokenizer.from_pretrained(eng_model_name)
	eng_file_name = "{}-{}.pt".format(eng_model_name, eng_step)
	eng_state_dict, eng_model = _load_finetuned_classifier(eng_model_name, eng_file_name)


	# Korean classifier: klue/roberta-small weights from the step-2400 checkpoint.
	kor_model_name = "klue/roberta-small"
	kor_step = 2400
	kor_tokenizer = AutoTokenizer.from_pretrained(kor_model_name)
	# '/' in the model id is not usable in a file name, so it is stored as '_'.
	kor_file_name = "{}-{}.pt".format(kor_model_name.replace('/', '_'), kor_step)
	kor_state_dict, kor_model = _load_finetuned_classifier(kor_model_name, kor_file_name)


	def _detect_language(text):
	    """Choose between Korean and English for *text*.

	    Returns:
	        ``(lang, percent_kor, percent_eng)``: ``lang`` is ``'Kor'`` when the
	        Korean label has the strictly higher probability and ``'Eng'``
	        otherwise; a label missing from the prediction counts as 0.
	    """
	    pred = LANGUAGE.predict_lang(text)
	    # Clamp to [0, 1] and convert to plain floats for the label output.
	    scores = {label: min(1.0, max(0.0, float(prob)))
	              for label, prob in zip(pred[0], pred[1])}
	    percent_kor = scores.get('__label__ko', 0.0)
	    percent_eng = scores.get('__label__en', 0.0)
	    lang = 'Kor' if percent_kor > percent_eng else 'Eng'
	    return lang, percent_kor, percent_eng


	def _class_probabilities(model, tokenizer, text):
	    """Return the softmax over the model's logits for the single text."""
	    encoded = tokenized_data(tokenizer, text)
	    with torch.no_grad():
	        logits = model(input_ids=encoded['input_ids'],
	                       attention_mask=encoded['attention_mask']).logits
	    return torch.softmax(logits, dim=1)


	def builder(lang, text):
	    """Classify a movie review and produce the three Gradio outputs.

	    Args:
	        lang: ``'Eng'`` or ``'Kor'`` to force a model, or ``'Any'`` to pick
	            the one whose language fastText rates more probable.
	        text: The review text.

	    Returns:
	        A three-item list:
	        1. ``{'Kor': p, 'Eng': p, 'Other': p}`` language probabilities
	           (an explicitly chosen language is reported as 1);
	        2. ``{'POSITIVE': p, 'NEGATIVE': p}`` for the whole review;
	        3. a list of ``(word, tag)`` pairs, where tag is ``'+'`` when the
	           word alone scores above 0.95 positive, ``'-'`` below 0.05,
	           else ``None``.

	    Raises:
	        ValueError: If ``lang`` is not ``'Any'``, ``'Eng'`` or ``'Kor'``.
	    """
	    percent_kor, percent_eng = 0, 0

	    # [ output_1 ] language probabilities and model choice
	    if lang == 'Any':
	        # Compare the two probabilities: testing only whether each label is
	        # present lets Korean win whenever both labels are returned.
	        lang, percent_kor, percent_eng = _detect_language(text)
	    elif lang == 'Eng':
	        percent_eng = 1
	    elif lang == 'Kor':
	        percent_kor = 1
	    else:
	        raise ValueError(
	            "lang must be 'Any', 'Eng' or 'Kor', got {!r}".format(lang))

	    if lang == 'Eng':
	        model, tokenizer = eng_model, eng_tokenizer
	    else:
	        model, tokenizer = kor_model, kor_tokenizer

	    # [ output_2 ] sentiment of the whole review
	    model.eval()
	    output = _class_probabilities(model, tokenizer, text)

	    # [ output_3 ] per-word sentiment; split() drops the empty strings that
	    # split(' ') yields for repeated spaces, which carry nothing to score.
	    output_analysis = []
	    for word in text.split():
	        positive = _class_probabilities(model, tokenizer, word)[0][1].item()
	        if positive > 0.95:
	            tag = '+'
	        elif positive < 0.05:
	            tag = '-'
	        else:
	            tag = None
	        output_analysis.append((word, tag))

	    # Floor at 0 so rounding in the two probabilities cannot yield a
	    # negative share for 'Other'.
	    percent_other = max(0.0, 1 - (percent_kor + percent_eng))

	    return [
	        {'Kor': percent_kor, 'Eng': percent_eng, 'Other': percent_other},
	        {id2label[1]: output[0][1].item(), id2label[0]: output[0][0].item()},
	        output_analysis,
	    ]


	# demo3 = gr.Interface.load("models/mdj1412/movie_review_score_discriminator_eng", inputs="text", outputs="text",
	# title=title, theme="peach",
	# allow_flagging="auto",
	# description=description, examples=examples)



	# Input widgets: language selector followed by a free-text box.
	_demo_inputs = [gr.inputs.Dropdown(['Any', 'Eng', 'Kor']), "text"]

	# Output widgets, in the order builder() returns its three results.
	_lang_label = gr.Label(num_top_classes=3, label='Lang')
	_result_label = gr.Label(num_top_classes=2, label='Result')
	_analysis_view = gr.HighlightedText(
	    label="Analysis",
	    combine_adjacent=False,
	).style(color_map={"+": "red", "-": "green"})

	demo = gr.Interface(
	    builder,
	    inputs=_demo_inputs,
	    outputs=[_lang_label, _result_label, _analysis_view],
	    title=title,
	    description=description,
	    examples=examples,
	)


	# Start the web app only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch()