Spaces:

keras-io
/

bert-semantic-similarity

Runtime error

App Files Files Community

bert-semantic-similarity / app.py

vumichien

Create new file

d9610ab over 2 years ago

raw

history blame

3.78 kB

	from huggingface_hub import from_pretrained_keras
	import numpy as np
	import gradio as gr
	import transformers

	class BertSemanticDataGenerator(tf.keras.utils.Sequence):
	    """Keras ``Sequence`` that turns sentence pairs into BERT input batches.

	    Each batch is tokenized on the fly with the ``bert-base-uncased``
	    tokenizer; both sentences of a pair are encoded together and separated
	    by the ``[SEP]`` token.

	    Args:
	        sentence_pairs: NumPy array of sentence pairs. It is indexed with an
	            integer index array and converted with ``.tolist()``.
	        labels: NumPy array of integer targets aligned with
	            ``sentence_pairs``. Only read when ``include_targets`` is true,
	            so it may be ``None`` for inference.
	        batch_size: Number of pairs per batch.
	        shuffle: Whether to shuffle the sample order at construction time
	            and whenever ``on_epoch_end`` is called.
	        include_targets: When true, ``__getitem__`` returns
	            ``(features, labels)``; otherwise only the features.
	        max_length: Length every encoded pair is padded or truncated to.
	            NOTE(review): 128 is presumably the sequence length the
	            published model expects -- confirm against the model inputs.
	    """

	    # NOTE(review): the base class needs a module-level ``tf`` name
	    # (``import tensorflow as tf``); the import lines visible in this file do
	    # not provide it -- confirm it is imported.

	    def __init__(
	        self,
	        sentence_pairs,
	        labels,
	        batch_size=32,
	        shuffle=True,
	        include_targets=True,
	        max_length=128,
	    ):
	        self.sentence_pairs = sentence_pairs
	        self.labels = labels
	        self.shuffle = shuffle
	        self.batch_size = batch_size
	        self.include_targets = include_targets
	        self.max_length = max_length
	        # Load the BERT tokenizer used to encode the text
	        # (bert-base-uncased pretrained vocabulary).
	        self.tokenizer = transformers.BertTokenizer.from_pretrained(
	            "bert-base-uncased", do_lower_case=True
	        )
	        self.indexes = np.arange(len(self.sentence_pairs))
	        self.on_epoch_end()

	    def __len__(self):
	        """Return the number of full batches per epoch (a trailing partial batch is dropped)."""
	        return len(self.sentence_pairs) // self.batch_size

	    def __getitem__(self, idx):
	        """Return batch number ``idx``.

	        Returns:
	            ``[input_ids, attention_masks, token_type_ids]`` as int32 NumPy
	            arrays, paired with an int32 label array when
	            ``include_targets`` is true.
	        """
	        start = idx * self.batch_size
	        batch_indexes = self.indexes[start : start + self.batch_size]
	        batch_pairs = self.sentence_pairs[batch_indexes]

	        # ``padding="max_length"`` replaces the deprecated
	        # ``pad_to_max_length=True``; truncation is requested explicitly so
	        # over-long pairs are cut to ``max_length`` without a warning.
	        encoded = self.tokenizer.batch_encode_plus(
	            batch_pairs.tolist(),
	            add_special_tokens=True,
	            max_length=self.max_length,
	            return_attention_mask=True,
	            return_token_type_ids=True,
	            padding="max_length",
	            truncation=True,
	            return_tensors="tf",
	        )

	        # Convert the encoded features to NumPy arrays.
	        input_ids = np.array(encoded["input_ids"], dtype="int32")
	        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
	        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")
	        features = [input_ids, attention_masks, token_type_ids]

	        # Targets are only attached when the generator feeds training/validation.
	        if self.include_targets:
	            labels = np.array(self.labels[batch_indexes], dtype="int32")
	            return features, labels
	        return features

	    def on_epoch_end(self):
	        """Reshuffle the sample order in place when ``shuffle`` is enabled."""
	        if self.shuffle:
	            # A fixed seed keeps the shuffling reproducible across runs.
	            np.random.RandomState(42).shuffle(self.indexes)

	# Load the Keras model published as "keras-io/bert-semantic-similarity" once
	# at module level; `predict` reuses this single instance for every call.
	model = from_pretrained_keras("keras-io/bert-semantic-similarity")

	def predict(sentence1, sentence2):
	    """Classify the relation between two sentences and describe it as text.

	    Args:
	        sentence1: First sentence; converted with ``str()`` before encoding.
	        sentence2: Second sentence; converted with ``str()`` before encoding.

	    Returns:
	        A sentence naming the most probable class and its probability as a
	        percentage with two decimals.
	    """
	    # Class names indexed by the model's output position.
	    # NOTE(review): this order is presumably the label encoding used when
	    # the model was fine-tuned (0/1/2) -- verify against the training code.
	    class_names = ("contradiction", "entailment", "neutral")

	    sentence_pairs = np.array([[str(sentence1), str(sentence2)]])
	    test_data = BertSemanticDataGenerator(
	        sentence_pairs,
	        labels=None,
	        batch_size=1,
	        shuffle=False,
	        include_targets=False,
	    )
	    # The generator holds exactly one batch with one pair; take its single
	    # row of class probabilities.
	    probabilities = model.predict(test_data[0])[0]
	    best = int(np.argmax(probabilities))
	    confidence = f"{probabilities[best]*100:.2f}%"
	    pred = class_names[best]
	    return f'These two sentence is {pred} with {confidence} of probability'

	# Sentence pairs offered as ready-made examples in the UI; each inner list
	# supplies the two text inputs in order.
	examples = [
	    ["Two women are observing something together.", "Two women are standing with their eyes closed."],
	    ["A smiling costumed woman is holding an umbrella", "A happy woman in a fairy costume holds an umbrella"],
	    ["A soccer game with multiple males playing", "Some men are playing a sport"],
	]

	# Build the two-textbox interface around `predict` and start serving it.
	gr.Interface(
	    fn=predict,
	    title="Semantic Similarity with BERT",
	    description = "Natural Language Inference by fine-tuning BERT model on SNLI Corpus.)",
	    inputs=["text", "text"],
	    examples=examples,
	    outputs=gr.Textbox(label='Prediction'),
	    cache_examples=False,
	    article = "Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>. Based on the keras example from <a href=\"https://keras.io/examples/nlp/semantic_similarity_with_bert/\">Mohamad Merchant</a>",
	).launch(debug=True, enable_queue=True)