persian-reverse-dict / pipeline.py
behnamsa's picture
Increase logits sensitivity, fix typo
1ec5925
raw
history blame
No virus
2.02 kB
import tensorflow as tf
from transformers import Pipeline
import tensorflow as tf
import numpy as np
import json
from hazm import *
from scipy.spatial import distance
class PreTrainedPipeline():
    """Reverse-dictionary inference pipeline.

    Given a free-text description, the pipeline normalizes and tokenizes it,
    maps the tokens to integer ids, runs a TensorFlow SavedModel to obtain an
    embedding, and returns the headwords whose rows in the comparison matrix
    are closest (by cosine distance) to that embedding.

    The following files are read from the directory passed to ``__init__``:
    ``saved_model/``, ``t2id.json``, ``id2h.json``, ``stopwords.txt`` and
    ``comparison_matrix.npz``.
    """

    # Number of token slots fed to the model; extra tokens are dropped and
    # unused slots stay at 0.
    MAX_TOKENS = 20
    # Maximum number of candidates returned per query.
    TOP_K = 10
    # Multiplier applied to cosine distances before exponentiation; larger
    # values make the resulting scores more sensitive to small differences.
    SHARPNESS = 1000

    def __init__(self, path):
        """Load the vocabulary, stopwords, comparison matrix and model.

        Args:
            path: Directory containing the pipeline's resource files.
        """
        self.model_dir = path + "/saved_model"
        self.t2id_path = path + "/t2id.json"
        self.id2h_path = path + "/id2h.json"
        self.stopwords_path = path + "/stopwords.txt"
        self.comparison_matrix_path = path + "/comparison_matrix.npz"

        # Use context managers so the file handles are released promptly
        # instead of waiting for garbage collection.
        with open(self.t2id_path, encoding="utf8") as t2id_file:
            self.t2id = json.load(t2id_file)
        with open(self.id2h_path, encoding="utf8") as id2h_file:
            self.id2h = json.load(id2h_file)
        with open(self.stopwords_path, encoding="utf8") as stopwords_file:
            self.stopwords = {line.strip() for line in stopwords_file}
        with np.load(self.comparison_matrix_path) as archive:
            self.comparisons = archive['arr_0']

        self.model = tf.saved_model.load(self.model_dir)
        # Built once here so each call does not re-create the normalizer.
        self.normalizer = Normalizer()

    def __call__(self, inputs: str):
        """Return the closest headwords for the description in ``inputs``.

        Args:
            inputs: The query sentence.

        Returns:
            A list holding a single list of ``{'label': word, 'score': s}``
            dicts, ordered from closest to farthest candidate. At most
            ``TOP_K`` entries are returned (fewer if the comparison matrix
            has fewer rows). Scores are rounded to three decimals.

        Raises:
            KeyError: If an out-of-vocabulary token is seen and ``t2id`` has
                no ``'UNK'`` entry, or a selected index is missing from
                ``id2h``.
        """
        # Preprocess the input sentence.
        sentence = self.normalizer.normalize(inputs)
        tokens = [
            token for token in word_tokenize(sentence)
            if token not in self.stopwords
        ]

        # Fixed-width id row: truncated to MAX_TOKENS, remaining slots are 0.
        input_ids = np.zeros((1, self.MAX_TOKENS))
        for position, token in enumerate(tokens[:self.MAX_TOKENS]):
            input_ids[0, position] = self.t2id.get(token, self.t2id['UNK'])

        # Call the model on the input ids.
        embeddings = self.model(tf.constant(input_ids, dtype=tf.int32)).numpy()

        # cdist with "cosine" yields distances (smaller means closer), so an
        # ascending argsort puts the best matches first.
        query = embeddings.reshape(1, -1)
        distances = distance.cdist(query, self.comparisons, "cosine")[0]
        top_indices = distances.argsort()[:self.TOP_K]

        # Iterate over the indices actually found rather than a fixed count,
        # so a comparison matrix with fewer than TOP_K rows does not raise.
        top_words = [self.id2h[str(index)] for index in top_indices]

        # Scores: softmax over exp(-SHARPNESS * distance) of the candidates.
        logits = np.exp(-self.SHARPNESS * distances[top_indices])
        softmax_probs = tf.nn.softmax(logits).numpy()
        top_scores = [round(float(prob), 3) for prob in softmax_probs]

        return [
            [
                {'label': word, 'score': score}
                for word, score in zip(top_words, top_scores)
            ]
        ]