# Qissar / app.py
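# Gradio app: summarize English text and translate the summary into Arabic.
# Two summarization paths are defined below: a pretrained XLNet-based extractive
# summarizer (used by the Gradio interface) and a custom TextRank pipeline built
# on word embeddings, cosine similarity and PageRank.
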
from fastapi import FastAPI, File, UploadFile
import gradio as gr
import pickle
import zipfile
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
from summarizer import Summarizer, TransformerSummarizer
nltk.download('punkt')
nltk.download('stopwords')
# Marian EN->AR translation model
model_checkpoint = "marefa-nlp/marefa-mt-en-ar"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Extract the bundled weights
with zipfile.ZipFile("model.zip", "r") as zip_ref:
    zip_ref.extractall("./marian_model/")

# Define the model architecture
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_pt=True)
# Load the weights from the .h5 file
model.load_weights("./marian_model/model.weights.h5")

# Load cleaned_word_embeddings
with open("cleaned_word_embeddings.pkl", "rb") as f:
    cleaned_word_embeddings = pickle.load(f)

# XLNet-based extractive summarizer
summ_model = TransformerSummarizer(transformer_type="XLNet", transformer_model_key="xlnet-base-cased")
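
# Components loaded above (a summary for orientation):
#   tokenizer, model         - Marian EN->AR translation (marefa-nlp/marefa-mt-en-ar)
#                              with weights restored from model.zip
#   cleaned_word_embeddings  - word -> vector lookup used by the TextRank pipeline
#                              (assumed 25-dimensional, matching np.zeros((25,)) below)
#   summ_model               - XLNet-based extractive summarizer
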
def translate_pretrained(text):
    # Summarize with the pretrained extractive summarizer, then translate to Arabic
    summarized = ''.join(summ_model(text))
    tokenized = tokenizer([summarized], return_tensors="np")
    out = model.generate(**tokenized)
    arabic = tokenizer.decode(out[0], skip_special_tokens=True)
    return arabic

def get_clean_sentences(text):
    sentences = sent_tokenize(text)
    # Replace punctuation and special characters with spaces
    cleaned_sentences = []
    for sentence in sentences:
        cleaned_sentence = re.sub(r"\.|[^'\w ]", " ", sentence)
        cleaned_sentences.append(cleaned_sentence)
    return cleaned_sentences

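# Illustrative example (hypothetical input): "Hello, world! It's 2024." is tokenized
# into two sentences and cleaned to roughly "Hello  world " and "It's 2024 "
# (punctuation becomes spaces; word characters, digits and apostrophes are kept).
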
def filter_sentences(text):
    cleaned_sentences = get_clean_sentences(text)
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    filtered_sentences = []
    for sentence in cleaned_sentences:
        words = nltk.word_tokenize(sentence)
        filtered_sentence = " ".join(
            [word for word in words if word.lower() not in stop_words]
        )
        filtered_sentences.append(filtered_sentence)
    return filtered_sentences

def get_vector_representation(text):
    filtered_sentences = filter_sentences(text)
    # Represent each sentence as the average of its words' embeddings
    sentence_vectors = []
    for sentence in filtered_sentences:
        words = sentence.split()
        sentence_vector = np.zeros((25,))
        if len(words) != 0:
            for word in words:
                if word in cleaned_word_embeddings:
                    sentence_vector += cleaned_word_embeddings[word]
            sentence_vector /= len(words)
        sentence_vectors.append(sentence_vector)
    return sentence_vectors

def calculate_cosine_similarity(sentence_vectors):
    # Pairwise cosine similarity between sentence vectors
    similarity_matrix = cosine_similarity(np.array(sentence_vectors))
    return similarity_matrix

def get_scores(similarity_matrix):
    # Create a graph from the similarity matrix
    nx_graph = nx.from_numpy_array(similarity_matrix)
    # Score sentences with PageRank
    scores = nx.pagerank(nx_graph)
    return scores

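# Illustrative shapes for the TextRank pipeline, assuming the input splits into n sentences:
#   get_vector_representation(text)      -> list of n arrays of shape (25,)
#   calculate_cosine_similarity(vectors) -> (n, n) similarity matrix
#   get_scores(matrix)                   -> dict {sentence index: PageRank score}
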
def rank_sentences(text):
    sentence_vectors = get_vector_representation(text)
    similarity_matrix = calculate_cosine_similarity(sentence_vectors)
    scores = get_scores(similarity_matrix)
    ranked_sentences = sorted(
        ((scores[j], sentence) for j, sentence in enumerate(sent_tokenize(text))),
        reverse=True,
    )
    return ranked_sentences

def summarize(text):
    ranked_sentences = rank_sentences(text)
    # Keep the top ~10% of ranked sentences as the extractive summary
    summary = ""
    for j in range(len(ranked_sentences) // 10):
        summary += ranked_sentences[j][1] + " "
    return summary

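# Note: summarize() keeps the top len(sentences) // 10 sentences in score order
# (not original document order) and returns an empty string for texts with fewer
# than 10 sentences.
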
def translate(text):
    # TextRank-based summary followed by EN->AR translation
    summarized = summarize(text)
    tokenized = tokenizer([summarized], return_tensors="np")
    out = model.generate(**tokenized)
    arabic = tokenizer.decode(out[0], skip_special_tokens=True)
    return arabic

demo = gr.Interface(fn=translate_pretrained, inputs="text", outputs="text")
demo.launch(share=True)
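
# Only translate_pretrained() is exposed by the Gradio interface above; the
# TextRank-based translate() is defined but unused. A minimal sketch of wiring it
# in instead (not part of the original app, kept commented out):
#
#   demo = gr.Interface(fn=translate, inputs="text", outputs="text")
#   demo.launch()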