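# Gradio app: summarize English text (either with a pretrained XLNet extractive
# summarizer or a custom TextRank pipeline) and translate the summary into Arabic
# with the "marefa-nlp/marefa-mt-en-ar" MarianMT model (custom weights loaded from model.zip).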
from fastapi import FastAPI, File, UploadFile
import gradio as gr
import pickle
import zipfile

import pandas as pd
import numpy as np
import re

import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

from sklearn.metrics.pairwise import cosine_similarity

import networkx as nx
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
from summarizer import Summarizer, TransformerSummarizer

# Tokenizer models and stop-word list used by the extractive summarizer.
nltk.download('punkt')
nltk.download('stopwords')

# MarianMT English -> Arabic checkpoint used for translation.
model_checkpoint = "marefa-nlp/marefa-mt-en-ar"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Extract the fine-tuned weights bundled with the app.
with zipfile.ZipFile("model.zip", 'r') as zip_ref:
    zip_ref.extractall("./marian_model/")

# Load the base checkpoint in TensorFlow, then overwrite it with the fine-tuned weights.
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_pt=True)
model.load_weights("./marian_model/model.weights.h5")

# Pre-computed word embeddings used to build sentence vectors for TextRank.
with open("cleaned_word_embeddings.pkl", "rb") as f:
    cleaned_word_embeddings = pickle.load(f)

# Pretrained XLNet-based extractive summarizer.
summ_model = TransformerSummarizer(transformer_type="XLNet", transformer_model_key="xlnet-base-cased")

def translate_pretrained(text):
    # Summarize with the pretrained XLNet summarizer, then translate the summary to Arabic.
    summarized = ''.join(summ_model(text))
    tokenized = tokenizer([summarized], return_tensors="np")
    out = model.generate(**tokenized)
    arabic = tokenizer.decode(out[0], skip_special_tokens=True)
    return arabic

def get_clean_sentences(text):
    # Split the text into sentences and replace punctuation (except apostrophes) with spaces.
    sentences = sent_tokenize(text)

    cleaned_sentences = []
    for sentence in sentences:
        cleaned_sentence = re.sub(r"\.|[^'\w ]", " ", sentence)
        cleaned_sentences.append(cleaned_sentence)
    return cleaned_sentences

def filter_sentences(text):
    # Drop English stop words from every cleaned sentence.
    cleaned_sentences = get_clean_sentences(text)

    stop_words = set(stopwords.words("english"))
    filtered_sentences = []
    for sentence in cleaned_sentences:
        words = nltk.word_tokenize(sentence)
        filtered_sentence = " ".join(
            [word for word in words if word.lower() not in stop_words]
        )
        filtered_sentences.append(filtered_sentence)
    return filtered_sentences

def get_vector_representation(text):
    # Represent each sentence as the average of its (25-dimensional) word embeddings.
    filtered_sentences = filter_sentences(text)

    sentence_vectors = []
    for sentence in filtered_sentences:
        words = sentence.split()
        sentence_vector = np.zeros((25,))
        if len(words) != 0:
            for word in words:
                if word in cleaned_word_embeddings:
                    sentence_vector += cleaned_word_embeddings[word]
            sentence_vector /= len(words)
        sentence_vectors.append(sentence_vector)
    return sentence_vectors

def calculate_cosine_similarity(sentence_vectors):
    # Pairwise cosine similarity between all sentence vectors.
    similarity_matrix = cosine_similarity(np.array(sentence_vectors))
    return similarity_matrix

def get_scores(similarity_matrix):
    # Build a graph whose nodes are sentences and whose edge weights are cosine
    # similarities, then score each sentence with PageRank (TextRank).
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    return scores

def rank_sentences(text):
    # Rank the original sentences by their TextRank score, highest first.
    sentence_vectors = get_vector_representation(text)
    similarity_matrix = calculate_cosine_similarity(sentence_vectors)
    scores = get_scores(similarity_matrix)
    ranked_sentences = sorted(
        ((scores[j], sentence) for j, sentence in enumerate(sent_tokenize(text))),
        reverse=True,
    )
    return ranked_sentences

def summarize(text):
    # Keep roughly the top 10% of ranked sentences (at least one) as the summary.
    ranked_sentences = rank_sentences(text)
    summary = ""
    for j in range(max(1, len(ranked_sentences) // 10)):
        summary += ranked_sentences[j][1] + " "
    return summary

def translate(text):
    # Same pipeline as translate_pretrained, but with the custom TextRank summarizer.
    summarized = summarize(text)
    tokenized = tokenizer([summarized], return_tensors="np")
    out = model.generate(**tokenized)
    arabic = tokenizer.decode(out[0], skip_special_tokens=True)
    return arabic

demo = gr.Interface(fn=translate_pretrained, inputs="text", outputs="text")
demo.launch(share=True)
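# Note: the Gradio demo above serves the pretrained-summarizer pipeline; to serve the
# custom TextRank pipeline instead, pass fn=translate to gr.Interface.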