from fastapi import FastAPI, File, UploadFile
import gradio as gr
import pickle
import zipfile
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
from summarizer import TransformerSummarizer
nltk.download('punkt')
nltk.download('stopwords')
model_checkpoint = "marefa-nlp/marefa-mt-en-ar"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
with zipfile.ZipFile("model.zip", 'r') as zip_ref:
    zip_ref.extractall("./marian_model/")
# Define the model architecture
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_pt=True)
# Load the weights from the .h5 file
model.load_weights("./marian_model/model.weights.h5")
# Load cleaned_word_embeddings
with open("cleaned_word_embeddings.pkl", "rb") as f:
cleaned_word_embeddings = pickle.load(f)
summ_model = TransformerSummarizer(transformer_type="XLNet", transformer_model_key="xlnet-base-cased")
def translate_pretrained(text):
    # Summarize with the pretrained XLNet extractive summarizer, then translate to Arabic
    summarized = ''.join(summ_model(text))
    tokenized = tokenizer([summarized], return_tensors="np")
    out = model.generate(**tokenized)
    arabic = tokenizer.decode(out[0], skip_special_tokens=True)
    return arabic
def get_clean_sentences(text):
    sentences = sent_tokenize(text)
    # Replace punctuation and special characters with spaces, keeping word characters and apostrophes
    cleaned_sentences = []
    for sentence in sentences:
        cleaned_sentence = re.sub(r"\.|[^'\w ]", " ", sentence)
        cleaned_sentences.append(cleaned_sentence)
    return cleaned_sentences
def filter_sentences(text):
    cleaned_sentences = get_clean_sentences(text)
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    filtered_sentences = []
    for sentence in cleaned_sentences:
        words = nltk.word_tokenize(sentence)
        filtered_sentence = " ".join(
            [word for word in words if word.lower() not in stop_words]
        )
        filtered_sentences.append(filtered_sentence)
    return filtered_sentences
def get_vector_representation(text):
    filtered_sentences = filter_sentences(text)
    # Build a 25-dimensional vector for each sentence by averaging its word embeddings
    sentence_vectors = []
    for sentence in filtered_sentences:
        words = sentence.split()
        sentence_vector = np.zeros((25,))
        if len(words) != 0:
            for word in words:
                if word in cleaned_word_embeddings:
                    sentence_vector += cleaned_word_embeddings[word]
            sentence_vector /= len(words)
        sentence_vectors.append(sentence_vector)
    return sentence_vectors
def calculate_cosine_similarity(sentence_vectors):
    # Calculate pairwise cosine similarity between sentence vectors
    similarity_matrix = cosine_similarity(np.array(sentence_vectors))
    return similarity_matrix
def get_scores(similarity_matrix):
    # Create a graph from the similarity matrix
    nx_graph = nx.from_numpy_array(similarity_matrix)
    # Score sentences with PageRank
    scores = nx.pagerank(nx_graph)
    return scores
def rank_sentences(text):
    sentence_vectors = get_vector_representation(text)
    similarity_matrix = calculate_cosine_similarity(sentence_vectors)
    scores = get_scores(similarity_matrix)
    # Pair each original sentence with its score and sort, highest score first
    ranked_sentences = sorted(
        ((scores[j], sentence) for j, sentence in enumerate(sent_tokenize(text))),
        reverse=True,
    )
    return ranked_sentences
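# For reference: rank_sentences yields (score, sentence) tuples sorted by descending
# PageRank score, with sentences taken verbatim from the original text.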
def summarize(text):
    ranked_sentences = rank_sentences(text)
    # Keep roughly the top 10% of ranked sentences as the extractive summary
    summary = ""
    for j in range(len(ranked_sentences) // 10):
        summary += ranked_sentences[j][1] + " "
    return summary
def translate(text):
    # Summarize with the custom TextRank pipeline above, then translate to Arabic
    summarized = summarize(text)
    tokenized = tokenizer([summarized], return_tensors='np')
    out = model.generate(**tokenized)
    arabic = tokenizer.decode(out[0], skip_special_tokens=True)
    return arabic
demo = gr.Interface(fn=translate_pretrained, inputs="text", outputs="text")
demo.launch(share=True)
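# A minimal usage sketch (the input string is a hypothetical placeholder, not part of
# the original app): with the models loaded, both pipelines can also be called directly:
#   translate("Some English article text ...")             # TextRank summary -> Arabic
#   translate_pretrained("Some English article text ...")  # XLNet summary -> Arabic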