import pickle
import re
import zipfile

import gradio as gr
import numpy as np

import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

from sklearn.metrics.pairwise import cosine_similarity

import networkx as nx
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
from summarizer import TransformerSummarizer

nltk.download('punkt')
nltk.download('stopwords')

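# English-to-Arabic Marian translation checkpoint from the Hugging Face Hub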
model_checkpoint = "marefa-nlp/marefa-mt-en-ar"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

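# model.zip holds the saved translation weights; extract them so they can be loaded below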
with zipfile.ZipFile("model.zip", 'r') as zip_ref:
    zip_ref.extractall("./marian_model/")

# Define the model architecture
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_pt=True)

# Load the weights from the .h5 file
model.load_weights("./marian_model/model.weights.h5")

# Load cleaned_word_embeddings
with open("cleaned_word_embeddings.pkl", "rb") as f:
    cleaned_word_embeddings = pickle.load(f)
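# Pre-computed word vectors (25-dimensional, matching the sentence-vector size used below) for extractive ranking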

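# Extractive summarizer backed by XLNet (from the `summarizer` package); used by the translate_pretrained path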
summ_model = TransformerSummarizer(transformer_type="XLNet", transformer_model_key="xlnet-base-cased")

def translate_pretrained(text):
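    """Summarize English text with the XLNet summarizer, then translate the summary to Arabic."""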
    summarized = ''.join(summ_model(text))
    tokenized = tokenizer([summarized], return_tensors="np")
    out = model.generate(**tokenized)
    arabic = tokenizer.decode(out[0], skip_special_tokens=True)
    return arabic

def get_clean_sentences(text):
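    """Split text into sentences and strip punctuation and special characters from each."""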
    sentences = sent_tokenize(text)
    # Strip punctuation and special characters, keeping word characters, apostrophes and spaces
    cleaned_sentences = []
    for sentence in sentences:
        cleaned_sentence = re.sub(r"\.|[^'\w ]", " ", sentence)
        cleaned_sentences.append(cleaned_sentence)
    return cleaned_sentences


def filter_sentences(text):
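    """Remove English stopwords from each cleaned sentence."""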
    cleaned_sentences = get_clean_sentences(text)
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    filtered_sentences = []
    for sentence in cleaned_sentences:
        words = nltk.word_tokenize(sentence)
        filtered_sentence = " ".join(
            [word for word in words if word.lower() not in stop_words]
        )
        filtered_sentences.append(filtered_sentence)
    return filtered_sentences


def get_vector_representation(text):
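    """Represent each sentence as the average of its words' embedding vectors."""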
    filtered_sentences = filter_sentences(text)
    # Get vector representations for each sentence in the articles
    sentence_vectors = []
    for sentence in filtered_sentences:
        words = sentence.split()
        sentence_vector = np.zeros((25,))
        if len(words) != 0:
            for word in words:
                if word in cleaned_word_embeddings:
                    sentence_vector += cleaned_word_embeddings[word]
            sentence_vector /= len(words)
        sentence_vectors.append(sentence_vector)
    return sentence_vectors


def calculate_cosine_similarity(sentence_vectors):
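    """Compute the pairwise cosine-similarity matrix between sentence vectors."""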
    # Calculate cosine similarity
    similarity_matrix = cosine_similarity(sentence_vectors)
    return similarity_matrix


def get_scores(similarity_matrix):
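    """Run PageRank (TextRank) over the sentence-similarity graph to score each sentence."""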
    # Create a graph from the similarity matrix
    nx_graph = nx.from_numpy_array(similarity_matrix)
    # Get scores
    scores = nx.pagerank(nx_graph)
    return scores


def rank_sentences(text):
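    """Rank the original sentences by their TextRank scores, highest first."""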
    sentence_vectors = get_vector_representation(text)
    similarity_matrix = calculate_cosine_similarity(sentence_vectors)
    scores = get_scores(similarity_matrix)
    ranked_sentences = sorted(
        ((scores[j], sentence) for j, sentence in enumerate(sent_tokenize(text))),
        reverse=True,
    )
    return ranked_sentences


def summarize(text):
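    """Build an extractive summary from the top-ranked sentences (roughly the top 10%)."""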
    ranked_sentences = rank_sentences(text)
    summary = ""
    # Take roughly the top 10% of sentences, but always at least one
    for j in range(max(1, len(ranked_sentences) // 10)):
        summary += ranked_sentences[j][1] + " "
    return summary

def translate(text):
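    """Summarize with the TextRank pipeline, then translate the summary to Arabic."""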
    summarized = summarize(text)
    tokenized = tokenizer([summarized], return_tensors='np')
    out = model.generate(**tokenized)
    arabic = tokenizer.decode(out[0], skip_special_tokens=True)
    return arabic

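# The UI exposes the pre-trained (XLNet) summarize-and-translate pipeline; pass fn=translate to use the TextRank pipeline instead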
demo = gr.Interface(fn=translate_pretrained, inputs="text", outputs="text")
demo.launch(share=True)