import pickle
import re
import zipfile

import gradio as gr
import numpy as np

import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

from sklearn.metrics.pairwise import cosine_similarity

import networkx as nx
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
from summarizer import TransformerSummarizer

nltk.download('punkt')
nltk.download('stopwords')

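# English-to-Arabic Marian translation checkpoint from the Hugging Face Hub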
model_checkpoint = "marefa-nlp/marefa-mt-en-ar"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

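# model.zip holds the saved translation weights; extract them so they can be loaded below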
with zipfile.ZipFile("model.zip", 'r') as zip_ref:
    zip_ref.extractall("./marian_model/")

# Define the model architecture
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_pt=True)

# Load the weights from the .h5 file
model.load_weights("./marian_model/model.weights.h5")

# Load cleaned_word_embeddings
with open("cleaned_word_embeddings.pkl", "rb") as f:
    cleaned_word_embeddings = pickle.load(f)
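# Pre-computed word vectors (25-dimensional, matching the sentence-vector size used below) for extractive ranking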

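# Extractive summarizer backed by XLNet (from the `summarizer` package); used by the translate_pretrained path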
summ_model = TransformerSummarizer(transformer_type="XLNet", transformer_model_key="xlnet-base-cased")

def translate_pretrained(text):
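    """Summarize English text with the XLNet summarizer, then translate the summary to Arabic."""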
    summarized = ''.join(summ_model(text))
    tokenized = tokenizer([summarized], return_tensors="np")
    out = model.generate(**tokenized)
    arabic = tokenizer.decode(out[0], skip_special_tokens=True)
    return arabic

def get_clean_sentences(text):
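    """Split text into sentences and strip punctuation and special characters from each."""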
    sentences = sent_tokenize(text)
    # Strip punctuation and special characters, keeping word characters, apostrophes and spaces
    cleaned_sentences = []
    for sentence in sentences:
        cleaned_sentence = re.sub(r"\.|[^'\w ]", " ", sentence)
        cleaned_sentences.append(cleaned_sentence)
    return cleaned_sentences


def filter_sentences(text):
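    """Remove English stopwords from each cleaned sentence."""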
    cleaned_sentences = get_clean_sentences(text)
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    filtered_sentences = []
    for sentence in cleaned_sentences:
        words = nltk.word_tokenize(sentence)
        filtered_sentence = " ".join(
            [word for word in words if word.lower() not in stop_words]
        )
        filtered_sentences.append(filtered_sentence)
    return filtered_sentences


def get_vector_representation(text):
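    """Represent each sentence as the average of its words' embedding vectors."""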
    filtered_sentences = filter_sentences(text)
    # Get vector representations for each sentence in the articles
    sentence_vectors = []
    for sentence in filtered_sentences:
        words = sentence.split()
        sentence_vector = np.zeros((25,))
        if len(words) != 0:
            for word in words:
                if word in cleaned_word_embeddings:
                    sentence_vector += cleaned_word_embeddings[word]
            sentence_vector /= len(words)
        sentence_vectors.append(sentence_vector)
    return sentence_vectors


def calculate_cosine_similarity(sentence_vectors):
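    """Compute the pairwise cosine-similarity matrix between sentence vectors."""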
    # Calculate cosine similarity
    similarity_matrix = cosine_similarity(sentence_vectors)
    return similarity_matrix


def get_scores(similarity_matrix):
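    """Run PageRank (TextRank) over the sentence-similarity graph to score each sentence."""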
    # Create a graph from the similarity matrix
    nx_graph = nx.from_numpy_array(similarity_matrix)
    # Get scores
    scores = nx.pagerank(nx_graph)
    return scores


def rank_sentences(text):
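    """Rank the original sentences by their TextRank scores, highest first."""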
    sentence_vectors = get_vector_representation(text)
    similarity_matrix = calculate_cosine_similarity(sentence_vectors)
    scores = get_scores(similarity_matrix)
    ranked_sentences = sorted(
        ((scores[j], sentence) for j, sentence in enumerate(sent_tokenize(text))),
        reverse=True,
    )
    return ranked_sentences


def summarize(text):
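    """Build an extractive summary from the top-ranked sentences (roughly the top 10%)."""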
    ranked_sentences = rank_sentences(text)
    summary = ""
    # Take roughly the top 10% of sentences, but always at least one
    for j in range(max(1, len(ranked_sentences) // 10)):
        summary += ranked_sentences[j][1] + " "
    return summary

def translate(text):
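    """Summarize with the TextRank pipeline, then translate the summary to Arabic."""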
    summarized = summarize(text)
    tokenized = tokenizer([summarized], return_tensors='np')
    out = model.generate(**tokenized)
    arabic = tokenizer.decode(out[0], skip_special_tokens=True)
    return arabic

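# The UI exposes the pre-trained (XLNet) summarize-and-translate pipeline; pass fn=translate to use the TextRank pipeline instead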
demo = gr.Interface(fn=translate_pretrained, inputs="text", outputs="text")
demo.launch(share=True)