from fastapi import FastAPI, File, UploadFile
import gradio as gr
import pickle
import zipfile
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
from summarizer import Summarizer, TransformerSummarizer
nltk.download('punkt')
nltk.download('stopwords')
model_checkpoint = "marefa-nlp/marefa-mt-en-ar"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
with zipfile.ZipFile("model.zip", 'r') as zip_ref:
    zip_ref.extractall("./marian_model/")
# Define the model architecture
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_pt=True)
# Load the weights from the .h5 file
model.load_weights("./marian_model/model.weights.h5")
# Load cleaned_word_embeddings
with open("cleaned_word_embeddings.pkl", "rb") as f:
    cleaned_word_embeddings = pickle.load(f)
summ_model = TransformerSummarizer(transformer_type="XLNet", transformer_model_key="xlnet-base-cased")
def translate_pretrained(text):
    # Extractive summary with the pretrained XLNet summarizer, then English -> Arabic translation
    summarized = ''.join(summ_model(text))
    tokenized = tokenizer([summarized], return_tensors="np")
    out = model.generate(**tokenized)
    arabic = tokenizer.decode(out[0], skip_special_tokens=True)
    return arabic
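
# Illustrative usage (a sketch, not part of the original app): once the summarizer and
# the MarianMT weights above have loaded, a call shaped like the commented lines below
# should return an Arabic summary string. The sample text is a placeholder.
# sample_text = "Deep learning has changed natural language processing. ..."
# print(translate_pretrained(sample_text))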
def get_clean_sentences(text):
    sentences = sent_tokenize(text)
    # Replace punctuation and other special characters with spaces
    cleaned_sentences = []
    for sentence in sentences:
        cleaned_sentence = re.sub(r"\.|[^'\w ]", " ", sentence)
        cleaned_sentences.append(cleaned_sentence)
    return cleaned_sentences
def filter_sentences(text):
    cleaned_sentences = get_clean_sentences(text)
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    filtered_sentences = []
    for sentence in cleaned_sentences:
        words = nltk.word_tokenize(sentence)
        filtered_sentence = " ".join(
            [word for word in words if word.lower() not in stop_words]
        )
        filtered_sentences.append(filtered_sentence)
    return filtered_sentences
def get_vector_representation(text):
    filtered_sentences = filter_sentences(text)
    # Get vector representations for each sentence by averaging its word embeddings
    sentence_vectors = []
    for sentence in filtered_sentences:
        words = sentence.split()
        sentence_vector = np.zeros((25,))
        if len(words) != 0:
            for word in words:
                if word in cleaned_word_embeddings:
                    sentence_vector += cleaned_word_embeddings[word]
            sentence_vector /= len(words)
        sentence_vectors.append(sentence_vector)
    return sentence_vectors
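
# Sketch of what the averaging above produces, assuming the pickled embeddings are
# 25-dimensional vectors (that dimensionality is implied by np.zeros((25,))):
# if "dog" and "barks" are both keys of cleaned_word_embeddings, the sentence "dog barks"
# maps to (cleaned_word_embeddings["dog"] + cleaned_word_embeddings["barks"]) / 2.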
def calculate_cosine_similarity(sentence_vectors):
    # Pairwise cosine similarity between the averaged sentence vectors
    similarity_matrix = cosine_similarity(sentence_vectors)
    return similarity_matrix
def get_scores(similarity_matrix):
    # Create a graph from the similarity matrix
    nx_graph = nx.from_numpy_array(similarity_matrix)
    # Get PageRank scores for each sentence node
    scores = nx.pagerank(nx_graph)
    return scores
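
# Toy check of the TextRank scoring step, independent of the pipeline (values invented):
# sim = np.array([[0.0, 0.5, 0.1],
#                 [0.5, 0.0, 0.3],
#                 [0.1, 0.3, 0.0]])
# get_scores(sim)  # -> dict of node index -> PageRank score; higher means more central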
def rank_sentences(text):
    sentence_vectors = get_vector_representation(text)
    similarity_matrix = calculate_cosine_similarity(sentence_vectors)
    scores = get_scores(similarity_matrix)
    # Pair each original sentence with its score and sort highest-scoring first
    ranked_sentences = sorted(
        ((scores[j], sentence) for j, sentence in enumerate(sent_tokenize(text))),
        reverse=True,
    )
    return ranked_sentences
def summarize(text):
    ranked_sentences = rank_sentences(text)
    # Keep roughly the top 10% of ranked sentences as the extractive summary
    summary = ""
    for j in range(len(ranked_sentences) // 10):
        summary += ranked_sentences[j][1] + " "
    return summary
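
# Usage sketch: because the loop above keeps roughly the top 10% of sentences, inputs
# with fewer than ten sentences come back as an empty summary.
# english_summary = summarize(long_article_text)  # long_article_text is a placeholder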
def translate(text):
    # Summarize with the TextRank pipeline above, then translate the summary to Arabic
    summarized = summarize(text)
    tokenized = tokenizer([summarized], return_tensors="np")
    out = model.generate(**tokenized)
    arabic = tokenizer.decode(out[0], skip_special_tokens=True)
    return arabic
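
# Note: the Gradio interface below is wired to translate_pretrained; pointing fn at
# translate instead would use the TextRank-based summarizer defined above.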
demo = gr.Interface(fn=translate_pretrained, inputs="text", outputs="text")
demo.launch(share=True)