Spaces:
Sleeping
Sleeping
File size: 4,098 Bytes
dcbadce 51f87ca 2ee2bc5 51f87ca 268dbda dcbadce d1fbd17 dcbadce e69c39b dcbadce 4b74149 dcbadce 4b74149 5a73569 61376dc 436765e 78ab44a dcbadce 43629d4 dcbadce e69c39b dcbadce 41a0f6d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
# -*- coding: utf-8 -*-
"""ArabicPoetryGeneration.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1HDyT5F8qnrbR_PW_HYpiM3O-7i6htGG2
"""
'''
pip install transformers
pip install tashaphyne
pip install gradio
pip install translate
'''
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer
from transformers import AutoTokenizer
import random
from tashaphyne import normalize
import re
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, GRU
import tensorflow as tf
from transformers import AutoTokenizer
# Fetch the NLTK resources this script asks for at start-up.
for nltk_resource in ('punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Read the poems CSV straight from the dataset repository into a DataFrame.
aurl = 'https://raw.githubusercontent.com/Obai33/NLP_PoemGenerationDatasets/main/arabicpoems.csv'
adf = pd.read_csv(aurl)
# Function to normalize text
def normalize_text(text):
    """Return *text* after running it through tashaphyne's normalizers.

    The helpers are applied in this order: ``strip_tashkeel``,
    ``strip_tatweel``, ``normalize_hamza``, ``normalize_lamalef``.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string produced by the last helper in the chain.
    """
    # Strings are immutable, so every helper hands back a new string.  The
    # result must be threaded through each step; discarding it would return
    # the input untouched.
    text = normalize.strip_tashkeel(text)
    text = normalize.strip_tatweel(text)
    text = normalize.normalize_hamza(text)
    text = normalize.normalize_lamalef(text)
    return text
# Normalize the text
# NOTE(review): this literal looks mis-encoded (mojibake); it is presumably
# meant to be an Arabic word -- confirm against the original notebook.
allah = normalize_text('ุงููู')

# Keep only the poem column and drop missing entries so the string helpers
# below never receive NaN.
adf = adf['poem_text'].dropna()

# The seed itself is drawn at random, so every run works on a different
# sample.  Cap the sample size so a small dataset does not raise.
i = random.randint(0, len(adf))
adf = adf.sample(n=min(100, len(adf)), random_state=i)
adf = adf.apply(normalize_text)

# Discard every poem that contains the filter word.  The word is matched
# literally (not as a regular expression) and missing values count as
# "does not contain", which keeps the mask strictly boolean for `~`.
adf = adf[~adf.str.contains(allah, regex=False, na=False)]
# Function to clean text
def remove_non_arabic_symbols(text):
    """Strip everything except Arabic-block characters and whitespace.

    Args:
        text: The string to clean.

    Returns:
        A string made of every run of characters in U+0600..U+06FF or
        whitespace found in *text*, concatenated in their original order.
    """
    kept_runs = (
        match.group(0)
        for match in re.finditer(r'[\u0600-\u06FF\s]+', text)
    )
    return ''.join(kept_runs)
# Clean the text
adf = adf.apply(remove_non_arabic_symbols)

# Tokenize the text
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
# NOTE(review): `tokens` is not referenced again in the visible source.
poem_list = adf.tolist()
tokens = tokenizer.tokenize(poem_list, is_split_into_words=True)

# Every poem contributes all of its token-id prefixes of length >= 2, so
# each prefix can later be split into "context" and "next token".
input_sequences = []
for poem in adf:
    encoded = tokenizer.encode(poem, add_special_tokens=True)
    input_sequences.extend(
        encoded[:end] for end in range(2, len(encoded) + 1)
    )

# Left-pad every prefix to one fixed length.
max_sequence_len = 100
input_sequences = np.array(
    pad_sequences(input_sequences, padding='pre', maxlen=max_sequence_len)
)

# The last column is the token to predict; everything before it is the input.
total_words = tokenizer.vocab_size
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]
# NOTE(review): the one-hot matrix has one column per vocabulary entry, so
# its size is (number of prefixes) x (vocab_size) -- check memory use.
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)
print('error not here')
############## Model loading
import requests
# The triple-quoted string below is a bare expression, not executed code:
# the download logic inside it is disabled and never runs.
'''
# URL of the model
url = 'https://github.com/Obai33/NLP_PoemGenerationDatasets/raw/main/modelarab1.h5'
# Local file path to save the model
local_filename = 'modelarab1.h5'
# Download the model file
response = requests.get(url)
with open(local_filename, 'wb') as f:
f.write(response.content)
'''
# Load the saved Keras model from the relative path 'my_model'; it is stored
# in the module-level name `model`, which generate_arabic_text() reads.
model = tf.keras.models.load_model('my_model')
# Progress marker printed once the model has been loaded.
print('ok model loaded')
############## End of model loading
# Import the necessary library for translation
import translate
# Function to translate text to English
def translate_to_english(text):
    """Translate Arabic *text* into English using the ``translate`` package.

    Args:
        text: The Arabic string to translate.

    Returns:
        Whatever ``Translator.translate`` returns for *text*.
    """
    # A new Arabic -> English translator is built on every call.
    return translate.Translator(to_lang="en", from_lang="ar").translate(text)
def generate_arabic_text(seed_text, next_words=50):
    """Extend *seed_text* one predicted token at a time, then translate it.

    Args:
        seed_text: Starting text that the model continues.
        next_words: Number of prediction steps to run (default 50).

    Returns:
        A ``(arabic_text, english_text)`` tuple: the generated text with
        every ``" ##"`` sequence removed, and the result of passing that
        text to ``translate_to_english``.
    """
    running_text = seed_text
    # The model input is one shorter than the padded training sequences.
    context_len = max_sequence_len - 1

    for _step in range(next_words):
        # Re-encode the whole text so far and left-pad it to the input size.
        encoded = tokenizer.encode(running_text, add_special_tokens=False)
        model_input = pad_sequences([encoded], padding='pre', maxlen=context_len)

        # Take the highest-scoring token id and append its decoded text.
        scores = model.predict(model_input)
        best_ids = np.argmax(scores, axis=-1)
        running_text = running_text + " " + tokenizer.decode(best_ids[0])

    # Glue word-piece continuations back onto the preceding piece.
    arabic_text = running_text.replace(" ##", "")
    return arabic_text, translate_to_english(arabic_text)
import gradio as gr
# Progress marker printed before the interface is built.
print('error not here')

# Update Gradio interface to include both Arabic and English outputs:
# one text input, and two text outputs matching the
# (arabic_text, english_text) tuple returned by generate_arabic_text.
iface = gr.Interface(
    fn=generate_arabic_text,
    inputs="text",
    outputs=["text", "text"],
    title="Arabic Poetry Generation",
    description="Enter Arabic text to generate a small poem.",
    # NOTE(review): confirm "compact" is still a valid theme name for the
    # installed Gradio version.
    theme="compact",
)

# Run the interface.  The stray trailing "|" that followed this call was a
# syntax error and has been removed.
iface.launch(share=True)