File size: 4,098 Bytes
dcbadce
 
 
 
 
 
 
 
51f87ca
2ee2bc5
 
 
 
51f87ca
268dbda
dcbadce
 
 
 
 
 
 
 
 
 
 
 
 
 
d1fbd17
dcbadce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e69c39b
dcbadce
 
 
 
4b74149
dcbadce
 
 
 
 
 
 
 
 
4b74149
5a73569
61376dc
436765e
78ab44a
 
dcbadce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43629d4
dcbadce
 
 
 
 
 
 
e69c39b
dcbadce
 
 
 
 
 
 
 
 
 
 
41a0f6d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# -*- coding: utf-8 -*-
"""ArabicPoetryGeneration.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1HDyT5F8qnrbR_PW_HYpiM3O-7i6htGG2
"""
'''
pip install transformers
pip install tashaphyne
pip install gradio
pip install translate
'''

import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer
from transformers import AutoTokenizer
import random
from tashaphyne import normalize
import re
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, GRU
import tensorflow as tf
from transformers import AutoTokenizer

# Fetch the NLTK resources this script asks for.
for _nltk_resource in ('punkt', 'wordnet'):
    nltk.download(_nltk_resource)

# Load the Arabic poems CSV straight from its raw GitHub URL.
aurl = 'https://raw.githubusercontent.com/Obai33/NLP_PoemGenerationDatasets/main/arabicpoems.csv'
adf = pd.read_csv(aurl)

# Function to normalize text
def normalize_text(text):
    """Return *text* after running tashaphyne's normalization helpers on it.

    The helpers are applied in this order: ``strip_tashkeel``,
    ``strip_tatweel``, ``normalize_hamza``, ``normalize_lamalef``.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    # Python strings are immutable, so each helper hands back a new string.
    # The result must be re-bound at every step; calling the helpers and
    # discarding their return values leaves ``text`` unchanged.
    text = normalize.strip_tashkeel(text)
    text = normalize.strip_tatweel(text)
    text = normalize.normalize_hamza(text)
    text = normalize.normalize_lamalef(text)
    return text

# Normalize the text
# The filter word is the Arabic word "Allah". It must be stored as real Arabic
# characters: a mis-decoded (mojibake) literal can never match the poems, which
# silently turns the filter below into a no-op.
allah = normalize_text('الله')
adf = adf['poem_text']
# Pick a fresh random seed on each run, then draw 100 poems with it.
i = random.randint(0, len(adf))
adf = adf.sample(n=100, random_state=i)
adf = adf.apply(normalize_text)
# Drop every sampled poem that contains the filter word.
adf = adf[~adf.str.contains(allah)]

# Function to clean text
def remove_non_arabic_symbols(text):
    """Return *text* reduced to Arabic-block characters and whitespace.

    Every maximal run of characters in U+0600..U+06FF or whitespace is kept,
    in its original order; everything between those runs is dropped.
    """
    kept_runs = (
        match.group(0)
        for match in re.finditer(r'[\u0600-\u06FF\s]+', text)
    )
    return ''.join(kept_runs)

# Strip everything except Arabic-block characters and whitespace.
adf = adf.apply(remove_non_arabic_symbols)

# Load the AraBERT v2 tokenizer and tokenize the whole corpus once.
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
tokens = tokenizer.tokenize(adf.tolist(), is_split_into_words=True)

# For each poem, emit every prefix of its token ids that is at least two ids
# long; the last id of each prefix is the prediction target.
input_sequences = []
for poem in adf:
    poem_ids = tokenizer.encode(poem, add_special_tokens=True)
    input_sequences.extend(
        poem_ids[:prefix_end] for prefix_end in range(2, len(poem_ids) + 1)
    )

# Left-pad all prefixes to one fixed length.
max_sequence_len = 100
padded_prefixes = pad_sequences(
    input_sequences, maxlen=max_sequence_len, padding='pre'
)
input_sequences = np.array(padded_prefixes)

total_words = tokenizer.vocab_size

# Inputs are all columns but the last; the last column is the label, which is
# then one-hot encoded over the tokenizer vocabulary.
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

print('error not here')

##############

import requests
# NOTE: the triple-quoted string below is disabled download code kept as a bare
# string expression. It is evaluated and discarded, so nothing is downloaded
# here; `requests` is only referenced inside that string.
'''
# URL of the model
url = 'https://github.com/Obai33/NLP_PoemGenerationDatasets/raw/main/modelarab1.h5'
# Local file path to save the model
local_filename = 'modelarab1.h5'

# Download the model file
response = requests.get(url)
with open(local_filename, 'wb') as f:
    f.write(response.content)
'''
# Load the Keras model from the relative path 'my_model'; it must already
# exist on disk because the download code above is disabled.
model = tf.keras.models.load_model('my_model')



# Progress marker printed once the model has loaded.
print('ok model loaded')
##############

# Import the necessary library for translation
import translate

# Function to translate text to English
def translate_to_english(text):
    """Translate Arabic *text* to English and return the translated result.

    A new ``translate.Translator`` configured for ``ar`` -> ``en`` is created
    on every call.
    """
    return translate.Translator(from_lang="ar", to_lang="en").translate(text)

def generate_arabic_text(seed_text, next_words=50):
    """Extend *seed_text* with model-predicted tokens and translate the result.

    Args:
        seed_text: Text that the generated output starts with.
        next_words: Number of tokens to predict and append (default 50).

    Returns:
        A ``(arabic_text, english_text)`` tuple: the generated text with
        ``" ##"`` continuation markers merged back into the preceding piece,
        and the result of ``translate_to_english`` for that text.
    """
    # Keep the running context as token ids. Re-encoding the growing string on
    # every step would push previously emitted " ##piece" fragments back
    # through the tokenizer as literal text, so the model would no longer see
    # the ids it actually predicted. It would also repeat the tokenization
    # work on every step.
    context_ids = list(tokenizer.encode(seed_text, add_special_tokens=False))
    generated_text = seed_text
    for _ in range(next_words):
        # Left-pad the context to the fixed input width the model is fed.
        model_input = pad_sequences(
            [context_ids], maxlen=max_sequence_len-1, padding='pre'
        )
        predicted = np.argmax(model.predict(model_input), axis=-1)
        next_id = int(predicted[0])
        context_ids.append(next_id)
        generated_text += " " + tokenizer.decode(next_id)
    # Glue word-piece continuations ("##...") back onto the preceding piece.
    reconnected_text = generated_text.replace(" ##", "")
    t_text = translate_to_english(reconnected_text)
    return reconnected_text, t_text

import gradio as gr
print('error not here')

# Gradio UI: one text input, two text outputs (the generated Arabic text and
# its English translation, in the order generate_arabic_text returns them).
iface = gr.Interface(
    generate_arabic_text,
    "text",
    ["text", "text"],
    title="Arabic Poetry Generation",
    description="Enter Arabic text to generate a small poem.",
    theme="compact",
)
# Start the app, passing share=True to launch().
iface.launch(share=True)