"""Multi-label toxic-comment classifier (Jigsaw-style, 6 labels) plus a
Hindi->English translation demo, both exposed through one Gradio interface.

Pipeline: load train.csv -> TextVectorization -> tf.data pipeline ->
BiLSTM model -> train/evaluate -> save & reload -> serve via Gradio.
"""
import os

import gradio as gr
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import (
    LSTM,
    Bidirectional,
    Dense,
    Embedding,
    TextVectorization,
)
from tensorflow.keras.models import Sequential
from transformers import pipeline

# --- Data loading and text vectorization -----------------------------------
data = pd.read_csv('train.csv')
x = data['comment_text']
# Columns 2+ hold the six binary toxicity labels (multi-label targets).
y = data[data.columns[2:]].values

MAX_FEATURES = 200_000  # vocabulary cap for TextVectorization

vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_sequence_length=1800,
    output_mode='int',
)
# adapt() must run before the vocabulary is usable.
vectorizer.adapt(x.values)
vectorized_text = vectorizer(x.values)

# --- tf.data pipeline: cache -> shuffle -> batch -> prefetch ----------------
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160_000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

# 70 / 20 / 10 train / val / test split, measured in batches.
n_batches = len(dataset)
train = dataset.take(int(n_batches * 0.7))
val = dataset.skip(int(n_batches * 0.7)).take(int(n_batches * 0.2))
test = dataset.skip(int(n_batches * 0.9)).take(int(n_batches * 0.1))

# --- Model ------------------------------------------------------------------
model = Sequential([
    # +1 reserves index 0/OOV slots produced by TextVectorization.
    Embedding(MAX_FEATURES + 1, 32),
    Bidirectional(LSTM(32, activation='tanh')),
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # Six independent sigmoids -> one probability per toxicity label.
    Dense(6, activation='sigmoid'),
])

model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

history = model.fit(train, epochs=10, validation_data=val)
model.evaluate(test)

# Persist, then reload so the served model is the on-disk artifact.
model.save('finalprojecttoxic.h5')
model = tf.keras.models.load_model('finalprojecttoxic.h5')

# --- Hindi -> English translation pipeline ----------------------------------
translator_hindi = pipeline(
    "translation",
    model="Helsinki-NLP/opus-mt-hi-en",
    tokenizer="Helsinki-NLP/opus-mt-hi-en",
)


def translate_hindi(from_text):
    """Translate Hindi *from_text* to English via the opus-mt-hi-en model."""
    result2 = translator_hindi(from_text)
    return result2[0]['translation_text']


def score_comment(comment):
    """Return one 'label: True/False' line per toxicity label for *comment*."""
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    text = ''
    for idx, col in enumerate(data.columns[2:]):
        # 0.5 decision threshold on each sigmoid output.
        text += '{}: {}\n'.format(col, results[0][idx] > 0.5)
    return text


def combined_models(user_text):
    """Run both models on the same input; return (translation, label report).

    NOTE(review): translate_hindi expects Hindi while score_comment scores
    the raw text directly — English input to the hi->en translator will
    produce garbage; confirm this dual use is intended.
    """
    output1 = translate_hindi(user_text)
    output2 = score_comment(user_text)
    return output1, output2


interface = gr.Interface(
    fn=combined_models,
    inputs="text",
    outputs=["text", "text"],
    title="Toxic Comment Analyzer",
)
interface.launch(share=True)