{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "a9a3a647", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WARNING:tensorflow:From C:\\Users\\karti\\anaconda3\\Lib\\site-packages\\keras\\src\\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.\n", "\n" ] } ], "source": [ "import os\n", "import pandas as pd\n", "import tensorflow as tf\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 167, "id": "52960768", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idcomment_texttoxicsevere_toxicobscenethreatinsultidentity_hate
00000997932d777bfExplanation\\nWhy the edits made under my usern...000000
1000103f0d9cfb60fD'aww! He matches this background colour I'm s...000000
2000113f07ec002fdHey man, I'm really not trying to edit war. It...000000
30001b41b1c6bb37e\"\\nMore\\nI can't make any real suggestions on ...000000
40001d958c54c6e35You, sir, are my hero. Any chance you remember...000000
\n", "
" ], "text/plain": [ " id comment_text toxic \\\n", "0 0000997932d777bf Explanation\\nWhy the edits made under my usern... 0 \n", "1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... 0 \n", "2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... 0 \n", "3 0001b41b1c6bb37e \"\\nMore\\nI can't make any real suggestions on ... 0 \n", "4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... 0 \n", "\n", " severe_toxic obscene threat insult identity_hate \n", "0 0 0 0 0 0 \n", "1 0 0 0 0 0 \n", "2 0 0 0 0 0 \n", "3 0 0 0 0 0 \n", "4 0 0 0 0 0 " ] }, "execution_count": 167, "metadata": {}, "output_type": "execute_result" }, { "name": "stdout", "output_type": "stream", "text": [ "1/1 [==============================] - 0s 327ms/step\n" ] } ], "source": [ "data=pd.read_csv('train.csv')\n", "data.head(5)" ] }, { "cell_type": "code", "execution_count": 4, "id": "4bb87073", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"Sorry if the word 'nonsense' was offensive to you. Anyway, I'm not intending to write anything in the article(wow they would jump on me for vandalism), I'm merely requesting that it be more encyclopedic so one can use it for school as a reference. I have been to the selective breeding page but it's almost a stub. It points to 'animal breeding' which is a short messy article that gives you no info. There must be someone around with expertise in eugenics? 
93.161.107.169\"" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data['comment_text'][8]" ] }, { "cell_type": "code", "execution_count": 5, "id": "c6e7509b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',\n", " 'insult', 'identity_hate'],\n", " dtype='object')" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ " data.columns" ] }, { "cell_type": "code", "execution_count": 6, "id": "2802af7a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(159571, 8)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.shape" ] }, { "cell_type": "code", "execution_count": 7, "id": "97449fcb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "toxic 0\n", "severe_toxic 0\n", "obscene 0\n", "threat 0\n", "insult 0\n", "identity_hate 0\n", "Name: 9, dtype: int64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[data.columns[2:]].iloc[9]" ] }, { "cell_type": "code", "execution_count": null, "id": "8844c1b7", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "bbd67b78", "metadata": {}, "source": [ "## Preprocessing" ] }, { "cell_type": "code", "execution_count": 8, "id": "6d23f922", "metadata": {}, "outputs": [], "source": [ "from tensorflow.keras.layers import TextVectorization" ] }, { "cell_type": "code", "execution_count": 9, "id": "a3d9e014", "metadata": {}, "outputs": [], "source": [ "x=data['comment_text']\n", "y=data[data.columns[2:]].values" ] }, { "cell_type": "code", "execution_count": 10, "id": "eb1eefc0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 Explanation\\nWhy the edits made under my usern...\n", "1 D'aww! He matches this background colour I'm s...\n", "2 Hey man, I'm really not trying to edit war. 
It...\n", "3 \"\\nMore\\nI can't make any real suggestions on ...\n", "4 You, sir, are my hero. Any chance you remember...\n", " ... \n", "159566 \":::::And for the second time of asking, when ...\n", "159567 You should be ashamed of yourself \\n\\nThat is ...\n", "159568 Spitzer \\n\\nUmm, theres no actual article for ...\n", "159569 And it looks like it was actually you who put ...\n", "159570 \"\\nAnd ... I really don't think you understand...\n", "Name: comment_text, Length: 159571, dtype: object" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x" ] }, { "cell_type": "code", "execution_count": 11, "id": "414f8a4c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " ...,\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0]], dtype=int64)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y" ] }, { "cell_type": "code", "execution_count": 12, "id": "70ec2244", "metadata": {}, "outputs": [], "source": [ "max_features=200000" ] }, { "cell_type": "code", "execution_count": 13, "id": "b6a83b69", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WARNING:tensorflow:From C:\\Users\\karti\\anaconda3\\Lib\\site-packages\\keras\\src\\backend.py:873: The name tf.get_default_graph is deprecated. 
Please use tf.compat.v1.get_default_graph instead.\n", "\n" ] } ], "source": [ "vectorizer=TextVectorization(max_tokens=max_features,\n", " output_sequence_length=1800,\n", " output_mode='int')" ] }, { "cell_type": "code", "execution_count": 14, "id": "ba246221", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['', '[UNK]']" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vectorizer.get_vocabulary()" ] }, { "cell_type": "code", "execution_count": 15, "id": "9648914d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WARNING:tensorflow:From C:\\Users\\karti\\anaconda3\\Lib\\site-packages\\keras\\src\\utils\\tf_utils.py:492: The name tf.ragged.RaggedTensorValue is deprecated. Please use tf.compat.v1.ragged.RaggedTensorValue instead.\n", "\n" ] } ], "source": [ "vectorizer.adapt(x.values)" ] }, { "cell_type": "code", "execution_count": 16, "id": "75b035a9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vectorizer(\"have you watched breaking bad\")[:5]" ] }, { "cell_type": "code", "execution_count": 17, "id": "8854984d", "metadata": {}, "outputs": [], "source": [ "vectorized_text=vectorizer(x.values)" ] }, { "cell_type": "code", "execution_count": 18, "id": "9fb407a3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vectorized_text" ] }, { "cell_type": "code", "execution_count": 19, "id": "0aa74efc", "metadata": {}, "outputs": [], "source": [ "# Input pipeline: slice (vectorized text, labels) pairs, cache, shuffle, batch, prefetch.\n", "dataset=tf.data.Dataset.from_tensor_slices((vectorized_text, y))\n", "dataset=dataset.cache()\n", "# Shuffle ONCE, in a fixed order. With the default reshuffle_each_iteration=True the\n", "# take()/skip() split applied to this dataset later is re-drawn on every pass, so\n", "# validation/test examples leak into training across epochs.\n", "# Trade-off: training batches are no longer reshuffled between epochs.\n", "dataset=dataset.shuffle(160000, seed=42, reshuffle_each_iteration=False)\n", "dataset=dataset.batch(16)\n", "dataset=dataset.prefetch(8)" ] }, { "cell_type": "code", "execution_count": 20, "id": "ff040bf8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ 
"9973.1875" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "159571/16" ] }, { "cell_type": "code", "execution_count": 21, "id": "fd8b18f5", "metadata": {}, "outputs": [], "source": [ "batch_x, batch_y = dataset.as_numpy_iterator().next()" ] }, { "cell_type": "code", "execution_count": 22, "id": "d81bb1af", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(16, 1800)" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "batch_x.shape" ] }, { "cell_type": "code", "execution_count": 23, "id": "2cfeca51", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(16, 6)" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "batch_y.shape" ] }, { "cell_type": "code", "execution_count": 24, "id": "9d8a90ce", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "9974" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(dataset)" ] }, { "cell_type": "code", "execution_count": 25, "id": "5a111205", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "6981" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "int(len(dataset)*.7)" ] }, { "cell_type": "code", "execution_count": 26, "id": "34094209", "metadata": {}, "outputs": [], "source": [ "train=dataset.take(int(len(dataset)*.7))\n", "val=dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))\n", "test=dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))" ] }, { "cell_type": "code", "execution_count": 27, "id": "2e5369af", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(6981, 1994, 997)" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(train),len(val),len(test)" ] }, { "cell_type": "code", "execution_count": 28, "id": "3bb32ca4", "metadata": {}, "outputs": [], "source": [ "train_generator=train.as_numpy_iterator()" ] }, { 
"cell_type": "code", "execution_count": 29, "id": "32f4500b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(array([[ 73, 9, 12, ..., 0, 0, 0],\n", " [182862, 88, 7, ..., 0, 0, 0],\n", " [ 4384, 274, 139, ..., 0, 0, 0],\n", " ...,\n", " [ 14, 9, 21, ..., 0, 0, 0],\n", " [ 1188, 399, 123, ..., 0, 0, 0],\n", " [ 46927, 175, 425, ..., 0, 0, 0]], dtype=int64),\n", " array([[0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [1, 0, 1, 0, 1, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0]], dtype=int64))" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_generator.next()" ] }, { "cell_type": "code", "execution_count": 30, "id": "cbc9a9b2", "metadata": {}, "outputs": [], "source": [ "from tensorflow.keras.models import Sequential\n", "from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding" ] }, { "cell_type": "code", "execution_count": 31, "id": "6dd6bf3d", "metadata": {}, "outputs": [], "source": [ "model=Sequential()" ] }, { "cell_type": "code", "execution_count": 32, "id": "e33e5c86", "metadata": {}, "outputs": [], "source": [ "model.add(Embedding(max_features+1, 32))\n", "model.add(Bidirectional(LSTM(32, activation='tanh')))\n", "model.add(Dense(128, activation='relu'))\n", "model.add(Dense(256, activation='relu'))\n", "model.add(Dense(128, activation='relu'))\n", "model.add(Dense(6, activation='sigmoid'))" ] }, { "cell_type": "code", "execution_count": 33, "id": "6821b620", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WARNING:tensorflow:From C:\\Users\\karti\\anaconda3\\Lib\\site-packages\\keras\\src\\optimizers\\__init__.py:309: The name 
tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.\n", "\n" ] } ], "source": [ "model.compile(loss='BinaryCrossentropy', optimizer='adam', metrics=['accuracy'])" ] }, { "cell_type": "code", "execution_count": 34, "id": "f06f01e5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model: \"sequential\"\n", "_________________________________________________________________\n", " Layer (type) Output Shape Param # \n", "=================================================================\n", " embedding (Embedding) (None, None, 32) 6400032 \n", " \n", " bidirectional (Bidirection (None, 64) 16640 \n", " al) \n", " \n", " dense (Dense) (None, 128) 8320 \n", " \n", " dense_1 (Dense) (None, 256) 33024 \n", " \n", " dense_2 (Dense) (None, 128) 32896 \n", " \n", " dense_3 (Dense) (None, 6) 774 \n", " \n", "=================================================================\n", "Total params: 6491686 (24.76 MB)\n", "Trainable params: 6491686 (24.76 MB)\n", "Non-trainable params: 0 (0.00 Byte)\n", "_________________________________________________________________\n" ] } ], "source": [ "model.summary()" ] }, { "cell_type": "code", "execution_count": 36, "id": "376ceed5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/10\n", "WARNING:tensorflow:From C:\\Users\\karti\\anaconda3\\Lib\\site-packages\\keras\\src\\engine\\base_layer_utils.py:384: The name tf.executing_eagerly_outside_functions is deprecated. 
Please use tf.compat.v1.executing_eagerly_outside_functions instead.\n", "\n", "6981/6981 [==============================] - 5071s 726ms/step - loss: 0.0635 - accuracy: 0.9855 - val_loss: 0.0452 - val_accuracy: 0.9946\n", "Epoch 2/10\n", "6981/6981 [==============================] - 4516s 647ms/step - loss: 0.0454 - accuracy: 0.9942 - val_loss: 0.0399 - val_accuracy: 0.9938\n", "Epoch 3/10\n", "6981/6981 [==============================] - 4100s 587ms/step - loss: 0.0407 - accuracy: 0.9889 - val_loss: 0.0373 - val_accuracy: 0.9941\n", "Epoch 4/10\n", "6981/6981 [==============================] - 4111s 589ms/step - loss: 0.0371 - accuracy: 0.9920 - val_loss: 0.0327 - val_accuracy: 0.9948\n", "Epoch 5/10\n", "6981/6981 [==============================] - 4691s 672ms/step - loss: 0.0334 - accuracy: 0.9941 - val_loss: 0.0302 - val_accuracy: 0.9940\n", "Epoch 6/10\n", "6981/6981 [==============================] - 5055s 724ms/step - loss: 0.0311 - accuracy: 0.9841 - val_loss: 0.0275 - val_accuracy: 0.9944\n", "Epoch 7/10\n", "6981/6981 [==============================] - 4508s 646ms/step - loss: 0.0277 - accuracy: 0.9937 - val_loss: 0.0245 - val_accuracy: 0.9930\n", "Epoch 8/10\n", "6981/6981 [==============================] - 4479s 642ms/step - loss: 0.0254 - accuracy: 0.9907 - val_loss: 0.0228 - val_accuracy: 0.9940\n", "Epoch 9/10\n", "6981/6981 [==============================] - 4501s 645ms/step - loss: 0.0228 - accuracy: 0.9892 - val_loss: 0.0193 - val_accuracy: 0.9950\n", "Epoch 10/10\n", "6981/6981 [==============================] - 4523s 648ms/step - loss: 0.0209 - accuracy: 0.9200 - val_loss: 0.0192 - val_accuracy: 0.9943\n" ] } ], "source": [ "history=model.fit(train, epochs=10, validation_data=val)" ] }, { "cell_type": "code", "execution_count": 37, "id": "cb6501e6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "997/997 [==============================] - 158s 146ms/step - loss: 0.0188 - accuracy: 0.9940\n" ] }, { "data": { 
"text/plain": [ "[0.018809018656611443, 0.9939819574356079]" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.evaluate(test)" ] }, { "cell_type": "code", "execution_count": 40, "id": "92408998", "metadata": {}, "outputs": [], "source": [ "x_batch, y_batch = test.as_numpy_iterator().next()" ] }, { "cell_type": "code", "execution_count": 41, "id": "1c555107", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1/1 [==============================] - 2s 2s/step\n" ] }, { "data": { "text/plain": [ "array([[0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [1, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [1, 0, 1, 0, 1, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0]])" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(model.predict(x_batch) > 0.5).astype(int)" ] }, { "cell_type": "code", "execution_count": 42, "id": "26a06914", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [1, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [1, 0, 1, 0, 1, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0]], dtype=int64)" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_batch" ] }, { "cell_type": "code", "execution_count": 49, "id": "0ef7c06b", "metadata": {}, "outputs": [], "source": [ "input_text=vectorizer('I am coming to kill you pal')" ] }, { "cell_type": "code", 
"execution_count": 50, "id": "5bb057fa", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "input_text[:7]" ] }, { "cell_type": "code", "execution_count": 51, "id": "7ab223e7", "metadata": {}, "outputs": [], "source": [ "batch=test.as_numpy_iterator().next()" ] }, { "cell_type": "code", "execution_count": 52, "id": "3986d97b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1/1 [==============================] - 0s 78ms/step\n" ] } ], "source": [ "res=model.predict(np.expand_dims(input_text,0))" ] }, { "cell_type": "code", "execution_count": 53, "id": "5df2d7da", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',\n", " 'identity_hate'],\n", " dtype='object')" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.columns[2:]" ] }, { "cell_type": "code", "execution_count": 54, "id": "ee22bb73", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0.54140395, 0.00114176, 0.01782109, 0.10045966, 0.0319472 ,\n", " 0.02094165]], dtype=float32)" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res" ] }, { "cell_type": "markdown", "id": "fa7378c8", "metadata": {}, "source": [ "## Save the Model" ] }, { "cell_type": "code", "execution_count": 59, "id": "c2b08a8c", "metadata": {}, "outputs": [], "source": [ "model.save('finalproject.keras')" ] }, { "cell_type": "code", "execution_count": 60, "id": "71e114bc", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\karti\\anaconda3\\Lib\\site-packages\\keras\\src\\engine\\training.py:3103: UserWarning: You are saving your model as an HDF5 file via `model.save()`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. 
`model.save('my_model.keras')`.\n", " saving_api.save_model(\n" ] } ], "source": [ "model.save('finalprojecttoxic.h5')" ] }, { "cell_type": "markdown", "id": "6abdcdb8", "metadata": {}, "source": [ "## Making a Language Translation" ] }, { "cell_type": "code", "execution_count": 97, "id": "442cd16b", "metadata": {}, "outputs": [], "source": [ "from transformers import pipeline" ] }, { "cell_type": "code", "execution_count": 125, "id": "95b31788", "metadata": {}, "outputs": [], "source": [ "translator_german=pipeline(\"translation\", model=\"Helsinki-NLP/opus-mt-de-en\", tokenizer=\"Helsinki-NLP/opus-mt-de-en\")" ] }, { "cell_type": "code", "execution_count": 120, "id": "7e882490", "metadata": {}, "outputs": [], "source": [ "german=\"Hallo, wie heißt du?\"" ] }, { "cell_type": "code", "execution_count": 126, "id": "dcfefba8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"Hello, what's your name?\"" ] }, "execution_count": 126, "metadata": {}, "output_type": "execute_result" } ], "source": [ "en_to_german=translator_german(german)\n", "en_to_german[0]['translation_text']" ] }, { "cell_type": "code", "execution_count": 107, "id": "ea54de34", "metadata": {}, "outputs": [], "source": [ "translator_spanish = pipeline(\"translation\", model=\"Helsinki-NLP/opus-mt-es-en\", tokenizer=\"Helsinki-NLP/opus-mt-es-en\")" ] }, { "cell_type": "code", "execution_count": 117, "id": "07f1c640", "metadata": {}, "outputs": [], "source": [ "spanish_text = \"hola como estas\"" ] }, { "cell_type": "code", "execution_count": 124, "id": "76b5f447", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Hello, how are you?'" ] }, "execution_count": 124, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Call the Spanish->English pipeline created above. The bare name `translator` is never\n", "# defined in this notebook, so it raised NameError on a fresh kernel (Restart & Run All).\n", "# Note: despite its name, en_to_spanish holds the Spanish->English translation result.\n", "en_to_spanish = translator_spanish(spanish_text)\n", "en_to_spanish[0]['translation_text']" ] }, { "cell_type": "markdown", "id": "e08fc4e7", "metadata": {}, "source": [ "## Test and Gradio" ] }, { "cell_type": "code", "execution_count": 61, "id": "7d5cdcb8", 
"metadata": {}, "outputs": [], "source": [ "import gradio as gr" ] }, { "cell_type": "code", "execution_count": 62, "id": "560ec8e5", "metadata": {}, "outputs": [], "source": [ "model=tf.keras.models.load_model('finalprojecttoxic.h5')" ] }, { "cell_type": "code", "execution_count": 73, "id": "aaf4a3cd", "metadata": {}, "outputs": [], "source": [ "input_str=vectorizer('Hey i freaking hate you!. I\\'m going to hurt you!')" ] }, { "cell_type": "code", "execution_count": 74, "id": "54761270", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1/1 [==============================] - 0s 88ms/step\n" ] } ], "source": [ "res=model.predict(np.expand_dims(input_str,0))" ] }, { "cell_type": "code", "execution_count": 75, "id": "ba15136b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0.9133858 , 0.00198671, 0.0333592 , 0.00411558, 0.71037763,\n", " 0.00563182]], dtype=float32)" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res" ] }, { "cell_type": "code", "execution_count": 72, "id": "c189f6c9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',\n", " 'identity_hate'],\n", " dtype='object')" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.columns[2:]" ] }, { "cell_type": "code", "execution_count": 122, "id": "8c1fbac0", "metadata": {}, "outputs": [], "source": [ "translator_hindi = pipeline(\"translation\", model=\"Helsinki-NLP/opus-mt-hi-en\", tokenizer=\"Helsinki-NLP/opus-mt-hi-en\")" ] }, { "cell_type": "code", "execution_count": 104, "id": "c8db9d6d", "metadata": {}, "outputs": [], "source": [ "hindi_text = \"नमस्ते, आप कैसे हैं?\"" ] }, { "cell_type": "code", "execution_count": 123, "id": "9c95d205", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Hello, how are you?'" ] }, "execution_count": 123, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "en_to_hin = translator_hindi(hindi_text)\n", "en_to_hin[0]['translation_text']" ] }, { "cell_type": "code", "execution_count": 131, "id": "3d25803f", "metadata": {}, "outputs": [], "source": [ "def translate_hindi(from_text):\n", "    \"\"\"Run `from_text` through the Hindi->English pipeline and return the translated string.\"\"\"\n", "    result2 = translator_hindi(from_text)\n", "\n", "    # The pipeline returns a list with one dict per input; take the first translation.\n", "    return result2[0]['translation_text']" ] }, { "cell_type": "code", "execution_count": 133, "id": "52108859", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Hello, how are you?'" ] }, "execution_count": 133, "metadata": {}, "output_type": "execute_result" } ], "source": [ "translate_hindi('नमस्ते, आप कैसे हैं?')" ] }, { "cell_type": "code", "execution_count": 94, "id": "837c3093", "metadata": {}, "outputs": [], "source": [ "def score_comment(comment):\n", "    \"\"\"Score a single comment with the toxicity model.\n", "\n", "    The comment is vectorized and passed to `model.predict`; each label column in\n", "    `data.columns[2:]` is then reported on its own line as `label: True/False`, where\n", "    True means the model output for that label is above 0.5.\n", "\n", "    Returns the report as one newline-separated string.\n", "    \"\"\"\n", "    vectorized_comment = vectorizer([comment])\n", "    results=model.predict(vectorized_comment)\n", "\n", "    text=''\n", "    for idx, col in enumerate(data.columns[2:]):\n", "        text+= '{}: {}\\n'.format(col, results[0][idx]>0.5)\n", "\n", "    return text" ] }, { "cell_type": "code", "execution_count": 163, "id": "21ea015f", "metadata": {}, "outputs": [], "source": [ "def combined_models(input):\n", "    \"\"\"Gradio callback: return (Hindi->English translation, toxicity report) for `input`.\n", "\n", "    Text containing Devanagari characters is scored on its English translation rather\n", "    than on the raw string, which is what this function scored previously. The\n", "    vectorizer and model were fitted on the comments in train.csv (English in the\n", "    samples shown), so raw Hindi presumably maps mostly to [UNK] tokens - confirm.\n", "    Text without Devanagari characters is scored exactly as typed, as before.\n", "    \"\"\"\n", "    output1=translate_hindi(input)\n", "    # U+0900..U+097F is the Unicode Devanagari block.\n", "    is_hindi=any(0x0900 <= ord(ch) <= 0x097F for ch in input)\n", "    output2=score_comment(output1 if is_hindi else input)\n", "\n", "    return output1, output2" ] }, { "cell_type": "code", "execution_count": 166, "id": "ca5d14a9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1/1 [==============================] - 0s 109ms/step\n" ] } ], "source": [ "interface = gr.Interface(fn=combined_models, inputs=\"text\", outputs=[\"text\",\"text\"],title=\"Toxic Comment Analyzer\")" ] }, { "cell_type": "code", "execution_count": 168, "id": "cb485bb9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Running on local URL: http://127.0.0.1:7871\n", "Running on public URL: https://27f88e54e3177749fa.gradio.live\n", "\n", "This share link expires in 72 hours. 
For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n" ] }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 168, "metadata": {}, "output_type": "execute_result" }, { "name": "stdout", "output_type": "stream", "text": [ "1/1 [==============================] - 0s 426ms/step\n" ] } ], "source": [ "interface.launch(share=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "e30aa7aa", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.3" } }, "nbformat": 4, "nbformat_minor": 5 }