DANI001
/

bert-b-m-uncased

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "c6866894",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import warnings\n",
+    "\n",
+    "# Globally silence every warning category for the rest of the kernel session\n",
+    "warnings.filterwarnings(action=\"ignore\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "7b293125",
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1ddac164d1df40438dddfddf1730f471",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/2312 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments\n",
+    "from datasets import Dataset, DatasetDict\n",
+    "import torch\n",
+    "\n",
+    "# Configuration: values a reader is most likely to change, gathered in one place\n",
+    "DATA_PATH = 'C:/Users/Administrator/Downloads/ds_2300_Sheet1.csv'  # TODO: machine-specific absolute path; point at your own copy\n",
+    "SPLIT_SEED = 42  # fixed seed so the train/eval split is the same on every run\n",
+    "\n",
+    "# Load dataset\n",
+    "data = pd.read_csv(DATA_PATH)\n",
+    "\n",
+    "# Remove 'id' column\n",
+    "data = data.drop(columns=['id'])\n",
+    "\n",
+    "# Add a dummy label column. It is deliberately float: the model below is created with\n",
+    "# num_labels=1, and in that configuration BertForSequenceClassification computes a\n",
+    "# regression (mean-squared-error) loss, which needs floating-point targets.\n",
+    "# NOTE(review): every row gets the same target (0.0), so training can only teach the head\n",
+    "# to output a constant. Replace this with real labels before relying on the results.\n",
+    "data['label'] = 0.0\n",
+    "\n",
+    "# Convert to Hugging Face dataset\n",
+    "dataset = Dataset.from_pandas(data)\n",
+    "\n",
+    "# Loading pre-trained uncased multilingual BERT model and tokenizer\n",
+    "model_name = 'bert-base-multilingual-uncased'\n",
+    "tokenizer = BertTokenizer.from_pretrained(model_name)\n",
+    "model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1)  # single output => regression head\n",
+    "\n",
+    "\n",
+    "def tokenize_function(examples):\n",
+    "    \"\"\"Tokenize the 'text' field of a batch, padding/truncating every sequence to 512 tokens.\"\"\"\n",
+    "    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)  # Adjust max_length if needed\n",
+    "\n",
+    "\n",
+    "# Tokenize the dataset\n",
+    "tokenized_datasets = dataset.map(tokenize_function, batched=True)\n",
+    "\n",
+    "# Split the dataset (seeded, so the held-out 10% is reproducible)\n",
+    "split_datasets = tokenized_datasets.train_test_split(test_size=0.1, seed=SPLIT_SEED)\n",
+    "train_dataset = split_datasets['train']\n",
+    "eval_dataset = split_datasets['test']\n",
+    "\n",
+    "\n",
+    "def format_dataset(dataset):\n",
+    "    \"\"\"Return `dataset` formatted to yield PyTorch tensors for input_ids, attention_mask and label only.\"\"\"\n",
+    "    return dataset.with_format('torch', columns=['input_ids', 'attention_mask', 'label'])\n",
+    "\n",
+    "\n",
+    "# Expose only the model inputs and the (float) label, as PyTorch tensors\n",
+    "train_dataset = format_dataset(train_dataset)\n",
+    "eval_dataset = format_dataset(eval_dataset)\n",
+    "\n",
+    "# Define training arguments\n",
+    "training_args = TrainingArguments(\n",
+    "    output_dir='./results',\n",
+    "    evaluation_strategy='epoch',\n",
+    "    learning_rate=2e-5,\n",
+    "    per_device_train_batch_size=8,\n",
+    "    per_device_eval_batch_size=8,\n",
+    "    num_train_epochs=5,\n",
+    "    weight_decay=0.01,\n",
+    ")\n",
+    "\n",
+    "# Define Trainer\n",
+    "trainer = Trainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=train_dataset,\n",
+    "    eval_dataset=eval_dataset,\n",
+    "    tokenizer=tokenizer,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2d533c43",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='187' max='1300' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [ 187/1300 17:25 < 1:44:52, 0.18 it/s, Epoch 0.72/5]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Epoch</th>\n",
+       "      <th>Training Loss</th>\n",
+       "      <th>Validation Loss</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Run the fine-tuning loop configured on the Trainer\n",
+    "trainer.train()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3ca0b7d2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Score the model on the held-out split and report the metrics\n",
+    "eval_results = trainer.evaluate()\n",
+    "print(\"Evaluation Results:\", eval_results)\n",
+    "\n",
+    "# Persist the model weights and the tokenizer files side by side in one directory\n",
+    "save_dir = './fine-tuned-bert-urdu'\n",
+    "model.save_pretrained(save_dir)\n",
+    "tokenizer.save_pretrained(save_dir)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}