DANI001 committed
Commit 909299f
1 Parent(s): c405041

Upload urmbertmodel.ipynb

Files changed (1)
  1. urmbertmodel.ipynb +191 -0
urmbertmodel.ipynb ADDED
@@ -0,0 +1,191 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "c6866894",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "7b293125",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "1ddac164d1df40438dddfddf1730f471",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Map: 0%| | 0/2312 [00:00<?, ? examples/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments\n",
+ "from datasets import Dataset, DatasetDict\n",
+ "import torch\n",
+ "\n",
+ "# Load dataset\n",
+ "data = pd.read_csv('C:/Users/Administrator/Downloads/ds_2300_Sheet1.csv')\n",
+ "\n",
+ "# Remove 'id' column\n",
+ "data = data.drop(columns=['id'])\n",
+ "\n",
+ "# Add a dummy label column\n",
+ "data['label'] = 0\n",
+ "\n",
+ "# Cast labels to float: with num_labels=1 the model has a regression head, which expects float targets\n",
+ "data['label'] = data['label'].astype(float)\n",
62
+ "\n",
63
+ "# Convert to Hugging Face dataset\n",
64
+ "dataset = Dataset.from_pandas(data)\n",
65
+ "\n",
66
+ "# Loading pre-trained uncased multilingual BERT model and tokenizer\n",
67
+ "model_name = 'bert-base-multilingual-uncased'\n",
68
+ "tokenizer = BertTokenizer.from_pretrained(model_name)\n",
69
+ "model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1) # Adjust num_labels if needed\n",
70
+ "\n",
71
+ "# Tokenization function\n",
72
+ "def tokenize_function(examples):\n",
73
+ " return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512) # Adjust max_length if needed\n",
74
+ "\n",
75
+ "# Tokenize the dataset\n",
76
+ "tokenized_datasets = dataset.map(tokenize_function, batched=True)\n",
77
+ "\n",
78
+ "# Split the dataset\n",
79
+ "split_datasets = tokenized_datasets.train_test_split(test_size=0.1)\n",
80
+ "train_dataset = split_datasets['train']\n",
81
+ "eval_dataset = split_datasets['test']\n",
82
+ "\n",
+ "# Set the datasets to return PyTorch tensors for the model inputs and labels\n",
+ "def format_dataset(dataset):\n",
+ " return dataset.with_format('torch', columns=['input_ids', 'attention_mask', 'label'])\n",
+ "\n",
+ "train_dataset = format_dataset(train_dataset)\n",
+ "eval_dataset = format_dataset(eval_dataset)\n",
+ "\n",
+ "# Define training arguments\n",
+ "training_args = TrainingArguments(\n",
+ " output_dir='./results',\n",
+ " evaluation_strategy='epoch',\n",
+ " learning_rate=2e-5,\n",
+ " per_device_train_batch_size=8,\n",
+ " per_device_eval_batch_size=8,\n",
+ " num_train_epochs=5,\n",
+ " weight_decay=0.01,\n",
+ ")\n",
+ "\n",
+ "# Define Trainer\n",
+ "trainer = Trainer(\n",
+ " model=model,\n",
+ " args=training_args,\n",
+ " train_dataset=train_dataset,\n",
+ " eval_dataset=eval_dataset,\n",
+ " tokenizer=tokenizer,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2d533c43",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " <div>\n",
+ " \n",
+ " <progress value='187' max='1300' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+ " [ 187/1300 17:25 < 1:44:52, 0.18 it/s, Epoch 0.72/5]\n",
+ " </div>\n",
+ " <table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: left;\">\n",
+ " <th>Epoch</th>\n",
+ " <th>Training Loss</th>\n",
+ " <th>Validation Loss</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " </tbody>\n",
+ "</table><p>"
+ ],
+ "text/plain": [
+ "<IPython.core.display.HTML object>"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Train the model\n",
+ "trainer.train()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3ca0b7d2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Evaluate the model\n",
+ "eval_results = trainer.evaluate()\n",
+ "print(\"Evaluation Results:\", eval_results)\n",
+ "\n",
+ "# Save the model\n",
+ "model.save_pretrained('./fine-tuned-bert-urdu')\n",
+ "tokenizer.save_pretrained('./fine-tuned-bert-urdu')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
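
Note on the label setup in the second cell: with num_labels=1, BertForSequenceClassification treats the task as regression (transformers infers problem_type "regression" and trains with MSE loss), which is why the dummy labels are cast to float rather than int. If the intent is binary classification of the Urdu text, a minimal sketch of the variant setup (assuming the CSV carries a real integer label column; everything else as in the notebook):

    # Hypothetical binary-classification variant of the notebook's model setup.
    from transformers import BertForSequenceClassification

    model = BertForSequenceClassification.from_pretrained(
        'bert-base-multilingual-uncased',
        num_labels=2,  # two classes -> cross-entropy loss
    )
    # Labels must then stay integers rather than floats:
    # data['label'] = data['label'].astype(int)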
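
For completeness, a minimal sketch of loading the checkpoint saved by the last cell and scoring a new sentence. The directory name comes from the notebook; the input string and variable names are illustrative placeholders:

    import torch
    from transformers import BertTokenizer, BertForSequenceClassification

    # Reload the fine-tuned checkpoint written by save_pretrained above.
    model = BertForSequenceClassification.from_pretrained('./fine-tuned-bert-urdu')
    tokenizer = BertTokenizer.from_pretrained('./fine-tuned-bert-urdu')
    model.eval()

    text = "یہ ایک مثال ہے"  # placeholder Urdu sentence, not from the dataset
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits  # shape (1, 1) because num_labels=1
    print(logits.squeeze().item())       # single regression-style score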