ankitkupadhyay committed on
Commit
dce0000
·
1 Parent(s): 43789f0

Delete data_exploration.ipynb

Browse files
Files changed (1) hide show
  1. data_exploration.ipynb +0 -394
data_exploration.ipynb DELETED
@@ -1,394 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 1,
6
- "metadata": {},
7
- "outputs": [],
8
- "source": [
9
- "import pandas as pd\n",
10
- "import numpy as np\n",
11
- "from sklearn.model_selection import train_test_split\n",
12
- "import matplotlib.pyplot as plt\n",
13
- "import cv2\n",
14
- "import os\n",
15
- "from PIL import Image\n",
16
- "from tqdm import tqdm\n",
17
- "import torch\n",
18
- "from torch.utils.data import Dataset, DataLoader\n",
19
- "from torchvision import transforms"
20
- ]
21
- },
22
- {
23
- "cell_type": "markdown",
24
- "metadata": {},
25
- "source": [
26
- "# creating train and test dataset"
27
- ]
28
- },
29
- {
30
- "cell_type": "code",
31
- "execution_count": 2,
32
- "metadata": {},
33
- "outputs": [],
34
- "source": [
35
- "def getData(type):\n",
36
- " df = list()\n",
37
- " directory = f'D-Fire/{type}/labels' \n",
38
- " n = len(os.listdir(directory))\n",
39
- " for filename in tqdm(os.listdir(directory)):\n",
40
- " f = os.path.join(directory, filename)\n",
41
- " # print(f)\n",
42
- "\n",
43
- " image = filename[:-3] + 'jpg'\n",
44
- " # print(image)\n",
45
- " # break\n",
46
- " img = Image.open(f'D-Fire/{type}/images/{image}')\n",
47
- " width, height = img.size\n",
48
- " # print(width, height)\n",
49
- " # plt.imshow(img)\n",
50
- " # plt.show()\n",
51
- " # break\n",
52
- " pre = [image, width, height]\n",
53
- " if os.path.getsize(f) == 0:\n",
54
- " dp = pre + [2]\n",
55
- " df.append(dp)\n",
56
- " else:\n",
57
- " with open(f) as fp:\n",
58
- " lines = fp.readlines()\n",
59
- " for line in lines:\n",
60
- " line = line.split()\n",
61
- " line = list(map(float, line))\n",
62
- " line[0] = int(line[0])\n",
63
- " # line.insert(0, image)\n",
64
- " dp = pre + line\n",
65
- " df.append(dp)\n",
66
- " fp.close()\n",
67
- " return df, n"
68
- ]
69
- },
70
- {
71
- "cell_type": "code",
72
- "execution_count": 3,
73
- "metadata": {},
74
- "outputs": [
75
- {
76
- "name": "stderr",
77
- "output_type": "stream",
78
- "text": [
79
- "100%|██████████| 17221/17221 [00:11<00:00, 1447.90it/s]\n",
80
- "100%|██████████| 4306/4306 [00:03<00:00, 1340.39it/s]\n"
81
- ]
82
- }
83
- ],
84
- "source": [
85
- "# get train and test data\n",
86
- "train, n_train = getData(\"train\")\n",
87
- "df_train = pd.DataFrame(train, columns= [\"Image\", \"Width\", \"Height\", \"Label\", \"x_min\", \"y_min\", \"x_max\", \"y_max\"])\n",
88
- "test, n_test = getData(\"test\")\n",
89
- "df_test = pd.DataFrame(test, columns= [\"Image\", \"Width\", \"Height\", \"Label\", \"x_min\", \"y_min\", \"x_max\", \"y_max\"])"
90
- ]
91
- },
92
- {
93
- "cell_type": "code",
94
- "execution_count": 4,
95
- "metadata": {},
96
- "outputs": [
97
- {
98
- "data": {
99
- "text/html": [
100
- "<div>\n",
101
- "<style scoped>\n",
102
- " .dataframe tbody tr th:only-of-type {\n",
103
- " vertical-align: middle;\n",
104
- " }\n",
105
- "\n",
106
- " .dataframe tbody tr th {\n",
107
- " vertical-align: top;\n",
108
- " }\n",
109
- "\n",
110
- " .dataframe thead th {\n",
111
- " text-align: right;\n",
112
- " }\n",
113
- "</style>\n",
114
- "<table border=\"1\" class=\"dataframe\">\n",
115
- " <thead>\n",
116
- " <tr style=\"text-align: right;\">\n",
117
- " <th></th>\n",
118
- " <th>Image</th>\n",
119
- " <th>Width</th>\n",
120
- " <th>Height</th>\n",
121
- " <th>Label</th>\n",
122
- " <th>x_min</th>\n",
123
- " <th>y_min</th>\n",
124
- " <th>x_max</th>\n",
125
- " <th>y_max</th>\n",
126
- " </tr>\n",
127
- " </thead>\n",
128
- " <tbody>\n",
129
- " <tr>\n",
130
- " <th>0</th>\n",
131
- " <td>AoF05695.jpg</td>\n",
132
- " <td>1280</td>\n",
133
- " <td>720</td>\n",
134
- " <td>0</td>\n",
135
- " <td>0.700781</td>\n",
136
- " <td>0.379167</td>\n",
137
- " <td>0.039062</td>\n",
138
- " <td>0.105556</td>\n",
139
- " </tr>\n",
140
- " <tr>\n",
141
- " <th>1</th>\n",
142
- " <td>WEB08898.jpg</td>\n",
143
- " <td>640</td>\n",
144
- " <td>360</td>\n",
145
- " <td>0</td>\n",
146
- " <td>0.477344</td>\n",
147
- " <td>0.291667</td>\n",
148
- " <td>0.264063</td>\n",
149
- " <td>0.555556</td>\n",
150
- " </tr>\n",
151
- " <tr>\n",
152
- " <th>2</th>\n",
153
- " <td>WEB01102.jpg</td>\n",
154
- " <td>640</td>\n",
155
- " <td>360</td>\n",
156
- " <td>2</td>\n",
157
- " <td>NaN</td>\n",
158
- " <td>NaN</td>\n",
159
- " <td>NaN</td>\n",
160
- " <td>NaN</td>\n",
161
- " </tr>\n",
162
- " <tr>\n",
163
- " <th>3</th>\n",
164
- " <td>WEB07573.jpg</td>\n",
165
- " <td>1100</td>\n",
166
- " <td>619</td>\n",
167
- " <td>0</td>\n",
168
- " <td>0.465000</td>\n",
169
- " <td>0.475767</td>\n",
170
- " <td>0.290000</td>\n",
171
- " <td>0.906300</td>\n",
172
- " </tr>\n",
173
- " <tr>\n",
174
- " <th>4</th>\n",
175
- " <td>WEB08640.jpg</td>\n",
176
- " <td>640</td>\n",
177
- " <td>360</td>\n",
178
- " <td>0</td>\n",
179
- " <td>0.578125</td>\n",
180
- " <td>0.506944</td>\n",
181
- " <td>0.709375</td>\n",
182
- " <td>0.936111</td>\n",
183
- " </tr>\n",
184
- " </tbody>\n",
185
- "</table>\n",
186
- "</div>"
187
- ],
188
- "text/plain": [
189
- " Image Width Height Label x_min y_min x_max y_max\n",
190
- "0 AoF05695.jpg 1280 720 0 0.700781 0.379167 0.039062 0.105556\n",
191
- "1 WEB08898.jpg 640 360 0 0.477344 0.291667 0.264063 0.555556\n",
192
- "2 WEB01102.jpg 640 360 2 NaN NaN NaN NaN\n",
193
- "3 WEB07573.jpg 1100 619 0 0.465000 0.475767 0.290000 0.906300\n",
194
- "4 WEB08640.jpg 640 360 0 0.578125 0.506944 0.709375 0.936111"
195
- ]
196
- },
197
- "execution_count": 4,
198
- "metadata": {},
199
- "output_type": "execute_result"
200
- }
201
- ],
202
- "source": [
203
- "# train sample\n",
204
- "df_train.head()"
205
- ]
206
- },
207
- {
208
- "cell_type": "markdown",
209
- "metadata": {},
210
- "source": [
211
- "# data split exploration"
212
- ]
213
- },
214
- {
215
- "cell_type": "code",
216
- "execution_count": 5,
217
- "metadata": {},
218
- "outputs": [],
219
- "source": [
220
- "group_tr = df_train.groupby(\"Label\").count().iloc[:, 0].to_numpy()\n",
221
- "group_tr_ratio = group_tr / n_train\n",
222
- "group_te = df_test.groupby(\"Label\").count().iloc[:, 0].to_numpy()\n",
223
- "group_te_ratio = group_te / n_test"
224
- ]
225
- },
226
- {
227
- "cell_type": "code",
228
- "execution_count": 6,
229
- "metadata": {},
230
- "outputs": [
231
- {
232
- "data": {
233
- "image/png": "",
234
- "text/plain": [
235
- "<Figure size 640x480 with 1 Axes>"
236
- ]
237
- },
238
- "metadata": {},
239
- "output_type": "display_data"
240
- }
241
- ],
242
- "source": [
243
- "# statistics on data ratio split\n",
244
- "x = np.arange(3)\n",
245
- "plt.bar(x, group_tr_ratio, color ='r', width = 0.25,\n",
246
- " edgecolor ='grey', label ='Train')\n",
247
- "x = [x + 0.25 for x in x]\n",
248
- "plt.bar(x, group_te_ratio, color ='b', width = 0.25,\n",
249
- " edgecolor ='grey', label ='Test')\n",
250
- "plt.xlabel('Labels')\n",
251
- "plt.ylabel('Proprtion Ratio')\n",
252
- "plt.xticks([0.15, 1.15, 2.15], [\"Smoke\", \"Fire\", \"None\"])\n",
253
- "plt.legend()\n",
254
- "plt.show()"
255
- ]
256
- },
257
- {
258
- "cell_type": "markdown",
259
- "metadata": {},
260
- "source": [
261
- "# total count for different classes"
262
- ]
263
- },
264
- {
265
- "cell_type": "code",
266
- "execution_count": 7,
267
- "metadata": {},
268
- "outputs": [
269
- {
270
- "data": {
271
- "text/html": [
272
- "<div>\n",
273
- "<style scoped>\n",
274
- " .dataframe tbody tr th:only-of-type {\n",
275
- " vertical-align: middle;\n",
276
- " }\n",
277
- "\n",
278
- " .dataframe tbody tr th {\n",
279
- " vertical-align: top;\n",
280
- " }\n",
281
- "\n",
282
- " .dataframe thead th {\n",
283
- " text-align: right;\n",
284
- " }\n",
285
- "</style>\n",
286
- "<table border=\"1\" class=\"dataframe\">\n",
287
- " <thead>\n",
288
- " <tr style=\"text-align: right;\">\n",
289
- " <th></th>\n",
290
- " <th>Smoke</th>\n",
291
- " <th>Fire</th>\n",
292
- " <th>Neither</th>\n",
293
- " </tr>\n",
294
- " </thead>\n",
295
- " <tbody>\n",
296
- " <tr>\n",
297
- " <th>Train</th>\n",
298
- " <td>9550</td>\n",
299
- " <td>11814</td>\n",
300
- " <td>7833</td>\n",
301
- " </tr>\n",
302
- " <tr>\n",
303
- " <th>Test</th>\n",
304
- " <td>2315</td>\n",
305
- " <td>2878</td>\n",
306
- " <td>2005</td>\n",
307
- " </tr>\n",
308
- " </tbody>\n",
309
- "</table>\n",
310
- "</div>"
311
- ],
312
- "text/plain": [
313
- " Smoke Fire Neither\n",
314
- "Train 9550 11814 7833\n",
315
- "Test 2315 2878 2005"
316
- ]
317
- },
318
- "execution_count": 7,
319
- "metadata": {},
320
- "output_type": "execute_result"
321
- }
322
- ],
323
- "source": [
324
- "pd.DataFrame([group_tr, group_te], columns=[\"Smoke\", \"Fire\", \"Neither\"], index=[\"Train\", \"Test\"])"
325
- ]
326
- },
327
- {
328
- "cell_type": "code",
329
- "execution_count": 8,
330
- "metadata": {},
331
- "outputs": [
332
- {
333
- "data": {
334
- "text/plain": [
335
- "Image Label\n",
336
- "AoF00000.jpg 2 1\n",
337
- "AoF00001.jpg 1 1\n",
338
- "AoF00002.jpg 0 1\n",
339
- "AoF00003.jpg 2 1\n",
340
- "AoF00004.jpg 2 1\n",
341
- " ..\n",
342
- "WEB09440.jpg 0 2\n",
343
- "WEB09441.jpg 0 2\n",
344
- " 1 3\n",
345
- "WEB09442.jpg 0 1\n",
346
- " 1 1\n",
347
- "Name: Width, Length: 20984, dtype: int64"
348
- ]
349
- },
350
- "execution_count": 8,
351
- "metadata": {},
352
- "output_type": "execute_result"
353
- }
354
- ],
355
- "source": [
356
- "df_train.groupby([\"Image\", \"Label\"]).count()[\"Width\"]"
357
- ]
358
- },
359
- {
360
- "cell_type": "code",
361
- "execution_count": null,
362
- "metadata": {},
363
- "outputs": [],
364
- "source": [
365
- "np.random.seed(42)\n",
366
- "idx = np.random.randint\n",
367
- "smoke = df_train[\"Label\"] == 0 \n",
368
- "fire = df_train[\"Label\"] == 1\n",
369
- "neither = df_train[\"Label\"] == 2\n"
370
- ]
371
- }
372
- ],
373
- "metadata": {
374
- "kernelspec": {
375
- "display_name": "AIClass",
376
- "language": "python",
377
- "name": "python3"
378
- },
379
- "language_info": {
380
- "codemirror_mode": {
381
- "name": "ipython",
382
- "version": 3
383
- },
384
- "file_extension": ".py",
385
- "mimetype": "text/x-python",
386
- "name": "python",
387
- "nbconvert_exporter": "python",
388
- "pygments_lexer": "ipython3",
389
- "version": "3.11.4"
390
- }
391
- },
392
- "nbformat": 4,
393
- "nbformat_minor": 2
394
- }