ankitkupadhyay committed on
Commit
43789f0
·
1 Parent(s): 3c1b5a2

Upload data_exploration.ipynb

Browse files
Files changed (1) hide show
  1. data_exploration.ipynb +394 -0
data_exploration.ipynb ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "import numpy as np\n",
11
+ "from sklearn.model_selection import train_test_split\n",
12
+ "import matplotlib.pyplot as plt\n",
13
+ "import cv2\n",
14
+ "import os\n",
15
+ "from PIL import Image\n",
16
+ "from tqdm import tqdm\n",
17
+ "import torch\n",
18
+ "from torch.utils.data import Dataset, DataLoader\n",
19
+ "from torchvision import transforms"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "markdown",
24
+ "metadata": {},
25
+ "source": [
26
+ "# creating train and test datasets"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": 2,
32
+ "metadata": {},
33
+ "outputs": [],
34
+ "source": [
35
+ "def getData(type):\n",
36
+ " df = list()\n",
37
+ " directory = f'D-Fire/{type}/labels' \n",
38
+ " n = len(os.listdir(directory))\n",
39
+ " for filename in tqdm(os.listdir(directory)):\n",
40
+ " f = os.path.join(directory, filename)\n",
41
+ " # print(f)\n",
42
+ "\n",
43
+ " image = filename[:-3] + 'jpg'\n",
44
+ " # print(image)\n",
45
+ " # break\n",
46
+ " img = Image.open(f'D-Fire/{type}/images/{image}')\n",
47
+ " width, height = img.size\n",
48
+ " # print(width, height)\n",
49
+ " # plt.imshow(img)\n",
50
+ " # plt.show()\n",
51
+ " # break\n",
52
+ " pre = [image, width, height]\n",
53
+ " if os.path.getsize(f) == 0:\n",
54
+ " dp = pre + [2]\n",
55
+ " df.append(dp)\n",
56
+ " else:\n",
57
+ " with open(f) as fp:\n",
58
+ " lines = fp.readlines()\n",
59
+ " for line in lines:\n",
60
+ " line = line.split()\n",
61
+ " line = list(map(float, line))\n",
62
+ " line[0] = int(line[0])\n",
63
+ " # line.insert(0, image)\n",
64
+ " dp = pre + line\n",
65
+ " df.append(dp)\n",
66
+ " fp.close()\n",
67
+ " return df, n"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "execution_count": 3,
73
+ "metadata": {},
74
+ "outputs": [
75
+ {
76
+ "name": "stderr",
77
+ "output_type": "stream",
78
+ "text": [
79
+ "100%|██████████| 17221/17221 [00:11<00:00, 1447.90it/s]\n",
80
+ "100%|██████████| 4306/4306 [00:03<00:00, 1340.39it/s]\n"
81
+ ]
82
+ }
83
+ ],
84
+ "source": [
85
+ "# get train and test data\n",
86
+ "train, n_train = getData(\"train\")\n",
87
+ "df_train = pd.DataFrame(train, columns= [\"Image\", \"Width\", \"Height\", \"Label\", \"x_min\", \"y_min\", \"x_max\", \"y_max\"])\n",
88
+ "test, n_test = getData(\"test\")\n",
89
+ "df_test = pd.DataFrame(test, columns= [\"Image\", \"Width\", \"Height\", \"Label\", \"x_min\", \"y_min\", \"x_max\", \"y_max\"])"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": 4,
95
+ "metadata": {},
96
+ "outputs": [
97
+ {
98
+ "data": {
99
+ "text/html": [
100
+ "<div>\n",
101
+ "<style scoped>\n",
102
+ " .dataframe tbody tr th:only-of-type {\n",
103
+ " vertical-align: middle;\n",
104
+ " }\n",
105
+ "\n",
106
+ " .dataframe tbody tr th {\n",
107
+ " vertical-align: top;\n",
108
+ " }\n",
109
+ "\n",
110
+ " .dataframe thead th {\n",
111
+ " text-align: right;\n",
112
+ " }\n",
113
+ "</style>\n",
114
+ "<table border=\"1\" class=\"dataframe\">\n",
115
+ " <thead>\n",
116
+ " <tr style=\"text-align: right;\">\n",
117
+ " <th></th>\n",
118
+ " <th>Image</th>\n",
119
+ " <th>Width</th>\n",
120
+ " <th>Height</th>\n",
121
+ " <th>Label</th>\n",
122
+ " <th>x_min</th>\n",
123
+ " <th>y_min</th>\n",
124
+ " <th>x_max</th>\n",
125
+ " <th>y_max</th>\n",
126
+ " </tr>\n",
127
+ " </thead>\n",
128
+ " <tbody>\n",
129
+ " <tr>\n",
130
+ " <th>0</th>\n",
131
+ " <td>AoF05695.jpg</td>\n",
132
+ " <td>1280</td>\n",
133
+ " <td>720</td>\n",
134
+ " <td>0</td>\n",
135
+ " <td>0.700781</td>\n",
136
+ " <td>0.379167</td>\n",
137
+ " <td>0.039062</td>\n",
138
+ " <td>0.105556</td>\n",
139
+ " </tr>\n",
140
+ " <tr>\n",
141
+ " <th>1</th>\n",
142
+ " <td>WEB08898.jpg</td>\n",
143
+ " <td>640</td>\n",
144
+ " <td>360</td>\n",
145
+ " <td>0</td>\n",
146
+ " <td>0.477344</td>\n",
147
+ " <td>0.291667</td>\n",
148
+ " <td>0.264063</td>\n",
149
+ " <td>0.555556</td>\n",
150
+ " </tr>\n",
151
+ " <tr>\n",
152
+ " <th>2</th>\n",
153
+ " <td>WEB01102.jpg</td>\n",
154
+ " <td>640</td>\n",
155
+ " <td>360</td>\n",
156
+ " <td>2</td>\n",
157
+ " <td>NaN</td>\n",
158
+ " <td>NaN</td>\n",
159
+ " <td>NaN</td>\n",
160
+ " <td>NaN</td>\n",
161
+ " </tr>\n",
162
+ " <tr>\n",
163
+ " <th>3</th>\n",
164
+ " <td>WEB07573.jpg</td>\n",
165
+ " <td>1100</td>\n",
166
+ " <td>619</td>\n",
167
+ " <td>0</td>\n",
168
+ " <td>0.465000</td>\n",
169
+ " <td>0.475767</td>\n",
170
+ " <td>0.290000</td>\n",
171
+ " <td>0.906300</td>\n",
172
+ " </tr>\n",
173
+ " <tr>\n",
174
+ " <th>4</th>\n",
175
+ " <td>WEB08640.jpg</td>\n",
176
+ " <td>640</td>\n",
177
+ " <td>360</td>\n",
178
+ " <td>0</td>\n",
179
+ " <td>0.578125</td>\n",
180
+ " <td>0.506944</td>\n",
181
+ " <td>0.709375</td>\n",
182
+ " <td>0.936111</td>\n",
183
+ " </tr>\n",
184
+ " </tbody>\n",
185
+ "</table>\n",
186
+ "</div>"
187
+ ],
188
+ "text/plain": [
189
+ " Image Width Height Label x_min y_min x_max y_max\n",
190
+ "0 AoF05695.jpg 1280 720 0 0.700781 0.379167 0.039062 0.105556\n",
191
+ "1 WEB08898.jpg 640 360 0 0.477344 0.291667 0.264063 0.555556\n",
192
+ "2 WEB01102.jpg 640 360 2 NaN NaN NaN NaN\n",
193
+ "3 WEB07573.jpg 1100 619 0 0.465000 0.475767 0.290000 0.906300\n",
194
+ "4 WEB08640.jpg 640 360 0 0.578125 0.506944 0.709375 0.936111"
195
+ ]
196
+ },
197
+ "execution_count": 4,
198
+ "metadata": {},
199
+ "output_type": "execute_result"
200
+ }
201
+ ],
202
+ "source": [
203
+ "# train sample\n",
204
+ "df_train.head()"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "markdown",
209
+ "metadata": {},
210
+ "source": [
211
+ "# data split exploration"
212
+ ]
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": 5,
217
+ "metadata": {},
218
+ "outputs": [],
219
+ "source": [
220
+ "group_tr = df_train.groupby(\"Label\").count().iloc[:, 0].to_numpy()\n",
221
+ "group_tr_ratio = group_tr / n_train\n",
222
+ "group_te = df_test.groupby(\"Label\").count().iloc[:, 0].to_numpy()\n",
223
+ "group_te_ratio = group_te / n_test"
224
+ ]
225
+ },
226
+ {
227
+ "cell_type": "code",
228
+ "execution_count": 6,
229
+ "metadata": {},
230
+ "outputs": [
231
+ {
232
+ "data": {
233
+ "image/png": "",
234
+ "text/plain": [
235
+ "<Figure size 640x480 with 1 Axes>"
236
+ ]
237
+ },
238
+ "metadata": {},
239
+ "output_type": "display_data"
240
+ }
241
+ ],
242
+ "source": [
243
+ "# statistics on data ratio split\n",
244
+ "x = np.arange(3)\n",
245
+ "plt.bar(x, group_tr_ratio, color ='r', width = 0.25,\n",
246
+ " edgecolor ='grey', label ='Train')\n",
247
+ "x = [x + 0.25 for x in x]\n",
248
+ "plt.bar(x, group_te_ratio, color ='b', width = 0.25,\n",
249
+ " edgecolor ='grey', label ='Test')\n",
250
+ "plt.xlabel('Labels')\n",
251
+ "plt.ylabel('Proportion Ratio')\n",
252
+ "plt.xticks([0.15, 1.15, 2.15], [\"Smoke\", \"Fire\", \"None\"])\n",
253
+ "plt.legend()\n",
254
+ "plt.show()"
255
+ ]
256
+ },
257
+ {
258
+ "cell_type": "markdown",
259
+ "metadata": {},
260
+ "source": [
261
+ "# total count for different classes"
262
+ ]
263
+ },
264
+ {
265
+ "cell_type": "code",
266
+ "execution_count": 7,
267
+ "metadata": {},
268
+ "outputs": [
269
+ {
270
+ "data": {
271
+ "text/html": [
272
+ "<div>\n",
273
+ "<style scoped>\n",
274
+ " .dataframe tbody tr th:only-of-type {\n",
275
+ " vertical-align: middle;\n",
276
+ " }\n",
277
+ "\n",
278
+ " .dataframe tbody tr th {\n",
279
+ " vertical-align: top;\n",
280
+ " }\n",
281
+ "\n",
282
+ " .dataframe thead th {\n",
283
+ " text-align: right;\n",
284
+ " }\n",
285
+ "</style>\n",
286
+ "<table border=\"1\" class=\"dataframe\">\n",
287
+ " <thead>\n",
288
+ " <tr style=\"text-align: right;\">\n",
289
+ " <th></th>\n",
290
+ " <th>Smoke</th>\n",
291
+ " <th>Fire</th>\n",
292
+ " <th>Neither</th>\n",
293
+ " </tr>\n",
294
+ " </thead>\n",
295
+ " <tbody>\n",
296
+ " <tr>\n",
297
+ " <th>Train</th>\n",
298
+ " <td>9550</td>\n",
299
+ " <td>11814</td>\n",
300
+ " <td>7833</td>\n",
301
+ " </tr>\n",
302
+ " <tr>\n",
303
+ " <th>Test</th>\n",
304
+ " <td>2315</td>\n",
305
+ " <td>2878</td>\n",
306
+ " <td>2005</td>\n",
307
+ " </tr>\n",
308
+ " </tbody>\n",
309
+ "</table>\n",
310
+ "</div>"
311
+ ],
312
+ "text/plain": [
313
+ " Smoke Fire Neither\n",
314
+ "Train 9550 11814 7833\n",
315
+ "Test 2315 2878 2005"
316
+ ]
317
+ },
318
+ "execution_count": 7,
319
+ "metadata": {},
320
+ "output_type": "execute_result"
321
+ }
322
+ ],
323
+ "source": [
324
+ "pd.DataFrame([group_tr, group_te], columns=[\"Smoke\", \"Fire\", \"Neither\"], index=[\"Train\", \"Test\"])"
325
+ ]
326
+ },
327
+ {
328
+ "cell_type": "code",
329
+ "execution_count": 8,
330
+ "metadata": {},
331
+ "outputs": [
332
+ {
333
+ "data": {
334
+ "text/plain": [
335
+ "Image Label\n",
336
+ "AoF00000.jpg 2 1\n",
337
+ "AoF00001.jpg 1 1\n",
338
+ "AoF00002.jpg 0 1\n",
339
+ "AoF00003.jpg 2 1\n",
340
+ "AoF00004.jpg 2 1\n",
341
+ " ..\n",
342
+ "WEB09440.jpg 0 2\n",
343
+ "WEB09441.jpg 0 2\n",
344
+ " 1 3\n",
345
+ "WEB09442.jpg 0 1\n",
346
+ " 1 1\n",
347
+ "Name: Width, Length: 20984, dtype: int64"
348
+ ]
349
+ },
350
+ "execution_count": 8,
351
+ "metadata": {},
352
+ "output_type": "execute_result"
353
+ }
354
+ ],
355
+ "source": [
356
+ "df_train.groupby([\"Image\", \"Label\"]).count()[\"Width\"]"
357
+ ]
358
+ },
359
+ {
360
+ "cell_type": "code",
361
+ "execution_count": null,
362
+ "metadata": {},
363
+ "outputs": [],
364
+ "source": [
365
+ "np.random.seed(42)\n",
366
+ "idx = np.random.randint\n",
367
+ "smoke = df_train[\"Label\"] == 0 \n",
368
+ "fire = df_train[\"Label\"] == 1\n",
369
+ "neither = df_train[\"Label\"] == 2\n"
370
+ ]
371
+ }
372
+ ],
373
+ "metadata": {
374
+ "kernelspec": {
375
+ "display_name": "AIClass",
376
+ "language": "python",
377
+ "name": "python3"
378
+ },
379
+ "language_info": {
380
+ "codemirror_mode": {
381
+ "name": "ipython",
382
+ "version": 3
383
+ },
384
+ "file_extension": ".py",
385
+ "mimetype": "text/x-python",
386
+ "name": "python",
387
+ "nbconvert_exporter": "python",
388
+ "pygments_lexer": "ipython3",
389
+ "version": "3.11.4"
390
+ }
391
+ },
392
+ "nbformat": 4,
393
+ "nbformat_minor": 2
394
+ }