push the changes
- .gitignore +4 -1
- .vscode/settings.json +3 -0
- Google_Talk_chatbot.ipynb +697 -0
- app.py +133 -69
- aqi_downloader.ipynb +785 -0
- prompts.csv +57 -0
- sandbox.ipynb +0 -0
.gitignore
CHANGED
@@ -1,2 +1,5 @@
 *.pyc
-*.png
+*.png
+AQI*/
+*.parquet
+*.zip
.vscode/settings.json
ADDED
@@ -0,0 +1,3 @@
+{
+    "cSpell.words": ["dataframe"]
+}
Google_Talk_chatbot.ipynb
ADDED
@@ -0,0 +1,697 @@
+{
+"cells": [
+{
+"cell_type": "code",
+"execution_count": 18,
+"metadata": {},
+"outputs": [],
+"source": [
+"import streamlit as st\n",
+"import os\n",
+"import pandas as pd\n",
+"import random\n",
+"from os.path import join\n",
+"from datetime import datetime\n",
+"# from src import decorate_with_code, show_response, get_from_user\n",
+"from dotenv import load_dotenv\n",
+"from langchain_groq.chat_models import ChatGroq\n",
+"from langchain_mistralai import ChatMistralAI\n",
+"from langchain_google_genai import GoogleGenerativeAI,GoogleGenerativeAIEmbeddings\n",
+"from huggingface_hub import HfApi\n",
+"load_dotenv()\n",
+"\n",
+"import pandas as pd"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 19,
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/html": [
+"<div>\n",
+"<style scoped>\n",
+"    .dataframe tbody tr th:only-of-type {\n",
+"        vertical-align: middle;\n",
+"    }\n",
+"\n",
+"    .dataframe tbody tr th {\n",
+"        vertical-align: top;\n",
+"    }\n",
+"\n",
+"    .dataframe thead th {\n",
+"        text-align: right;\n",
+"    }\n",
+"</style>\n",
+"<table border=\"1\" class=\"dataframe\">\n",
+"  <thead>\n",
+"    <tr style=\"text-align: right;\">\n",
+"      <th></th>\n",
+"      <th>prompt</th>\n",
+"      <th>Desired Answer</th>\n",
+"      <th>Category</th>\n",
+"      <th>llama3_answer_1721726247</th>\n",
+"      <th>llama3_score_1721726247</th>\n",
+"      <th>mixtral_answer_1721726407</th>\n",
+"      <th>mixtral_score_1721726407</th>\n",
+"      <th>gemma_answer_1721726499</th>\n",
+"      <th>gemma_score_1721726499</th>\n",
+"      <th>Codestral Mamba_answer_1721759526</th>\n",
+"      <th>Codestral Mamba_score_1721759526</th>\n",
+"      <th>Codestral_answer_1721759762</th>\n",
+"      <th>Codestral_score_1721759762</th>\n",
+"    </tr>\n",
+"  </thead>\n",
+"  <tbody>\n",
+"    <tr>\n",
+"      <th>0</th>\n",
+"      <td>Which month has the highest average PM2.5 in 2...</td>\n",
+"      <td>January</td>\n",
+"      <td>NaN</td>\n",
+"      <td>The month with the highest average PM2.5 in 20...</td>\n",
+"      <td>True</td>\n",
+"      <td>The month with the highest average PM2.5 in 20...</td>\n",
+"      <td>True</td>\n",
+"      <td>The highest average PM2.5 in 2023 for Mumbai w...</td>\n",
+"      <td>True</td>\n",
+"      <td>content='To find the month with the highest av...</td>\n",
+"      <td>True</td>\n",
+"      <td>The month with the highest average PM2.5 in 20...</td>\n",
+"      <td>True</td>\n",
+"    </tr>\n",
+"    <tr>\n",
+"      <th>1</th>\n",
+"      <td>Which month generally has the highest pollution?</td>\n",
+"      <td>November</td>\n",
+"      <td>NaN</td>\n",
+"      <td>The month with the highest pollution is 11 wit...</td>\n",
+"      <td>True</td>\n",
+"      <td>The month with the highest pollution (on avera...</td>\n",
+"      <td>True</td>\n",
+"      <td>The month with the highest average PM2.5 is No...</td>\n",
+"      <td>True</td>\n",
+"      <td>content='To find out which month generally has...</td>\n",
+"      <td>False</td>\n",
+"      <td>The month with the highest pollution is Novemb...</td>\n",
+"      <td>True</td>\n",
+"    </tr>\n",
+"  </tbody>\n",
+"</table>\n",
+"</div>"
+],
+"text/plain": [
+" prompt Desired Answer \\\n",
+"0 Which month has the highest average PM2.5 in 2... January \n",
+"1 Which month generally has the highest pollution? November \n",
+"\n",
+" Category llama3_answer_1721726247 \\\n",
+"0 NaN The month with the highest average PM2.5 in 20... \n",
+"1 NaN The month with the highest pollution is 11 wit... \n",
+"\n",
+" llama3_score_1721726247 mixtral_answer_1721726407 \\\n",
+"0 True The month with the highest average PM2.5 in 20... \n",
+"1 True The month with the highest pollution (on avera... \n",
+"\n",
+" mixtral_score_1721726407 gemma_answer_1721726499 \\\n",
+"0 True The highest average PM2.5 in 2023 for Mumbai w... \n",
+"1 True The month with the highest average PM2.5 is No... \n",
+"\n",
+" gemma_score_1721726499 Codestral Mamba_answer_1721759526 \\\n",
+"0 True content='To find the month with the highest av... \n",
+"1 True content='To find out which month generally has... \n",
+"\n",
+" Codestral Mamba_score_1721759526 \\\n",
+"0 True \n",
+"1 False \n",
+"\n",
+" Codestral_answer_1721759762 \\\n",
+"0 The month with the highest average PM2.5 in 20... \n",
+"1 The month with the highest pollution is Novemb... \n",
+"\n",
+" Codestral_score_1721759762 \n",
+"0 True \n",
+"1 True "
+]
+},
+"execution_count": 19,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"prompts = pd.read_csv(\"prompts.csv\")\n",
+"prompts.head(2)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 28,
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"Date datetime64[ns]\n",
+"City object\n",
+"AQI float64\n",
+"Pollutant object\n",
+"Air Quality object\n",
+"Based on number of monitoring stations float64\n",
+"State object\n",
+"Date_City object\n",
+"dtype: object\n"
+]
+},
+{
+"data": {
+"text/html": [
+"<div>\n",
+"<style scoped>\n",
+"    .dataframe tbody tr th:only-of-type {\n",
+"        vertical-align: middle;\n",
+"    }\n",
+"\n",
+"    .dataframe tbody tr th {\n",
+"        vertical-align: top;\n",
+"    }\n",
+"\n",
+"    .dataframe thead th {\n",
+"        text-align: right;\n",
+"    }\n",
+"</style>\n",
+"<table border=\"1\" class=\"dataframe\">\n",
+"  <thead>\n",
+"    <tr style=\"text-align: right;\">\n",
+"      <th></th>\n",
+"      <th>Date</th>\n",
+"      <th>City</th>\n",
+"      <th>AQI</th>\n",
+"      <th>Pollutant</th>\n",
+"      <th>Air Quality</th>\n",
+"      <th>Based on number of monitoring stations</th>\n",
+"      <th>State</th>\n",
+"      <th>Date_City</th>\n",
+"    </tr>\n",
+"  </thead>\n",
+"  <tbody>\n",
+"    <tr>\n",
+"      <th>0</th>\n",
+"      <td>2016-01-01</td>\n",
+"      <td>Agartala</td>\n",
+"      <td>NaN</td>\n",
+"      <td>None</td>\n",
+"      <td>None</td>\n",
+"      <td>NaN</td>\n",
+"      <td>None</td>\n",
+"      <td>None</td>\n",
+"    </tr>\n",
+"    <tr>\n",
+"      <th>1</th>\n",
+"      <td>2016-01-01</td>\n",
+"      <td>Agra</td>\n",
+"      <td>417.0</td>\n",
+"      <td>PM\\n2.5</td>\n",
+"      <td>Severe</td>\n",
+"      <td>1.0</td>\n",
+"      <td>Uttar Pradesh</td>\n",
+"      <td>2016-01-01_Agra</td>\n",
+"    </tr>\n",
+"  </tbody>\n",
+"</table>\n",
+"</div>"
+],
+"text/plain": [
+" Date City AQI Pollutant Air Quality \\\n",
+"0 2016-01-01 Agartala NaN None None \n",
+"1 2016-01-01 Agra 417.0 PM\\n2.5 Severe \n",
+"\n",
+" Based on number of monitoring stations State Date_City \n",
+"0 NaN None None \n",
+"1 1.0 Uttar Pradesh 2016-01-01_Agra "
+]
+},
+"execution_count": 28,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"data = pd.read_parquet(\"AQI_data.parquet\")\n",
+"print(data.dtypes)\n",
+"data.head(2)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 34,
+"metadata": {},
+"outputs": [],
+"source": [
+"def get_full_prompt(question):\n",
+"    return f\"\"\"You are a data scientist who is good with tools such as pandas, numpy and matplotlib. You are also an expert in air quality. You\n",
+"can access daily AQI `data` and you have to complete a code provided by me based on my question. `data` is a pandas DataFrame and has the following columns and data types:\n",
+"\n",
+"Date: date, Date of the `AQI` data\n",
+"City: string, Name of the city where the `AQI` was recorded\n",
+"State: string, Name of the state where `City` is located\n",
+"AQI: float, AQI value\n",
+"Air Quality: string, Air quality category from [\"Satisfactory\", \"Moderate\", \"Good\", \"Poor\", \"Very Poor\", \"Severe\"] based on `AQI` value\n",
+"\n",
+"Now, my question is: \"{question}\"\n",
+"\n",
+"Complete the code below to answer my question:\n",
+"\n",
+"```python\n",
+"import pandas as pd\n",
+"import numpy as np\n",
+"\n",
+"data = pd.read_parquet(\"AQI_data.parquet\")\"\"\""
+]
+},
+{
+"cell_type": "code",
+"execution_count": 35,
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"You are a data scientist who is good with tools such as pandas, numpy and matplotlib. You are also an expert in air quality. You\n",
+"can access daily AQI `data` and you have to complete a code provided by me based on my question. `data` is a pandas DataFrame and has the following columns and data types:\n",
+"\n",
+"Date: date, Date of the `AQI` data\n",
+"City: string, Name of the city where the `AQI` was recorded\n",
+"State: string, Name of the state where `City` is located\n",
+"AQI: float, AQI value\n",
+"Air Quality: string, Air quality category from [\"Satisfactory\", \"Moderate\", \"Good\", \"Poor\", \"Very Poor\", \"Severe\"] based on `AQI` value\n",
+"\n",
+"Now, my question is: \"Which month has the highest average PM2.5 in 2023 for Mumbai?\"\n",
+"\n",
+"Complete the code below to answer my question:\n",
+"\n",
+"```python\n",
+"import pandas as pd\n",
+"import numpy as np\n",
+"\n",
+"data = pd.read_parquet(\"AQI_data.parquet\")\n"
+]
+}
+],
+"source": [
+"print(get_full_prompt(prompts.prompt[0]))"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 74,
+"metadata": {},
+"outputs": [],
+"source": [
+"def get_gemini_response(prompt, model):\n",
+"    return GoogleGenerativeAI(model=model, google_api_key=os.environ.get(\"GOOGLE_API_KEY\"), temperature=0).invoke(prompt)\n",
+"\n",
+"def get_groq_response(prompt, model):\n",
+"    return ChatGroq(model=model, api_key=os.environ.get(\"GROQ_API_KEY\"), temperature=0).invoke(prompt).content\n",
+"\n",
+"llms = {\"gemini-pro\": lambda prompt: get_gemini_response(prompt, \"gemini-pro\"), \"groq_gemma-7b-it\": lambda prompt: get_groq_response(prompt, \"gemma-7b-it\"), \"groq_llama-3.2-90b-text-preview\": lambda prompt: get_groq_response(prompt, \"llama-3.2-90b-text-preview\")}"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 75,
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"('Which month generally has the highest pollution?', 'November')"
+]
+},
+"execution_count": 75,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"i = 1\n",
+"prompts.prompt[i], prompts['Desired Answer'][i]"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"llm = \"groq_llama-3.2-90b-text-preview\"\n",
+"full_prompt = get_full_prompt(\"Which city has the highest AQI value consistently over the years?\")\n",
+"answer = llms[llm](full_prompt)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 83,
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"You are a data scientist who is good with tools such as pandas, numpy and matplotlib. You are also an expert in air quality. You\n",
+"can access daily AQI `data` and you have to complete a code provided by me based on my question. `data` is a pandas DataFrame and has the following columns and data types:\n",
+"\n",
+"Date: date, Date of the `AQI` data\n",
+"City: string, Name of the city where the `AQI` was recorded\n",
+"State: string, Name of the state where `City` is located\n",
+"AQI: float, AQI value\n",
+"Air Quality: string, Air quality category from [\"Satisfactory\", \"Moderate\", \"Good\", \"Poor\", \"Very Poor\", \"Severe\"] based on `AQI` value\n",
+"\n",
+"Now, my question is: \"Which city has the highest AQI value consistently over the years?\"\n",
+"\n",
+"Complete the code below to answer my question:\n",
+"\n",
+"```python\n",
+"import pandas as pd\n",
+"import numpy as np\n",
+"\n",
+"data = pd.read_parquet(\"AQI_data.parquet\")\n",
+"####################################################################################################\n",
+"import pandas as pd\n",
+"import numpy as np\n",
+"import matplotlib.pyplot as plt\n",
+"\n",
+"# Load the data\n",
+"data = pd.read_parquet(\"AQI_data.parquet\")\n",
+"\n",
+"# Convert 'Date' column to datetime and extract the year\n",
+"data['Date'] = pd.to_datetime(data['Date'])\n",
+"data['Year'] = data['Date'].dt.year\n",
+"\n",
+"# Group by 'City' and 'Year', and calculate the average AQI value\n",
+"avg_aqi = data.groupby(['City', 'Year'])['AQI'].mean().reset_index()\n",
+"\n",
+"# Group by 'City' and calculate the average AQI value over the years\n",
+"avg_aqi_over_years = avg_aqi.groupby('City')['AQI'].mean().reset_index()\n",
+"\n",
+"# Find the city with the highest average AQI value\n",
+"city_with_highest_aqi = avg_aqi_over_years.loc[avg_aqi_over_years['AQI'].idxmax()]\n",
+"\n",
+"print(f\"The city with the highest AQI value consistently over the years is {city_with_highest_aqi['City']} with an average AQI value of {city_with_highest_aqi['AQI']:.2f}\")\n",
+"\n",
+"# Plot the top 10 cities with the highest average AQI values\n",
+"top_10_cities = avg_aqi_over_years.nlargest(10, 'AQI')\n",
+"plt.figure(figsize=(10, 6))\n",
+"plt.bar(top_10_cities['City'], top_10_cities['AQI'])\n",
+"plt.xlabel('City')\n",
+"plt.ylabel('Average AQI Value')\n",
+"plt.title('Top 10 Cities with the Highest Average AQI Values')\n",
+"plt.xticks(rotation=90)\n",
+"plt.tight_layout()\n",
+"plt.show()\n",
+"\n",
+"####################################################################################################\n",
+"The city with the highest AQI value consistently over the years is Jharsuguda with an average AQI value of 282.00\n"
+]
+}
+],
+"source": [
+"import re\n",
+"code = re.search(r\"```python\\n(.*)```\", answer, re.DOTALL).group(1)\n",
+"print(full_prompt)\n",
+"print(\"#\" * 100)\n",
+"print(code)\n",
+"print(\"#\" * 100)\n",
+"exec(code)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# import streamlit as st\n",
+"# import os\n",
+"# import pandas as pd\n",
+"# import random\n",
+"# from os.path import join\n",
+"# from datetime import datetime\n",
+"# from src import decorate_with_code, show_response, get_from_user\n",
+"# from dotenv import load_dotenv\n",
+"# from langchain_groq.chat_models import ChatGroq\n",
+"# from langchain_mistralai import ChatMistralAI\n",
+"# from huggingface_hub import HfApi\n",
+"# st.set_page_config(layout=\"wide\")\n",
+"\n",
+"# ### Extract data.zip\n",
+"# if not os.path.exists(\"data/1\"):\n",
+"#     os.system(\"unzip data.zip\")\n",
+"\n",
+"# # Load environment variables : Groq and Hugging Face API keys\n",
+"# load_dotenv()\n",
+"# Groq_Token = os.environ[\"GROQ_API_KEY\"]\n",
+"# CODESTRAL_API_KEY = os.environ[\"CODESTRAL_API_KEY\"]\n",
+"# hf_token = os.environ[\"HF_TOKEN\"]\n",
+"# models = {\"llama3\":\"llama3-70b-8192\",\"mixtral\": \"mixtral-8x7b-32768\", \"llama2\": \"llama2-70b-4096\", \"gemma\": \"gemma-7b-it\"}\n",
+"# groq_models = {\"llama3-70b\": \"llama3-70b-8192\", \"mixtral\": \"mixtral-8x7b-32768\", \"gemma-7b\": \"gemma-7b-it\",\"llama3.1-70b\":\"llama-3.1-70b-versatile\",\"llama3-8b\":\"llama3-8b-8192\",\"llama3.1-8b\":\"llama-3.1-8b-instant\",\"gemma-9b\":\"gemma2-9b-it\"}\n",
+"# mistral_models = {\"Codestral Mamba\" : \"open-codestral-mamba\", \"Codestral\" : \"codestral-latest\",\"Mistral 7B\":\"open-mistral-7b\"}\n",
+"# groq_model_list = list(groq_models.keys())\n",
+"# mistral_model_list = list(mistral_models.keys())\n",
+"\n",
+"# self_path = os.path.dirname(os.path.abspath(__file__))\n",
+"\n",
+"\n",
+"# def generate_template(prompt):\n",
+"#     df_check = pd.read_csv(\"Data.csv\")\n",
+"#     df_check[\"Timestamp\"] = pd.to_datetime(df_check[\"Timestamp\"])\n",
+"#     df_check = df_check.head(5)\n",
+"\n",
+"#     new_line = \"\\n\"\n",
+"\n",
+"#     template = f\"\"\"```python\n",
+"# import pandas as pd\n",
+"# import matplotlib.pyplot as plt\n",
+"\n",
+"# df = pd.read_csv(\"Data.csv\")\n",
+"# df[\"Timestamp\"] = pd.to_datetime(df[\"Timestamp\"])\n",
+"\n",
+"# # df.dtypes\n",
+"# {new_line.join(map(lambda x: '# '+x, str(df_check.dtypes).split(new_line)))}\n",
+"\n",
+"# # {prompt.strip()}\n",
+"# # <your code here>\n",
+"\n",
+"# #answer = \n",
+"# ```\n",
+"# \"\"\"\n",
+"#     return template\n",
+"\n",
+"\n",
+"# def generate_query(template):\n",
+"    \n",
+"#     query = f\"\"\"I have a pandas dataframe data of PM2.5.\n",
+"# * The columns are 'Timestamp', 'station', 'PM2.5', 'address', 'city', 'latitude', 'longitude', and 'state'.\n",
+"# * Frequency of Data is Daily.\n",
+"# * `Pollution` generally means `PM2.5`.\n",
+"# * PM2.5 guidelines: India: 60, WHO: 15.\n",
+"# * Store the final answer in a global variable `answer`.\n",
+"# * Always report the unit of the data. Example: `The average PM2.5 is 45.67 µg/m³`\n",
+"\n",
+"# Complete the following code.\n",
+"\n",
+"# {template}\n",
+"# \"\"\"\n",
+"#     return query\n",
+"\n",
+"\n",
+"# def process_query(query, llm):\n",
+"#     global answer\n",
+"#     template = generate_template(query)\n",
+"#     query = generate_query(template)\n",
+"#     global code\n",
+"#     global error\n",
+"#     try:\n",
+"#         answer = llm.invoke(query)\n",
+"#         error = ''\n",
+"#         code = f\"\"\"\n",
+"# {template.split(\"```python\")[1].split(\"```\")[0]}\n",
+"# {answer.content.split(\"```python\")[1].split(\"```\")[0]}\n",
+"# \"\"\"\n",
+"#         # update variable `answer` when code is executed\n",
+"#         exec(code,globals())\n",
+"#     except Exception as e:\n",
+"#         error = e\n",
+"#         code = ''\n",
+"#         answer = f\"Error: {e}\"\n",
+"#     print(answer)\n",
+"\n",
+"\n",
+"# # Using HTML and CSS to center the title\n",
+"# st.write(\n",
+"#     \"\"\"\n",
+"#     <style>\n",
+"#     .title {\n",
+"#         text-align: center;\n",
+"#         color: #17becf;\n",
+"#     }\n",
+"#     </style>\n",
+"#     \"\"\",\n",
+"#     unsafe_allow_html=True,\n",
+"# )\n",
+"\n",
+"\n",
+"\n",
+"\n",
+"# # Display images and text in three columns with specified ratios\n",
+"# col1, col2, col3 = st.sidebar.columns((1.0, 2, 1.0)) \n",
+"# with col2:\n",
+"#     st.markdown(\"<h1 class='title'>Airchat</h1>\", unsafe_allow_html=True)\n",
+"    \n",
+"    \n",
+"# model_name = st.sidebar.selectbox(\"Select LLM:\", groq_model_list + mistral_model_list)\n",
+"\n",
+"# questions = ['Custom Prompt']\n",
+"# with open(join(self_path, \"questions.txt\")) as f:\n",
+"#     questions += f.read().split(\"\\n\")\n",
+"\n",
+"# waiting_lines = (\"Thinking...\", \"Just a moment...\", \"Let me think...\", \"Working on it...\", \"Processing...\", \"Hold on...\", \"One moment...\", \"On it...\")\n",
+"\n",
+"\n",
+"\n",
+"# # Initialize chat history\n",
+"# if \"responses\" not in st.session_state:\n",
+"#     st.session_state.responses = []\n",
+"    \n",
+"\n",
+"# # Display chat responses from history on app rerun\n",
+"# print(\"#\"*10)\n",
+"# for response_id, response in enumerate(st.session_state.responses):\n",
+"#     status = show_response(st, response)\n",
+"#     if response[\"role\"] == \"assistant\":\n",
+"#         # feedback_key = f\"feedback_{int(response_id/2)}\"\n",
+"#         print(\"response_id\", response_id)\n",
+"    \n",
+"#         error = response[\"error\"]\n",
+"#         output = response[\"content\"]\n",
+"#         last_prompt = response[\"last_prompt\"]\n",
+"#         code = response[\"gen_code\"]\n",
+"#         evaluation = response[\"evaluation\"]\n",
+"    \n",
+"    \n",
+"    \n",
+"# print(\"#\"*10)\n",
+"\n",
+"# show = True\n",
+"# prompt = st.sidebar.selectbox(\"Select a Prompt:\", questions, key=\"prompt_key\")\n",
+"# if prompt == 'Custom Prompt':\n",
+"#     show = False\n",
+"#     # React to user input\n",
+"#     prompt = st.chat_input(\"Ask me anything about air quality!\", key=1000)\n",
+"#     if prompt :\n",
+"#         show = True\n",
+"# else:\n",
+"#     # placeholder for chat input\n",
+"#     st.chat_input(\"Select 'Select a Prompt' -> 'Custom Prompt' in the sidebar to ask your own questions.\", key=1000, disabled=True)\n",
+"\n",
+"# if \"last_prompt\" in st.session_state:\n",
+"#     last_prompt = st.session_state[\"last_prompt\"]\n",
+"#     last_model_name = st.session_state[\"last_model_name\"]\n",
+"#     if (prompt == last_prompt) and (model_name == last_model_name):\n",
+"#         show = False\n",
+"\n",
+"# if prompt:\n",
+"#     st.sidebar.info(\"Select 'Custom Prompt' to ask your own questions.\")\n",
+"\n",
+"# if show:\n",
+"#     # Add user input to chat history\n",
+"#     user_response = get_from_user(prompt)\n",
+"#     st.session_state.responses.append(user_response)\n",
+"\n",
+"#     # select random waiting line\n",
+"#     with st.spinner(random.choice(waiting_lines)):\n",
+"#         ran = False\n",
+"#         for i in range(1):\n",
+"#             print(f\"Attempt {i+1}\")\n",
+"#             if model_name in groq_models:\n",
+"#                 model_folder = \"Groq_\" + groq_models[model_name]\n",
+"#                 llm = ChatGroq(model=groq_models[model_name], api_key=Groq_Token, temperature=0)\n",
+"#             else:\n",
+"#                 model_folder = \"MistralAI_\" + mistral_models[model_name]\n",
+"#                 llm = ChatMistralAI(model=mistral_models[model_name], api_key=CODESTRAL_API_KEY, temperature=0)\n",
+"#             print(llm)\n",
+"#             # llm = ChatGroq(model=models[model_name], api_key=os.getenv(\"GROQ_API\"), temperature=0)\n",
+"\n",
+"#             df_check = pd.read_csv(\"Data.csv\")\n",
+"#             df_check[\"Timestamp\"] = pd.to_datetime(df_check[\"Timestamp\"])\n",
+"#             df_check = df_check.head(5)\n",
+"\n",
+"#             new_line = \"\\n\"\n",
+"\n",
+"#             parameters = {\"font.size\": 12,\"figure.dpi\": 600}\n",
+"\n",
+"#             process_query(prompt, llm)\n",
+"    \n",
+"    \n",
+"#             # Read the questions from Questions.txt and find the index of the question if there is a match\n",
+"#             with open(join(\"questions.txt\")) as f:\n",
+"#                 questions = f.read().split(\"\\n\")\n",
+"#             try:\n",
+"#                 index = questions.index(prompt)\n",
+"#                 index = index + 1\n",
+"#             except:\n",
+"#                 index = None \n",
+"#             print(\"Index\",index)\n",
+"#             if type(index) == int:\n",
+"#                 # Open folder data/index/llm_name and compare with evaluation.txt\n",
+"#                 with open(join(\"data\", str(index), model_folder, \"evaluation.txt\")) as f:\n",
+"#                     evaluation = f.read().strip()\n",
+"#                 with open(join(\"data\", str(index), \"ground_truth\", \"answer.txt\")) as f:\n",
+"#                     ground_truth = f.read().strip()\n",
+"#             else:\n",
+"#                 evaluation = \"DK\"\n",
+"#                 ground_truth = None \n",
+"#             response = {\"role\": \"assistant\", \"content\": answer, \"gen_code\": code, \"ex_code\": code, \"last_prompt\": prompt, \"error\": error,\"evaluation\": evaluation,\"ground_truth\": ground_truth}\n",
+"\n",
+"#             if ran:\n",
+"#                 break\n",
+"    \n",
+"#     # Append agent response to chat history\n",
+"#     st.session_state.responses.append(response)\n",
+"    \n",
+"#     st.session_state['last_prompt'] = prompt\n",
+"#     st.session_state['last_model_name'] = model_name\n",
+"#     st.rerun()\n",
+"    \n",
+"    \n",
+"\n",
+"#     # Display contact details with message\n",
+"#     st.sidebar.markdown(\"<hr>\", unsafe_allow_html=True)"
+]
+}
+],
+"metadata": {
+"kernelspec": {
+"display_name": "zeel_py310",
+"language": "python",
+"name": "python3"
+},
+"language_info": {
+"codemirror_mode": {
+"name": "ipython",
+"version": 3
+},
+"file_extension": ".py",
+"mimetype": "text/x-python",
+"name": "python",
+"nbconvert_exporter": "python",
+"pygments_lexer": "ipython3",
+"version": "3.10.15"
+}
+},
+"nbformat": 4,
+"nbformat_minor": 2
+}
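The last two cells above carry the notebook's whole evaluation loop: build a schema-grounded prompt, ask an LLM to complete a fenced ```python block, regex the code back out, and `exec` it. A minimal standalone sketch of that extract-and-run step follows; `run_generated_code` is a hypothetical helper name, the non-greedy regex is a small hardening over the notebook's greedy `(.*)`, and exec-ing model output should be sandboxed in real use.

```python
import re

def run_generated_code(answer: str) -> dict:
    """Hypothetical helper: pull the first fenced ```python block out of an
    LLM answer and execute it, returning the resulting namespace."""
    match = re.search(r"```python\n(.*?)```", answer, re.DOTALL)
    if match is None:
        raise ValueError("no fenced python block in the answer")
    namespace: dict = {}
    # exec on model output runs arbitrary code; only do this in a sandbox
    exec(match.group(1), namespace)
    return namespace
```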
app.py
CHANGED
@@ -5,13 +5,22 @@ import pandas as pd
 import random
 from os.path import join
 from datetime import datetime
-from src import
+from src import (
+    preprocess_and_load_df,
+    load_agent,
+    ask_agent,
+    decorate_with_code,
+    show_response,
+    get_from_user,
+    load_smart_df,
+    ask_question,
+)
 from dotenv import load_dotenv
 from langchain_groq.chat_models import ChatGroq
 from langchain_google_genai import GoogleGenerativeAI
 from streamlit_feedback import streamlit_feedback
 from huggingface_hub import HfApi
-from datasets import load_dataset, get_dataset_config_info
+from datasets import load_dataset, get_dataset_config_info, Dataset
 from PIL import Image
 
 st.set_page_config(layout="wide")
@@ -21,7 +30,13 @@ load_dotenv()
 Groq_Token = os.environ["GROQ_API_KEY"]
 hf_token = os.environ["HF_TOKEN"]
 gemini_token = os.environ["GEMINI_TOKEN"]
-models = {
+models = {
+    "llama3": "llama3-70b-8192",
+    "mixtral": "mixtral-8x7b-32768",
+    "llama2": "llama2-70b-4096",
+    "gemma": "gemma-7b-it",
+    "gemini-pro": "gemini-pro",
+}
 
 self_path = os.path.dirname(os.path.abspath(__file__))
 
@@ -40,10 +55,16 @@ st.write(
 )
 
 # Displaying the centered title
-st.markdown(
+st.markdown(
+    "<div style='text-align:center; padding: 20px;'>VayuBuddy makes pollution monitoring easier by bridging the gap between users and datasets.<br>No coding required—just meaningful insights at your fingertips!</div>",
+    unsafe_allow_html=True,
+)
 
 # Center-aligned instruction text with bold formatting
-st.markdown(
+st.markdown(
+    "<div style='text-align:center;'>Choose a query from <b>Select a prompt</b> or type a query in the <b>chat box</b>, select a <b>LLM</b> (Large Language Model), and press enter to generate a response.</div>",
+    unsafe_allow_html=True,
+)
 # os.environ["PANDASAI_API_KEY"] = "$2a$10$gbmqKotzJOnqa7iYOun8eO50TxMD/6Zw1pLI2JEoqncwsNx4XeBS2"
 
 # with open(join(self_path, "context1.txt")) as f:
@@ -57,27 +78,36 @@ st.markdown("<div style='text-align:center;'>Choose a query from <b>Select a pro
 
 image_path = "IITGN_Logo.png"
 
-
-col1, col2, col3 = st.sidebar.columns((1.0, 2, 1.0))
+# Display images and text in three columns with specified ratios
+col1, col2, col3 = st.sidebar.columns((1.0, 2, 1.0))
 with col2:
     st.image(image_path, use_column_width=True)
     st.markdown("<h1 class='title'>VayuBuddy</h1>", unsafe_allow_html=True)
-
-
-model_name = st.sidebar.selectbox("Select LLM:", ["llama3","mixtral", "gemma", "gemini-pro"])
 
-
+
+model_name = st.sidebar.selectbox("Select LLM:", ["llama3", "mixtral", "gemma", "gemini-pro"])
+
+questions = ["Custom Prompt"]
 with open(join(self_path, "questions.txt")) as f:
     questions += f.read().split("\n")
 
-waiting_lines = (
+waiting_lines = (
+    "Thinking...",
+    "Just a moment...",
+    "Let me think...",
+    "Working on it...",
+    "Processing...",
+    "Hold on...",
+    "One moment...",
+    "On it...",
+)
 
 # agent = load_agent(df, context="", inference_server=inference_server, name=model_name)
 
 # Initialize chat history
 if "responses" not in st.session_state:
     st.session_state.responses = []
-
+
 ### Old code for feedback
 # def push_to_dataset(feedback, comments,output,code,error):
 #     # Load existing dataset or create a new one if it doesn't exist
@@ -89,19 +119,25 @@ if "responses" not in st.session_state:
 
 #     # Add new feedback to the dataset
 #     new_data = {"feedback": [feedback], "comments": [comments], "error": [error], "output": [output], "code": [code]}  # Convert feedback and comments to lists
-#     new_data = Dataset.from_dict(new_data)
+#     new_data = Dataset.from_dict(new_data)
 
 #     ds = concatenate_datasets([ds, new_data])
 
 #     # Push the updated dataset to Hugging Face Hub
-#     ds.push_to_hub("YashB1/Feedbacks_eoc", split="evaluation")
+#     ds.push_to_hub("YashB1/Feedbacks_eoc", split="evaluation")
+
 
 def upload_feedback():
     print("Uploading feedback")
     data = {
-        "feedback": feedback[
-
-
+        "feedback": feedback["score"],
+        "comment": feedback["text"],
+        "error": error,
+        "output": output,
+        "prompt": last_prompt,
+        "code": code,
+    }
+
    # generate a random file name based on current time-stamp: YYYY-MM-DD_HH-MM-SS
    random_folder_name = str(datetime.now()).replace(" ", "_").replace(":", "-").replace(".", "-")
    print("Random folder:", random_folder_name)
@@ -124,71 +160,74 @@ Feedback: {feedback['score']}
 
 Comments: {feedback['text']}
 """
-
+
     print(template, file=f)
-
+
     api = HfApi(token=hf_token)
     api.upload_file(
-
-
-        repo_id="SustainabilityLabIITGN/VayuBuddy_Feedback",
-        repo_type="dataset",
-    )
-    if status['is_image']:
-        api.upload_file(
-            path_or_fileobj=output,
-            path_in_repo=f"data/{random_folder_name}/plot.png",
+        path_or_fileobj=save_path,
+        path_in_repo=path_in_repo,
         repo_id="SustainabilityLabIITGN/VayuBuddy_Feedback",
        repo_type="dataset",
+    )
+    if status["is_image"]:
+        api.upload_file(
+            path_or_fileobj=output,
+            path_in_repo=f"data/{random_folder_name}/plot.png",
+            repo_id="SustainabilityLabIITGN/VayuBuddy_Feedback",
+            repo_type="dataset",
         )
-
+
     print("Feedback uploaded successfully!")
 
+
 # Display chat responses from history on app rerun
-print("#"*10)
+print("#" * 10)
 for response_id, response in enumerate(st.session_state.responses):
     status = show_response(st, response)
     if response["role"] == "assistant":
         feedback_key = f"feedback_{int(response_id/2)}"
         print("response_id", response_id, "feedback_key", feedback_key)
-
+
         error = response["error"]
         output = response["content"]
         last_prompt = response["last_prompt"]
         code = response["gen_code"]
-
+
         if "feedback" in st.session_state.responses[response_id]:
             st.write("Feedback:", st.session_state.responses[response_id]["feedback"])
         else:
             ## !!! This does on work on Safari !!!
             # feedback = streamlit_feedback(feedback_type="thumbs",
             # optional_text_label="[Optional] Please provide extra information", on_submit=upload_feedback, key=feedback_key)
-
-
-            thumbs = st.radio("We would appreciate your feedback!", (
+
+            # Display thumbs up/down buttons for feedback
+            thumbs = st.radio("We would appreciate your feedback!", ("👍", "👎"), index=None, key=feedback_key)
 
             if thumbs:
                 # Text input for comments
-                comments = st.text_area("[Optional] Please provide extra information", key=feedback_key+"_comments")
+                comments = st.text_area("[Optional] Please provide extra information", key=feedback_key + "_comments")
                 feedback = {"score": thumbs, "text": comments}
-                if st.button("Submit", on_click=upload_feedback, key=feedback_key+"_submit"):
+                if st.button("Submit", on_click=upload_feedback, key=feedback_key + "_submit"):
                     st.session_state.responses[response_id]["feedback"] = feedback
                     st.success("Feedback uploaded successfully!")
-
-
-print("#"*10)
+
+
+print("#" * 10)
 
 show = True
 prompt = st.sidebar.selectbox("Select a Prompt:", questions, key="prompt_key")
-if prompt ==
+if prompt == "Custom Prompt":
     show = False
     # React to user input
     prompt = st.chat_input("Ask me anything about air quality!", key=1000)
-    if prompt
+    if prompt:
        show = True
 else:
     # placeholder for chat input
-    st.chat_input(
+    st.chat_input(
+        "Select 'Select a Prompt' -> 'Custom Prompt' in the sidebar to ask your own questions.", key=1000, disabled=True
+    )
 
 if "last_prompt" in st.session_state:
     last_prompt = st.session_state["last_prompt"]
@@ -210,7 +249,9 @@ if prompt:
         for i in range(1):
             print(f"Attempt {i+1}")
             if model_name == "gemini-pro":
-                llm = GoogleGenerativeAI(
+                llm = GoogleGenerativeAI(
+                    model=models[model_name], google_api_key=os.getenv("GEMINI_TOKEN"), temperature=0
+                )
             else:
                 llm = ChatGroq(model=models[model_name], api_key=os.getenv("GROQ_API"), temperature=0)
 
@@ -220,8 +261,8 @@ if prompt:
 
             new_line = "\n"
 
-            parameters = {"font.size": 12,"figure.dpi": 600}
-
+            parameters = {"font.size": 12, "figure.dpi": 600}
+
             template = f"""```python
 import pandas as pd
 import matplotlib.pyplot as plt
@@ -249,7 +290,7 @@ import uuid
 {template}
 
 """
-
+
             answer = None
             code = None
             error = None
@@ -269,32 +310,51 @@ import uuid
                 error = e
                 if code is not None:
                     answer = f"Error executing the code...\n\n{e}"
-
+
             if type(answer) != str:
                 answer = f"!!!Faced an error while working on your query. Please try again!!!"
-
-            response = {
-
+
+            response = {
+                "role": "assistant",
+                "content": answer,
+                "gen_code": code,
+                "ex_code": code,
+                "last_prompt": prompt,
+                "error": error,
+            }
+
+            try:
+                print("Trying to open image", answer)
+                img = Image.open(answer)
+                print("Image opened")
+                image = answer
+                answer = None
+            except:
+                image = None
+
+            item = {
+                "prompt": prompt,
+                "code": code,
+                "answer": answer,
+                "error": error,
+                "model": model_name,
+                "image": image,
+            }
+
             # Update to HuggingFace dataset
             dataset_config = get_dataset_config_info("SustainabilityLabIITGN/VayuBuddy_logs", token=hf_token)
             splits = dataset_config.splits
             last_split = list(splits.keys())[-1]
             last_split_size = splits[last_split].num_examples
-
+
+            ds = load_dataset("SustainabilityLabIITGN/VayuBuddy_logs", token=hf_token, split=last_split)
             if last_split_size >= 100:
                 current_split = str(int(last_split) + 1)
+                ds = Dataset.from_list([item], features=ds.features)
             else:
                 current_split = last_split
-
-            try:
-                img = Image.open(answer)
-                answer = None
-                image = answer
-            except:
-                image = None
+                ds = ds.add_item(item)
 
-            ds = load_dataset("SustainabilityLabIITGN/VayuBuddy_logs", token=hf_token, split=current_split)
-            ds = ds.add_item({"prompt": prompt, "code": code, "answer": answer, "error": error, "model": model_name, "image": image})
             ds.push_to_hub("SustainabilityLabIITGN/VayuBuddy_logs", split=current_split, token=hf_token)
 
             # Get response from agent
@@ -303,20 +363,24 @@ import uuid
 
             if ran:
                 break
-
+
     # Append agent response to chat history
     st.session_state.responses.append(response)
 
-    st.session_state[
-    st.session_state[
+    st.session_state["last_prompt"] = prompt
+    st.session_state["last_model_name"] = model_name
    st.rerun()
-
-
+
+
 # contact details
 contact_details = """
 **Feel free to reach out to us:**
 - [Zeel B Patel](https://patel-zeel.github.io/)
 (PhD Student, IIT Gandhinagar)
+- Vinayak Rana
+(Developer, IIT Gandhinagar)
+- Nitish Sharma
+(Developer, Independent Contributor)
 - Yash J Bachwana
 (Developer, IIT Gandhinagar)
 - [Nipun Batra](https://nipunbatra.github.io/)
@@ -341,5 +405,5 @@ st.markdown(
     }
     </style>
     """,
-    unsafe_allow_html=True
-)
+    unsafe_allow_html=True,
+)
aqi_downloader.ipynb
ADDED
@@ -0,0 +1,785 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 13,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"import os\n",
|
10 |
+
"from os.path import join, dirname, exists\n",
|
11 |
+
"import concurrent\n",
|
12 |
+
"import pandas as pd\n",
|
13 |
+
"from tqdm.notebook import tqdm\n",
|
14 |
+
"import pdfplumber\n",
|
15 |
+
"from datetime import datetime\n",
|
16 |
+
"import requests\n",
|
17 |
+
"import shutil\n",
|
18 |
+
"import sys"
|
19 |
+
]
|
20 |
+
},
|
21 |
+
{
|
22 |
+
"cell_type": "code",
|
23 |
+
"execution_count": 14,
|
24 |
+
"metadata": {},
|
25 |
+
"outputs": [],
|
26 |
+
"source": [
|
27 |
+
"def check_exists(date):\n",
|
28 |
+
" str_date = date.strftime('%Y%m%d')\n",
|
29 |
+
" file_name = f\"AQI_Bulletin_{str_date}.pdf\"\n",
|
30 |
+
" file_path = f\"AQI_data/{file_name}\"\n",
|
31 |
+
" return exists(file_path), file_path, file_name\n",
|
32 |
+
"\n",
|
33 |
+
"def download(date):\n",
|
34 |
+
" file_exists, file_path, file_name = check_exists(date)\n",
|
35 |
+
" if file_exists:\n",
|
36 |
+
" return file_path\n",
|
37 |
+
" \n",
|
38 |
+
" url = f\"https://cpcb.nic.in//upload/Downloads/{file_name}\"\n",
|
39 |
+
" response = requests.get(url)\n",
|
40 |
+
" if response.status_code == 200:\n",
|
41 |
+
" with open(file_path, 'wb') as f:\n",
|
42 |
+
" f.write(response.content)\n",
|
43 |
+
" return file_path\n",
|
44 |
+
" else:\n",
|
45 |
+
" print(f\"Failed to download {url} with status code {response.status_code}\")\n",
|
46 |
+
" return None"
|
47 |
+
]
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"cell_type": "code",
|
51 |
+
"execution_count": 15,
|
52 |
+
"metadata": {},
|
53 |
+
"outputs": [
|
54 |
+
{
|
55 |
+
"name": "stdout",
|
56 |
+
"output_type": "stream",
|
57 |
+
"text": [
|
58 |
+
"DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',\n",
|
59 |
+
" '2016-01-05', '2016-01-06', '2016-01-07', '2016-01-08',\n",
|
60 |
+
" '2016-01-09', '2016-01-10',\n",
|
61 |
+
" ...\n",
|
62 |
+
" '2024-11-02', '2024-11-03', '2024-11-04', '2024-11-05',\n",
|
63 |
+
" '2024-11-06', '2024-11-07', '2024-11-08', '2024-11-09',\n",
|
64 |
+
" '2024-11-10', '2024-11-11'],\n",
|
65 |
+
" dtype='datetime64[ns]', length=3238, freq='D')\n"
|
66 |
+
]
|
67 |
+
},
|
68 |
+
{
|
69 |
+
"data": {
|
70 |
+
"text/plain": [
|
71 |
+
"(None, 3238)"
|
72 |
+
]
|
73 |
+
},
|
74 |
+
"execution_count": 15,
|
75 |
+
"metadata": {},
|
76 |
+
"output_type": "execute_result"
|
77 |
+
}
|
78 |
+
],
|
79 |
+
"source": [
|
80 |
+
"dates = pd.date_range('2016-01-01', datetime.today() - pd.Timedelta(days=1), freq='D')\n",
|
81 |
+
"# dates = pd.date_range('2024-01-01', '2024-02-01', freq='D')\n",
|
82 |
+
"print(dates), len(dates)"
|
83 |
+
]
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"cell_type": "code",
|
87 |
+
"execution_count": 16,
|
88 |
+
"metadata": {},
|
89 |
+
"outputs": [
|
90 |
+
{
|
91 |
+
"name": "stdout",
|
92 |
+
"output_type": "stream",
|
93 |
+
"text": [
|
94 |
+
"Failed to download https://cpcb.nic.in//upload/Downloads/AQI_Bulletin_20160606.pdf with status code 404\n",
|
95 |
+
"Failed to download https://cpcb.nic.in//upload/Downloads/AQI_Bulletin_20171014.pdf with status code 404\n",
|
96 |
+
"Failed to download https://cpcb.nic.in//upload/Downloads/AQI_Bulletin_20170618.pdf with status code 404\n"
|
97 |
+
]
|
98 |
+
}
|
99 |
+
],
|
100 |
+
"source": [
|
101 |
+
"with concurrent.futures.ThreadPoolExecutor(48) as executor:\n",
|
102 |
+
" files = list(executor.map(download, dates))"
|
103 |
+
]
|
104 |
+
},
|
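`executor.map` raises only the first exception, and only once the results are consumed, so a single `ConnectionError` can abort the whole batch. A sketch of per-date error isolation with `as_completed` (`download_all` is a hypothetical wrapper around the `download` and `dates` defined above):

```python
# Sketch: submit each date separately so one failure is logged, not fatal.
from concurrent.futures import ThreadPoolExecutor, as_completed

def download_all(dates, workers=48):
    files = []
    with ThreadPoolExecutor(workers) as executor:
        futures = {executor.submit(download, d): d for d in dates}
        for future in as_completed(futures):
            try:
                files.append(future.result())
            except Exception as e:
                print(f"{futures[future]:%Y-%m-%d} failed: {e}")
    return files
```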
105 |
+
{
|
106 |
+
"cell_type": "code",
|
107 |
+
"execution_count": 17,
|
108 |
+
"metadata": {},
|
109 |
+
"outputs": [
|
110 |
+
{
|
111 |
+
"name": "stdout",
|
112 |
+
"output_type": "stream",
|
113 |
+
"text": [
|
114 |
+
"3238\n"
|
115 |
+
]
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"data": {
|
119 |
+
"text/plain": [
|
120 |
+
"3235"
|
121 |
+
]
|
122 |
+
},
|
123 |
+
"execution_count": 17,
|
124 |
+
"metadata": {},
|
125 |
+
"output_type": "execute_result"
|
126 |
+
}
|
127 |
+
],
|
128 |
+
"source": [
|
129 |
+
"print(len(files))\n",
|
130 |
+
"files = list(filter(None, files))\n",
|
131 |
+
"len(files)"
|
132 |
+
]
|
133 |
+
},
|
134 |
+
{
|
135 |
+
"cell_type": "code",
|
136 |
+
"execution_count": 18,
|
137 |
+
"metadata": {},
|
138 |
+
"outputs": [
|
139 |
+
{
|
140 |
+
"data": {
|
141 |
+
"application/vnd.jupyter.widget-view+json": {
|
142 |
+
"model_id": "6b1ea587cbcb48f6838b51ebc5b2dfc3",
|
143 |
+
"version_major": 2,
|
144 |
+
"version_minor": 0
|
145 |
+
},
|
146 |
+
"text/plain": [
|
147 |
+
" 0%| | 0/3238 [00:00<?, ?it/s]"
|
148 |
+
]
|
149 |
+
},
|
150 |
+
"metadata": {},
|
151 |
+
"output_type": "display_data"
|
152 |
+
},
|
153 |
+
{
|
154 |
+
"name": "stdout",
|
155 |
+
"output_type": "stream",
|
156 |
+
"text": [
|
157 |
+
"File AQI_Bulletin_20160606.pdf does not exist\n",
|
158 |
+
"No tables found in AQI_data/AQI_Bulletin_20160704.pdf\n",
|
159 |
+
"No tables found in AQI_data/AQI_Bulletin_20160721.pdf\n",
|
160 |
+
"No tables found in AQI_data/AQI_Bulletin_20160723.pdf\n",
|
161 |
+
"No tables found in AQI_data/AQI_Bulletin_20160722.pdf\n",
|
162 |
+
"File AQI_Bulletin_20170618.pdf does not exist\n",
|
163 |
+
"File AQI_Bulletin_20171014.pdf does not exist\n"
|
164 |
+
]
|
165 |
+
}
|
166 |
+
],
|
167 |
+
"source": [
|
168 |
+
"def check_valid(value):\n",
|
169 |
+
" if value is None:\n",
|
170 |
+
" return False\n",
|
171 |
+
" if value == \"\":\n",
|
172 |
+
" return False\n",
|
173 |
+
" return True\n",
|
174 |
+
"\n",
|
175 |
+
"def process_pattern_1(table, i, key):\n",
|
176 |
+
" # second line\n",
|
177 |
+
" if (key is not None) and (not check_valid(table[i][0])):\n",
|
178 |
+
" air_quality = table[i][2]\n",
|
179 |
+
" return air_quality\n",
|
180 |
+
" \n",
|
181 |
+
" # first line\n",
|
182 |
+
" if check_valid(table[i][0]):\n",
|
183 |
+
" key = table[i][0]\n",
|
184 |
+
" aqi = int(table[i][4])\n",
|
185 |
+
" pollutant = table[i][5]\n",
|
186 |
+
" air_quality = process_pattern_1(table, i+1, key)\n",
|
187 |
+
" return {key: {\"AQI\": aqi, \"Pollutant\": pollutant, \"Air Quality\": air_quality, \"Based on number of monitoring stations\": None}}\n",
|
188 |
+
" \n",
|
189 |
+
"def process_pattern_2(table, i, key):\n",
|
190 |
+
" # second line\n",
|
191 |
+
" try:\n",
|
192 |
+
" if (key is not None) and (not check_valid(table[i][0])):\n",
|
193 |
+
" air_quality = table[i][2]\n",
|
194 |
+
" return air_quality\n",
|
195 |
+
" except Exception as e:\n",
|
196 |
+
" print(table[i-1])\n",
|
197 |
+
" raise e\n",
|
198 |
+
" \n",
|
199 |
+
" # first line\n",
|
200 |
+
" if check_valid(table[i][0]):\n",
|
201 |
+
" try:\n",
|
202 |
+
" key = table[i][0]\n",
|
203 |
+
" if check_valid(table[i][1]):\n",
|
204 |
+
" air_quality = table[i][1]\n",
|
205 |
+
" need_for_second_line = False\n",
|
206 |
+
" else:\n",
|
207 |
+
" need_for_second_line = True\n",
|
208 |
+
" aqi = int(table[i][4])\n",
|
209 |
+
" pollutant = table[i][5] # p2\n",
|
210 |
+
" n_stations = table[i][6]\n",
|
211 |
+
" except Exception as e:\n",
|
212 |
+
" print(table[i])\n",
|
213 |
+
" print(table)\n",
|
214 |
+
" raise e\n",
|
215 |
+
" if need_for_second_line:\n",
|
216 |
+
" air_quality = process_pattern_2(table, i+1, key)\n",
|
217 |
+
" return {key: {\"AQI\": aqi, \"Pollutant\": pollutant, \"Air Quality\": air_quality, \"Based on number of monitoring stations\": n_stations}}\n",
|
218 |
+
" \n",
|
219 |
+
"def process_pattern_3(table, i, key):\n",
|
220 |
+
" # second line\n",
|
221 |
+
" if (key is not None) and (not check_valid(table[i][0])):\n",
|
222 |
+
" air_quality = table[i][2]\n",
|
223 |
+
" return air_quality\n",
|
224 |
+
" \n",
|
225 |
+
" # first line\n",
|
226 |
+
" if check_valid(table[i][0]):\n",
|
227 |
+
" key = table[i][0]\n",
|
228 |
+
" if check_valid(table[i][1]):\n",
|
229 |
+
" air_quality = table[i][1]\n",
|
230 |
+
" second_line_needed = False\n",
|
231 |
+
" else:\n",
|
232 |
+
" second_line_needed = True\n",
|
233 |
+
" aqi = int(table[i][2])\n",
|
234 |
+
" pollutant = table[i][3]\n",
|
235 |
+
" n_stations = table[i][4]\n",
|
236 |
+
" if second_line_needed:\n",
|
237 |
+
" air_quality = process_pattern_3(table, i+1, key)\n",
|
238 |
+
" return {key: {\"AQI\": aqi, \"Pollutant\": pollutant, \"Air Quality\": air_quality, \"Based on number of monitoring stations\": n_stations}}\n",
|
239 |
+
"\n",
|
240 |
+
"def process_pattern_4(table, i, key):\n",
|
241 |
+
" # ['S.No', 'City', 'Air Quality', 'Index Value', 'Prominent Pollutant', 'Based on Number\\nof Monitoring\\nStations'], ['1', 'Agra', 'Moderate', '138', 'PM\\n2.5', '1'], [None, None, '', None, None, None], ['2', 'Ahmedabad', 'Satisfactory', '77', 'PM\\n10', '1'], [None, None, '', None, None, None], ['3', 'Aizawl', 'Satisfactory', '53', 'PM\\n2.5', '1'], [None, None, '', None, None, None], ['4', 'Ajmer', 'Satisfa\n",
|
242 |
+
" # # invalid line\n",
|
243 |
+
" # if (key is not None) and (not check_valid(table[i][0])):\n",
|
244 |
+
" # air_quality = table[i][2]\n",
|
245 |
+
" # return air_quality\n",
|
246 |
+
" \n",
|
247 |
+
" # first line\n",
|
248 |
+
" if check_valid(table[i][0]):\n",
|
249 |
+
" key = table[i][1]\n",
|
250 |
+
" air_quality = table[i][2]\n",
|
251 |
+
" aqi = int(table[i][3].split(\"\\n\")[0])\n",
|
252 |
+
" pollutant = table[i][4]\n",
|
253 |
+
" n_stations = table[i][5]\n",
|
254 |
+
" return {key: {\"AQI\": aqi, \"Pollutant\": pollutant, \"Air Quality\": air_quality, \"Based on number of monitoring stations\": n_stations}}\n",
|
255 |
+
" \n",
|
256 |
+
"\n",
|
257 |
+
"def process_table(table, start):\n",
|
258 |
+
" data_dict = {}\n",
|
259 |
+
" if table[0] == ['City', 'Air Quality', None, None, 'Index Value', 'Prominent\\nPollutant']:\n",
|
260 |
+
" table = table[1:]\n",
|
261 |
+
" for i in range(len(table)):\n",
|
262 |
+
" data = process_pattern_1(table, i, None)\n",
|
263 |
+
" if data:\n",
|
264 |
+
" data_dict.update(process_pattern_1(table, i, None))\n",
|
265 |
+
" \n",
|
266 |
+
" df = pd.DataFrame(data_dict).T\n",
|
267 |
+
" df.index.name = \"City\"\n",
|
268 |
+
" df.reset_index(inplace=True, drop=False)\n",
|
269 |
+
" return df, None\n",
|
270 |
+
" elif (table[0] == ['City', 'Air Quality', None, None, 'Index Value', 'Prominent\\nPollutant', 'Based on number of\\nmonitoring stations']) or (table[0] == ['City', 'Air Quality', None, None, 'Index Value', None, None, 'Prominent\\nPollutant', 'Based on\\nnumber of\\nstations']) or (table[0] == ['City', 'Air Quality', None, None, 'Index Value', 'Prominent\\nPollutant', 'Based on number of\\nmonitoring stations']) or (table[0] == ['City', 'Air Quality', None, None, 'Index Value', 'Prominent\\nPollutant', 'Based on\\nnumber of\\nmonitoring\\nstations']) or (table[0] == ['City', 'Air Quality', None, None, None, None, 'Index Value', None, None, 'Prominent\\nPollutant', 'Based on number of\\nstations']) or (table[0] == ['City', 'Air Quality', None, None, None, None, 'Index Value', 'Prominent\\nPollutant', 'Based on\\nnumber of\\nmonitoring\\nstations']) or (table[0] == ['City', 'Air Quality', None, None, 'Index\\nValue', 'Prominent\\nPollutant', 'Based on number of\\nmonitoring stations']) or (table[0] == ['City', 'Air Quality', None, None, None, None, 'Index Value', 'Prominent\\nPollutant', 'Based on\\nnumber of\\nstations']):\n",
|
271 |
+
" table = table[1:]\n",
|
272 |
+
" for i in range(len(table)):\n",
|
273 |
+
" data = process_pattern_2(table, i, None)\n",
|
274 |
+
" if data:\n",
|
275 |
+
" data_dict.update(process_pattern_2(table, i, None))\n",
|
276 |
+
" \n",
|
277 |
+
" df = pd.DataFrame(data_dict).T\n",
|
278 |
+
" df.index.name = \"City\"\n",
|
279 |
+
" df.reset_index(inplace=True, drop=False)\n",
|
280 |
+
" return df, None\n",
|
281 |
+
" elif table[0] == ['City', 'Air Quality', 'Index Value', 'Prominent\\nPollutant', 'Based on number of\\nmonitoring stations']:\n",
|
282 |
+
" # print(\"Pattern 3\")\n",
|
283 |
+
" table = table[1:]\n",
|
284 |
+
" for i in range(len(table)):\n",
|
285 |
+
" data = process_pattern_3(table, i, None)\n",
|
286 |
+
" if data:\n",
|
287 |
+
" data_dict.update(process_pattern_3(table, i, None))\n",
|
288 |
+
" df = pd.DataFrame(data_dict).T\n",
|
289 |
+
" df.index.name = \"City\"\n",
|
290 |
+
" df.reset_index(inplace=True, drop=False)\n",
|
291 |
+
" return df, None\n",
|
292 |
+
" elif (table[0] == ['S.No', 'City', 'Air Quality', 'Index Value', 'Prominent Polluta\\nnt', 'Based on number\\nof monitoring\\nstations']) or (table[0] == ['S.No', 'City', 'Air Quality', 'Index Value', 'Prominent Pollutant', 'Based on Number\\nof Monitoring\\nStations']) or (table[0] == ['S.No', 'City', 'Air Quality', 'Index\\nValue', 'Prominent Pollutant', 'No. of Stations\\nParticipated/\\nTotal Stations']):\n",
|
293 |
+
" # print(\"Pattern 4\")\n",
|
294 |
+
" table = table[1:]\n",
|
295 |
+
" for i in range(len(table)):\n",
|
296 |
+
" data = process_pattern_4(table, i, None)\n",
|
297 |
+
" if data:\n",
|
298 |
+
" data_dict.update(process_pattern_4(table, i, None))\n",
|
299 |
+
" \n",
|
300 |
+
" df = pd.DataFrame(data_dict).T\n",
|
301 |
+
" df.index.name = \"City\"\n",
|
302 |
+
" df.reset_index(inplace=True, drop=False)\n",
|
303 |
+
" return df, None\n",
|
304 |
+
" elif (table[0] == ['Good', 'Minimal impact']) or (table[0] == ['Good', 'Minimal Impact']) or (table[0] == ['AQI', 'Category', 'Color Code', 'Possible Health Impacts']):\n",
|
305 |
+
" # print(\"Not a data table\")\n",
|
306 |
+
" return None, None\n",
|
307 |
+
" else:\n",
|
308 |
+
" print(table)\n",
|
309 |
+
" raise ValueError(\"Table pattern not recognized\")\n",
|
310 |
+
"\n",
|
311 |
+
"def process_file(date):\n",
|
312 |
+
" file_exists, file_path, file_name = check_exists(date)\n",
|
313 |
+
" if not file_exists:\n",
|
314 |
+
" print(f\"File {file_name} does not exist\")\n",
|
315 |
+
" return None\n",
|
316 |
+
" save_path = file_name.replace(\".pdf\",\".csv\")\n",
|
317 |
+
" if exists(f\"AQI_data_csv/{save_path}\"):\n",
|
318 |
+
" try:\n",
|
319 |
+
" pd.read_csv(f\"AQI_data_csv/{save_path}\")\n",
|
320 |
+
" # print(f\"File {save_path} already exists\")\n",
|
321 |
+
" return None\n",
|
322 |
+
" except Exception as e:\n",
|
323 |
+
" print(f\"File {save_path} is corrupted and will be overwritten\")\n",
|
324 |
+
" \n",
|
325 |
+
" tables = []\n",
|
326 |
+
" with pdfplumber.open(file_path) as pdf:\n",
|
327 |
+
" for page in pdf.pages:\n",
|
328 |
+
" table = page.extract_table()\n",
|
329 |
+
" if table:\n",
|
330 |
+
" tables.append(table)\n",
|
331 |
+
" try:\n",
|
332 |
+
" assert len(tables) > 0, f\"No tables found in {file_path}\"\n",
|
333 |
+
" except AssertionError:\n",
|
334 |
+
" print(f\"No tables found in {file_path}\")\n",
|
335 |
+
" return None\n",
|
336 |
+
"\n",
|
337 |
+
" df_list = []\n",
|
338 |
+
" for table in tables:\n",
|
339 |
+
" try:\n",
|
340 |
+
" df, _ = process_table(table, 0)\n",
|
341 |
+
" if df is not None:\n",
|
342 |
+
" df_list.append(df)\n",
|
343 |
+
" except Exception as e:\n",
|
344 |
+
" print(f\"Ignoring a table for {file_name}\")\n",
|
345 |
+
" # print(table)\n",
|
346 |
+
" print(\"Error message:\", e)\n",
|
347 |
+
" \n",
|
348 |
+
" if len(df_list) == 0:\n",
|
349 |
+
" print(f\"No valid tables found in {file_name}\")\n",
|
350 |
+
" return None\n",
|
351 |
+
" \n",
|
352 |
+
" df = pd.concat(df_list, ignore_index=True)\n",
|
353 |
+
" df['Date'] = date\n",
|
354 |
+
" df.to_csv(f\"AQI_data_csv/{save_path}\", index=False)\n",
|
355 |
+
"\n",
|
356 |
+
"from joblib import Parallel, delayed\n",
|
357 |
+
"\n",
|
358 |
+
"# dfs = {}\n",
|
359 |
+
"# for file_path in tqdm(files[1000:]):\n",
|
360 |
+
" # print(file_path)\n",
|
361 |
+
" # df = process_file(file_path)\n",
|
362 |
+
" # dfs[file_path] s= df\n",
|
363 |
+
"# print(dates[15:16])\n",
|
364 |
+
"_ = Parallel(48)(delayed(process_file)(file_path) for file_path in tqdm(dates))"
|
365 |
+
]
|
366 |
+
},
|
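The long `elif` chain in `process_table` matches every raw newline/`None` variant of the header row literally, which is why eight near-identical lists are OR-ed together. A normalization pass would collapse those variants before dispatching; this is a sketch of the idea, not what the notebook does, and the cleaning rules are assumptions:

```python
# Sketch: canonicalize a pdfplumber header row before pattern dispatch.
def normalize_header(row):
    return [cell.replace("\n", " ").strip().lower() for cell in row if cell]

# normalize_header(['City', 'Air Quality', None, None,
#                   'Index Value', 'Prominent\nPollutant'])
# -> ['city', 'air quality', 'index value', 'prominent pollutant']
```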
367 |
+
{
|
368 |
+
"cell_type": "markdown",
|
369 |
+
"metadata": {},
|
370 |
+
"source": [
|
371 |
+
"## Postprocessing"
|
372 |
+
]
|
373 |
+
},
|
374 |
+
{
|
375 |
+
"cell_type": "code",
|
376 |
+
"execution_count": 19,
|
377 |
+
"metadata": {},
|
378 |
+
"outputs": [
|
379 |
+
{
|
380 |
+
"data": {
|
381 |
+
"text/plain": [
|
382 |
+
"372342"
|
383 |
+
]
|
384 |
+
},
|
385 |
+
"execution_count": 19,
|
386 |
+
"metadata": {},
|
387 |
+
"output_type": "execute_result"
|
388 |
+
}
|
389 |
+
],
|
390 |
+
"source": [
|
391 |
+
"merged_df = pd.read_csv(\"AQI_data_csv/merged.csv\")\n",
|
392 |
+
"len(merged_df)"
|
393 |
+
]
|
394 |
+
},
|
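`merged.csv` is read here but never written in this notebook; presumably it is the concatenation of the per-day CSVs that `process_file` writes into `AQI_data_csv/`. A plausible construction (the glob pattern is an assumption):

```python
# Hypothetical build step for merged.csv from the per-day CSVs.
import glob
import pandas as pd

csv_files = sorted(glob.glob("AQI_data_csv/AQI_Bulletin_*.csv"))
merged = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)
merged.to_csv("AQI_data_csv/merged.csv", index=False)
```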
395 |
+
{
|
396 |
+
"cell_type": "code",
|
397 |
+
"execution_count": 20,
|
398 |
+
"metadata": {},
|
399 |
+
"outputs": [
|
400 |
+
{
|
401 |
+
"name": "stdout",
|
402 |
+
"output_type": "stream",
|
403 |
+
"text": [
|
404 |
+
"['Agartala', 'Agra', 'Ahmedabad', 'Ahmednagar', 'Aizawl', 'Ajmer', 'Akola', 'Alwar', 'Ambala', 'Amravati', 'Amritsar', 'Anantapur', 'Angul', 'Ankleshwar', 'Araria', 'Ariyalur', 'Arrah', 'Asansol', 'Aurangabad (Bihar)', 'Aurangabad(Maharashtra)', 'Baddi', 'Badlapur', 'Bagalkot', 'Baghpat', 'Bahadurgarh', 'Balasore', 'Ballabgarh', 'Banswara', 'Baran', 'Barbil', 'Bareilly', 'Baripada', 'Barmer', 'Barrackpore', 'Bathinda', 'Begusarai', 'Belapur', 'Belgaum', 'Bengaluru', 'Bettiah', 'Bhagalpur', 'Bharatpur', 'Bhilai', 'Bhilwara', 'Bhiwadi', 'Bhiwandi', 'Bhiwani', 'Bhopal', 'Bhubaneswar', 'Bidar', 'Bihar Sharif', 'Bikaner', 'Bilaspur', 'Bileipada', 'Boisar', 'Brajrajnagar', 'Bulandshahr', 'Bundi', 'Buxar', 'Byasanagar', 'Byrnihat', 'Chamarajanagar', 'Chandigarh', 'Chandrapur', 'Charkhi Dadri', 'Chengalpattu', 'Chennai', 'Chhal', 'Chhapra', 'Chikkaballapur', 'Chikkamagaluru', 'Chittoor', 'Chittorgarh', 'Churu', 'Coimbtore', 'Cuddalore', 'Cuttack', 'Damoh', 'Darbhanga', 'Dausa', 'Davanagere', 'Dehradun', 'Delhi', 'Dewas', 'Dhanbad', 'Dharuhera', 'Dharwad', 'Dholpur', 'Dhule', 'Dindigul', 'Dungarpur', 'Durgapur', 'Eloor', 'Ernakulam', 'Faridabad', 'Fatehabad', 'Firozabad', 'Gadag', 'Gandhinagar', 'Gangtok', 'Gaya', 'Ghaziabad', 'Gorakhpur', 'Greater_Noida', 'Gummidipoondi', 'Gurugram', 'Guwahati', 'Gwalior', 'Hajipur', 'Haldia', 'Hanumangarh', 'Hapur', 'Hassan', 'Haveri', 'Hisar', 'Hosur', 'Howrah', 'Hubballi', 'Hyderabad', 'Imphal', 'Indore', 'Jabalpur', 'Jaipur', 'Jaisalmer', 'Jalandhar', 'Jalgaon', 'Jalna', 'Jalore', 'Jhalawar', 'Jhansi', 'Jharsuguda', 'Jhunjhunu', 'Jind', 'Jodhpur', 'Jorapokhar', 'Kadapa', 'Kaithal', 'Kalaburgi', 'Kalyan', 'Kanchipuram', 'Kannur', 'Kanpur', 'Karauli', 'Karnal', 'Karur', 'Karwar', 'Kashipur', 'Katihar', 'Katni', 'Keonjhar', 'Khanna', 'Khurja', 'Kishanganj', 'Kochi', 'Kohima', 'Kolar', 'Kolhapur', 'Kolkata', 'Kollam', 'Koppal', 'Korba', 'Kota', 'Kozhikode', 'Kunjemura', 'Kurushketra', 'Latur', 'Loni_Ghaziabad', 'Lucknow', 'Ludhiana', 'Madurai', 'Mahad', 'Maihar', 'Malegaon', 'Mandi Gobindgarh', 'Mandideep', 'Mandikhera', 'Manesar', 'Mangalore', 'Manguraha', 'Medikeri', 'Meerut', 'Milupara', 'Mira-Bhayandar', 'Moradabad', 'Motihari', 'Mumbai', 'Munger', 'Muzaffarnagar', 'Muzaffarpur', 'Mysuru', 'NOIDA', 'Nagaon', 'Nagapattinam', 'Nagaur', 'Nagpur', 'Naharlagun', 'Nalbari', 'Nanded', 'Nandesari', 'Narnaul', 'Nashik', 'Navi Mumbai', 'Nayagarh', 'Noida', 'Ooty', 'Pali', 'Palkalaiperur', 'Palwal', 'Panchkula', 'Panipat', 'Parbhani', 'Pathardih', 'Patiala', 'Patna', 'Pimpri-Chinchwad', 'Pithampur', 'Pratapgarh', 'Prayagraj', 'Puducherry', 'Pudukottai', 'Pune', 'Purnia', 'Raichur', 'Raipur', 'Rairangpur', 'Rajamahendravaram', 'Rajgir', 'Rajsamand', 'Ramanagara', 'Ramanathapuram', 'Ranipet', 'Ratlam', 'Rishikesh', 'Rohtak', 'Rourkela', 'Rupnagar', 'Sagar', 'Saharsa', 'Salem', 'Samastipur', 'Sangli', 'Sasaram', 'Satna', 'Sawai Madhopur', 'Shillong', 'Shivamogga', 'Sikar', 'Silchar', 'Siliguri', 'Singrauli', 'Sirohi', 'Sirsa', 'Sivasagar', 'Siwan', 'Solapur', 'Sonipat', 'Sri Ganganagar', 'Srinagar', 'Suakati', 'Surat', 'Talcher', 'Tensa', 'Thane', 'Thanjavur', 'Thiruvananthapuram', 'Thoothukudi', 'Thrissur', 'Tiruchirappalli', 'Tirunelveli', 'Tirupati', 'Tirupur', 'Tonk', 'Tumidih', 'Udaipur', 'Udupi', 'Ujjain', 'Ulhasnagar', 'Vapi', 'Varanasi', 'Vatva', 'Vellore', 'Vijayapura', 'Vijayawada', 'Virar', 'Virudhunagar', 'Visakhapatnam', 'Vrindavan', 'Yadgir', 'Yamunanagar']\n"
|
405 |
+
]
|
406 |
+
}
|
407 |
+
],
|
408 |
+
"source": [
|
409 |
+
"city_mapping = {\n",
|
410 |
+
" \"Amaravati\": \"Amravati\",\n",
|
411 |
+
" \"Asanol\": \"Asansol\",\n",
|
412 |
+
" \"Greater Noida\": \"Greater_Noida\",\n",
|
413 |
+
" \"GandhiNagar\": \"Gandhinagar\",\n",
|
414 |
+
" \"Gurgaon\": \"Gurugram\",\n",
|
415 |
+
" \"Coimbatore\": \"Coimbtore\",\n",
|
416 |
+
" \"Kalaburagi\": \"Kalaburgi\",\n",
|
417 |
+
" \"Kurukshetra\": \"Kurushketra\",\n",
|
418 |
+
" \"Loni_Dehat\": \"Loni_Ghaziabad\",\n",
|
419 |
+
" \"Madikeri\": \"Medikeri\",\n",
|
420 |
+
" \"Manglore\": \"Mangalore\",\n",
|
421 |
+
" \"Pimpri Chinchwad\": \"Pimpri-Chinchwad\",\n",
|
422 |
+
" \"Tumakuru\": \"Tumidih\",\n",
|
423 |
+
" \"Tiruppur\": \"Tirupur\",\n",
|
424 |
+
" \"Yamuna Nagar\": \"Yamunanagar\",\n",
|
425 |
+
" \"vellore\": \"Vellore\" # duplicate, can map to itself or be handled separately\n",
|
426 |
+
"}\n",
|
427 |
+
"def replace_it(x):\n",
|
428 |
+
" x = x.strip().replace(\"\\n\",\"\")\n",
|
429 |
+
" if x in city_mapping:\n",
|
430 |
+
" return city_mapping[x]\n",
|
431 |
+
" else:\n",
|
432 |
+
" return x\n",
|
433 |
+
"\n",
|
434 |
+
"merged_df['City'] = merged_df['City'].apply(lambda x: replace_it(x))\n",
|
435 |
+
"merged_df = merged_df[merged_df.City != \"Aurangabad\"]\n",
|
436 |
+
"print(merged_df['City'].value_counts().sort_index().index.tolist())"
|
437 |
+
]
|
438 |
+
},
|
439 |
+
{
|
440 |
+
"cell_type": "code",
|
441 |
+
"execution_count": 21,
|
442 |
+
"metadata": {},
|
443 |
+
"outputs": [
|
444 |
+
{
|
445 |
+
"data": {
|
446 |
+
"text/plain": [
|
447 |
+
"State\n",
|
448 |
+
"Andhra Pradesh 10881\n",
|
449 |
+
"Arunachal Pradesh 557\n",
|
450 |
+
"Assam 4582\n",
|
451 |
+
"Bihar 26391\n",
|
452 |
+
"Chandigarh 1874\n",
|
453 |
+
"Chhattisgarh 4674\n",
|
454 |
+
"Delhi 3224\n",
|
455 |
+
"Gujarat 11558\n",
|
456 |
+
"Haryana 49090\n",
|
457 |
+
"Himachal Pradesh 916\n",
|
458 |
+
"Jammu and Kashmir 822\n",
|
459 |
+
"Jharkhand 1872\n",
|
460 |
+
"Karnataka 33054\n",
|
461 |
+
"Kerala 11121\n",
|
462 |
+
"Madhya Pradesh 29800\n",
|
463 |
+
"Maharashtra 35954\n",
|
464 |
+
"Manipur 724\n",
|
465 |
+
"Meghalaya 1839\n",
|
466 |
+
"Mizoram 1433\n",
|
467 |
+
"Nagaland 1305\n",
|
468 |
+
"Odisha 10816\n",
|
469 |
+
"Puducherry 1330\n",
|
470 |
+
"Punjab 18876\n",
|
471 |
+
"Rajasthan 34231\n",
|
472 |
+
"Sikkim 772\n",
|
473 |
+
"Tamil Nadu 12170\n",
|
474 |
+
"Telangana 3216\n",
|
475 |
+
"Tripura 1349\n",
|
476 |
+
"Uttar Pradesh 39710\n",
|
477 |
+
"Uttarakhand 1866\n",
|
478 |
+
"West Bengal 14674\n",
|
479 |
+
"Name: count, dtype: int64"
|
480 |
+
]
|
481 |
+
},
|
482 |
+
"execution_count": 21,
|
483 |
+
"metadata": {},
|
484 |
+
"output_type": "execute_result"
|
485 |
+
}
|
486 |
+
],
|
487 |
+
"source": [
|
488 |
+
"city_to_state = {\n",
|
489 |
+
" 'Agartala': 'Tripura', 'Agra': 'Uttar Pradesh', 'Ahmedabad': 'Gujarat', 'Ahmednagar': 'Maharashtra',\n",
|
490 |
+
" 'Aizawl': 'Mizoram', 'Ajmer': 'Rajasthan', 'Akola': 'Maharashtra', 'Alwar': 'Rajasthan', \n",
|
491 |
+
" 'Ambala': 'Haryana', 'Amravati': 'Maharashtra', 'Amritsar': 'Punjab', 'Anantapur': 'Andhra Pradesh', \n",
|
492 |
+
" 'Angul': 'Odisha', 'Ankleshwar': 'Gujarat', 'Araria': 'Bihar', 'Ariyalur': 'Tamil Nadu', \n",
|
493 |
+
" 'Arrah': 'Bihar', 'Asansol': 'West Bengal', 'Aurangabad (Bihar)': 'Bihar', \n",
|
494 |
+
" 'Aurangabad(Maharashtra)': 'Maharashtra', 'Baddi': 'Himachal Pradesh', 'Badlapur': 'Maharashtra', \n",
|
495 |
+
" 'Bagalkot': 'Karnataka', 'Baghpat': 'Uttar Pradesh', 'Bahadurgarh': 'Haryana', 'Balasore': 'Odisha', \n",
|
496 |
+
" 'Ballabgarh': 'Haryana', 'Banswara': 'Rajasthan', 'Baran': 'Rajasthan', 'Barbil': 'Odisha', \n",
|
497 |
+
" 'Bareilly': 'Uttar Pradesh', 'Baripada': 'Odisha', 'Barmer': 'Rajasthan', 'Barrackpore': 'West Bengal', \n",
|
498 |
+
" 'Bathinda': 'Punjab', 'Begusarai': 'Bihar', 'Belapur': 'Maharashtra', 'Belgaum': 'Karnataka', \n",
|
499 |
+
" 'Bengaluru': 'Karnataka', 'Bettiah': 'Bihar', 'Bhagalpur': 'Bihar', 'Bharatpur': 'Rajasthan', \n",
|
500 |
+
" 'Bhilai': 'Chhattisgarh', 'Bhilwara': 'Rajasthan', 'Bhiwadi': 'Rajasthan', 'Bhiwandi': 'Maharashtra', \n",
|
501 |
+
" 'Bhiwani': 'Haryana', 'Bhopal': 'Madhya Pradesh', 'Bhubaneswar': 'Odisha', 'Bidar': 'Karnataka', \n",
|
502 |
+
" 'Bihar Sharif': 'Bihar', 'Bikaner': 'Rajasthan', 'Bilaspur': 'Chhattisgarh', 'Bileipada': 'Odisha', \n",
|
503 |
+
" 'Boisar': 'Maharashtra', 'Brajrajnagar': 'Odisha', 'Bulandshahr': 'Uttar Pradesh', 'Bundi': 'Rajasthan', \n",
|
504 |
+
" 'Buxar': 'Bihar', 'Byasanagar': 'Odisha', 'Byrnihat': 'Meghalaya', 'Chamarajanagar': 'Karnataka', \n",
|
505 |
+
" 'Chandigarh': 'Chandigarh', 'Chandrapur': 'Maharashtra', 'Charkhi Dadri': 'Haryana', \n",
|
506 |
+
" 'Chengalpattu': 'Tamil Nadu', 'Chennai': 'Tamil Nadu', 'Chhal': 'Chhattisgarh', 'Chhapra': 'Bihar', \n",
|
507 |
+
" 'Chikkaballapur': 'Karnataka', 'Chikkamagaluru': 'Karnataka', 'Chittoor': 'Andhra Pradesh', \n",
|
508 |
+
" 'Chittorgarh': 'Rajasthan', 'Churu': 'Rajasthan', 'Coimbtore': 'Tamil Nadu', 'Cuddalore': 'Tamil Nadu', \n",
|
509 |
+
" 'Cuttack': 'Odisha', 'Damoh': 'Madhya Pradesh', 'Darbhanga': 'Bihar', 'Dausa': 'Rajasthan', \n",
|
510 |
+
" 'Davanagere': 'Karnataka', 'Dehradun': 'Uttarakhand', 'Delhi': 'Delhi', 'Dewas': 'Madhya Pradesh', \n",
|
511 |
+
" 'Dhanbad': 'Jharkhand', 'Dharuhera': 'Haryana', 'Dharwad': 'Karnataka', 'Dholpur': 'Rajasthan', \n",
|
512 |
+
" 'Dhule': 'Maharashtra', 'Dindigul': 'Tamil Nadu', 'Dungarpur': 'Rajasthan', 'Durgapur': 'West Bengal', \n",
|
513 |
+
" 'Eloor': 'Kerala', 'Ernakulam': 'Kerala', 'Faridabad': 'Haryana', 'Fatehabad': 'Haryana', \n",
|
514 |
+
" 'Firozabad': 'Uttar Pradesh', 'Gadag': 'Karnataka', 'Gandhinagar': 'Gujarat', 'Gangtok': 'Sikkim', \n",
|
515 |
+
" 'Gaya': 'Bihar', 'Ghaziabad': 'Uttar Pradesh', 'Gorakhpur': 'Uttar Pradesh', 'Greater_Noida': 'Uttar Pradesh', \n",
|
516 |
+
" 'Gummidipoondi': 'Tamil Nadu', 'Gurugram': 'Haryana', 'Guwahati': 'Assam', 'Gwalior': 'Madhya Pradesh', \n",
|
517 |
+
" 'Hajipur': 'Bihar', 'Haldia': 'West Bengal', 'Hanumangarh': 'Rajasthan', 'Hapur': 'Uttar Pradesh', \n",
|
518 |
+
" 'Hassan': 'Karnataka', 'Haveri': 'Karnataka', 'Hisar': 'Haryana', 'Hosur': 'Tamil Nadu', 'Howrah': 'West Bengal', \n",
|
519 |
+
" 'Hubballi': 'Karnataka', 'Hyderabad': 'Telangana', 'Imphal': 'Manipur', 'Indore': 'Madhya Pradesh', \n",
|
520 |
+
" 'Jabalpur': 'Madhya Pradesh', 'Jaipur': 'Rajasthan', 'Jaisalmer': 'Rajasthan', 'Jalandhar': 'Punjab', \n",
|
521 |
+
" 'Jalgaon': 'Maharashtra', 'Jalna': 'Maharashtra', 'Jalore': 'Rajasthan', 'Jhalawar': 'Rajasthan', \n",
|
522 |
+
" 'Jhansi': 'Uttar Pradesh', 'Jharsuguda': 'Odisha', 'Jhunjhunu': 'Rajasthan', 'Jind': 'Haryana', \n",
|
523 |
+
" 'Jodhpur': 'Rajasthan', 'Jorapokhar': 'Jharkhand', 'Kadapa': 'Andhra Pradesh', 'Kaithal': 'Haryana', \n",
|
524 |
+
" 'Kalaburgi': 'Karnataka', 'Kalyan': 'Maharashtra', 'Kanchipuram': 'Tamil Nadu', 'Kannur': 'Kerala', \n",
|
525 |
+
" 'Kanpur': 'Uttar Pradesh', 'Karauli': 'Rajasthan', 'Karnal': 'Haryana', 'Karur': 'Tamil Nadu', \n",
|
526 |
+
" 'Karwar': 'Karnataka', 'Kashipur': 'Uttarakhand', 'Katihar': 'Bihar', 'Katni': 'Madhya Pradesh', \n",
|
527 |
+
" 'Keonjhar': 'Odisha', 'Khanna': 'Punjab', 'Khurja': 'Uttar Pradesh', 'Kishanganj': 'Bihar', \n",
|
528 |
+
" 'Kochi': 'Kerala', 'Kohima': 'Nagaland', 'Kolar': 'Karnataka', 'Kolhapur': 'Maharashtra', \n",
|
529 |
+
" 'Kolkata': 'West Bengal', 'Kollam': 'Kerala', 'Koppal': 'Karnataka', 'Korba': 'Chhattisgarh', \n",
|
530 |
+
" 'Kota': 'Rajasthan', 'Kozhikode': 'Kerala', 'Kunjemura': 'Jharkhand', 'Kurushketra': 'Haryana', \n",
|
531 |
+
" 'Latur': 'Maharashtra', 'Loni_Ghaziabad': 'Uttar Pradesh', 'Lucknow': 'Uttar Pradesh', 'Ludhiana': 'Punjab', \n",
|
532 |
+
" 'Madurai': 'Tamil Nadu', 'Mahad': 'Maharashtra', 'Maihar': 'Madhya Pradesh', 'Malegaon': 'Maharashtra', \n",
|
533 |
+
" 'Mandi Gobindgarh': 'Punjab', 'Mandideep': 'Madhya Pradesh', 'Mandikhera': 'Haryana', 'Manesar': 'Haryana', \n",
|
534 |
+
" 'Mangalore': 'Karnataka', 'Manguraha': 'Bihar', 'Medikeri': 'Karnataka', 'Meerut': 'Uttar Pradesh', \n",
|
535 |
+
" 'Milupara': 'Chhattisgarh', 'Mira-Bhayandar': 'Maharashtra', 'Moradabad': 'Uttar Pradesh', \n",
|
536 |
+
" 'Motihari': 'Bihar', 'Mumbai': 'Maharashtra', 'Munger': 'Bihar', 'Muzaffarnagar': 'Uttar Pradesh', \n",
|
537 |
+
" 'Muzaffarpur': 'Bihar', 'Mysuru': 'Karnataka', 'NOIDA': 'Uttar Pradesh', 'Nagaon': 'Assam', \n",
|
538 |
+
" 'Nagapattinam': 'Tamil Nadu', 'Nagaur': 'Rajasthan', 'Nagpur': 'Maharashtra', 'Naharlagun': 'Arunachal Pradesh', \n",
|
539 |
+
" 'Nalbari': 'Assam', 'Nanded': 'Maharashtra', 'Nandesari': 'Gujarat', 'Narnaul': 'Haryana', 'Nashik': 'Maharashtra',\n",
|
540 |
+
" 'Navi Mumbai': 'Maharashtra',\n",
|
541 |
+
" 'Nayagarh': 'Odisha',\n",
|
542 |
+
" 'Noida': 'Uttar Pradesh',\n",
|
543 |
+
" 'Ooty': 'Tamil Nadu',\n",
|
544 |
+
" 'Pali': 'Rajasthan',\n",
|
545 |
+
" 'Palkalaiperur': 'Tamil Nadu',\n",
|
546 |
+
" 'Palwal': 'Haryana',\n",
|
547 |
+
" 'Panchkula': 'Haryana',\n",
|
548 |
+
" 'Panipat': 'Haryana',\n",
|
549 |
+
" 'Parbhani': 'Maharashtra',\n",
|
550 |
+
" 'Pathardih': 'Jharkhand',\n",
|
551 |
+
" 'Patiala': 'Punjab',\n",
|
552 |
+
" 'Patna': 'Bihar',\n",
|
553 |
+
" 'Pimpri-Chinchwad': 'Maharashtra',\n",
|
554 |
+
" 'Pithampur': 'Madhya Pradesh',\n",
|
555 |
+
" 'Pratapgarh': 'Rajasthan',\n",
|
556 |
+
" 'Prayagraj': 'Uttar Pradesh',\n",
|
557 |
+
" 'Puducherry': 'Puducherry',\n",
|
558 |
+
" 'Pudukottai': 'Tamil Nadu',\n",
|
559 |
+
" 'Pune': 'Maharashtra',\n",
|
560 |
+
" 'Purnia': 'Bihar',\n",
|
561 |
+
" 'Raichur': 'Karnataka',\n",
|
562 |
+
" 'Raipur': 'Chhattisgarh',\n",
|
563 |
+
" 'Rairangpur': 'Odisha',\n",
|
564 |
+
" 'Rajamahendravaram': 'Andhra Pradesh',\n",
|
565 |
+
" 'Rajgir': 'Bihar',\n",
|
566 |
+
" 'Rajsamand': 'Rajasthan',\n",
|
567 |
+
" 'Ramanagara': 'Karnataka',\n",
|
568 |
+
" 'Ramanathapuram': 'Tamil Nadu',\n",
|
569 |
+
" 'Ranipet': 'Tamil Nadu',\n",
|
570 |
+
" 'Ratlam': 'Madhya Pradesh',\n",
|
571 |
+
" 'Rishikesh': 'Uttarakhand',\n",
|
572 |
+
" 'Rohtak': 'Haryana',\n",
|
573 |
+
" 'Rourkela': 'Odisha',\n",
|
574 |
+
" 'Rupnagar': 'Punjab',\n",
|
575 |
+
" 'Sagar': 'Madhya Pradesh',\n",
|
576 |
+
" 'Saharsa': 'Bihar',\n",
|
577 |
+
" 'Salem': 'Tamil Nadu',\n",
|
578 |
+
" 'Samastipur': 'Bihar',\n",
|
579 |
+
" 'Sangli': 'Maharashtra',\n",
|
580 |
+
" 'Sasaram': 'Bihar',\n",
|
581 |
+
" 'Satna': 'Madhya Pradesh',\n",
|
582 |
+
" 'Sawai Madhopur': 'Rajasthan',\n",
|
583 |
+
" 'Shillong': 'Meghalaya',\n",
|
584 |
+
" 'Shivamogga': 'Karnataka',\n",
|
585 |
+
" 'Sikar': 'Rajasthan',\n",
|
586 |
+
" 'Silchar': 'Assam',\n",
|
587 |
+
" 'Siliguri': 'West Bengal',\n",
|
588 |
+
" 'Singrauli': 'Madhya Pradesh',\n",
|
589 |
+
" 'Sirohi': 'Rajasthan',\n",
|
590 |
+
" 'Sirsa': 'Haryana',\n",
|
591 |
+
" 'Sivasagar': 'Assam',\n",
|
592 |
+
" 'Siwan': 'Bihar',\n",
|
593 |
+
" 'Solapur': 'Maharashtra',\n",
|
594 |
+
" 'Sonipat': 'Haryana',\n",
|
595 |
+
" 'Sri Ganganagar': 'Rajasthan',\n",
|
596 |
+
" 'Srinagar': 'Jammu and Kashmir',\n",
|
597 |
+
" 'Suakati': 'Odisha',\n",
|
598 |
+
" 'Surat': 'Gujarat',\n",
|
599 |
+
" 'Talcher': 'Odisha',\n",
|
600 |
+
" 'Tensa': 'Odisha',\n",
|
601 |
+
" 'Thane': 'Maharashtra',\n",
|
602 |
+
" 'Thanjavur': 'Tamil Nadu',\n",
|
603 |
+
" 'Thiruvananthapuram': 'Kerala',\n",
|
604 |
+
" 'Thoothukudi': 'Tamil Nadu',\n",
|
605 |
+
" 'Thrissur': 'Kerala',\n",
|
606 |
+
" 'Tiruchirappalli': 'Tamil Nadu',\n",
|
607 |
+
" 'Tirunelveli': 'Tamil Nadu',\n",
|
608 |
+
" 'Tirupati': 'Andhra Pradesh',\n",
|
609 |
+
" 'Tirupur': 'Tamil Nadu',\n",
|
610 |
+
" 'Tonk': 'Rajasthan',\n",
|
611 |
+
" 'Tumidih': 'Chhattisgarh',\n",
|
612 |
+
" 'Udaipur': 'Rajasthan',\n",
|
613 |
+
" 'Udupi': 'Karnataka',\n",
|
614 |
+
" 'Ujjain': 'Madhya Pradesh',\n",
|
615 |
+
" 'Ulhasnagar': 'Maharashtra',\n",
|
616 |
+
" 'Vapi': 'Gujarat',\n",
|
617 |
+
" 'Varanasi': 'Uttar Pradesh', \n",
|
618 |
+
" 'Vatva': 'Gujarat', 'Vellore': 'Tamil Nadu',\n",
|
619 |
+
" 'Vijayapura': 'Karnataka',\n",
|
620 |
+
" 'Vijayawada': 'Andhra Pradesh',\n",
|
621 |
+
" 'Virar': 'Maharashtra',\n",
|
622 |
+
" 'Virudhunagar': 'Tamil Nadu',\n",
|
623 |
+
" 'Visakhapatnam': 'Andhra Pradesh',\n",
|
624 |
+
" 'Vrindavan': 'Uttar Pradesh',\n",
|
625 |
+
" 'Yadgir': 'Karnataka',\n",
|
626 |
+
" 'Yamunanagar': 'Haryana'\n",
|
627 |
+
"}\n",
|
628 |
+
"merged_df['State'] = merged_df['City'].apply(lambda x: city_to_state[x])\n",
|
629 |
+
"merged_df['State'].value_counts().sort_index()"
|
630 |
+
]
|
631 |
+
},
|
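The bare dict lookup `city_to_state[x]` raises a `KeyError` partway through the `apply` if any city is missing from the mapping, without saying which cities are affected. A defensive variant (sketch) maps first and reports all gaps at once:

```python
# Sketch: .map() leaves unmapped cities as NaN instead of raising mid-apply.
merged_df["State"] = merged_df["City"].map(city_to_state)
missing = merged_df.loc[merged_df["State"].isna(), "City"].unique()
assert len(missing) == 0, f"Unmapped cities: {missing}"
```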
632 |
+
{
|
633 |
+
"cell_type": "code",
|
634 |
+
"execution_count": 34,
|
635 |
+
"metadata": {},
|
636 |
+
"outputs": [
|
637 |
+
{
|
638 |
+
"data": {
|
639 |
+
"text/html": [
|
640 |
+
"<div>\n",
|
641 |
+
"<style scoped>\n",
|
642 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
643 |
+
" vertical-align: middle;\n",
|
644 |
+
" }\n",
|
645 |
+
"\n",
|
646 |
+
" .dataframe tbody tr th {\n",
|
647 |
+
" vertical-align: top;\n",
|
648 |
+
" }\n",
|
649 |
+
"\n",
|
650 |
+
" .dataframe thead th {\n",
|
651 |
+
" text-align: right;\n",
|
652 |
+
" }\n",
|
653 |
+
"</style>\n",
|
654 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
655 |
+
" <thead>\n",
|
656 |
+
" <tr style=\"text-align: right;\">\n",
|
657 |
+
" <th></th>\n",
|
658 |
+
" <th>City</th>\n",
|
659 |
+
" <th>AQI</th>\n",
|
660 |
+
" <th>Pollutant</th>\n",
|
661 |
+
" <th>Air Quality</th>\n",
|
662 |
+
" <th>Based on number of monitoring stations</th>\n",
|
663 |
+
" <th>Date</th>\n",
|
664 |
+
" <th>State</th>\n",
|
665 |
+
" </tr>\n",
|
666 |
+
" </thead>\n",
|
667 |
+
" <tbody>\n",
|
668 |
+
" <tr>\n",
|
669 |
+
" <th>0</th>\n",
|
670 |
+
" <td>Agra</td>\n",
|
671 |
+
" <td>417</td>\n",
|
672 |
+
" <td>PM\\n2.5</td>\n",
|
673 |
+
" <td>Severe</td>\n",
|
674 |
+
" <td>1</td>\n",
|
675 |
+
" <td>2016-01-01</td>\n",
|
676 |
+
" <td>Uttar Pradesh</td>\n",
|
677 |
+
" </tr>\n",
|
678 |
+
" <tr>\n",
|
679 |
+
" <th>1</th>\n",
|
680 |
+
" <td>Bengaluru</td>\n",
|
681 |
+
" <td>95</td>\n",
|
682 |
+
" <td>PM , PM\\n2.5 10</td>\n",
|
683 |
+
" <td>Satisfactory</td>\n",
|
684 |
+
" <td>5</td>\n",
|
685 |
+
" <td>2016-01-01</td>\n",
|
686 |
+
" <td>Karnataka</td>\n",
|
687 |
+
" </tr>\n",
|
688 |
+
" </tbody>\n",
|
689 |
+
"</table>\n",
|
690 |
+
"</div>"
|
691 |
+
],
|
692 |
+
"text/plain": [
|
693 |
+
" City AQI Pollutant Air Quality \\\n",
|
694 |
+
"0 Agra 417 PM\\n2.5 Severe \n",
|
695 |
+
"1 Bengaluru 95 PM , PM\\n2.5 10 Satisfactory \n",
|
696 |
+
"\n",
|
697 |
+
" Based on number of monitoring stations Date State \n",
|
698 |
+
"0 1 2016-01-01 Uttar Pradesh \n",
|
699 |
+
"1 5 2016-01-01 Karnataka "
|
700 |
+
]
|
701 |
+
},
|
702 |
+
"execution_count": 34,
|
703 |
+
"metadata": {},
|
704 |
+
"output_type": "execute_result"
|
705 |
+
}
|
706 |
+
],
|
707 |
+
"source": [
|
708 |
+
"# merged_df['Based on number of monitoring stations'] = merged_df['Based on number of monitoring stations'].apply(lambda x: int(x.split(\"/\")[0].split(\"#\")[0].split(\"\\n\")[0]))\n",
|
709 |
+
"merged_df['Date'] = pd.to_datetime(merged_df['Date'])\n",
|
710 |
+
"merged_df.head(2)"
|
711 |
+
]
|
712 |
+
},
|
713 |
+
{
|
714 |
+
"cell_type": "code",
|
715 |
+
"execution_count": 37,
|
716 |
+
"metadata": {},
|
717 |
+
"outputs": [
|
718 |
+
{
|
719 |
+
"data": {
|
720 |
+
"text/plain": [
|
721 |
+
"370681"
|
722 |
+
]
|
723 |
+
},
|
724 |
+
"execution_count": 37,
|
725 |
+
"metadata": {},
|
726 |
+
"output_type": "execute_result"
|
727 |
+
}
|
728 |
+
],
|
729 |
+
"source": [
|
730 |
+
"len(merged_df)"
|
731 |
+
]
|
732 |
+
},
|
733 |
+
{
|
734 |
+
"cell_type": "code",
|
735 |
+
"execution_count": 43,
|
736 |
+
"metadata": {},
|
737 |
+
"outputs": [
|
738 |
+
{
|
739 |
+
"name": "stdout",
|
740 |
+
"output_type": "stream",
|
741 |
+
"text": [
|
742 |
+
"370681\n",
|
743 |
+
"369967\n"
|
744 |
+
]
|
745 |
+
}
|
746 |
+
],
|
747 |
+
"source": [
|
748 |
+
"merged_df['Date_City'] = merged_df['Date'].astype(str) + \"_\" + merged_df['City']\n",
|
749 |
+
"print(len(merged_df))\n",
|
750 |
+
"merged_df = merged_df.drop_duplicates(subset=['Date_City'], keep='first')\n",
|
751 |
+
"print(len(merged_df))"
|
752 |
+
]
|
753 |
+
},
|
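The `Date_City` helper column exists only to drive `drop_duplicates` and would otherwise survive into the parquet written below. A sketch that dedupes on the two columns directly, with no helper to clean up:

```python
# Sketch: deduplicate on the column pair itself; nothing extra leaks
# into the (Date, City)-indexed parquet produced in the final cell.
merged_df = merged_df.drop_duplicates(subset=["Date", "City"], keep="first")
```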
754 |
+
{
|
755 |
+
"cell_type": "code",
|
756 |
+
"execution_count": 46,
|
757 |
+
"metadata": {},
|
758 |
+
"outputs": [],
|
759 |
+
"source": [
|
760 |
+
"merged_df.set_index([\"Date\", \"City\"]).to_xarray().to_dataframe().to_parquet(\"AQI_data.parquet\")"
|
761 |
+
]
|
762 |
+
}
|
763 |
+
],
|
764 |
+
"metadata": {
|
765 |
+
"kernelspec": {
|
766 |
+
"display_name": "zeel_py310",
|
767 |
+
"language": "python",
|
768 |
+
"name": "python3"
|
769 |
+
},
|
770 |
+
"language_info": {
|
771 |
+
"codemirror_mode": {
|
772 |
+
"name": "ipython",
|
773 |
+
"version": 3
|
774 |
+
},
|
775 |
+
"file_extension": ".py",
|
776 |
+
"mimetype": "text/x-python",
|
777 |
+
"name": "python",
|
778 |
+
"nbconvert_exporter": "python",
|
779 |
+
"pygments_lexer": "ipython3",
|
780 |
+
"version": "3.10.15"
|
781 |
+
}
|
782 |
+
},
|
783 |
+
"nbformat": 4,
|
784 |
+
"nbformat_minor": 2
|
785 |
+
}
|
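The final cell's `set_index(["Date", "City"]).to_xarray().to_dataframe()` round-trip densifies the panel: the parquet contains every `(Date, City)` combination, with `NaN` wherever a city has no bulletin entry for a day. A sketch of reading it back and checking coverage:

```python
# Sketch: load the densified panel and inspect how sparse it is.
import pandas as pd

panel = pd.read_parquet("AQI_data.parquet")
print(panel.index.names)           # ['Date', 'City']
print(panel["AQI"].isna().mean())  # share of city-days with no reading
```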
prompts.csv
ADDED
@@ -0,0 +1,57 @@
1 |
+
prompt,Desired Answer,Category ,llama3_answer_1721726247,llama3_score_1721726247,mixtral_answer_1721726407,mixtral_score_1721726407,gemma_answer_1721726499,gemma_score_1721726499,Codestral Mamba_answer_1721759526,Codestral Mamba_score_1721759526,Codestral_answer_1721759762,Codestral_score_1721759762
|
2 |
+
Which month has the highest average PM2.5 in 2023 for Mumbai?,January,,The month with the highest average PM2.5 in 2023 for Mumbai is January with an average of 93.31 µg/m³.,TRUE,The month with the highest average PM2.5 in 2023 for Mumbai is January with an average of 93.31277266048505 µg/m³.,TRUE,The highest average PM2.5 in 2023 for Mumbai was 93.31 µg/m³. It occurred in January.,TRUE,"content='To find the month with the highest average PM2.5 in 2023 for Mumbai, you can follow these steps:\n\n1. Filter the data for the year 2023 and the city of Mumbai.\n2. Group the data by month and calculate the average PM2.5 for each month.\n3. Find the month with the highest average PM2.5.\n\nHere\'s the code to accomplish this:\n\n```python\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\ndf = pd.read_csv(""Data.csv"")\ndf[""Timestamp""] = pd.to_datetime(df[""Timestamp""])\n\n# Filter data for the year 2023 and Mumbai\nmumbai_2023 = df[(df[""city""] == ""Mumbai"") & (df[""Timestamp""].dt.year == 2023)]\n\n# Group data by month and calculate average PM2.5\naverage_pm25 = mumbai_2023.groupby(mumbai_2023[""Timestamp""].dt.month)[""PM2.5""].mean()\n\n# Find the month with the highest average PM2.5\nhighest_month = average_pm25.idxmax()\n\n# Print the result\nprint(f""The month with the highest average PM2.5 in 2023 for Mumbai is {highest_month}."")\n```\n\nThis code will output the month with the highest average PM2.5 in 2023 for Mumbai.\n\nThe unit of the data is micrograms per cubic meter (µg/m³).' response_metadata={'token_usage': {'prompt_tokens': 313, 'total_tokens': 696, 'completion_tokens': 383}, 'model': 'open-codestral-mamba', 'finish_reason': 'stop'} id='run-a1f7518c-da50-4559-8aa5-46df78dfb29c-0' usage_metadata={'input_tokens': 313, 'output_tokens': 383, 'total_tokens': 696}",TRUE,The month with the highest average PM2.5 in 2023 for Mumbai is January. The average PM2.5 is 93.31 µg/m³,TRUE
|
3 |
+
Which month generally has the highest pollution?,November,,The month with the highest pollution is 11 with an average PM2.5 of 106.61 µg/m³.,TRUE,"The month with the highest pollution (on average) is month 11, with an average PM2.5 of 106.61250133484131 µg/m³.",TRUE,"The month with the highest average PM2.5 is November, with an average of 106.61 µg/m³.",TRUE,"content='To find out which month generally has the highest pollution, we can group the data by month and calculate the average PM2.5 for each month. Then, we can find the month with the highest average PM2.5.\n\nHere\'s the code to accomplish this:\n\n```python\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\ndf = pd.read_csv(""Data.csv"")\ndf[""Timestamp""] = pd.to_datetime(df[""Timestamp""])\n\n# Group the data by month and calculate the average PM2.5 for each month\nmonthly_avg = df.groupby(df[""Timestamp""].dt.month)[""PM2.5""].mean()\n\n# Find the month with the highest average PM2.5\nhighest_month = monthly_avg.idxmax()\n\n# Print the result\nprint(""The month with the highest pollution is:"", highest_month)\n```\n\nThis code will output the month with the highest average PM2.5.\n\nThe unit of the data is µg/m³, as PM2.5 is measured in micrograms per cubic meter.' response_metadata={'token_usage': {'prompt_tokens': 300, 'total_tokens': 554, 'completion_tokens': 254}, 'model': 'open-codestral-mamba', 'finish_reason': 'stop'} id='run-a1dd6414-e96c-4672-99dc-6cf79be7925b-0' usage_metadata={'input_tokens': 300, 'output_tokens': 254, 'total_tokens': 554}",FALSE,The month with the highest pollution is November with an average PM2.5 of 106.61250133484131 µg/m³,TRUE
|
4 |
+
Which city has the highest PM2.5 level in July 2022?,Saharsa,,The city with the highest PM2.5 level in July 2022 is Saharsa with an average PM2.5 level of 65.47860816896052 µg/m³.,TRUE,"Mumbai has the highest PM2.5 level in July 2022, with a value of 833.8 µg/m³.",FALSE,"Error: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().",FALSE,"content='To find the city with the highest PM2.5 level in July 2022, you can follow these steps:\n\n1. Filter the data for the month of July 2022.\n2. Group the data by city and calculate the mean PM2.5 level for each city.\n3. Find the city with the highest mean PM2.5 level.\n\nHere\'s the code to accomplish this:\n\n```python\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\ndf = pd.read_csv(""Data.csv"")\ndf[""Timestamp""] = pd.to_datetime(df[""Timestamp""])\n\n# Filter data for July 2022\njuly_2022_data = df[(df[""Timestamp""].dt.month == 7) & (df[""Timestamp""].dt.year == 2022)]\n\n# Group data by city and calculate mean PM2.5 level\ncity_pm25_levels = july_2022_data.groupby(""city"")[""PM2.5""].mean()\n\n# Find city with the highest PM2.5 level\nhighest_pm25_city = city_pm25_levels.idxmax()\n\n# Print the result\nprint(""The city with the highest PM2.5 level in July 2022 is:"", highest_pm25_city)\n```\n\nThis code will output the city with the highest PM2.5 level in July 2022.\n\nThe unit of the PM2.5 data is micrograms per cubic meter (µg/m³).' response_metadata={'token_usage': {'prompt_tokens': 310, 'total_tokens': 678, 'completion_tokens': 368}, 'model': 'open-codestral-mamba', 'finish_reason': 'stop'} id='run-196fb218-c16e-48ad-880a-563573e05cc2-0' usage_metadata={'input_tokens': 310, 'output_tokens': 368, 'total_tokens': 678}",FALSE,The city with the highest PM2.5 level in July 2022 is Saharsa with an average PM2.5 level of 65.47860816896052 µg/m³,TRUE
|
5 |
+
"Which (month, year) has the highest PM2.5?",2017 November,,"The month and year with the highest PM2.5 is (11.0, 2017.0) with an average PM2.5 of 149.09 µg/m³.",TRUE,The month and year with the highest PM2.5 value is 11/2017.,TRUE,"Error: invalid decimal literal (<string>, line 30)",FALSE,Nov-17,TRUE,"The month and year with the highest average PM2.5 is November, 2017 with a value of 149.09468767706852 µg/m³",TRUE
|
6 |
+
Which state has the highest average PM2.5?,Delhi,,The state with the highest average PM2.5 is Delhi with an average of 104.96 µg/m³.,TRUE,The state with the highest average PM2.5 is Delhi with an average of 104.96 µg/m³.,TRUE,"Error: invalid decimal literal (<string>, line 30)",FALSE,"content='To find the state with the highest average PM2.5, you can group the data by state and calculate the mean PM2.5 for each group. Then, you can find the state with the maximum mean PM2.5.\n\nHere\'s the code to accomplish this:\n\n```python\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\ndf = pd.read_csv(""Data.csv"")\ndf[""Timestamp""] = pd.to_datetime(df[""Timestamp""])\n\n# Group the data by state and calculate the mean PM2.5 for each group\nmean_pm25_by_state = df.groupby(\'state\')[\'PM2.5\'].mean()\n\n# Find the state with the maximum mean PM2.5\nstate_with_max_pm25 = mean_pm25_by_state.idxmax()\n\n# Print the result\nprint(""The state with the highest average PM2.5 is:"", state_with_max_pm25)\n```\n\nThis code will output the state with the highest average PM2.5.\n\nThe unit of the data is micrograms per cubic meter (µg/m³).' response_metadata={'token_usage': {'prompt_tokens': 303, 'total_tokens': 569, 'completion_tokens': 266}, 'model': 'open-codestral-mamba', 'finish_reason': 'stop'} id='run-4ce9fd47-5024-4dcb-b933-d1ea9ec4cfd7-0' usage_metadata={'input_tokens': 303, 'output_tokens': 266, 'total_tokens': 569}",FALSE,The state with the highest average PM2.5 is Delhi with an average of 104.96301991330218 µg/m³,TRUE
|
7 |
+
What is the name of the month with the highest average PM2.5 overall?,November,,The month with the highest average PM2.5 is November with an average of 149.09 µg/m³.,TRUE,The average PM2.5 in the month of December is 106.61 µg/m³,FALSE,Error: 'str' object has no attribute 'strftime',FALSE,November,TRUE,The month with the highest average PM2.5 overall is November. The average PM2.5 is 106.61 µg/m³,TRUE
|
8 |
+
"Which city had the lowest average PM2.5 level on January 15th, 2023?",Silchar,,"The city with the lowest average PM2.5 level on January 15th, 2023 is Silchar with an average PM2.5 of 6.209687499999999 µg/m³.",TRUE,Error: attempt to get argmin of an empty sequence,FALSE,"Error: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().",FALSE,"content='To find the city with the lowest average PM2.5 level on January 15th, 2023, you can use the following code:\n\n```python\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\ndf = pd.read_csv(""Data.csv"")\ndf[""Timestamp""] = pd.to_datetime(df[""Timestamp""])\n\n# Filter data for January 15th, 2023\nfiltered_data = df[(df[""Timestamp""].dt.month == 1) & (df[""Timestamp""].dt.day == 15) & (df[""Timestamp""].dt.year == 2023)]\n\n# Group data by city and calculate the average PM2.5 level\naverage_pm25 = filtered_data.groupby(""city"")[""PM2.5""].mean()\n\n# Find the city with the lowest average PM2.5 level\nlowest_city = average_pm25.idxmin()\n\n# Print the result\nprint(""The city with the lowest average PM2.5 level on January 15th, 2023 is:"", lowest_city)\n```\n\nThe unit of the PM2.5 data in this case is micrograms per cubic meter (µg/m³).' response_metadata={'token_usage': {'prompt_tokens': 316, 'total_tokens': 607, 'completion_tokens': 291}, 'model': 'open-codestral-mamba', 'finish_reason': 'stop'} id='run-136e6aed-6f13-4d1d-b4c6-f5c73ddfb742-0' usage_metadata={'input_tokens': 316, 'output_tokens': 291, 'total_tokens': 607}",FALSE,"The city with the lowest average PM2.5 level on January 15th, 2023 is Silchar with an average PM2.5 level of 6.209687499999999 µg/m³",TRUE
|
9 |
+
What was the PM2.5 level in Kolkata on Diwali day in 2022?,18.31,,The average PM2.5 level in Kolkata on Diwali day in 2022 is 18.31 µg/m³,TRUE,83.29255319,FALSE,"content='```python\n# What was the PM2.5 level in Kolkata on Diwali day in 2022?\n\ndf_filtered = df[(df[\'city\'] == \'Kolkata\') & (df[\'Timestamp\'].dt.month == 11) & (df[\'Timestamp\'].dt.day == 22)]\n\npm2_5_diwali = df_filtered[\'PM2.5\'].values[0]\n\nprint(f""The PM2.5 level in Kolkata on Diwali day in 2022 was {pm2_5_diwali} µg/m³."")\n```' response_metadata={'token_usage': {'completion_time': 0.148857874, 'completion_tokens': 130, 'prompt_time': 0.200585383, 'prompt_tokens': 296, 'queue_time': None, 'total_time': 0.349443257, 'total_tokens': 426}, 'model_name': 'gemma-7b-it', 'system_fingerprint': 'fp_7d8efeb0b1', 'finish_reason': 'stop', 'logprobs': None} id='run-b65480e1-1191-434e-8da6-c2abe3016d05-0'",FALSE,41.98108108,FALSE,The PM2.5 level in Kolkata on Diwali day in 2022 was 83.29255319148938 µg/m³,FALSE
|
10 |
+
How many days did Chennai exceed the PM2.5 level of 150 in March 2023?,0,,0,TRUE,0,TRUE,1011,FALSE,0,TRUE,0,TRUE
|
11 |
+
What was the average PM2.5 level in Delhi for the year 2023?,102.18,,The average PM2.5 level in Delhi for the year 2023 is 102.18 µg/m³,TRUE,The average PM2.5 level in Delhi for the year 2023 was 102.18 µg/m³.,TRUE,102.1839229,TRUE,"content='To find the average PM2.5 level in Delhi for the year 2023, you can use the following code:\n\n```python\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\ndf = pd.read_csv(""Data.csv"")\ndf[""Timestamp""] = pd.to_datetime(df[""Timestamp""])\n\n# Filter data for Delhi in 2023\ndelhi_2023 = df[(df[""city""] == ""Delhi"") & (df[""Timestamp""].dt.year == 2023)]\n\n# Calculate average PM2.5 level\naverage_pm25 = delhi_2023[""PM2.5""].mean()\n\n# Print the result\nprint(""The average PM2.5 level in Delhi for the year 2023 is"", average_pm25, ""µg/m³"")\n```\n\nThis code first filters the data to include only the rows for Delhi in the year 2023. Then, it calculates the average PM2.5 level by taking the mean of the ""PM2.5"" column. Finally, it prints the result.\n\nThe unit of the data is µg/m³ (micrograms per cubic meter).' response_metadata={'token_usage': {'prompt_tokens': 312, 'total_tokens': 598, 'completion_tokens': 286}, 'model': 'open-codestral-mamba', 'finish_reason': 'stop'} id='run-cc197e4c-cc1a-4353-8e59-6ee572266271-0' usage_metadata={'input_tokens': 312, 'output_tokens': 286, 'total_tokens': 598}",FALSE,The average PM2.5 level in Delhi for the year 2023 was 102.18392291539847 µg/m³,TRUE
|
12 |
+
Which city had the highest PM2.5 levels in January 2023?,Begusarai,,The city with the highest PM2.5 levels in January 2023 is Begusarai with an average PM2.5 of 318.5035825572216 µg/m³.,TRUE,"Error: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().",FALSE,"Error: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().",FALSE,Begusarai,TRUE,The city with the highest PM2.5 levels in January 2023 is Begusarai with an average PM2.5 level of 318.5035825572216 µg/m³,TRUE
|
13 |
+
What are the top three most polluted cities based on PM2.5 levels?,Byrnihat Begusarai Angul,,"The top three most polluted cities in India based on PM2.5 levels are: Byrnihat, Begusarai, and Angul with average PM2.5 levels of 151.39 µg/m³, 133.71 µg/m³, and 118.56 µg/m³ respectively.",TRUE,"Series([], Name: PM2.5, dtype: float64)",FALSE,"Empty DataFrame
|
14 |
+
Columns: [city, PM2.5]
|
15 |
+
Index: []",FALSE,"city
|
16 |
+
Byrnihat 151.394592
|
17 |
+
Begusarai 133.711359
|
18 |
+
Angul 118.555268
|
19 |
+
Name: PM2.5, dtype: float64",TRUE,"city
|
20 |
+
Byrnihat 151.394592
|
21 |
+
Begusarai 133.711359
|
22 |
+
Angul 118.555268
|
23 |
+
Name: PM2.5, dtype: float64",TRUE
|
24 |
+
How many days in 2023 did Mumbai exceed the WHO's PM2.5 guidelines?,365,,"In 2023, Mumbai exceeded the WHO's PM2.5 guidelines on 5954 days.",FALSE,0,FALSE,Error: Cannot perform 'rand_' with a dtyped [object] array and scalar of type [bool],FALSE,5954,FALSE,5954,FALSE
|
25 |
+
What was the maximum PM2.5 level recorded in Kolkata in 2023?,195.69,,The maximum PM2.5 level recorded in Kolkata in 2023 is 195.6911578947369 µg/m³,TRUE,The maximum PM2.5 level recorded in Kolkata in 2023 was 195.69 µg/m³.,TRUE,The maximum PM2.5 level recorded in Kolkata in 2023 was 195.69 µg/m³.,TRUE,195.6911579,TRUE,The maximum PM2.5 level recorded in Kolkata in 2023 is 195.6911578947369 µg/m³,TRUE
|
26 |
+
What is the difference between the pollution level of the month before lockdown start and the month after lockdown end in Hyderabad in 2020,-25.76,,The difference in pollution levels before and after lockdown in Hyderabad in 2020 is -25.76 µg/m³.,TRUE,-25.76412152,TRUE,Error: 'year',FALSE,-3.692312722,FALSE,The difference in the average PM2.5 pollution level between the month before lockdown start and the month after lockdown end in Hyderabad in 2020 is -7.76 µg/m³,FALSE
|
27 |
+
What is the percentage change in PM2.5 levels in Delhi from 2019 to 2023?,-5.89,,The percentage change in PM2.5 levels in Delhi from 2019 to 2023 is -5.89%.,TRUE,The percentage change in PM2.5 levels in Delhi from 2019 to 2023 is -5.89% (µg/m³).,TRUE,"content='```python\n# Filter the data to Delhi station\ndelhi = df[df[\'city\'] == \'Delhi\']\n\n# Filter the data to the year range\ndf_filtered = delhi[df[\'Timestamp\'].between(\'2019-01-01\', \'2023-12-31\')]\n\n# Calculate the average PM2.5 levels for each year\npm25_2019 = df_filtered[df_filtered[\'Timestamp\'].dt.year == 2019][\'PM2.5\'].mean()\npm25_2023 = df_filtered[df_filtered[\'Timestamp\'].dt.year == 2023][\'PM2.5\'].mean()\n\n# Calculate the percentage change\npercentage_change = (pm25_2023 - pm25_2019) / pm25_2019 * 100\n\n# Print the result\nprint(""The percentage change in PM2.5 levels in Delhi from 2019 to 2023 is approximately {:.2f}%"".format(percentage_change))\n\n#answer = percentage_change\n```' response_metadata={'token_usage': {'completion_time': 0.298705402, 'completion_tokens': 254, 'prompt_time': 0.208471803, 'prompt_tokens': 302, 'queue_time': None, 'total_time': 0.5071772050000001, 'total_tokens': 556}, 'model_name': 'gemma-7b-it', 'system_fingerprint': 'fp_e3b292b95f', 'finish_reason': 'stop', 'logprobs': None} id='run-073c43dc-8e74-4b7c-9265-b504d60f3fa7-0'",FALSE,7.285826926,FALSE,The percentage change in PM2.5 levels in Delhi from 2019 to 2023 is -5.89%,TRUE
|
28 |
+
Which state had the lowest average PM2.5 levels in 2023?,Mizoram,,The state with the lowest average PM2.5 levels in 2023 is Mizoram with an average of 10.010255033721341 µg/m³.,TRUE,The state with the lowest average PM2.5 levels in 2023 was Mizoram with an average of 10.01 µg/m³.,TRUE,"Error: invalid syntax (<string>, line 30)",FALSE,The state with the lowest average PM2.5 levels in 2023 was Mizoram with an average of 10.010255033721341 µg/m³.,TRUE,The state with the lowest average PM2.5 levels in 2023 is Mizoram with an average of 10.010255033721341 µg/m³,TRUE
|
29 |
+
"Which (month,year) show the most impact on reducing PM2.5 levels?",2020 August,,The month and year that show the most impact on reducing PM2.5 levels are 8.0 2020.0 with an average PM2.5 of 20.8460273003573 µg/m³.,TRUE,"43 The month and year that shows the most impact ...
|
30 |
+
Name: month_year, dtype: object",FALSE,"Error: invalid syntax (<string>, line 30)",FALSE,"(8, 2020)",TRUE,The month and year that show the most impact on reducing PM2.5 levels is 8-2020 with an average PM2.5 of 20.8460273003573 µg/m³,FALSE
|
31 |
+
Which city in India has the best air quality?,Silchar,,Error: attempt to get argmin of an empty sequence,FALSE,Error: 'country',FALSE,Error: 'float' object is not subscriptable,FALSE,Error: attempt to get argmin of an empty sequence,FALSE,The city in India with the best air quality is Silchar. The average PM2.5 is 9.564287909611249 µg/m³.,TRUE
|
32 |
+
Number of cities had PM2.5 levels above the WHO guideline in November 2023?,266,,There were 266 cities that had PM2.5 levels above the WHO guideline (15 µg/m³) in November 2023.,TRUE,266,TRUE,Error: 'Series' object has no attribute 'columns',FALSE,266,TRUE,274,FALSE
|
33 |
+
What was the PM2.5 level during the spring season in 2022?,61.59,,The average PM2.5 level during the spring season in 2022 is 61.59 µg/m³,TRUE,61.58655549,TRUE,61.58655549,TRUE,61.58655549,TRUE,The average PM2.5 level during the spring season in 2022 was 61.59 µg/m³,TRUE
|
34 |
+
By how much is the PM2.5 levels in Delhi on weekdays more than the weekends in 2023?,-2.83,,The PM2.5 levels in Delhi on weekdays are -2.83 µg/m³ more than the weekends in 2023.,TRUE,The PM2.5 levels in Delhi are -2.83 µg/m³ higher on weekdays than on weekends in 2023.,TRUE,"Error: invalid decimal literal (<string>, line 36)",FALSE,Error: name 'weekdays' is not defined,FALSE,The PM2.5 levels in Delhi on weekdays are -2.83 µg/m³ more than the weekends in 2023.,TRUE
|
35 |
+
What was the percentage change in PM2.5 levels in Varanasi before and after the implementation of the National Clean Air Programme (NCAP)?,-61.208,,The percentage change in PM2.5 levels in Varanasi before and after the implementation of the National Clean Air Programme (NCAP) is -61.21%.,TRUE,"content='To find the percentage change in PM2.5 levels in Varanasi before and after the implementation of the National Clean Air Programme (NCAP), we first need to filter the data for Varanasi station and then find the average PM2.5 levels before and after the implementation date.\n\nLet\'s assume the NCAP was implemented on 2019-01-01. We can filter the data using the following code:\n\n```python\nvaranasi_df = df[df[""station""] == ""Varanasi""]\nbefore_ncap_df = varanasi_df[varanasi_df[""Timestamp""] < ""2019-01-01""]\nafter_ncap_df = varanasi_df[varanasi_df[""Timestamp""] >= ""2019-01-01""]\n```\n\nNow, we can find the average PM2.5 levels before and after NCAP using the `mean()` function:\n\n```python\nbefore_ncap_avg = before_ncap_df[""PM2.5""].mean()\nafter_ncap_avg = after_ncap_df[""PM2.5""].mean()\n```\n\nFinally, we can calculate the percentage change using the following formula:\n\n```python\npercentage_change = ((after_ncap_avg - before_ncap_avg) / before_ncap_avg) * 100\n```\n\nSo, the final code would look like this:\n\n```python\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\ndf = pd.read_csv(""Data.csv"")\ndf[""Timestamp""] = pd.to_datetime(df[""Timestamp""])\n\nvaranasi_df = df[df[""station""] == ""Varanasi""]\nbefore_ncap_df = varanasi_df[varanasi_df[""Timestamp""] < ""2019-01-01""]\nafter_ncap_df = varanasi_df[varanasi_df[""Timestamp""] >= ""2019-01-01""]\n\nbefore_ncap_avg = before_ncap_df[""PM2.5""].mean()\nafter_ncap_avg = after_ncap_df[""PM2.5""].mean()\n\npercentage_change = ((after_ncap_avg - before_ncap_avg) / before_ncap_avg) * 100\n\nanswer = f""The percentage change in PM2.5 levels in Varanasi before and after the implementation of the National Clean Air Programme (NCAP) is {percentage_change:.2f}%.""\n```\n\nNote: The actual percentage change may vary depending on the actual implementation date of the NCAP.' response_metadata={'token_usage': {'completion_time': 1.008675864, 'completion_tokens': 634, 'prompt_time': 0.016755465, 'prompt_tokens': 330, 'queue_time': None, 'total_time': 1.025431329, 'total_tokens': 964}, 'model_name': 'mixtral-8x7b-32768', 'system_fingerprint': 'fp_c5f20b5bb1', 'finish_reason': 'stop', 'logprobs': None} id='run-158bdf2f-55bd-4b7c-8a46-c03efde070f3-0'",FALSE,"Error: invalid decimal literal (<string>, line 34)",FALSE,"The percentage change in PM2.5 levels in Varanasi before the implementation of the National Clean Air Programme (NCAP) was 2.12%. After the implementation, the percentage change was 7.35%.",FALSE,The percentage change in PM2.5 levels in Varanasi before and after the implementation of the National Clean Air Programme (NCAP) is -61.96%.,TRUE
|
36 |
+
"Which season of Summer, Winter, Spring, Autumn experiences high pollution levels.",Winter,,The season with high pollution levels is Winter with an average PM2.5 of 95.60 µg/m³.,TRUE,The season with the highest average PM2.5 level is Winter with an average of 91.53 µg/m³.,TRUE,Error: 'Series' object is not callable,FALSE,Winter,TRUE,The season that experiences high pollution levels is Winter. The average PM2.5 is 95.60 µg/m³,TRUE
|
37 |
+
"In which city in Gujarat, GIDC has highest pollution?",Vapi,,Error: attempt to get argmax of an empty sequence,FALSE,Error: attempt to get argmax of an empty sequence,FALSE,"The city in Gujarat with the highest PM2.5 pollution is GIDC, with a reading of nan µg/m³. ",FALSE,Surat,FALSE,Error: attempt to get argmax of an empty sequence,FALSE
|
38 |
+
How many cities had PM2.5 levels below the national standard in 2022?,201,,,,,,,,,,,
|
39 |
+
Which city experienced the highest percentage increase in PM2.5 levels in summer 2023?,Fatehabad,,,,,,,,,,,
|
40 |
+
What were the PM2.5 levels in Kochi on Christmas Day in 2023?,54.77,,,,,,,,,,,
|
41 |
+
What was the average PM2.5 level in Patna for the winter season in 2020?,132.44,,,,,,,,,,,
|
42 |
+
Which city has the lowest PM2.5 in India's largest rice producing state.,Haldia,,,,,,,,,,,
|
43 |
+
Which city has the highest PM2.5 in India's largest milk producing state.,Ghaziabad,,,,,,,,,,,
|
44 |
+
What is the difference between average pollution level of most polluted and least polluted city of Rajasthan?,76.95,,,,,,,,,,,
|
45 |
+
Which among the top 3 cities selected under smart city mission have the least pollution?,Pune,,,,,,,,,,,
|
46 |
+
Which metro city has the highest pollution?,Delhi,,,,,,,,,,,
|
47 |
+
Which out of the 9 metro city has the lowest pollution?,Bengaluru,,,,,,,,,,,
|
48 |
+
Which city is the most affected in terms of pollution due Christmas?,Begusarai,,,,,,,,,,,
|
49 |
+
Which of the cities that have top 9 IITs has the highest PM2.5?,Delhi,,,,,,,,,,,
|
50 |
+
Which of the cities that have top 2 IIMs has the highest PM2.5?,Ahmedabad,,,,,,,,,,,
|
51 |
+
How many stations noted PM2.5 levels exceeding the WHO guideline for all the 365 days in year 2023?,10,,,,,,,,,,,
|
52 |
+
Which of the 7 north-eastern states experiences lowest pollution?,Mizoram,,,,,,,,,,,
|
53 |
+
Which city showed the maximum drop between the pollution level of the month before lockdown start and the month after lockdown end in 2020,Muzaffarpur,,,,,,,,,,,
|
54 |
+
"Which season of Summer, Winter, Spring, Autumn experiences lowest pollution levels?",Summer,,,,,,,,,,,
|
55 |
+
What is the average PM2.5 of Ahmedabad on weekends in 2022?,61.44,,,,,,,,,,,
|
56 |
+
What is the average PM2.5 of Indore on weekdays in 2021?,49.26,,,,,,,,,,,
|
57 |
+
How many stations noted PM2.5 levels exceeding the Indian guideline for all the 365 days in year 2023?,0,,,,,,,,,,,
|
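The `*_score_*` columns in `prompts.csv` above hold TRUE/FALSE grades per model, with blanks for prompts not yet evaluated. A per-model accuracy summary can be computed directly from them; a sketch (blank cells are treated as ungraded):

```python
# Sketch: per-model accuracy over the graded prompts in prompts.csv.
import pandas as pd

df = pd.read_csv("prompts.csv")
for col in [c for c in df.columns if "_score_" in c]:
    scores = df[col].astype(str).str.upper()
    graded = scores.isin(["TRUE", "FALSE"])
    accuracy = (scores == "TRUE")[graded].mean()
    print(f"{col.split('_score_')[0]}: {accuracy:.1%} over {graded.sum()} graded prompts")
```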
sandbox.ipynb
ADDED
The diff for this file is too large to render.
See raw diff