{
"cells": [
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"import streamlit as st\n",
"import os\n",
"import pandas as pd\n",
"import random\n",
"from os.path import join\n",
"from datetime import datetime\n",
"# from src import decorate_with_code, show_response, get_from_user\n",
"from dotenv import load_dotenv\n",
"from langchain_groq.chat_models import ChatGroq\n",
"from langchain_mistralai import ChatMistralAI\n",
"from langchain_google_genai import GoogleGenerativeAI,GoogleGenerativeAIEmbeddings\n",
"from huggingface_hub import HfApi\n",
"load_dotenv()\n",
"\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" prompt | \n",
" Desired Answer | \n",
" Category | \n",
" llama3_answer_1721726247 | \n",
" llama3_score_1721726247 | \n",
" mixtral_answer_1721726407 | \n",
" mixtral_score_1721726407 | \n",
" gemma_answer_1721726499 | \n",
" gemma_score_1721726499 | \n",
" Codestral Mamba_answer_1721759526 | \n",
" Codestral Mamba_score_1721759526 | \n",
" Codestral_answer_1721759762 | \n",
" Codestral_score_1721759762 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Which month has the highest average PM2.5 in 2... | \n",
" January | \n",
" NaN | \n",
" The month with the highest average PM2.5 in 20... | \n",
" True | \n",
" The month with the highest average PM2.5 in 20... | \n",
" True | \n",
" The highest average PM2.5 in 2023 for Mumbai w... | \n",
" True | \n",
" content='To find the month with the highest av... | \n",
" True | \n",
" The month with the highest average PM2.5 in 20... | \n",
" True | \n",
"
\n",
" \n",
" 1 | \n",
" Which month generally has the highest pollution? | \n",
" November | \n",
" NaN | \n",
" The month with the highest pollution is 11 wit... | \n",
" True | \n",
" The month with the highest pollution (on avera... | \n",
" True | \n",
" The month with the highest average PM2.5 is No... | \n",
" True | \n",
" content='To find out which month generally has... | \n",
" False | \n",
" The month with the highest pollution is Novemb... | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" prompt Desired Answer \\\n",
"0 Which month has the highest average PM2.5 in 2... January \n",
"1 Which month generally has the highest pollution? November \n",
"\n",
" Category llama3_answer_1721726247 \\\n",
"0 NaN The month with the highest average PM2.5 in 20... \n",
"1 NaN The month with the highest pollution is 11 wit... \n",
"\n",
" llama3_score_1721726247 mixtral_answer_1721726407 \\\n",
"0 True The month with the highest average PM2.5 in 20... \n",
"1 True The month with the highest pollution (on avera... \n",
"\n",
" mixtral_score_1721726407 gemma_answer_1721726499 \\\n",
"0 True The highest average PM2.5 in 2023 for Mumbai w... \n",
"1 True The month with the highest average PM2.5 is No... \n",
"\n",
" gemma_score_1721726499 Codestral Mamba_answer_1721759526 \\\n",
"0 True content='To find the month with the highest av... \n",
"1 True content='To find out which month generally has... \n",
"\n",
" Codestral Mamba_score_1721759526 \\\n",
"0 True \n",
"1 False \n",
"\n",
" Codestral_answer_1721759762 \\\n",
"0 The month with the highest average PM2.5 in 20... \n",
"1 The month with the highest pollution is Novemb... \n",
"\n",
" Codestral_score_1721759762 \n",
"0 True \n",
"1 True "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Benchmark table: one question per row with its desired answer and recorded model answers/scores.\n",
"prompts = pd.read_csv(filepath_or_buffer=\"prompts.csv\")\n",
"prompts.head(n=2)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Date datetime64[ns]\n",
"City object\n",
"AQI float64\n",
"Pollutant object\n",
"Air Quality object\n",
"Based on number of monitoring stations float64\n",
"State object\n",
"Date_City object\n",
"dtype: object\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Date | \n",
" City | \n",
" AQI | \n",
" Pollutant | \n",
" Air Quality | \n",
" Based on number of monitoring stations | \n",
" State | \n",
" Date_City | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2016-01-01 | \n",
" Agartala | \n",
" NaN | \n",
" None | \n",
" None | \n",
" NaN | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 1 | \n",
" 2016-01-01 | \n",
" Agra | \n",
" 417.0 | \n",
" PM\\n2.5 | \n",
" Severe | \n",
" 1.0 | \n",
" Uttar Pradesh | \n",
" 2016-01-01_Agra | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Date City AQI Pollutant Air Quality \\\n",
"0 2016-01-01 Agartala NaN None None \n",
"1 2016-01-01 Agra 417.0 PM\\n2.5 Severe \n",
"\n",
" Based on number of monitoring stations State Date_City \n",
"0 NaN None None \n",
"1 1.0 Uttar Pradesh 2016-01-01_Agra "
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the AQI table, list the column dtypes, then preview the first two rows.\n",
"data = pd.read_parquet(path=\"AQI_data.parquet\")\n",
"print(data.dtypes)\n",
"data.head(n=2)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"def get_full_prompt(question):\n",
"    \"\"\"Build the code-completion prompt for one air-quality question.\n",
"\n",
"    The prompt describes a subset of the columns of the `data` DataFrame,\n",
"    embeds `question` between double quotes, and finishes with a ```python\n",
"    fence that is left open right after the `pd.read_parquet` line, so the\n",
"    model is asked to continue that script.\n",
"\n",
"    Args:\n",
"        question: The question to ask; it is formatted into the prompt as-is.\n",
"\n",
"    Returns:\n",
"        The complete prompt as a single string.\n",
"    \"\"\"\n",
"    # Plain (non-f) template: `{question}` is its only replacement field.\n",
"    template = \"\"\"You are a data scientist who is good with tools such as pandas, numpy and matplotlib. You are also an expert in air quality. You\n",
"can access daily AQI `data` and you have to complete a code provided by me based on my question. `data` is a pandas DataFrame and has the following columns and data types:\n",
"\n",
"Date: date, Date of the `AQI` data\n",
"City: string, Name of the city where the `AQI` was recorded\n",
"State: string, Name of the state where `City` is located\n",
"AQI: float, AQI value\n",
"Air Quality: string, Air quality category from [\"Satisfactory\", \"Moderate\", \"Good\", \"Poor\", \"Very Poor\", \"Severe\"] based on `AQI` value\n",
"\n",
"Now, my question is: \"{question}\"\n",
"\n",
"Complete the code below to answer my question:\n",
"\n",
"```python\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"data = pd.read_parquet(\"AQI_data.parquet\")\"\"\"\n",
"    return template.format(question=question)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"You are a data scientist who is good with tools such as pandas, numpy and matplotlib. You are also an expert in air quality. You\n",
"can access daily AQI `data` and you have to complete a code provided by me based on my question. `data` is a pandas DataFrame and has the following columns and data types:\n",
"\n",
"Date: date, Date of the `AQI` data\n",
"City: string, Name of the city where the `AQI` was recorded\n",
"State: string, Name of the state where `City` is located\n",
"AQI: float, AQI value\n",
"Air Quality: string, Air quality category from [\"Satisfactory\", \"Moderate\", \"Good\", \"Poor\", \"Very Poor\", \"Severe\"] based on `AQI` value\n",
"\n",
"Now, my question is: \"Which month has the highest average PM2.5 in 2023 for Mumbai?\"\n",
"\n",
"Complete the code below to answer my question:\n",
"\n",
"```python\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"data = pd.read_parquet(\"AQI_data.parquet\")\n"
]
}
],
"source": [
"# Sanity check: render the complete prompt for the first benchmark question.\n",
"print(get_full_prompt(question=prompts[\"prompt\"][0]))"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"def get_gemini_response(prompt, model):\n",
"    \"\"\"Send `prompt` to a Google Generative AI model and return what `invoke` returns.\n",
"\n",
"    A fresh client is built on every call with temperature 0; the API key is\n",
"    looked up in the GOOGLE_API_KEY environment variable (None when unset).\n",
"    \"\"\"\n",
"    client = GoogleGenerativeAI(\n",
"        model=model,\n",
"        google_api_key=os.environ.get(\"GOOGLE_API_KEY\"),\n",
"        temperature=0,\n",
"    )\n",
"    return client.invoke(prompt)\n",
"\n",
"\n",
"def get_groq_response(prompt, model):\n",
"    \"\"\"Send `prompt` to a Groq-hosted chat model and return the reply's `.content`.\n",
"\n",
"    A fresh client is built on every call with temperature 0; the API key is\n",
"    looked up in the GROQ_API_KEY environment variable (None when unset).\n",
"    \"\"\"\n",
"    client = ChatGroq(\n",
"        model=model,\n",
"        api_key=os.environ.get(\"GROQ_API_KEY\"),\n",
"        temperature=0,\n",
"    )\n",
"    reply = client.invoke(prompt)\n",
"    return reply.content\n",
"\n",
"\n",
"# Registry of available backends: display name -> callable taking only the prompt.\n",
"llms = {\n",
"    \"gemini-pro\": lambda prompt: get_gemini_response(prompt, \"gemini-pro\"),\n",
"    \"groq_gemma-7b-it\": lambda prompt: get_groq_response(prompt, \"gemma-7b-it\"),\n",
"    \"groq_llama-3.2-90b-text-preview\": lambda prompt: get_groq_response(prompt, \"llama-3.2-90b-text-preview\"),\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('Which month generally has the highest pollution?', 'November')"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Row to inspect: show its question next to the desired answer.\n",
"i = 1\n",
"prompts[\"prompt\"][i], prompts[\"Desired Answer\"][i]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Choose a backend by its key in `llms`, build the prompt, and query the model.\n",
"# `full_prompt` and `answer` are read again by the extraction/exec cell below.\n",
"llm = \"groq_llama-3.2-90b-text-preview\"\n",
"full_prompt = get_full_prompt(question=\"Which city has the highest AQI value consistently over the years?\")\n",
"answer = llms[llm](prompt=full_prompt)"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"You are a data scientist who is good with tools such as pandas, numpy and matplotlib. You are also an expert in air quality. You\n",
"can access daily AQI `data` and you have to complete a code provided by me based on my question. `data` is a pandas DataFrame and has the following columns and data types:\n",
"\n",
"Date: date, Date of the `AQI` data\n",
"City: string, Name of the city where the `AQI` was recorded\n",
"State: string, Name of the state where `City` is located\n",
"AQI: float, AQI value\n",
"Air Quality: string, Air quality category from [\"Satisfactory\", \"Moderate\", \"Good\", \"Poor\", \"Very Poor\", \"Severe\"] based on `AQI` value\n",
"\n",
"Now, my question is: \"Which city has the highest AQI value consistently over the years?\"\n",
"\n",
"Complete the code below to answer my question:\n",
"\n",
"```python\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"data = pd.read_parquet(\"AQI_data.parquet\")\n",
"####################################################################################################\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Load the data\n",
"data = pd.read_parquet(\"AQI_data.parquet\")\n",
"\n",
"# Convert 'Date' column to datetime and extract the year\n",
"data['Date'] = pd.to_datetime(data['Date'])\n",
"data['Year'] = data['Date'].dt.year\n",
"\n",
"# Group by 'City' and 'Year', and calculate the average AQI value\n",
"avg_aqi = data.groupby(['City', 'Year'])['AQI'].mean().reset_index()\n",
"\n",
"# Group by 'City' and calculate the average AQI value over the years\n",
"avg_aqi_over_years = avg_aqi.groupby('City')['AQI'].mean().reset_index()\n",
"\n",
"# Find the city with the highest average AQI value\n",
"city_with_highest_aqi = avg_aqi_over_years.loc[avg_aqi_over_years['AQI'].idxmax()]\n",
"\n",
"print(f\"The city with the highest AQI value consistently over the years is {city_with_highest_aqi['City']} with an average AQI value of {city_with_highest_aqi['AQI']:.2f}\")\n",
"\n",
"# Plot the top 10 cities with the highest average AQI values\n",
"top_10_cities = avg_aqi_over_years.nlargest(10, 'AQI')\n",
"plt.figure(figsize=(10, 6))\n",
"plt.bar(top_10_cities['City'], top_10_cities['AQI'])\n",
"plt.xlabel('City')\n",
"plt.ylabel('Average AQI Value')\n",
"plt.title('Top 10 Cities with the Highest Average AQI Values')\n",
"plt.xticks(rotation=90)\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"####################################################################################################\n",
"The city with the highest AQI value consistently over the years is Jharsuguda with an average AQI value of 282.00\n"
]
}
],
"source": [
"import re\n",
"\n",
"# Extract the generated script from the model's answer.\n",
"# The quantifier is non-greedy on purpose: with re.DOTALL a greedy `.*` would run\n",
"# from the first opening fence to the LAST closing fence, so an answer holding\n",
"# several fenced blocks would yield prose and fence markers that cannot be exec'd.\n",
"fence_match = re.search(r\"```python\\n(.*?)```\", answer, re.DOTALL)\n",
"if fence_match is None:\n",
"    # Fail with a clear message instead of AttributeError on None.group(...).\n",
"    raise ValueError(\"No ```python fenced code block found in the LLM answer\")\n",
"code = fence_match.group(1)\n",
"\n",
"# Show the prompt and the extracted code, separated by rulers, before running it.\n",
"print(full_prompt)\n",
"print(\"#\" * 100)\n",
"print(code)\n",
"print(\"#\" * 100)\n",
"# SECURITY: this executes model-generated code in the notebook's namespace.\n",
"# Review the printed code first; only run against trusted prompts/models.\n",
"exec(code)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# import streamlit as st\n",
"# import os\n",
"# import pandas as pd\n",
"# import random\n",
"# from os.path import join\n",
"# from datetime import datetime\n",
"# from src import decorate_with_code, show_response, get_from_user\n",
"# from dotenv import load_dotenv\n",
"# from langchain_groq.chat_models import ChatGroq\n",
"# from langchain_mistralai import ChatMistralAI\n",
"# from huggingface_hub import HfApi\n",
"# st.set_page_config(layout=\"wide\")\n",
"\n",
"# ### Extract data.zip\n",
"# if not os.path.exists(\"data/1\"):\n",
"# os.system(\"unzip data.zip\")\n",
"\n",
"# # Load environment variables : Groq and Hugging Face API keys\n",
"# load_dotenv()\n",
"# Groq_Token = os.environ[\"GROQ_API_KEY\"]\n",
"# CODESTRAL_API_KEY = os.environ[\"CODESTRAL_API_KEY\"]\n",
"# hf_token = os.environ[\"HF_TOKEN\"]\n",
"# models = {\"llama3\":\"llama3-70b-8192\",\"mixtral\": \"mixtral-8x7b-32768\", \"llama2\": \"llama2-70b-4096\", \"gemma\": \"gemma-7b-it\"}\n",
"# groq_models = {\"llama3-70b\": \"llama3-70b-8192\", \"mixtral\": \"mixtral-8x7b-32768\", \"gemma-7b\": \"gemma-7b-it\",\"llama3.1-70b\":\"llama-3.1-70b-versatile\",\"llama3-8b\":\"llama3-8b-8192\",\"llama3.1-8b\":\"llama-3.1-8b-instant\",\"gemma-9b\":\"gemma2-9b-it\"}\n",
"# mistral_models = {\"Codestral Mamba\" : \"open-codestral-mamba\", \"Codestral\" : \"codestral-latest\",\"Mistral 7B\":\"open-mistral-7b\"}\n",
"# groq_model_list = list(groq_models.keys())\n",
"# mistral_model_list = list(mistral_models.keys())\n",
"\n",
"# self_path = os.path.dirname(os.path.abspath(__file__))\n",
"\n",
"\n",
"# def generate_template(prompt):\n",
"# df_check = pd.read_csv(\"Data.csv\")\n",
"# df_check[\"Timestamp\"] = pd.to_datetime(df_check[\"Timestamp\"])\n",
"# df_check = df_check.head(5)\n",
"\n",
"# new_line = \"\\n\"\n",
"\n",
"# template = f\"\"\"```python\n",
"# import pandas as pd\n",
"# import matplotlib.pyplot as plt\n",
"\n",
"# df = pd.read_csv(\"Data.csv\")\n",
"# df[\"Timestamp\"] = pd.to_datetime(df[\"Timestamp\"])\n",
"\n",
"# # df.dtypes\n",
"# {new_line.join(map(lambda x: '# '+x, str(df_check.dtypes).split(new_line)))}\n",
"\n",
"# # {prompt.strip()}\n",
"# # \n",
"\n",
"# #answer = \n",
"# ```\n",
"# \"\"\"\n",
"# return template\n",
"\n",
"\n",
"# def generate_query(template):\n",
" \n",
"# query = f\"\"\"I have a pandas dataframe data of PM2.5.\n",
"# * The columns are 'Timestamp', 'station', 'PM2.5', 'address', 'city', 'latitude', 'longitude', and 'state'.\n",
"# * Frequency of Data is Daily.\n",
"# * `Pollution` generally means `PM2.5`.\n",
"# * PM2.5 guidelines: India: 60, WHO: 15.\n",
"# * Store the final answer in a global variable `answer`.\n",
"# * Always report the unit of the data. Example: `The average PM2.5 is 45.67 µg/m³`\n",
"\n",
"# Complete the following code.\n",
"\n",
"# {template}\n",
"# \"\"\"\n",
"# return query\n",
"\n",
"\n",
"# def process_query(query, llm):\n",
"# global answer\n",
"# template = generate_template(query)\n",
"# query = generate_query(template)\n",
"# global code\n",
"# global error\n",
"# try:\n",
"# answer = llm.invoke(query)\n",
"# error = ''\n",
"# code = f\"\"\"\n",
"# {template.split(\"```python\")[1].split(\"```\")[0]}\n",
"# {answer.content.split(\"```python\")[1].split(\"```\")[0]}\n",
"# \"\"\"\n",
"# # update variable `answer` when code is executed\n",
"# exec(code,globals())\n",
"# except Exception as e:\n",
"# error = e\n",
"# code = ''\n",
"# answer = f\"Error: {e}\"\n",
"# print(answer)\n",
"\n",
"\n",
"# # Using HTML and CSS to center the title\n",
"# st.write(\n",
"# \"\"\"\n",
"# \n",
"# \"\"\",\n",
"# unsafe_allow_html=True,\n",
"# )\n",
"\n",
"\n",
"\n",
"\n",
"# # Display images and text in three columns with specified ratios\n",
"# col1, col2, col3 = st.sidebar.columns((1.0, 2, 1.0)) \n",
"# with col2:\n",
"# st.markdown(\"Airchat
\", unsafe_allow_html=True)\n",
" \n",
" \n",
"# model_name = st.sidebar.selectbox(\"Select LLM:\", groq_model_list + mistral_model_list)\n",
"\n",
"# questions = ['Custom Prompt']\n",
"# with open(join(self_path, \"questions.txt\")) as f:\n",
"# questions += f.read().split(\"\\n\")\n",
"\n",
"# waiting_lines = (\"Thinking...\", \"Just a moment...\", \"Let me think...\", \"Working on it...\", \"Processing...\", \"Hold on...\", \"One moment...\", \"On it...\")\n",
"\n",
"\n",
"\n",
"# # Initialize chat history\n",
"# if \"responses\" not in st.session_state:\n",
"# st.session_state.responses = []\n",
" \n",
"\n",
"# # Display chat responses from history on app rerun\n",
"# print(\"#\"*10)\n",
"# for response_id, response in enumerate(st.session_state.responses):\n",
"# status = show_response(st, response)\n",
"# if response[\"role\"] == \"assistant\":\n",
"# # feedback_key = f\"feedback_{int(response_id/2)}\"\n",
"# print(\"response_id\", response_id)\n",
" \n",
"# error = response[\"error\"]\n",
"# output = response[\"content\"]\n",
"# last_prompt = response[\"last_prompt\"]\n",
"# code = response[\"gen_code\"]\n",
"# evaluation = response[\"evaluation\"]\n",
" \n",
" \n",
" \n",
"# print(\"#\"*10)\n",
"\n",
"# show = True\n",
"# prompt = st.sidebar.selectbox(\"Select a Prompt:\", questions, key=\"prompt_key\")\n",
"# if prompt == 'Custom Prompt':\n",
"# show = False\n",
"# # React to user input\n",
"# prompt = st.chat_input(\"Ask me anything about air quality!\", key=1000)\n",
"# if prompt :\n",
"# show = True\n",
"# else:\n",
"# # placeholder for chat input\n",
"# st.chat_input(\"Select 'Select a Prompt' -> 'Custom Prompt' in the sidebar to ask your own questions.\", key=1000, disabled=True)\n",
"\n",
"# if \"last_prompt\" in st.session_state:\n",
"# last_prompt = st.session_state[\"last_prompt\"]\n",
"# last_model_name = st.session_state[\"last_model_name\"]\n",
"# if (prompt == last_prompt) and (model_name == last_model_name):\n",
"# show = False\n",
"\n",
"# if prompt:\n",
"# st.sidebar.info(\"Select 'Custom Prompt' to ask your own questions.\")\n",
"\n",
"# if show:\n",
"# # Add user input to chat history\n",
"# user_response = get_from_user(prompt)\n",
"# st.session_state.responses.append(user_response)\n",
"\n",
"# # select random waiting line\n",
"# with st.spinner(random.choice(waiting_lines)):\n",
"# ran = False\n",
"# for i in range(1):\n",
"# print(f\"Attempt {i+1}\")\n",
"# if model_name in groq_models:\n",
"# model_folder = \"Groq_\" + groq_models[model_name]\n",
"# llm = ChatGroq(model=groq_models[model_name], api_key=Groq_Token, temperature=0)\n",
"# else:\n",
"# model_folder = \"MistralAI_\" + mistral_models[model_name]\n",
"# llm = ChatMistralAI(model=mistral_models[model_name], api_key=CODESTRAL_API_KEY, temperature=0)\n",
"# print(llm)\n",
"# # llm = ChatGroq(model=models[model_name], api_key=os.getenv(\"GROQ_API\"), temperature=0)\n",
"\n",
"# df_check = pd.read_csv(\"Data.csv\")\n",
"# df_check[\"Timestamp\"] = pd.to_datetime(df_check[\"Timestamp\"])\n",
"# df_check = df_check.head(5)\n",
"\n",
"# new_line = \"\\n\"\n",
"\n",
"# parameters = {\"font.size\": 12,\"figure.dpi\": 600}\n",
"\n",
"# process_query(prompt, llm)\n",
" \n",
" \n",
"# # Read the questions from Questions.txt and find the index of the question if there is a match\n",
"# with open(join(\"questions.txt\")) as f:\n",
"# questions = f.read().split(\"\\n\")\n",
"# try:\n",
"# index = questions.index(prompt)\n",
"# index = index + 1\n",
"# except:\n",
"# index = None \n",
"# print(\"Index\",index)\n",
"# if type(index) == int:\n",
"# # Open folder data/index/llm_name and compare with evaluation.txt\n",
"# with open(join(\"data\", str(index), model_folder, \"evaluation.txt\")) as f:\n",
"# evaluation = f.read().strip()\n",
"# with open(join(\"data\", str(index), \"ground_truth\", \"answer.txt\")) as f:\n",
"# ground_truth = f.read().strip()\n",
"# else:\n",
"# evaluation = \"DK\"\n",
"# ground_truth = None \n",
"# response = {\"role\": \"assistant\", \"content\": answer, \"gen_code\": code, \"ex_code\": code, \"last_prompt\": prompt, \"error\": error,\"evaluation\": evaluation,\"ground_truth\": ground_truth}\n",
"\n",
"# if ran:\n",
"# break\n",
" \n",
"# # Append agent response to chat history\n",
"# st.session_state.responses.append(response)\n",
" \n",
"# st.session_state['last_prompt'] = prompt\n",
"# st.session_state['last_model_name'] = model_name\n",
"# st.rerun()\n",
" \n",
" \n",
"\n",
"# # Display contact details with message\n",
"# st.sidebar.markdown(\"
\", unsafe_allow_html=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "zeel_py310",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}