{ "cells": [ { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "import streamlit as st\n", "import os\n", "import pandas as pd\n", "import random\n", "from os.path import join\n", "from datetime import datetime\n", "# from src import decorate_with_code, show_response, get_from_user\n", "from dotenv import load_dotenv\n", "from langchain_groq.chat_models import ChatGroq\n", "from langchain_mistralai import ChatMistralAI\n", "from langchain_google_genai import GoogleGenerativeAI,GoogleGenerativeAIEmbeddings\n", "from huggingface_hub import HfApi\n", "load_dotenv()\n", "\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
promptDesired AnswerCategoryllama3_answer_1721726247llama3_score_1721726247mixtral_answer_1721726407mixtral_score_1721726407gemma_answer_1721726499gemma_score_1721726499Codestral Mamba_answer_1721759526Codestral Mamba_score_1721759526Codestral_answer_1721759762Codestral_score_1721759762
0Which month has the highest average PM2.5 in 2...JanuaryNaNThe month with the highest average PM2.5 in 20...TrueThe month with the highest average PM2.5 in 20...TrueThe highest average PM2.5 in 2023 for Mumbai w...Truecontent='To find the month with the highest av...TrueThe month with the highest average PM2.5 in 20...True
1Which month generally has the highest pollution?NovemberNaNThe month with the highest pollution is 11 wit...TrueThe month with the highest pollution (on avera...TrueThe month with the highest average PM2.5 is No...Truecontent='To find out which month generally has...FalseThe month with the highest pollution is Novemb...True
\n", "
" ], "text/plain": [ " prompt Desired Answer \\\n", "0 Which month has the highest average PM2.5 in 2... January \n", "1 Which month generally has the highest pollution? November \n", "\n", " Category llama3_answer_1721726247 \\\n", "0 NaN The month with the highest average PM2.5 in 20... \n", "1 NaN The month with the highest pollution is 11 wit... \n", "\n", " llama3_score_1721726247 mixtral_answer_1721726407 \\\n", "0 True The month with the highest average PM2.5 in 20... \n", "1 True The month with the highest pollution (on avera... \n", "\n", " mixtral_score_1721726407 gemma_answer_1721726499 \\\n", "0 True The highest average PM2.5 in 2023 for Mumbai w... \n", "1 True The month with the highest average PM2.5 is No... \n", "\n", " gemma_score_1721726499 Codestral Mamba_answer_1721759526 \\\n", "0 True content='To find the month with the highest av... \n", "1 True content='To find out which month generally has... \n", "\n", " Codestral Mamba_score_1721759526 \\\n", "0 True \n", "1 False \n", "\n", " Codestral_answer_1721759762 \\\n", "0 The month with the highest average PM2.5 in 20... \n", "1 The month with the highest pollution is Novemb... \n", "\n", " Codestral_score_1721759762 \n", "0 True \n", "1 True " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "prompts = pd.read_csv(\"prompts.csv\")\n", "prompts.head(2)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Date datetime64[ns]\n", "City object\n", "AQI float64\n", "Pollutant object\n", "Air Quality object\n", "Based on number of monitoring stations float64\n", "State object\n", "Date_City object\n", "dtype: object\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DateCityAQIPollutantAir QualityBased on number of monitoring stationsStateDate_City
02016-01-01AgartalaNaNNoneNoneNaNNoneNone
12016-01-01Agra417.0PM\\n2.5Severe1.0Uttar Pradesh2016-01-01_Agra
\n", "
" ], "text/plain": [ " Date City AQI Pollutant Air Quality \\\n", "0 2016-01-01 Agartala NaN None None \n", "1 2016-01-01 Agra 417.0 PM\\n2.5 Severe \n", "\n", " Based on number of monitoring stations State Date_City \n", "0 NaN None None \n", "1 1.0 Uttar Pradesh 2016-01-01_Agra " ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.read_parquet(\"AQI_data.parquet\")\n", "print(data.dtypes)\n", "data.head(2)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "def get_full_prompt(question):\n", " return f\"\"\"You are a data scientist who is good with tools such as pandas, numpy and matplotlib. You are also an expert in air quality. You\n", "can access daily AQI `data` and you have to complete a code provided by me based on my question. `data` is a pandas DataFrame and has the following columns and data types:\n", "\n", "Date: date, Date of the `AQI` data\n", "City: string, Name of the city where the `AQI` was recorded\n", "State: string, Name of the state where `City` is located\n", "AQI: float, AQI value\n", "Air Quality: string, Air quality category from [\"Satisfactory\", \"Moderate\", \"Good\", \"Poor\", \"Very Poor\", \"Severe\"] based on `AQI` value\n", "\n", "Now, my question is: \"{question}\"\n", "\n", "Complete the code below to answer my question:\n", "\n", "```python\n", "import pandas as pd\n", "import numpy as np\n", "\n", "data = pd.read_parquet(\"AQI_data.parquet\")\"\"\"" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "You are a data scientist who is good with tools such as pandas, numpy and matplotlib. You are also an expert in air quality. You\n", "can access daily AQI `data` and you have to complete a code provided by me based on my question. `data` is a pandas DataFrame and has the following columns and data types:\n", "\n", "Date: date, Date of the `AQI` data\n", "City: string, Name of the city where the `AQI` was recorded\n", "State: string, Name of the state where `City` is located\n", "AQI: float, AQI value\n", "Air Quality: string, Air quality category from [\"Satisfactory\", \"Moderate\", \"Good\", \"Poor\", \"Very Poor\", \"Severe\"] based on `AQI` value\n", "\n", "Now, my question is: \"Which month has the highest average PM2.5 in 2023 for Mumbai?\"\n", "\n", "Complete the code below to answer my question:\n", "\n", "```python\n", "import pandas as pd\n", "import numpy as np\n", "\n", "data = pd.read_parquet(\"AQI_data.parquet\")\n" ] } ], "source": [ "print(get_full_prompt(prompts.prompt[0]))" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [], "source": [ "def get_gemini_response(prompt, model):\n", " return GoogleGenerativeAI(model=model, google_api_key=os.environ.get(\"GOOGLE_API_KEY\"), temperature=0).invoke(prompt)\n", "\n", "def get_groq_response(prompt, model):\n", " return ChatGroq(model=model, api_key=os.environ.get(\"GROQ_API_KEY\"), temperature=0).invoke(prompt).content\n", "\n", "llms = {\"gemini-pro\": lambda prompt: get_gemini_response(prompt, \"gemini-pro\"), \"groq_gemma-7b-it\": lambda prompt: get_groq_response(prompt, \"gemma-7b-it\"), \"groq_llama-3.2-90b-text-preview\": lambda prompt: get_groq_response(prompt, \"llama-3.2-90b-text-preview\")}" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('Which month generally has the highest pollution?', 'November')" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "i = 1\n", "prompts.prompt[i], prompts['Desired Answer'][i]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "llm = \"groq_llama-3.2-90b-text-preview\"\n", "full_prompt = get_full_prompt(\"Which city has the highest AQI value consistently over the years?\")\n", "answer = llms[llm](full_prompt)" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "You are a data scientist who is good with tools such as pandas, numpy and matplotlib. You are also an expert in air quality. You\n", "can access daily AQI `data` and you have to complete a code provided by me based on my question. `data` is a pandas DataFrame and has the following columns and data types:\n", "\n", "Date: date, Date of the `AQI` data\n", "City: string, Name of the city where the `AQI` was recorded\n", "State: string, Name of the state where `City` is located\n", "AQI: float, AQI value\n", "Air Quality: string, Air quality category from [\"Satisfactory\", \"Moderate\", \"Good\", \"Poor\", \"Very Poor\", \"Severe\"] based on `AQI` value\n", "\n", "Now, my question is: \"Which city has the highest AQI value consistently over the years?\"\n", "\n", "Complete the code below to answer my question:\n", "\n", "```python\n", "import pandas as pd\n", "import numpy as np\n", "\n", "data = pd.read_parquet(\"AQI_data.parquet\")\n", "####################################################################################################\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "# Load the data\n", "data = pd.read_parquet(\"AQI_data.parquet\")\n", "\n", "# Convert 'Date' column to datetime and extract the year\n", "data['Date'] = pd.to_datetime(data['Date'])\n", "data['Year'] = data['Date'].dt.year\n", "\n", "# Group by 'City' and 'Year', and calculate the average AQI value\n", "avg_aqi = data.groupby(['City', 'Year'])['AQI'].mean().reset_index()\n", "\n", "# Group by 'City' and calculate the average AQI value over the years\n", "avg_aqi_over_years = avg_aqi.groupby('City')['AQI'].mean().reset_index()\n", "\n", "# Find the city with the highest average AQI value\n", "city_with_highest_aqi = avg_aqi_over_years.loc[avg_aqi_over_years['AQI'].idxmax()]\n", "\n", "print(f\"The city with the highest AQI value consistently over the years is {city_with_highest_aqi['City']} with an average AQI value of {city_with_highest_aqi['AQI']:.2f}\")\n", "\n", "# Plot the top 10 cities with the highest average AQI values\n", "top_10_cities = avg_aqi_over_years.nlargest(10, 'AQI')\n", "plt.figure(figsize=(10, 6))\n", "plt.bar(top_10_cities['City'], top_10_cities['AQI'])\n", "plt.xlabel('City')\n", "plt.ylabel('Average AQI Value')\n", "plt.title('Top 10 Cities with the Highest Average AQI Values')\n", "plt.xticks(rotation=90)\n", "plt.tight_layout()\n", "plt.show()\n", "\n", "####################################################################################################\n", "The city with the highest AQI value consistently over the years is Jharsuguda with an average AQI value of 282.00\n" ] } ], "source": [ "import re\n", "code = re.search(r\"```python\\n(.*)```\", answer, re.DOTALL).group(1)\n", "print(full_prompt)\n", "print(\"#\" * 100)\n", "print(code)\n", "print(\"#\" * 100)\n", "exec(code)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# import streamlit as st\n", "# import os\n", "# import pandas as pd\n", "# import random\n", "# from os.path import join\n", "# from datetime import datetime\n", "# from src import decorate_with_code, show_response, get_from_user\n", "# from dotenv import load_dotenv\n", "# from langchain_groq.chat_models import ChatGroq\n", "# from langchain_mistralai import ChatMistralAI\n", "# from huggingface_hub import HfApi\n", "# st.set_page_config(layout=\"wide\")\n", "\n", "# ### Extract data.zip\n", "# if not os.path.exists(\"data/1\"):\n", "# os.system(\"unzip data.zip\")\n", "\n", "# # Load environment variables : Groq and Hugging Face API keys\n", "# load_dotenv()\n", "# Groq_Token = os.environ[\"GROQ_API_KEY\"]\n", "# CODESTRAL_API_KEY = os.environ[\"CODESTRAL_API_KEY\"]\n", "# hf_token = os.environ[\"HF_TOKEN\"]\n", "# models = {\"llama3\":\"llama3-70b-8192\",\"mixtral\": \"mixtral-8x7b-32768\", \"llama2\": \"llama2-70b-4096\", \"gemma\": \"gemma-7b-it\"}\n", "# groq_models = {\"llama3-70b\": \"llama3-70b-8192\", \"mixtral\": \"mixtral-8x7b-32768\", \"gemma-7b\": \"gemma-7b-it\",\"llama3.1-70b\":\"llama-3.1-70b-versatile\",\"llama3-8b\":\"llama3-8b-8192\",\"llama3.1-8b\":\"llama-3.1-8b-instant\",\"gemma-9b\":\"gemma2-9b-it\"}\n", "# mistral_models = {\"Codestral Mamba\" : \"open-codestral-mamba\", \"Codestral\" : \"codestral-latest\",\"Mistral 7B\":\"open-mistral-7b\"}\n", "# groq_model_list = list(groq_models.keys())\n", "# mistral_model_list = list(mistral_models.keys())\n", "\n", "# self_path = os.path.dirname(os.path.abspath(__file__))\n", "\n", "\n", "# def generate_template(prompt):\n", "# df_check = pd.read_csv(\"Data.csv\")\n", "# df_check[\"Timestamp\"] = pd.to_datetime(df_check[\"Timestamp\"])\n", "# df_check = df_check.head(5)\n", "\n", "# new_line = \"\\n\"\n", "\n", "# template = f\"\"\"```python\n", "# import pandas as pd\n", "# import matplotlib.pyplot as plt\n", "\n", "# df = pd.read_csv(\"Data.csv\")\n", "# df[\"Timestamp\"] = pd.to_datetime(df[\"Timestamp\"])\n", "\n", "# # df.dtypes\n", "# {new_line.join(map(lambda x: '# '+x, str(df_check.dtypes).split(new_line)))}\n", "\n", "# # {prompt.strip()}\n", "# # \n", "\n", "# #answer = \n", "# ```\n", "# \"\"\"\n", "# return template\n", "\n", "\n", "# def generate_query(template):\n", " \n", "# query = f\"\"\"I have a pandas dataframe data of PM2.5.\n", "# * The columns are 'Timestamp', 'station', 'PM2.5', 'address', 'city', 'latitude', 'longitude', and 'state'.\n", "# * Frequency of Data is Daily.\n", "# * `Pollution` generally means `PM2.5`.\n", "# * PM2.5 guidelines: India: 60, WHO: 15.\n", "# * Store the final answer in a global variable `answer`.\n", "# * Always report the unit of the data. Example: `The average PM2.5 is 45.67 µg/m³`\n", "\n", "# Complete the following code.\n", "\n", "# {template}\n", "# \"\"\"\n", "# return query\n", "\n", "\n", "# def process_query(query, llm):\n", "# global answer\n", "# template = generate_template(query)\n", "# query = generate_query(template)\n", "# global code\n", "# global error\n", "# try:\n", "# answer = llm.invoke(query)\n", "# error = ''\n", "# code = f\"\"\"\n", "# {template.split(\"```python\")[1].split(\"```\")[0]}\n", "# {answer.content.split(\"```python\")[1].split(\"```\")[0]}\n", "# \"\"\"\n", "# # update variable `answer` when code is executed\n", "# exec(code,globals())\n", "# except Exception as e:\n", "# error = e\n", "# code = ''\n", "# answer = f\"Error: {e}\"\n", "# print(answer)\n", "\n", "\n", "# # Using HTML and CSS to center the title\n", "# st.write(\n", "# \"\"\"\n", "# \n", "# \"\"\",\n", "# unsafe_allow_html=True,\n", "# )\n", "\n", "\n", "\n", "\n", "# # Display images and text in three columns with specified ratios\n", "# col1, col2, col3 = st.sidebar.columns((1.0, 2, 1.0)) \n", "# with col2:\n", "# st.markdown(\"

Airchat

\", unsafe_allow_html=True)\n", " \n", " \n", "# model_name = st.sidebar.selectbox(\"Select LLM:\", groq_model_list + mistral_model_list)\n", "\n", "# questions = ['Custom Prompt']\n", "# with open(join(self_path, \"questions.txt\")) as f:\n", "# questions += f.read().split(\"\\n\")\n", "\n", "# waiting_lines = (\"Thinking...\", \"Just a moment...\", \"Let me think...\", \"Working on it...\", \"Processing...\", \"Hold on...\", \"One moment...\", \"On it...\")\n", "\n", "\n", "\n", "# # Initialize chat history\n", "# if \"responses\" not in st.session_state:\n", "# st.session_state.responses = []\n", " \n", "\n", "# # Display chat responses from history on app rerun\n", "# print(\"#\"*10)\n", "# for response_id, response in enumerate(st.session_state.responses):\n", "# status = show_response(st, response)\n", "# if response[\"role\"] == \"assistant\":\n", "# # feedback_key = f\"feedback_{int(response_id/2)}\"\n", "# print(\"response_id\", response_id)\n", " \n", "# error = response[\"error\"]\n", "# output = response[\"content\"]\n", "# last_prompt = response[\"last_prompt\"]\n", "# code = response[\"gen_code\"]\n", "# evaluation = response[\"evaluation\"]\n", " \n", " \n", " \n", "# print(\"#\"*10)\n", "\n", "# show = True\n", "# prompt = st.sidebar.selectbox(\"Select a Prompt:\", questions, key=\"prompt_key\")\n", "# if prompt == 'Custom Prompt':\n", "# show = False\n", "# # React to user input\n", "# prompt = st.chat_input(\"Ask me anything about air quality!\", key=1000)\n", "# if prompt :\n", "# show = True\n", "# else:\n", "# # placeholder for chat input\n", "# st.chat_input(\"Select 'Select a Prompt' -> 'Custom Prompt' in the sidebar to ask your own questions.\", key=1000, disabled=True)\n", "\n", "# if \"last_prompt\" in st.session_state:\n", "# last_prompt = st.session_state[\"last_prompt\"]\n", "# last_model_name = st.session_state[\"last_model_name\"]\n", "# if (prompt == last_prompt) and (model_name == last_model_name):\n", "# show = False\n", "\n", "# if prompt:\n", "# st.sidebar.info(\"Select 'Custom Prompt' to ask your own questions.\")\n", "\n", "# if show:\n", "# # Add user input to chat history\n", "# user_response = get_from_user(prompt)\n", "# st.session_state.responses.append(user_response)\n", "\n", "# # select random waiting line\n", "# with st.spinner(random.choice(waiting_lines)):\n", "# ran = False\n", "# for i in range(1):\n", "# print(f\"Attempt {i+1}\")\n", "# if model_name in groq_models:\n", "# model_folder = \"Groq_\" + groq_models[model_name]\n", "# llm = ChatGroq(model=groq_models[model_name], api_key=Groq_Token, temperature=0)\n", "# else:\n", "# model_folder = \"MistralAI_\" + mistral_models[model_name]\n", "# llm = ChatMistralAI(model=mistral_models[model_name], api_key=CODESTRAL_API_KEY, temperature=0)\n", "# print(llm)\n", "# # llm = ChatGroq(model=models[model_name], api_key=os.getenv(\"GROQ_API\"), temperature=0)\n", "\n", "# df_check = pd.read_csv(\"Data.csv\")\n", "# df_check[\"Timestamp\"] = pd.to_datetime(df_check[\"Timestamp\"])\n", "# df_check = df_check.head(5)\n", "\n", "# new_line = \"\\n\"\n", "\n", "# parameters = {\"font.size\": 12,\"figure.dpi\": 600}\n", "\n", "# process_query(prompt, llm)\n", " \n", " \n", "# # Read the questions from Questions.txt and find the index of the question if there is a match\n", "# with open(join(\"questions.txt\")) as f:\n", "# questions = f.read().split(\"\\n\")\n", "# try:\n", "# index = questions.index(prompt)\n", "# index = index + 1\n", "# except:\n", "# index = None \n", "# print(\"Index\",index)\n", "# if type(index) == int:\n", "# # Open folder data/index/llm_name and compare with evaluation.txt\n", "# with open(join(\"data\", str(index), model_folder, \"evaluation.txt\")) as f:\n", "# evaluation = f.read().strip()\n", "# with open(join(\"data\", str(index), \"ground_truth\", \"answer.txt\")) as f:\n", "# ground_truth = f.read().strip()\n", "# else:\n", "# evaluation = \"DK\"\n", "# ground_truth = None \n", "# response = {\"role\": \"assistant\", \"content\": answer, \"gen_code\": code, \"ex_code\": code, \"last_prompt\": prompt, \"error\": error,\"evaluation\": evaluation,\"ground_truth\": ground_truth}\n", "\n", "# if ran:\n", "# break\n", " \n", "# # Append agent response to chat history\n", "# st.session_state.responses.append(response)\n", " \n", "# st.session_state['last_prompt'] = prompt\n", "# st.session_state['last_model_name'] = model_name\n", "# st.rerun()\n", " \n", " \n", "\n", "# # Display contact details with message\n", "# st.sidebar.markdown(\"
\", unsafe_allow_html=True)" ] } ], "metadata": { "kernelspec": { "display_name": "zeel_py310", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.15" } }, "nbformat": 4, "nbformat_minor": 2 }