{
"cells": [
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"import pandas as pd\n",
"import gradio as gr\n",
"from pydantic import BaseModel, Field\n",
"\n",
"import langchain\n",
"from langchain.output_parsers import PydanticOutputParser\n",
"from langchain.prompts import ChatPromptTemplate\n",
"from langchain.prompts import ChatPromptTemplate\n",
"from langchain.tools import PythonAstREPLTool\n",
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.schema.output_parser import StrOutputParser\n",
"from dotenv import load_dotenv\n",
"load_dotenv()"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"langchain.debug = False\n",
"pd.set_option('display.max_columns', 20)\n",
"pd.set_option('display.max_rows', 20)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"data_dir_path = os.path.join(os.getcwd(), 'data')\n",
"NUM_ROWS_TO_RETURN = 5\n",
"\n",
"table_1_df = pd.read_csv(os.path.join(data_dir_path, 'legal_entries_a.csv'))\n",
"table_2_df = pd.read_csv(os.path.join(data_dir_path, 'legal_entries_b.csv'))\n",
"template_df = pd.read_csv(os.path.join(data_dir_path, 'legal_template.csv'))"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"transform_model = ChatOpenAI(\n",
" model_name='gpt-4',\n",
" temperature=0,\n",
")\n",
"\n",
"natural_language_model = ChatOpenAI(\n",
" model_name='gpt-4',\n",
" temperature=0.1,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"# TODO: add validation to models, coupled with retry mechanism in chain\n",
"class TableMappingEntry(BaseModel):\n",
" '''A single row in a table mapping. Describes how a single column in a source table maps to a single column in a target table, including any necessary transformations, and their explanations.'''\n",
" source_column_name: str = Field(..., description=\"Name of the column in the source table.\")\n",
" target_column_name: str = Field(..., description=\"Name of the column in the target table, to which the source column maps.\")\n",
" value_transformations: str = Field(..., description=\"Transformations needed make the source values match the target values. If unncecessary, write 'NO_TRANSFORM'.\")\n",
" explanation: str = Field(..., description=\"One-sentence explanation of this row (source-target mapping/transformation). Include any information that might be relevant to a software engineer building an ETL pipeline with this document.\")\n",
"\n",
"class TableMapping(BaseModel):\n",
" '''A list of table mappings collectively describe how a source table should be transformed to match the schema of a target table.'''\n",
" table_mappings: list[TableMappingEntry] = Field(..., description=\"A list of table mappings.\")\n",
" \n",
"analyst_prompt_str = '''\n",
"You are a Data Scientist, who specializes in generating schema mappings for use by Software Engineers in ETL pipelines.\n",
"\n",
"Head of `source_csv`:\n",
"\n",
"{source_1_csv_str}\n",
"\n",
"Head of `target_csv`:\n",
"\n",
"{target_csv_str}\n",
"\n",
"Your job is to generate a thorough, precise summary of how `source_csv` should be transformed to adhere exactly to the `target_csv` schema.\n",
"\n",
"For each column in the `source_csv`, you must communicate which column in the `target_csv` it maps to, and how the values in the `source_csv` column should be transformed to match those in the `target_csv`.\n",
"You can assume the rows are aligned: that is, the first row in `source_csv` corresponds to the first row in `target_csv`, and so on.\n",
"\n",
"Remember:\n",
"1. Which column in `target_csv` it maps to. You should consider the semantic meaning of the columns, not just the character similarity. \n",
"\n",
"Example mappings:\n",
"- 'MunICipality' in `source_csv` should map to 'City' in `target_csv`.\n",
"- 'fullname' in `source_csv` should map to both 'FirstName' and 'LastName' in `target_csv`. You must explain this transformation, as well, including the target sequencing of first and last name.\n",
"\n",
"Example transformations:\n",
"- If date in `source_csv` is `2020-01-01` and date in `target_csv` is `01/01/2020`, explain exactly how this should be transformed and the reasoning behind it.\n",
"- If city in `source_csv` is `New York` and city in `target_csv` is `NEW YORK` or `NYC`, explain exactly how this should be transformed and the reasoning behind it.\n",
"\n",
"Lastly, point out any other oddities, such as duplicate columns, erroneous columns, etc.\n",
"\n",
"{format_instructions}\n",
"\n",
"Remember:\n",
"- Be concise: you are speaking to engineers, not customers.\n",
"- Be precise: all of these values are case sensitive. Consider casing for city names, exact prefixes for identifiers, ordering of people's names, etc.\n",
"- DO NOT include commas, quotes, or any other characters that might interfere with JSON serialization or CSV generation\n",
"\n",
"Your response:\n",
"'''\n",
"\n",
"def get_data_str_from_df_for_prompt(df, use_head=True, num_rows_to_return=NUM_ROWS_TO_RETURN):\n",
" data = df.head(num_rows_to_return) if use_head else df.tail(num_rows_to_return)\n",
" return f'