diff --git "a/src/notebooks/brainstorm3.ipynb" "b/src/notebooks/brainstorm3.ipynb" new file mode 100644--- /dev/null +++ "b/src/notebooks/brainstorm3.ipynb" @@ -0,0 +1,1932 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "import langchain\n", + "from langchain.agents import OpenAIFunctionsAgent, AgentExecutor\n", + "from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder\n", + "from langchain.tools import PythonAstREPLTool\n", + "from langchain.chat_models import ChatOpenAI\n", + "from pydantic import BaseModel, Field\n", + "from langchain.memory import ConversationBufferMemory\n", + "from langchain.schema.output_parser import StrOutputParser\n", + "import json\n", + "import gradio as gr" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "langchain.debug = True\n", + "os.environ[\"OPENAI_API_KEY\"] = \"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\"\n", + "data_dir_path = os.path.join(os.getcwd())\n", + "pd.set_option('display.max_rows', 20)\n", + "pd.set_option('display.max_columns', 20)\n", + "NUM_ROWS_TO_RETURN = 5" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [], + "source": [ + "def get_data_str_from_df_for_prompt(df, use_head=True, num_rows_to_return=NUM_ROWS_TO_RETURN):\n", + " data = df.head(num_rows_to_return) if use_head else df.tail(num_rows_to_return)\n", + " return f'\\n{data.to_markdown()}\\n'\n", + "\n", + "entries_a_df = pd.read_csv(os.path.join(data_dir_path, 'legal_entries_a.csv'))\n", + "entries_b_df = pd.read_csv(os.path.join(data_dir_path, 'legal_entries_b.csv'))\n", + "template_df = pd.read_csv(os.path.join(data_dir_path, 'legal_template.csv'))" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "analyst_prompt_str = '''\n", + " You are a Junior Data Analyst. You are working with Data Scientists and Software Engineers to transform data from one source format to a target format.\n", + " Here is the head of a CSV file, called source_1_csv:\n", + " \n", + " {source_1_csv_str}\n", + " \n", + " Here is the head of another, called target_csv:\n", + " \n", + " {target_csv_str}\n", + " \n", + " Your job is to generate a thorough, precise summary of exactly which columns in the source file should map to the target file, including a brief explanation for each.\n", + " In addition, if the cell values appear to be different, you should note that as well. For example, if the source date is formatted like this 2020-01-01 and the target date is formatted like this 01/01/2020, you should note that.\n", + " Your output should be natural language; a bulleted list with four values: 1. source column name, 2. target column name, 3. value transformations (if necessary), 4. explanation.\n", + " Be as precise as possible. All of these values are case sensitive. Do not do any transformation or imputation yourself: your job is to explain the data, not change it.\n", + " Do not include rows: focus on the column names and data types.\n", + " DO NOT INCLUDE COMMAS OR QUOTES ANYWHERE THIS WILL BE USED IN CSV GENERATION.\n", + " Instead of quotes, use backticks.\n", + " Your response:\n", + "'''\n", + "\n", + "scientist_prompt_str = '''\n", + " You are a Senior Data Scientist. You are generating a CSV that will be used by a Software Engineer to generate python code to transform one CSV file format to another.\n", + " \n", + " Here is the tail of a CSV file, called source_1_csv: \n", + " \n", + " {source_1_csv_str}\n", + " \n", + " Here are five from another, called target_csv: \n", + " \n", + " {target_csv_str}\n", + " \n", + " Your employee wrote a document with the mappings: \n", + " \n", + " {employee_script_str}\n", + " \n", + " Please review their work, think step by step about their proposal, and then revise it with improvements.\n", + " Note: your employee had access to different rows than you did. Neither one of you had access to the entire dataset.\n", + " Your final output should be a CSV with these columns: source_column_name, target_column_name, value_transformation, notes.\n", + " Be as precise as possible. All of these values are case sensitive. Do not do any transformation or imputation yourself: your job is to explain the data, not change it.\n", + " Do not include any commas or quotes in the output, except for the CSV delimiters.\n", + " Instead of quotes, use backticks.\n", + " Your response:\n", + "'''\n", + "\n", + "python_prompt_str = '''\n", + " You are a Python Engineer. Your job is to write code to convert a csv-like string into a Pandas dataframe:\n", + "\n", + " {csv_str}\n", + "\n", + " You might have to transform the data slightly to get it to work. But do not do any data type transformation. For example, the date strings are strings: do not conver them to data type.\n", + " The int are int, do not convert them to float, etc.\n", + "\n", + " Your output should be Pandas DataFrame.\n", + "'''\n", + "\n", + "csv_convert_prompt_str = '''\n", + " Convert the following csv-like string into valid json:\n", + " {csv_str}\n", + " Steps: 1. infer schema, 2. remove newlines or any other characters that are not json-serializable, 3. convert to valid json, 4. double check your work and make revisions as needed. \n", + " Your output should be 100% valid json. Do not transform the values, just the structure.\n", + " '''\n", + "\n", + "python_prompt_str = '''\n", + "You are a Python Engineer. Your job is to write code to convert JSON string into a Pandas dataframe:\n", + "\n", + "{csv_str}\n", + "\n", + "Do not transform the data at all, just convert it to a csv format and then generate a pandas dataframe.\n", + "\n", + "Return only python code in Markdown format, eg:\n", + "\n", + "```python\n", + "....\n", + "```\"\"\"\n", + "'''\n", + "from langchain.utilities import PythonREPL" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.agents.agent_toolkits import create_python_agent\n", + "from langchain.tools.python.tool import PythonREPLTool\n", + "from langchain.python import PythonREPL\n", + "from langchain.llms.openai import OpenAI\n", + "from langchain.agents.agent_types import AgentType\n", + "from langchain.chat_models import ChatOpenAI\n", + "import io\n", + "\n", + "agent_executor = create_python_agent(\n", + " llm=ChatOpenAI(temperature=0, model=\"gpt-4-0613\"),\n", + " tool=PythonREPLTool(),\n", + " verbose=True,\n", + " agent_type=AgentType.OPENAI_FUNCTIONS,\n", + " agent_executor_kwargs={\"handle_parsing_errors\": True},\n", + ")\n", + "\n", + "def _sanitize_output(text: str):\n", + " _, after = text.split(\"```python\")\n", + " return after.split(\"```\")[0]\n", + "\n", + "def run_code(thing):\n", + " model = ChatOpenAI(model_name='gpt-3.5-turbo')\n", + " analyst_prompt = ChatPromptTemplate.from_template(analyst_prompt_str)\n", + " scientist_prompt = ChatPromptTemplate.from_template(scientist_prompt_str)\n", + "\n", + " chain_1 = analyst_prompt | model | StrOutputParser()\n", + " employee_script_str = chain_1.invoke({\"source_1_csv_str\": get_data_str_from_df_for_prompt(entries_a_df), \"target_csv_str\": get_data_str_from_df_for_prompt(template_df)})\n", + "\n", + " chain_2 = scientist_prompt | model | StrOutputParser()\n", + " scientist_csv_str = chain_2.invoke({\"source_1_csv_str\": get_data_str_from_df_for_prompt(entries_a_df, False), \"target_csv_str\": get_data_str_from_df_for_prompt(template_df, False), \"employee_script_str\": employee_script_str})\n", + " csv_prompt = ChatPromptTemplate.from_template(csv_convert_prompt_str)\n", + " chain_3 = csv_prompt| model | StrOutputParser()\n", + " # python_prompt = ChatPromptTemplate.from_template(python_prompt_str)\n", + " # chain_4 = {\"csv_string\": chain_3} | python_prompt | model | StrOutputParser() | _sanitize_output | PythonREPL().run\n", + " # csv_bytes = io.BytesIO(scientist_csv_str.encode())\n", + " return pd.read_csv(json.loads(chain_3.invoke({\"csv_str\": scientist_csv_str})))\n", + " # return chain_3.invoke({\"csv_str\": scientist_csv_str})\n", + " # return agent_executor.run(f\"Please convert this csv-like string into a Pandas Dataframe: {scientist_csv_str}. Do not transform the data: it should have the same column names and types. Do not convert date strings to datetimes or ints to floats. Return one dataframe and nothing else.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running on local URL: http://127.0.0.1:7871\n", + "\n", + "To create a public link, set `share=True` in `launch()`.\n" + ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence] Entering Chain run with input:\n", + "\u001b[0m{\n", + " \"source_1_csv_str\": \"\\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\n\",\n", + " \"target_csv_str\": \"\\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n| 0 | 2023-05-12 | Miguel Kim | Civil | CASE-6761 | 100 | Boston |\\n| 1 | 2023-04-20 | John Lee | Criminl | CASE-6089 | 150 | Houston |\\n| 2 | 2023-02-10 | Dmitri Smith | Criminal | CASE-9565 | 200 | Chicago |\\n| 3 | 2023-03-16 | Dmitri Patel | Criminal | CASE-6222 | 100 | Boston |\\n| 4 | 2023-06-15 | Jane Ivanov | Family | CASE-2702 | 200 | Houston |\\n\"\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] Entering Prompt run with input:\n", + "\u001b[0m{\n", + " \"source_1_csv_str\": \"\\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\n\",\n", + " \"target_csv_str\": \"\\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n| 0 | 2023-05-12 | Miguel Kim | Civil | CASE-6761 | 100 | Boston |\\n| 1 | 2023-04-20 | John Lee | Criminl | CASE-6089 | 150 | Houston |\\n| 2 | 2023-02-10 | Dmitri Smith | Criminal | CASE-9565 | 200 | Chicago |\\n| 3 | 2023-03-16 | Dmitri Patel | Criminal | CASE-6222 | 100 | Boston |\\n| 4 | 2023-06-15 | Jane Ivanov | Family | CASE-2702 | 200 | Houston |\\n\"\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] [1ms] Exiting Prompt run with output:\n", + "\u001b[0m{\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"prompts\",\n", + " \"chat\",\n", + " \"ChatPromptValue\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"messages\": [\n", + " {\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"schema\",\n", + " \"messages\",\n", + " \"HumanMessage\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"content\": \"\\n You are a Junior Data Analyst. You are working with Data Scientists and Software Engineers to transform data from one source format to a target format.\\n Here is the head of a CSV file, called source_1_csv:\\n \\n \\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\n\\n \\n Here is the head of another, called target_csv:\\n \\n \\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n| 0 | 2023-05-12 | Miguel Kim | Civil | CASE-6761 | 100 | Boston |\\n| 1 | 2023-04-20 | John Lee | Criminl | CASE-6089 | 150 | Houston |\\n| 2 | 2023-02-10 | Dmitri Smith | Criminal | CASE-9565 | 200 | Chicago |\\n| 3 | 2023-03-16 | Dmitri Patel | Criminal | CASE-6222 | 100 | Boston |\\n| 4 | 2023-06-15 | Jane Ivanov | Family | CASE-2702 | 200 | Houston |\\n\\n \\n Your job is to generate a thorough, precise summary of exactly which columns in the source file should map to the target file, including a brief explanation for each.\\n In addition, if the cell values appear to be different, you should note that as well. For example, if the source date is formatted like this 2020-01-01 and the target date is formatted like this 01/01/2020, you should note that.\\n Your output should be natural language; a bulleted list with four values: 1. source column name, 2. target column name, 3. value transformations (if necessary), 4. explanation.\\n Be as precise as possible. All of these values are case sensitive. Do not do any transformation or imputation yourself: your job is to explain the data, not change it.\\n Do not include rows: focus on the column names and data types.\\n DO NOT INCLUDE COMMAS OR QUOTES ANYWHERE THIS WILL BE USED IN CSV GENERATION.\\n Instead of quotes, use backticks.\\n Your response:\\n\",\n", + " \"additional_kwargs\": {}\n", + " }\n", + " }\n", + " ]\n", + " }\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 3:llm:ChatOpenAI] Entering LLM run with input:\n", + "\u001b[0m{\n", + " \"prompts\": [\n", + " \"Human: \\n You are a Junior Data Analyst. You are working with Data Scientists and Software Engineers to transform data from one source format to a target format.\\n Here is the head of a CSV file, called source_1_csv:\\n \\n \\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\n\\n \\n Here is the head of another, called target_csv:\\n \\n \\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n| 0 | 2023-05-12 | Miguel Kim | Civil | CASE-6761 | 100 | Boston |\\n| 1 | 2023-04-20 | John Lee | Criminl | CASE-6089 | 150 | Houston |\\n| 2 | 2023-02-10 | Dmitri Smith | Criminal | CASE-9565 | 200 | Chicago |\\n| 3 | 2023-03-16 | Dmitri Patel | Criminal | CASE-6222 | 100 | Boston |\\n| 4 | 2023-06-15 | Jane Ivanov | Family | CASE-2702 | 200 | Houston |\\n\\n \\n Your job is to generate a thorough, precise summary of exactly which columns in the source file should map to the target file, including a brief explanation for each.\\n In addition, if the cell values appear to be different, you should note that as well. For example, if the source date is formatted like this 2020-01-01 and the target date is formatted like this 01/01/2020, you should note that.\\n Your output should be natural language; a bulleted list with four values: 1. source column name, 2. target column name, 3. value transformations (if necessary), 4. explanation.\\n Be as precise as possible. All of these values are case sensitive. Do not do any transformation or imputation yourself: your job is to explain the data, not change it.\\n Do not include rows: focus on the column names and data types.\\n DO NOT INCLUDE COMMAS OR QUOTES ANYWHERE THIS WILL BE USED IN CSV GENERATION.\\n Instead of quotes, use backticks.\\n Your response:\"\n", + " ]\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 3:llm:ChatOpenAI] [18.30s] Exiting LLM run with output:\n", + "\u001b[0m{\n", + " \"generations\": [\n", + " [\n", + " {\n", + " \"text\": \"- Source column name: `case_date`\\n Target column name: `CaseDate`\\n Value transformations: None\\n Explanation: The `case_date` column in the source file should be mapped to the `CaseDate` column in the target file as is, without any transformations. Both columns represent the date of the case.\\n\\n- Source column name: `lastname`\\n Target column name: `FullName`\\n Value transformations: None\\n Explanation: The `lastname` column in the source file should be mapped to the `FullName` column in the target file as is, without any transformations. Both columns represent the last name of the individual involved in the case.\\n\\n- Source column name: `firstname`\\n Target column name: None\\n Value transformations: Concatenate with `lastname` column in target file\\n Explanation: The `firstname` column in the source file should be concatenated with the `lastname` column in the target file to form the `FullName` column. Both columns represent the first name of the individual involved in the case.\\n\\n- Source column name: `case_type`\\n Target column name: `CaseType`\\n Value transformations: None\\n Explanation: The `case_type` column in the source file should be mapped to the `CaseType` column in the target file as is, without any transformations. Both columns represent the type of the case.\\n\\n- Source column name: `case_id`\\n Target column name: `CaseID`\\n Value transformations: Replace `CR-` with `CASE-` in target file\\n Explanation: The `case_id` column in the source file should be mapped to the `CaseID` column in the target file. However, in the target file, the prefix `CR-` should be replaced with `CASE-` to match the desired format.\\n\\n- Source column name: `court_fee`\\n Target column name: `Fee`\\n Value transformations: None\\n Explanation: The `court_fee` column in the source file should be mapped to the `Fee` column in the target file as is, without any transformations. Both columns represent the court fee associated with the case.\\n\\n- Source column name: `jurisdiction`\\n Target column name: `Jurisdiction`\\n Value transformations: Capitalize first letter in target file\\n Explanation: The `jurisdiction` column in the source file should be mapped to the `Jurisdiction` column in the target file. However, in the target file, the first letter should be capitalized to match the desired format. Both columns represent the jurisdiction where the case is being handled.\",\n", + " \"generation_info\": {\n", + " \"finish_reason\": \"stop\"\n", + " },\n", + " \"message\": {\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"schema\",\n", + " \"messages\",\n", + " \"AIMessage\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"content\": \"- Source column name: `case_date`\\n Target column name: `CaseDate`\\n Value transformations: None\\n Explanation: The `case_date` column in the source file should be mapped to the `CaseDate` column in the target file as is, without any transformations. Both columns represent the date of the case.\\n\\n- Source column name: `lastname`\\n Target column name: `FullName`\\n Value transformations: None\\n Explanation: The `lastname` column in the source file should be mapped to the `FullName` column in the target file as is, without any transformations. Both columns represent the last name of the individual involved in the case.\\n\\n- Source column name: `firstname`\\n Target column name: None\\n Value transformations: Concatenate with `lastname` column in target file\\n Explanation: The `firstname` column in the source file should be concatenated with the `lastname` column in the target file to form the `FullName` column. Both columns represent the first name of the individual involved in the case.\\n\\n- Source column name: `case_type`\\n Target column name: `CaseType`\\n Value transformations: None\\n Explanation: The `case_type` column in the source file should be mapped to the `CaseType` column in the target file as is, without any transformations. Both columns represent the type of the case.\\n\\n- Source column name: `case_id`\\n Target column name: `CaseID`\\n Value transformations: Replace `CR-` with `CASE-` in target file\\n Explanation: The `case_id` column in the source file should be mapped to the `CaseID` column in the target file. However, in the target file, the prefix `CR-` should be replaced with `CASE-` to match the desired format.\\n\\n- Source column name: `court_fee`\\n Target column name: `Fee`\\n Value transformations: None\\n Explanation: The `court_fee` column in the source file should be mapped to the `Fee` column in the target file as is, without any transformations. Both columns represent the court fee associated with the case.\\n\\n- Source column name: `jurisdiction`\\n Target column name: `Jurisdiction`\\n Value transformations: Capitalize first letter in target file\\n Explanation: The `jurisdiction` column in the source file should be mapped to the `Jurisdiction` column in the target file. However, in the target file, the first letter should be capitalized to match the desired format. Both columns represent the jurisdiction where the case is being handled.\",\n", + " \"additional_kwargs\": {}\n", + " }\n", + " }\n", + " }\n", + " ]\n", + " ],\n", + " \"llm_output\": {\n", + " \"token_usage\": {\n", + " \"prompt_tokens\": 758,\n", + " \"completion_tokens\": 526,\n", + " \"total_tokens\": 1284\n", + " },\n", + " \"model_name\": \"gpt-3.5-turbo\"\n", + " },\n", + " \"run\": null\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 4:parser:StrOutputParser] Entering Parser run with input:\n", + "\u001b[0m[inputs]\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 4:parser:StrOutputParser] [0ms] Exiting Parser run with output:\n", + "\u001b[0m{\n", + " \"output\": \"- Source column name: `case_date`\\n Target column name: `CaseDate`\\n Value transformations: None\\n Explanation: The `case_date` column in the source file should be mapped to the `CaseDate` column in the target file as is, without any transformations. Both columns represent the date of the case.\\n\\n- Source column name: `lastname`\\n Target column name: `FullName`\\n Value transformations: None\\n Explanation: The `lastname` column in the source file should be mapped to the `FullName` column in the target file as is, without any transformations. Both columns represent the last name of the individual involved in the case.\\n\\n- Source column name: `firstname`\\n Target column name: None\\n Value transformations: Concatenate with `lastname` column in target file\\n Explanation: The `firstname` column in the source file should be concatenated with the `lastname` column in the target file to form the `FullName` column. Both columns represent the first name of the individual involved in the case.\\n\\n- Source column name: `case_type`\\n Target column name: `CaseType`\\n Value transformations: None\\n Explanation: The `case_type` column in the source file should be mapped to the `CaseType` column in the target file as is, without any transformations. Both columns represent the type of the case.\\n\\n- Source column name: `case_id`\\n Target column name: `CaseID`\\n Value transformations: Replace `CR-` with `CASE-` in target file\\n Explanation: The `case_id` column in the source file should be mapped to the `CaseID` column in the target file. However, in the target file, the prefix `CR-` should be replaced with `CASE-` to match the desired format.\\n\\n- Source column name: `court_fee`\\n Target column name: `Fee`\\n Value transformations: None\\n Explanation: The `court_fee` column in the source file should be mapped to the `Fee` column in the target file as is, without any transformations. Both columns represent the court fee associated with the case.\\n\\n- Source column name: `jurisdiction`\\n Target column name: `Jurisdiction`\\n Value transformations: Capitalize first letter in target file\\n Explanation: The `jurisdiction` column in the source file should be mapped to the `Jurisdiction` column in the target file. However, in the target file, the first letter should be capitalized to match the desired format. Both columns represent the jurisdiction where the case is being handled.\"\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence] [18.31s] Exiting Chain run with output:\n", + "\u001b[0m{\n", + " \"output\": \"- Source column name: `case_date`\\n Target column name: `CaseDate`\\n Value transformations: None\\n Explanation: The `case_date` column in the source file should be mapped to the `CaseDate` column in the target file as is, without any transformations. Both columns represent the date of the case.\\n\\n- Source column name: `lastname`\\n Target column name: `FullName`\\n Value transformations: None\\n Explanation: The `lastname` column in the source file should be mapped to the `FullName` column in the target file as is, without any transformations. Both columns represent the last name of the individual involved in the case.\\n\\n- Source column name: `firstname`\\n Target column name: None\\n Value transformations: Concatenate with `lastname` column in target file\\n Explanation: The `firstname` column in the source file should be concatenated with the `lastname` column in the target file to form the `FullName` column. Both columns represent the first name of the individual involved in the case.\\n\\n- Source column name: `case_type`\\n Target column name: `CaseType`\\n Value transformations: None\\n Explanation: The `case_type` column in the source file should be mapped to the `CaseType` column in the target file as is, without any transformations. Both columns represent the type of the case.\\n\\n- Source column name: `case_id`\\n Target column name: `CaseID`\\n Value transformations: Replace `CR-` with `CASE-` in target file\\n Explanation: The `case_id` column in the source file should be mapped to the `CaseID` column in the target file. However, in the target file, the prefix `CR-` should be replaced with `CASE-` to match the desired format.\\n\\n- Source column name: `court_fee`\\n Target column name: `Fee`\\n Value transformations: None\\n Explanation: The `court_fee` column in the source file should be mapped to the `Fee` column in the target file as is, without any transformations. Both columns represent the court fee associated with the case.\\n\\n- Source column name: `jurisdiction`\\n Target column name: `Jurisdiction`\\n Value transformations: Capitalize first letter in target file\\n Explanation: The `jurisdiction` column in the source file should be mapped to the `Jurisdiction` column in the target file. However, in the target file, the first letter should be capitalized to match the desired format. Both columns represent the jurisdiction where the case is being handled.\"\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence] Entering Chain run with input:\n", + "\u001b[0m{\n", + " \"source_1_csv_str\": \"\\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 95 | 2023-08-13 | Rodriguez | Jane | Familly | CR-8883 | 200 | chicago |\\n| 96 | 2023-03-27 | Patel | John | Familly | CR-2838 | 100 | BOSTON |\\n| 97 | 2023-07-27 | Okafor | Miguel | Criminal | CR-3885 | 250 | new York |\\n| 98 | 2023-01-14 | Ivanov | Alan | Family | CR-1066 | 250 | houston |\\n| 99 | 2023-03-20 | Okafor | Oluwaseun | Criminl | CR-9851 | 200 | chicago |\\n\",\n", + " \"target_csv_str\": \"\\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-----------------|:-----------|:----------|------:|:---------------|\\n| 95 | 2023-08-13 | Jane Rodriguez | Family | CASE-1703 | 200 | Chicago |\\n| 96 | 2023-03-27 | John Patel | Family | CASE-7934 | 100 | Boston |\\n| 97 | 2023-07-27 | Miguel Okafor | Criminal | CASE-3842 | 250 | New York |\\n| 98 | 2023-01-14 | Alan Ivanov | Family | CASE-1295 | 250 | Houston |\\n| 99 | 2023-03-20 | Oluwaseun Okafor | Criminl | CASE-9124 | 200 | Chicago |\\n\",\n", + " \"employee_script_str\": \"- Source column name: `case_date`\\n Target column name: `CaseDate`\\n Value transformations: None\\n Explanation: The `case_date` column in the source file should be mapped to the `CaseDate` column in the target file as is, without any transformations. Both columns represent the date of the case.\\n\\n- Source column name: `lastname`\\n Target column name: `FullName`\\n Value transformations: None\\n Explanation: The `lastname` column in the source file should be mapped to the `FullName` column in the target file as is, without any transformations. Both columns represent the last name of the individual involved in the case.\\n\\n- Source column name: `firstname`\\n Target column name: None\\n Value transformations: Concatenate with `lastname` column in target file\\n Explanation: The `firstname` column in the source file should be concatenated with the `lastname` column in the target file to form the `FullName` column. Both columns represent the first name of the individual involved in the case.\\n\\n- Source column name: `case_type`\\n Target column name: `CaseType`\\n Value transformations: None\\n Explanation: The `case_type` column in the source file should be mapped to the `CaseType` column in the target file as is, without any transformations. Both columns represent the type of the case.\\n\\n- Source column name: `case_id`\\n Target column name: `CaseID`\\n Value transformations: Replace `CR-` with `CASE-` in target file\\n Explanation: The `case_id` column in the source file should be mapped to the `CaseID` column in the target file. However, in the target file, the prefix `CR-` should be replaced with `CASE-` to match the desired format.\\n\\n- Source column name: `court_fee`\\n Target column name: `Fee`\\n Value transformations: None\\n Explanation: The `court_fee` column in the source file should be mapped to the `Fee` column in the target file as is, without any transformations. Both columns represent the court fee associated with the case.\\n\\n- Source column name: `jurisdiction`\\n Target column name: `Jurisdiction`\\n Value transformations: Capitalize first letter in target file\\n Explanation: The `jurisdiction` column in the source file should be mapped to the `Jurisdiction` column in the target file. However, in the target file, the first letter should be capitalized to match the desired format. Both columns represent the jurisdiction where the case is being handled.\"\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] Entering Prompt run with input:\n", + "\u001b[0m{\n", + " \"source_1_csv_str\": \"\\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 95 | 2023-08-13 | Rodriguez | Jane | Familly | CR-8883 | 200 | chicago |\\n| 96 | 2023-03-27 | Patel | John | Familly | CR-2838 | 100 | BOSTON |\\n| 97 | 2023-07-27 | Okafor | Miguel | Criminal | CR-3885 | 250 | new York |\\n| 98 | 2023-01-14 | Ivanov | Alan | Family | CR-1066 | 250 | houston |\\n| 99 | 2023-03-20 | Okafor | Oluwaseun | Criminl | CR-9851 | 200 | chicago |\\n\",\n", + " \"target_csv_str\": \"\\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-----------------|:-----------|:----------|------:|:---------------|\\n| 95 | 2023-08-13 | Jane Rodriguez | Family | CASE-1703 | 200 | Chicago |\\n| 96 | 2023-03-27 | John Patel | Family | CASE-7934 | 100 | Boston |\\n| 97 | 2023-07-27 | Miguel Okafor | Criminal | CASE-3842 | 250 | New York |\\n| 98 | 2023-01-14 | Alan Ivanov | Family | CASE-1295 | 250 | Houston |\\n| 99 | 2023-03-20 | Oluwaseun Okafor | Criminl | CASE-9124 | 200 | Chicago |\\n\",\n", + " \"employee_script_str\": \"- Source column name: `case_date`\\n Target column name: `CaseDate`\\n Value transformations: None\\n Explanation: The `case_date` column in the source file should be mapped to the `CaseDate` column in the target file as is, without any transformations. Both columns represent the date of the case.\\n\\n- Source column name: `lastname`\\n Target column name: `FullName`\\n Value transformations: None\\n Explanation: The `lastname` column in the source file should be mapped to the `FullName` column in the target file as is, without any transformations. Both columns represent the last name of the individual involved in the case.\\n\\n- Source column name: `firstname`\\n Target column name: None\\n Value transformations: Concatenate with `lastname` column in target file\\n Explanation: The `firstname` column in the source file should be concatenated with the `lastname` column in the target file to form the `FullName` column. Both columns represent the first name of the individual involved in the case.\\n\\n- Source column name: `case_type`\\n Target column name: `CaseType`\\n Value transformations: None\\n Explanation: The `case_type` column in the source file should be mapped to the `CaseType` column in the target file as is, without any transformations. Both columns represent the type of the case.\\n\\n- Source column name: `case_id`\\n Target column name: `CaseID`\\n Value transformations: Replace `CR-` with `CASE-` in target file\\n Explanation: The `case_id` column in the source file should be mapped to the `CaseID` column in the target file. However, in the target file, the prefix `CR-` should be replaced with `CASE-` to match the desired format.\\n\\n- Source column name: `court_fee`\\n Target column name: `Fee`\\n Value transformations: None\\n Explanation: The `court_fee` column in the source file should be mapped to the `Fee` column in the target file as is, without any transformations. Both columns represent the court fee associated with the case.\\n\\n- Source column name: `jurisdiction`\\n Target column name: `Jurisdiction`\\n Value transformations: Capitalize first letter in target file\\n Explanation: The `jurisdiction` column in the source file should be mapped to the `Jurisdiction` column in the target file. However, in the target file, the first letter should be capitalized to match the desired format. Both columns represent the jurisdiction where the case is being handled.\"\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] [0ms] Exiting Prompt run with output:\n", + "\u001b[0m{\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"prompts\",\n", + " \"chat\",\n", + " \"ChatPromptValue\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"messages\": [\n", + " {\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"schema\",\n", + " \"messages\",\n", + " \"HumanMessage\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"content\": \"\\n You are a Senior Data Scientist. You are generating a CSV that will be used by a Software Engineer to generate python code to transform one CSV file format to another.\\n \\n Here is the tail of a CSV file, called source_1_csv: \\n \\n \\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 95 | 2023-08-13 | Rodriguez | Jane | Familly | CR-8883 | 200 | chicago |\\n| 96 | 2023-03-27 | Patel | John | Familly | CR-2838 | 100 | BOSTON |\\n| 97 | 2023-07-27 | Okafor | Miguel | Criminal | CR-3885 | 250 | new York |\\n| 98 | 2023-01-14 | Ivanov | Alan | Family | CR-1066 | 250 | houston |\\n| 99 | 2023-03-20 | Okafor | Oluwaseun | Criminl | CR-9851 | 200 | chicago |\\n\\n \\n Here are five from another, called target_csv: \\n \\n \\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-----------------|:-----------|:----------|------:|:---------------|\\n| 95 | 2023-08-13 | Jane Rodriguez | Family | CASE-1703 | 200 | Chicago |\\n| 96 | 2023-03-27 | John Patel | Family | CASE-7934 | 100 | Boston |\\n| 97 | 2023-07-27 | Miguel Okafor | Criminal | CASE-3842 | 250 | New York |\\n| 98 | 2023-01-14 | Alan Ivanov | Family | CASE-1295 | 250 | Houston |\\n| 99 | 2023-03-20 | Oluwaseun Okafor | Criminl | CASE-9124 | 200 | Chicago |\\n\\n \\n Your employee wrote a document with the mappings: \\n \\n - Source column name: `case_date`\\n Target column name: `CaseDate`\\n Value transformations: None\\n Explanation: The `case_date` column in the source file should be mapped to the `CaseDate` column in the target file as is, without any transformations. Both columns represent the date of the case.\\n\\n- Source column name: `lastname`\\n Target column name: `FullName`\\n Value transformations: None\\n Explanation: The `lastname` column in the source file should be mapped to the `FullName` column in the target file as is, without any transformations. Both columns represent the last name of the individual involved in the case.\\n\\n- Source column name: `firstname`\\n Target column name: None\\n Value transformations: Concatenate with `lastname` column in target file\\n Explanation: The `firstname` column in the source file should be concatenated with the `lastname` column in the target file to form the `FullName` column. Both columns represent the first name of the individual involved in the case.\\n\\n- Source column name: `case_type`\\n Target column name: `CaseType`\\n Value transformations: None\\n Explanation: The `case_type` column in the source file should be mapped to the `CaseType` column in the target file as is, without any transformations. Both columns represent the type of the case.\\n\\n- Source column name: `case_id`\\n Target column name: `CaseID`\\n Value transformations: Replace `CR-` with `CASE-` in target file\\n Explanation: The `case_id` column in the source file should be mapped to the `CaseID` column in the target file. However, in the target file, the prefix `CR-` should be replaced with `CASE-` to match the desired format.\\n\\n- Source column name: `court_fee`\\n Target column name: `Fee`\\n Value transformations: None\\n Explanation: The `court_fee` column in the source file should be mapped to the `Fee` column in the target file as is, without any transformations. Both columns represent the court fee associated with the case.\\n\\n- Source column name: `jurisdiction`\\n Target column name: `Jurisdiction`\\n Value transformations: Capitalize first letter in target file\\n Explanation: The `jurisdiction` column in the source file should be mapped to the `Jurisdiction` column in the target file. However, in the target file, the first letter should be capitalized to match the desired format. Both columns represent the jurisdiction where the case is being handled.\\n \\n Please review their work, think step by step about their proposal, and then revise it with improvements.\\n Note: your employee had access to different rows than you did. Neither one of you had access to the entire dataset.\\n Your final output should be a CSV with these columns: source_column_name, target_column_name, value_transformation, notes.\\n Be as precise as possible. All of these values are case sensitive. Do not do any transformation or imputation yourself: your job is to explain the data, not change it.\\n Do not include any commas or quotes in the output, except for the CSV delimiters.\\n Instead of quotes, use backticks.\\n Your response:\\n\",\n", + " \"additional_kwargs\": {}\n", + " }\n", + " }\n", + " ]\n", + " }\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 3:llm:ChatOpenAI] Entering LLM run with input:\n", + "\u001b[0m{\n", + " \"prompts\": [\n", + " \"Human: \\n You are a Senior Data Scientist. You are generating a CSV that will be used by a Software Engineer to generate python code to transform one CSV file format to another.\\n \\n Here is the tail of a CSV file, called source_1_csv: \\n \\n \\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 95 | 2023-08-13 | Rodriguez | Jane | Familly | CR-8883 | 200 | chicago |\\n| 96 | 2023-03-27 | Patel | John | Familly | CR-2838 | 100 | BOSTON |\\n| 97 | 2023-07-27 | Okafor | Miguel | Criminal | CR-3885 | 250 | new York |\\n| 98 | 2023-01-14 | Ivanov | Alan | Family | CR-1066 | 250 | houston |\\n| 99 | 2023-03-20 | Okafor | Oluwaseun | Criminl | CR-9851 | 200 | chicago |\\n\\n \\n Here are five from another, called target_csv: \\n \\n \\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-----------------|:-----------|:----------|------:|:---------------|\\n| 95 | 2023-08-13 | Jane Rodriguez | Family | CASE-1703 | 200 | Chicago |\\n| 96 | 2023-03-27 | John Patel | Family | CASE-7934 | 100 | Boston |\\n| 97 | 2023-07-27 | Miguel Okafor | Criminal | CASE-3842 | 250 | New York |\\n| 98 | 2023-01-14 | Alan Ivanov | Family | CASE-1295 | 250 | Houston |\\n| 99 | 2023-03-20 | Oluwaseun Okafor | Criminl | CASE-9124 | 200 | Chicago |\\n\\n \\n Your employee wrote a document with the mappings: \\n \\n - Source column name: `case_date`\\n Target column name: `CaseDate`\\n Value transformations: None\\n Explanation: The `case_date` column in the source file should be mapped to the `CaseDate` column in the target file as is, without any transformations. Both columns represent the date of the case.\\n\\n- Source column name: `lastname`\\n Target column name: `FullName`\\n Value transformations: None\\n Explanation: The `lastname` column in the source file should be mapped to the `FullName` column in the target file as is, without any transformations. Both columns represent the last name of the individual involved in the case.\\n\\n- Source column name: `firstname`\\n Target column name: None\\n Value transformations: Concatenate with `lastname` column in target file\\n Explanation: The `firstname` column in the source file should be concatenated with the `lastname` column in the target file to form the `FullName` column. Both columns represent the first name of the individual involved in the case.\\n\\n- Source column name: `case_type`\\n Target column name: `CaseType`\\n Value transformations: None\\n Explanation: The `case_type` column in the source file should be mapped to the `CaseType` column in the target file as is, without any transformations. Both columns represent the type of the case.\\n\\n- Source column name: `case_id`\\n Target column name: `CaseID`\\n Value transformations: Replace `CR-` with `CASE-` in target file\\n Explanation: The `case_id` column in the source file should be mapped to the `CaseID` column in the target file. However, in the target file, the prefix `CR-` should be replaced with `CASE-` to match the desired format.\\n\\n- Source column name: `court_fee`\\n Target column name: `Fee`\\n Value transformations: None\\n Explanation: The `court_fee` column in the source file should be mapped to the `Fee` column in the target file as is, without any transformations. Both columns represent the court fee associated with the case.\\n\\n- Source column name: `jurisdiction`\\n Target column name: `Jurisdiction`\\n Value transformations: Capitalize first letter in target file\\n Explanation: The `jurisdiction` column in the source file should be mapped to the `Jurisdiction` column in the target file. However, in the target file, the first letter should be capitalized to match the desired format. Both columns represent the jurisdiction where the case is being handled.\\n \\n Please review their work, think step by step about their proposal, and then revise it with improvements.\\n Note: your employee had access to different rows than you did. Neither one of you had access to the entire dataset.\\n Your final output should be a CSV with these columns: source_column_name, target_column_name, value_transformation, notes.\\n Be as precise as possible. All of these values are case sensitive. Do not do any transformation or imputation yourself: your job is to explain the data, not change it.\\n Do not include any commas or quotes in the output, except for the CSV delimiters.\\n Instead of quotes, use backticks.\\n Your response:\"\n", + " ]\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 3:llm:ChatOpenAI] [14.53s] Exiting LLM run with output:\n", + "\u001b[0m{\n", + " \"generations\": [\n", + " [\n", + " {\n", + " \"text\": \"source_column_name,target_column_name,value_transformation,notes\\ncase_date,CaseDate,None,The `case_date` column in the source file should be mapped to the `CaseDate` column in the target file as is, without any transformations. Both columns represent the date of the case.\\nlastname,FullName,None,The `lastname` column in the source file should be mapped to the `FullName` column in the target file as is, without any transformations. Both columns represent the last name of the individual involved in the case.\\nfirstname,FullName,Concatenate with `lastname` column in target file,The `firstname` column in the source file should be concatenated with the `lastname` column in the target file to form the `FullName` column. Both columns represent the first name of the individual involved in the case.\\ncase_type,CaseType,None,The `case_type` column in the source file should be mapped to the `CaseType` column in the target file as is, without any transformations. Both columns represent the type of the case.\\ncase_id,CaseID,Replace `CR-` with `CASE-` in target file,The `case_id` column in the source file should be mapped to the `CaseID` column in the target file. However, in the target file, the prefix `CR-` should be replaced with `CASE-` to match the desired format.\\ncourt_fee,Fee,None,The `court_fee` column in the source file should be mapped to the `Fee` column in the target file as is, without any transformations. Both columns represent the court fee associated with the case.\\njurisdiction,Jurisdiction,Capitalize first letter in target file,The `jurisdiction` column in the source file should be mapped to the `Jurisdiction` column in the target file. However, in the target file, the first letter should be capitalized to match the desired format. Both columns represent the jurisdiction where the case is being handled.\",\n", + " \"generation_info\": {\n", + " \"finish_reason\": \"stop\"\n", + " },\n", + " \"message\": {\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"schema\",\n", + " \"messages\",\n", + " \"AIMessage\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"content\": \"source_column_name,target_column_name,value_transformation,notes\\ncase_date,CaseDate,None,The `case_date` column in the source file should be mapped to the `CaseDate` column in the target file as is, without any transformations. Both columns represent the date of the case.\\nlastname,FullName,None,The `lastname` column in the source file should be mapped to the `FullName` column in the target file as is, without any transformations. Both columns represent the last name of the individual involved in the case.\\nfirstname,FullName,Concatenate with `lastname` column in target file,The `firstname` column in the source file should be concatenated with the `lastname` column in the target file to form the `FullName` column. Both columns represent the first name of the individual involved in the case.\\ncase_type,CaseType,None,The `case_type` column in the source file should be mapped to the `CaseType` column in the target file as is, without any transformations. Both columns represent the type of the case.\\ncase_id,CaseID,Replace `CR-` with `CASE-` in target file,The `case_id` column in the source file should be mapped to the `CaseID` column in the target file. However, in the target file, the prefix `CR-` should be replaced with `CASE-` to match the desired format.\\ncourt_fee,Fee,None,The `court_fee` column in the source file should be mapped to the `Fee` column in the target file as is, without any transformations. Both columns represent the court fee associated with the case.\\njurisdiction,Jurisdiction,Capitalize first letter in target file,The `jurisdiction` column in the source file should be mapped to the `Jurisdiction` column in the target file. However, in the target file, the first letter should be capitalized to match the desired format. Both columns represent the jurisdiction where the case is being handled.\",\n", + " \"additional_kwargs\": {}\n", + " }\n", + " }\n", + " }\n", + " ]\n", + " ],\n", + " \"llm_output\": {\n", + " \"token_usage\": {\n", + " \"prompt_tokens\": 1227,\n", + " \"completion_tokens\": 402,\n", + " \"total_tokens\": 1629\n", + " },\n", + " \"model_name\": \"gpt-3.5-turbo\"\n", + " },\n", + " \"run\": null\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 4:parser:StrOutputParser] Entering Parser run with input:\n", + "\u001b[0m[inputs]\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 4:parser:StrOutputParser] [0ms] Exiting Parser run with output:\n", + "\u001b[0m{\n", + " \"output\": \"source_column_name,target_column_name,value_transformation,notes\\ncase_date,CaseDate,None,The `case_date` column in the source file should be mapped to the `CaseDate` column in the target file as is, without any transformations. Both columns represent the date of the case.\\nlastname,FullName,None,The `lastname` column in the source file should be mapped to the `FullName` column in the target file as is, without any transformations. Both columns represent the last name of the individual involved in the case.\\nfirstname,FullName,Concatenate with `lastname` column in target file,The `firstname` column in the source file should be concatenated with the `lastname` column in the target file to form the `FullName` column. Both columns represent the first name of the individual involved in the case.\\ncase_type,CaseType,None,The `case_type` column in the source file should be mapped to the `CaseType` column in the target file as is, without any transformations. Both columns represent the type of the case.\\ncase_id,CaseID,Replace `CR-` with `CASE-` in target file,The `case_id` column in the source file should be mapped to the `CaseID` column in the target file. However, in the target file, the prefix `CR-` should be replaced with `CASE-` to match the desired format.\\ncourt_fee,Fee,None,The `court_fee` column in the source file should be mapped to the `Fee` column in the target file as is, without any transformations. Both columns represent the court fee associated with the case.\\njurisdiction,Jurisdiction,Capitalize first letter in target file,The `jurisdiction` column in the source file should be mapped to the `Jurisdiction` column in the target file. However, in the target file, the first letter should be capitalized to match the desired format. Both columns represent the jurisdiction where the case is being handled.\"\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence] [14.54s] Exiting Chain run with output:\n", + "\u001b[0m{\n", + " \"output\": \"source_column_name,target_column_name,value_transformation,notes\\ncase_date,CaseDate,None,The `case_date` column in the source file should be mapped to the `CaseDate` column in the target file as is, without any transformations. Both columns represent the date of the case.\\nlastname,FullName,None,The `lastname` column in the source file should be mapped to the `FullName` column in the target file as is, without any transformations. Both columns represent the last name of the individual involved in the case.\\nfirstname,FullName,Concatenate with `lastname` column in target file,The `firstname` column in the source file should be concatenated with the `lastname` column in the target file to form the `FullName` column. Both columns represent the first name of the individual involved in the case.\\ncase_type,CaseType,None,The `case_type` column in the source file should be mapped to the `CaseType` column in the target file as is, without any transformations. Both columns represent the type of the case.\\ncase_id,CaseID,Replace `CR-` with `CASE-` in target file,The `case_id` column in the source file should be mapped to the `CaseID` column in the target file. However, in the target file, the prefix `CR-` should be replaced with `CASE-` to match the desired format.\\ncourt_fee,Fee,None,The `court_fee` column in the source file should be mapped to the `Fee` column in the target file as is, without any transformations. Both columns represent the court fee associated with the case.\\njurisdiction,Jurisdiction,Capitalize first letter in target file,The `jurisdiction` column in the source file should be mapped to the `Jurisdiction` column in the target file. However, in the target file, the first letter should be capitalized to match the desired format. Both columns represent the jurisdiction where the case is being handled.\"\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence] Entering Chain run with input:\n", + "\u001b[0m{\n", + " \"csv_str\": \"source_column_name,target_column_name,value_transformation,notes\\ncase_date,CaseDate,None,The `case_date` column in the source file should be mapped to the `CaseDate` column in the target file as is, without any transformations. Both columns represent the date of the case.\\nlastname,FullName,None,The `lastname` column in the source file should be mapped to the `FullName` column in the target file as is, without any transformations. Both columns represent the last name of the individual involved in the case.\\nfirstname,FullName,Concatenate with `lastname` column in target file,The `firstname` column in the source file should be concatenated with the `lastname` column in the target file to form the `FullName` column. Both columns represent the first name of the individual involved in the case.\\ncase_type,CaseType,None,The `case_type` column in the source file should be mapped to the `CaseType` column in the target file as is, without any transformations. Both columns represent the type of the case.\\ncase_id,CaseID,Replace `CR-` with `CASE-` in target file,The `case_id` column in the source file should be mapped to the `CaseID` column in the target file. However, in the target file, the prefix `CR-` should be replaced with `CASE-` to match the desired format.\\ncourt_fee,Fee,None,The `court_fee` column in the source file should be mapped to the `Fee` column in the target file as is, without any transformations. Both columns represent the court fee associated with the case.\\njurisdiction,Jurisdiction,Capitalize first letter in target file,The `jurisdiction` column in the source file should be mapped to the `Jurisdiction` column in the target file. However, in the target file, the first letter should be capitalized to match the desired format. Both columns represent the jurisdiction where the case is being handled.\"\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] Entering Prompt run with input:\n", + "\u001b[0m{\n", + " \"csv_str\": \"source_column_name,target_column_name,value_transformation,notes\\ncase_date,CaseDate,None,The `case_date` column in the source file should be mapped to the `CaseDate` column in the target file as is, without any transformations. Both columns represent the date of the case.\\nlastname,FullName,None,The `lastname` column in the source file should be mapped to the `FullName` column in the target file as is, without any transformations. Both columns represent the last name of the individual involved in the case.\\nfirstname,FullName,Concatenate with `lastname` column in target file,The `firstname` column in the source file should be concatenated with the `lastname` column in the target file to form the `FullName` column. Both columns represent the first name of the individual involved in the case.\\ncase_type,CaseType,None,The `case_type` column in the source file should be mapped to the `CaseType` column in the target file as is, without any transformations. Both columns represent the type of the case.\\ncase_id,CaseID,Replace `CR-` with `CASE-` in target file,The `case_id` column in the source file should be mapped to the `CaseID` column in the target file. However, in the target file, the prefix `CR-` should be replaced with `CASE-` to match the desired format.\\ncourt_fee,Fee,None,The `court_fee` column in the source file should be mapped to the `Fee` column in the target file as is, without any transformations. Both columns represent the court fee associated with the case.\\njurisdiction,Jurisdiction,Capitalize first letter in target file,The `jurisdiction` column in the source file should be mapped to the `Jurisdiction` column in the target file. However, in the target file, the first letter should be capitalized to match the desired format. Both columns represent the jurisdiction where the case is being handled.\"\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] [0ms] Exiting Prompt run with output:\n", + "\u001b[0m{\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"prompts\",\n", + " \"chat\",\n", + " \"ChatPromptValue\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"messages\": [\n", + " {\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"schema\",\n", + " \"messages\",\n", + " \"HumanMessage\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"content\": \"\\n Convert the following csv-like string into valid json:\\n source_column_name,target_column_name,value_transformation,notes\\ncase_date,CaseDate,None,The `case_date` column in the source file should be mapped to the `CaseDate` column in the target file as is, without any transformations. Both columns represent the date of the case.\\nlastname,FullName,None,The `lastname` column in the source file should be mapped to the `FullName` column in the target file as is, without any transformations. Both columns represent the last name of the individual involved in the case.\\nfirstname,FullName,Concatenate with `lastname` column in target file,The `firstname` column in the source file should be concatenated with the `lastname` column in the target file to form the `FullName` column. Both columns represent the first name of the individual involved in the case.\\ncase_type,CaseType,None,The `case_type` column in the source file should be mapped to the `CaseType` column in the target file as is, without any transformations. Both columns represent the type of the case.\\ncase_id,CaseID,Replace `CR-` with `CASE-` in target file,The `case_id` column in the source file should be mapped to the `CaseID` column in the target file. However, in the target file, the prefix `CR-` should be replaced with `CASE-` to match the desired format.\\ncourt_fee,Fee,None,The `court_fee` column in the source file should be mapped to the `Fee` column in the target file as is, without any transformations. Both columns represent the court fee associated with the case.\\njurisdiction,Jurisdiction,Capitalize first letter in target file,The `jurisdiction` column in the source file should be mapped to the `Jurisdiction` column in the target file. However, in the target file, the first letter should be capitalized to match the desired format. Both columns represent the jurisdiction where the case is being handled.\\n Do not include any additional text. Your output should be 100% valid json. Do not transform the values, just the structure.\\n \",\n", + " \"additional_kwargs\": {}\n", + " }\n", + " }\n", + " ]\n", + " }\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 3:llm:ChatOpenAI] Entering LLM run with input:\n", + "\u001b[0m{\n", + " \"prompts\": [\n", + " \"Human: \\n Convert the following csv-like string into valid json:\\n source_column_name,target_column_name,value_transformation,notes\\ncase_date,CaseDate,None,The `case_date` column in the source file should be mapped to the `CaseDate` column in the target file as is, without any transformations. Both columns represent the date of the case.\\nlastname,FullName,None,The `lastname` column in the source file should be mapped to the `FullName` column in the target file as is, without any transformations. Both columns represent the last name of the individual involved in the case.\\nfirstname,FullName,Concatenate with `lastname` column in target file,The `firstname` column in the source file should be concatenated with the `lastname` column in the target file to form the `FullName` column. Both columns represent the first name of the individual involved in the case.\\ncase_type,CaseType,None,The `case_type` column in the source file should be mapped to the `CaseType` column in the target file as is, without any transformations. Both columns represent the type of the case.\\ncase_id,CaseID,Replace `CR-` with `CASE-` in target file,The `case_id` column in the source file should be mapped to the `CaseID` column in the target file. However, in the target file, the prefix `CR-` should be replaced with `CASE-` to match the desired format.\\ncourt_fee,Fee,None,The `court_fee` column in the source file should be mapped to the `Fee` column in the target file as is, without any transformations. Both columns represent the court fee associated with the case.\\njurisdiction,Jurisdiction,Capitalize first letter in target file,The `jurisdiction` column in the source file should be mapped to the `Jurisdiction` column in the target file. However, in the target file, the first letter should be capitalized to match the desired format. Both columns represent the jurisdiction where the case is being handled.\\n Do not include any additional text. Your output should be 100% valid json. Do not transform the values, just the structure.\"\n", + " ]\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 3:llm:ChatOpenAI] [20.56s] Exiting LLM run with output:\n", + "\u001b[0m{\n", + " \"generations\": [\n", + " [\n", + " {\n", + " \"text\": \"[\\n {\\n \\\"source_column_name\\\": \\\"case_date\\\",\\n \\\"target_column_name\\\": \\\"CaseDate\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The `case_date` column in the source file should be mapped to the `CaseDate` column in the target file as is, without any transformations. Both columns represent the date of the case.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"lastname\\\",\\n \\\"target_column_name\\\": \\\"FullName\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The `lastname` column in the source file should be mapped to the `FullName` column in the target file as is, without any transformations. Both columns represent the last name of the individual involved in the case.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"firstname\\\",\\n \\\"target_column_name\\\": \\\"FullName\\\",\\n \\\"value_transformation\\\": \\\"Concatenate with `lastname` column in target file\\\",\\n \\\"notes\\\": \\\"The `firstname` column in the source file should be concatenated with the `lastname` column in the target file to form the `FullName` column. Both columns represent the first name of the individual involved in the case.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"case_type\\\",\\n \\\"target_column_name\\\": \\\"CaseType\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The `case_type` column in the source file should be mapped to the `CaseType` column in the target file as is, without any transformations. Both columns represent the type of the case.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"case_id\\\",\\n \\\"target_column_name\\\": \\\"CaseID\\\",\\n \\\"value_transformation\\\": \\\"Replace `CR-` with `CASE-` in target file\\\",\\n \\\"notes\\\": \\\"The `case_id` column in the source file should be mapped to the `CaseID` column in the target file. However, in the target file, the prefix `CR-` should be replaced with `CASE-` to match the desired format.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"court_fee\\\",\\n \\\"target_column_name\\\": \\\"Fee\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The `court_fee` column in the source file should be mapped to the `Fee` column in the target file as is, without any transformations. Both columns represent the court fee associated with the case.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"jurisdiction\\\",\\n \\\"target_column_name\\\": \\\"Jurisdiction\\\",\\n \\\"value_transformation\\\": \\\"Capitalize first letter in target file\\\",\\n \\\"notes\\\": \\\"The `jurisdiction` column in the source file should be mapped to the `Jurisdiction` column in the target file. However, in the target file, the first letter should be capitalized to match the desired format. Both columns represent the jurisdiction where the case is being handled.\\\"\\n }\\n]\",\n", + " \"generation_info\": {\n", + " \"finish_reason\": \"stop\"\n", + " },\n", + " \"message\": {\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"schema\",\n", + " \"messages\",\n", + " \"AIMessage\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"content\": \"[\\n {\\n \\\"source_column_name\\\": \\\"case_date\\\",\\n \\\"target_column_name\\\": \\\"CaseDate\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The `case_date` column in the source file should be mapped to the `CaseDate` column in the target file as is, without any transformations. Both columns represent the date of the case.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"lastname\\\",\\n \\\"target_column_name\\\": \\\"FullName\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The `lastname` column in the source file should be mapped to the `FullName` column in the target file as is, without any transformations. Both columns represent the last name of the individual involved in the case.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"firstname\\\",\\n \\\"target_column_name\\\": \\\"FullName\\\",\\n \\\"value_transformation\\\": \\\"Concatenate with `lastname` column in target file\\\",\\n \\\"notes\\\": \\\"The `firstname` column in the source file should be concatenated with the `lastname` column in the target file to form the `FullName` column. Both columns represent the first name of the individual involved in the case.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"case_type\\\",\\n \\\"target_column_name\\\": \\\"CaseType\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The `case_type` column in the source file should be mapped to the `CaseType` column in the target file as is, without any transformations. Both columns represent the type of the case.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"case_id\\\",\\n \\\"target_column_name\\\": \\\"CaseID\\\",\\n \\\"value_transformation\\\": \\\"Replace `CR-` with `CASE-` in target file\\\",\\n \\\"notes\\\": \\\"The `case_id` column in the source file should be mapped to the `CaseID` column in the target file. However, in the target file, the prefix `CR-` should be replaced with `CASE-` to match the desired format.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"court_fee\\\",\\n \\\"target_column_name\\\": \\\"Fee\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The `court_fee` column in the source file should be mapped to the `Fee` column in the target file as is, without any transformations. Both columns represent the court fee associated with the case.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"jurisdiction\\\",\\n \\\"target_column_name\\\": \\\"Jurisdiction\\\",\\n \\\"value_transformation\\\": \\\"Capitalize first letter in target file\\\",\\n \\\"notes\\\": \\\"The `jurisdiction` column in the source file should be mapped to the `Jurisdiction` column in the target file. However, in the target file, the first letter should be capitalized to match the desired format. Both columns represent the jurisdiction where the case is being handled.\\\"\\n }\\n]\",\n", + " \"additional_kwargs\": {}\n", + " }\n", + " }\n", + " }\n", + " ]\n", + " ],\n", + " \"llm_output\": {\n", + " \"token_usage\": {\n", + " \"prompt_tokens\": 451,\n", + " \"completion_tokens\": 606,\n", + " \"total_tokens\": 1057\n", + " },\n", + " \"model_name\": \"gpt-3.5-turbo\"\n", + " },\n", + " \"run\": null\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 4:parser:StrOutputParser] Entering Parser run with input:\n", + "\u001b[0m[inputs]\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 4:parser:StrOutputParser] [0ms] Exiting Parser run with output:\n", + "\u001b[0m{\n", + " \"output\": \"[\\n {\\n \\\"source_column_name\\\": \\\"case_date\\\",\\n \\\"target_column_name\\\": \\\"CaseDate\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The `case_date` column in the source file should be mapped to the `CaseDate` column in the target file as is, without any transformations. Both columns represent the date of the case.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"lastname\\\",\\n \\\"target_column_name\\\": \\\"FullName\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The `lastname` column in the source file should be mapped to the `FullName` column in the target file as is, without any transformations. Both columns represent the last name of the individual involved in the case.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"firstname\\\",\\n \\\"target_column_name\\\": \\\"FullName\\\",\\n \\\"value_transformation\\\": \\\"Concatenate with `lastname` column in target file\\\",\\n \\\"notes\\\": \\\"The `firstname` column in the source file should be concatenated with the `lastname` column in the target file to form the `FullName` column. Both columns represent the first name of the individual involved in the case.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"case_type\\\",\\n \\\"target_column_name\\\": \\\"CaseType\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The `case_type` column in the source file should be mapped to the `CaseType` column in the target file as is, without any transformations. Both columns represent the type of the case.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"case_id\\\",\\n \\\"target_column_name\\\": \\\"CaseID\\\",\\n \\\"value_transformation\\\": \\\"Replace `CR-` with `CASE-` in target file\\\",\\n \\\"notes\\\": \\\"The `case_id` column in the source file should be mapped to the `CaseID` column in the target file. However, in the target file, the prefix `CR-` should be replaced with `CASE-` to match the desired format.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"court_fee\\\",\\n \\\"target_column_name\\\": \\\"Fee\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The `court_fee` column in the source file should be mapped to the `Fee` column in the target file as is, without any transformations. Both columns represent the court fee associated with the case.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"jurisdiction\\\",\\n \\\"target_column_name\\\": \\\"Jurisdiction\\\",\\n \\\"value_transformation\\\": \\\"Capitalize first letter in target file\\\",\\n \\\"notes\\\": \\\"The `jurisdiction` column in the source file should be mapped to the `Jurisdiction` column in the target file. However, in the target file, the first letter should be capitalized to match the desired format. Both columns represent the jurisdiction where the case is being handled.\\\"\\n }\\n]\"\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence] [20.56s] Exiting Chain run with output:\n", + "\u001b[0m{\n", + " \"output\": \"[\\n {\\n \\\"source_column_name\\\": \\\"case_date\\\",\\n \\\"target_column_name\\\": \\\"CaseDate\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The `case_date` column in the source file should be mapped to the `CaseDate` column in the target file as is, without any transformations. Both columns represent the date of the case.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"lastname\\\",\\n \\\"target_column_name\\\": \\\"FullName\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The `lastname` column in the source file should be mapped to the `FullName` column in the target file as is, without any transformations. Both columns represent the last name of the individual involved in the case.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"firstname\\\",\\n \\\"target_column_name\\\": \\\"FullName\\\",\\n \\\"value_transformation\\\": \\\"Concatenate with `lastname` column in target file\\\",\\n \\\"notes\\\": \\\"The `firstname` column in the source file should be concatenated with the `lastname` column in the target file to form the `FullName` column. Both columns represent the first name of the individual involved in the case.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"case_type\\\",\\n \\\"target_column_name\\\": \\\"CaseType\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The `case_type` column in the source file should be mapped to the `CaseType` column in the target file as is, without any transformations. Both columns represent the type of the case.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"case_id\\\",\\n \\\"target_column_name\\\": \\\"CaseID\\\",\\n \\\"value_transformation\\\": \\\"Replace `CR-` with `CASE-` in target file\\\",\\n \\\"notes\\\": \\\"The `case_id` column in the source file should be mapped to the `CaseID` column in the target file. However, in the target file, the prefix `CR-` should be replaced with `CASE-` to match the desired format.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"court_fee\\\",\\n \\\"target_column_name\\\": \\\"Fee\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The `court_fee` column in the source file should be mapped to the `Fee` column in the target file as is, without any transformations. Both columns represent the court fee associated with the case.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"jurisdiction\\\",\\n \\\"target_column_name\\\": \\\"Jurisdiction\\\",\\n \\\"value_transformation\\\": \\\"Capitalize first letter in target file\\\",\\n \\\"notes\\\": \\\"The `jurisdiction` column in the source file should be mapped to the `Jurisdiction` column in the target file. However, in the target file, the first letter should be capitalized to match the desired format. Both columns represent the jurisdiction where the case is being handled.\\\"\\n }\\n]\"\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Traceback (most recent call last):\n", + " File \"/Users/andybryant/Desktop/projects/zero-mapper/venv/lib/python3.9/site-packages/gradio/routes.py\", line 488, in run_predict\n", + " output = await app.get_blocks().process_api(\n", + " File \"/Users/andybryant/Desktop/projects/zero-mapper/venv/lib/python3.9/site-packages/gradio/blocks.py\", line 1431, in process_api\n", + " result = await self.call_function(\n", + " File \"/Users/andybryant/Desktop/projects/zero-mapper/venv/lib/python3.9/site-packages/gradio/blocks.py\", line 1109, in call_function\n", + " prediction = await anyio.to_thread.run_sync(\n", + " File \"/Users/andybryant/Desktop/projects/zero-mapper/venv/lib/python3.9/site-packages/anyio/to_thread.py\", line 33, in run_sync\n", + " return await get_asynclib().run_sync_in_worker_thread(\n", + " File \"/Users/andybryant/Desktop/projects/zero-mapper/venv/lib/python3.9/site-packages/anyio/_backends/_asyncio.py\", line 877, in run_sync_in_worker_thread\n", + " return await future\n", + " File \"/Users/andybryant/Desktop/projects/zero-mapper/venv/lib/python3.9/site-packages/anyio/_backends/_asyncio.py\", line 807, in run\n", + " result = context.run(func, *args)\n", + " File \"/Users/andybryant/Desktop/projects/zero-mapper/venv/lib/python3.9/site-packages/gradio/utils.py\", line 706, in wrapper\n", + " response = f(*args, **kwargs)\n", + " File \"/var/folders/lx/3ksh07r96gn2v7b8mb__3mpc0000gn/T/ipykernel_74941/201359743.py\", line 36, in run_code\n", + " return pd.read_csv(json.loads(chain_3.invoke({\"csv_str\": scientist_csv_str})))\n", + " File \"/Users/andybryant/Desktop/projects/zero-mapper/venv/lib/python3.9/site-packages/pandas/io/parsers/readers.py\", line 912, in read_csv\n", + " return _read(filepath_or_buffer, kwds)\n", + " File \"/Users/andybryant/Desktop/projects/zero-mapper/venv/lib/python3.9/site-packages/pandas/io/parsers/readers.py\", line 577, in _read\n", + " parser = TextFileReader(filepath_or_buffer, **kwds)\n", + " File \"/Users/andybryant/Desktop/projects/zero-mapper/venv/lib/python3.9/site-packages/pandas/io/parsers/readers.py\", line 1407, in __init__\n", + " self._engine = self._make_engine(f, self.engine)\n", + " File \"/Users/andybryant/Desktop/projects/zero-mapper/venv/lib/python3.9/site-packages/pandas/io/parsers/readers.py\", line 1676, in _make_engine\n", + " raise ValueError(msg)\n", + "ValueError: Invalid file path or buffer object type: \n" + ] + } + ], + "source": [ + "demo = gr.Interface(fn=run_code, inputs=[\"file\"], outputs='dataframe')\n", + "demo.launch()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "scientist_csv_str = 'source_column_name, target_column_name, value_transformation, notes\\ncase_date, CaseDate, None, The source column \"case_date\" maps directly to the target column \"CaseDate\" without any value transformations. The data represents the date of the case in both files.\\nlastname, FullName, None, The source column \"lastname\" maps directly to the target column \"FullName\" without any value transformations. The data represents the last name of the individual involved in the case in both files.\\nfirstname, FullName, None, The source column \"firstname\" maps directly to the target column \"FullName\" without any value transformations. The data represents the first name of the individual involved in the case in both files.\\ncase_type, CaseType, None, The source column \"case_type\" maps directly to the target column \"CaseType\" without any value transformations. The data represents the type of the case in both files.\\ncase_id, CaseID, None, The source column \"case_id\" maps directly to the target column \"CaseID\" without any value transformations. The data represents the unique identifier of the case in both files.\\ncourt_fee, Fee, None, The source column \"court_fee\" maps directly to the target column \"Fee\" without any value transformations. The data represents the fee associated with the case in both files.\\njurisdiction, Jurisdiction, \"BOSTON\" -> \"Boston\", \"houston\" -> \"Houston\", \"chicago\" -> \"Chicago\", The source column \"jurisdiction\" maps directly to the target column \"Jurisdiction\" with the following value transformations: \"BOSTON\" is transformed to \"Boston\", \"houston\" is transformed to \"Houston\", and \"chicago\" is transformed to \"Chicago\". The data represents the jurisdiction where the case is filed, and the capitalization is adjusted to match the formatting in the target file.'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running on local URL: http://127.0.0.1:7870\n", + "\n", + "To create a public link, set `share=True` in `launch()`.\n" + ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def process_files(files):\n", + " # Process files here \n", + " return scientist_csv_str\n", + "\n", + "iface = gr.Interface(fn=process_files, \n", + " inputs=[gr.File(label=\"Load files\", file_types=['csv'], file_count='multiple')], \n", + " outputs=gr.Markdown(interactive=True)) \n", + "\n", + "iface.launch()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence] Entering Chain run with input:\n", + "\u001b[0m{\n", + " \"source_1_csv_str\": \"\\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\n\",\n", + " \"target_csv_str\": \"\\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n| 0 | 2023-05-12 | Miguel Kim | Civil | CASE-6761 | 100 | Boston |\\n| 1 | 2023-04-20 | John Lee | Criminl | CASE-6089 | 150 | Houston |\\n| 2 | 2023-02-10 | Dmitri Smith | Criminal | CASE-9565 | 200 | Chicago |\\n| 3 | 2023-03-16 | Dmitri Patel | Criminal | CASE-6222 | 100 | Boston |\\n| 4 | 2023-06-15 | Jane Ivanov | Family | CASE-2702 | 200 | Houston |\\n\"\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] Entering Prompt run with input:\n", + "\u001b[0m{\n", + " \"source_1_csv_str\": \"\\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\n\",\n", + " \"target_csv_str\": \"\\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n| 0 | 2023-05-12 | Miguel Kim | Civil | CASE-6761 | 100 | Boston |\\n| 1 | 2023-04-20 | John Lee | Criminl | CASE-6089 | 150 | Houston |\\n| 2 | 2023-02-10 | Dmitri Smith | Criminal | CASE-9565 | 200 | Chicago |\\n| 3 | 2023-03-16 | Dmitri Patel | Criminal | CASE-6222 | 100 | Boston |\\n| 4 | 2023-06-15 | Jane Ivanov | Family | CASE-2702 | 200 | Houston |\\n\"\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] [0ms] Exiting Prompt run with output:\n", + "\u001b[0m{\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"prompts\",\n", + " \"chat\",\n", + " \"ChatPromptValue\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"messages\": [\n", + " {\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"schema\",\n", + " \"messages\",\n", + " \"HumanMessage\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"content\": \"\\n You are a Junior Data Analyst. You are working with Data Scientists and Software Engineers to transform data from one source format to a target format.\\n Here is the head of a CSV file, called source_1_csv:\\n \\n \\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\n\\n \\n Here is the head of another, called target_csv:\\n \\n \\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n| 0 | 2023-05-12 | Miguel Kim | Civil | CASE-6761 | 100 | Boston |\\n| 1 | 2023-04-20 | John Lee | Criminl | CASE-6089 | 150 | Houston |\\n| 2 | 2023-02-10 | Dmitri Smith | Criminal | CASE-9565 | 200 | Chicago |\\n| 3 | 2023-03-16 | Dmitri Patel | Criminal | CASE-6222 | 100 | Boston |\\n| 4 | 2023-06-15 | Jane Ivanov | Family | CASE-2702 | 200 | Houston |\\n\\n \\n Your job is to generate a thorough, precise summary of exactly which columns in the source file should map to the target file, including a brief explanation for each.\\n In addition, if the cell values appear to be different, you should note that as well. For example, if the source date is formatted like this 2020-01-01 and the target date is formatted like this 01/01/2020, you should note that.\\n Your output should be natural language; a bulleted list with four values: 1. source column name, 2. target column name, 3. value transformations (if necessary), 4. explanation.\\n Be as precise as possible. All of these values are case sensitive. Do not do any transformation or imputation yourself: your job is to explain the data, not change it.\\n Do not include rows: focus on the column names and data types.\\n DO NOT INCLUDE COMMAS OR QUOTES ANYWHERE THIS WILL BE USED IN CSV GENERATION.\\n Instead of quotes, use backticks.\\n Your response:\\n\",\n", + " \"additional_kwargs\": {}\n", + " }\n", + " }\n", + " ]\n", + " }\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 3:llm:ChatOpenAI] Entering LLM run with input:\n", + "\u001b[0m{\n", + " \"prompts\": [\n", + " \"Human: \\n You are a Junior Data Analyst. You are working with Data Scientists and Software Engineers to transform data from one source format to a target format.\\n Here is the head of a CSV file, called source_1_csv:\\n \\n \\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\n\\n \\n Here is the head of another, called target_csv:\\n \\n \\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n| 0 | 2023-05-12 | Miguel Kim | Civil | CASE-6761 | 100 | Boston |\\n| 1 | 2023-04-20 | John Lee | Criminl | CASE-6089 | 150 | Houston |\\n| 2 | 2023-02-10 | Dmitri Smith | Criminal | CASE-9565 | 200 | Chicago |\\n| 3 | 2023-03-16 | Dmitri Patel | Criminal | CASE-6222 | 100 | Boston |\\n| 4 | 2023-06-15 | Jane Ivanov | Family | CASE-2702 | 200 | Houston |\\n\\n \\n Your job is to generate a thorough, precise summary of exactly which columns in the source file should map to the target file, including a brief explanation for each.\\n In addition, if the cell values appear to be different, you should note that as well. For example, if the source date is formatted like this 2020-01-01 and the target date is formatted like this 01/01/2020, you should note that.\\n Your output should be natural language; a bulleted list with four values: 1. source column name, 2. target column name, 3. value transformations (if necessary), 4. explanation.\\n Be as precise as possible. All of these values are case sensitive. Do not do any transformation or imputation yourself: your job is to explain the data, not change it.\\n Do not include rows: focus on the column names and data types.\\n DO NOT INCLUDE COMMAS OR QUOTES ANYWHERE THIS WILL BE USED IN CSV GENERATION.\\n Instead of quotes, use backticks.\\n Your response:\"\n", + " ]\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 3:llm:ChatOpenAI] [15.45s] Exiting LLM run with output:\n", + "\u001b[0m{\n", + " \"generations\": [\n", + " [\n", + " {\n", + " \"text\": \"- Source column name: case_date\\n Target column name: CaseDate\\n Value transformations: None\\n Explanation: The source column \\\"case_date\\\" should map to the target column \\\"CaseDate\\\" as it represents the date of the case. No value transformations are required.\\n\\n- Source column name: lastname\\n Target column name: FullName\\n Value transformations: None\\n Explanation: The source column \\\"lastname\\\" should map to the target column \\\"FullName\\\" as it represents the last name of the individual involved in the case. No value transformations are required.\\n\\n- Source column name: firstname\\n Target column name: FullName\\n Value transformations: None\\n Explanation: The source column \\\"firstname\\\" should also map to the target column \\\"FullName\\\" as it represents the first name of the individual involved in the case. No value transformations are required.\\n\\n- Source column name: case_type\\n Target column name: CaseType\\n Value transformations: None\\n Explanation: The source column \\\"case_type\\\" should map to the target column \\\"CaseType\\\" as it represents the type of the case. No value transformations are required.\\n\\n- Source column name: case_id\\n Target column name: CaseID\\n Value transformations: \\\"CR-\\\" prefix should be removed from the source values.\\n Explanation: The source column \\\"case_id\\\" should map to the target column \\\"CaseID\\\" as it represents the unique identifier of the case. However, the source values have a \\\"CR-\\\" prefix that should be removed to match the target format.\\n\\n- Source column name: court_fee\\n Target column name: Fee\\n Value transformations: None\\n Explanation: The source column \\\"court_fee\\\" should map to the target column \\\"Fee\\\" as it represents the fee associated with the case. No value transformations are required.\\n\\n- Source column name: jurisdiction\\n Target column name: Jurisdiction\\n Value transformations: Capitalize the first letter of each word in the source values.\\n Explanation: The source column \\\"jurisdiction\\\" should map to the target column \\\"Jurisdiction\\\" as it represents the jurisdiction where the case is held. However, the source values are not consistently capitalized, so they should be transformed to capitalize the first letter of each word to match the target format.\",\n", + " \"generation_info\": {\n", + " \"finish_reason\": \"stop\"\n", + " },\n", + " \"message\": {\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"schema\",\n", + " \"messages\",\n", + " \"AIMessage\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"content\": \"- Source column name: case_date\\n Target column name: CaseDate\\n Value transformations: None\\n Explanation: The source column \\\"case_date\\\" should map to the target column \\\"CaseDate\\\" as it represents the date of the case. No value transformations are required.\\n\\n- Source column name: lastname\\n Target column name: FullName\\n Value transformations: None\\n Explanation: The source column \\\"lastname\\\" should map to the target column \\\"FullName\\\" as it represents the last name of the individual involved in the case. No value transformations are required.\\n\\n- Source column name: firstname\\n Target column name: FullName\\n Value transformations: None\\n Explanation: The source column \\\"firstname\\\" should also map to the target column \\\"FullName\\\" as it represents the first name of the individual involved in the case. No value transformations are required.\\n\\n- Source column name: case_type\\n Target column name: CaseType\\n Value transformations: None\\n Explanation: The source column \\\"case_type\\\" should map to the target column \\\"CaseType\\\" as it represents the type of the case. No value transformations are required.\\n\\n- Source column name: case_id\\n Target column name: CaseID\\n Value transformations: \\\"CR-\\\" prefix should be removed from the source values.\\n Explanation: The source column \\\"case_id\\\" should map to the target column \\\"CaseID\\\" as it represents the unique identifier of the case. However, the source values have a \\\"CR-\\\" prefix that should be removed to match the target format.\\n\\n- Source column name: court_fee\\n Target column name: Fee\\n Value transformations: None\\n Explanation: The source column \\\"court_fee\\\" should map to the target column \\\"Fee\\\" as it represents the fee associated with the case. No value transformations are required.\\n\\n- Source column name: jurisdiction\\n Target column name: Jurisdiction\\n Value transformations: Capitalize the first letter of each word in the source values.\\n Explanation: The source column \\\"jurisdiction\\\" should map to the target column \\\"Jurisdiction\\\" as it represents the jurisdiction where the case is held. However, the source values are not consistently capitalized, so they should be transformed to capitalize the first letter of each word to match the target format.\",\n", + " \"additional_kwargs\": {}\n", + " }\n", + " }\n", + " }\n", + " ]\n", + " ],\n", + " \"llm_output\": {\n", + " \"token_usage\": {\n", + " \"prompt_tokens\": 758,\n", + " \"completion_tokens\": 462,\n", + " \"total_tokens\": 1220\n", + " },\n", + " \"model_name\": \"gpt-3.5-turbo\"\n", + " },\n", + " \"run\": null\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 4:parser:StrOutputParser] Entering Parser run with input:\n", + "\u001b[0m[inputs]\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 4:parser:StrOutputParser] [0ms] Exiting Parser run with output:\n", + "\u001b[0m{\n", + " \"output\": \"- Source column name: case_date\\n Target column name: CaseDate\\n Value transformations: None\\n Explanation: The source column \\\"case_date\\\" should map to the target column \\\"CaseDate\\\" as it represents the date of the case. No value transformations are required.\\n\\n- Source column name: lastname\\n Target column name: FullName\\n Value transformations: None\\n Explanation: The source column \\\"lastname\\\" should map to the target column \\\"FullName\\\" as it represents the last name of the individual involved in the case. No value transformations are required.\\n\\n- Source column name: firstname\\n Target column name: FullName\\n Value transformations: None\\n Explanation: The source column \\\"firstname\\\" should also map to the target column \\\"FullName\\\" as it represents the first name of the individual involved in the case. No value transformations are required.\\n\\n- Source column name: case_type\\n Target column name: CaseType\\n Value transformations: None\\n Explanation: The source column \\\"case_type\\\" should map to the target column \\\"CaseType\\\" as it represents the type of the case. No value transformations are required.\\n\\n- Source column name: case_id\\n Target column name: CaseID\\n Value transformations: \\\"CR-\\\" prefix should be removed from the source values.\\n Explanation: The source column \\\"case_id\\\" should map to the target column \\\"CaseID\\\" as it represents the unique identifier of the case. However, the source values have a \\\"CR-\\\" prefix that should be removed to match the target format.\\n\\n- Source column name: court_fee\\n Target column name: Fee\\n Value transformations: None\\n Explanation: The source column \\\"court_fee\\\" should map to the target column \\\"Fee\\\" as it represents the fee associated with the case. No value transformations are required.\\n\\n- Source column name: jurisdiction\\n Target column name: Jurisdiction\\n Value transformations: Capitalize the first letter of each word in the source values.\\n Explanation: The source column \\\"jurisdiction\\\" should map to the target column \\\"Jurisdiction\\\" as it represents the jurisdiction where the case is held. However, the source values are not consistently capitalized, so they should be transformed to capitalize the first letter of each word to match the target format.\"\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence] [15.45s] Exiting Chain run with output:\n", + "\u001b[0m{\n", + " \"output\": \"- Source column name: case_date\\n Target column name: CaseDate\\n Value transformations: None\\n Explanation: The source column \\\"case_date\\\" should map to the target column \\\"CaseDate\\\" as it represents the date of the case. No value transformations are required.\\n\\n- Source column name: lastname\\n Target column name: FullName\\n Value transformations: None\\n Explanation: The source column \\\"lastname\\\" should map to the target column \\\"FullName\\\" as it represents the last name of the individual involved in the case. No value transformations are required.\\n\\n- Source column name: firstname\\n Target column name: FullName\\n Value transformations: None\\n Explanation: The source column \\\"firstname\\\" should also map to the target column \\\"FullName\\\" as it represents the first name of the individual involved in the case. No value transformations are required.\\n\\n- Source column name: case_type\\n Target column name: CaseType\\n Value transformations: None\\n Explanation: The source column \\\"case_type\\\" should map to the target column \\\"CaseType\\\" as it represents the type of the case. No value transformations are required.\\n\\n- Source column name: case_id\\n Target column name: CaseID\\n Value transformations: \\\"CR-\\\" prefix should be removed from the source values.\\n Explanation: The source column \\\"case_id\\\" should map to the target column \\\"CaseID\\\" as it represents the unique identifier of the case. However, the source values have a \\\"CR-\\\" prefix that should be removed to match the target format.\\n\\n- Source column name: court_fee\\n Target column name: Fee\\n Value transformations: None\\n Explanation: The source column \\\"court_fee\\\" should map to the target column \\\"Fee\\\" as it represents the fee associated with the case. No value transformations are required.\\n\\n- Source column name: jurisdiction\\n Target column name: Jurisdiction\\n Value transformations: Capitalize the first letter of each word in the source values.\\n Explanation: The source column \\\"jurisdiction\\\" should map to the target column \\\"Jurisdiction\\\" as it represents the jurisdiction where the case is held. However, the source values are not consistently capitalized, so they should be transformed to capitalize the first letter of each word to match the target format.\"\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence] Entering Chain run with input:\n", + "\u001b[0m{\n", + " \"source_1_csv_str\": \"\\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 95 | 2023-08-13 | Rodriguez | Jane | Familly | CR-8883 | 200 | chicago |\\n| 96 | 2023-03-27 | Patel | John | Familly | CR-2838 | 100 | BOSTON |\\n| 97 | 2023-07-27 | Okafor | Miguel | Criminal | CR-3885 | 250 | new York |\\n| 98 | 2023-01-14 | Ivanov | Alan | Family | CR-1066 | 250 | houston |\\n| 99 | 2023-03-20 | Okafor | Oluwaseun | Criminl | CR-9851 | 200 | chicago |\\n\",\n", + " \"target_csv_str\": \"\\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-----------------|:-----------|:----------|------:|:---------------|\\n| 95 | 2023-08-13 | Jane Rodriguez | Family | CASE-1703 | 200 | Chicago |\\n| 96 | 2023-03-27 | John Patel | Family | CASE-7934 | 100 | Boston |\\n| 97 | 2023-07-27 | Miguel Okafor | Criminal | CASE-3842 | 250 | New York |\\n| 98 | 2023-01-14 | Alan Ivanov | Family | CASE-1295 | 250 | Houston |\\n| 99 | 2023-03-20 | Oluwaseun Okafor | Criminl | CASE-9124 | 200 | Chicago |\\n\",\n", + " \"employee_script_str\": \"- Source column name: case_date\\n Target column name: CaseDate\\n Value transformations: None\\n Explanation: The source column \\\"case_date\\\" should map to the target column \\\"CaseDate\\\" as it represents the date of the case. No value transformations are required.\\n\\n- Source column name: lastname\\n Target column name: FullName\\n Value transformations: None\\n Explanation: The source column \\\"lastname\\\" should map to the target column \\\"FullName\\\" as it represents the last name of the individual involved in the case. No value transformations are required.\\n\\n- Source column name: firstname\\n Target column name: FullName\\n Value transformations: None\\n Explanation: The source column \\\"firstname\\\" should also map to the target column \\\"FullName\\\" as it represents the first name of the individual involved in the case. No value transformations are required.\\n\\n- Source column name: case_type\\n Target column name: CaseType\\n Value transformations: None\\n Explanation: The source column \\\"case_type\\\" should map to the target column \\\"CaseType\\\" as it represents the type of the case. No value transformations are required.\\n\\n- Source column name: case_id\\n Target column name: CaseID\\n Value transformations: \\\"CR-\\\" prefix should be removed from the source values.\\n Explanation: The source column \\\"case_id\\\" should map to the target column \\\"CaseID\\\" as it represents the unique identifier of the case. However, the source values have a \\\"CR-\\\" prefix that should be removed to match the target format.\\n\\n- Source column name: court_fee\\n Target column name: Fee\\n Value transformations: None\\n Explanation: The source column \\\"court_fee\\\" should map to the target column \\\"Fee\\\" as it represents the fee associated with the case. No value transformations are required.\\n\\n- Source column name: jurisdiction\\n Target column name: Jurisdiction\\n Value transformations: Capitalize the first letter of each word in the source values.\\n Explanation: The source column \\\"jurisdiction\\\" should map to the target column \\\"Jurisdiction\\\" as it represents the jurisdiction where the case is held. However, the source values are not consistently capitalized, so they should be transformed to capitalize the first letter of each word to match the target format.\"\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] Entering Prompt run with input:\n", + "\u001b[0m{\n", + " \"source_1_csv_str\": \"\\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 95 | 2023-08-13 | Rodriguez | Jane | Familly | CR-8883 | 200 | chicago |\\n| 96 | 2023-03-27 | Patel | John | Familly | CR-2838 | 100 | BOSTON |\\n| 97 | 2023-07-27 | Okafor | Miguel | Criminal | CR-3885 | 250 | new York |\\n| 98 | 2023-01-14 | Ivanov | Alan | Family | CR-1066 | 250 | houston |\\n| 99 | 2023-03-20 | Okafor | Oluwaseun | Criminl | CR-9851 | 200 | chicago |\\n\",\n", + " \"target_csv_str\": \"\\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-----------------|:-----------|:----------|------:|:---------------|\\n| 95 | 2023-08-13 | Jane Rodriguez | Family | CASE-1703 | 200 | Chicago |\\n| 96 | 2023-03-27 | John Patel | Family | CASE-7934 | 100 | Boston |\\n| 97 | 2023-07-27 | Miguel Okafor | Criminal | CASE-3842 | 250 | New York |\\n| 98 | 2023-01-14 | Alan Ivanov | Family | CASE-1295 | 250 | Houston |\\n| 99 | 2023-03-20 | Oluwaseun Okafor | Criminl | CASE-9124 | 200 | Chicago |\\n\",\n", + " \"employee_script_str\": \"- Source column name: case_date\\n Target column name: CaseDate\\n Value transformations: None\\n Explanation: The source column \\\"case_date\\\" should map to the target column \\\"CaseDate\\\" as it represents the date of the case. No value transformations are required.\\n\\n- Source column name: lastname\\n Target column name: FullName\\n Value transformations: None\\n Explanation: The source column \\\"lastname\\\" should map to the target column \\\"FullName\\\" as it represents the last name of the individual involved in the case. No value transformations are required.\\n\\n- Source column name: firstname\\n Target column name: FullName\\n Value transformations: None\\n Explanation: The source column \\\"firstname\\\" should also map to the target column \\\"FullName\\\" as it represents the first name of the individual involved in the case. No value transformations are required.\\n\\n- Source column name: case_type\\n Target column name: CaseType\\n Value transformations: None\\n Explanation: The source column \\\"case_type\\\" should map to the target column \\\"CaseType\\\" as it represents the type of the case. No value transformations are required.\\n\\n- Source column name: case_id\\n Target column name: CaseID\\n Value transformations: \\\"CR-\\\" prefix should be removed from the source values.\\n Explanation: The source column \\\"case_id\\\" should map to the target column \\\"CaseID\\\" as it represents the unique identifier of the case. However, the source values have a \\\"CR-\\\" prefix that should be removed to match the target format.\\n\\n- Source column name: court_fee\\n Target column name: Fee\\n Value transformations: None\\n Explanation: The source column \\\"court_fee\\\" should map to the target column \\\"Fee\\\" as it represents the fee associated with the case. No value transformations are required.\\n\\n- Source column name: jurisdiction\\n Target column name: Jurisdiction\\n Value transformations: Capitalize the first letter of each word in the source values.\\n Explanation: The source column \\\"jurisdiction\\\" should map to the target column \\\"Jurisdiction\\\" as it represents the jurisdiction where the case is held. However, the source values are not consistently capitalized, so they should be transformed to capitalize the first letter of each word to match the target format.\"\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] [0ms] Exiting Prompt run with output:\n", + "\u001b[0m{\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"prompts\",\n", + " \"chat\",\n", + " \"ChatPromptValue\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"messages\": [\n", + " {\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"schema\",\n", + " \"messages\",\n", + " \"HumanMessage\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"content\": \"\\n You are a Senior Data Scientist. You are generating a CSV that will be used by a Software Engineer to generate python code to transform one CSV file format to another.\\n \\n Here is the tail of a CSV file, called source_1_csv: \\n \\n \\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 95 | 2023-08-13 | Rodriguez | Jane | Familly | CR-8883 | 200 | chicago |\\n| 96 | 2023-03-27 | Patel | John | Familly | CR-2838 | 100 | BOSTON |\\n| 97 | 2023-07-27 | Okafor | Miguel | Criminal | CR-3885 | 250 | new York |\\n| 98 | 2023-01-14 | Ivanov | Alan | Family | CR-1066 | 250 | houston |\\n| 99 | 2023-03-20 | Okafor | Oluwaseun | Criminl | CR-9851 | 200 | chicago |\\n\\n \\n Here are five from another, called target_csv: \\n \\n \\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-----------------|:-----------|:----------|------:|:---------------|\\n| 95 | 2023-08-13 | Jane Rodriguez | Family | CASE-1703 | 200 | Chicago |\\n| 96 | 2023-03-27 | John Patel | Family | CASE-7934 | 100 | Boston |\\n| 97 | 2023-07-27 | Miguel Okafor | Criminal | CASE-3842 | 250 | New York |\\n| 98 | 2023-01-14 | Alan Ivanov | Family | CASE-1295 | 250 | Houston |\\n| 99 | 2023-03-20 | Oluwaseun Okafor | Criminl | CASE-9124 | 200 | Chicago |\\n\\n \\n Your employee wrote a document with the mappings: \\n \\n - Source column name: case_date\\n Target column name: CaseDate\\n Value transformations: None\\n Explanation: The source column \\\"case_date\\\" should map to the target column \\\"CaseDate\\\" as it represents the date of the case. No value transformations are required.\\n\\n- Source column name: lastname\\n Target column name: FullName\\n Value transformations: None\\n Explanation: The source column \\\"lastname\\\" should map to the target column \\\"FullName\\\" as it represents the last name of the individual involved in the case. No value transformations are required.\\n\\n- Source column name: firstname\\n Target column name: FullName\\n Value transformations: None\\n Explanation: The source column \\\"firstname\\\" should also map to the target column \\\"FullName\\\" as it represents the first name of the individual involved in the case. No value transformations are required.\\n\\n- Source column name: case_type\\n Target column name: CaseType\\n Value transformations: None\\n Explanation: The source column \\\"case_type\\\" should map to the target column \\\"CaseType\\\" as it represents the type of the case. No value transformations are required.\\n\\n- Source column name: case_id\\n Target column name: CaseID\\n Value transformations: \\\"CR-\\\" prefix should be removed from the source values.\\n Explanation: The source column \\\"case_id\\\" should map to the target column \\\"CaseID\\\" as it represents the unique identifier of the case. However, the source values have a \\\"CR-\\\" prefix that should be removed to match the target format.\\n\\n- Source column name: court_fee\\n Target column name: Fee\\n Value transformations: None\\n Explanation: The source column \\\"court_fee\\\" should map to the target column \\\"Fee\\\" as it represents the fee associated with the case. No value transformations are required.\\n\\n- Source column name: jurisdiction\\n Target column name: Jurisdiction\\n Value transformations: Capitalize the first letter of each word in the source values.\\n Explanation: The source column \\\"jurisdiction\\\" should map to the target column \\\"Jurisdiction\\\" as it represents the jurisdiction where the case is held. However, the source values are not consistently capitalized, so they should be transformed to capitalize the first letter of each word to match the target format.\\n \\n Please review their work, think step by step about their proposal, and then revise it with improvements.\\n Note: your employee had access to different rows than you did. Neither one of you had access to the entire dataset.\\n Your final output should be a CSV with these columns: source_column_name, target_column_name, value_transformation, notes.\\n Be as precise as possible. All of these values are case sensitive. Do not do any transformation or imputation yourself: your job is to explain the data, not change it.\\n Do not include any commas or quotes in the output, except for the CSV delimiters.\\n Instead of quotes, use backticks.\\n Your response:\\n\",\n", + " \"additional_kwargs\": {}\n", + " }\n", + " }\n", + " ]\n", + " }\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 3:llm:ChatOpenAI] Entering LLM run with input:\n", + "\u001b[0m{\n", + " \"prompts\": [\n", + " \"Human: \\n You are a Senior Data Scientist. You are generating a CSV that will be used by a Software Engineer to generate python code to transform one CSV file format to another.\\n \\n Here is the tail of a CSV file, called source_1_csv: \\n \\n \\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 95 | 2023-08-13 | Rodriguez | Jane | Familly | CR-8883 | 200 | chicago |\\n| 96 | 2023-03-27 | Patel | John | Familly | CR-2838 | 100 | BOSTON |\\n| 97 | 2023-07-27 | Okafor | Miguel | Criminal | CR-3885 | 250 | new York |\\n| 98 | 2023-01-14 | Ivanov | Alan | Family | CR-1066 | 250 | houston |\\n| 99 | 2023-03-20 | Okafor | Oluwaseun | Criminl | CR-9851 | 200 | chicago |\\n\\n \\n Here are five from another, called target_csv: \\n \\n \\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-----------------|:-----------|:----------|------:|:---------------|\\n| 95 | 2023-08-13 | Jane Rodriguez | Family | CASE-1703 | 200 | Chicago |\\n| 96 | 2023-03-27 | John Patel | Family | CASE-7934 | 100 | Boston |\\n| 97 | 2023-07-27 | Miguel Okafor | Criminal | CASE-3842 | 250 | New York |\\n| 98 | 2023-01-14 | Alan Ivanov | Family | CASE-1295 | 250 | Houston |\\n| 99 | 2023-03-20 | Oluwaseun Okafor | Criminl | CASE-9124 | 200 | Chicago |\\n\\n \\n Your employee wrote a document with the mappings: \\n \\n - Source column name: case_date\\n Target column name: CaseDate\\n Value transformations: None\\n Explanation: The source column \\\"case_date\\\" should map to the target column \\\"CaseDate\\\" as it represents the date of the case. No value transformations are required.\\n\\n- Source column name: lastname\\n Target column name: FullName\\n Value transformations: None\\n Explanation: The source column \\\"lastname\\\" should map to the target column \\\"FullName\\\" as it represents the last name of the individual involved in the case. No value transformations are required.\\n\\n- Source column name: firstname\\n Target column name: FullName\\n Value transformations: None\\n Explanation: The source column \\\"firstname\\\" should also map to the target column \\\"FullName\\\" as it represents the first name of the individual involved in the case. No value transformations are required.\\n\\n- Source column name: case_type\\n Target column name: CaseType\\n Value transformations: None\\n Explanation: The source column \\\"case_type\\\" should map to the target column \\\"CaseType\\\" as it represents the type of the case. No value transformations are required.\\n\\n- Source column name: case_id\\n Target column name: CaseID\\n Value transformations: \\\"CR-\\\" prefix should be removed from the source values.\\n Explanation: The source column \\\"case_id\\\" should map to the target column \\\"CaseID\\\" as it represents the unique identifier of the case. However, the source values have a \\\"CR-\\\" prefix that should be removed to match the target format.\\n\\n- Source column name: court_fee\\n Target column name: Fee\\n Value transformations: None\\n Explanation: The source column \\\"court_fee\\\" should map to the target column \\\"Fee\\\" as it represents the fee associated with the case. No value transformations are required.\\n\\n- Source column name: jurisdiction\\n Target column name: Jurisdiction\\n Value transformations: Capitalize the first letter of each word in the source values.\\n Explanation: The source column \\\"jurisdiction\\\" should map to the target column \\\"Jurisdiction\\\" as it represents the jurisdiction where the case is held. However, the source values are not consistently capitalized, so they should be transformed to capitalize the first letter of each word to match the target format.\\n \\n Please review their work, think step by step about their proposal, and then revise it with improvements.\\n Note: your employee had access to different rows than you did. Neither one of you had access to the entire dataset.\\n Your final output should be a CSV with these columns: source_column_name, target_column_name, value_transformation, notes.\\n Be as precise as possible. All of these values are case sensitive. Do not do any transformation or imputation yourself: your job is to explain the data, not change it.\\n Do not include any commas or quotes in the output, except for the CSV delimiters.\\n Instead of quotes, use backticks.\\n Your response:\"\n", + " ]\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 3:llm:ChatOpenAI] [11.34s] Exiting LLM run with output:\n", + "\u001b[0m{\n", + " \"generations\": [\n", + " [\n", + " {\n", + " \"text\": \"source_column_name,target_column_name,value_transformation,notes\\ncase_date,CaseDate,None,\\\"The source column `case_date` should map to the target column `CaseDate` as it represents the date of the case. No value transformations are required.\\\"\\nlastname,FullName,None,\\\"The source column `lastname` should map to the target column `FullName` as it represents the last name of the individual involved in the case. No value transformations are required.\\\"\\nfirstname,FullName,None,\\\"The source column `firstname` should also map to the target column `FullName` as it represents the first name of the individual involved in the case. No value transformations are required.\\\"\\ncase_type,CaseType,None,\\\"The source column `case_type` should map to the target column `CaseType` as it represents the type of the case. No value transformations are required.\\\"\\ncase_id,CaseID,Remove \\\"CR-\\\" prefix,\\\"The source column `case_id` should map to the target column `CaseID` as it represents the unique identifier of the case. The `CR-` prefix in the source values should be removed to match the target format.\\\"\\ncourt_fee,Fee,None,\\\"The source column `court_fee` should map to the target column `Fee` as it represents the fee associated with the case. No value transformations are required.\\\"\\njurisdiction,Jurisdiction,Capitalize first letter of each word,\\\"The source column `jurisdiction` should map to the target column `Jurisdiction` as it represents the jurisdiction where the case is held. The source values should be transformed to capitalize the first letter of each word to match the target format.\\\"\",\n", + " \"generation_info\": {\n", + " \"finish_reason\": \"stop\"\n", + " },\n", + " \"message\": {\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"schema\",\n", + " \"messages\",\n", + " \"AIMessage\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"content\": \"source_column_name,target_column_name,value_transformation,notes\\ncase_date,CaseDate,None,\\\"The source column `case_date` should map to the target column `CaseDate` as it represents the date of the case. No value transformations are required.\\\"\\nlastname,FullName,None,\\\"The source column `lastname` should map to the target column `FullName` as it represents the last name of the individual involved in the case. No value transformations are required.\\\"\\nfirstname,FullName,None,\\\"The source column `firstname` should also map to the target column `FullName` as it represents the first name of the individual involved in the case. No value transformations are required.\\\"\\ncase_type,CaseType,None,\\\"The source column `case_type` should map to the target column `CaseType` as it represents the type of the case. No value transformations are required.\\\"\\ncase_id,CaseID,Remove \\\"CR-\\\" prefix,\\\"The source column `case_id` should map to the target column `CaseID` as it represents the unique identifier of the case. The `CR-` prefix in the source values should be removed to match the target format.\\\"\\ncourt_fee,Fee,None,\\\"The source column `court_fee` should map to the target column `Fee` as it represents the fee associated with the case. No value transformations are required.\\\"\\njurisdiction,Jurisdiction,Capitalize first letter of each word,\\\"The source column `jurisdiction` should map to the target column `Jurisdiction` as it represents the jurisdiction where the case is held. The source values should be transformed to capitalize the first letter of each word to match the target format.\\\"\",\n", + " \"additional_kwargs\": {}\n", + " }\n", + " }\n", + " }\n", + " ]\n", + " ],\n", + " \"llm_output\": {\n", + " \"token_usage\": {\n", + " \"prompt_tokens\": 1163,\n", + " \"completion_tokens\": 329,\n", + " \"total_tokens\": 1492\n", + " },\n", + " \"model_name\": \"gpt-3.5-turbo\"\n", + " },\n", + " \"run\": null\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 4:parser:StrOutputParser] Entering Parser run with input:\n", + "\u001b[0m[inputs]\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 4:parser:StrOutputParser] [0ms] Exiting Parser run with output:\n", + "\u001b[0m{\n", + " \"output\": \"source_column_name,target_column_name,value_transformation,notes\\ncase_date,CaseDate,None,\\\"The source column `case_date` should map to the target column `CaseDate` as it represents the date of the case. No value transformations are required.\\\"\\nlastname,FullName,None,\\\"The source column `lastname` should map to the target column `FullName` as it represents the last name of the individual involved in the case. No value transformations are required.\\\"\\nfirstname,FullName,None,\\\"The source column `firstname` should also map to the target column `FullName` as it represents the first name of the individual involved in the case. No value transformations are required.\\\"\\ncase_type,CaseType,None,\\\"The source column `case_type` should map to the target column `CaseType` as it represents the type of the case. No value transformations are required.\\\"\\ncase_id,CaseID,Remove \\\"CR-\\\" prefix,\\\"The source column `case_id` should map to the target column `CaseID` as it represents the unique identifier of the case. The `CR-` prefix in the source values should be removed to match the target format.\\\"\\ncourt_fee,Fee,None,\\\"The source column `court_fee` should map to the target column `Fee` as it represents the fee associated with the case. No value transformations are required.\\\"\\njurisdiction,Jurisdiction,Capitalize first letter of each word,\\\"The source column `jurisdiction` should map to the target column `Jurisdiction` as it represents the jurisdiction where the case is held. The source values should be transformed to capitalize the first letter of each word to match the target format.\\\"\"\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence] [11.35s] Exiting Chain run with output:\n", + "\u001b[0m{\n", + " \"output\": \"source_column_name,target_column_name,value_transformation,notes\\ncase_date,CaseDate,None,\\\"The source column `case_date` should map to the target column `CaseDate` as it represents the date of the case. No value transformations are required.\\\"\\nlastname,FullName,None,\\\"The source column `lastname` should map to the target column `FullName` as it represents the last name of the individual involved in the case. No value transformations are required.\\\"\\nfirstname,FullName,None,\\\"The source column `firstname` should also map to the target column `FullName` as it represents the first name of the individual involved in the case. No value transformations are required.\\\"\\ncase_type,CaseType,None,\\\"The source column `case_type` should map to the target column `CaseType` as it represents the type of the case. No value transformations are required.\\\"\\ncase_id,CaseID,Remove \\\"CR-\\\" prefix,\\\"The source column `case_id` should map to the target column `CaseID` as it represents the unique identifier of the case. The `CR-` prefix in the source values should be removed to match the target format.\\\"\\ncourt_fee,Fee,None,\\\"The source column `court_fee` should map to the target column `Fee` as it represents the fee associated with the case. No value transformations are required.\\\"\\njurisdiction,Jurisdiction,Capitalize first letter of each word,\\\"The source column `jurisdiction` should map to the target column `Jurisdiction` as it represents the jurisdiction where the case is held. The source values should be transformed to capitalize the first letter of each word to match the target format.\\\"\"\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence] Entering Chain run with input:\n", + "\u001b[0m{\n", + " \"csv_str\": \"source_column_name,target_column_name,value_transformation,notes\\ncase_date,CaseDate,None,\\\"The source column `case_date` should map to the target column `CaseDate` as it represents the date of the case. No value transformations are required.\\\"\\nlastname,FullName,None,\\\"The source column `lastname` should map to the target column `FullName` as it represents the last name of the individual involved in the case. No value transformations are required.\\\"\\nfirstname,FullName,None,\\\"The source column `firstname` should also map to the target column `FullName` as it represents the first name of the individual involved in the case. No value transformations are required.\\\"\\ncase_type,CaseType,None,\\\"The source column `case_type` should map to the target column `CaseType` as it represents the type of the case. No value transformations are required.\\\"\\ncase_id,CaseID,Remove \\\"CR-\\\" prefix,\\\"The source column `case_id` should map to the target column `CaseID` as it represents the unique identifier of the case. The `CR-` prefix in the source values should be removed to match the target format.\\\"\\ncourt_fee,Fee,None,\\\"The source column `court_fee` should map to the target column `Fee` as it represents the fee associated with the case. No value transformations are required.\\\"\\njurisdiction,Jurisdiction,Capitalize first letter of each word,\\\"The source column `jurisdiction` should map to the target column `Jurisdiction` as it represents the jurisdiction where the case is held. The source values should be transformed to capitalize the first letter of each word to match the target format.\\\"\"\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] Entering Prompt run with input:\n", + "\u001b[0m{\n", + " \"csv_str\": \"source_column_name,target_column_name,value_transformation,notes\\ncase_date,CaseDate,None,\\\"The source column `case_date` should map to the target column `CaseDate` as it represents the date of the case. No value transformations are required.\\\"\\nlastname,FullName,None,\\\"The source column `lastname` should map to the target column `FullName` as it represents the last name of the individual involved in the case. No value transformations are required.\\\"\\nfirstname,FullName,None,\\\"The source column `firstname` should also map to the target column `FullName` as it represents the first name of the individual involved in the case. No value transformations are required.\\\"\\ncase_type,CaseType,None,\\\"The source column `case_type` should map to the target column `CaseType` as it represents the type of the case. No value transformations are required.\\\"\\ncase_id,CaseID,Remove \\\"CR-\\\" prefix,\\\"The source column `case_id` should map to the target column `CaseID` as it represents the unique identifier of the case. The `CR-` prefix in the source values should be removed to match the target format.\\\"\\ncourt_fee,Fee,None,\\\"The source column `court_fee` should map to the target column `Fee` as it represents the fee associated with the case. No value transformations are required.\\\"\\njurisdiction,Jurisdiction,Capitalize first letter of each word,\\\"The source column `jurisdiction` should map to the target column `Jurisdiction` as it represents the jurisdiction where the case is held. The source values should be transformed to capitalize the first letter of each word to match the target format.\\\"\"\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] [0ms] Exiting Prompt run with output:\n", + "\u001b[0m{\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"prompts\",\n", + " \"chat\",\n", + " \"ChatPromptValue\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"messages\": [\n", + " {\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"schema\",\n", + " \"messages\",\n", + " \"HumanMessage\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"content\": \"\\n Convert the following csv-like string into valid json:\\n source_column_name,target_column_name,value_transformation,notes\\ncase_date,CaseDate,None,\\\"The source column `case_date` should map to the target column `CaseDate` as it represents the date of the case. No value transformations are required.\\\"\\nlastname,FullName,None,\\\"The source column `lastname` should map to the target column `FullName` as it represents the last name of the individual involved in the case. No value transformations are required.\\\"\\nfirstname,FullName,None,\\\"The source column `firstname` should also map to the target column `FullName` as it represents the first name of the individual involved in the case. No value transformations are required.\\\"\\ncase_type,CaseType,None,\\\"The source column `case_type` should map to the target column `CaseType` as it represents the type of the case. No value transformations are required.\\\"\\ncase_id,CaseID,Remove \\\"CR-\\\" prefix,\\\"The source column `case_id` should map to the target column `CaseID` as it represents the unique identifier of the case. The `CR-` prefix in the source values should be removed to match the target format.\\\"\\ncourt_fee,Fee,None,\\\"The source column `court_fee` should map to the target column `Fee` as it represents the fee associated with the case. No value transformations are required.\\\"\\njurisdiction,Jurisdiction,Capitalize first letter of each word,\\\"The source column `jurisdiction` should map to the target column `Jurisdiction` as it represents the jurisdiction where the case is held. The source values should be transformed to capitalize the first letter of each word to match the target format.\\\"\\n Steps: 1. infer schema, 2. remove newlines or any other characters that are not json-serializable, 3. convert to valid json, 4. double check your work and make revisions as needed. \\n Your output should be 100% valid json. Do not transform the values, just the structure.\\n \",\n", + " \"additional_kwargs\": {}\n", + " }\n", + " }\n", + " ]\n", + " }\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 3:llm:ChatOpenAI] Entering LLM run with input:\n", + "\u001b[0m{\n", + " \"prompts\": [\n", + " \"Human: \\n Convert the following csv-like string into valid json:\\n source_column_name,target_column_name,value_transformation,notes\\ncase_date,CaseDate,None,\\\"The source column `case_date` should map to the target column `CaseDate` as it represents the date of the case. No value transformations are required.\\\"\\nlastname,FullName,None,\\\"The source column `lastname` should map to the target column `FullName` as it represents the last name of the individual involved in the case. No value transformations are required.\\\"\\nfirstname,FullName,None,\\\"The source column `firstname` should also map to the target column `FullName` as it represents the first name of the individual involved in the case. No value transformations are required.\\\"\\ncase_type,CaseType,None,\\\"The source column `case_type` should map to the target column `CaseType` as it represents the type of the case. No value transformations are required.\\\"\\ncase_id,CaseID,Remove \\\"CR-\\\" prefix,\\\"The source column `case_id` should map to the target column `CaseID` as it represents the unique identifier of the case. The `CR-` prefix in the source values should be removed to match the target format.\\\"\\ncourt_fee,Fee,None,\\\"The source column `court_fee` should map to the target column `Fee` as it represents the fee associated with the case. No value transformations are required.\\\"\\njurisdiction,Jurisdiction,Capitalize first letter of each word,\\\"The source column `jurisdiction` should map to the target column `Jurisdiction` as it represents the jurisdiction where the case is held. The source values should be transformed to capitalize the first letter of each word to match the target format.\\\"\\n Steps: 1. infer schema, 2. remove newlines or any other characters that are not json-serializable, 3. convert to valid json, 4. double check your work and make revisions as needed. \\n Your output should be 100% valid json. Do not transform the values, just the structure.\"\n", + " ]\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 3:llm:ChatOpenAI] [19.15s] Exiting LLM run with output:\n", + "\u001b[0m{\n", + " \"generations\": [\n", + " [\n", + " {\n", + " \"text\": \"[\\n {\\n \\\"source_column_name\\\": \\\"case_date\\\",\\n \\\"target_column_name\\\": \\\"CaseDate\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The source column `case_date` should map to the target column `CaseDate` as it represents the date of the case. No value transformations are required.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"lastname\\\",\\n \\\"target_column_name\\\": \\\"FullName\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The source column `lastname` should map to the target column `FullName` as it represents the last name of the individual involved in the case. No value transformations are required.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"firstname\\\",\\n \\\"target_column_name\\\": \\\"FullName\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The source column `firstname` should also map to the target column `FullName` as it represents the first name of the individual involved in the case. No value transformations are required.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"case_type\\\",\\n \\\"target_column_name\\\": \\\"CaseType\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The source column `case_type` should map to the target column `CaseType` as it represents the type of the case. No value transformations are required.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"case_id\\\",\\n \\\"target_column_name\\\": \\\"CaseID\\\",\\n \\\"value_transformation\\\": \\\"Remove \\\\\\\"CR-\\\\\\\" prefix\\\",\\n \\\"notes\\\": \\\"The source column `case_id` should map to the target column `CaseID` as it represents the unique identifier of the case. The `CR-` prefix in the source values should be removed to match the target format.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"court_fee\\\",\\n \\\"target_column_name\\\": \\\"Fee\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The source column `court_fee` should map to the target column `Fee` as it represents the fee associated with the case. No value transformations are required.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"jurisdiction\\\",\\n \\\"target_column_name\\\": \\\"Jurisdiction\\\",\\n \\\"value_transformation\\\": \\\"Capitalize first letter of each word\\\",\\n \\\"notes\\\": \\\"The source column `jurisdiction` should map to the target column `Jurisdiction` as it represents the jurisdiction where the case is held. The source values should be transformed to capitalize the first letter of each word to match the target format.\\\"\\n }\\n]\",\n", + " \"generation_info\": {\n", + " \"finish_reason\": \"stop\"\n", + " },\n", + " \"message\": {\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"schema\",\n", + " \"messages\",\n", + " \"AIMessage\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"content\": \"[\\n {\\n \\\"source_column_name\\\": \\\"case_date\\\",\\n \\\"target_column_name\\\": \\\"CaseDate\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The source column `case_date` should map to the target column `CaseDate` as it represents the date of the case. No value transformations are required.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"lastname\\\",\\n \\\"target_column_name\\\": \\\"FullName\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The source column `lastname` should map to the target column `FullName` as it represents the last name of the individual involved in the case. No value transformations are required.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"firstname\\\",\\n \\\"target_column_name\\\": \\\"FullName\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The source column `firstname` should also map to the target column `FullName` as it represents the first name of the individual involved in the case. No value transformations are required.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"case_type\\\",\\n \\\"target_column_name\\\": \\\"CaseType\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The source column `case_type` should map to the target column `CaseType` as it represents the type of the case. No value transformations are required.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"case_id\\\",\\n \\\"target_column_name\\\": \\\"CaseID\\\",\\n \\\"value_transformation\\\": \\\"Remove \\\\\\\"CR-\\\\\\\" prefix\\\",\\n \\\"notes\\\": \\\"The source column `case_id` should map to the target column `CaseID` as it represents the unique identifier of the case. The `CR-` prefix in the source values should be removed to match the target format.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"court_fee\\\",\\n \\\"target_column_name\\\": \\\"Fee\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The source column `court_fee` should map to the target column `Fee` as it represents the fee associated with the case. No value transformations are required.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"jurisdiction\\\",\\n \\\"target_column_name\\\": \\\"Jurisdiction\\\",\\n \\\"value_transformation\\\": \\\"Capitalize first letter of each word\\\",\\n \\\"notes\\\": \\\"The source column `jurisdiction` should map to the target column `Jurisdiction` as it represents the jurisdiction where the case is held. The source values should be transformed to capitalize the first letter of each word to match the target format.\\\"\\n }\\n]\",\n", + " \"additional_kwargs\": {}\n", + " }\n", + " }\n", + " }\n", + " ]\n", + " ],\n", + " \"llm_output\": {\n", + " \"token_usage\": {\n", + " \"prompt_tokens\": 420,\n", + " \"completion_tokens\": 535,\n", + " \"total_tokens\": 955\n", + " },\n", + " \"model_name\": \"gpt-3.5-turbo\"\n", + " },\n", + " \"run\": null\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 4:parser:StrOutputParser] Entering Parser run with input:\n", + "\u001b[0m[inputs]\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 4:parser:StrOutputParser] [0ms] Exiting Parser run with output:\n", + "\u001b[0m{\n", + " \"output\": \"[\\n {\\n \\\"source_column_name\\\": \\\"case_date\\\",\\n \\\"target_column_name\\\": \\\"CaseDate\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The source column `case_date` should map to the target column `CaseDate` as it represents the date of the case. No value transformations are required.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"lastname\\\",\\n \\\"target_column_name\\\": \\\"FullName\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The source column `lastname` should map to the target column `FullName` as it represents the last name of the individual involved in the case. No value transformations are required.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"firstname\\\",\\n \\\"target_column_name\\\": \\\"FullName\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The source column `firstname` should also map to the target column `FullName` as it represents the first name of the individual involved in the case. No value transformations are required.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"case_type\\\",\\n \\\"target_column_name\\\": \\\"CaseType\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The source column `case_type` should map to the target column `CaseType` as it represents the type of the case. No value transformations are required.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"case_id\\\",\\n \\\"target_column_name\\\": \\\"CaseID\\\",\\n \\\"value_transformation\\\": \\\"Remove \\\\\\\"CR-\\\\\\\" prefix\\\",\\n \\\"notes\\\": \\\"The source column `case_id` should map to the target column `CaseID` as it represents the unique identifier of the case. The `CR-` prefix in the source values should be removed to match the target format.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"court_fee\\\",\\n \\\"target_column_name\\\": \\\"Fee\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The source column `court_fee` should map to the target column `Fee` as it represents the fee associated with the case. No value transformations are required.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"jurisdiction\\\",\\n \\\"target_column_name\\\": \\\"Jurisdiction\\\",\\n \\\"value_transformation\\\": \\\"Capitalize first letter of each word\\\",\\n \\\"notes\\\": \\\"The source column `jurisdiction` should map to the target column `Jurisdiction` as it represents the jurisdiction where the case is held. The source values should be transformed to capitalize the first letter of each word to match the target format.\\\"\\n }\\n]\"\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence] [19.15s] Exiting Chain run with output:\n", + "\u001b[0m{\n", + " \"output\": \"[\\n {\\n \\\"source_column_name\\\": \\\"case_date\\\",\\n \\\"target_column_name\\\": \\\"CaseDate\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The source column `case_date` should map to the target column `CaseDate` as it represents the date of the case. No value transformations are required.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"lastname\\\",\\n \\\"target_column_name\\\": \\\"FullName\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The source column `lastname` should map to the target column `FullName` as it represents the last name of the individual involved in the case. No value transformations are required.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"firstname\\\",\\n \\\"target_column_name\\\": \\\"FullName\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The source column `firstname` should also map to the target column `FullName` as it represents the first name of the individual involved in the case. No value transformations are required.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"case_type\\\",\\n \\\"target_column_name\\\": \\\"CaseType\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The source column `case_type` should map to the target column `CaseType` as it represents the type of the case. No value transformations are required.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"case_id\\\",\\n \\\"target_column_name\\\": \\\"CaseID\\\",\\n \\\"value_transformation\\\": \\\"Remove \\\\\\\"CR-\\\\\\\" prefix\\\",\\n \\\"notes\\\": \\\"The source column `case_id` should map to the target column `CaseID` as it represents the unique identifier of the case. The `CR-` prefix in the source values should be removed to match the target format.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"court_fee\\\",\\n \\\"target_column_name\\\": \\\"Fee\\\",\\n \\\"value_transformation\\\": \\\"None\\\",\\n \\\"notes\\\": \\\"The source column `court_fee` should map to the target column `Fee` as it represents the fee associated with the case. No value transformations are required.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"jurisdiction\\\",\\n \\\"target_column_name\\\": \\\"Jurisdiction\\\",\\n \\\"value_transformation\\\": \\\"Capitalize first letter of each word\\\",\\n \\\"notes\\\": \\\"The source column `jurisdiction` should map to the target column `Jurisdiction` as it represents the jurisdiction where the case is held. The source values should be transformed to capitalize the first letter of each word to match the target format.\\\"\\n }\\n]\"\n", + "}\n" + ] + } + ], + "source": [ + "model = ChatOpenAI(model_name='gpt-3.5-turbo')\n", + "analyst_prompt = ChatPromptTemplate.from_template(analyst_prompt_str)\n", + "scientist_prompt = ChatPromptTemplate.from_template(scientist_prompt_str)\n", + "\n", + "chain_1 = analyst_prompt | model | StrOutputParser()\n", + "employee_script_str = chain_1.invoke({\"source_1_csv_str\": get_data_str_from_df_for_prompt(entries_a_df), \"target_csv_str\": get_data_str_from_df_for_prompt(template_df)})\n", + "\n", + "chain_2 = scientist_prompt | model | StrOutputParser()\n", + "scientist_csv_str = chain_2.invoke({\"source_1_csv_str\": get_data_str_from_df_for_prompt(entries_a_df, False), \"target_csv_str\": get_data_str_from_df_for_prompt(template_df, False), \"employee_script_str\": employee_script_str})\n", + "csv_prompt = ChatPromptTemplate.from_template(csv_convert_prompt_str)\n", + "chain_3 = csv_prompt| model | StrOutputParser()\n", + "# python_prompt = ChatPromptTemplate.from_template(python_prompt_str)\n", + "# chain_4 = {\"csv_string\": chain_3} | python_prompt | model | StrOutputParser() | _sanitize_output | PythonREPL().run\n", + "# csv_bytes = io.BytesIO(scientist_csv_str.encode())\n", + "res = chain_3.invoke({\"csv_str\": scientist_csv_str})" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'[\\n {\\n \"source_column_name\": \"case_date\",\\n \"target_column_name\": \"CaseDate\",\\n \"value_transformation\": \"None\",\\n \"notes\": \"The source column `case_date` should map to the target column `CaseDate` as it represents the date of the case. No value transformations are required.\"\\n },\\n {\\n \"source_column_name\": \"lastname\",\\n \"target_column_name\": \"FullName\",\\n \"value_transformation\": \"None\",\\n \"notes\": \"The source column `lastname` should map to the target column `FullName` as it represents the last name of the individual involved in the case. No value transformations are required.\"\\n },\\n {\\n \"source_column_name\": \"firstname\",\\n \"target_column_name\": \"FullName\",\\n \"value_transformation\": \"None\",\\n \"notes\": \"The source column `firstname` should also map to the target column `FullName` as it represents the first name of the individual involved in the case. No value transformations are required.\"\\n },\\n {\\n \"source_column_name\": \"case_type\",\\n \"target_column_name\": \"CaseType\",\\n \"value_transformation\": \"None\",\\n \"notes\": \"The source column `case_type` should map to the target column `CaseType` as it represents the type of the case. No value transformations are required.\"\\n },\\n {\\n \"source_column_name\": \"case_id\",\\n \"target_column_name\": \"CaseID\",\\n \"value_transformation\": \"Remove \\\\\"CR-\\\\\" prefix\",\\n \"notes\": \"The source column `case_id` should map to the target column `CaseID` as it represents the unique identifier of the case. The `CR-` prefix in the source values should be removed to match the target format.\"\\n },\\n {\\n \"source_column_name\": \"court_fee\",\\n \"target_column_name\": \"Fee\",\\n \"value_transformation\": \"None\",\\n \"notes\": \"The source column `court_fee` should map to the target column `Fee` as it represents the fee associated with the case. No value transformations are required.\"\\n },\\n {\\n \"source_column_name\": \"jurisdiction\",\\n \"target_column_name\": \"Jurisdiction\",\\n \"value_transformation\": \"Capitalize first letter of each word\",\\n \"notes\": \"The source column `jurisdiction` should map to the target column `Jurisdiction` as it represents the jurisdiction where the case is held. The source values should be transformed to capitalize the first letter of each word to match the target format.\"\\n }\\n]'" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.output_parsers import PydanticOutputParser\n", + "from pydantic import BaseModel, Field, validator\n", + "from langchain.prompts import (\n", + " ChatPromptTemplate,\n", + ")\n", + "from pydantic import BaseModel, Field, validator, ValidationError\n", + "\n", + "class TableMappingEntry(BaseModel):\n", + " source_column_name: str = Field(..., description=\"Name of the column in the source table.\")\n", + " target_column_name: str = Field(..., description=\"Name of the column in the target table.\")\n", + " value_transformations: str = Field(None, description=\"The transformations that need to be applied to the source values to make it match the target values, if necessary.\")\n", + " explanation: str = Field(None, description=\"A brief explanation of why the source column maps to the target column. Include any other relevant information.\")\n", + "\n", + " @validator(\"source_column_name\", \"target_column_name\", \"value_transformations\", \"explanation\", pre=True, always=True)\n", + " def check_forbidden_characters(cls, value):\n", + " if value:\n", + " forbidden_chars = [\",\", \"\\\"\", \"'\"]\n", + " for char in forbidden_chars:\n", + " if char in value:\n", + " raise ValueError(f\"Field contains forbidden character: {char}\")\n", + " return value\n", + "\n", + "class TableMapping(BaseModel):\n", + " table_mappings: list[TableMappingEntry] = Field(..., description=\"A list of table mappings.\")\n", + "\n", + " @validator('table_mappings', pre=True, always=True)\n", + " def check_unique_source_and_target_names(cls, value):\n", + " source_columns = [entry.source_column_name for entry in value]\n", + " target_columns = [entry.target_column_name for entry in value]\n", + " \n", + " if len(source_columns) != len(set(source_columns)):\n", + " raise ValueError(\"Duplicate source column names detected.\")\n", + " if len(target_columns) != len(set(target_columns)):\n", + " raise ValueError(\"Duplicate target column names detected.\")\n", + " \n", + " return value\n", + "\n", + "# Test the models\n", + "try:\n", + " entry = TableMappingEntry(source_column_name=\"FirstName,LastName\", target_column_name=\"FullName\")\n", + "except ValidationError as e:\n", + " print(e)\n", + " \n", + "analyst_prompt_str = '''\n", + " You are a Data Scientist, who specializes in generating mappings between schemas for use by Software Engineers in ETL pipelines.\n", + " Head of `source_csv`:\n", + " \n", + " {source_1_csv_str}\n", + " \n", + " Head of `target_csv`:\n", + " \n", + " {target_csv_str}\n", + " \n", + " Your job is to generate a thorough, precise summary of exactly which columns in `source_csv` map to columns in `target_csv`.\n", + " In addition, compare cell values and note any formatting differences. For example:\n", + " - If date in `source_csv` is `2020-01-01` and date in `target_csv` is `01/01/2020`, you should mention that it's the same date, but structured differently.\n", + " - If `source_csv` has columns \"FirstName\" and \"LastName\", but `target_csv` only has \"FullName\", you should include both of the first columns as entries, but also explain how they should be transform to adhere to the target column.\n", + "\n", + " Lastly, point out any other oddities, such as duplicate columns, erroneous columns, etc.\n", + " \n", + " {format_instructions}\n", + " \n", + " Remember:\n", + " - Be concise: you are speaking to engineers, not customers.\n", + " - Be precise: all of these values are case sensitive. \n", + " - DO NOT transform or impute values: your job is to explain the data, not change it.\n", + " - DO NOT include commas, quotes, or any other characters that might interfere with JSON serialization or CSV generation\n", + "\n", + " Your response:\n", + "'''\n", + "\n", + "table_mapping_parser = PydanticOutputParser(pydantic_object=TableMapping)\n", + "analyst_prompt = ChatPromptTemplate.from_template(\n", + " template=analyst_prompt_str, \n", + " partial_variables={'format_instructions': table_mapping_parser.get_format_instructions()},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence] Entering Chain run with input:\n", + "\u001b[0m{\n", + " \"source_1_csv_str\": \"\\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\n\",\n", + " \"target_csv_str\": \"\\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n| 0 | 2023-05-12 | Miguel Kim | Civil | CASE-6761 | 100 | Boston |\\n| 1 | 2023-04-20 | John Lee | Criminl | CASE-6089 | 150 | Houston |\\n| 2 | 2023-02-10 | Dmitri Smith | Criminal | CASE-9565 | 200 | Chicago |\\n| 3 | 2023-03-16 | Dmitri Patel | Criminal | CASE-6222 | 100 | Boston |\\n| 4 | 2023-06-15 | Jane Ivanov | Family | CASE-2702 | 200 | Houston |\\n\"\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] Entering Prompt run with input:\n", + "\u001b[0m{\n", + " \"source_1_csv_str\": \"\\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\n\",\n", + " \"target_csv_str\": \"\\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n| 0 | 2023-05-12 | Miguel Kim | Civil | CASE-6761 | 100 | Boston |\\n| 1 | 2023-04-20 | John Lee | Criminl | CASE-6089 | 150 | Houston |\\n| 2 | 2023-02-10 | Dmitri Smith | Criminal | CASE-9565 | 200 | Chicago |\\n| 3 | 2023-03-16 | Dmitri Patel | Criminal | CASE-6222 | 100 | Boston |\\n| 4 | 2023-06-15 | Jane Ivanov | Family | CASE-2702 | 200 | Houston |\\n\"\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] [3ms] Exiting Prompt run with output:\n", + "\u001b[0m{\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"prompts\",\n", + " \"chat\",\n", + " \"ChatPromptValue\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"messages\": [\n", + " {\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"schema\",\n", + " \"messages\",\n", + " \"HumanMessage\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"content\": \"\\n You are a Data Scientist, who specializes in generating mappings between schemas for use by Software Engineers in ETL pipelines.\\n Head of `source_csv`:\\n \\n \\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\n\\n \\n Head of `target_csv`:\\n \\n \\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n| 0 | 2023-05-12 | Miguel Kim | Civil | CASE-6761 | 100 | Boston |\\n| 1 | 2023-04-20 | John Lee | Criminl | CASE-6089 | 150 | Houston |\\n| 2 | 2023-02-10 | Dmitri Smith | Criminal | CASE-9565 | 200 | Chicago |\\n| 3 | 2023-03-16 | Dmitri Patel | Criminal | CASE-6222 | 100 | Boston |\\n| 4 | 2023-06-15 | Jane Ivanov | Family | CASE-2702 | 200 | Houston |\\n\\n \\n Your job is to generate a thorough, precise summary of exactly which columns in `source_csv` map to columns in `target_csv`.\\n In addition, compare cell values and note any formatting differences. For example:\\n - If date in `source_csv` is `2020-01-01` and date in `target_csv` is `01/01/2020`, you should mention that it's the same date, but structured differently.\\n - If `source_csv` has columns \\\"FirstName\\\" and \\\"LastName\\\", but `target_csv` only has \\\"FullName\\\", you should include both of the first columns as entries, but also explain how they should be transform to adhere to the target column.\\n\\n Lastly, point out any other oddities, such as duplicate columns, erroneous columns, etc.\\n \\n The output should be formatted as a JSON instance that conforms to the JSON schema below.\\n\\nAs an example, for the schema {\\\"properties\\\": {\\\"foo\\\": {\\\"title\\\": \\\"Foo\\\", \\\"description\\\": \\\"a list of strings\\\", \\\"type\\\": \\\"array\\\", \\\"items\\\": {\\\"type\\\": \\\"string\\\"}}}, \\\"required\\\": [\\\"foo\\\"]}\\nthe object {\\\"foo\\\": [\\\"bar\\\", \\\"baz\\\"]} is a well-formatted instance of the schema. The object {\\\"properties\\\": {\\\"foo\\\": [\\\"bar\\\", \\\"baz\\\"]}} is not well-formatted.\\n\\nHere is the output schema:\\n```\\n{\\\"properties\\\": {\\\"table_mappings\\\": {\\\"title\\\": \\\"Table Mappings\\\", \\\"description\\\": \\\"A list of table mappings.\\\", \\\"type\\\": \\\"array\\\", \\\"items\\\": {\\\"$ref\\\": \\\"#/definitions/TableMappingEntry\\\"}}}, \\\"required\\\": [\\\"table_mappings\\\"], \\\"definitions\\\": {\\\"TableMappingEntry\\\": {\\\"title\\\": \\\"TableMappingEntry\\\", \\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"source_column_name\\\": {\\\"title\\\": \\\"Source Column Name\\\", \\\"description\\\": \\\"Name of the column in the source table.\\\", \\\"type\\\": \\\"string\\\"}, \\\"target_column_name\\\": {\\\"title\\\": \\\"Target Column Name\\\", \\\"description\\\": \\\"Name of the column in the target table.\\\", \\\"type\\\": \\\"string\\\"}, \\\"value_transformations\\\": {\\\"title\\\": \\\"Value Transformations\\\", \\\"description\\\": \\\"The transformations that need to be applied to the source values to make it match the target values, if necessary.\\\", \\\"type\\\": \\\"string\\\"}, \\\"explanation\\\": {\\\"title\\\": \\\"Explanation\\\", \\\"description\\\": \\\"A brief explanation of why the source column maps to the target column. Include any other relevant information.\\\", \\\"type\\\": \\\"string\\\"}}, \\\"required\\\": [\\\"source_column_name\\\", \\\"target_column_name\\\", \\\"value_transformations\\\", \\\"explanation\\\"]}}}\\n```\\n \\n Remember:\\n - Be concise: you are speaking to engineers, not customers.\\n - Be precise: all of these values are case sensitive. \\n - DO NOT transform or impute values: your job is to explain the data, not change it.\\n - DO NOT include commas, quotes, or any other characters that might interfere with JSON serialization or CSV generation\\n\\n Your response:\\n\",\n", + " \"additional_kwargs\": {}\n", + " }\n", + " }\n", + " ]\n", + " }\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 3:llm:ChatOpenAI] Entering LLM run with input:\n", + "\u001b[0m{\n", + " \"prompts\": [\n", + " \"Human: \\n You are a Data Scientist, who specializes in generating mappings between schemas for use by Software Engineers in ETL pipelines.\\n Head of `source_csv`:\\n \\n \\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\n\\n \\n Head of `target_csv`:\\n \\n \\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n| 0 | 2023-05-12 | Miguel Kim | Civil | CASE-6761 | 100 | Boston |\\n| 1 | 2023-04-20 | John Lee | Criminl | CASE-6089 | 150 | Houston |\\n| 2 | 2023-02-10 | Dmitri Smith | Criminal | CASE-9565 | 200 | Chicago |\\n| 3 | 2023-03-16 | Dmitri Patel | Criminal | CASE-6222 | 100 | Boston |\\n| 4 | 2023-06-15 | Jane Ivanov | Family | CASE-2702 | 200 | Houston |\\n\\n \\n Your job is to generate a thorough, precise summary of exactly which columns in `source_csv` map to columns in `target_csv`.\\n In addition, compare cell values and note any formatting differences. For example:\\n - If date in `source_csv` is `2020-01-01` and date in `target_csv` is `01/01/2020`, you should mention that it's the same date, but structured differently.\\n - If `source_csv` has columns \\\"FirstName\\\" and \\\"LastName\\\", but `target_csv` only has \\\"FullName\\\", you should include both of the first columns as entries, but also explain how they should be transform to adhere to the target column.\\n\\n Lastly, point out any other oddities, such as duplicate columns, erroneous columns, etc.\\n \\n The output should be formatted as a JSON instance that conforms to the JSON schema below.\\n\\nAs an example, for the schema {\\\"properties\\\": {\\\"foo\\\": {\\\"title\\\": \\\"Foo\\\", \\\"description\\\": \\\"a list of strings\\\", \\\"type\\\": \\\"array\\\", \\\"items\\\": {\\\"type\\\": \\\"string\\\"}}}, \\\"required\\\": [\\\"foo\\\"]}\\nthe object {\\\"foo\\\": [\\\"bar\\\", \\\"baz\\\"]} is a well-formatted instance of the schema. The object {\\\"properties\\\": {\\\"foo\\\": [\\\"bar\\\", \\\"baz\\\"]}} is not well-formatted.\\n\\nHere is the output schema:\\n```\\n{\\\"properties\\\": {\\\"table_mappings\\\": {\\\"title\\\": \\\"Table Mappings\\\", \\\"description\\\": \\\"A list of table mappings.\\\", \\\"type\\\": \\\"array\\\", \\\"items\\\": {\\\"$ref\\\": \\\"#/definitions/TableMappingEntry\\\"}}}, \\\"required\\\": [\\\"table_mappings\\\"], \\\"definitions\\\": {\\\"TableMappingEntry\\\": {\\\"title\\\": \\\"TableMappingEntry\\\", \\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"source_column_name\\\": {\\\"title\\\": \\\"Source Column Name\\\", \\\"description\\\": \\\"Name of the column in the source table.\\\", \\\"type\\\": \\\"string\\\"}, \\\"target_column_name\\\": {\\\"title\\\": \\\"Target Column Name\\\", \\\"description\\\": \\\"Name of the column in the target table.\\\", \\\"type\\\": \\\"string\\\"}, \\\"value_transformations\\\": {\\\"title\\\": \\\"Value Transformations\\\", \\\"description\\\": \\\"The transformations that need to be applied to the source values to make it match the target values, if necessary.\\\", \\\"type\\\": \\\"string\\\"}, \\\"explanation\\\": {\\\"title\\\": \\\"Explanation\\\", \\\"description\\\": \\\"A brief explanation of why the source column maps to the target column. Include any other relevant information.\\\", \\\"type\\\": \\\"string\\\"}}, \\\"required\\\": [\\\"source_column_name\\\", \\\"target_column_name\\\", \\\"value_transformations\\\", \\\"explanation\\\"]}}}\\n```\\n \\n Remember:\\n - Be concise: you are speaking to engineers, not customers.\\n - Be precise: all of these values are case sensitive. \\n - DO NOT transform or impute values: your job is to explain the data, not change it.\\n - DO NOT include commas, quotes, or any other characters that might interfere with JSON serialization or CSV generation\\n\\n Your response:\"\n", + " ]\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 3:llm:ChatOpenAI] [14.63s] Exiting LLM run with output:\n", + "\u001b[0m{\n", + " \"generations\": [\n", + " [\n", + " {\n", + " \"text\": \"{\\\"table_mappings\\\": [\\n {\\n \\\"source_column_name\\\": \\\"case_date\\\",\\n \\\"target_column_name\\\": \\\"CaseDate\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The source column 'case_date' maps to the target column 'CaseDate' without any value transformations.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"lastname\\\",\\n \\\"target_column_name\\\": \\\"FullName\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The source column 'lastname' maps to the target column 'FullName' without any value transformations.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"firstname\\\",\\n \\\"target_column_name\\\": \\\"\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The source column 'firstname' does not have a direct mapping in the target schema. The values from 'firstname' need to be combined with 'lastname' to form 'FullName' in the target schema.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"case_type\\\",\\n \\\"target_column_name\\\": \\\"CaseType\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The source column 'case_type' maps to the target column 'CaseType' without any value transformations.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"case_id\\\",\\n \\\"target_column_name\\\": \\\"CaseID\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The source column 'case_id' maps to the target column 'CaseID' without any value transformations.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"court_fee\\\",\\n \\\"target_column_name\\\": \\\"Fee\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The source column 'court_fee' maps to the target column 'Fee' without any value transformations.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"jurisdiction\\\",\\n \\\"target_column_name\\\": \\\"Jurisdiction\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The source column 'jurisdiction' maps to the target column 'Jurisdiction' without any value transformations.\\\"\\n }\\n]}\",\n", + " \"generation_info\": {\n", + " \"finish_reason\": \"stop\"\n", + " },\n", + " \"message\": {\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"schema\",\n", + " \"messages\",\n", + " \"AIMessage\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"content\": \"{\\\"table_mappings\\\": [\\n {\\n \\\"source_column_name\\\": \\\"case_date\\\",\\n \\\"target_column_name\\\": \\\"CaseDate\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The source column 'case_date' maps to the target column 'CaseDate' without any value transformations.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"lastname\\\",\\n \\\"target_column_name\\\": \\\"FullName\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The source column 'lastname' maps to the target column 'FullName' without any value transformations.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"firstname\\\",\\n \\\"target_column_name\\\": \\\"\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The source column 'firstname' does not have a direct mapping in the target schema. The values from 'firstname' need to be combined with 'lastname' to form 'FullName' in the target schema.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"case_type\\\",\\n \\\"target_column_name\\\": \\\"CaseType\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The source column 'case_type' maps to the target column 'CaseType' without any value transformations.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"case_id\\\",\\n \\\"target_column_name\\\": \\\"CaseID\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The source column 'case_id' maps to the target column 'CaseID' without any value transformations.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"court_fee\\\",\\n \\\"target_column_name\\\": \\\"Fee\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The source column 'court_fee' maps to the target column 'Fee' without any value transformations.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"jurisdiction\\\",\\n \\\"target_column_name\\\": \\\"Jurisdiction\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The source column 'jurisdiction' maps to the target column 'Jurisdiction' without any value transformations.\\\"\\n }\\n]}\",\n", + " \"additional_kwargs\": {}\n", + " }\n", + " }\n", + " }\n", + " ]\n", + " ],\n", + " \"llm_output\": {\n", + " \"token_usage\": {\n", + " \"prompt_tokens\": 1138,\n", + " \"completion_tokens\": 422,\n", + " \"total_tokens\": 1560\n", + " },\n", + " \"model_name\": \"gpt-3.5-turbo\"\n", + " },\n", + " \"run\": null\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 4:parser:PydanticOutputParser] Entering Parser run with input:\n", + "\u001b[0m[inputs]\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 4:parser:PydanticOutputParser] [1ms] Exiting Parser run with output:\n", + "\u001b[0m{\n", + " \"lc\": 1,\n", + " \"type\": \"not_implemented\",\n", + " \"id\": [\n", + " \"__main__\",\n", + " \"TableMapping\"\n", + " ]\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence] [14.66s] Exiting Chain run with output:\n", + "\u001b[0m[outputs]\n" + ] + } + ], + "source": [ + "chain_1 = analyst_prompt | model | table_mapping_parser\n", + "table_mapping: TableMapping = chain_1.invoke({\"source_1_csv_str\": get_data_str_from_df_for_prompt(entries_a_df), \"target_csv_str\": get_data_str_from_df_for_prompt(template_df)})" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence] Entering Chain run with input:\n", + "\u001b[0m{\n", + " \"source_1_csv_str\": \"\\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\n\",\n", + " \"target_csv_str\": \"\\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n| 0 | 2023-05-12 | Miguel Kim | Civil | CASE-6761 | 100 | Boston |\\n| 1 | 2023-04-20 | John Lee | Criminl | CASE-6089 | 150 | Houston |\\n| 2 | 2023-02-10 | Dmitri Smith | Criminal | CASE-9565 | 200 | Chicago |\\n| 3 | 2023-03-16 | Dmitri Patel | Criminal | CASE-6222 | 100 | Boston |\\n| 4 | 2023-06-15 | Jane Ivanov | Family | CASE-2702 | 200 | Houston |\\n\"\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] Entering Prompt run with input:\n", + "\u001b[0m{\n", + " \"source_1_csv_str\": \"\\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\n\",\n", + " \"target_csv_str\": \"\\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n| 0 | 2023-05-12 | Miguel Kim | Civil | CASE-6761 | 100 | Boston |\\n| 1 | 2023-04-20 | John Lee | Criminl | CASE-6089 | 150 | Houston |\\n| 2 | 2023-02-10 | Dmitri Smith | Criminal | CASE-9565 | 200 | Chicago |\\n| 3 | 2023-03-16 | Dmitri Patel | Criminal | CASE-6222 | 100 | Boston |\\n| 4 | 2023-06-15 | Jane Ivanov | Family | CASE-2702 | 200 | Houston |\\n\"\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] [0ms] Exiting Prompt run with output:\n", + "\u001b[0m{\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"prompts\",\n", + " \"chat\",\n", + " \"ChatPromptValue\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"messages\": [\n", + " {\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"schema\",\n", + " \"messages\",\n", + " \"HumanMessage\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"content\": \"\\n You are a Data Scientist, who specializes in tabular data and ETL pipelines. You are working with othere Data Scientists and Software Engineers to transform data from one source format to a target format.\\n Here is the head of a CSV file, called source_1_csv:\\n \\n \\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\n\\n \\n Here is the head of another, called target_csv:\\n \\n \\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n| 0 | 2023-05-12 | Miguel Kim | Civil | CASE-6761 | 100 | Boston |\\n| 1 | 2023-04-20 | John Lee | Criminl | CASE-6089 | 150 | Houston |\\n| 2 | 2023-02-10 | Dmitri Smith | Criminal | CASE-9565 | 200 | Chicago |\\n| 3 | 2023-03-16 | Dmitri Patel | Criminal | CASE-6222 | 100 | Boston |\\n| 4 | 2023-06-15 | Jane Ivanov | Family | CASE-2702 | 200 | Houston |\\n\\n \\n Your job is to generate a thorough, precise summary of exactly which columns in the source file should map to the target file, including a brief explanation for each.\\n In addition, if the cell values appear to be different, you should note that as well. For example, if the source date is formatted like this 2020-01-01 and the target date is formatted like this 01/01/2020, you should note that.\\n \\n The output should be formatted as a JSON instance that conforms to the JSON schema below.\\n\\nAs an example, for the schema {\\\"properties\\\": {\\\"foo\\\": {\\\"title\\\": \\\"Foo\\\", \\\"description\\\": \\\"a list of strings\\\", \\\"type\\\": \\\"array\\\", \\\"items\\\": {\\\"type\\\": \\\"string\\\"}}}, \\\"required\\\": [\\\"foo\\\"]}\\nthe object {\\\"foo\\\": [\\\"bar\\\", \\\"baz\\\"]} is a well-formatted instance of the schema. The object {\\\"properties\\\": {\\\"foo\\\": [\\\"bar\\\", \\\"baz\\\"]}} is not well-formatted.\\n\\nHere is the output schema:\\n```\\n{\\\"properties\\\": {\\\"table_mappings\\\": {\\\"title\\\": \\\"Table Mappings\\\", \\\"description\\\": \\\"A list of table mappings.\\\", \\\"type\\\": \\\"array\\\", \\\"items\\\": {\\\"$ref\\\": \\\"#/definitions/TableMappingEntry\\\"}}}, \\\"required\\\": [\\\"table_mappings\\\"], \\\"definitions\\\": {\\\"TableMappingEntry\\\": {\\\"title\\\": \\\"TableMappingEntry\\\", \\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"source_column_name\\\": {\\\"title\\\": \\\"Source Column Name\\\", \\\"description\\\": \\\"The name of the column in the source table.\\\", \\\"type\\\": \\\"string\\\"}, \\\"target_column_name\\\": {\\\"title\\\": \\\"Target Column Name\\\", \\\"description\\\": \\\"The name of the column in the target table.\\\", \\\"type\\\": \\\"string\\\"}, \\\"value_transformations\\\": {\\\"title\\\": \\\"Value Transformations\\\", \\\"description\\\": \\\"The transformations that need to be applied to the source values to make it match the target values, if necessary.\\\", \\\"type\\\": \\\"string\\\"}, \\\"explanation\\\": {\\\"title\\\": \\\"Explanation\\\", \\\"description\\\": \\\"A brief explanation of why the source column maps to the target column.\\\", \\\"type\\\": \\\"string\\\"}}, \\\"required\\\": [\\\"source_column_name\\\", \\\"target_column_name\\\", \\\"value_transformations\\\", \\\"explanation\\\"]}}}\\n```\\n \\n Be as precise as possible. All of these values are case sensitive. Do not do any transformation or imputation yourself: your job is to explain the data, not change it.\\n Do not include rows: focus on the column names and data types.\\n DO NOT INCLUDE COMMAS OR QUOTES ANYWHERE THIS WILL BE USED IN CSV GENERATION.\\n Instead of quotes, use backticks.\\n Your response:\\n\",\n", + " \"additional_kwargs\": {}\n", + " }\n", + " }\n", + " ]\n", + " }\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 3:llm:ChatOpenAI] Entering LLM run with input:\n", + "\u001b[0m{\n", + " \"prompts\": [\n", + " \"Human: \\n You are a Data Scientist, who specializes in tabular data and ETL pipelines. You are working with othere Data Scientists and Software Engineers to transform data from one source format to a target format.\\n Here is the head of a CSV file, called source_1_csv:\\n \\n \\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\n\\n \\n Here is the head of another, called target_csv:\\n \\n \\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n| 0 | 2023-05-12 | Miguel Kim | Civil | CASE-6761 | 100 | Boston |\\n| 1 | 2023-04-20 | John Lee | Criminl | CASE-6089 | 150 | Houston |\\n| 2 | 2023-02-10 | Dmitri Smith | Criminal | CASE-9565 | 200 | Chicago |\\n| 3 | 2023-03-16 | Dmitri Patel | Criminal | CASE-6222 | 100 | Boston |\\n| 4 | 2023-06-15 | Jane Ivanov | Family | CASE-2702 | 200 | Houston |\\n\\n \\n Your job is to generate a thorough, precise summary of exactly which columns in the source file should map to the target file, including a brief explanation for each.\\n In addition, if the cell values appear to be different, you should note that as well. For example, if the source date is formatted like this 2020-01-01 and the target date is formatted like this 01/01/2020, you should note that.\\n \\n The output should be formatted as a JSON instance that conforms to the JSON schema below.\\n\\nAs an example, for the schema {\\\"properties\\\": {\\\"foo\\\": {\\\"title\\\": \\\"Foo\\\", \\\"description\\\": \\\"a list of strings\\\", \\\"type\\\": \\\"array\\\", \\\"items\\\": {\\\"type\\\": \\\"string\\\"}}}, \\\"required\\\": [\\\"foo\\\"]}\\nthe object {\\\"foo\\\": [\\\"bar\\\", \\\"baz\\\"]} is a well-formatted instance of the schema. The object {\\\"properties\\\": {\\\"foo\\\": [\\\"bar\\\", \\\"baz\\\"]}} is not well-formatted.\\n\\nHere is the output schema:\\n```\\n{\\\"properties\\\": {\\\"table_mappings\\\": {\\\"title\\\": \\\"Table Mappings\\\", \\\"description\\\": \\\"A list of table mappings.\\\", \\\"type\\\": \\\"array\\\", \\\"items\\\": {\\\"$ref\\\": \\\"#/definitions/TableMappingEntry\\\"}}}, \\\"required\\\": [\\\"table_mappings\\\"], \\\"definitions\\\": {\\\"TableMappingEntry\\\": {\\\"title\\\": \\\"TableMappingEntry\\\", \\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"source_column_name\\\": {\\\"title\\\": \\\"Source Column Name\\\", \\\"description\\\": \\\"The name of the column in the source table.\\\", \\\"type\\\": \\\"string\\\"}, \\\"target_column_name\\\": {\\\"title\\\": \\\"Target Column Name\\\", \\\"description\\\": \\\"The name of the column in the target table.\\\", \\\"type\\\": \\\"string\\\"}, \\\"value_transformations\\\": {\\\"title\\\": \\\"Value Transformations\\\", \\\"description\\\": \\\"The transformations that need to be applied to the source values to make it match the target values, if necessary.\\\", \\\"type\\\": \\\"string\\\"}, \\\"explanation\\\": {\\\"title\\\": \\\"Explanation\\\", \\\"description\\\": \\\"A brief explanation of why the source column maps to the target column.\\\", \\\"type\\\": \\\"string\\\"}}, \\\"required\\\": [\\\"source_column_name\\\", \\\"target_column_name\\\", \\\"value_transformations\\\", \\\"explanation\\\"]}}}\\n```\\n \\n Be as precise as possible. All of these values are case sensitive. Do not do any transformation or imputation yourself: your job is to explain the data, not change it.\\n Do not include rows: focus on the column names and data types.\\n DO NOT INCLUDE COMMAS OR QUOTES ANYWHERE THIS WILL BE USED IN CSV GENERATION.\\n Instead of quotes, use backticks.\\n Your response:\"\n", + " ]\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 3:llm:ChatOpenAI] [19.31s] Exiting LLM run with output:\n", + "\u001b[0m{\n", + " \"generations\": [\n", + " [\n", + " {\n", + " \"text\": \"{\\\"table_mappings\\\": [\\n {\\n \\\"source_column_name\\\": \\\"case_date\\\",\\n \\\"target_column_name\\\": \\\"CaseDate\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The `case_date` column in the source file maps to the `CaseDate` column in the target file. The values in both columns represent the date of the case. No value transformations are needed.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"lastname\\\",\\n \\\"target_column_name\\\": \\\"FullName\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The `lastname` column in the source file maps to the `FullName` column in the target file. The values in both columns represent the last name of the person associated with the case. No value transformations are needed.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"firstname\\\",\\n \\\"target_column_name\\\": \\\"FullName\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The `firstname` column in the source file also maps to the `FullName` column in the target file. The values in both columns represent the first name of the person associated with the case. No value transformations are needed.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"case_type\\\",\\n \\\"target_column_name\\\": \\\"CaseType\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The `case_type` column in the source file maps to the `CaseType` column in the target file. The values in both columns represent the type of the case. No value transformations are needed.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"case_id\\\",\\n \\\"target_column_name\\\": \\\"CaseID\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The `case_id` column in the source file maps to the `CaseID` column in the target file. The values in both columns represent the unique identifier of the case. No value transformations are needed.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"court_fee\\\",\\n \\\"target_column_name\\\": \\\"Fee\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The `court_fee` column in the source file maps to the `Fee` column in the target file. The values in both columns represent the fee associated with the case. No value transformations are needed.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"jurisdiction\\\",\\n \\\"target_column_name\\\": \\\"Jurisdiction\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The `jurisdiction` column in the source file maps to the `Jurisdiction` column in the target file. The values in both columns represent the jurisdiction where the case is being held. No value transformations are needed.\\\"\\n }\\n]}\",\n", + " \"generation_info\": {\n", + " \"finish_reason\": \"stop\"\n", + " },\n", + " \"message\": {\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"schema\",\n", + " \"messages\",\n", + " \"AIMessage\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"content\": \"{\\\"table_mappings\\\": [\\n {\\n \\\"source_column_name\\\": \\\"case_date\\\",\\n \\\"target_column_name\\\": \\\"CaseDate\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The `case_date` column in the source file maps to the `CaseDate` column in the target file. The values in both columns represent the date of the case. No value transformations are needed.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"lastname\\\",\\n \\\"target_column_name\\\": \\\"FullName\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The `lastname` column in the source file maps to the `FullName` column in the target file. The values in both columns represent the last name of the person associated with the case. No value transformations are needed.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"firstname\\\",\\n \\\"target_column_name\\\": \\\"FullName\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The `firstname` column in the source file also maps to the `FullName` column in the target file. The values in both columns represent the first name of the person associated with the case. No value transformations are needed.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"case_type\\\",\\n \\\"target_column_name\\\": \\\"CaseType\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The `case_type` column in the source file maps to the `CaseType` column in the target file. The values in both columns represent the type of the case. No value transformations are needed.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"case_id\\\",\\n \\\"target_column_name\\\": \\\"CaseID\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The `case_id` column in the source file maps to the `CaseID` column in the target file. The values in both columns represent the unique identifier of the case. No value transformations are needed.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"court_fee\\\",\\n \\\"target_column_name\\\": \\\"Fee\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The `court_fee` column in the source file maps to the `Fee` column in the target file. The values in both columns represent the fee associated with the case. No value transformations are needed.\\\"\\n },\\n {\\n \\\"source_column_name\\\": \\\"jurisdiction\\\",\\n \\\"target_column_name\\\": \\\"Jurisdiction\\\",\\n \\\"value_transformations\\\": \\\"\\\",\\n \\\"explanation\\\": \\\"The `jurisdiction` column in the source file maps to the `Jurisdiction` column in the target file. The values in both columns represent the jurisdiction where the case is being held. No value transformations are needed.\\\"\\n }\\n]}\",\n", + " \"additional_kwargs\": {}\n", + " }\n", + " }\n", + " }\n", + " ]\n", + " ],\n", + " \"llm_output\": {\n", + " \"token_usage\": {\n", + " \"prompt_tokens\": 1096,\n", + " \"completion_tokens\": 558,\n", + " \"total_tokens\": 1654\n", + " },\n", + " \"model_name\": \"gpt-3.5-turbo\"\n", + " },\n", + " \"run\": null\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 4:parser:PydanticOutputParser] Entering Parser run with input:\n", + "\u001b[0m[inputs]\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 4:parser:PydanticOutputParser] [1ms] Exiting Parser run with output:\n", + "\u001b[0m{\n", + " \"lc\": 1,\n", + " \"type\": \"not_implemented\",\n", + " \"id\": [\n", + " \"__main__\",\n", + " \"TableMapping\"\n", + " ]\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence] [19.32s] Exiting Chain run with output:\n", + "\u001b[0m[outputs]\n" + ] + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'table_mappings': [{'source_column_name': 'case_date',\n", + " 'target_column_name': 'CaseDate',\n", + " 'value_transformations': '',\n", + " 'explanation': 'The `case_date` column in the source file maps to the `CaseDate` column in the target file. The values in both columns represent the date of the case. No value transformations are needed.'},\n", + " {'source_column_name': 'lastname',\n", + " 'target_column_name': 'FullName',\n", + " 'value_transformations': '',\n", + " 'explanation': 'The `lastname` column in the source file maps to the `FullName` column in the target file. The values in both columns represent the last name of the person associated with the case. No value transformations are needed.'},\n", + " {'source_column_name': 'firstname',\n", + " 'target_column_name': 'FullName',\n", + " 'value_transformations': '',\n", + " 'explanation': 'The `firstname` column in the source file also maps to the `FullName` column in the target file. The values in both columns represent the first name of the person associated with the case. No value transformations are needed.'},\n", + " {'source_column_name': 'case_type',\n", + " 'target_column_name': 'CaseType',\n", + " 'value_transformations': '',\n", + " 'explanation': 'The `case_type` column in the source file maps to the `CaseType` column in the target file. The values in both columns represent the type of the case. No value transformations are needed.'},\n", + " {'source_column_name': 'case_id',\n", + " 'target_column_name': 'CaseID',\n", + " 'value_transformations': '',\n", + " 'explanation': 'The `case_id` column in the source file maps to the `CaseID` column in the target file. The values in both columns represent the unique identifier of the case. No value transformations are needed.'},\n", + " {'source_column_name': 'court_fee',\n", + " 'target_column_name': 'Fee',\n", + " 'value_transformations': '',\n", + " 'explanation': 'The `court_fee` column in the source file maps to the `Fee` column in the target file. The values in both columns represent the fee associated with the case. No value transformations are needed.'},\n", + " {'source_column_name': 'jurisdiction',\n", + " 'target_column_name': 'Jurisdiction',\n", + " 'value_transformations': '',\n", + " 'explanation': 'The `jurisdiction` column in the source file maps to the `Jurisdiction` column in the target file. The values in both columns represent the jurisdiction where the case is being held. No value transformations are needed.'}]}" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "employee_script_str.dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.utilities import PythonREPL\n", + "from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate\n", + "\n", + "template = \"\"\"Write some python code to solve the user's problem. \n", + "\n", + "Return only python code in Markdown format, eg:\n", + "\n", + "```python\n", + "....\n", + "```\"\"\"\n", + "python_prompt = ChatPromptTemplate(messages=[\n", + " SystemMessagePromptTemplate.from_template(template),\n", + " HumanMessagePromptTemplate.from_template(\"{input}\")\n", + "])\n", + "\n", + "# Input is sanitized by default\n", + "python_chain = python_prompt | model | StrOutputParser()" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence] Entering Chain run with input:\n", + "\u001b[0m{\n", + " \"input\": \"\\n You are an expert Python Engineer, working on an ETL pipeline. Your colleague has written a pydantic object that describes the mapping between a source table and a target table.\\n This source table data is now in a Pandas DataFrame called `source_df`. Your job is to write valid python code to copy this Pandas DataFrame and transform it to the target schema.\\n Here is a stringified version of the mapping object:\\n \\n table_mappings=[TableMappingEntry(source_column_name='case_date', target_column_name='CaseDate', value_transformations='', explanation=\\\"The source column 'case_date' maps to the target column 'CaseDate' without any value transformations.\\\"), TableMappingEntry(source_column_name='lastname', target_column_name='FullName', value_transformations='', explanation=\\\"The source column 'lastname' maps to the target column 'FullName' without any value transformations.\\\"), TableMappingEntry(source_column_name='firstname', target_column_name='', value_transformations='', explanation=\\\"The source column 'firstname' does not have a direct mapping in the target schema. The values from 'firstname' need to be combined with 'lastname' to form 'FullName' in the target schema.\\\"), TableMappingEntry(source_column_name='case_type', target_column_name='CaseType', value_transformations='', explanation=\\\"The source column 'case_type' maps to the target column 'CaseType' without any value transformations.\\\"), TableMappingEntry(source_column_name='case_id', target_column_name='CaseID', value_transformations='', explanation=\\\"The source column 'case_id' maps to the target column 'CaseID' without any value transformations.\\\"), TableMappingEntry(source_column_name='court_fee', target_column_name='Fee', value_transformations='', explanation=\\\"The source column 'court_fee' maps to the target column 'Fee' without any value transformations.\\\"), TableMappingEntry(source_column_name='jurisdiction', target_column_name='Jurisdiction', value_transformations='', explanation=\\\"The source column 'jurisdiction' maps to the target column 'Jurisdiction' without any value transformations.\\\")]\\n\\n Your code should do the following:\\n - Review the mapping object and understand the transformation logic\\n - Copy the source_df to a new DataFrame called `target_df`\\n - DO NOT MODIFY the `source_df`\\n - Transform the `target_df` to the target schema\\n - Validate that it matches the target schema\\n - Return the `target_df`\\n\\n Once executed, this code must return a Pandas DataFrame that matches the target schema.\\n\\n Return only python code in Markdown format, eg:\\n ```python\\n ....\\n ```\\\"\\\"\\\"\\n\"\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] Entering Prompt run with input:\n", + "\u001b[0m{\n", + " \"input\": \"\\n You are an expert Python Engineer, working on an ETL pipeline. Your colleague has written a pydantic object that describes the mapping between a source table and a target table.\\n This source table data is now in a Pandas DataFrame called `source_df`. Your job is to write valid python code to copy this Pandas DataFrame and transform it to the target schema.\\n Here is a stringified version of the mapping object:\\n \\n table_mappings=[TableMappingEntry(source_column_name='case_date', target_column_name='CaseDate', value_transformations='', explanation=\\\"The source column 'case_date' maps to the target column 'CaseDate' without any value transformations.\\\"), TableMappingEntry(source_column_name='lastname', target_column_name='FullName', value_transformations='', explanation=\\\"The source column 'lastname' maps to the target column 'FullName' without any value transformations.\\\"), TableMappingEntry(source_column_name='firstname', target_column_name='', value_transformations='', explanation=\\\"The source column 'firstname' does not have a direct mapping in the target schema. The values from 'firstname' need to be combined with 'lastname' to form 'FullName' in the target schema.\\\"), TableMappingEntry(source_column_name='case_type', target_column_name='CaseType', value_transformations='', explanation=\\\"The source column 'case_type' maps to the target column 'CaseType' without any value transformations.\\\"), TableMappingEntry(source_column_name='case_id', target_column_name='CaseID', value_transformations='', explanation=\\\"The source column 'case_id' maps to the target column 'CaseID' without any value transformations.\\\"), TableMappingEntry(source_column_name='court_fee', target_column_name='Fee', value_transformations='', explanation=\\\"The source column 'court_fee' maps to the target column 'Fee' without any value transformations.\\\"), TableMappingEntry(source_column_name='jurisdiction', target_column_name='Jurisdiction', value_transformations='', explanation=\\\"The source column 'jurisdiction' maps to the target column 'Jurisdiction' without any value transformations.\\\")]\\n\\n Your code should do the following:\\n - Review the mapping object and understand the transformation logic\\n - Copy the source_df to a new DataFrame called `target_df`\\n - DO NOT MODIFY the `source_df`\\n - Transform the `target_df` to the target schema\\n - Validate that it matches the target schema\\n - Return the `target_df`\\n\\n Once executed, this code must return a Pandas DataFrame that matches the target schema.\\n\\n Return only python code in Markdown format, eg:\\n ```python\\n ....\\n ```\\\"\\\"\\\"\\n\"\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] [0ms] Exiting Prompt run with output:\n", + "\u001b[0m{\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"prompts\",\n", + " \"chat\",\n", + " \"ChatPromptValue\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"messages\": [\n", + " {\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"schema\",\n", + " \"messages\",\n", + " \"SystemMessage\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"content\": \"Write some python code to solve the user's problem. \\n\\nReturn only python code in Markdown format, eg:\\n\\n```python\\n....\\n```\",\n", + " \"additional_kwargs\": {}\n", + " }\n", + " },\n", + " {\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"schema\",\n", + " \"messages\",\n", + " \"HumanMessage\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"content\": \"\\n You are an expert Python Engineer, working on an ETL pipeline. Your colleague has written a pydantic object that describes the mapping between a source table and a target table.\\n This source table data is now in a Pandas DataFrame called `source_df`. Your job is to write valid python code to copy this Pandas DataFrame and transform it to the target schema.\\n Here is a stringified version of the mapping object:\\n \\n table_mappings=[TableMappingEntry(source_column_name='case_date', target_column_name='CaseDate', value_transformations='', explanation=\\\"The source column 'case_date' maps to the target column 'CaseDate' without any value transformations.\\\"), TableMappingEntry(source_column_name='lastname', target_column_name='FullName', value_transformations='', explanation=\\\"The source column 'lastname' maps to the target column 'FullName' without any value transformations.\\\"), TableMappingEntry(source_column_name='firstname', target_column_name='', value_transformations='', explanation=\\\"The source column 'firstname' does not have a direct mapping in the target schema. The values from 'firstname' need to be combined with 'lastname' to form 'FullName' in the target schema.\\\"), TableMappingEntry(source_column_name='case_type', target_column_name='CaseType', value_transformations='', explanation=\\\"The source column 'case_type' maps to the target column 'CaseType' without any value transformations.\\\"), TableMappingEntry(source_column_name='case_id', target_column_name='CaseID', value_transformations='', explanation=\\\"The source column 'case_id' maps to the target column 'CaseID' without any value transformations.\\\"), TableMappingEntry(source_column_name='court_fee', target_column_name='Fee', value_transformations='', explanation=\\\"The source column 'court_fee' maps to the target column 'Fee' without any value transformations.\\\"), TableMappingEntry(source_column_name='jurisdiction', target_column_name='Jurisdiction', value_transformations='', explanation=\\\"The source column 'jurisdiction' maps to the target column 'Jurisdiction' without any value transformations.\\\")]\\n\\n Your code should do the following:\\n - Review the mapping object and understand the transformation logic\\n - Copy the source_df to a new DataFrame called `target_df`\\n - DO NOT MODIFY the `source_df`\\n - Transform the `target_df` to the target schema\\n - Validate that it matches the target schema\\n - Return the `target_df`\\n\\n Once executed, this code must return a Pandas DataFrame that matches the target schema.\\n\\n Return only python code in Markdown format, eg:\\n ```python\\n ....\\n ```\\\"\\\"\\\"\\n\",\n", + " \"additional_kwargs\": {}\n", + " }\n", + " }\n", + " ]\n", + " }\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 3:llm:ChatOpenAI] Entering LLM run with input:\n", + "\u001b[0m{\n", + " \"prompts\": [\n", + " \"System: Write some python code to solve the user's problem. \\n\\nReturn only python code in Markdown format, eg:\\n\\n```python\\n....\\n```\\nHuman: \\n You are an expert Python Engineer, working on an ETL pipeline. Your colleague has written a pydantic object that describes the mapping between a source table and a target table.\\n This source table data is now in a Pandas DataFrame called `source_df`. Your job is to write valid python code to copy this Pandas DataFrame and transform it to the target schema.\\n Here is a stringified version of the mapping object:\\n \\n table_mappings=[TableMappingEntry(source_column_name='case_date', target_column_name='CaseDate', value_transformations='', explanation=\\\"The source column 'case_date' maps to the target column 'CaseDate' without any value transformations.\\\"), TableMappingEntry(source_column_name='lastname', target_column_name='FullName', value_transformations='', explanation=\\\"The source column 'lastname' maps to the target column 'FullName' without any value transformations.\\\"), TableMappingEntry(source_column_name='firstname', target_column_name='', value_transformations='', explanation=\\\"The source column 'firstname' does not have a direct mapping in the target schema. The values from 'firstname' need to be combined with 'lastname' to form 'FullName' in the target schema.\\\"), TableMappingEntry(source_column_name='case_type', target_column_name='CaseType', value_transformations='', explanation=\\\"The source column 'case_type' maps to the target column 'CaseType' without any value transformations.\\\"), TableMappingEntry(source_column_name='case_id', target_column_name='CaseID', value_transformations='', explanation=\\\"The source column 'case_id' maps to the target column 'CaseID' without any value transformations.\\\"), TableMappingEntry(source_column_name='court_fee', target_column_name='Fee', value_transformations='', explanation=\\\"The source column 'court_fee' maps to the target column 'Fee' without any value transformations.\\\"), TableMappingEntry(source_column_name='jurisdiction', target_column_name='Jurisdiction', value_transformations='', explanation=\\\"The source column 'jurisdiction' maps to the target column 'Jurisdiction' without any value transformations.\\\")]\\n\\n Your code should do the following:\\n - Review the mapping object and understand the transformation logic\\n - Copy the source_df to a new DataFrame called `target_df`\\n - DO NOT MODIFY the `source_df`\\n - Transform the `target_df` to the target schema\\n - Validate that it matches the target schema\\n - Return the `target_df`\\n\\n Once executed, this code must return a Pandas DataFrame that matches the target schema.\\n\\n Return only python code in Markdown format, eg:\\n ```python\\n ....\\n ```\\\"\\\"\\\"\"\n", + " ]\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 3:llm:ChatOpenAI] [27.50s] Exiting LLM run with output:\n", + "\u001b[0m{\n", + " \"generations\": [\n", + " [\n", + " {\n", + " \"text\": \"```python\\nimport pandas as pd\\n\\n# Define the mapping object\\nclass TableMappingEntry:\\n def __init__(self, source_column_name, target_column_name, value_transformations, explanation):\\n self.source_column_name = source_column_name\\n self.target_column_name = target_column_name\\n self.value_transformations = value_transformations\\n self.explanation = explanation\\n\\n# Define the source DataFrame\\nsource_df = pd.DataFrame({\\n 'case_date': ['2021-01-01', '2021-01-02'],\\n 'lastname': ['Doe', 'Smith'],\\n 'firstname': ['John', 'Jane'],\\n 'case_type': ['Criminal', 'Civil'],\\n 'case_id': [1, 2],\\n 'court_fee': [100, 200],\\n 'jurisdiction': ['State', 'Federal']\\n})\\n\\n# Define the target schema\\ntarget_df_schema = pd.DataFrame({\\n 'CaseDate': pd.Series(dtype='datetime64[ns]'),\\n 'FullName': pd.Series(dtype='object'),\\n 'CaseType': pd.Series(dtype='object'),\\n 'CaseID': pd.Series(dtype='int64'),\\n 'Fee': pd.Series(dtype='float64'),\\n 'Jurisdiction': pd.Series(dtype='object')\\n})\\n\\n# Define the mapping object\\ntable_mappings = [\\n TableMappingEntry(source_column_name='case_date', target_column_name='CaseDate', value_transformations='', explanation=\\\"The source column 'case_date' maps to the target column 'CaseDate' without any value transformations.\\\"),\\n TableMappingEntry(source_column_name='lastname', target_column_name='FullName', value_transformations='', explanation=\\\"The source column 'lastname' maps to the target column 'FullName' without any value transformations.\\\"),\\n TableMappingEntry(source_column_name='firstname', target_column_name='', value_transformations='', explanation=\\\"The source column 'firstname' does not have a direct mapping in the target schema. The values from 'firstname' need to be combined with 'lastname' to form 'FullName' in the target schema.\\\"),\\n TableMappingEntry(source_column_name='case_type', target_column_name='CaseType', value_transformations='', explanation=\\\"The source column 'case_type' maps to the target column 'CaseType' without any value transformations.\\\"),\\n TableMappingEntry(source_column_name='case_id', target_column_name='CaseID', value_transformations='', explanation=\\\"The source column 'case_id' maps to the target column 'CaseID' without any value transformations.\\\"),\\n TableMappingEntry(source_column_name='court_fee', target_column_name='Fee', value_transformations='', explanation=\\\"The source column 'court_fee' maps to the target column 'Fee' without any value transformations.\\\"),\\n TableMappingEntry(source_column_name='jurisdiction', target_column_name='Jurisdiction', value_transformations='', explanation=\\\"The source column 'jurisdiction' maps to the target column 'Jurisdiction' without any value transformations.\\\")\\n]\\n\\n# Copy the source_df to target_df\\ntarget_df = source_df.copy()\\n\\n# Transform the target_df to the target schema\\nfor mapping in table_mappings:\\n if mapping.target_column_name:\\n target_column_name = mapping.target_column_name\\n source_column_name = mapping.source_column_name\\n \\n if target_column_name == 'FullName':\\n target_df[target_column_name] = target_df[source_column_name] + ' ' + target_df['lastname']\\n else:\\n target_df.rename(columns={source_column_name: target_column_name}, inplace=True)\\n\\n# Validate the target_df matches the target schema\\nassert set(target_df.columns) == set(target_df_schema.columns)\\n\\n# Return the target_df\\ntarget_df\\n```\\nThis code defines the mapping object, creates the source DataFrame, and the target schema. It then copies the source DataFrame to the target DataFrame and applies the required transformations based on the mapping object. Finally, it validates that the target DataFrame matches the target schema and returns the target DataFrame.\",\n", + " \"generation_info\": {\n", + " \"finish_reason\": \"stop\"\n", + " },\n", + " \"message\": {\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"schema\",\n", + " \"messages\",\n", + " \"AIMessage\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"content\": \"```python\\nimport pandas as pd\\n\\n# Define the mapping object\\nclass TableMappingEntry:\\n def __init__(self, source_column_name, target_column_name, value_transformations, explanation):\\n self.source_column_name = source_column_name\\n self.target_column_name = target_column_name\\n self.value_transformations = value_transformations\\n self.explanation = explanation\\n\\n# Define the source DataFrame\\nsource_df = pd.DataFrame({\\n 'case_date': ['2021-01-01', '2021-01-02'],\\n 'lastname': ['Doe', 'Smith'],\\n 'firstname': ['John', 'Jane'],\\n 'case_type': ['Criminal', 'Civil'],\\n 'case_id': [1, 2],\\n 'court_fee': [100, 200],\\n 'jurisdiction': ['State', 'Federal']\\n})\\n\\n# Define the target schema\\ntarget_df_schema = pd.DataFrame({\\n 'CaseDate': pd.Series(dtype='datetime64[ns]'),\\n 'FullName': pd.Series(dtype='object'),\\n 'CaseType': pd.Series(dtype='object'),\\n 'CaseID': pd.Series(dtype='int64'),\\n 'Fee': pd.Series(dtype='float64'),\\n 'Jurisdiction': pd.Series(dtype='object')\\n})\\n\\n# Define the mapping object\\ntable_mappings = [\\n TableMappingEntry(source_column_name='case_date', target_column_name='CaseDate', value_transformations='', explanation=\\\"The source column 'case_date' maps to the target column 'CaseDate' without any value transformations.\\\"),\\n TableMappingEntry(source_column_name='lastname', target_column_name='FullName', value_transformations='', explanation=\\\"The source column 'lastname' maps to the target column 'FullName' without any value transformations.\\\"),\\n TableMappingEntry(source_column_name='firstname', target_column_name='', value_transformations='', explanation=\\\"The source column 'firstname' does not have a direct mapping in the target schema. The values from 'firstname' need to be combined with 'lastname' to form 'FullName' in the target schema.\\\"),\\n TableMappingEntry(source_column_name='case_type', target_column_name='CaseType', value_transformations='', explanation=\\\"The source column 'case_type' maps to the target column 'CaseType' without any value transformations.\\\"),\\n TableMappingEntry(source_column_name='case_id', target_column_name='CaseID', value_transformations='', explanation=\\\"The source column 'case_id' maps to the target column 'CaseID' without any value transformations.\\\"),\\n TableMappingEntry(source_column_name='court_fee', target_column_name='Fee', value_transformations='', explanation=\\\"The source column 'court_fee' maps to the target column 'Fee' without any value transformations.\\\"),\\n TableMappingEntry(source_column_name='jurisdiction', target_column_name='Jurisdiction', value_transformations='', explanation=\\\"The source column 'jurisdiction' maps to the target column 'Jurisdiction' without any value transformations.\\\")\\n]\\n\\n# Copy the source_df to target_df\\ntarget_df = source_df.copy()\\n\\n# Transform the target_df to the target schema\\nfor mapping in table_mappings:\\n if mapping.target_column_name:\\n target_column_name = mapping.target_column_name\\n source_column_name = mapping.source_column_name\\n \\n if target_column_name == 'FullName':\\n target_df[target_column_name] = target_df[source_column_name] + ' ' + target_df['lastname']\\n else:\\n target_df.rename(columns={source_column_name: target_column_name}, inplace=True)\\n\\n# Validate the target_df matches the target schema\\nassert set(target_df.columns) == set(target_df_schema.columns)\\n\\n# Return the target_df\\ntarget_df\\n```\\nThis code defines the mapping object, creates the source DataFrame, and the target schema. It then copies the source DataFrame to the target DataFrame and applies the required transformations based on the mapping object. Finally, it validates that the target DataFrame matches the target schema and returns the target DataFrame.\",\n", + " \"additional_kwargs\": {}\n", + " }\n", + " }\n", + " }\n", + " ]\n", + " ],\n", + " \"llm_output\": {\n", + " \"token_usage\": {\n", + " \"prompt_tokens\": 566,\n", + " \"completion_tokens\": 798,\n", + " \"total_tokens\": 1364\n", + " },\n", + " \"model_name\": \"gpt-3.5-turbo\"\n", + " },\n", + " \"run\": null\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 4:parser:StrOutputParser] Entering Parser run with input:\n", + "\u001b[0m[inputs]\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence > 4:parser:StrOutputParser] [0ms] Exiting Parser run with output:\n", + "\u001b[0m{\n", + " \"output\": \"```python\\nimport pandas as pd\\n\\n# Define the mapping object\\nclass TableMappingEntry:\\n def __init__(self, source_column_name, target_column_name, value_transformations, explanation):\\n self.source_column_name = source_column_name\\n self.target_column_name = target_column_name\\n self.value_transformations = value_transformations\\n self.explanation = explanation\\n\\n# Define the source DataFrame\\nsource_df = pd.DataFrame({\\n 'case_date': ['2021-01-01', '2021-01-02'],\\n 'lastname': ['Doe', 'Smith'],\\n 'firstname': ['John', 'Jane'],\\n 'case_type': ['Criminal', 'Civil'],\\n 'case_id': [1, 2],\\n 'court_fee': [100, 200],\\n 'jurisdiction': ['State', 'Federal']\\n})\\n\\n# Define the target schema\\ntarget_df_schema = pd.DataFrame({\\n 'CaseDate': pd.Series(dtype='datetime64[ns]'),\\n 'FullName': pd.Series(dtype='object'),\\n 'CaseType': pd.Series(dtype='object'),\\n 'CaseID': pd.Series(dtype='int64'),\\n 'Fee': pd.Series(dtype='float64'),\\n 'Jurisdiction': pd.Series(dtype='object')\\n})\\n\\n# Define the mapping object\\ntable_mappings = [\\n TableMappingEntry(source_column_name='case_date', target_column_name='CaseDate', value_transformations='', explanation=\\\"The source column 'case_date' maps to the target column 'CaseDate' without any value transformations.\\\"),\\n TableMappingEntry(source_column_name='lastname', target_column_name='FullName', value_transformations='', explanation=\\\"The source column 'lastname' maps to the target column 'FullName' without any value transformations.\\\"),\\n TableMappingEntry(source_column_name='firstname', target_column_name='', value_transformations='', explanation=\\\"The source column 'firstname' does not have a direct mapping in the target schema. The values from 'firstname' need to be combined with 'lastname' to form 'FullName' in the target schema.\\\"),\\n TableMappingEntry(source_column_name='case_type', target_column_name='CaseType', value_transformations='', explanation=\\\"The source column 'case_type' maps to the target column 'CaseType' without any value transformations.\\\"),\\n TableMappingEntry(source_column_name='case_id', target_column_name='CaseID', value_transformations='', explanation=\\\"The source column 'case_id' maps to the target column 'CaseID' without any value transformations.\\\"),\\n TableMappingEntry(source_column_name='court_fee', target_column_name='Fee', value_transformations='', explanation=\\\"The source column 'court_fee' maps to the target column 'Fee' without any value transformations.\\\"),\\n TableMappingEntry(source_column_name='jurisdiction', target_column_name='Jurisdiction', value_transformations='', explanation=\\\"The source column 'jurisdiction' maps to the target column 'Jurisdiction' without any value transformations.\\\")\\n]\\n\\n# Copy the source_df to target_df\\ntarget_df = source_df.copy()\\n\\n# Transform the target_df to the target schema\\nfor mapping in table_mappings:\\n if mapping.target_column_name:\\n target_column_name = mapping.target_column_name\\n source_column_name = mapping.source_column_name\\n \\n if target_column_name == 'FullName':\\n target_df[target_column_name] = target_df[source_column_name] + ' ' + target_df['lastname']\\n else:\\n target_df.rename(columns={source_column_name: target_column_name}, inplace=True)\\n\\n# Validate the target_df matches the target schema\\nassert set(target_df.columns) == set(target_df_schema.columns)\\n\\n# Return the target_df\\ntarget_df\\n```\\nThis code defines the mapping object, creates the source DataFrame, and the target schema. It then copies the source DataFrame to the target DataFrame and applies the required transformations based on the mapping object. Finally, it validates that the target DataFrame matches the target schema and returns the target DataFrame.\"\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:RunnableSequence] [27.50s] Exiting Chain run with output:\n", + "\u001b[0m{\n", + " \"output\": \"```python\\nimport pandas as pd\\n\\n# Define the mapping object\\nclass TableMappingEntry:\\n def __init__(self, source_column_name, target_column_name, value_transformations, explanation):\\n self.source_column_name = source_column_name\\n self.target_column_name = target_column_name\\n self.value_transformations = value_transformations\\n self.explanation = explanation\\n\\n# Define the source DataFrame\\nsource_df = pd.DataFrame({\\n 'case_date': ['2021-01-01', '2021-01-02'],\\n 'lastname': ['Doe', 'Smith'],\\n 'firstname': ['John', 'Jane'],\\n 'case_type': ['Criminal', 'Civil'],\\n 'case_id': [1, 2],\\n 'court_fee': [100, 200],\\n 'jurisdiction': ['State', 'Federal']\\n})\\n\\n# Define the target schema\\ntarget_df_schema = pd.DataFrame({\\n 'CaseDate': pd.Series(dtype='datetime64[ns]'),\\n 'FullName': pd.Series(dtype='object'),\\n 'CaseType': pd.Series(dtype='object'),\\n 'CaseID': pd.Series(dtype='int64'),\\n 'Fee': pd.Series(dtype='float64'),\\n 'Jurisdiction': pd.Series(dtype='object')\\n})\\n\\n# Define the mapping object\\ntable_mappings = [\\n TableMappingEntry(source_column_name='case_date', target_column_name='CaseDate', value_transformations='', explanation=\\\"The source column 'case_date' maps to the target column 'CaseDate' without any value transformations.\\\"),\\n TableMappingEntry(source_column_name='lastname', target_column_name='FullName', value_transformations='', explanation=\\\"The source column 'lastname' maps to the target column 'FullName' without any value transformations.\\\"),\\n TableMappingEntry(source_column_name='firstname', target_column_name='', value_transformations='', explanation=\\\"The source column 'firstname' does not have a direct mapping in the target schema. The values from 'firstname' need to be combined with 'lastname' to form 'FullName' in the target schema.\\\"),\\n TableMappingEntry(source_column_name='case_type', target_column_name='CaseType', value_transformations='', explanation=\\\"The source column 'case_type' maps to the target column 'CaseType' without any value transformations.\\\"),\\n TableMappingEntry(source_column_name='case_id', target_column_name='CaseID', value_transformations='', explanation=\\\"The source column 'case_id' maps to the target column 'CaseID' without any value transformations.\\\"),\\n TableMappingEntry(source_column_name='court_fee', target_column_name='Fee', value_transformations='', explanation=\\\"The source column 'court_fee' maps to the target column 'Fee' without any value transformations.\\\"),\\n TableMappingEntry(source_column_name='jurisdiction', target_column_name='Jurisdiction', value_transformations='', explanation=\\\"The source column 'jurisdiction' maps to the target column 'Jurisdiction' without any value transformations.\\\")\\n]\\n\\n# Copy the source_df to target_df\\ntarget_df = source_df.copy()\\n\\n# Transform the target_df to the target schema\\nfor mapping in table_mappings:\\n if mapping.target_column_name:\\n target_column_name = mapping.target_column_name\\n source_column_name = mapping.source_column_name\\n \\n if target_column_name == 'FullName':\\n target_df[target_column_name] = target_df[source_column_name] + ' ' + target_df['lastname']\\n else:\\n target_df.rename(columns={source_column_name: target_column_name}, inplace=True)\\n\\n# Validate the target_df matches the target schema\\nassert set(target_df.columns) == set(target_df_schema.columns)\\n\\n# Return the target_df\\ntarget_df\\n```\\nThis code defines the mapping object, creates the source DataFrame, and the target schema. It then copies the source DataFrame to the target DataFrame and applies the required transformations based on the mapping object. Finally, it validates that the target DataFrame matches the target schema and returns the target DataFrame.\"\n", + "}\n" + ] + } + ], + "source": [ + "question = f'''\n", + " You are an expert Python Engineer, working on an ETL pipeline. Your colleague has written a pydantic object that describes the mapping between a source table and a target table.\n", + " This source table data is now in a Pandas DataFrame called `source_df`. Your job is to write valid python code to copy this Pandas DataFrame and transform it to the target schema.\n", + " Here is a stringified version of the mapping object:\n", + " \n", + " {str(table_mapping)}\n", + "\n", + " Your code should do the following:\n", + " - Review the mapping object and understand the transformation logic\n", + " - Copy the source_df to a new DataFrame called `target_df`\n", + " - DO NOT MODIFY the `source_df`\n", + " - Transform the `target_df` to the target schema\n", + " - Validate that it matches the target schema\n", + " - Return the `target_df`\n", + "\n", + " Once executed, this code must return a Pandas DataFrame that matches the target schema.\n", + "\n", + " Return only python code in Markdown format, eg:\n", + " ```python\n", + " ....\n", + " ```\"\"\"\n", + "'''\n", + "res = python_chain.invoke({\"input\": question})" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'```python\\nimport pandas as pd\\n\\n# Define the mapping object\\nclass TableMappingEntry:\\n def __init__(self, source_column_name, target_column_name, value_transformations, explanation):\\n self.source_column_name = source_column_name\\n self.target_column_name = target_column_name\\n self.value_transformations = value_transformations\\n self.explanation = explanation\\n\\n# Define the source DataFrame\\nsource_df = pd.DataFrame({\\n \\'case_date\\': [\\'2021-01-01\\', \\'2021-01-02\\'],\\n \\'lastname\\': [\\'Doe\\', \\'Smith\\'],\\n \\'firstname\\': [\\'John\\', \\'Jane\\'],\\n \\'case_type\\': [\\'Criminal\\', \\'Civil\\'],\\n \\'case_id\\': [1, 2],\\n \\'court_fee\\': [100, 200],\\n \\'jurisdiction\\': [\\'State\\', \\'Federal\\']\\n})\\n\\n# Define the target schema\\ntarget_df_schema = pd.DataFrame({\\n \\'CaseDate\\': pd.Series(dtype=\\'datetime64[ns]\\'),\\n \\'FullName\\': pd.Series(dtype=\\'object\\'),\\n \\'CaseType\\': pd.Series(dtype=\\'object\\'),\\n \\'CaseID\\': pd.Series(dtype=\\'int64\\'),\\n \\'Fee\\': pd.Series(dtype=\\'float64\\'),\\n \\'Jurisdiction\\': pd.Series(dtype=\\'object\\')\\n})\\n\\n# Define the mapping object\\ntable_mappings = [\\n TableMappingEntry(source_column_name=\\'case_date\\', target_column_name=\\'CaseDate\\', value_transformations=\\'\\', explanation=\"The source column \\'case_date\\' maps to the target column \\'CaseDate\\' without any value transformations.\"),\\n TableMappingEntry(source_column_name=\\'lastname\\', target_column_name=\\'FullName\\', value_transformations=\\'\\', explanation=\"The source column \\'lastname\\' maps to the target column \\'FullName\\' without any value transformations.\"),\\n TableMappingEntry(source_column_name=\\'firstname\\', target_column_name=\\'\\', value_transformations=\\'\\', explanation=\"The source column \\'firstname\\' does not have a direct mapping in the target schema. The values from \\'firstname\\' need to be combined with \\'lastname\\' to form \\'FullName\\' in the target schema.\"),\\n TableMappingEntry(source_column_name=\\'case_type\\', target_column_name=\\'CaseType\\', value_transformations=\\'\\', explanation=\"The source column \\'case_type\\' maps to the target column \\'CaseType\\' without any value transformations.\"),\\n TableMappingEntry(source_column_name=\\'case_id\\', target_column_name=\\'CaseID\\', value_transformations=\\'\\', explanation=\"The source column \\'case_id\\' maps to the target column \\'CaseID\\' without any value transformations.\"),\\n TableMappingEntry(source_column_name=\\'court_fee\\', target_column_name=\\'Fee\\', value_transformations=\\'\\', explanation=\"The source column \\'court_fee\\' maps to the target column \\'Fee\\' without any value transformations.\"),\\n TableMappingEntry(source_column_name=\\'jurisdiction\\', target_column_name=\\'Jurisdiction\\', value_transformations=\\'\\', explanation=\"The source column \\'jurisdiction\\' maps to the target column \\'Jurisdiction\\' without any value transformations.\")\\n]\\n\\n# Copy the source_df to target_df\\ntarget_df = source_df.copy()\\n\\n# Transform the target_df to the target schema\\nfor mapping in table_mappings:\\n if mapping.target_column_name:\\n target_column_name = mapping.target_column_name\\n source_column_name = mapping.source_column_name\\n \\n if target_column_name == \\'FullName\\':\\n target_df[target_column_name] = target_df[source_column_name] + \\' \\' + target_df[\\'lastname\\']\\n else:\\n target_df.rename(columns={source_column_name: target_column_name}, inplace=True)\\n\\n# Validate the target_df matches the target schema\\nassert set(target_df.columns) == set(target_df_schema.columns)\\n\\n# Return the target_df\\ntarget_df\\n```\\nThis code defines the mapping object, creates the source DataFrame, and the target schema. It then copies the source DataFrame to the target DataFrame and applies the required transformations based on the mapping object. Finally, it validates that the target DataFrame matches the target schema and returns the target DataFrame.'" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor] Entering Chain run with input:\n", + "\u001b[0m{\n", + " \"input\": \"\\n df is the source data. Here is the mapping between the source schema and the target schema:\\n table_mappings=[TableMappingEntry(source_column_name='case_date', target_column_name='CaseDate', value_transformations='', explanation=\\\"The source column 'case_date' maps to the target column 'CaseDate' without any value transformations.\\\"), TableMappingEntry(source_column_name='lastname', target_column_name='FullName', value_transformations='', explanation=\\\"The source column 'lastname' maps to the target column 'FullName' without any value transformations.\\\"), TableMappingEntry(source_column_name='firstname', target_column_name='', value_transformations='', explanation=\\\"The source column 'firstname' does not have a direct mapping in the target schema. The values from 'firstname' need to be combined with 'lastname' to form 'FullName' in the target schema.\\\"), TableMappingEntry(source_column_name='case_type', target_column_name='CaseType', value_transformations='', explanation=\\\"The source column 'case_type' maps to the target column 'CaseType' without any value transformations.\\\"), TableMappingEntry(source_column_name='case_id', target_column_name='CaseID', value_transformations='', explanation=\\\"The source column 'case_id' maps to the target column 'CaseID' without any value transformations.\\\"), TableMappingEntry(source_column_name='court_fee', target_column_name='Fee', value_transformations='', explanation=\\\"The source column 'court_fee' maps to the target column 'Fee' without any value transformations.\\\"), TableMappingEntry(source_column_name='jurisdiction', target_column_name='Jurisdiction', value_transformations='', explanation=\\\"The source column 'jurisdiction' maps to the target column 'Jurisdiction' without any value transformations.\\\")]\\n Transform df to the target schema.\\n Do not query the whole df. Use the head, generate the logic, apply it, validate the results.\\n\",\n", + " \"verbose\": true\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 2:llm:ChatOpenAI] Entering LLM run with input:\n", + "\u001b[0m{\n", + " \"prompts\": [\n", + " \"System: \\nYou are working with a pandas dataframe in Python. The name of the dataframe is `df`.\\nThis is the result of `print(df.head())`:\\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\nHuman: \\n df is the source data. Here is the mapping between the source schema and the target schema:\\n table_mappings=[TableMappingEntry(source_column_name='case_date', target_column_name='CaseDate', value_transformations='', explanation=\\\"The source column 'case_date' maps to the target column 'CaseDate' without any value transformations.\\\"), TableMappingEntry(source_column_name='lastname', target_column_name='FullName', value_transformations='', explanation=\\\"The source column 'lastname' maps to the target column 'FullName' without any value transformations.\\\"), TableMappingEntry(source_column_name='firstname', target_column_name='', value_transformations='', explanation=\\\"The source column 'firstname' does not have a direct mapping in the target schema. The values from 'firstname' need to be combined with 'lastname' to form 'FullName' in the target schema.\\\"), TableMappingEntry(source_column_name='case_type', target_column_name='CaseType', value_transformations='', explanation=\\\"The source column 'case_type' maps to the target column 'CaseType' without any value transformations.\\\"), TableMappingEntry(source_column_name='case_id', target_column_name='CaseID', value_transformations='', explanation=\\\"The source column 'case_id' maps to the target column 'CaseID' without any value transformations.\\\"), TableMappingEntry(source_column_name='court_fee', target_column_name='Fee', value_transformations='', explanation=\\\"The source column 'court_fee' maps to the target column 'Fee' without any value transformations.\\\"), TableMappingEntry(source_column_name='jurisdiction', target_column_name='Jurisdiction', value_transformations='', explanation=\\\"The source column 'jurisdiction' maps to the target column 'Jurisdiction' without any value transformations.\\\")]\\n Transform df to the target schema.\\n Do not query the whole df. Use the head, generate the logic, apply it, validate the results.\"\n", + " ]\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 2:llm:ChatOpenAI] [4.77s] Exiting LLM run with output:\n", + "\u001b[0m{\n", + " \"generations\": [\n", + " [\n", + " {\n", + " \"text\": \"\",\n", + " \"generation_info\": {\n", + " \"finish_reason\": \"function_call\"\n", + " },\n", + " \"message\": {\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"schema\",\n", + " \"messages\",\n", + " \"AIMessage\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"content\": \"\",\n", + " \"additional_kwargs\": {\n", + " \"function_call\": {\n", + " \"name\": \"python_repl_ast\",\n", + " \"arguments\": \"\\n{\\n\\\"__arg1\\\": \\\"df['FullName'] = df['firstname'] + ' ' + df['lastname']\\\"\\n}\"\n", + " }\n", + " }\n", + " }\n", + " }\n", + " }\n", + " ]\n", + " ],\n", + " \"llm_output\": {\n", + " \"token_usage\": {\n", + " \"prompt_tokens\": 736,\n", + " \"completion_tokens\": 35,\n", + " \"total_tokens\": 771\n", + " },\n", + " \"model_name\": \"gpt-4-0613\"\n", + " },\n", + " \"run\": null\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[tool/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 3:tool:python_repl_ast] Entering Tool run with input:\n", + "\u001b[0m\"df['FullName'] = df['firstname'] + ' ' + df['lastname']\"\n", + "\u001b[36;1m\u001b[1;3m[tool/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 3:tool:python_repl_ast] [6ms] Exiting Tool run with output:\n", + "\u001b[0m\"\"\n", + "\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 4:llm:ChatOpenAI] Entering LLM run with input:\n", + "\u001b[0m{\n", + " \"prompts\": [\n", + " \"System: \\nYou are working with a pandas dataframe in Python. The name of the dataframe is `df`.\\nThis is the result of `print(df.head())`:\\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\nHuman: \\n df is the source data. Here is the mapping between the source schema and the target schema:\\n table_mappings=[TableMappingEntry(source_column_name='case_date', target_column_name='CaseDate', value_transformations='', explanation=\\\"The source column 'case_date' maps to the target column 'CaseDate' without any value transformations.\\\"), TableMappingEntry(source_column_name='lastname', target_column_name='FullName', value_transformations='', explanation=\\\"The source column 'lastname' maps to the target column 'FullName' without any value transformations.\\\"), TableMappingEntry(source_column_name='firstname', target_column_name='', value_transformations='', explanation=\\\"The source column 'firstname' does not have a direct mapping in the target schema. The values from 'firstname' need to be combined with 'lastname' to form 'FullName' in the target schema.\\\"), TableMappingEntry(source_column_name='case_type', target_column_name='CaseType', value_transformations='', explanation=\\\"The source column 'case_type' maps to the target column 'CaseType' without any value transformations.\\\"), TableMappingEntry(source_column_name='case_id', target_column_name='CaseID', value_transformations='', explanation=\\\"The source column 'case_id' maps to the target column 'CaseID' without any value transformations.\\\"), TableMappingEntry(source_column_name='court_fee', target_column_name='Fee', value_transformations='', explanation=\\\"The source column 'court_fee' maps to the target column 'Fee' without any value transformations.\\\"), TableMappingEntry(source_column_name='jurisdiction', target_column_name='Jurisdiction', value_transformations='', explanation=\\\"The source column 'jurisdiction' maps to the target column 'Jurisdiction' without any value transformations.\\\")]\\n Transform df to the target schema.\\n Do not query the whole df. Use the head, generate the logic, apply it, validate the results.\\n\\nAI: {'name': 'python_repl_ast', 'arguments': '\\\\n{\\\\n\\\"__arg1\\\": \\\"df[\\\\'FullName\\\\'] = df[\\\\'firstname\\\\'] + \\\\' \\\\' + df[\\\\'lastname\\\\']\\\"\\\\n}'}\\nFunction:\"\n", + " ]\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 4:llm:ChatOpenAI] [6.55s] Exiting LLM run with output:\n", + "\u001b[0m{\n", + " \"generations\": [\n", + " [\n", + " {\n", + " \"text\": \"The 'firstname' and 'lastname' columns have been combined to form the 'FullName' column in the dataframe. Now, let's rename the other columns to match the target schema.\",\n", + " \"generation_info\": {\n", + " \"finish_reason\": \"function_call\"\n", + " },\n", + " \"message\": {\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"schema\",\n", + " \"messages\",\n", + " \"AIMessage\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"content\": \"The 'firstname' and 'lastname' columns have been combined to form the 'FullName' column in the dataframe. Now, let's rename the other columns to match the target schema.\",\n", + " \"additional_kwargs\": {\n", + " \"function_call\": {\n", + " \"name\": \"python_repl_ast\",\n", + " \"arguments\": \"\\n{\\n\\\"__arg1\\\": \\\"df = df.rename(columns={'case_date': 'CaseDate', 'case_type': 'CaseType', 'case_id': 'CaseID', 'court_fee': 'Fee', 'jurisdiction': 'Jurisdiction'})\\\"\\n}\"\n", + " }\n", + " }\n", + " }\n", + " }\n", + " }\n", + " ]\n", + " ],\n", + " \"llm_output\": {\n", + " \"token_usage\": {\n", + " \"prompt_tokens\": 781,\n", + " \"completion_tokens\": 100,\n", + " \"total_tokens\": 881\n", + " },\n", + " \"model_name\": \"gpt-4-0613\"\n", + " },\n", + " \"run\": null\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[tool/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 5:tool:python_repl_ast] Entering Tool run with input:\n", + "\u001b[0m\"df = df.rename(columns={'case_date': 'CaseDate', 'case_type': 'CaseType', 'case_id': 'CaseID', 'court_fee': 'Fee', 'jurisdiction': 'Jurisdiction'})\"\n", + "\u001b[36;1m\u001b[1;3m[tool/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 5:tool:python_repl_ast] [5ms] Exiting Tool run with output:\n", + "\u001b[0m\"\"\n", + "\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 6:llm:ChatOpenAI] Entering LLM run with input:\n", + "\u001b[0m{\n", + " \"prompts\": [\n", + " \"System: \\nYou are working with a pandas dataframe in Python. The name of the dataframe is `df`.\\nThis is the result of `print(df.head())`:\\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\nHuman: \\n df is the source data. Here is the mapping between the source schema and the target schema:\\n table_mappings=[TableMappingEntry(source_column_name='case_date', target_column_name='CaseDate', value_transformations='', explanation=\\\"The source column 'case_date' maps to the target column 'CaseDate' without any value transformations.\\\"), TableMappingEntry(source_column_name='lastname', target_column_name='FullName', value_transformations='', explanation=\\\"The source column 'lastname' maps to the target column 'FullName' without any value transformations.\\\"), TableMappingEntry(source_column_name='firstname', target_column_name='', value_transformations='', explanation=\\\"The source column 'firstname' does not have a direct mapping in the target schema. The values from 'firstname' need to be combined with 'lastname' to form 'FullName' in the target schema.\\\"), TableMappingEntry(source_column_name='case_type', target_column_name='CaseType', value_transformations='', explanation=\\\"The source column 'case_type' maps to the target column 'CaseType' without any value transformations.\\\"), TableMappingEntry(source_column_name='case_id', target_column_name='CaseID', value_transformations='', explanation=\\\"The source column 'case_id' maps to the target column 'CaseID' without any value transformations.\\\"), TableMappingEntry(source_column_name='court_fee', target_column_name='Fee', value_transformations='', explanation=\\\"The source column 'court_fee' maps to the target column 'Fee' without any value transformations.\\\"), TableMappingEntry(source_column_name='jurisdiction', target_column_name='Jurisdiction', value_transformations='', explanation=\\\"The source column 'jurisdiction' maps to the target column 'Jurisdiction' without any value transformations.\\\")]\\n Transform df to the target schema.\\n Do not query the whole df. Use the head, generate the logic, apply it, validate the results.\\n\\nAI: {'name': 'python_repl_ast', 'arguments': '\\\\n{\\\\n\\\"__arg1\\\": \\\"df[\\\\'FullName\\\\'] = df[\\\\'firstname\\\\'] + \\\\' \\\\' + df[\\\\'lastname\\\\']\\\"\\\\n}'}\\nFunction: \\nAI: The 'firstname' and 'lastname' columns have been combined to form the 'FullName' column in the dataframe. Now, let's rename the other columns to match the target schema.{'name': 'python_repl_ast', 'arguments': '\\\\n{\\\\n\\\"__arg1\\\": \\\"df = df.rename(columns={\\\\'case_date\\\\': \\\\'CaseDate\\\\', \\\\'case_type\\\\': \\\\'CaseType\\\\', \\\\'case_id\\\\': \\\\'CaseID\\\\', \\\\'court_fee\\\\': \\\\'Fee\\\\', \\\\'jurisdiction\\\\': \\\\'Jurisdiction\\\\'})\\\"\\\\n}'}\\nFunction:\"\n", + " ]\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 6:llm:ChatOpenAI] [5.68s] Exiting LLM run with output:\n", + "\u001b[0m{\n", + " \"generations\": [\n", + " [\n", + " {\n", + " \"text\": \"The columns have been renamed to match the target schema. Now, let's check the first few rows of the transformed dataframe to validate the results.\",\n", + " \"generation_info\": {\n", + " \"finish_reason\": \"function_call\"\n", + " },\n", + " \"message\": {\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"schema\",\n", + " \"messages\",\n", + " \"AIMessage\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"content\": \"The columns have been renamed to match the target schema. Now, let's check the first few rows of the transformed dataframe to validate the results.\",\n", + " \"additional_kwargs\": {\n", + " \"function_call\": {\n", + " \"name\": \"python_repl_ast\",\n", + " \"arguments\": \"\\n{\\n\\\"__arg1\\\": \\\"df.head()\\\"\\n}\"\n", + " }\n", + " }\n", + " }\n", + " }\n", + " }\n", + " ]\n", + " ],\n", + " \"llm_output\": {\n", + " \"token_usage\": {\n", + " \"prompt_tokens\": 894,\n", + " \"completion_tokens\": 50,\n", + " \"total_tokens\": 944\n", + " },\n", + " \"model_name\": \"gpt-4-0613\"\n", + " },\n", + " \"run\": null\n", + "}\n", + "\u001b[32;1m\u001b[1;3m[tool/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 7:tool:python_repl_ast] Entering Tool run with input:\n", + "\u001b[0m\"df.head()\"\n", + "\u001b[36;1m\u001b[1;3m[tool/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 7:tool:python_repl_ast] [4ms] Exiting Tool run with output:\n", + "\u001b[0m\"CaseDate lastname firstname CaseType CaseID Fee Jurisdiction \\\n", + "0 2023-05-12 Kim Miguel Civil CR-1095 100 BOSTON \n", + "1 2023-04-20 Lee John Criminl CR-8597 150 houston \n", + "2 2023-02-10 Smith Dmitri Criminal CR-6833 200 chicago \n", + "3 2023-03-16 Patel Dmitri Criminal CR-2899 100 BOSTON \n", + "4 2023-06-15 Ivanov Jane Family CR-5997 200 houston \n", + "\n", + " FullName \n", + "0 Miguel Kim \n", + "1 John Lee \n", + "2 Dmitri Smith \n", + "3 Dmitri Patel \n", + "4 Jane Ivanov\"\n", + "\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 8:llm:ChatOpenAI] Entering LLM run with input:\n", + "\u001b[0m{\n", + " \"prompts\": [\n", + " \"System: \\nYou are working with a pandas dataframe in Python. The name of the dataframe is `df`.\\nThis is the result of `print(df.head())`:\\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\nHuman: \\n df is the source data. Here is the mapping between the source schema and the target schema:\\n table_mappings=[TableMappingEntry(source_column_name='case_date', target_column_name='CaseDate', value_transformations='', explanation=\\\"The source column 'case_date' maps to the target column 'CaseDate' without any value transformations.\\\"), TableMappingEntry(source_column_name='lastname', target_column_name='FullName', value_transformations='', explanation=\\\"The source column 'lastname' maps to the target column 'FullName' without any value transformations.\\\"), TableMappingEntry(source_column_name='firstname', target_column_name='', value_transformations='', explanation=\\\"The source column 'firstname' does not have a direct mapping in the target schema. The values from 'firstname' need to be combined with 'lastname' to form 'FullName' in the target schema.\\\"), TableMappingEntry(source_column_name='case_type', target_column_name='CaseType', value_transformations='', explanation=\\\"The source column 'case_type' maps to the target column 'CaseType' without any value transformations.\\\"), TableMappingEntry(source_column_name='case_id', target_column_name='CaseID', value_transformations='', explanation=\\\"The source column 'case_id' maps to the target column 'CaseID' without any value transformations.\\\"), TableMappingEntry(source_column_name='court_fee', target_column_name='Fee', value_transformations='', explanation=\\\"The source column 'court_fee' maps to the target column 'Fee' without any value transformations.\\\"), TableMappingEntry(source_column_name='jurisdiction', target_column_name='Jurisdiction', value_transformations='', explanation=\\\"The source column 'jurisdiction' maps to the target column 'Jurisdiction' without any value transformations.\\\")]\\n Transform df to the target schema.\\n Do not query the whole df. Use the head, generate the logic, apply it, validate the results.\\n\\nAI: {'name': 'python_repl_ast', 'arguments': '\\\\n{\\\\n\\\"__arg1\\\": \\\"df[\\\\'FullName\\\\'] = df[\\\\'firstname\\\\'] + \\\\' \\\\' + df[\\\\'lastname\\\\']\\\"\\\\n}'}\\nFunction: \\nAI: The 'firstname' and 'lastname' columns have been combined to form the 'FullName' column in the dataframe. Now, let's rename the other columns to match the target schema.{'name': 'python_repl_ast', 'arguments': '\\\\n{\\\\n\\\"__arg1\\\": \\\"df = df.rename(columns={\\\\'case_date\\\\': \\\\'CaseDate\\\\', \\\\'case_type\\\\': \\\\'CaseType\\\\', \\\\'case_id\\\\': \\\\'CaseID\\\\', \\\\'court_fee\\\\': \\\\'Fee\\\\', \\\\'jurisdiction\\\\': \\\\'Jurisdiction\\\\'})\\\"\\\\n}'}\\nFunction: \\nAI: The columns have been renamed to match the target schema. Now, let's check the first few rows of the transformed dataframe to validate the results.{'name': 'python_repl_ast', 'arguments': '\\\\n{\\\\n\\\"__arg1\\\": \\\"df.head()\\\"\\\\n}'}\\nFunction: CaseDate lastname firstname CaseType CaseID Fee Jurisdiction \\\\\\n0 2023-05-12 Kim Miguel Civil CR-1095 100 BOSTON \\n1 2023-04-20 Lee John Criminl CR-8597 150 houston \\n2 2023-02-10 Smith Dmitri Criminal CR-6833 200 chicago \\n3 2023-03-16 Patel Dmitri Criminal CR-2899 100 BOSTON \\n4 2023-06-15 Ivanov Jane Family CR-5997 200 houston \\n\\n FullName \\n0 Miguel Kim \\n1 John Lee \\n2 Dmitri Smith \\n3 Dmitri Patel \\n4 Jane Ivanov\"\n", + " ]\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 8:llm:ChatOpenAI] [7.55s] Exiting LLM run with output:\n", + "\u001b[0m{\n", + " \"generations\": [\n", + " [\n", + " {\n", + " \"text\": \"The dataframe has been successfully transformed to match the target schema. The 'FullName' column has been created by combining 'firstname' and 'lastname'. The other columns have been renamed as follows: 'case_date' to 'CaseDate', 'case_type' to 'CaseType', 'case_id' to 'CaseID', 'court_fee' to 'Fee', and 'jurisdiction' to 'Jurisdiction'.\",\n", + " \"generation_info\": {\n", + " \"finish_reason\": \"stop\"\n", + " },\n", + " \"message\": {\n", + " \"lc\": 1,\n", + " \"type\": \"constructor\",\n", + " \"id\": [\n", + " \"langchain\",\n", + " \"schema\",\n", + " \"messages\",\n", + " \"AIMessage\"\n", + " ],\n", + " \"kwargs\": {\n", + " \"content\": \"The dataframe has been successfully transformed to match the target schema. The 'FullName' column has been created by combining 'firstname' and 'lastname'. The other columns have been renamed as follows: 'case_date' to 'CaseDate', 'case_type' to 'CaseType', 'case_id' to 'CaseID', 'court_fee' to 'Fee', and 'jurisdiction' to 'Jurisdiction'.\",\n", + " \"additional_kwargs\": {}\n", + " }\n", + " }\n", + " }\n", + " ]\n", + " ],\n", + " \"llm_output\": {\n", + " \"token_usage\": {\n", + " \"prompt_tokens\": 1146,\n", + " \"completion_tokens\": 84,\n", + " \"total_tokens\": 1230\n", + " },\n", + " \"model_name\": \"gpt-4-0613\"\n", + " },\n", + " \"run\": null\n", + "}\n", + "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor] [24.57s] Exiting Chain run with output:\n", + "\u001b[0m{\n", + " \"output\": \"The dataframe has been successfully transformed to match the target schema. The 'FullName' column has been created by combining 'firstname' and 'lastname'. The other columns have been renamed as follows: 'case_date' to 'CaseDate', 'case_type' to 'CaseType', 'case_id' to 'CaseID', 'court_fee' to 'Fee', and 'jurisdiction' to 'Jurisdiction'.\"\n", + "}\n" + ] + } + ], + "source": [ + "from langchain.agents.agent_toolkits import create_python_agent\n", + "from langchain.tools.python.tool import PythonREPLTool\n", + "from langchain.python import PythonREPL\n", + "from langchain.llms.openai import OpenAI\n", + "from langchain.agents.agent_types import AgentType\n", + "from langchain.chat_models import ChatOpenAI\n", + "import io\n", + "from langchain.agents import create_pandas_dataframe_agent\n", + "\n", + "agent = create_pandas_dataframe_agent(\n", + " ChatOpenAI(temperature=0, model=\"gpt-4-0613\"),\n", + " # ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\"),\n", + " entries_a_df,\n", + " verbose=True,\n", + " agent_type=AgentType.OPENAI_FUNCTIONS,\n", + " handle_parsing_errors=True\n", + ")\n", + "\n", + "question = f'''\n", + " df is the source data. Here is the mapping between the source schema and the target schema:\n", + " {str(table_mapping)}\n", + " Transform df to the target schema.\n", + " Do not query the whole df. Use the head, generate the logic, apply it, validate the results.\n", + "'''\n", + "\n", + "res = agent.run(input=question, verbose=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
case_datelastnamefirstnamecase_typecase_idcourt_feejurisdiction
02023-05-12KimMiguelCivilCR-1095100BOSTON
12023-04-20LeeJohnCriminlCR-8597150houston
22023-02-10SmithDmitriCriminalCR-6833200chicago
32023-03-16PatelDmitriCriminalCR-2899100BOSTON
42023-06-15IvanovJaneFamilyCR-5997200houston
........................
952023-08-13RodriguezJaneFamillyCR-8883200chicago
962023-03-27PatelJohnFamillyCR-2838100BOSTON
972023-07-27OkaforMiguelCriminalCR-3885250new York
982023-01-14IvanovAlanFamilyCR-1066250houston
992023-03-20OkaforOluwaseunCriminlCR-9851200chicago
\n", + "

100 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " case_date lastname firstname case_type case_id court_fee \\\n", + "0 2023-05-12 Kim Miguel Civil CR-1095 100 \n", + "1 2023-04-20 Lee John Criminl CR-8597 150 \n", + "2 2023-02-10 Smith Dmitri Criminal CR-6833 200 \n", + "3 2023-03-16 Patel Dmitri Criminal CR-2899 100 \n", + "4 2023-06-15 Ivanov Jane Family CR-5997 200 \n", + ".. ... ... ... ... ... ... \n", + "95 2023-08-13 Rodriguez Jane Familly CR-8883 200 \n", + "96 2023-03-27 Patel John Familly CR-2838 100 \n", + "97 2023-07-27 Okafor Miguel Criminal CR-3885 250 \n", + "98 2023-01-14 Ivanov Alan Family CR-1066 250 \n", + "99 2023-03-20 Okafor Oluwaseun Criminl CR-9851 200 \n", + "\n", + " jurisdiction \n", + "0 BOSTON \n", + "1 houston \n", + "2 chicago \n", + "3 BOSTON \n", + "4 houston \n", + ".. ... \n", + "95 chicago \n", + "96 BOSTON \n", + "97 new York \n", + "98 houston \n", + "99 chicago \n", + "\n", + "[100 rows x 7 columns]" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "entries_a_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}