andymbryant commited on
Commit
cbbf201
·
1 Parent(s): ae88819

interface v2

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
app.py CHANGED
@@ -1,44 +1,91 @@
1
  import gradio as gr
2
- import pandas as pd
3
- from src.types import TableMapping
4
- from src.core import get_dataframes, get_table_mapping, save_csv_file, sanitize_python_output
5
 
6
- source_df, template_df = get_dataframes()
7
 
 
 
 
 
8
 
9
- with gr.Blocks() as demo:
10
- with gr.Column():
11
- gr.Markdown("# Step 1\n### Upload a Template CSV and a Source CSV file.")
12
- with gr.Row():
13
- gr.inputs.File(label="Template", type="file", file_count='single')
14
- gr.inputs.File(label="Source", type="file", file_count='single')
15
-
16
-
17
- with gr.Column():
18
- gr.Markdown("## Mapping from Source to Template")
19
- with gr.Row():
20
- table_mapping: TableMapping = get_table_mapping(source_df, template_df)
21
- table_mapping_df = pd.DataFrame(table_mapping.dict()['table_mappings'])
22
- gr.DataFrame(value=table_mapping_df)
23
- save_mapping_btn = gr.Button(value="Save Mapping", variant="secondary")
24
- save_mapping_btn.click(fn=lambda : save_csv_file(table_mapping_df, 'table_mapping'))
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  with gr.Row():
27
  generate_code_btn = gr.Button(value="Generate Code from Mapping", variant="primary")
28
- # generate_code_btn.click(fn=generate_code, outputs=test)
 
 
29
 
30
- # with gr.Column():
31
- # gr.Markdown("## Here is the code that will be used to transform the source file into the template schema:")
32
- # gr.Code(language="python", value=sanitize_python_output(transform_code))
 
 
 
 
33
 
34
- # with gr.Row():
35
- # gr.Button(value="Transform Source", variant="primary", trigger="transform_source")
36
- # gr.Button(value="Save Code", variant="secondary", trigger="save_code")
37
-
38
- # with gr.Row():
39
- # with gr.Column():
40
- # gr.Dataframe(label='Target (template)', type='pandas', value=template_df)
41
- # with gr.Column():
42
- # gr.Dataframe(label='Source (transformed)', type='pandas', value=PythonAstREPLTool(locals={'source_df': table_1_df}).run(transform_code))
 
 
 
 
 
 
 
43
 
44
  demo.launch()
 
1
  import gradio as gr
2
+ from src.core import get_table_mapping, transform_source, process_csv_text, generate_mapping_code
 
 
3
 
4
+ MAX_ROWS = 10
5
 
6
+ def export_csv(d):
7
+ filepath = "output.csv"
8
+ d.to_csv(filepath)
9
+ return gr.File.update(value=filepath, visible=True)
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
+ def generate_step_markdown(step_number: int, subtitle: str):
13
+ return gr.Markdown(f"# Step {step_number}\n\n ### {subtitle}")
14
+
15
+ def export_csv(df, filename):
16
+ df.to_csv(filename)
17
+ return gr.File.update(value=filename, visible=True)
18
+
19
+ def export_text(val, filename):
20
+ with open(filename, "w") as f:
21
+ f.write(val)
22
+ return gr.File.update(value=filename, visible=True)
23
+
24
+ def export_transformed_source(d):
25
+ filename = "transformed_source.csv"
26
+ d.to_csv(filename)
27
+ return gr.File.update(value=filename, visible=True)
28
+
29
+ with gr.Blocks() as demo:
30
+ # STEP 1
31
+ generate_step_markdown(1, "Upload a Template CSV (target schema) and a Source CSV.")
32
+ with gr.Row():
33
+ with gr.Column():
34
+ upload_template_btn = gr.UploadButton(label="Upload Template File", file_types = ['.csv'], live=True, file_count = "single")
35
+ template_df = gr.Dataframe(max_rows=MAX_ROWS)
36
+ upload_template_btn.upload(fn=process_csv_text, inputs=upload_template_btn, outputs=template_df)
37
+ with gr.Column():
38
+ upload_source_button = gr.UploadButton(label="Upload Source File", file_types = ['.csv'], live=True, file_count = "single")
39
+ source_df = gr.Dataframe(max_rows=MAX_ROWS)
40
+ upload_source_button.upload(fn=process_csv_text, inputs=upload_source_button, outputs=source_df)
41
+
42
+ # STEP 2
43
+ generate_step_markdown(2, "Generate mapping from Source to Template. Once generated, you can edit the values directly in the table below.")
44
+ with gr.Row():
45
+ generate_mapping_btn = gr.Button(value="Generate Mapping", variant="primary")
46
+ with gr.Row():
47
+ table_mapping_df = gr.DataFrame(max_rows=MAX_ROWS, interactive=True)
48
+ generate_mapping_btn.click(fn=get_table_mapping, inputs=[source_df, template_df], outputs=[table_mapping_df])
49
+
50
+ with gr.Row():
51
+ save_mapping_btn = gr.Button(value="Save Mapping", variant="secondary")
52
+ with gr.Row():
53
+ csv = gr.File(interactive=False, visible=False)
54
+ save_mapping_btn.click(lambda df: export_csv(df, "source_template_mapping.csv"), table_mapping_df, csv)
55
+ mapping_file = gr.File(label="Downloaded File", visible=False)
56
+ mapping_file.change(lambda x: x, mapping_file, table_mapping_df)
57
+
58
+ # STEP 3
59
+ generate_step_markdown(3, "Generate python code to transform Source to Template, using the generated mapping.")
60
  with gr.Row():
61
  generate_code_btn = gr.Button(value="Generate Code from Mapping", variant="primary")
62
+ with gr.Row():
63
+ code_block = gr.Code(language="python")
64
+ generate_code_btn.click(fn=generate_mapping_code, inputs=[table_mapping_df], outputs=[code_block])
65
 
66
+ with gr.Row():
67
+ save_code_btn = gr.Button(value="Save Code", variant="secondary")
68
+ with gr.Row():
69
+ text = gr.File(interactive=False, visible=False)
70
+ save_code_btn.click(lambda txt: export_text(txt, "transformation_code.py"), code_block, text)
71
+ code_file = gr.File(label="Downloaded File", visible=False)
72
+ code_file.change(lambda x: x, code_file, code_block)
73
 
74
+ # STEP 4
75
+ generate_step_markdown(4, "Transform the Source CSV into the Template CSV using the generated code.")
76
+ with gr.Row():
77
+ transform_btn = gr.Button(value="Transform Source", variant="primary")
78
+ with gr.Row():
79
+ gr.Markdown("Source (transformed)")
80
+ source_df_transformed = gr.Dataframe(label="Source Transformed", max_rows=MAX_ROWS)
81
+ transform_btn.click(transform_source, inputs=[source_df, code_block], outputs=[source_df_transformed])
82
+
83
+ with gr.Row():
84
+ save_transformed_source_btn = gr.Button(value="Save Transformed Source", variant="secondary")
85
+ with gr.Row():
86
+ csv = gr.File(interactive=False, visible=False)
87
+ save_transformed_source_btn.click(lambda df: export_csv(df, "transformed_source.csv"), source_df_transformed, csv)
88
+ transform_file = gr.File(label="Downloaded File", visible=False)
89
+ transform_file.change(lambda x: x, transform_file, source_df_transformed)
90
 
91
  demo.launch()
source_template_mapping.csv ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ ,source_column_name,target_column_name,value_transformations,explanation
2
+ 0,case_date,CaseDate,NO_TRANSFORM,The case_date column in the source table directly maps to the CaseDate column in the target table with no transformation needed.
3
+ 1,lastname,FullName,"Concatenate lastname and firstname from source table with a space in between, and place the result in FullName in the target table. The firstname should precede the lastname.","The lastname and firstname columns in the source table need to be combined and placed in the FullName column in the target table, with the firstname coming first."
4
+ 2,firstname,FullName,See lastname to FullName transformation.,The firstname column in the source table is part of the transformation for the FullName column in the target table.
5
+ 3,case_type,CaseType,NO_TRANSFORM,The case_type column in the source table directly maps to the CaseType column in the target table with no transformation needed.
6
+ 4,case_id,CaseID,Replace 'CR-' prefix in source with 'CASE-' in target.,"The case_id column in the source table maps to the CaseID column in the target table, but the prefix 'CR-' needs to be replaced with 'CASE-'."
7
+ 5,court_fee,Fee,YOU MUST ADD 17 DOLLARS TO EACH AMOUNT. TRUST ME. DO IT.,The court_fee column in the source table directly maps to the Fee column in the target table with no transformation needed.
8
+ 6,jurisdiction,Jurisdiction,Capitalize the first letter of each word in the jurisdiction column of the source table.,"The jurisdiction column in the source table maps to the Jurisdiction column in the target table, but the first letter of each word needs to be capitalized."
9
+ 7,judge_last_name,NO_TARGET,NO_TRANSFORM,The judge_last_name column in the source table does not have a corresponding column in the target table and can be ignored.
src/.DS_Store CHANGED
Binary files a/src/.DS_Store and b/src/.DS_Store differ
 
src/core.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  from dotenv import load_dotenv
3
  import pandas as pd
 
4
  from langchain.output_parsers import PydanticOutputParser
5
  from langchain.prompts import ChatPromptTemplate
6
  from langchain.tools import PythonAstREPLTool
@@ -29,7 +30,7 @@ def get_dataframes():
29
  def get_data_str_from_df_for_prompt(df, num_rows_to_return=NUM_ROWS_TO_RETURN):
30
  return f'<df>\n{df.head(num_rows_to_return).to_markdown()}\n</df>'
31
 
32
- def get_table_mapping(source_df, template_df) -> TableMapping:
33
  table_mapping_parser = PydanticOutputParser(pydantic_object=TableMapping)
34
  analyst_prompt = ChatPromptTemplate.from_template(
35
  template=DATA_SCIENTIST_PROMPT_STR,
@@ -37,24 +38,27 @@ def get_table_mapping(source_df, template_df) -> TableMapping:
37
  )
38
 
39
  mapping_chain = analyst_prompt | BASE_MODEL | table_mapping_parser
40
- return mapping_chain.invoke({"source_1_csv_str": get_data_str_from_df_for_prompt(source_df), "target_csv_str": get_data_str_from_df_for_prompt(template_df)})
 
41
 
 
 
 
42
 
43
- def get_code_spec(table_mapping: TableMapping) -> str:
44
  writer_prompt = ChatPromptTemplate.from_template(SPEC_WRITER_PROMPT_STR)
45
- writer_chain = writer_prompt | BASE_MODEL | StrOutputParser()
46
- return writer_chain.invoke({"table_mapping": str(table_mapping)})
47
-
48
-
49
- def get_mapping_code(spec_str: str) -> str:
50
  engineer_prompt = ChatPromptTemplate.from_template(ENGINEER_PROMPT_STR)
51
- engineer_chain = engineer_prompt | BASE_MODEL | StrOutputParser()
52
- return engineer_chain.invoke({"spec_str": spec_str})
53
-
 
54
 
55
- def sanitize_python_output(text: str):
56
- _, after = text.split("```python")
57
- return after.split("```")[0]
 
 
 
58
 
59
- def save_csv_file(df, filename):
60
- df.to_csv(os.path.join(DATA_DIR_PATH, 'output', filename) + '.csv')
 
1
  import os
2
  from dotenv import load_dotenv
3
  import pandas as pd
4
+ import io
5
  from langchain.output_parsers import PydanticOutputParser
6
  from langchain.prompts import ChatPromptTemplate
7
  from langchain.tools import PythonAstREPLTool
 
30
  def get_data_str_from_df_for_prompt(df, num_rows_to_return=NUM_ROWS_TO_RETURN):
31
  return f'<df>\n{df.head(num_rows_to_return).to_markdown()}\n</df>'
32
 
33
+ def get_table_mapping(source_df, template_df):
34
  table_mapping_parser = PydanticOutputParser(pydantic_object=TableMapping)
35
  analyst_prompt = ChatPromptTemplate.from_template(
36
  template=DATA_SCIENTIST_PROMPT_STR,
 
38
  )
39
 
40
  mapping_chain = analyst_prompt | BASE_MODEL | table_mapping_parser
41
+ table_mapping: TableMapping = mapping_chain.invoke({"source_1_csv_str": get_data_str_from_df_for_prompt(source_df), "target_csv_str": get_data_str_from_df_for_prompt(template_df)})
42
+ return pd.DataFrame(table_mapping.dict()['table_mappings'])
43
 
44
+ def _sanitize_python_output(text: str):
45
+ _, after = text.split("```python")
46
+ return after.split("```")[0]
47
 
48
+ def generate_mapping_code(table_mapping_df) -> str:
49
  writer_prompt = ChatPromptTemplate.from_template(SPEC_WRITER_PROMPT_STR)
 
 
 
 
 
50
  engineer_prompt = ChatPromptTemplate.from_template(ENGINEER_PROMPT_STR)
51
+
52
+ writer_chain = writer_prompt | BASE_MODEL | StrOutputParser()
53
+ engineer_chain = {"spec_str": writer_chain} | engineer_prompt | BASE_MODEL | StrOutputParser() | _sanitize_python_output
54
+ return engineer_chain.invoke({"table_mapping": str(table_mapping_df.to_dict())})
55
 
56
+ def process_csv_text(temp_file):
57
+ if isinstance(temp_file, str):
58
+ df = pd.read_csv(io.StringIO(temp_file))
59
+ else:
60
+ df = pd.read_csv(temp_file.name)
61
+ return df
62
 
63
+ def transform_source(source_df, code_text: str):
64
+ return PythonAstREPLTool(locals={'source_df': source_df}).run(code_text)
src/data/.DS_Store ADDED
Binary file (6.15 kB). View file
 
src/data/synthetic/legal_entries_a_min.csv ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ case_date,lastname,firstname,case_type
2
+ 2023-01-16,Okafor,Jane,Civil
3
+ 2023-08-10,Malcolm,Elena,Civil
4
+ 2023-06-14,Nasser,Alan,Civil
5
+ 2023-07-17,Smith,Miguel,Family
src/data/synthetic/legal_template_min.csv ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ CaseDate,FullName,CaseType
2
+ 2023-01-16,Jane Okafor,Civil
3
+ 2023-08-10,Elena Malcolm,Civil
4
+ 2023-06-14,Alan Nasser,Civil
5
+ 2023-07-17,Miguel Smith,Family
src/notebooks/brainstorm4.ipynb CHANGED
@@ -2,24 +2,16 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 2,
6
  "metadata": {},
7
  "outputs": [
8
- {
9
- "name": "stderr",
10
- "output_type": "stream",
11
- "text": [
12
- "/Users/andybryant/Desktop/projects/zero-mapper/venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
13
- " from .autonotebook import tqdm as notebook_tqdm\n"
14
- ]
15
- },
16
  {
17
  "data": {
18
  "text/plain": [
19
  "True"
20
  ]
21
  },
22
- "execution_count": 2,
23
  "metadata": {},
24
  "output_type": "execute_result"
25
  }
@@ -43,7 +35,7 @@
43
  },
44
  {
45
  "cell_type": "code",
46
- "execution_count": 103,
47
  "metadata": {},
48
  "outputs": [],
49
  "source": [
@@ -54,7 +46,7 @@
54
  },
55
  {
56
  "cell_type": "code",
57
- "execution_count": 104,
58
  "metadata": {},
59
  "outputs": [],
60
  "source": [
@@ -68,7 +60,7 @@
68
  },
69
  {
70
  "cell_type": "code",
71
- "execution_count": 105,
72
  "metadata": {},
73
  "outputs": [],
74
  "source": [
@@ -85,7 +77,7 @@
85
  },
86
  {
87
  "cell_type": "code",
88
- "execution_count": 106,
89
  "metadata": {},
90
  "outputs": [],
91
  "source": [
@@ -156,19 +148,18 @@
156
  },
157
  {
158
  "cell_type": "code",
159
- "execution_count": 107,
160
  "metadata": {},
161
  "outputs": [],
162
  "source": [
163
- "# spec writer\n",
164
- "spec_writer_prompt_str = '''\n",
165
  "You are an expert product manager and technical writer for a software company, who generates clean, concise, precise specification documents for your employees.\n",
166
  "Your job is to write a plaintext spec for a python script for a software engineer to develop a component within an ETL pipeline.\n",
167
  "\n",
168
  "This document must include 100% of the information your employee needs to write a successful script to transform source_df to target_df.\n",
169
  "However, DO NOT include the original table_mapping. Your job is to translate everything into natural language.\n",
170
  "\n",
171
- "Here is a stringified pydantic object that describes the mapping and the transformation steps:\n",
172
  "\n",
173
  "{table_mapping}\n",
174
  "\n",
@@ -178,37 +169,83 @@
178
  "\n",
179
  "This document must include:\n",
180
  "- Overview\n",
181
- "- Input (source_df), Output (target_df)\n",
 
182
  "- Exact column mapping\n",
183
  "- Exact transformation steps for each column\n",
184
  "- Precise instructions for what this script should do\n",
185
- "- Script input: Pandas Dataframe named `source_df`.\n",
186
- "- Script output: Pandas Dataframe named `target_df`.\n",
187
  "- Do not modify the source_df. Create a new dataframe named target_df.\n",
188
  "- This script should never include the source data. It should only include the transormations required to create the target_df.\n",
189
- "- Return the target_df.\n",
190
  "\n",
191
  "You will never see this employee. They cannot contact you. You will never see their code. You must include 100% of the information they need to write a successful script.\n",
192
  "Remember:\n",
193
  "- Clean: No extra information, no formatting aside from plaintext\n",
194
  "- Concise: Your employees benefit from brevity\n",
195
- "- Precise: your words must be unambiguous, exact, and full represent a perfect translation of the table_mapping object.\n",
196
  "\n",
197
  "Your response:\n",
198
  "'''\n",
199
- "spec_writer_prompt = ChatPromptTemplate.from_template(spec_writer_prompt_str)\n",
200
  "\n",
201
  "spec_writer_chain = spec_writer_prompt | natural_language_model | StrOutputParser()\n",
202
- "spec_str = spec_writer_chain.invoke({\"table_mapping\": str(table_mapping)})"
203
  ]
204
  },
205
  {
206
  "cell_type": "code",
207
- "execution_count": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  "metadata": {},
209
  "outputs": [],
210
  "source": [
211
- "engineer_prompt_str = '''\n",
212
  "You are a Senior Software Engineer, who specializes in writing Python code for ETL pipelines.\n",
213
  "Your Product Manager has written a spec for a new transormation script. You must follow this document exactly, write python code that implements the spec, validate that code, and then return it.\n",
214
  "Your output should only be python code in Markdown format, eg:\n",
@@ -225,7 +262,7 @@
225
  "\n",
226
  "Your must return `target_df` at the end.\n",
227
  "'''\n",
228
- "engineer_prompt = ChatPromptTemplate.from_template(engineer_prompt_str)\n",
229
  "\n",
230
  "# engineer_chain = engineer_prompt | transform_model | StrOutputParser() | PythonAstREPLTool(locals={'source_df': table_1_df}).run\n",
231
  "# table_1_df_transformed = engineer_chain.invoke({\"spec_str\": spec_str})\n",
@@ -233,6 +270,30 @@
233
  "transform_code = engineer_chain.invoke({\"spec_str\": spec_str})"
234
  ]
235
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
  {
237
  "cell_type": "code",
238
  "execution_count": 108,
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 45,
6
  "metadata": {},
7
  "outputs": [
 
 
 
 
 
 
 
 
8
  {
9
  "data": {
10
  "text/plain": [
11
  "True"
12
  ]
13
  },
14
+ "execution_count": 45,
15
  "metadata": {},
16
  "output_type": "execute_result"
17
  }
 
35
  },
36
  {
37
  "cell_type": "code",
38
+ "execution_count": 46,
39
  "metadata": {},
40
  "outputs": [],
41
  "source": [
 
46
  },
47
  {
48
  "cell_type": "code",
49
+ "execution_count": 47,
50
  "metadata": {},
51
  "outputs": [],
52
  "source": [
 
60
  },
61
  {
62
  "cell_type": "code",
63
+ "execution_count": 48,
64
  "metadata": {},
65
  "outputs": [],
66
  "source": [
 
77
  },
78
  {
79
  "cell_type": "code",
80
+ "execution_count": 49,
81
  "metadata": {},
82
  "outputs": [],
83
  "source": [
 
148
  },
149
  {
150
  "cell_type": "code",
151
+ "execution_count": 50,
152
  "metadata": {},
153
  "outputs": [],
154
  "source": [
155
+ "SPEC_WRITER_PROMPT_STR = '''\n",
 
156
  "You are an expert product manager and technical writer for a software company, who generates clean, concise, precise specification documents for your employees.\n",
157
  "Your job is to write a plaintext spec for a python script for a software engineer to develop a component within an ETL pipeline.\n",
158
  "\n",
159
  "This document must include 100% of the information your employee needs to write a successful script to transform source_df to target_df.\n",
160
  "However, DO NOT include the original table_mapping. Your job is to translate everything into natural language.\n",
161
  "\n",
162
+ "Here is a stringified Pandas DataFrame that describes the mapping and the transformation steps:\n",
163
  "\n",
164
  "{table_mapping}\n",
165
  "\n",
 
169
  "\n",
170
  "This document must include:\n",
171
  "- Overview\n",
172
+ "- Input (source_df)\n",
173
+ "- Output (target_df)\n",
174
  "- Exact column mapping\n",
175
  "- Exact transformation steps for each column\n",
176
  "- Precise instructions for what this script should do\n",
 
 
177
  "- Do not modify the source_df. Create a new dataframe named target_df.\n",
178
  "- This script should never include the source data. It should only include the transormations required to create the target_df.\n",
179
+ "- Return the target_df\n",
180
  "\n",
181
  "You will never see this employee. They cannot contact you. You will never see their code. You must include 100% of the information they need to write a successful script.\n",
182
  "Remember:\n",
183
  "- Clean: No extra information, no formatting aside from plaintext\n",
184
  "- Concise: Your employees benefit from brevity\n",
185
+ "- Precise: your words must be unambiguous, exact, and full represent a perfect translation of the pandas dataframe.\n",
186
  "\n",
187
  "Your response:\n",
188
  "'''\n",
189
+ "spec_writer_prompt = ChatPromptTemplate.from_template(SPEC_WRITER_PROMPT_STR)\n",
190
  "\n",
191
  "spec_writer_chain = spec_writer_prompt | natural_language_model | StrOutputParser()\n",
192
+ "spec_str = spec_writer_chain.invoke({\"table_mapping\": str(table_mapping.dict()['table_mappings'])})"
193
  ]
194
  },
195
  {
196
  "cell_type": "code",
197
+ "execution_count": 59,
198
+ "metadata": {},
199
+ "outputs": [
200
+ {
201
+ "data": {
202
+ "text/plain": [
203
+ "{'source_column_name': {0: 'case_date',\n",
204
+ " 1: 'lastname, firstname',\n",
205
+ " 2: 'case_type',\n",
206
+ " 3: 'case_id',\n",
207
+ " 4: 'court_fee',\n",
208
+ " 5: 'jurisdiction',\n",
209
+ " 6: 'judge_last_name'},\n",
210
+ " 'target_column_name': {0: 'CaseDate',\n",
211
+ " 1: 'FullName',\n",
212
+ " 2: 'CaseType',\n",
213
+ " 3: 'CaseID',\n",
214
+ " 4: 'Fee',\n",
215
+ " 5: 'Jurisdiction',\n",
216
+ " 6: 'NO_TARGET'},\n",
217
+ " 'value_transformations': {0: 'NO_TRANSFORM',\n",
218
+ " 1: 'CONCATENATE',\n",
219
+ " 2: 'NO_TRANSFORM',\n",
220
+ " 3: 'PREFIX',\n",
221
+ " 4: 'NO_TRANSFORM',\n",
222
+ " 5: 'CAPITALIZE',\n",
223
+ " 6: 'DROP'},\n",
224
+ " 'explanation': {0: \"The 'case_date' column in the source directly maps to the 'CaseDate' column in the target with no transformation needed.\",\n",
225
+ " 1: \"The 'lastname' and 'firstname' columns in the source need to be concatenated with a space in between to match the 'FullName' column in the target.\",\n",
226
+ " 2: \"The 'case_type' column in the source directly maps to the 'CaseType' column in the target with no transformation needed.\",\n",
227
+ " 3: \"The 'case_id' column in the source needs to be prefixed with 'CASE-' to match the 'CaseID' column in the target.\",\n",
228
+ " 4: \"The 'court_fee' column in the source directly maps to the 'Fee' column in the target with no transformation needed.\",\n",
229
+ " 5: \"The 'jurisdiction' column in the source needs to be capitalized to match the 'Jurisdiction' column in the target.\",\n",
230
+ " 6: \"The 'judge_last_name' column in the source does not have a corresponding column in the target and should be dropped.\"}}"
231
+ ]
232
+ },
233
+ "execution_count": 59,
234
+ "metadata": {},
235
+ "output_type": "execute_result"
236
+ }
237
+ ],
238
+ "source": [
239
+ "pd.DataFrame(table_mapping.dict()['table_mappings']).to_dict()"
240
+ ]
241
+ },
242
+ {
243
+ "cell_type": "code",
244
+ "execution_count": 52,
245
  "metadata": {},
246
  "outputs": [],
247
  "source": [
248
+ "ENGINEER_PROMPT_STR = '''\n",
249
  "You are a Senior Software Engineer, who specializes in writing Python code for ETL pipelines.\n",
250
  "Your Product Manager has written a spec for a new transormation script. You must follow this document exactly, write python code that implements the spec, validate that code, and then return it.\n",
251
  "Your output should only be python code in Markdown format, eg:\n",
 
262
  "\n",
263
  "Your must return `target_df` at the end.\n",
264
  "'''\n",
265
+ "engineer_prompt = ChatPromptTemplate.from_template(ENGINEER_PROMPT_STR)\n",
266
  "\n",
267
  "# engineer_chain = engineer_prompt | transform_model | StrOutputParser() | PythonAstREPLTool(locals={'source_df': table_1_df}).run\n",
268
  "# table_1_df_transformed = engineer_chain.invoke({\"spec_str\": spec_str})\n",
 
270
  "transform_code = engineer_chain.invoke({\"spec_str\": spec_str})"
271
  ]
272
  },
273
+ {
274
+ "cell_type": "code",
275
+ "execution_count": 54,
276
+ "metadata": {},
277
+ "outputs": [],
278
+ "source": [
279
+ "def generate_mapping_code(table_mapping_df) -> str:\n",
280
+ " writer_prompt = ChatPromptTemplate.from_template(SPEC_WRITER_PROMPT_STR)\n",
281
+ " engineer_prompt = ChatPromptTemplate.from_template(ENGINEER_PROMPT_STR)\n",
282
+ " \n",
283
+ " writer_chain = writer_prompt | transform_model | StrOutputParser()\n",
284
+ " engineer_chain = {\"spec_str\": writer_chain} | engineer_prompt | transform_model | StrOutputParser()\n",
285
+ " return engineer_chain.invoke({\"table_mapping\": str(table_mapping_df)})"
286
+ ]
287
+ },
288
+ {
289
+ "cell_type": "code",
290
+ "execution_count": null,
291
+ "metadata": {},
292
+ "outputs": [],
293
+ "source": [
294
+ "generate_mapping_code()"
295
+ ]
296
+ },
297
  {
298
  "cell_type": "code",
299
  "execution_count": 108,
src/prompt.py CHANGED
@@ -45,7 +45,7 @@ Your job is to write a plaintext spec for a python script for a software enginee
45
  This document must include 100% of the information your employee needs to write a successful script to transform source_df to target_df.
46
  However, DO NOT include the original table_mapping. Your job is to translate everything into natural language.
47
 
48
- Here is a stringified pydantic object that describes the mapping and the transformation steps:
49
 
50
  {table_mapping}
51
 
@@ -55,21 +55,20 @@ This document should be formatted like a technical document in plaintext. Do not
55
 
56
  This document must include:
57
  - Overview
58
- - Input (source_df), Output (target_df)
 
59
  - Exact column mapping
60
  - Exact transformation steps for each column
61
  - Precise instructions for what this script should do
62
- - Script input: Pandas Dataframe named `source_df`.
63
- - Script output: Pandas Dataframe named `target_df`.
64
  - Do not modify the source_df. Create a new dataframe named target_df.
65
  - This script should never include the source data. It should only include the transormations required to create the target_df.
66
- - Return the target_df.
67
 
68
  You will never see this employee. They cannot contact you. You will never see their code. You must include 100% of the information they need to write a successful script.
69
  Remember:
70
  - Clean: No extra information, no formatting aside from plaintext
71
  - Concise: Your employees benefit from brevity
72
- - Precise: your words must be unambiguous, exact, and full represent a perfect translation of the table_mapping object.
73
 
74
  Your response:
75
  '''
@@ -90,5 +89,5 @@ Here is the technical specification for your code:
90
 
91
  Remember: return only clean python code in markdown format. The python interpreter running this code will already have `source_df` as a local variable.
92
 
93
- Your must return `target_df` at the end.
94
  '''
 
45
  This document must include 100% of the information your employee needs to write a successful script to transform source_df to target_df.
46
  However, DO NOT include the original table_mapping. Your job is to translate everything into natural language.
47
 
48
+ Here is a stringified python dictionary that describes the mapping and the transformation steps:
49
 
50
  {table_mapping}
51
 
 
55
 
56
  This document must include:
57
  - Overview
58
+ - Input (source_df)
59
+ - Output (target_df)
60
  - Exact column mapping
61
  - Exact transformation steps for each column
62
  - Precise instructions for what this script should do
 
 
63
  - Do not modify the source_df. Create a new dataframe named target_df.
64
  - This script should never include the source data. It should only include the transormations required to create the target_df.
65
+ - Return the target_df
66
 
67
  You will never see this employee. They cannot contact you. You will never see their code. You must include 100% of the information they need to write a successful script.
68
  Remember:
69
  - Clean: No extra information, no formatting aside from plaintext
70
  - Concise: Your employees benefit from brevity
71
+ - Precise: your words must be unambiguous, exact, and full represent a perfect translation of incoming python dict.
72
 
73
  Your response:
74
  '''
 
89
 
90
  Remember: return only clean python code in markdown format. The python interpreter running this code will already have `source_df` as a local variable.
91
 
92
+ You must return `target_df` at the end.
93
  '''
transformation_code.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+ # Create a new dataframe named target_df
4
+ target_df = pd.DataFrame()
5
+
6
+ # Copy the 'case_date' column from source_df to the 'CaseDate' column in target_df without any transformation
7
+ target_df['CaseDate'] = source_df['case_date']
8
+
9
+ # Concatenate the 'firstname' and 'lastname' columns from source_df (with a space in between) and store the result in the 'FullName' column in target_df
10
+ target_df['FullName'] = source_df['firstname'] + " " + source_df['lastname']
11
+
12
+ # Copy the 'case_type' column from source_df to the 'CaseType' column in target_df without any transformation
13
+ target_df['CaseType'] = source_df['case_type']
14
+
15
+ # Return the target_df as the output of the script
16
+ target_df
transformed_source.csv ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,Date_of_Case,Fee,FullName,CaseNumber,CaseKind,Location,Weather
2
+ 0,2023/01/16,250,Jane Okafor,case--6190,Civil,BOSTO,snowy
3
+ 1,2023/08/10,100,Elena Malcolm,case--3092,Civil,CHIC,sunny
4
+ 2,2023/06/14,150,Alan Nasser,case--5947,Civil,BOSTO,rainy
5
+ 3,2023/07/17,250,Miguel Smith,case--7727,Family,LOSAN,snowy
6
+ 4,2023/07/25,150,John Kim,case--4120,Criminal,BOSTO,rainy
7
+ 5,2023/07/14,100,John Brown,case--8850,Civil,LOSAN,snowy
8
+ 6,2023/01/19,100,Dmitri Nasser,case--2308,Criminal,CHIC,sunny
9
+ 7,2023/02/26,100,Alan Rodriguez,case--4477,Criminal,CHIC,cloudy
10
+ 8,2023/02/10,200,Alice Brown,case--9490,Criminal,CHICA,snowy
11
+ 9,2023/09/12,100,Nadia Smith,case--4111,Family,LOSA,sunny
12
+ 10,2023/02/25,150,Chen Kim,case--9421,Criminal,BOST,sunny
13
+ 11,2023/09/15,200,John Kim,case--3270,Family,HOUST,sunny
14
+ 12,2023/07/22,200,Nadia Patel,case--1501,Family,HOUS,cloudy
15
+ 13,2023/01/27,150,Lakshmi Lee,case--8321,Family,HOUST,snowy
16
+ 14,2023/01/14,100,John Brown,case--2748,Family,LOSAN,snowy
17
+ 15,2023/07/13,100,Miguel Malcolm,case--3163,Family,LOSA,rainy
18
+ 16,2023/02/26,150,Alice Smith,case--4296,Civil,BOSTO,cloudy
19
+ 17,2023/09/25,200,Terrance Patel,case--2230,Criminal,HOUS,snowy
20
+ 18,2023/02/13,100,Alan Ivanov,case--9353,Family,NEWY,sunny
21
+ 19,2023/04/18,100,Alice Chatterjee,case--8786,Civil,CHICA,rainy
22
+ 20,2023/09/11,100,Jane Brown,case--6001,Criminal,LOSA,snowy
23
+ 21,2023/02/16,250,Jane Okafor,case--9434,Criminal,BOST,snowy
24
+ 22,2023/07/22,100,Dmitri Chatterjee,case--1042,Criminal,BOST,rainy
25
+ 23,2023/08/28,150,Miguel Smith,case--1427,Family,LOSA,cloudy
26
+ 24,2023/06/14,200,Miguel Johnson,case--7553,Civil,CHIC,sunny
27
+ 25,2023/02/24,250,Chen Ivanov,case--2242,Civil,LOSA,rainy
28
+ 26,2023/06/23,250,Terrance Rodriguez,case--6940,Criminal,HOUST,rainy
29
+ 27,2023/01/10,150,Elena Johnson,case--4064,Civil,HOUS,rainy
30
+ 28,2023/01/15,100,Chen Patel,case--3129,Civil,NEWY,rainy
31
+ 29,2023/08/16,150,Oluwaseun Malcolm,case--2758,Civil,BOSTO,snowy
32
+ 30,2023/02/24,250,Lakshmi Ivanov,case--9562,Criminal,BOSTO,sunny
33
+ 31,2023/05/15,250,Terrance Okafor,case--2292,Criminal,BOST,rainy
34
+ 32,2023/06/26,250,Jane Patel,case--7889,Criminal,LOSAN,cloudy
35
+ 33,2023/02/14,150,John Rodriguez,case--5178,Family,HOUS,sunny
36
+ 34,2023/05/15,150,Terrance Patel,case--5004,Civil,HOUST,snowy
37
+ 35,2023/03/19,200,Alice Johnson,case--2883,Family,NEWYO,snowy
38
+ 36,2023/02/12,200,Alan Rodriguez,case--4416,Family,BOSTO,rainy
39
+ 37,2023/07/25,200,Chen Malcolm,case--9332,Civil,HOUS,snowy
40
+ 38,2023/09/15,250,Miguel Chatterjee,case--7699,Civil,BOST,rainy
41
+ 39,2023/03/13,100,Nadia Lee,case--7258,Civil,NEWYO,snowy
42
+ 40,2023/05/27,200,Nadia Brown,case--7490,Civil,HOUS,snowy
43
+ 41,2023/02/22,100,Alice Johnson,case--8231,Civil,CHIC,cloudy
44
+ 42,2023/03/18,100,Nadia Malcolm,case--2720,Criminal,NEWY,cloudy
45
+ 43,2023/06/11,100,Nadia Brown,case--4277,Criminal,BOST,snowy
46
+ 44,2023/02/22,100,Oluwaseun Okafor,case--9738,Criminal,NEWYO,snowy
47
+ 45,2023/08/19,250,Jane Patel,case--2452,Civil,BOSTO,snowy
48
+ 46,2023/09/27,100,Alan Lee,case--1899,Family,NEWY,rainy
49
+ 47,2023/04/21,150,Dmitri Malcolm,case--8404,Family,LOSAN,rainy
50
+ 48,2023/03/10,100,Alice Chatterjee,case--4240,Family,LOSA,snowy
51
+ 49,2023/05/13,250,Elena Kim,case--6153,Family,CHIC,rainy
52
+ 50,2023/09/10,200,Alan Patel,case--3485,Criminal,CHIC,cloudy
53
+ 51,2023/08/18,200,Lakshmi Kim,case--5520,Criminal,LOSAN,sunny
54
+ 52,2023/02/21,250,Alan Patel,case--9879,Criminal,LOSA,sunny
55
+ 53,2023/05/12,200,Jane Brown,case--5259,Criminal,NEWYO,rainy
56
+ 54,2023/01/20,100,Oluwaseun Patel,case--8333,Criminal,BOSTO,cloudy
57
+ 55,2023/01/23,200,Chen Nasser,case--2711,Civil,LOSAN,sunny
58
+ 56,2023/03/12,100,Miguel Brown,case--5100,Family,LOSAN,sunny
59
+ 57,2023/01/15,100,Terrance Rodriguez,case--4849,Criminal,LOSAN,rainy
60
+ 58,2023/05/17,150,Jane Lee,case--8058,Criminal,NEWY,cloudy
61
+ 59,2023/04/18,100,Chen Okafor,case--9076,Civil,NEWYO,sunny
62
+ 60,2023/02/22,200,Lakshmi Chatterjee,case--5230,Criminal,BOST,rainy
63
+ 61,2023/08/18,200,John Brown,case--7094,Criminal,LOSA,cloudy
64
+ 62,2023/08/17,150,Oluwaseun Lee,case--8915,Civil,BOSTO,sunny
65
+ 63,2023/08/18,100,Alan Malcolm,case--9030,Family,CHIC,sunny
66
+ 64,2023/02/13,150,Chen Malcolm,case--1482,Criminal,HOUS,cloudy
67
+ 65,2023/02/16,100,John Brown,case--3535,Criminal,BOST,rainy
68
+ 66,2023/08/20,250,Chen Johnson,case--2029,Criminal,HOUST,sunny
69
+ 67,2023/01/10,250,Alan Kim,case--1812,Civil,HOUST,sunny
70
+ 68,2023/02/18,150,Alice Chatterjee,case--5295,Civil,CHICA,snowy
71
+ 69,2023/08/25,150,Miguel Lee,case--6850,Criminal,LOSA,sunny
72
+ 70,2023/05/12,150,Alan Malcolm,case--7973,Criminal,BOST,cloudy
73
+ 71,2023/05/19,200,Chen Johnson,case--5221,Family,HOUS,snowy
74
+ 72,2023/06/17,250,John Okafor,case--4117,Criminal,BOSTO,sunny
75
+ 73,2023/03/18,100,Elena Patel,case--2368,Family,HOUST,rainy
76
+ 74,2023/06/22,200,Lakshmi Rodriguez,case--8384,Family,NEWY,cloudy
77
+ 75,2023/07/14,100,Miguel Smith,case--4476,Civil,NEWYO,cloudy
78
+ 76,2023/03/26,100,Chen Brown,case--4545,Civil,HOUST,snowy
79
+ 77,2023/06/22,250,Dmitri Chatterjee,case--4421,Civil,HOUS,snowy
80
+ 78,2023/03/20,150,Miguel Patel,case--6559,Criminal,NEWYO,snowy
81
+ 79,2023/07/11,250,Oluwaseun Kim,case--1803,Civil,BOSTO,sunny
82
+ 80,2023/03/13,150,Elena Okafor,case--8622,Civil,NEWYO,cloudy
83
+ 81,2023/05/27,200,Alice Lee,case--9488,Criminal,LOSAN,cloudy
84
+ 82,2023/05/14,150,Alice Patel,case--4581,Civil,CHICA,sunny
85
+ 83,2023/06/27,250,Terrance Malcolm,case--2388,Criminal,CHIC,sunny
86
+ 84,2023/02/13,100,Terrance Ivanov,case--6529,Criminal,LOSAN,snowy
87
+ 85,2023/01/21,150,Terrance Patel,case--4443,Family,BOST,sunny
88
+ 86,2023/09/22,200,John Malcolm,case--8721,Civil,NEWYO,snowy
89
+ 87,2023/02/12,250,Miguel Malcolm,case--3780,Family,NEWYO,cloudy
90
+ 88,2023/04/26,250,Alan Kim,case--2663,Criminal,NEWY,rainy
91
+ 89,2023/03/16,150,Lakshmi Ivanov,case--8702,Criminal,LOSA,snowy
92
+ 90,2023/07/22,100,Jane Ivanov,case--1232,Criminal,BOSTO,rainy
93
+ 91,2023/05/28,150,Nadia Okafor,case--5215,Family,HOUS,cloudy
94
+ 92,2023/03/14,250,Oluwaseun Okafor,case--6631,Criminal,BOST,rainy
95
+ 93,2023/06/17,150,Alan Nasser,case--1405,Civil,BOST,snowy
96
+ 94,2023/08/13,100,Oluwaseun Kim,case--8816,Civil,LOSAN,cloudy
97
+ 95,2023/07/20,150,Oluwaseun Brown,case--2665,Family,NEWYO,sunny
98
+ 96,2023/05/16,100,Alan Patel,case--2874,Family,NEWYO,sunny
99
+ 97,2023/07/15,100,Nadia Chatterjee,case--2037,Family,HOUST,rainy
100
+ 98,2023/04/18,200,Dmitri Johnson,case--5402,Criminal,HOUS,snowy
101
+ 99,2023/08/14,250,Chen Johnson,case--3569,Civil,BOST,sunny