Spaces:

andymbryant
/

data-mapper

Runtime error

App Files Files Community

andymbryant committed on Aug 18, 2023

Commit

faeec87

1 Parent(s): 8a2c637

cleaned up prompts

Browse files

Files changed (3) hide show

app.py +7 -15
src/core.py +8 -8
src/prompt.py +19 -27

app.py CHANGED Viewed

@@ -3,39 +3,32 @@ from src.core import get_table_mapping, transform_source, process_csv_text, gene
 MAX_ROWS = 10
-def export_csv(d):
-    filepath = "output.csv"
-    d.to_csv(filepath)
-    return gr.File.update(value=filepath, visible=True)
 def generate_step_markdown(step_number: int, subtitle: str, description: str = None):
     '''Build the Markdown heading component for one step of the UI.

     Args:
         step_number: Number shown in the "Step N" title.
         subtitle: Text shown as the third-level heading under the title.
         description: Optional text placed on the line after the subtitle.

     Returns:
         The gr.Markdown component created from the formatted text.
     '''
     # Interpolating the default None would print the literal word "None" under the subtitle.
     description_text = "" if description is None else description
     return gr.Markdown(f"# Step {step_number}\n\n ### {subtitle}\n{description_text}")
 def export_csv(df, filename):
     '''Write df to filename as CSV without the index column and return a gr.File update for it.'''
     df.to_csv(filename, index=False)
     file_update = gr.File.update(visible=True, value=filename)
     return file_update
 def export_text(val, filename):
     '''Write the string val to filename and return a gr.File update for it.

     Args:
         val: Text to write.
         filename: Path of the file to create or overwrite.

     Returns:
         The result of gr.File.update with the file path as value and visible=True.
     '''
     # Explicit encoding: without it the bytes written depend on the platform locale,
     # and non-ASCII text can fail to encode on some systems.
     with open(filename, "w", encoding="utf-8") as f:
         f.write(val)
     return gr.File.update(value=filename, visible=True)
-def export_transformed_source(d):
-    filename = "transformed_source.csv"
-    d.to_csv(filename)
-    return gr.File.update(value=filename, visible=True)
 with gr.Blocks() as demo:
     # STEP 1
-    generate_step_markdown(1, "Upload a Template CSV (target schema) and a Source CSV.")
     with gr.Row():
         with gr.Column():
             upload_template_btn = gr.UploadButton(label="Upload Template File", file_types = ['.csv'], live=True, file_count = "single")
-            template_df = gr.Dataframe(max_rows=MAX_ROWS)
             upload_template_btn.upload(fn=process_csv_text, inputs=upload_template_btn, outputs=template_df)
         with gr.Column():
             upload_source_button = gr.UploadButton(label="Upload Source File", file_types = ['.csv'], live=True, file_count = "single")
-            source_df = gr.Dataframe(max_rows=MAX_ROWS)
             upload_source_button.upload(fn=process_csv_text, inputs=upload_source_button, outputs=source_df)
     # STEP 2
@@ -75,8 +68,7 @@ with gr.Blocks() as demo:
     with gr.Row():
         transform_btn = gr.Button(value="Transform Source", variant="primary")
     with gr.Row():
-        gr.Markdown("Source (transformed)")
-        source_df_transformed = gr.Dataframe(label="Source Transformed", max_rows=MAX_ROWS)
         transform_btn.click(transform_source, inputs=[source_df, code_block], outputs=[source_df_transformed])
     with gr.Row():

 MAX_ROWS = 10
 def generate_step_markdown(step_number: int, subtitle: str, description: str = None):
     '''Build the Markdown heading component for one step of the UI.

     Args:
         step_number: Number shown in the "Step N" title.
         subtitle: Text shown as the third-level heading under the title.
         description: Optional text placed on the line after the subtitle.

     Returns:
         The gr.Markdown component created from the formatted text.
     '''
     # Interpolating the default None would print the literal word "None" under the subtitle.
     description_text = "" if description is None else description
     return gr.Markdown(f"# Step {step_number}\n\n ### {subtitle}\n{description_text}")
+# TODO: use tempfile
 def export_csv(df, filename):
     '''Write df to filename as CSV without the index column and return a gr.File update for it.'''
     df.to_csv(filename, index=False)
     file_update = gr.File.update(visible=True, value=filename)
     return file_update
+# TODO: use tempfile
 def export_text(val, filename):
     '''Write the string val to filename and return a gr.File update for it.

     Args:
         val: Text to write.
         filename: Path of the file to create or overwrite.

     Returns:
         The result of gr.File.update with the file path as value and visible=True.
     '''
     # Explicit encoding: without it the bytes written depend on the platform locale,
     # and non-ASCII text can fail to encode on some systems.
     with open(filename, "w", encoding="utf-8") as f:
         f.write(val)
     return gr.File.update(value=filename, visible=True)
 with gr.Blocks() as demo:
+    gr.Markdown("# LLM Data Mapper\nThis is a demo of the LangChain platform. It is a tool for generating python code from natural language prompts. This demo is a simple ETL pipeline, where you upload a source CSV and a template CSV, and then generate python code to transform the source CSV into the template CSV. This is a simple example, but the platform can be used for much more complex tasks, such as generating python code from a natural language specification document.")
     # STEP 1
+    generate_step_markdown(1, "Upload a Template CSV and a Source CSV.", "The schema will be extracted from the template file and the source file will be transformed to match the schema.")
     with gr.Row():
         with gr.Column():
             upload_template_btn = gr.UploadButton(label="Upload Template File", file_types = ['.csv'], live=True, file_count = "single")
+            template_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
             upload_template_btn.upload(fn=process_csv_text, inputs=upload_template_btn, outputs=template_df)
         with gr.Column():
             upload_source_button = gr.UploadButton(label="Upload Source File", file_types = ['.csv'], live=True, file_count = "single")
+            source_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
             upload_source_button.upload(fn=process_csv_text, inputs=upload_source_button, outputs=source_df)
     # STEP 2
     with gr.Row():
         transform_btn = gr.Button(value="Transform Source", variant="primary")
     with gr.Row():
+        source_df_transformed = gr.Dataframe(label="Source (transformed)", max_rows=MAX_ROWS)
         transform_btn.click(transform_source, inputs=[source_df, code_block], outputs=[source_df_transformed])
     with gr.Row():

src/core.py CHANGED Viewed

@@ -17,35 +17,33 @@ load_dotenv()
 DATA_DIR_PATH = os.path.join(os.path.dirname(__file__), 'data')
 SYNTHETIC_DATA_DIR_PATH = os.path.join(DATA_DIR_PATH, 'synthetic')
 BASE_MODEL = ChatOpenAI(
     model_name='gpt-4',
     temperature=0,
 )
-def get_dataframes():
-    source = pd.read_csv(os.path.join(SYNTHETIC_DATA_DIR_PATH, 'legal_entries_a.csv'))
-    template = pd.read_csv(os.path.join(SYNTHETIC_DATA_DIR_PATH, 'legal_template.csv'))
-    return source, template
-def get_data_str_from_df_for_prompt(df, num_rows_to_return=NUM_ROWS_TO_RETURN):
     return f'<df>\n{df.head(num_rows_to_return).to_markdown()}\n</df>'
 def get_table_mapping(source_df, template_df):
     table_mapping_parser = PydanticOutputParser(pydantic_object=TableMapping)
     analyst_prompt = ChatPromptTemplate.from_template(
         template=DATA_SCIENTIST_PROMPT_STR,
         partial_variables={'format_instructions': table_mapping_parser.get_format_instructions()},
     )
     mapping_chain = analyst_prompt | BASE_MODEL | table_mapping_parser
-    table_mapping: TableMapping = mapping_chain.invoke({"source_1_csv_str": get_data_str_from_df_for_prompt(source_df), "target_csv_str": get_data_str_from_df_for_prompt(template_df)})
     return pd.DataFrame(table_mapping.dict()['table_mappings'])
 def _sanitize_python_output(text: str):
     _, after = text.split("```python")
     return after.split("```")[0]
 def generate_mapping_code(table_mapping_df) -> str:
     writer_prompt = ChatPromptTemplate.from_template(SPEC_WRITER_PROMPT_STR)
     engineer_prompt = ChatPromptTemplate.from_template(ENGINEER_PROMPT_STR)
@@ -54,6 +52,7 @@ def generate_mapping_code(table_mapping_df) -> str:
     return engineer_chain.invoke({"table_mapping": str(table_mapping_df.to_dict())})
 def process_csv_text(temp_file):
     if isinstance(temp_file, str):
       df = pd.read_csv(io.StringIO(temp_file))
     else:
@@ -61,4 +60,5 @@ def process_csv_text(temp_file):
     return df
 def transform_source(source_df, code_text: str):
     '''Run code_text in a PythonAstREPLTool whose locals expose source_df, and return the tool's result.'''
     # NOTE(review): code_text is executed as Python. Presumably it is the generated
     # mapping code shown in the UI — confirm it can never come from an untrusted source.
     return PythonAstREPLTool(locals={'source_df': source_df}).run(code_text)

 DATA_DIR_PATH = os.path.join(os.path.dirname(__file__), 'data')
 SYNTHETIC_DATA_DIR_PATH = os.path.join(DATA_DIR_PATH, 'synthetic')
+# TODO: consider different models for different prompts, e.g. natural language prompt might be better with higher temperature
 BASE_MODEL = ChatOpenAI(
     model_name='gpt-4',
     temperature=0,
 )
+def _get_data_str_from_df_for_prompt(df, num_rows_to_return=NUM_ROWS_TO_RETURN):
     return f'<df>\n{df.head(num_rows_to_return).to_markdown()}\n</df>'
 def get_table_mapping(source_df, template_df):
+    '''Use PydanticOutputParser to parse the output of the Data Scientist prompt into a TableMapping object.'''
     table_mapping_parser = PydanticOutputParser(pydantic_object=TableMapping)
     analyst_prompt = ChatPromptTemplate.from_template(
         template=DATA_SCIENTIST_PROMPT_STR,
         partial_variables={'format_instructions': table_mapping_parser.get_format_instructions()},
     )
     mapping_chain = analyst_prompt | BASE_MODEL | table_mapping_parser
+    table_mapping: TableMapping = mapping_chain.invoke({"source_1_csv_str": _get_data_str_from_df_for_prompt(source_df), "target_csv_str": _get_data_str_from_df_for_prompt(template_df)})
     return pd.DataFrame(table_mapping.dict()['table_mappings'])
 def _sanitize_python_output(text: str):
+    '''Remove markdown from python code, as prompt returns it.'''
     _, after = text.split("```python")
     return after.split("```")[0]
 def generate_mapping_code(table_mapping_df) -> str:
+    '''Chain two prompts together to generate python code from a table mapping: 1. technical spec writer, 2. python engineer'''
     writer_prompt = ChatPromptTemplate.from_template(SPEC_WRITER_PROMPT_STR)
     engineer_prompt = ChatPromptTemplate.from_template(ENGINEER_PROMPT_STR)
     return engineer_chain.invoke({"table_mapping": str(table_mapping_df.to_dict())})
 def process_csv_text(temp_file):
+    '''Process a CSV file into a dataframe, either from a string or a file.'''
     if isinstance(temp_file, str):
       df = pd.read_csv(io.StringIO(temp_file))
     else:
     return df
 def transform_source(source_df, code_text: str):
+    '''Use PythonAstREPLTool to transform a source dataframe using python code.'''
     return PythonAstREPLTool(locals={'source_df': source_df}).run(code_text)

src/prompt.py CHANGED Viewed

@@ -39,36 +39,28 @@ Your response:
 SPEC_WRITER_PROMPT_STR = '''
-You are an expert product manager and technical writer for a software company, who generates clean, concise, precise specification documents for your employees.
-Your job is to write a plaintext spec for a python script for a software engineer to develop a component within an ETL pipeline.
-This document must include 100% of the information your employee needs to write a successful script to transform source_df to target_df.
-However, DO NOT include the original table_mapping. Your job is to translate everything into natural language.
-Here is a stringified python dictionary that describes the mapping and the transformation steps:
 {table_mapping}
-You must translate this into clean, concise, and complete instructions for your employee.
-This document should be formatted like a technical document in plaintext. Do not include code or data.
-This document must include:
-- Overview
-- Input (source_df)
-- Output (target_df)
-- Exact column mapping
-- Exact transformation steps for each column
-- Precise instructions for what this script should do
-- Do not modify the source_df. Create a new dataframe named target_df.
-- This script should never include the source data. It should only include the transformations required to create the target_df.
-- Return the target_df
-You will never see this employee. They cannot contact you. You will never see their code. You must include 100% of the information they need to write a successful script.
-Remember:
-- Clean: No extra information, no formatting aside from plaintext
-- Concise: Your employees benefit from brevity
-- Precise: your words must be unambiguous, exact, and fully represent a perfect translation of incoming python dict.
 Your response:
 '''

 SPEC_WRITER_PROMPT_STR = '''
+You are a product manager and technical writer for a software firm. Your task is to draft a specification document for a software engineer to design a component within an ETL pipeline, converting `source_df` to `target_df` using the provided mapping:
 {table_mapping}
+Translate this information into clear, succinct instructions. Avoid including the raw `table_mapping` or any code.
+The specification should encompass:
+- **Overview**: A brief summary of the task.
+- **Input**: Description of `source_df`.
+- **Output**: Description of `target_df`.
+- **Column Mapping**: Clearly define how columns from the source map to the target.
+- **Transformations**: Detail the transformations required for each column.
+- **Instructions**: The script should:
+  - Not modify `source_df`.
+  - Generate a new dataframe named `target_df`.
+  - Not incorporate any source data, only transformations.
+  - Return `target_df`.
+This will be your only communication to the engineer. Ensure it's:
+- **Clear**: Eliminate any unnecessary details.
+- **Concise**: Aim for brevity.
+- **Precise**: Be unambiguous and exact.
 Your response:
 '''