Spaces:
Runtime error
Runtime error
andymbryant
committed on
Commit
•
faeec87
1
Parent(s):
8a2c637
cleaned up prompts
Browse files
- app.py +7 -15
- src/core.py +8 -8
- src/prompt.py +19 -27
app.py
CHANGED
@@ -3,39 +3,32 @@ from src.core import get_table_mapping, transform_source, process_csv_text, gene
|
|
3 |
|
4 |
MAX_ROWS = 10
|
5 |
|
6 |
-
def export_csv(d):
|
7 |
-
filepath = "output.csv"
|
8 |
-
d.to_csv(filepath)
|
9 |
-
return gr.File.update(value=filepath, visible=True)
|
10 |
-
|
11 |
def generate_step_markdown(step_number: int, subtitle: str, description: str = None):
|
12 |
return gr.Markdown(f"# Step {step_number}\n\n ### {subtitle}\n{description}")
|
13 |
|
|
|
14 |
def export_csv(df, filename):
|
15 |
df.to_csv(filename, index=False)
|
16 |
return gr.File.update(value=filename, visible=True)
|
17 |
|
|
|
18 |
def export_text(val, filename):
|
19 |
with open(filename, "w") as f:
|
20 |
f.write(val)
|
21 |
return gr.File.update(value=filename, visible=True)
|
22 |
|
23 |
-
def export_transformed_source(d):
|
24 |
-
filename = "transformed_source.csv"
|
25 |
-
d.to_csv(filename)
|
26 |
-
return gr.File.update(value=filename, visible=True)
|
27 |
-
|
28 |
with gr.Blocks() as demo:
|
|
|
29 |
# STEP 1
|
30 |
-
generate_step_markdown(1, "Upload a Template CSV
|
31 |
with gr.Row():
|
32 |
with gr.Column():
|
33 |
upload_template_btn = gr.UploadButton(label="Upload Template File", file_types = ['.csv'], live=True, file_count = "single")
|
34 |
-
template_df = gr.Dataframe(max_rows=MAX_ROWS)
|
35 |
upload_template_btn.upload(fn=process_csv_text, inputs=upload_template_btn, outputs=template_df)
|
36 |
with gr.Column():
|
37 |
upload_source_button = gr.UploadButton(label="Upload Source File", file_types = ['.csv'], live=True, file_count = "single")
|
38 |
-
source_df = gr.Dataframe(max_rows=MAX_ROWS)
|
39 |
upload_source_button.upload(fn=process_csv_text, inputs=upload_source_button, outputs=source_df)
|
40 |
|
41 |
# STEP 2
|
@@ -75,8 +68,7 @@ with gr.Blocks() as demo:
|
|
75 |
with gr.Row():
|
76 |
transform_btn = gr.Button(value="Transform Source", variant="primary")
|
77 |
with gr.Row():
|
78 |
-
gr.
|
79 |
-
source_df_transformed = gr.Dataframe(label="Source Transformed", max_rows=MAX_ROWS)
|
80 |
transform_btn.click(transform_source, inputs=[source_df, code_block], outputs=[source_df_transformed])
|
81 |
|
82 |
with gr.Row():
|
|
|
3 |
|
4 |
MAX_ROWS = 10
|
5 |
|
|
|
|
|
|
|
|
|
|
|
6 |
def generate_step_markdown(step_number: int, subtitle: str, description: str = None):
|
7 |
return gr.Markdown(f"# Step {step_number}\n\n ### {subtitle}\n{description}")
|
8 |
|
9 |
+
# TODO: use tempfile
|
10 |
def export_csv(df, filename):
|
11 |
df.to_csv(filename, index=False)
|
12 |
return gr.File.update(value=filename, visible=True)
|
13 |
|
14 |
+
# TODO: use tempfile
|
15 |
def export_text(val, filename):
|
16 |
with open(filename, "w") as f:
|
17 |
f.write(val)
|
18 |
return gr.File.update(value=filename, visible=True)
|
19 |
|
|
|
|
|
|
|
|
|
|
|
20 |
with gr.Blocks() as demo:
|
21 |
+
gr.Markdown("# LLM Data Mapper\nThis is a LacThis is a demo of the LangChain platform. It is a tool for generating python code from natural language prompts. This demo is a simple ETL pipeline, where you upload a source CSV and a template CSV, and then generate python code to transform the source CSV into the template CSV. This is a simple example, but the platform can be used for much more complex tasks, such as generating python code from a natural language specification document.")
|
22 |
# STEP 1
|
23 |
+
generate_step_markdown(1, "Upload a Template CSV and a Source CSV.", "The schema will be extracted from the template file and the source file will be transformed to match the schema.")
|
24 |
with gr.Row():
|
25 |
with gr.Column():
|
26 |
upload_template_btn = gr.UploadButton(label="Upload Template File", file_types = ['.csv'], live=True, file_count = "single")
|
27 |
+
template_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
|
28 |
upload_template_btn.upload(fn=process_csv_text, inputs=upload_template_btn, outputs=template_df)
|
29 |
with gr.Column():
|
30 |
upload_source_button = gr.UploadButton(label="Upload Source File", file_types = ['.csv'], live=True, file_count = "single")
|
31 |
+
source_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
|
32 |
upload_source_button.upload(fn=process_csv_text, inputs=upload_source_button, outputs=source_df)
|
33 |
|
34 |
# STEP 2
|
|
|
68 |
with gr.Row():
|
69 |
transform_btn = gr.Button(value="Transform Source", variant="primary")
|
70 |
with gr.Row():
|
71 |
+
source_df_transformed = gr.Dataframe(label="Source (transformed)", max_rows=MAX_ROWS)
|
|
|
72 |
transform_btn.click(transform_source, inputs=[source_df, code_block], outputs=[source_df_transformed])
|
73 |
|
74 |
with gr.Row():
|
src/core.py
CHANGED
@@ -17,35 +17,33 @@ load_dotenv()
|
|
17 |
DATA_DIR_PATH = os.path.join(os.path.dirname(__file__), 'data')
|
18 |
SYNTHETIC_DATA_DIR_PATH = os.path.join(DATA_DIR_PATH, 'synthetic')
|
19 |
|
|
|
20 |
BASE_MODEL = ChatOpenAI(
|
21 |
model_name='gpt-4',
|
22 |
temperature=0,
|
23 |
)
|
24 |
|
25 |
-
def
|
26 |
-
source = pd.read_csv(os.path.join(SYNTHETIC_DATA_DIR_PATH, 'legal_entries_a.csv'))
|
27 |
-
template = pd.read_csv(os.path.join(SYNTHETIC_DATA_DIR_PATH, 'legal_template.csv'))
|
28 |
-
return source, template
|
29 |
-
|
30 |
-
def get_data_str_from_df_for_prompt(df, num_rows_to_return=NUM_ROWS_TO_RETURN):
|
31 |
return f'<df>\n{df.head(num_rows_to_return).to_markdown()}\n</df>'
|
32 |
|
33 |
def get_table_mapping(source_df, template_df):
|
|
|
34 |
table_mapping_parser = PydanticOutputParser(pydantic_object=TableMapping)
|
35 |
analyst_prompt = ChatPromptTemplate.from_template(
|
36 |
template=DATA_SCIENTIST_PROMPT_STR,
|
37 |
partial_variables={'format_instructions': table_mapping_parser.get_format_instructions()},
|
38 |
)
|
39 |
-
|
40 |
mapping_chain = analyst_prompt | BASE_MODEL | table_mapping_parser
|
41 |
-
table_mapping: TableMapping = mapping_chain.invoke({"source_1_csv_str":
|
42 |
return pd.DataFrame(table_mapping.dict()['table_mappings'])
|
43 |
|
44 |
def _sanitize_python_output(text: str):
|
|
|
45 |
_, after = text.split("```python")
|
46 |
return after.split("```")[0]
|
47 |
|
48 |
def generate_mapping_code(table_mapping_df) -> str:
|
|
|
49 |
writer_prompt = ChatPromptTemplate.from_template(SPEC_WRITER_PROMPT_STR)
|
50 |
engineer_prompt = ChatPromptTemplate.from_template(ENGINEER_PROMPT_STR)
|
51 |
|
@@ -54,6 +52,7 @@ def generate_mapping_code(table_mapping_df) -> str:
|
|
54 |
return engineer_chain.invoke({"table_mapping": str(table_mapping_df.to_dict())})
|
55 |
|
56 |
def process_csv_text(temp_file):
|
|
|
57 |
if isinstance(temp_file, str):
|
58 |
df = pd.read_csv(io.StringIO(temp_file))
|
59 |
else:
|
@@ -61,4 +60,5 @@ def process_csv_text(temp_file):
|
|
61 |
return df
|
62 |
|
63 |
def transform_source(source_df, code_text: str):
|
|
|
64 |
return PythonAstREPLTool(locals={'source_df': source_df}).run(code_text)
|
|
|
17 |
DATA_DIR_PATH = os.path.join(os.path.dirname(__file__), 'data')
|
18 |
SYNTHETIC_DATA_DIR_PATH = os.path.join(DATA_DIR_PATH, 'synthetic')
|
19 |
|
20 |
+
# TODO: consider different models for different prompts, e.g. natural language prompt might be better with higher temperature
|
21 |
BASE_MODEL = ChatOpenAI(
|
22 |
model_name='gpt-4',
|
23 |
temperature=0,
|
24 |
)
|
25 |
|
26 |
+
def _get_data_str_from_df_for_prompt(df, num_rows_to_return=NUM_ROWS_TO_RETURN):
|
|
|
|
|
|
|
|
|
|
|
27 |
return f'<df>\n{df.head(num_rows_to_return).to_markdown()}\n</df>'
|
28 |
|
29 |
def get_table_mapping(source_df, template_df):
|
30 |
+
'''Use PydanticOutputParser to parse the output of the Data Scientist prompt into a TableMapping object.'''
|
31 |
table_mapping_parser = PydanticOutputParser(pydantic_object=TableMapping)
|
32 |
analyst_prompt = ChatPromptTemplate.from_template(
|
33 |
template=DATA_SCIENTIST_PROMPT_STR,
|
34 |
partial_variables={'format_instructions': table_mapping_parser.get_format_instructions()},
|
35 |
)
|
|
|
36 |
mapping_chain = analyst_prompt | BASE_MODEL | table_mapping_parser
|
37 |
+
table_mapping: TableMapping = mapping_chain.invoke({"source_1_csv_str": _get_data_str_from_df_for_prompt(source_df), "target_csv_str": _get_data_str_from_df_for_prompt(template_df)})
|
38 |
return pd.DataFrame(table_mapping.dict()['table_mappings'])
|
39 |
|
40 |
def _sanitize_python_output(text: str):
|
41 |
+
'''Remove markdown from python code, as prompt returns it.'''
|
42 |
_, after = text.split("```python")
|
43 |
return after.split("```")[0]
|
44 |
|
45 |
def generate_mapping_code(table_mapping_df) -> str:
|
46 |
+
'''Chain two prompts together to generate python code from a table mapping: 1. technical spec writer, 2. python engineer'''
|
47 |
writer_prompt = ChatPromptTemplate.from_template(SPEC_WRITER_PROMPT_STR)
|
48 |
engineer_prompt = ChatPromptTemplate.from_template(ENGINEER_PROMPT_STR)
|
49 |
|
|
|
52 |
return engineer_chain.invoke({"table_mapping": str(table_mapping_df.to_dict())})
|
53 |
|
54 |
def process_csv_text(temp_file):
|
55 |
+
'''Process a CSV file into a dataframe, either from a string or a file.'''
|
56 |
if isinstance(temp_file, str):
|
57 |
df = pd.read_csv(io.StringIO(temp_file))
|
58 |
else:
|
|
|
60 |
return df
|
61 |
|
62 |
def transform_source(source_df, code_text: str):
|
63 |
+
'''Use PythonAstREPLTool to transform a source dataframe using python code.'''
|
64 |
return PythonAstREPLTool(locals={'source_df': source_df}).run(code_text)
|
src/prompt.py
CHANGED
@@ -39,36 +39,28 @@ Your response:
|
|
39 |
|
40 |
|
41 |
SPEC_WRITER_PROMPT_STR = '''
|
42 |
-
You are
|
43 |
-
Your job is to write a plaintext spec for a python script for a software engineer to develop a component within an ETL pipeline.
|
44 |
-
|
45 |
-
This document must include 100% of the information your employee needs to write a successful script to transform source_df to target_df.
|
46 |
-
However, DO NOT include the original table_mapping. Your job is to translate everything into natural language.
|
47 |
-
|
48 |
-
Here is a stringified python dictionary that describes the mapping and the transformation steps:
|
49 |
|
50 |
{table_mapping}
|
51 |
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
-
|
58 |
-
-
|
59 |
-
-
|
60 |
-
-
|
61 |
-
-
|
62 |
-
-
|
63 |
-
-
|
64 |
-
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
-
|
70 |
-
- Concise: Your employees benefit from brevity
|
71 |
-
- Precise: your words must be unambiguous, exact, and full represent a perfect translation of incoming python dict.
|
72 |
|
73 |
Your response:
|
74 |
'''
|
|
|
39 |
|
40 |
|
41 |
SPEC_WRITER_PROMPT_STR = '''
|
42 |
+
You are a product manager and technical writer for a software firm. Your task is to draft a specification document for a software engineer to design a component within an ETL pipeline, converting `source_df` to `target_df` using the provided mapping:
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
{table_mapping}
|
45 |
|
46 |
+
Translate this information into clear, succinct instructions. Avoid including the raw `table_mapping` or any code.
|
47 |
+
|
48 |
+
The specification should encompass:
|
49 |
+
- **Overview**: A brief summary of the task.
|
50 |
+
- **Input**: Description of `source_df`.
|
51 |
+
- **Output**: Description of `target_df`.
|
52 |
+
- **Column Mapping**: Clearly define how columns from the source map to the target.
|
53 |
+
- **Transformations**: Detail the transformations required for each column.
|
54 |
+
- **Instructions**: The script should:
|
55 |
+
- Not modify `source_df`.
|
56 |
+
- Generate a new dataframe named `target_df`.
|
57 |
+
- Not incorporate any source data, only transformations.
|
58 |
+
- Return `target_df`.
|
59 |
+
|
60 |
+
This will be your only communication to the engineer. Ensure it's:
|
61 |
+
- **Clear**: Eliminate any unnecessary details.
|
62 |
+
- **Concise**: Aim for brevity.
|
63 |
+
- **Precise**: Be unambiguous and exact.
|
|
|
|
|
64 |
|
65 |
Your response:
|
66 |
'''
|