andymbryant commited on
Commit
faeec87
1 Parent(s): 8a2c637

cleaned up prompts

Browse files
Files changed (3) hide show
  1. app.py +7 -15
  2. src/core.py +8 -8
  3. src/prompt.py +19 -27
app.py CHANGED
@@ -3,39 +3,32 @@ from src.core import get_table_mapping, transform_source, process_csv_text, gene
3
 
4
  MAX_ROWS = 10
5
 
6
- def export_csv(d):
7
- filepath = "output.csv"
8
- d.to_csv(filepath)
9
- return gr.File.update(value=filepath, visible=True)
10
-
11
  def generate_step_markdown(step_number: int, subtitle: str, description: str = None):
12
  return gr.Markdown(f"# Step {step_number}\n\n ### {subtitle}\n{description}")
13
 
 
14
  def export_csv(df, filename):
15
  df.to_csv(filename, index=False)
16
  return gr.File.update(value=filename, visible=True)
17
 
 
18
  def export_text(val, filename):
19
  with open(filename, "w") as f:
20
  f.write(val)
21
  return gr.File.update(value=filename, visible=True)
22
 
23
- def export_transformed_source(d):
24
- filename = "transformed_source.csv"
25
- d.to_csv(filename)
26
- return gr.File.update(value=filename, visible=True)
27
-
28
  with gr.Blocks() as demo:
 
29
  # STEP 1
30
- generate_step_markdown(1, "Upload a Template CSV (target schema) and a Source CSV.")
31
  with gr.Row():
32
  with gr.Column():
33
  upload_template_btn = gr.UploadButton(label="Upload Template File", file_types = ['.csv'], live=True, file_count = "single")
34
- template_df = gr.Dataframe(max_rows=MAX_ROWS)
35
  upload_template_btn.upload(fn=process_csv_text, inputs=upload_template_btn, outputs=template_df)
36
  with gr.Column():
37
  upload_source_button = gr.UploadButton(label="Upload Source File", file_types = ['.csv'], live=True, file_count = "single")
38
- source_df = gr.Dataframe(max_rows=MAX_ROWS)
39
  upload_source_button.upload(fn=process_csv_text, inputs=upload_source_button, outputs=source_df)
40
 
41
  # STEP 2
@@ -75,8 +68,7 @@ with gr.Blocks() as demo:
75
  with gr.Row():
76
  transform_btn = gr.Button(value="Transform Source", variant="primary")
77
  with gr.Row():
78
- gr.Markdown("Source (transformed)")
79
- source_df_transformed = gr.Dataframe(label="Source Transformed", max_rows=MAX_ROWS)
80
  transform_btn.click(transform_source, inputs=[source_df, code_block], outputs=[source_df_transformed])
81
 
82
  with gr.Row():
 
3
 
4
  MAX_ROWS = 10
5
 
 
 
 
 
 
6
  def generate_step_markdown(step_number: int, subtitle: str, description: str = None):
7
  return gr.Markdown(f"# Step {step_number}\n\n ### {subtitle}\n{description}")
8
 
9
+ # TODO: use tempfile
10
  def export_csv(df, filename):
11
  df.to_csv(filename, index=False)
12
  return gr.File.update(value=filename, visible=True)
13
 
14
+ # TODO: use tempfile
15
  def export_text(val, filename):
16
  with open(filename, "w") as f:
17
  f.write(val)
18
  return gr.File.update(value=filename, visible=True)
19
 
 
 
 
 
 
20
  with gr.Blocks() as demo:
21
+ gr.Markdown("# LLM Data Mapper\nThis is a demo of the LangChain platform. It is a tool for generating python code from natural language prompts. This demo is a simple ETL pipeline, where you upload a source CSV and a template CSV, and then generate python code to transform the source CSV into the template CSV. This is a simple example, but the platform can be used for much more complex tasks, such as generating python code from a natural language specification document.")
22
  # STEP 1
23
+ generate_step_markdown(1, "Upload a Template CSV and a Source CSV.", "The schema will be extracted from the template file and the source file will be transformed to match the schema.")
24
  with gr.Row():
25
  with gr.Column():
26
  upload_template_btn = gr.UploadButton(label="Upload Template File", file_types = ['.csv'], live=True, file_count = "single")
27
+ template_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
28
  upload_template_btn.upload(fn=process_csv_text, inputs=upload_template_btn, outputs=template_df)
29
  with gr.Column():
30
  upload_source_button = gr.UploadButton(label="Upload Source File", file_types = ['.csv'], live=True, file_count = "single")
31
+ source_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
32
  upload_source_button.upload(fn=process_csv_text, inputs=upload_source_button, outputs=source_df)
33
 
34
  # STEP 2
 
68
  with gr.Row():
69
  transform_btn = gr.Button(value="Transform Source", variant="primary")
70
  with gr.Row():
71
+ source_df_transformed = gr.Dataframe(label="Source (transformed)", max_rows=MAX_ROWS)
 
72
  transform_btn.click(transform_source, inputs=[source_df, code_block], outputs=[source_df_transformed])
73
 
74
  with gr.Row():
src/core.py CHANGED
@@ -17,35 +17,33 @@ load_dotenv()
17
  DATA_DIR_PATH = os.path.join(os.path.dirname(__file__), 'data')
18
  SYNTHETIC_DATA_DIR_PATH = os.path.join(DATA_DIR_PATH, 'synthetic')
19
 
 
20
  BASE_MODEL = ChatOpenAI(
21
  model_name='gpt-4',
22
  temperature=0,
23
  )
24
 
25
- def get_dataframes():
26
- source = pd.read_csv(os.path.join(SYNTHETIC_DATA_DIR_PATH, 'legal_entries_a.csv'))
27
- template = pd.read_csv(os.path.join(SYNTHETIC_DATA_DIR_PATH, 'legal_template.csv'))
28
- return source, template
29
-
30
- def get_data_str_from_df_for_prompt(df, num_rows_to_return=NUM_ROWS_TO_RETURN):
31
  return f'<df>\n{df.head(num_rows_to_return).to_markdown()}\n</df>'
32
 
33
  def get_table_mapping(source_df, template_df):
 
34
  table_mapping_parser = PydanticOutputParser(pydantic_object=TableMapping)
35
  analyst_prompt = ChatPromptTemplate.from_template(
36
  template=DATA_SCIENTIST_PROMPT_STR,
37
  partial_variables={'format_instructions': table_mapping_parser.get_format_instructions()},
38
  )
39
-
40
  mapping_chain = analyst_prompt | BASE_MODEL | table_mapping_parser
41
- table_mapping: TableMapping = mapping_chain.invoke({"source_1_csv_str": get_data_str_from_df_for_prompt(source_df), "target_csv_str": get_data_str_from_df_for_prompt(template_df)})
42
  return pd.DataFrame(table_mapping.dict()['table_mappings'])
43
 
44
  def _sanitize_python_output(text: str):
 
45
  _, after = text.split("```python")
46
  return after.split("```")[0]
47
 
48
  def generate_mapping_code(table_mapping_df) -> str:
 
49
  writer_prompt = ChatPromptTemplate.from_template(SPEC_WRITER_PROMPT_STR)
50
  engineer_prompt = ChatPromptTemplate.from_template(ENGINEER_PROMPT_STR)
51
 
@@ -54,6 +52,7 @@ def generate_mapping_code(table_mapping_df) -> str:
54
  return engineer_chain.invoke({"table_mapping": str(table_mapping_df.to_dict())})
55
 
56
  def process_csv_text(temp_file):
 
57
  if isinstance(temp_file, str):
58
  df = pd.read_csv(io.StringIO(temp_file))
59
  else:
@@ -61,4 +60,5 @@ def process_csv_text(temp_file):
61
  return df
62
 
63
  def transform_source(source_df, code_text: str):
 
64
  return PythonAstREPLTool(locals={'source_df': source_df}).run(code_text)
 
17
  DATA_DIR_PATH = os.path.join(os.path.dirname(__file__), 'data')
18
  SYNTHETIC_DATA_DIR_PATH = os.path.join(DATA_DIR_PATH, 'synthetic')
19
 
20
+ # TODO: consider different models for different prompts, e.g. natural language prompt might be better with higher temperature
21
  BASE_MODEL = ChatOpenAI(
22
  model_name='gpt-4',
23
  temperature=0,
24
  )
25
 
26
+ def _get_data_str_from_df_for_prompt(df, num_rows_to_return=NUM_ROWS_TO_RETURN):
 
 
 
 
 
27
  return f'<df>\n{df.head(num_rows_to_return).to_markdown()}\n</df>'
28
 
29
  def get_table_mapping(source_df, template_df):
30
+ '''Use PydanticOutputParser to parse the output of the Data Scientist prompt into a TableMapping object.'''
31
  table_mapping_parser = PydanticOutputParser(pydantic_object=TableMapping)
32
  analyst_prompt = ChatPromptTemplate.from_template(
33
  template=DATA_SCIENTIST_PROMPT_STR,
34
  partial_variables={'format_instructions': table_mapping_parser.get_format_instructions()},
35
  )
 
36
  mapping_chain = analyst_prompt | BASE_MODEL | table_mapping_parser
37
+ table_mapping: TableMapping = mapping_chain.invoke({"source_1_csv_str": _get_data_str_from_df_for_prompt(source_df), "target_csv_str": _get_data_str_from_df_for_prompt(template_df)})
38
  return pd.DataFrame(table_mapping.dict()['table_mappings'])
39
 
40
  def _sanitize_python_output(text: str):
41
+ '''Remove markdown from python code, as prompt returns it.'''
42
  _, after = text.split("```python")
43
  return after.split("```")[0]
44
 
45
  def generate_mapping_code(table_mapping_df) -> str:
46
+ '''Chain two prompts together to generate python code from a table mapping: 1. technical spec writer, 2. python engineer'''
47
  writer_prompt = ChatPromptTemplate.from_template(SPEC_WRITER_PROMPT_STR)
48
  engineer_prompt = ChatPromptTemplate.from_template(ENGINEER_PROMPT_STR)
49
 
 
52
  return engineer_chain.invoke({"table_mapping": str(table_mapping_df.to_dict())})
53
 
54
  def process_csv_text(temp_file):
55
+ '''Process a CSV file into a dataframe, either from a string or a file.'''
56
  if isinstance(temp_file, str):
57
  df = pd.read_csv(io.StringIO(temp_file))
58
  else:
 
60
  return df
61
 
62
  def transform_source(source_df, code_text: str):
63
+ '''Use PythonAstREPLTool to transform a source dataframe using python code.'''
64
  return PythonAstREPLTool(locals={'source_df': source_df}).run(code_text)
src/prompt.py CHANGED
@@ -39,36 +39,28 @@ Your response:
39
 
40
 
41
  SPEC_WRITER_PROMPT_STR = '''
42
- You are an expert product manager and technical writer for a software company, who generates clean, concise, precise specification documents for your employees.
43
- Your job is to write a plaintext spec for a python script for a software engineer to develop a component within an ETL pipeline.
44
-
45
- This document must include 100% of the information your employee needs to write a successful script to transform source_df to target_df.
46
- However, DO NOT include the original table_mapping. Your job is to translate everything into natural language.
47
-
48
- Here is a stringified python dictionary that describes the mapping and the transformation steps:
49
 
50
  {table_mapping}
51
 
52
- You must translate this into clean, concise, and complete instructions for your employee.
53
-
54
- This document should be formatted like a technical document in plaintext. Do not include code or data.
55
-
56
- This document must include:
57
- - Overview
58
- - Input (source_df)
59
- - Output (target_df)
60
- - Exact column mapping
61
- - Exact transformation steps for each column
62
- - Precise instructions for what this script should do
63
- - Do not modify the source_df. Create a new dataframe named target_df.
64
- - This script should never include the source data. It should only include the transormations required to create the target_df.
65
- - Return the target_df
66
-
67
- You will never see this employee. They cannot contact you. You will never see their code. You must include 100% of the information they need to write a successful script.
68
- Remember:
69
- - Clean: No extra information, no formatting aside from plaintext
70
- - Concise: Your employees benefit from brevity
71
- - Precise: your words must be unambiguous, exact, and full represent a perfect translation of incoming python dict.
72
 
73
  Your response:
74
  '''
 
39
 
40
 
41
  SPEC_WRITER_PROMPT_STR = '''
42
+ You are a product manager and technical writer for a software firm. Your task is to draft a specification document for a software engineer to design a component within an ETL pipeline, converting `source_df` to `target_df` using the provided mapping:
 
 
 
 
 
 
43
 
44
  {table_mapping}
45
 
46
+ Translate this information into clear, succinct instructions. Avoid including the raw `table_mapping` or any code.
47
+
48
+ The specification should encompass:
49
+ - **Overview**: A brief summary of the task.
50
+ - **Input**: Description of `source_df`.
51
+ - **Output**: Description of `target_df`.
52
+ - **Column Mapping**: Clearly define how columns from the source map to the target.
53
+ - **Transformations**: Detail the transformations required for each column.
54
+ - **Instructions**: The script should:
55
+ - Not modify `source_df`.
56
+ - Generate a new dataframe named `target_df`.
57
+ - Not incorporate any source data, only transformations.
58
+ - Return `target_df`.
59
+
60
+ This will be your only communication to the engineer. Ensure it's:
61
+ - **Clear**: Eliminate any unnecessary details.
62
+ - **Concise**: Aim for brevity.
63
+ - **Precise**: Be unambiguous and exact.
 
 
64
 
65
  Your response:
66
  '''