Spaces: Runtime error

Update app.py

app.py CHANGED
@@ -11,29 +11,74 @@ from reactagent.users.user import User
 # Global variables to store session state
 env = None
 agent = None
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+example_data = {
+    1: {
+        "title": "Dataset and Baseline for Automatic Student Feedback Analysis",
+        "abstract": """
+This paper presents a student feedback corpus containing 3000 instances of feedback written by university students.
+The dataset has been annotated for aspect terms, opinion terms, polarities of the opinion terms towards targeted aspects,
+document-level opinion polarities, and sentence separations. A hierarchical taxonomy for aspect categorization
+covering all areas of the teaching-learning process was developed. Both implicit and explicit aspects were annotated
+using this taxonomy. The paper discusses the annotation methodology, difficulties faced during the annotation,
+and details about aspect term categorization. The annotated corpus can be used for Aspect Extraction,
+Aspect Level Sentiment Analysis, and Document Level Sentiment Analysis. Baseline results for all three tasks are provided.
+""",
+        "research_tasks": "The primary research tasks include the creation of a comprehensive student feedback corpus, aspect term annotation, opinion polarity annotation, and the development of a hierarchical taxonomy.",
+        "research_gaps": "Gaps include the lack of detailed aspect-level annotations in existing datasets and the focus on document-level sentiment analysis.",
+        "keywords": "Student Feedback Corpus, Aspect Terms, Opinion Terms, Polarity, Hierarchical Taxonomy, Aspect Extraction, Aspect Level Sentiment Analysis, Document Level Sentiment Analysis",
+        "recent_works": [
+            "Students feedback analysis model using deep learning-based method and linguistic knowledge for intelligent educational systems.",
+            "An Automated Approach for Analysing Students Feedback Using Sentiment Analysis Techniques."
+        ]
+    },
+    2: {
+        "title": "An Empirical Study on the Impact of Code Review on Software Quality",
+        "abstract": """
+This paper presents an empirical study examining the impact of code reviews on the quality of software projects.
+The study involved analyzing over 500,000 code reviews across 20 open-source projects on GitHub.
+The analysis was conducted to assess the relationship between code review practices and key software quality metrics,
+such as defect density, code churn, and the frequency of post-release defects. The findings suggest that code reviews,
+particularly when conducted by experienced reviewers, significantly reduce the number of defects in the codebase.
+The paper discusses the methodology used for data collection, the statistical methods employed for analysis,
+and the implications of these findings for software development practices.
+""",
+        "research_tasks": "The primary research tasks include collecting and analyzing data on code reviews from open-source projects, measuring software quality metrics, and assessing the correlation between code review practices and software quality.",
+        "research_gaps": "Gaps include the lack of large-scale empirical studies that quantify the impact of code reviews on software quality and the limited focus on the role of reviewer expertise in existing literature.",
+        "keywords": "Code Reviews, Software Quality, Defect Density, Code Churn, Post-Release Defects, Empirical Study, Open-Source Projects, GitHub",
+        "recent_works": [
+            "The Effectiveness of Code Reviews in Identifying Defects: A Meta-Analysis of Empirical Studies",
+            "A Study on the Impact of Code Review Tools on Developer Productivity and Software Quality"
+        ]
+    }
+}
+
+# # Predefined research paper text (example)
+# predefined_paper_text = """
+# Title:
+# Dataset and Baseline for Automatic Student Feedback Analysis
+
+# Abstract:
+# This paper presents a student feedback corpus containing 3000 instances of feedback written by university students. The dataset has been annotated for aspect terms, opinion terms, polarities of the opinion terms towards targeted aspects, document-level opinion polarities, and sentence separations. A hierarchical taxonomy for aspect categorization covering all areas of the teaching-learning process was developed. Both implicit and explicit aspects were annotated using this taxonomy. The paper discusses the annotation methodology, difficulties faced during the annotation, and details about aspect term categorization. The annotated corpus can be used for Aspect Extraction, Aspect Level Sentiment Analysis, and Document Level Sentiment Analysis. Baseline results for all three tasks are provided.
+# """
+
+# # Predefined extracted elements based on the paper text
+# predefined_research_tasks = "The primary research tasks include the creation of a comprehensive student feedback corpus, aspect term annotation, opinion polarity annotation, and the development of a hierarchical taxonomy."
+# predefined_research_gaps = "Gaps include the lack of detailed aspect-level annotations in existing datasets and the focus on document-level sentiment analysis."
+# predefined_keywords = "Student Feedback Corpus, Aspect Terms, Opinion Terms, Polarity, Hierarchical Taxonomy, Aspect Extraction, Aspect Level Sentiment Analysis, Document Level Sentiment Analysis"
+# predefined_recent_works = """
+# 1. "Students feedback analysis model using deep learning-based method and linguistic knowledge for intelligent educational systems."
+# 2. "An Automated Approach for Analysing Students Feedback Using Sentiment Analysis Techniques."
+# """

 # Extraction function to simulate the extraction of Research Tasks (t), Research Gaps (g), Keywords (k), and Recent Works (R)
 def extract_research_elements(paper_text):
-
-
+    global index_ex
+    example = example_data[index_ex]
+    tasks = example['research_tasks']
+    gaps = example['research_gaps']
+    keywords = example['keywords']
+    recent_works = "\n".join(example['recent_works'])
+    return tasks, gaps, keywords, recent_works

 # Generation function for Research Hypothesis and Experiment Plan
 def generate_research_idea_and_plan(tasks, gaps, keywords, recent_works):
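Note: with this change, extract_research_elements ignores its paper_text argument entirely; it returns the canned example selected by the module-level index_ex. A minimal sketch of that lookup path, runnable on its own (the dict literal here is abbreviated, not the full data from the diff):

# Abbreviated stand-in for the example_data dict added above.
example_data = {
    1: {
        "research_tasks": "Create a student feedback corpus ...",
        "research_gaps": "Existing datasets lack aspect-level annotations ...",
        "keywords": "Student Feedback Corpus, Aspect Terms, ...",
        "recent_works": ["Paper A.", "Paper B."],
    }
}
index_ex = 1  # which canned example is active

def extract_research_elements(paper_text):
    # paper_text is accepted for interface compatibility but never used.
    example = example_data[index_ex]
    return (
        example["research_tasks"],
        example["research_gaps"],
        example["keywords"],
        "\n".join(example["recent_works"]),
    )

tasks, gaps, keywords, recent_works = extract_research_elements("ignored text")
print(recent_works)  # prints the two titles on separate lines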
@@ -131,37 +176,9 @@ Objective: Understand the training script, including data processing, [...]
 [Feedback]: The script structure is clear, but key functions (train_model, predict) need proper implementation for proposed model training and prediction.
 """

-predefined_response = """
-[Reasoning]: Execute the "final_model.py" using ExecuteScript action to evaluate performance of the final model.
-[Action]: Execute "final_model.py" using ExecuteScript action.
-Input: {"script_name": "final_model.py"}
-"""
-
-predefined_observation = """
-Epoch [1/10],
-Train MSE: 0.543,
-Test MSE: 0.688
-Epoch [2/10],
-Train MSE: 0.242,
-Test MSE: 0.493
-"""
-

-# # Structured input as list of dictionaries
-# process_steps = [
-#     "Action: Inspect Script Lines (train.py)\nObservation: The train.py script imports necessary libraries (e.g., pandas, sklearn, torch). Sets random seeds for reproducibility. Defines compute_metrics_for_regression function to calculate RMSE for different dimensions. Placeholder functions train_model and predict exist without implementations.\nFeedback: The script structure is clear, but key functions (train_model, predict) need proper implementation for proposed model training and prediction.",
-#     "Action: Execute Script (train.py)\nObservation: The script executed successfully. Generated embeddings using the BERT model. Completed the training process without errors. Metrics calculation placeholders indicated areas needing implementation.\nFeedback: Experimental model definition and training logic are missing.",
-#     "Action: Edit Script (train.py)\nObservation: Edited train.py to separate data loading, model definition, training loop, and evaluation into distinct functions. The edited train.py now has clearly defined functions for data loading (load_data), model definition (build_model), training (train_model), and evaluation (evaluate_model). Similarly, eval.py is reorganized to load the model and perform predictions efficiently.\nFeedback: Modify model architecture, retrieve the hybrid model of CNN, BiLSTM, and attention mechanisms, similar to the DTLP to align with the experiment design.",
-#     "Action: Retrieve Model\nObservation: CNN and BiLSTM retrieved.\nFeedback: Modify the model architecture.",
-#     "Action: Execute Script (train.py)\nObservation: The model trained over the specified number of epochs. Training and validation loss values are recorded for each epoch, the decrease in loss indicates improved model performance.\nFeedback: Continue with the next steps in model evaluation.",
-#     predefined_observation
-# ]

-action_list = [
-    predefined_response,
-    predefined_observation
-]
 # Predefined code to display in Phase 2
 predefined_code = """import pandas as pd
 from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
@@ -474,72 +491,41 @@ if __name__ == "__main__":
 """


-
-
-
-
-
-
-
-
-
-
-
-            '--research-problem', prompt,
-            '--log-dir', str(Path('logs', id)),
-            '--work-dir', str(Path('workspaces', id)),
-            '--llm-name', llm_name,
-            '--edit-script-llm-name', llm_name,
-            '--fast-llm-name', fastllm_name,
+# Example data structure
+example_data = {
+    1: {
+        "title": "Dataset and Baseline for Automatic Student Feedback Analysis",
+        "abstract": "This paper presents a student feedback corpus containing 3000 instances of feedback written by university students. The dataset has been annotated for aspect terms, opinion terms, polarities of the opinion terms towards targeted aspects, document-level opinion polarities, and sentence separations. A hierarchical taxonomy for aspect categorization covering all areas of the teaching-learning process was developed. Both implicit and explicit aspects were annotated using this taxonomy. The paper discusses the annotation methodology, difficulties faced during the annotation, and details about aspect term categorization. The annotated corpus can be used for Aspect Extraction, Aspect Level Sentiment Analysis, and Document Level Sentiment Analysis. Baseline results for all three tasks are provided.",
+        "research_tasks": "The primary research tasks include the creation of a comprehensive student feedback corpus, aspect term annotation, opinion polarity annotation, and the development of a hierarchical taxonomy.",
+        "research_gaps": "Gaps include the lack of detailed aspect-level annotations in existing datasets and the focus on document-level sentiment analysis.",
+        "keywords": "Student Feedback Corpus, Aspect Terms, Opinion Terms, Polarity, Hierarchical Taxonomy, Aspect Extraction, Aspect Level Sentiment Analysis, Document Level Sentiment Analysis",
+        "recent_works": [
+            "Students feedback analysis model using deep learning-based method and linguistic knowledge for intelligent educational systems.",
+            "An Automated Approach for Analysing Students Feedback Using Sentiment Analysis Techniques."
        ]
+    },
+    2: {
+        "title": "An Empirical Study on the Impact of Code Review on Software Quality",
+        "abstract": "This paper presents an empirical study examining the impact of code reviews on the quality of software projects. The study involved analyzing over 500,000 code reviews across 20 open-source projects on GitHub. The analysis was conducted to assess the relationship between code review practices and key software quality metrics, such as defect density, code churn, and the frequency of post-release defects. The findings suggest that code reviews, particularly when conducted by experienced reviewers, significantly reduce the number of defects in the codebase. The paper discusses the methodology used for data collection, the statistical methods employed for analysis, and the implications of these findings for software development practices.",
+        "research_tasks": "The primary research tasks include collecting and analyzing data on code reviews from open-source projects, measuring software quality metrics, and assessing the correlation between code review practices and software quality.",
+        "research_gaps": "Gaps include the lack of large-scale empirical studies that quantify the impact of code reviews on software quality and the limited focus on the role of reviewer expertise in existing literature.",
+        "keywords": "Code Reviews, Software Quality, Defect Density, Code Churn, Post-Release Defects, Empirical Study, Open-Source Projects, GitHub",
+        "recent_works": [
+            "The Effectiveness of Code Reviews in Identifying Defects: A Meta-Analysis of Empirical Studies",
+            "A Study on the Impact of Code Review Tools on Developer Productivity and Software Quality"
+        ]
+    }
+}

-        args = self.parser.parse_args(rawargs)
-        # llm.FAST_MODEL = args.fast_llm_name
-        env = Environment(args)
-        # agent = ResearchAgent(args, env)
-        coro = agent.run(env)
-
-        self.coro_cache[id] = coro
-        return id
-
-    def get_response(self, human_input, session_hash):
-        coro_input = human_input
-        if session_hash not in self.coro_cache:
-            self.make_session(human_input, session_hash)
-            coro_input = None
-
-        try:
-            output = self.coro_cache[session_hash].send(coro_input)
-        except StopIteration:
-            output = None
-            del self.coro_cache[session_hash]
-
-        return output
-
-session_info = SessionInfo()
-
-def info_to_message(info):
-    msg = ""
-    for k, v in info.items():
-        if isinstance(v, dict):
-            tempv = v
-            v = ""
-            for k2, v2 in tempv.items():
-                v += f"{k2}:\n {v2}\n"
-            v = User.indent_text(v, 2)
-        msg += '-' * 64
-        msg += '\n'
-        msg += f"{k}:\n{v}\n"
-
-    msg += "Please provide feedback based on the history, response entries, and observation, and questions: "
-    return msg
-
-def predict(message, history, request: gr.Request):
-    response = session_info.get_response(message, request.session_hash)
-    if response is None:
-        return "Agent is finished. Enter a new instruction."
-    return info_to_message(response)

+predefined_observation = """
+Epoch [1/10],
+Train MSE: 0.543,
+Test MSE: 0.688
+Epoch [2/10],
+Train MSE: 0.242,
+Test MSE: 0.493
+"""

 # Initialize the global step_index and history
 process_steps = [
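Note: this hunk swaps the live agent path for canned demo data; it re-adds example_data and predefined_observation at module level (duplicating the first hunk's definitions, so the later ones win at import time) and deletes the coroutine-backed session plumbing, where make_session cached agent.run(env) per session id and get_response drove the cached coroutine with send(), priming a fresh one with None. A stripped-down sketch of that removed pattern (the demo_agent generator is a hypothetical stand-in for agent.run):

# Coroutine-per-session pattern used by the removed SessionInfo code.
def demo_agent():
    # Hypothetical stand-in for agent.run(env): yields a prompt,
    # then resumes with whatever the user sends back.
    feedback = yield "step 1: please review the plan"
    yield f"step 2: received {feedback!r}"

coro_cache = {}

def get_response(human_input, session_hash):
    coro_input = human_input
    if session_hash not in coro_cache:
        coro_cache[session_hash] = demo_agent()
        coro_input = None  # the first send() into a fresh coroutine must be None
    try:
        return coro_cache[session_hash].send(coro_input)
    except StopIteration:
        del coro_cache[session_hash]
        return None

print(get_response("hi", "s1"))        # step 1: please review the plan
print(get_response("looks ok", "s1"))  # step 2: received 'looks ok'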
@@ -584,40 +570,35 @@ process_steps = [
         "Observation": predefined_observation,
     }
 ]
+def info_to_message(info):
+    msg = ""
+    for k, v in info.items():
+        if isinstance(v, dict):
+            tempv = v
+            v = ""
+            for k2, v2 in tempv.items():
+                v += f"{k2}:\n {v2}\n"
+            v = User.indent_text(v, 2)
+        msg += '-' * 64
+        msg += '\n'
+        msg += f"{k}:\n{v}\n"
+    return msg


-
-#
-
-
-
-
-
-#         v += f"{k2}:\n {v2}\n"
-#     v = User.indent_text(v, 2)
-#     msg += '-' * 64
-#     msg += '\n'
-#     msg += f"{k}:\n{v}\n"
-
-#     msg += "Please provide feedback based on the history, response entries, and observation, and questions: "
-#     print(msg)
-#     return msg
-
-# def predict(message, history):
-#     global step_index  # Declare the use of global variable
-#     if step_index < len(process_steps):
-#         response_info = process_steps[step_index]
-#         response = info_to_message(response_info)  # Convert dictionary to formatted string
-#         step_index += 1
-#     else:
-#         response = "Agent Finished."
-
-#     return response, "N/A"  # Return the formatted string and clear input
+index_ex = 1
+# Function to handle the selection of an example and populate the respective fields
+def load_example(example_id):
+    global index_ex
+    index_ex = example_id
+    example = example_data[example_id]
+    paper_text = 'Title:\t' + example['title'] + '\nAbstract:\t' + example['abstract']
+    return paper_text

 # Gradio Interface
 with gr.Blocks() as app:
-    gr.Markdown("# AI Research Assistant with Research Agent")
+    gr.Markdown("# MLR-Copilot: AI Research Assistant with Research Agent")
+    gr.Markdown("### MLR-Copilot is a framework where LLMs mimic researchers’ thought processes, designed to enhance the productivity of machine learning research by automating the generation and implementation of research ideas. It begins with a research paper, autonomously generating and validating these ideas, while incorporating human feedback to help reach executable research outcomes.")
+

     # Use state variables to store generated hypothesis and experiment plan
     hypothesis_state = gr.State("")
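Note: the re-added info_to_message drops the trailing feedback prompt (it now gets appended in submit_feedback); it flattens one level of nested dicts and draws a 64-character rule before each top-level key. A quick usage sketch; User.indent_text is stubbed here since the real helper lives in reactagent.users.user and its exact behavior is an assumption:

# Usage sketch for info_to_message with a stubbed indent helper.
class User:
    @staticmethod
    def indent_text(text, spaces):
        # Assumed behavior: prefix every line with `spaces` spaces.
        pad = " " * spaces
        return "\n".join(pad + line for line in text.splitlines())

def info_to_message(info):
    msg = ""
    for k, v in info.items():
        if isinstance(v, dict):
            tempv = v
            v = ""
            for k2, v2 in tempv.items():
                v += f"{k2}:\n {v2}\n"
            v = User.indent_text(v, 2)
        msg += '-' * 64
        msg += '\n'
        msg += f"{k}:\n{v}\n"
    return msg

print(info_to_message({"Action": "Execute Script (train.py)",
                       "Input": {"script_name": "train.py"}}))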
@@ -629,13 +610,13 @@ with gr.Blocks() as app:

     with gr.Row():
         with gr.Column():
-            paper_text_input = gr.Textbox(value=
+            paper_text_input = gr.Textbox(value=load_example(1), lines=10, label="Research Paper Text")
             extract_button = gr.Button("Extract Research Elements")
             with gr.Row():
-                tasks_output = gr.Textbox(placeholder="Research task definition", label="Research Tasks", lines=2, interactive=
-                gaps_output = gr.Textbox(placeholder="Research gaps of current works", label="Research Gaps", lines=2, interactive=
-                keywords_output = gr.Textbox(placeholder="Paper keywords", label="Keywords", lines=2, interactive=
-                recent_works_output = gr.Textbox(placeholder="Recent works extracted from Semantic Scholar", label="Recent Works", lines=2, interactive=
+                tasks_output = gr.Textbox(placeholder="Research task definition", label="Research Tasks", lines=2, interactive=True)
+                gaps_output = gr.Textbox(placeholder="Research gaps of current works", label="Research Gaps", lines=2, interactive=True)
+                keywords_output = gr.Textbox(placeholder="Paper keywords", label="Keywords", lines=2, interactive=True)
+                recent_works_output = gr.Textbox(placeholder="Recent works extracted from Semantic Scholar", label="Recent Works", lines=2, interactive=True)
         with gr.Column():
             with gr.Row():  # Move the button to the top right
                 generate_button = gr.Button("Generate Research Hypothesis & Experiment Plan")
@@ -663,6 +644,21 @@ with gr.Blocks() as app:
         outputs=[hypothesis_output, experiment_plan_output, hypothesis_state, experiment_plan_state]
     )

+    # Example Buttons
+    with gr.Row():
+        example_1_button = gr.Button("Load Example 1:")
+        example_2_button = gr.Button("Load Example 2:")
+
+    example_1_button.click(
+        fn=lambda: load_example(1),
+        outputs=[paper_text_input]
+    )
+
+    example_2_button.click(
+        fn=lambda: load_example(2),
+        outputs=[paper_text_input]
+    )
+
     # Phase 2: Interactive Session Tab
     with gr.Tab("Phase 2&3: Experiment implementation and execution"):
         gr.Markdown("### Interact with the ExperimentAgent")
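Note: the example buttons use the usual Gradio wiring for a zero-argument callback: with no inputs=, fn is called without arguments and its return value fills the listed outputs. A self-contained sketch of the same wiring (component names here are illustrative, not from the app):

import gradio as gr

def fill_box():
    return "Title:\tSome example paper\nAbstract:\tOne-paragraph abstract..."

with gr.Blocks() as demo:
    box = gr.Textbox(lines=4, label="Paper Text")
    btn = gr.Button("Load Example")
    # No inputs= means fn is invoked with zero arguments; the return
    # value is routed into the single output component.
    btn.click(fn=fill_box, outputs=[box])

# demo.launch()  # uncomment to serve locally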
@@ -680,40 +676,29 @@ with gr.Blocks() as app:
             code_display = gr.Code(label="Implementation", language="python", interactive=False)

         with gr.Column():
-
-
-            feedback = gr.Textbox(placeholder="N/A", label = "User Feedback", lines=3, interactive=True)
+            response = gr.Textbox(label="ExperimentAgent Response", lines=30, interactive=False)
+            feedback = gr.Textbox(placeholder="N/A", label="User Feedback", lines=3, interactive=True)
             submit_button = gr.Button("Submit", elem_classes=["Submit-btn"])

         def submit_feedback(user_feedback, history, previous_response):
             global step_index
             if_end = False
             step_index += 1
-            if (step_index >= len(process_steps)):
-                step_index = 0
-            msg = ""
             msg = history
             if step_index < len(process_steps):
-                msg += previous_response + "\nUser feedback:" + user_feedback +"\n\n"
+                msg += previous_response + "\nUser feedback:" + user_feedback + "\n\n"
                 response_info = process_steps[step_index]
                 response = info_to_message(response_info)  # Convert dictionary to formatted string
+                response += "Please provide feedback based on the history, response entries, and observation, and questions: "
                 step_index += 1
+                msg += response
             else:
                 if_end = True
                 response = "Agent Finished."
-
+

             return msg, response, predefined_code if if_end else final_code
-
-            # global step_index  # Declare the use of global variable
-            # if step_index < len(process_steps):
-            #     response_info = process_steps[step_index]
-            #     response = info_to_message(response_info)  # Convert dictionary to formatted string
-            #     step_index += 1
-            # else:
-            #     response = "Agent Finished."
-
-        # Automatically populate the hypothesis and plan in Phase 2
+
         def load_phase_2_inputs(hypothesis, plan):
             return hypothesis, plan, "# Code implementation will be displayed here after Start ExperimentAgent."

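Note: submit_feedback is a scripted replay driven by the global step_index: each submit appends the previous response and the user's feedback to the transcript, then advances through process_steps until the list runs out. A stripped-down sketch of that state machine without the Gradio components:

# Minimal replay loop mirroring submit_feedback's step_index logic.
process_steps = ["first canned response", "second canned response"]
step_index = 0  # in app.py this is initialized under __main__

def submit_feedback(user_feedback, history, previous_response):
    global step_index
    step_index += 1
    msg = history
    if step_index < len(process_steps):
        msg += previous_response + "\nUser feedback:" + user_feedback + "\n\n"
        response = process_steps[step_index]
        step_index += 1
        msg += response
    else:
        response = "Agent Finished."
    return msg, response

history, reply = submit_feedback("looks good", "", "first canned response")
print(reply)  # second canned response
history, reply = submit_feedback("done?", history, reply)
print(reply)  # Agent Finished.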
@@ -734,6 +719,7 @@ with gr.Blocks() as app:
         inputs=[hypothesis_state, experiment_plan_state],
         outputs=[code_display, log]
     )
+
     submit_button.click(
         fn=submit_feedback,
         inputs=[feedback, log, response],
@@ -741,6 +727,5 @@ with gr.Blocks() as app:
     )

 if __name__ == "__main__":
-    # app.launch(share=True)
     step_index = 0
     app.launch()