kenken999 committed on
Commit
e1aa577
·
1 Parent(s): d14d824
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. AutoPrompt/.gitignore +7 -0
  3. AutoPrompt/LICENSE +201 -0
  4. AutoPrompt/Pipfile +25 -0
  5. AutoPrompt/Pipfile.lock +0 -0
  6. AutoPrompt/README.md +229 -0
  7. AutoPrompt/config/config_default.yml +58 -0
  8. AutoPrompt/config/config_diff/config_batch_classification.yml +14 -0
  9. AutoPrompt/config/config_diff/config_generation.yml +25 -0
  10. AutoPrompt/config/config_diff/config_ranking.yml +5 -0
  11. AutoPrompt/config/llm_env.yml +12 -0
  12. AutoPrompt/dataset/base_dataset.py +158 -0
  13. AutoPrompt/docs/AutoPrompt_Diagram.png +0 -0
  14. AutoPrompt/docs/arch_overview.png +0 -0
  15. AutoPrompt/docs/architecture.md +18 -0
  16. AutoPrompt/docs/argilla_movie_spoilers_example.png +0 -0
  17. AutoPrompt/docs/autoprompt_recording.gif +3 -0
  18. AutoPrompt/docs/contributing.md +13 -0
  19. AutoPrompt/docs/examples.md +243 -0
  20. AutoPrompt/docs/how-it-works.md +58 -0
  21. AutoPrompt/docs/installation.md +75 -0
  22. AutoPrompt/environment_dev.yml +23 -0
  23. AutoPrompt/estimator/__init__.py +37 -0
  24. AutoPrompt/estimator/estimator_argilla.py +119 -0
  25. AutoPrompt/estimator/estimator_llm.py +95 -0
  26. AutoPrompt/estimator/estimator_llm_batch.py +68 -0
  27. AutoPrompt/eval/eval_utils.py +24 -0
  28. AutoPrompt/eval/evaluator.py +152 -0
  29. AutoPrompt/optimization_pipeline.py +277 -0
  30. AutoPrompt/prompts/meta_prompts_classification/error_analysis.prompt +24 -0
  31. AutoPrompt/prompts/meta_prompts_classification/initial.prompt +11 -0
  32. AutoPrompt/prompts/meta_prompts_classification/initial_verbose.prompt +17 -0
  33. AutoPrompt/prompts/meta_prompts_classification/output_schemes.py +97 -0
  34. AutoPrompt/prompts/meta_prompts_classification/step_prompt.prompt +21 -0
  35. AutoPrompt/prompts/meta_prompts_classification/step_prompt_verbose.prompt +25 -0
  36. AutoPrompt/prompts/meta_prompts_classification/step_samples.prompt +24 -0
  37. AutoPrompt/prompts/meta_prompts_completion/error_analysis.prompt +24 -0
  38. AutoPrompt/prompts/meta_prompts_completion/initial.prompt +16 -0
  39. AutoPrompt/prompts/meta_prompts_completion/output_schemes.py +40 -0
  40. AutoPrompt/prompts/meta_prompts_completion/step_prompt.prompt +29 -0
  41. AutoPrompt/prompts/meta_prompts_completion/step_samples.prompt +18 -0
  42. AutoPrompt/prompts/meta_prompts_generation/error_analysis.prompt +25 -0
  43. AutoPrompt/prompts/meta_prompts_generation/initial.prompt +20 -0
  44. AutoPrompt/prompts/meta_prompts_generation/output_schemes.py +97 -0
  45. AutoPrompt/prompts/meta_prompts_generation/step_prompt.prompt +20 -0
  46. AutoPrompt/prompts/meta_prompts_generation/step_samples.prompt +24 -0
  47. AutoPrompt/prompts/meta_prompts_ranking/error_analysis.prompt +24 -0
  48. AutoPrompt/prompts/meta_prompts_ranking/initial.prompt +17 -0
  49. AutoPrompt/prompts/meta_prompts_ranking/initial_verbose.prompt +17 -0
  50. AutoPrompt/prompts/meta_prompts_ranking/output_schemes.py +97 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
  *.duckdb filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
  *.duckdb filter=lfs diff=lfs merge=lfs -text
36
+ *.gif filter=lfs diff=lfs merge=lfs -text
AutoPrompt/.gitignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ /venv
2
+ __pycache__/
3
+ *.log
4
+ /wandb
5
+ .idea/
6
+ dump/
7
+
AutoPrompt/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
AutoPrompt/Pipfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [[source]]
2
+ url = "https://pypi.org/simple"
3
+ verify_ssl = true
4
+ name = "pypi"
5
+
6
+ [packages]
7
+ openai = "*"
8
+ langchain = "*"
9
+ pandas = "*"
10
+ wandb = "*"
11
+ transformers = "*"
12
+ tqdm = "*"
13
+ faiss-cpu = "*"
14
+ sentence-transformers = "*"
15
+ prodict = "*"
16
+ schedule = "*"
17
+ easydict = "*"
18
+ argilla = "*"
19
+ langchain-google-genai = "*"
20
+
21
+ [dev-packages]
22
+
23
+ [requires]
24
+ python_version = "3.10"
25
+ python_full_version = "3.10.13"
AutoPrompt/Pipfile.lock ADDED
The diff for this file is too large to render. See raw diff
 
AutoPrompt/README.md ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <p align="center">
2
+ <!-- community badges -->
3
+ <a href="https://discord.gg/G2rSbAf8uP"><img src="https://img.shields.io/badge/Join-Discord-blue.svg"/></a>
4
+ <!-- license badge -->
5
+ <a href="https://github.com/Eladlev/AutoPrompt/blob/main/LICENSE">
6
+ <img alt="License" src="https://img.shields.io/badge/License-Apache_2.0-blue.svg"></a>
7
+ </p>
8
+
9
+ # 📝 AutoPrompt
10
+
11
+
12
+ <!-- MARKDOWN LINKS & IMAGES -->
13
+ <!-- https://www.markdownguide.org/basic-syntax/#reference-style-links -->
14
+
15
+ **Auto Prompt is a prompt optimization framework designed to enhance and perfect your prompts for real-world use cases.**
16
+
17
+ The framework automatically generates high-quality, detailed prompts tailored to user intentions. It employs a refinement (calibration) process, where it iteratively builds a dataset of challenging edge cases and optimizes the prompt accordingly. This approach not only reduces manual effort in prompt engineering but also effectively addresses common issues such as prompt [sensitivity](https://arxiv.org/abs/2307.09009) and inherent prompt [ambiguity](https://arxiv.org/abs/2311.04205).
18
+
19
+
20
+ **Our mission:** Empower users to produce high-quality robust prompts using the power of large language models (LLMs).
21
+
22
+ # Why Auto Prompt?
23
+ - **Prompt Engineering Challenges.** The quality of LLMs greatly depends on the prompts used. Even [minor changes](#prompt-sensitivity-example) can significantly affect their performance.
24
+ - **Benchmarking Challenges.** Creating a benchmark for production-grade prompts is often labour-intensive and time-consuming.
25
+ - **Reliable Prompts.** Auto Prompt generates robust high-quality prompts, offering measured accuracy and performance enhancement using minimal data and annotation steps.
26
+ - **Modularity and Adaptability.** With modularity at its core, Auto Prompt integrates seamlessly with popular open-source tools such as LangChain, Wandb, and Argilla, and can be adapted for a variety of tasks, including data synthesis and prompt migration.
27
+
28
+ ## System Overview
29
+
30
+ ![System Overview](./docs/AutoPrompt_Diagram.png)
31
+
32
+ The system is designed for real-world scenarios, such as moderation tasks, which are often challenged by imbalanced data distributions. The system implements the [Intent-based Prompt Calibration](https://arxiv.org/abs/2402.03099) method. The process begins with a user-provided initial prompt and task description, optionally including user examples. The refinement process iteratively generates diverse samples, annotates them via user/LLM, and evaluates prompt performance, after which an LLM suggests an improved prompt.
33
+
34
+ The optimization process can be extended to content generation tasks by first devising a ranker prompt and then performing the prompt optimization with this learned ranker. The optimization concludes upon reaching the budget or iteration limit.
35
+
36
+
37
+ This joint synthetic data generation and prompt optimization approach outperforms traditional methods while requiring minimal data and iterations. Learn more in our paper
38
+ [Intent-based Prompt Calibration: Enhancing prompt optimization with synthetic boundary cases](https://arxiv.org/abs/2402.03099) by E. Levi et al. (2024).
39
+
40
+
41
+ **Using GPT-4 Turbo, this optimization typically completes in just a few minutes at a cost of under $1.** To manage costs associated with GPT-4 LLM's token usage, the framework enables users to set a budget limit for optimization, in USD or token count, configured as illustrated [here](docs/examples.md#steps-to-run-example).
42
+
43
+ ## Demo
44
+
45
+ ![pipeline_recording](./docs/autoprompt_recording.gif)
46
+
47
+
48
+ ## 📖 Documentation
49
+ - [How to install](docs/installation.md) (Setup instructions)
50
+ - [Prompt optimization examples](docs/examples.md) (Use cases: movie review classification, generation, and chat moderation)
51
+ - [How it works](docs/how-it-works.md) (Explanation of pipelines)
52
+ - [Architecture guide](docs/architecture.md) (Overview of main components)
53
+
54
+ ## Features
55
+ - 📝 Boosts prompt quality with a minimal amount of data and annotation steps.
56
+ - 🛬 Designed for production use cases like moderation, multi-label classification, and content generation.
57
+ - ⚙️ Enables seamless migration of prompts across model versions or LLM providers.
58
+ - 🎓 Supports prompt squeezing. Combine multiple rules into a single efficient prompt.
59
+
60
+
61
+ ## QuickStart
62
+ AutoPrompt requires `python <= 3.10`
63
+ <br />
64
+
65
+ > **Step 1** - Download the project
66
+
67
+ ```bash
68
+ git clone git@github.com:Eladlev/AutoPrompt.git
69
+ cd AutoPrompt
70
+ ```
71
+
72
+ <br />
73
+
74
+ > **Step 2** - Install dependencies
75
+
76
+ Use either Conda or pip, depending on your preference. Using Conda:
77
+ ```bash
78
+ conda env create -f environment_dev.yml
79
+ conda activate AutoPrompt
80
+ ```
81
+
82
+ Using pip:
83
+ ```bash
84
+ pip install -r requirements.txt
85
+ ```
86
+
87
+ Using pipenv:
88
+ ```bash
89
+ pip install pipenv
90
+ pipenv sync
91
+ ```
92
+
93
+ <br />
94
+
95
+ > **Step 3** - Configure your LLM.
96
+
97
+ Set your OpenAI API key by updating the configuration file `config/llm_env.yml`
98
+ - If you need help locating your API key, visit this [link](https://help.openai.com/en/articles/4936850-where-do-i-find-my-api-key).
99
+
100
+ - We recommend using [OpenAI's GPT-4](https://platform.openai.com/docs/guides/gpt) for the LLM. Our framework also supports other providers and open-source models, as discussed [here](docs/installation.md#configure-your-llm).
101
+
102
+ <br />
103
+
104
+ > **Step 4** - Configure your Annotator
105
+ - Select an annotation approach for your project. We recommend beginning with a human-in-the-loop method, utilizing [Argilla](https://docs.argilla.io/en/latest/index.html). Follow the [Argilla setup instructions](https://docs.argilla.io/en/latest/getting_started/installation/deployments/huggingface-spaces.html) to configure your server. Alternatively, you can set up an LLM as your annotator by following these [configuration steps](docs/installation.md#configure-llm-annotator).
106
+
107
+ - The default predictor LLM, GPT-3.5, for estimating prompt performance, is configured in the `predictor` section of `config/config_default.yml`.
108
+
109
+ - Define your budget in the input config yaml file using the `max_usage` parameter. For OpenAI models, `max_usage` sets the maximum spend in USD. For other LLMs, it limits the maximum token count.
110
+
111
+ <br />
112
+
113
+
114
+ > **Step 5** - Run the pipeline
115
+
116
+ First, configure your labels by editing `config/config_default.yml`
117
+ ```
118
+ dataset:
119
+ label_schema: ["Yes", "No"]
120
+ ```
121
+
122
+
123
+ For a **classification pipeline**, use the following command from your terminal within the appropriate working directory:
124
+ ```bash
125
+ python run_pipeline.py
126
+ ```
127
+ If the initial prompt and task description are not provided directly as input, you will be guided to provide these details. Alternatively, specify them as command-line arguments:
128
+ ```bash
129
+ python run_pipeline.py \
130
+ --prompt "Does this movie review contain a spoiler? answer Yes or No" \
131
+ --task_description "Assistant is an expert classifier that will classify a movie review, and let the user know if it contains a spoiler for the reviewed movie or not." \
132
+ --num_steps 30
133
+ ```
134
+ You can track the optimization progress using the [W&B](https://wandb.ai/site) dashboard, with setup instructions available [here](docs/installation.md#monitoring-weights-and-biases-setup).
135
+
136
+
137
+ If you are using pipenv, be sure to activate the environment:
138
+ ``` bash
139
+ pipenv shell
140
+ python run_pipeline.py
141
+ ```
142
+ or alternatively prefix your command with `pipenv run`:
143
+ ```bash
144
+ pipenv run python run_pipeline.py
145
+ ```
146
+
147
+ #### Generation pipeline
148
+ To run the generation pipeline, use the following example command:
149
+ ```bash
150
+ python run_generation_pipeline.py \
151
+ --prompt "Write a good and comprehensive movie review about a specific movie." \
152
+ --task_description "Assistant is a large language model that is tasked with writing movie reviews."
153
+ ```
154
+ For more information, refer to our [generation task example](docs/examples.md#generating-movie-reviews-generation-task).
155
+
156
+ <br />
157
+
158
+ Enjoy the results. Completion of these steps yields a **refined (calibrated)
159
+ prompt** tailored for your task, alongside a **benchmark** featuring challenging samples,
160
+ stored in the default `dump` path.
161
+
162
+
163
+
164
+ ## Tips
165
+
166
+ - Prompt accuracy may fluctuate during the optimization. To identify the best prompts, we recommend continuous refinement following the initial generation of the benchmark. Set the number of optimization iterations with `--num_steps` and control sample generation by specifying `max_samples` in the `dataset` section. For instance, setting `max_samples: 50` and `--num_steps 30` limits the benchmark to 50 samples, allowing for 25 additional refinement iterations, assuming 10 samples per iteration.
167
+
168
+ - The framework supports checkpoints for easy resumption of optimization from the last saved state. It automatically saves the most recent optimization state in a `dump` path. Use `--output_dump` to set this path and `--load_path` to resume from a checkpoint.
169
+ - The iterations include multiple calls to the LLM service, with long prompts and requests for a relatively large amount of generated tokens by the LLM. This might take ~1 minute (especially in the generative tasks), so please be patient.
170
+ - If you encounter connection issues or errors with the Argilla server, try restarting the space.
171
+ <!--
172
+ Meanwhile, the num_initialize_samples and num_generated_samples fields within the meta_prompts section specify the counts for initial and per iteration sample generation, respectively. -->
173
+
174
+
175
+ ## Prompt Sensitivity Example
176
+ You write a prompt for identifying movie spoilers:
177
+ ```
178
+ Review the content provided and indicate whether it includes any significant plot revelations or critical points that could reveal important elements of the story or its outcome. Respond with "Yes" if it contains such spoilers or critical insights, and "No" if it refrains from unveiling key story elements.
179
+ ```
180
+ This prompt scores 81 on your [benchmark](docs/examples.md#filtering-movie-reviews-with-spoilers-classification-task) using GPT-4 LLM. Then, you make a minor modification:
181
+ ```
182
+ Review the text and determine if it provides essential revelations or critical details about the story that would constitute a spoiler. Respond with "Yes" for the presence of spoilers, and "No" for their absence.
183
+ ```
184
+ Surprisingly, the second prompt scores 72, representing an 11% drop in accuracy. This illustrates the need for a careful prompt engineering process.
185
+
186
+ ## 🚀 Contributing
187
+
188
+ Your contributions are greatly appreciated! If you're eager to contribute, kindly refer to our [Contributing Guidelines](docs/contributing.md) for detailed information.
189
+
190
+ <!-- For an insight into our future plans, visit our Project Roadmap. -->
191
+ If you wish to be a part of our journey, we invite you to connect with us through our [Discord Community](https://discord.gg/G2rSbAf8uP). We're excited to have you onboard!
192
+
193
+ ## 🛡 Disclaimer
194
+
195
+ The AutoPrompt project is provided on an "as-is" basis without any guarantees or warranties, expressed or implied.
196
+
197
+ Our perspective on the optimization and usage of prompts:
198
+
199
+ 1. The core objective of AutoPrompt is to refine and perfect prompts to achieve high-quality results. This is achieved through an iterative calibration process, which helps in reducing errors and enhancing the performance of LLMs. However, the framework does not guarantee absolute correctness or unbiased results in every instance.
200
+
201
+ 2. AutoPrompt aims to improve the reliability of prompts and mitigate sensitivity issues, but it does not claim to completely eliminate such issues.
202
+ <!-- Our community is committed to exploring the most effective ways to interact with LLMs, fostering a space for diverse views and approaches. -->
203
+
204
+ Please note that using LLMs like OpenAI's GPT-4, supported by AutoPrompt, may lead to significant costs due to token usage. By using AutoPrompt, you acknowledge your responsibility to monitor and manage your token use and expenses. We advise regularly reviewing your LLM provider's API usage and establishing limits or alerts to prevent unexpected charges.
205
+ To manage costs associated with GPT-4 LLM's token usage, the framework enables users to set a budget limit for optimization, in USD or token count, configured as illustrated [here](docs/examples.md#steps-to-run-example).
206
+
207
+ ## Citation
208
+
209
+ If you have used our code in your research, please cite our [paper](https://arxiv.org/abs/2402.03099):
210
+
211
+ ```
212
+ @misc{2402.03099,
213
+ Author = {Elad Levi and Eli Brosh and Matan Friedmann},
214
+ Title = {Intent-based Prompt Calibration: Enhancing prompt optimization with synthetic boundary cases},
215
+ Year = {2024},
216
+ Eprint = {arXiv:2402.03099},
217
+ }
218
+ ```
219
+
220
+
221
+ ## License
222
+
223
+ This framework is licensed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0).
224
+
225
+ ## ✉️ Support / Contact us
226
+ - [Community Discord](https://discord.gg/G2rSbAf8uP)
227
+ - Our email: [autopromptai@gmail.com](mailto:autopromptai@gmail.com)
228
+
229
+
AutoPrompt/config/config_default.yml ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ use_wandb: False
2
+ dataset:
3
+ name: 'dataset'
4
+ records_path: null
5
+ initial_dataset: ''
6
+ label_schema: ["Yes", "No"]
7
+ max_samples: 50
8
+ semantic_sampling: False # Change to True in case you don't have M1. Currently there is an issue with faiss and M1
9
+
10
+ annotator:
11
+ method : 'argilla'
12
+ config:
13
+ api_url: 'https://kenken999-arglira.hf.space'
14
+ api_key: 'admin.apikey'
15
+ workspace: 'admin'
16
+ time_interval: 5
17
+
18
+ predictor:
19
+ method : 'llm'
20
+ config:
21
+ llm:
22
+ type: 'OpenAI'
23
+ name: 'llama3-8b-8192'
24
+ # async_params:
25
+ # retry_interval: 10
26
+ # max_retries: 2
27
+ model_kwargs: {"seed": 220}
28
+ num_workers: 5
29
+ prompt: 'prompts/predictor_completion/prediction.prompt'
30
+ mini_batch_size: 1 # change to >1 if you want to include multiple samples in one prompt
31
+ mode: 'prediction'
32
+
33
+ meta_prompts:
34
+ folder: 'prompts/meta_prompts_classification'
35
+ num_err_prompt: 1 # Number of error examples per sample in the prompt generation
36
+ num_err_samples: 2 # Number of error examples per sample in the sample generation
37
+ history_length: 4 # Number of samples in the meta-prompt history
38
+ num_generated_samples: 10 # Number of generated samples at each iteration
39
+ num_initialize_samples: 10 # Number of generated samples at iteration 0, in zero-shot case
40
+ samples_generation_batch: 10 # Number of samples generated in one call to the LLM
41
+ num_workers: 5 #Number of parallel workers
42
+ warmup: 4 # Number of warmup steps
43
+
44
+ eval:
45
+ function_name: 'accuracy'
46
+ num_large_errors: 4
47
+ num_boundary_predictions : 0
48
+ error_threshold: 0.5
49
+
50
+ llm:
51
+ type: 'OpenAI'
52
+ name: 'llama3-70b-8192'
53
+ temperature: 0.8
54
+
55
+ stop_criteria:
56
+ max_usage: 2 #In $ in case of OpenAI models, otherwise number of tokens
57
+ patience: 10 # Number of patience steps
58
+ min_delta: 0.01 # Delta for the improvement definition
AutoPrompt/config/config_diff/config_batch_classification.yml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ use_wandb: True
2
+ dataset:
3
+ label_schema: ["Yes", "No"]
4
+
5
+ annotator:
6
+ method : 'llm_batch'
7
+ config:
8
+ instructions: ['Is there an address in the text?', 'Is there a phone number in the text?',
9
+ 'Is there a password in the text?']
10
+ aggregation_mode: 'exist' #'majority_vote', 'exist', or 'all'. exist/all is working only in case label_schema: ["Yes", "No"]!
11
+ estimator_config:
12
+ num_workers: 2
13
+ prompt: 'prompts/predictor/prediction.prompt'
14
+ mode: 'annotation'
AutoPrompt/config/config_diff/config_generation.yml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ annotator:
2
+ method : ''
3
+
4
+ dataset:
5
+ max_samples: 20
6
+ label_schema: ["1","2","3","4","5"]
7
+
8
+ predictor:
9
+ method : 'llm'
10
+ config:
11
+ prompt: 'prompts/predictor_completion/prediction_generation.prompt'
12
+ mini_batch_size: 1
13
+ llm:
14
+ type: 'OpenAI'
15
+ name: 'llama3-70b-8192' #'gpt-3.5-turbo-0613'
16
+ num_workers: 7
17
+
18
+ meta_prompts:
19
+ folder: 'prompts/meta_prompts_generation'
20
+ warmup: 1
21
+
22
+ eval:
23
+ function_name: 'ranking'
24
+ error_threshold: 4
25
+
AutoPrompt/config/config_diff/config_ranking.yml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ dataset:
2
+ label_schema: ["1","2","3","4","5"]
3
+
4
+ meta_prompts:
5
+ folder: 'prompts/meta_prompts_ranking'
AutoPrompt/config/llm_env.yml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ openai:
2
+ OPENAI_API_KEY: '' # Never commit a real key; set it locally (untracked file or environment variable). The key previously committed here must be revoked.
3
+ OPENAI_API_BASE: 'https://api.groq.com/openai/v1'
4
+ OPENAI_ORGANIZATION: ''
5
+
6
+ azure:
7
+ AZURE_OPENAI_API_KEY: ''
8
+ AZURE_OPENAI_ENDPOINT: ''
9
+ OPENAI_API_VERSION: ''
10
+
11
+ google:
12
+ GOOGLE_API_KEY: ''
AutoPrompt/dataset/base_dataset.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os.path
2
+ import logging
3
+ import pandas as pd
4
+ from pathlib import Path
5
+ from datetime import datetime
6
+ import csv
7
+
8
+ from utils.dedup import Dedup
9
+
10
class DatasetBase:
    """
    Store and manage all the dataset records (including the annotations and predictions).

    The records are kept in a single pandas DataFrame (``self.records``) with one row per sample.
    A freshly created dataset has the columns
    'id', 'text', 'prediction', 'annotation', 'metadata', 'score' and 'batch_id'.
    """

    def __init__(self, config):
        """
        Create the dataset, either empty or from an existing csv of records.
        :param config: The dataset configuration. It must expose ``records_path``, ``name`` and
                       ``label_schema`` as attributes and support ``config.get(key, default)``.
        """
        if config.records_path is None:
            self.records = pd.DataFrame(columns=['id', 'text', 'prediction',
                                                 'annotation', 'metadata', 'score', 'batch_id'])
        else:
            self.records = pd.read_csv(config.records_path)
        dt_string = datetime.now().strftime("%d_%m_%Y_%H_%M_%S")

        # The creation time is appended to the name so that every run gets a distinct dataset name
        self.name = config.name + '__' + dt_string
        self.label_schema = config.label_schema
        self.dedup = Dedup(config)
        self.sample_size = config.get("sample_size", 3)
        self.semantic_sampling = config.get("semantic_sampling", False)
        if not config.get('dedup_new_samples', False):
            # Deduplication of new samples is opt-in: shadow the method with the identity function
            self.remove_duplicates = self._null_remove

    def __len__(self):
        """
        Return the number of samples in the dataset.
        """
        return len(self.records)

    def __getitem__(self, batch_idx):
        """
        Return the records of the batch batch_idx (with a fresh 0..n-1 index).
        """
        extract_records = self.records[self.records['batch_id'] == batch_idx]
        return extract_records.reset_index(drop=True)

    def get_leq(self, batch_idx):
        """
        Return all the records up to batch_idx (includes), with a fresh 0..n-1 index.
        """
        extract_records = self.records[self.records['batch_id'] <= batch_idx]
        return extract_records.reset_index(drop=True)

    def _next_id(self) -> int:
        """
        Return the first id that is safe to assign to a new record.
        The row count alone is not enough: `update` drops discarded rows, so the count can be
        smaller than (or equal to) an id that is still in use.
        """
        num_records = len(self.records)
        if num_records == 0 or 'id' not in self.records.columns:
            return num_records
        highest_id = pd.to_numeric(self.records['id'], errors='coerce').max()
        if pd.isna(highest_id):
            # No numeric id available, fall back to the row count
            return num_records
        return max(int(highest_id) + 1, num_records)

    def add(self, sample_list: list = None, batch_id: int = None, records: pd.DataFrame = None):
        """
        Add records to the dataset.
        :param sample_list: The texts of the samples to add (only used in case records=None)
        :param batch_id: The batch_id for the upload records (only used in case records=None)
        :param records: dataframes, update using pandas
        """
        if records is None:
            first_id = self._next_id()
            records = pd.DataFrame([{'id': first_id + i, 'text': sample, 'batch_id': batch_id} for
                                    i, sample in enumerate(sample_list)])
        self.records = pd.concat([self.records, records], ignore_index=True)

    def update(self, records: pd.DataFrame):
        """
        Update records in dataset, matching the rows by their 'id'.
        Rows whose annotation is "Discarded" after the update are removed from the dataset.
        The given dataframe is left unmodified.
        :param records: The new values, with an 'id' column (or already indexed by 'id')
        """
        # Ignore if records is empty
        if len(records) == 0:
            return

        # Use 'id' as the key on both sides, working on re-indexed copies so that neither the
        # caller's dataframe nor self.records is left half-modified if the update fails
        already_keyed = records.index.name == 'id' and 'id' not in records.columns
        incoming = records if already_keyed else records.set_index('id')
        current = self.records.set_index('id')

        # Update using 'id' as the key
        current.update(incoming)

        # Remove discarded annotations
        # TODO: direct the discarded records to another dataset to be used later for corner-cases
        current = current.loc[current["annotation"] != "Discarded"]

        # Restore 'id' as a regular column
        self.records = current.reset_index()

    def modify(self, index: int, record: dict):
        """
        Modify a record in the dataset.
        :param index: The row label of the record in self.records
        :param record: The values to set, as a {column name: value} dict
        """
        # NOTE(review): index is interpreted as the row label of self.records (identical to the
        # row position after `add`/`update`, which both reset the index) - confirm against callers
        for column_name, value in record.items():
            self.records.at[index, column_name] = value

    def apply(self, function, column_name: str):
        """
        Apply function on each record and store the result in the column column_name.
        """
        self.records[column_name] = self.records.apply(function, axis=1)

    def save_dataset(self, path: Path):
        """
        Dump the records to a csv file.
        :param path: path for the csv
        """
        self.records.to_csv(path, index=False, quoting=csv.QUOTE_NONNUMERIC)

    def load_dataset(self, path: Path):
        """
        Loading dataset
        :param path: path for the csv
        """
        if os.path.isfile(path):
            self.records = pd.read_csv(path, dtype={'annotation': str, 'prediction': str, 'batch_id': int})
        else:
            logging.warning('Dataset dump not found, initializing from zero')

    def remove_duplicates(self, samples: list) -> list:
        """
        Remove (soft) duplicates from the given samples
        :param samples: The samples
        :return: The samples without duplicates
        """
        dd = self.dedup.copy()
        df = pd.DataFrame(samples, columns=['text'])
        df_dedup = dd.sample(df, operation_function=min)
        return df_dedup['text'].tolist()

    def _null_remove(self, samples: list) -> list:
        # Identity function that returns the input unmodified
        return samples

    def sample_records(self, n: int = None) -> pd.DataFrame:
        """
        Return a sample of the records (after semantic clustering, if semantic_sampling is enabled)
        :param n: The number of samples to return (defaults to self.sample_size)
        :return: A sample of the records, at most n rows
        """
        n = n or self.sample_size
        if self.semantic_sampling:
            dd = self.dedup.copy()
            df_samples = dd.sample(self.records).head(n)

            if len(df_samples) < n:
                # Not enough samples survived the semantic sampling, fall back to the first records
                df_samples = self.records.head(n)
        else:
            # Cap n, DataFrame.sample raises when asked for more rows than available
            df_samples = self.records.sample(min(n, len(self.records)))
        return df_samples

    @staticmethod
    def samples_to_text(records: pd.DataFrame) -> str:
        """
        Return a string that organize the samples for a meta-prompt
        :param records: The samples for the step
        :return: A string that contains the organized samples
        """
        return '##\n' + ''.join(f"Sample:\n {row.text}\n#\n" for _, row in records.iterrows())
157
+
158
+
AutoPrompt/docs/AutoPrompt_Diagram.png ADDED
AutoPrompt/docs/arch_overview.png ADDED
AutoPrompt/docs/architecture.md ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Architecture Guide
2
+ <img src="./arch_overview.png" alt="Architecture overview" width="70%">
3
+
4
+ This document outlines the system design of AutoPrompt, which is built around four primary components: Dataset, Estimator, Evaluator, and Optimizer Manager. These components collaborate to refine prompts through an iterative process involving sample generation, annotation, prediction, evaluation of scores, and optimization.
5
+
6
+ * __Dataset.__ This component manages the dataset and performs operations such as insertion, modification, deletion, and applying functions, on the dataset rows. The component also handles data cleaning by removing semantic duplications and performing semantic sampling. Since the system is optimized for small datasets, the current implementation is based on a local database using [pandas](https://pandas.pydata.org).
7
+ * __Estimator.__ The estimator is responsible for estimating a batch of samples. We implement this component in two forms, once for the predictions and once for the annotations. Such a generic implementation (for both use cases) allows for easy adaptation of the system to diverse use cases, including prompt calibration, prompt distillation and prompt squashing. The currently supported types of estimators are:
8
+ 1. __Human annotation__: Using [Argilla UI](https://docs.argilla.io/en/latest/index.html#). The system is connected to the Argilla server and is waiting until the annotation task is completed.
9
+ 2. __LLM estimator__: Using an LLM to estimate the sample given a prompt. We support various types of LLMs, using [Langchain](https://python.langchain.com/docs/get_started/introduction) integration. For efficiency, the system supports parallelism using both workers and async calls. The system also supports sending a few samples in one prompt (prompt batching), which can reduce the cost significantly.
10
+ 3. __Batch estimator__: The batch estimator runs multiple LLM estimators and adds a policy layer to aggregate the results. It is mainly used for prompt-squashing, aiming to optimize a single prompt that achieves the efficacy of multiple prompts. For example, in case of a user with several moderation rules.
11
+ * __Evaluator.__ The evaluator is responsible for evaluating the records after the prediction and annotation stage. The evaluator accepts a function and applies it to each row. It's important to note that the function is generic, for example in the generation pipeline the function is performed by invoking an LLM. The evaluator is also responsible for defining the errors and handling the error analysis using the Analyzer meta-prompt.
12
+ * __Optimizer manager (Optimization Pipeline).__ The optimizer manager handles the whole optimization process flow: it performs the iteration steps described in the system flow [documentation](how-it-works.md) and is responsible for stopping and returning the final calibrated prompt. The currently supported stopping criteria are either convergence (determined by a patience hyper-parameter), or usage limit (determined by maximal cost if relevant, or by the number of generated tokens).
13
+
14
+ ## Design Considerations
15
+
16
+ - **Modularity and Flexibility**: Each component is designed with modularity in mind, allowing for easy swaps or upgrades to accommodate different use cases.
17
+ - **Scalability**: The framework's architecture supports scaling, from handling small datasets efficiently to accommodating the computational demands of parallel processing and batch estimation.
18
+ - **Cost-Efficiency**: Features like prompt batching and the use of a batch estimator are specifically included to manage and minimize operational costs associated with LLM usage.
AutoPrompt/docs/argilla_movie_spoilers_example.png ADDED
AutoPrompt/docs/autoprompt_recording.gif ADDED

Git LFS Details

  • SHA256: e4156d4ad7c4d971a7a7721b0a031000a43ba677dd3dd20e2c15f54de88b6172
  • Pointer size: 132 Bytes
  • Size of remote file: 2.33 MB
AutoPrompt/docs/contributing.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing to AutoPrompt
2
+
3
+ Thank you for considering contributing to AutoPrompt! We deeply appreciate your interest in improving our project.
4
+
5
+ ## Bug Fixes and Documentation Enhancements
6
+
7
+ Bug fixes and documentation improvements, including compelling examples and use cases, greatly benefit our project. If you encounter any bugs or identify areas where the documentation could be strengthened, please do not hesitate to submit a pull request (PR) containing your proposed changes.
8
+
9
+ ## Feature Requests
10
+
11
+ For significant feature additions, we encourage you to open an issue on GitHub. Additionally, we invite you to join our Discord community and engage in discussions about the feature in the #features-requests channel. This collaborative environment enables us to delve deeper into the proposed features and foster meaningful dialogue.
12
+
13
+ We value your contributions and look forward to working together to enhance AutoPrompt!
AutoPrompt/docs/examples.md ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Prompt Optimization Examples
3
+
4
+ This document provides practical examples of using the AutoPrompt pipeline across various scenarios. It focuses on movie review and chat moderation tasks to demonstrate the flexibility and effectiveness of the AutoPrompt framework.
5
+
6
+
7
+ 1. [Filtering Movie Reviews with Spoilers (Classification task)](#filtering-movie-reviews-with-spoilers-classification-task)
8
+ 2. [Movie Genre Identification (Multi-label classification task)](#movie-genre-identification-multi-label-classification)
9
+ 3. [Rating Movie Reviews (Scoring task)](#rating-movie-reviews-scoring-task)
10
+ 4. [Generating Movie Reviews (Generation task)](#generating-movie-reviews-generation-task)
11
+ 5. [Single Topic Moderation](#single-topic-moderation)
12
+ 6. [Multi-Topic Moderation (Prompt squeezing task)](#multi-topic-moderation-prompt-squeezing-task)
13
+
14
+ ### Filtering Movie Reviews with Spoilers (Classification task)
15
+
16
+ In this binary classification example, we aim to filter out movie reviews containing spoilers for a specific movie. A correctly implemented filter can be a powerful tool in a large-scale movie review system.
17
+
18
+ We'll start with a simple initial prompt and task description:
19
+ - Initial prompt: “Does this movie review contain a spoiler? answer Yes or No”<br>
20
+ - Task description: “Assistant is an expert classifier that will classify a movie review, and let the user know if it contains a spoiler for the reviewed movie or not.”
21
+
22
+ #### Steps to Run Example
23
+
24
+ 1. Configure your labels by editing `config/config_default.yml`. Modify the `label_schema` in the `dataset` section to include only 'Yes' and 'No' options.
25
+
26
+ ```
27
+ dataset:
28
+ name: 'dataset'
29
+ records_path: null
30
+ initial_dataset: 'dump/dataset.csv'
31
+ label_schema: ["Yes", "No"]
32
+ max_samples: 50
33
+ ```
34
+ 2. Run the main pipeline from an IDE or the command line
35
+ ```bash
36
+ > python run_pipeline.py
37
+ ```
38
+
39
+ *Note*: Without input parameters, the pipeline prompts the user to provide them. Alternatively, specify initial prompt and task description as command-line arguments:
40
+ ```bash
41
+ > python run_pipeline.py \
42
+ --prompt "Does this movie review contain a spoiler? answer Yes or No" \
43
+ --task_description "Assistant is an expert classifier that will classify a movie review, and let the user know if it contains a spoiler for the reviewed movie or not."
44
+ ```
45
+
46
+ 3. A browser window displaying the Argilla workspace will open for manual annotations
47
+ ![argilla_example](./argilla_movie_spoilers_example.png)
48
+
49
+ Annotate the generated examples as they appear and monitor the pipeline's progress. Control the number of optimization iterations with the `num_steps` parameter, specified at start:
50
+ ```bash
51
+ > python run_pipeline.py --num_steps 30
52
+ ```
53
+ The pipeline concludes after reaching the `num_steps` or meeting a predefined stop criteria, defined in `config/config_default.yml`:
54
+ ```
55
+ stop_criteria:
56
+ max_usage: 0.5 # Max budget for optimization (USD for OpenAI's LLM model)
57
+ patience: 3 # Number of iterations to wait for improvement
58
+ min_delta: 0.05 # Minimum improvement between iterations
59
+ ```
60
+ Note that the framework also supports using an LLM as the annotator, see setup instructions [here](installation.md#configure-llm-annotator).
61
+
62
+ 4. After completion, the pipeline outputs a **refined (calibrated) prompt** tailored for the task and a reference **benchmark** with challenging samples. In this example, the final spoiler identification prompt might be:
63
+
64
+ ```
65
+ Review Spoiler Identification Protocol: For the task of classifying IMDB reviews for
66
+ the presence of spoilers, the classifier must label reviews with a heightened sensitivity to
67
+ nuanced language and indirect spoiler cues. The classification labels are ’Yes’ for spoilers
68
+ and ’No’ for non-spoilers. Apply the following criteria rigorously: Label ’Yes’ if a review: -
69
+ Contains subtle references or nuanced language that hints at plot developments or character
70
+ arcs, without explicit detail. - Includes emotional responses or descriptive language that
71
+ indirectly reveals plot outcomes or twists. - Employs suggestive language that points to future
72
+ events or endings, even if it does not reveal specific information. Label ’No’ if a review: -
73
+ Discusses technical aspects, acting, direction, or personal viewer impressions in a manner
74
+ that does not hint at or reveal any plot details. - Comments on thematic elements, genre
75
+ characteristics, or storytelling techniques without disclosing or implying crucial plot twists.
76
+ ```
77
+
78
+ - The framework automatically saves the benchmark, run log, and a checkpoint file (which stores the state of the optimization, enabling seamless continuation from a previous run) in a default `dump` path, adjustable with the `--output_dump` command line argument.
79
+ - Note that the steps above are relevant to all classification and generation tasks. See the following examples for more use cases.
80
+
81
+ 5. Until now, we've initiated the pipeline with just an initial prompt and task description. However, you can also include a few examples by specifying an initial dataset in the `initial_dataset` field within the `dataset` section of the `config/config_default.yml` file. For example:
82
+ ```
83
+ dataset:
84
+ initial_dataset: 'dump/dataset.csv'
85
+ ```
86
+ An example of an initial dataset with two samples is shown below:
87
+ ```
88
+ id,text,prediction,annotation,metadata,score,batch_id
89
+ 0,"The cinematography was mesmerizing, especially during the scene where they finally reveal the mysterious room that captivated the main character.",No,Yes,,,0
90
+ 1,"The director's bold choice to leave the world's fate unclear until the final frame will spark audience discussions.",No,Yes,,,0
91
+ ```
92
+
93
+
94
+ ### Movie Genre Identification (Multi-label classification):
95
+
96
+ In this example, we want to segment movie reviews into pre-defined genres. The initial prompt and task description might look like this:
97
+ - Initial prompt: "Based on the following movie review, what genre is this movie? Select between Action, Comedy, Drama, Romance or Horror."
98
+ - Task description: "Assistant is an expert cinema critic for all genres, and is tasked with classifying other movie reviews."
99
+
100
+ #### Run Example
101
+ For this multi-label classification, update the `label_schema` in `config/config_default.yml`
102
+ ```
103
+ dataset:
104
+ label_schema: ["Action", "Comedy", "Drama", "Romance", "Horror"]
105
+ ```
106
+ And then simply run the pipeline with the corresponding input parameters:
107
+ ```bash
108
+ > python run_pipeline.py \
109
+ --prompt "Based on the following movie review, what genre is this movie? Select between Action, Comedy, Drama, Romance or Horror." \
110
+ --task_description "Assistant is an expert cinema critic for all genres, and is tasked with classifying other movie reviews."
111
+ ```
112
+ Please follow the same annotation and monitoring procedures as shown in the previous example.
113
+
114
+ ### Rating Movie Reviews (Scoring task):
115
+ In this example, we aim to score (rank) the movie reviews based on various criteria, assigning a numerical rating to each.
116
+
117
+ We'll start with a simple initial prompt:
118
+ - Initial prompt: "How well is this movie review written? Give it a score between 1 and 5, with 1 being the lowest score."
119
+ - Task description: "Assistant is an expert cinema reviewer and editor, and is tasked with scoring other movie reviews."
120
+
121
+ Note that although this task involves scoring, it is treated as a classification task, similar to the examples above.
122
+
123
+ #### Run Example
124
+ To run this task, update the `label_schema` in the input `config/config_default.yml` config file:
125
+ ```
126
+ dataset:
127
+ label_schema: ["1", "2", "3", "4", "5"]
128
+ ```
129
+ And then simply use the input parameters to run the pipeline:
130
+ ```bash
131
+ > python run_pipeline.py \
132
+ --prompt "How well is this movie review written? Give it a score between 1 and 5, with 1 being the lowest score." \
133
+ --task_description "Assistant is an expert cinema reviewer and editor, and is tasked with scoring other movie reviews."
134
+ ```
135
+ Follow the same steps as in the simple classification example for running the pipeline and annotating through the Argilla UI.
136
+
137
+ ### Generating Movie Reviews (Generation task):
138
+ Here, we aim to generate good (insightful and comprehensive) movie reviews from scratch. The initial prompt might look something like this:
139
+ - Initial prompt: “Write a good and comprehensive movie review about a specific movie.”
140
+ - Task description: “Assistant is a large language model that is tasked with writing movie reviews.”
141
+
142
+ This time, we'll need to use the `run_generation_pipeline.py` to initiate a generation run. This pipeline is different from but builds on the classification pipeline in our earlier examples.
143
+
144
+ The generation pipeline starts by taking the initial prompt and modifying it for a scoring task, similar to the scoring example above. Once it establishes a robust estimator for high-quality content, in this instance movie reviews, it runs the generation pipeline without the need for human annotation.
145
+
146
+ To facilitate this, two distinct input config files are employed: `config/config_diff/config_ranking.yml`, and `config/config_diff/config_generation.yml`.
147
+
148
+ Note that the `annotator` section in the generation config yaml file remains empty:
149
+ ```
150
+ annotator:
151
+ method : ''
152
+ ```
153
+
154
+ #### Run Example
155
+
156
+ Run the generation pipeline with appropriate arguments:
157
+ ```bash
158
+ > python run_generation_pipeline.py \
159
+ --prompt "Write a good and comprehensive movie review about a specific movie." \
160
+ --task_description "Assistant is a large language model that is tasked with writing movie reviews."
161
+ ```
162
+
163
+ As the pipeline runs, the user will be prompted to annotate ranking examples of movie reviews. The final output will be a calibrated prompt for the generation task.
164
+
165
+ ### Single Topic Moderation:
166
+
167
+ In this example, we aim to monitor user interactions on an Enterprise's chat platform to moderate (filter out) any unsolicited advertisements. This ensures a focused and relevant communication environment.
168
+
169
+ The initial prompt could be as follows:
170
+
171
+ - Initial prompt: “Assess whether the message contains advertising. Answer 'Yes' or 'No'.”
172
+ - Task description: “As a moderation expert at FabricFantasia, an online store selling clothes, you meticulously review customer inquiries and support tickets.”
173
+
174
+ #### Run Example
175
+ For the moderation, update the `label_schema` in `config/config_default.yml`
176
+ ```
177
+ dataset:
178
+ label_schema: ["Yes", "No"]
179
+ ```
180
+ And then execute the pipeline with the specified input parameters:
181
+ ```bash
182
+ > python run_pipeline.py \
183
+ --prompt "Assess whether the message contains advertising. Answer 'Yes' or 'No'." \
184
+ --task_description "As a moderation expert at FabricFantasia, an online store selling clothes, you meticulously review customer inquiries and support tickets."
185
+ ```
186
+ Please follow the same annotation and monitoring procedures as shown in the previous examples.
187
+
188
+ ### Multi Topic Moderation (Prompt squeezing task):
189
+ In this example, our goal is to monitor user interactions on an enterprise's chat platform and moderate (filter out) any problematic topics, including disclosing personal information, deceptive practices, spam, illegal activities, conflict of interest, and off-topic content.
190
+
191
+ The initial prompt could be structured as follows:
192
+
193
+ - Initial prompt: “Does this message contain any problematic topics such as disclosing personal information, deceptive practices, spam, illegal activities, conflict of interest, or off-topic content? Respond with 'Yes' or 'No'.”
194
+ - Task description: “As a moderation expert at FabricFantasia, an online store selling clothes, you meticulously review customer inquiries and support tickets.”
195
+
196
+
197
+ #### Run Example
198
+ In a multi-topic moderation setting, we use various moderation rules to annotate a sample. Each rule is evaluated independently, and the outcomes are combined to generate the final labels. We employ an LLM annotator to avoid time-intensive manual annotation.
199
+
200
+ This task utilizes two distinct input configuration files: `config/config_default.yml`, used previously, and `config/config_diff/config_batch_classification.yml`, which specifies the individual moderation rules, the policy for aggregating results, and LLM configuration. The available aggregation policies are 'exist', 'majority', and 'all'. The 'exist' and 'all' policies are suited for scenarios with 'Yes' or 'No' labels, while the 'majority' policy assigns the final label based on the most frequently occurring outcome across the rules.
201
+
202
+ In our case, it can look like this:
203
+ ```
204
+ dataset:
205
+ label_schema: ["Yes", "No"]
206
+
207
+ annotator:
208
+ method : 'llm_batch'
209
+ config:
210
+ instructions:
211
+ ['Does the message disclose sensitive personal information? Answer Yes or No',
212
+ 'Does the message involve deceptive practices? Answer Yes or No',
213
+ 'Is this message an example of spam? Answer Yes or No',
214
+ 'Does the message reference or promote any illegal activities? Answer Yes or No',
215
+ 'Does the message come from someone with a potential conflict of interest? Answer Yes or No',
216
+ 'Is this message completely irrelevant to the services or products offered? Answer Yes or No'
217
+ ]
218
+ aggregation_mode: 'exist' #'majority', 'exist', or 'all'. exist/all is working only in case label_schema: ["Yes", "No"]!
219
+ estimator_config:
220
+ num_workers: 2
221
+ prompt: 'prompts/predictor/prediction.prompt'
222
+ mode: 'annotation'
223
+ mini_batch_size: 1
224
+ llm:
225
+ type: 'OpenAI'
226
+ name: 'gpt-4-1106-preview'
227
+ ```
228
+
229
+ Also, update the `label_schema` in `config/config_default.yml`
230
+ ```
231
+ dataset:
232
+ label_schema: ["Yes", "No"]
233
+ ```
234
+
235
+ #### Run Example
236
+ As before, we'll use the `run_pipeline.py` to initiate a multi-topic moderation run.
237
+ ```bash
238
+ > python run_pipeline.py \
239
+ --batch_config_path "config/config_diff/config_batch_classification.yml" \
240
+ --prompt "Assess whether the message contains any of the following problematic topics: disclosing personal information, deceptive practices, spam, illegal activities, conflict of interest, off-topic content. Answer 'Yes' if it does or 'No' otherwise." \
241
+ --task_description "As a moderation expert at FabricFantasia, an online store selling clothes, you meticulously review customer inquiries and support tickets."
242
+ ```
243
+ Please follow the same annotation and monitoring procedures as shown in the previous examples.
AutoPrompt/docs/how-it-works.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # How AutoPrompt works
3
+
4
+ This document outlines the optimization process flows of AutoPrompt. The framework is designed with modularity and adaptability in mind, allowing for easy extension of the prompt calibration process from classification tasks to generative tasks.
5
+
6
+
7
+ ## Classification Pipeline Overview
8
+
9
+ The classification pipeline executes a calibration process involving the following steps:
10
+
11
+ 1. **User Input:**
12
+ - The user provides an initial prompt and task description to kickstart the calibration process.
13
+
14
+ 2. **Challenging Examples:**
15
+ - A set of challenging examples is proposed to the user to enhance the model's performance.
16
+
17
+ 3. **Annotation:**
18
+ - The provided examples are annotated, utilizing either a human-in-the-loop approach or leveraging Language Model (LLM) capabilities.
19
+
20
+ 4. **Prediction:**
21
+ - The annotated samples are evaluated using the current prompt to assess model performance.
22
+
23
+ 5. **Prompt Analysis:**
24
+ - The pipeline analyzes the prompt scores and identifies instances of large errors.
25
+
26
+ 6. **Prompt Refinement:**
27
+ - A new prompt is suggested based on the evaluation results, aiming to improve model accuracy.
28
+
29
+ 7. **Iteration:**
30
+ - Steps 2-6 are iteratively repeated until convergence, refining the prompt and enhancing the model's performance throughout the process.
31
+
32
+
33
+ ## Generation Pipeline Overview
34
+
35
+ The generation pipeline shares a common structure with the classification flow but introduces a modification step for generation prompts. The process unfolds as follows:
36
+
37
+ 1. **User Input:**
38
+ - The user provides an initial prompt and task description for the generation process.
39
+
40
+ 2. **Prompt Modification (LLM):**
41
+ - The initial prompt is transformed into a classification-compatible input using a Language Model (LLM), creating an intermediary task for boolean classification or ranking.
42
+
43
+ 3. **Annotation (Classification):**
44
+ - Challenging examples are annotated for boolean classification or ranking based on the modified prompts. This step is analogous to the classification flow.
45
+
46
+ 4. **Ranker Calibration (LLM):**
47
+ - Utilizing the annotated examples, a ranking prompt (implemented as an LLM estimator) is fitted.
48
+
49
+ 5. **Calibration (Generation):**
50
+ - The original generation prompt is calibrated using the ranking LLM estimator (now used for evaluation), resulting in enhanced prompt formulations for generation tasks.
51
+
52
+
53
+
54
+ The modular architecture of the pipeline demonstrates the flexibility of the core calibration process and effectiveness for both classification and generation tasks. The additional step in the generation flow seamlessly integrates with the overall iterative prompt calibration approach.
55
+
56
+
57
+
58
+
AutoPrompt/docs/installation.md ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Installation
2
+
3
+ This guide provides detailed instructions for setting up your development environment, configuring LLMs, and integrating various tools necessary for your project.
4
+
5
+ ## Python version
6
+ We recommend using Python 3.10.13.
7
+
8
+ ## Install with Conda
9
+ We recommend installing using Conda:
10
+ ```bash
11
+ conda env create -f environment_dev.yml
12
+ conda activate AutoPrompt
13
+ ```
14
+
15
+ ## Install with pip
16
+ Install using pip directly:
17
+ ```bash
18
+ pip install -r requirements.txt
19
+ ```
20
+
21
+ ## Install with pipenv
22
+ Install using pipenv:
23
+ ```bash
24
+ pip install pipenv
25
+ pipenv sync
26
+ ```
27
+
28
+ ### Configure your LLM
29
+
30
+ Set your OpenAI API key in the configuration file `config/llm_env.yml`. For assistance locating your API key, visit this [link](https://help.openai.com/en/articles/4936850-where-do-i-find-my-api-key).
31
+
32
+ - For LLM, we recommend using [OpenAI's GPT-4](https://platform.openai.com/docs/guides/gpt). Alternatively, configure Azure by setting llm type in `config/config_default.yml` to `"Azure"` and specifying the key in `config/llm_env.yml`. Our system also supports various LLMs, including open source models, through [Langchain Pipeline](https://python.langchain.com/docs/integrations/llms/huggingface_pipelines). Change the llm `type` to `"HuggingFacePipeline"` and specify the model ID in the llm `name` field.
33
+
34
+ - **Configure your Predictor**. We employ a predictor to estimate prompt performance. The default predictor LLM is GPT-3.5. Configuration is located in the `predictor` section of `config/config_default.yml`.
35
+
36
+ ### Configure Human-in-the-Loop Annotator
37
+
38
+ Our pipeline incorporates a human-in-the-loop annotation process using [Argilla](https://docs.argilla.io/en/latest/index.html). Follow these steps to set it up:
39
+
40
+ 1. **Set Up Argilla Server and UI:** Follow the [instructions](https://docs.argilla.io/en/latest/getting_started/quickstart_installation.html) to install and set up an Argilla server and user interface.
41
+
42
+ 2. **Quick Installation Option:** For a faster setup, we recommend deploying Argilla on a Hugging Face [space](https://huggingface.co/new-space?template=argilla/argilla-template-space).
43
+
44
+ 3. **Configure API Settings:** After setting up the server, modify the `api_url` and `api_key` in the `config/config_default.yml` file. For instance, if using the recommended Hugging Face space, your API URL should be formatted as follows: `api_url: 'https://<your-argilla-space-name>.hf.space'`.
45
+
46
+
47
+ ### Configure LLM Annotator
48
+
49
+ To specify an LLM as the annotation tool in your pipeline, update the `annotator` section in the `config/config_default.yml` file as follows:
50
+
51
+ ```
52
+ annotator:
53
+ method: 'llm'
54
+ config:
55
+ llm:
56
+ type: 'OpenAI'
57
+ name: 'gpt-4-1106-preview'
58
+ instruction:
59
+ 'Assess whether the text contains a harmful topic.
60
+ Answer Yes if it does and No otherwise.'
61
+ num_workers: 5
62
+ prompt: 'prompts/predictor_completion/prediction.prompt'
63
+ mini_batch_size: 1
64
+ mode: 'annotation'
65
+ ```
66
+ We recommend using a robust LLM, like GPT-4, for annotation purposes. In the `instruction` field, you specify the task instructions for the annotation. The `mini_batch_size` field determines the number of samples processed in a single annotation pass, allowing you to balance efficiency with LLM token usage.
67
+
68
+
69
+ ### Monitoring: Weights and Biases Setup
70
+
71
+ To effectively track your optimization process, including metrics such as scores, prompt instances, and error analysis across iterations, we recommend using [Weights and Biases](https://wandb.ai/site).
72
+
73
+ 1. **Sign Up for Weights and Biases:** Visit their [website](https://wandb.ai/site) and follow the instructions to create an account.
74
+
75
+ 2. **Enable wandb in Your Configuration:** In your project's `config/config_default.yml` file, set `use_wandb` to `True` to activate wandb support.
AutoPrompt/environment_dev.yml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: AutoPrompt
2
+
3
+ channels:
4
+ - conda-forge
5
+ dependencies:
6
+ - python=3.10.13
7
+ - pip>=2.22.0
8
+ - openai
9
+ - langchain
10
+ - pandas
11
+ - wandb
12
+ - transformers
13
+ - tqdm
14
+ - faiss-cpu
15
+ - sentence-transformers
16
+ - pip:
17
+ - prodict
18
+ - argilla==1.25.0
19
+ - schedule
20
+ - pandas
21
+ - easydict
22
+ - pillow==10.2.0
23
+ - langchain-google-genai==0.0.9
AutoPrompt/estimator/__init__.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+ from .estimator_argilla import ArgillaEstimator
4
+ from .estimator_llm import LLMEstimator
5
+ from .estimator_llm_batch import LLMBatchEstimator
6
+ from dataset.base_dataset import DatasetBase
7
+
8
+
9
class DummyEstimator:
    """
    Null-object estimator, used when no estimation method is configured.
    It offers the same entry points as the real estimators but performs no work.
    """

    @staticmethod
    def apply(dataset: DatasetBase, batch_id: int):
        """
        No-op counterpart of the estimators' apply method
        :param dataset: Ignored
        :param batch_id: Ignored
        :return: An empty dataframe
        """
        empty_result = pd.DataFrame()
        return empty_result

    @staticmethod
    def calc_usage():
        """
        Report the usage of the dummy estimator
        :return: Always 0, since nothing is ever invoked
        """
        return 0
28
+
29
def give_estimator(opt):
    """
    Factory that builds the estimator selected in the configuration
    :param opt: The estimator configuration; 'method' selects the implementation and
                'config' is handed to its constructor
    :return: An ArgillaEstimator, LLMEstimator or LLMBatchEstimator instance, or a
             DummyEstimator when the method is not one of the known names
    """
    method = opt.method
    if method == 'argilla':
        estimator_cls = ArgillaEstimator
    elif method == 'llm':
        estimator_cls = LLMEstimator
    elif method == 'llm_batch':
        estimator_cls = LLMBatchEstimator
    else:
        # Unknown or empty method: fall back to the no-op estimator
        return DummyEstimator()
    return estimator_cls(opt.config)
AutoPrompt/estimator/estimator_argilla.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argilla as rg
2
+ import time
3
+ import pandas as pd
4
+ from argilla.client.singleton import active_client
5
+ from utils.config import Color
6
+ from dataset.base_dataset import DatasetBase
7
+ import json
8
+ import webbrowser
9
+ import base64
10
+
11
class ArgillaEstimator:
    """
    The ArgillaEstimator class is responsible to generate the GT for the dataset by using Argilla interface.
    In particular using the text classification mode.
    """

    def __init__(self, opt):
        """
        Initialize a new instance of the ArgillaEstimator class and connect to the Argilla server.
        :param opt: The estimator configuration; api_url, api_key, workspace and time_interval are read from it
        :raises Exception: If the connection to Argilla could not be initialized
        """
        try:
            self.opt = opt
            rg.init(
                api_url=opt.api_url,
                api_key=opt.api_key,
                workspace=opt.workspace
            )
            self.time_interval = opt.time_interval
        except Exception as err:
            # Chain the original error so the root cause stays visible in the traceback.
            # KeyboardInterrupt/SystemExit are deliberately not intercepted.
            raise Exception("Failed to connect to argilla, check connection details") from err

    @staticmethod
    def initialize_dataset(dataset_name: str, label_schema: set[str]):
        """
        Initialize a new dataset in the Argilla system
        :param dataset_name: The name of the dataset
        :param label_schema: The list of classes
        :raises Exception: If the dataset settings could not be configured
        """
        try:
            settings = rg.TextClassificationSettings(label_schema=label_schema)
            rg.configure_dataset_settings(name=dataset_name, settings=settings)
        except Exception as err:
            raise Exception("Failed to create dataset") from err

    @staticmethod
    def upload_missing_records(dataset_name: str, batch_id: int, batch_records: pd.DataFrame):
        """
        Update the Argilla dataset by adding missing records from batch_id that appears in batch_records
        :param dataset_name: The dataset name
        :param batch_id: The batch id
        :param batch_records: A dataframe of the batch records
        """
        # TODO: sort visualization according to batch_id descending
        query = "metadata.batch_id:{}".format(batch_id)
        result = rg.load(name=dataset_name, query=query)
        df = result.to_pandas()
        if len(df) == len(batch_records):
            # Same number of records on both sides: the batch is treated as already uploaded
            return
        if df.empty:
            upload_df = batch_records
        else:
            # Keep only the records whose text is not yet present in Argilla
            merged_df = pd.merge(batch_records, df['text'], on='text', how='left', indicator=True)
            upload_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])
        record_list = []
        for _, row in upload_df.iterrows():
            config = {'text': row['text'], 'metadata': {"batch_id": row['batch_id'], 'id': row['id']}, "id": row['id']}
            # TODO: also upload row['prediction'] once its type matches what Argilla expects
            if not (row[['annotation']].isnull().any()):  # TODO: fix it incorrect type!!!
                config['annotation'] = row['annotation']
            record_list.append(rg.TextClassificationRecord(**config))
        rg.log(records=record_list, name=dataset_name)

    def calc_usage(self):
        """
        Dummy function to calculate the usage of the estimator
        :return: Always 0
        """
        return 0

    def apply(self, dataset: DatasetBase, batch_id: int):
        """
        Apply the estimator on the dataset. The function blocks, polling Argilla every
        self.time_interval seconds, until all the records of the batch are validated or discarded.
        :param dataset: DatasetBase object, contains all the processed records
        :param batch_id: The batch id to annotate
        :return: A dataframe with the columns text, annotation, metadata and id
                 (discarded records get the annotation 'Discarded'),
                 or an empty list if the batch has no records
        """
        current_api = active_client()
        try:
            rg_dataset = current_api.datasets.find_by_name(dataset.name)
        except Exception:
            # The dataset could not be found: create it and look it up again
            self.initialize_dataset(dataset.name, dataset.label_schema)
            rg_dataset = current_api.datasets.find_by_name(dataset.name)
        batch_records = dataset[batch_id]
        if batch_records.empty:
            return []
        self.upload_missing_records(dataset.name, batch_id, batch_records)

        # Build a UI link whose query parameter is a base64 encoded JSON filter on the batch id
        data = {'metadata': {'batch_id': [str(batch_id)]}}
        json_data = json.dumps(data)
        encoded_bytes = base64.b64encode(json_data.encode('utf-8'))
        encoded_string = str(encoded_bytes, "utf-8")
        url_link = self.opt.api_url + '/datasets/' + self.opt.workspace + '/' \
                   + dataset.name + '?query=' + encoded_string
        print(f"{Color.GREEN}Waiting for annotations from batch {batch_id}:\n{url_link}{Color.END}")
        webbrowser.open(url_link)

        query = "(status:Validated OR status:Discarded) AND metadata.batch_id:{}".format(batch_id)
        while True:
            # size=0: only the total number of handled records is needed here
            search_results = current_api.search.search_records(
                name=dataset.name,
                task=rg_dataset.task,
                size=0,
                query_text=query,
            )
            if search_results.total == len(batch_records):
                result = rg.load(name=dataset.name, query=query)
                df = result.to_pandas()[['text', 'annotation', 'metadata', 'status']]
                df["annotation"] = df.apply(lambda x: 'Discarded' if x['status'] == 'Discarded' else x['annotation'], axis=1)
                df = df.drop(columns=['status'])
                df['id'] = df.apply(lambda x: x['metadata']['id'], axis=1)
                return df
            time.sleep(self.time_interval)
AutoPrompt/estimator/estimator_llm.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils.llm_chain import ChainWrapper, get_chain_metadata
2
+ from pathlib import Path
3
+ from dataset.base_dataset import DatasetBase
4
+ import pandas as pd
5
+
6
class LLMEstimator:
    """
    A wrapper for an estimator using LLM
    """

    def __init__(self, opt):
        """
        Initialize a new instance of the LLMEstimator class.
        The chain itself is created lazily, on the first call to apply (or explicitly via init_chain).
        :param opt: The configuration file (EasyDict)
        """
        self.opt = opt
        self.chain = None
        self.mini_batch_size = opt.mini_batch_size
        self.mode = opt.mode
        self.num_workers = opt.num_workers
        if 'instruction' in opt.keys():
            self.cur_instruct = opt.instruction
        else:
            self.cur_instruct = None

    @staticmethod
    def generate_sample_text(sample_id: int, text: str) -> str:
        """
        Generate a sample text for the chain prompt
        :param sample_id: The sample id
        :param text: The text of the sample
        :return: The sample text for the prompt
        """
        return f"ID: {sample_id}; Sample: {text}\n"

    def calc_usage(self) -> float:
        """
        Calculate the usage of the estimator
        :return: The usage accumulated by the chain, or 0 if the chain was not initialized yet
        """
        if self.chain is None:
            # Nothing was invoked yet, so nothing was consumed
            return 0
        return self.chain.accumulate_usage

    def init_chain(self, label_schema: set[str]):
        """
        Initialize the chain
        :param label_schema: The label schema
        """
        chain_metadata = get_chain_metadata(Path(self.opt.prompt), retrieve_module=True)
        # Prompts that ship an update hook get their output schema restricted to the label schema
        if hasattr(chain_metadata['module'], 'update_classification_prediction_schema'):
            chain_metadata['json_schema'] = chain_metadata['module'].update_classification_prediction_schema(
                chain_metadata['json_schema'],
                label_schema
            )
        self.chain = ChainWrapper(self.opt.llm, self.opt.prompt, chain_metadata['json_schema'],
                                  chain_metadata['parser_func'])

    def _build_chain_input(self, samples: list) -> dict:
        """
        Build the chain input of a single mini batch; batch_size is the real number of samples in it
        """
        return {'batch_size': len(samples), 'task_instruction': self.cur_instruct,
                'samples': ''.join(samples)}

    def apply_dataframe(self, record: pd.DataFrame):
        """
        Apply the estimator on a dataframe. The column named self.mode is overwritten in place:
        every row starts as 'Discarded' and is replaced by the value returned by the chain.
        :param record: The records; the index labels are used as the sample ids in the prompt
        :return: The same dataframe, after the update
        """
        record[self.mode] = 'Discarded'
        # Split by position (not by index label), so that mini batches are full
        # even when the index does not start at 0 or is not contiguous
        mini_batch_inputs = []
        pending = []
        for sample_id, row in record.iterrows():
            pending.append(self.generate_sample_text(sample_id, row['text']))
            if len(pending) == self.mini_batch_size:
                mini_batch_inputs.append(self._build_chain_input(pending))
                pending = []
        if pending:
            # The last, partial, mini batch
            mini_batch_inputs.append(self._build_chain_input(pending))

        all_results = self.chain.batch_invoke(mini_batch_inputs, self.num_workers)
        union_results = [element for sublist in all_results for element in sublist['results']]
        for res in union_results:
            # Skip ids that are not part of the input, otherwise .loc would append a new row
            if res['id'] in record.index:
                record.loc[res['id'], self.mode] = res['prediction']
        return record

    def apply(self, dataset: DatasetBase, idx: int, leq: bool = False):
        """
        Apply the estimator on the batches up to idx (includes), it then updates the annotation field
        if self.mode is 'annotation', otherwise it update the prediction field.
        :param dataset: The dataset
        :param idx: The current batch index
        :param leq: If True, apply on all the batches up to idx (includes), otherwise apply only on idx
        :return: The updated records
        """
        if self.chain is None:
            self.init_chain(dataset.label_schema)
        if leq:
            batch_records = dataset.get_leq(idx)
        else:
            batch_records = dataset[idx]
        return self.apply_dataframe(batch_records)
AutoPrompt/estimator/estimator_llm_batch.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from estimator.estimator_llm import LLMEstimator
2
+ from dataset.base_dataset import DatasetBase
3
+ import pandas as pd
4
+
5
+
6
class LLMBatchEstimator:
    """
    A wrapper for an estimator using aggregation of multiple LLMs estimators
    """

    def __init__(self, opt):
        """
        Initialize a new instance of the LLMBatchEstimator class.
        One LLMEstimator is created per instruction, all sharing the same estimator configuration.
        :param opt: The configuration file (EasyDict)
        """
        self.llm_estimators = []
        for instruction in opt.instructions:
            estimator = LLMEstimator(opt.estimator_config)
            estimator.cur_instruct = instruction
            self.llm_estimators.append(estimator)
        self.mode = opt.estimator_config.mode
        self.aggregation_mode = opt.aggregation_mode

    def calc_usage(self) -> float:
        """
        Calculate the usage of the estimator
        :return: The sum of the usage of all the underlying estimators
        """
        return sum(estimator.calc_usage() for estimator in self.llm_estimators)

    def get_aggregation_function(self):
        """
        Return the function that reduces the list of per-estimator results of one sample to a single value.
        The results are strings (see apply), so 'max', 'min' and 'median' compare them as strings.
        :return: A callable that gets the list of results of a sample
        :raises Exception: If the aggregation mode is unknown
        """
        aggregations = {
            'max': lambda record: max(record),
            'min': lambda record: min(record),
            # The results are strings, convert them before averaging
            'mean': lambda record: sum(float(t) for t in record) / len(record),
            'median': lambda record: sorted(record)[len(record) // 2],
            # Iterating the list (not a set) makes the tie-break deterministic: first seen wins
            'majority': lambda record: max(record, key=record.count),
            'exist': lambda record: 'Yes' if any(t == 'Yes' for t in record) else 'No',
            'all': lambda record: 'Yes' if all(t == 'Yes' for t in record) else 'No',
        }
        if self.aggregation_mode not in aggregations:
            raise Exception(f'Unknown aggregation class {self.aggregation_mode}')
        return aggregations[self.aggregation_mode]

    def apply(self, dataset: DatasetBase, idx: int, leq: bool = False):
        """
        Apply the estimator on the batches up to idx (includes), it then updates the annotation field
        if self.mode is 'annotation', otherwise it update the prediction field.
        :param dataset: The dataset
        :param idx: The current batch index
        :param leq: If True, apply on all the batches up to idx (includes), otherwise apply only on idx
        :return: The records of the first estimator, with the self.mode column replaced by the aggregated result
        """
        update_datasets = [estimator.apply(dataset, idx, leq) for estimator in self.llm_estimators]
        res_dataset = update_datasets[0]
        if res_dataset.empty:
            return res_dataset
        sample_ids = list(res_dataset['id'])
        # One list of results per sample. It is built also for a single estimator,
        # so the aggregation never runs over the characters of a plain string.
        votes = [[str(value)] for value in res_dataset[self.mode]]
        for df in update_datasets[1:]:
            # Match the results by the 'id' column; a sample missing from df contributes 'nan'
            value_by_id = dict(zip(df['id'], df[self.mode]))
            for sample_votes, sample_id in zip(votes, sample_ids):
                sample_votes.append(str(value_by_id.get(sample_id, float('nan'))))
        aggregate = self.get_aggregation_function()
        # Assigning a plain list is positional, so it does not depend on the dataframe index
        res_dataset[self.mode] = [aggregate(sample_votes) for sample_votes in votes]
        return res_dataset
AutoPrompt/eval/eval_utils.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from estimator.estimator_llm import LLMEstimator
2
+
3
+
4
def set_function_from_iterrow(func):
    """
    Lift a per-row scoring function to a function over a whole dataframe
    :param func: A callable that gets a single row and returns its score
    :return: A callable that writes the scores into the 'score' column of the
             dataframe it receives and returns that same dataframe
    """
    def score_dataset(dataset):
        row_scores = dataset.apply(func, axis=1)
        dataset['score'] = row_scores
        return dataset

    return score_dataset
10
+
11
+
12
def set_ranking_function(params):
    """
    Build a scoring function that ranks the model predictions with an LLM estimator
    :param params: The estimator configuration (EasyDict), it also holds the label_schema of the ranker
    :return: A callable that gets the dataset dataframe, writes the integer rank of each sample
             into its 'score' column and returns it
    """
    evaluator = LLMEstimator(params)
    evaluator.init_chain(params.label_schema)
    # The estimator writes its output into the column named by its mode
    evaluator.mode = 'score'

    def wrapper(dataset):
        # Work on a copy, the 'text' column is replaced by the (input, prediction) pair that is ranked
        generation_dataset = dataset.copy()
        generation_dataset['text'] = '###User input:\n' + generation_dataset['text'] + '\n####model prediction:\n' + generation_dataset['prediction']

        generation_dataset = evaluator.apply_dataframe(generation_dataset)
        generation_dataset['score'] = generation_dataset['score'].astype(int)
        # Item assignment (not attribute assignment) so that the column is created if it is missing
        dataset['score'] = generation_dataset['score']
        return dataset
    return wrapper
AutoPrompt/eval/evaluator.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.metrics import confusion_matrix
4
+ import eval.eval_utils as utils
5
+
6
class Eval:
    """
    The Eval class is responsible to calculate the score and the large errors
    """

    def __init__(self, config, analyzer=None, label_schema=None):
        """
        Initialize a new instance of the Eval class.
        :param config: The configuration file (EasyDict)
        :analyzer (optional): A chain that analyze the errors
        :label_schema (optional): The label schema
        """
        self.score_function_name = config.function_name
        self.score_func = self.get_eval_function(config)
        self.num_errors = config.num_large_errors
        self.error_threshold = config.error_threshold
        self.dataset = None
        self.mean_score = None
        self.label_schema = label_schema
        self.errors = None
        self.history = []
        self.analyzer = analyzer

    @staticmethod
    def get_eval_function(config: dict):
        """
        Returns the eval function
        :param config: The eval configuration
        :return: A function that gets the dataset dataframe and fills its 'score' column
        :raises NotImplementedError: If the function name is neither 'accuracy' nor 'ranking'
        """
        if config.function_name == 'accuracy':
            return utils.set_function_from_iterrow(lambda record: record['annotation'] == record['prediction'])
        elif config.function_name == 'ranking':
            return utils.set_ranking_function(config.function_params)
        else:
            raise NotImplementedError("Eval function not implemented")

    def eval_score(self) -> float:
        """
        Calculate the score on each row and return the mean score.
        :return: The mean score
        """
        # filter out the discarded samples
        valid_rows = (self.dataset['prediction'] != 'Discarded') & (self.dataset['annotation'] != 'Discarded')
        # Explicit copy: the score function adds a column, which must not target a slice of the original
        self.dataset = self.dataset[valid_rows].copy()
        self.dataset = self.score_func(self.dataset)
        self.mean_score = self.dataset['score'].mean()
        return self.mean_score

    def get_max_score(self, warmup=0):
        """
        Return the maximum 'mean score' and the epoch index of the maximum score. The search covers
        the history epochs starting from warmup and EXCLUDING the last epoch, so the result can be
        compared against the last epoch.
        :param warmup: The index of the first epoch that is considered
        :return: The epoch index of the maximum score, and the maximum score
        """
        max_idx = int(np.argmax([epoch['score'] for epoch in self.history[warmup:-1]]))
        max_idx += warmup
        return max_idx, self.history[max_idx]['score']

    def large_error_to_str(self, error_df: pd.DataFrame, num_large_errors_per_label: int) -> str:
        """
        Return a string that contains the large errors
        :param error_df: A dataframe contains all the mislabeled samples
        :param num_large_errors_per_label: The (maximum) number of large errors per label
        :return: A string that contains the large errors that is used in the meta-prompt
        """
        required_columns = ['annotation', 'text', 'score', 'prediction']
        label_schema = error_df['annotation'].unique()
        # The name must match the one used by get_eval_function ('ranking')
        gt_name = 'Rank' if self.score_function_name == 'ranking' else 'GT'
        error_res_df_list = []
        for label in label_schema:
            cur_df = error_df[error_df['annotation'] == label]
            # Fixed seed: a reproducible random subset of the errors of each label
            cur_df = cur_df.sample(frac=1.0, random_state=42)[:num_large_errors_per_label]
            error_res_df_list.append(cur_df[required_columns])
        if len(error_res_df_list) == 0:
            return ''
        error_res_df = pd.concat(error_res_df_list, ignore_index=True)
        # Shuffle again, so the samples are not grouped by label in the prompt
        error_res_df = error_res_df.sample(frac=1.0, random_state=42)
        return ''.join(
            f"Sample: {row['text']}\nPrediction: {row['prediction']}, {gt_name}: {row['annotation']}\n#\n"
            for _, row in error_res_df.iterrows()
        )

    def sample_to_text(self, sample: dict, num_errors_per_label: int = 0, is_score: bool = True) -> str:
        """
        Return a string that organize the information of from the step run for the meta-prompt
        :param sample: The eval information for specific step
        :param num_errors_per_label: The max number of large errors per class that will appear in the meta-prompt
        :param is_score: If True, add the score information to the meta-prompt
        :return: A string that contains the information of the step run
        """
        if is_score:
            return f"####\n##Prompt Score: {sample['score']:.2f}\n##Prompt:\n{sample['prompt']}\n#################\n"
        else:
            return f"####\n##Prompt:\n{sample['prompt']}\n{self.large_error_to_str(sample['errors'], num_errors_per_label)}####\n "

    def add_history(self, prompt: str, task_description: str):
        """
        Run the error analysis of the current step and add the step information to the history
        :param prompt: The current prompt
        :param task_description: The task description
        """
        conf_matrix = None
        large_error_to_str = self.large_error_to_str(self.errors, self.num_errors)
        prompt_input = {'task_description': task_description, 'accuracy': self.mean_score, 'prompt': prompt,
                        'failure_cases': large_error_to_str}
        if self.score_function_name == 'accuracy':
            conf_matrix = confusion_matrix(self.dataset['annotation'],
                                           self.dataset['prediction'], labels=self.label_schema)
            conf_text = f"Confusion matrix columns:{self.label_schema} the matrix data:"
            for i, row in enumerate(conf_matrix):
                conf_text += f"\n{self.label_schema[i]}: {row}"
            prompt_input['confusion_matrix'] = conf_text
        elif self.score_function_name == 'ranking':
            prompt_input['labels'] = self.label_schema
        analysis = self.analyzer.invoke(prompt_input)

        self.history.append({'prompt': prompt, 'score': self.mean_score,
                             'errors': self.errors, 'confusion_matrix': conf_matrix, 'analysis': analysis['text']})

    def extract_errors(self) -> pd.DataFrame:
        """
        Extract the errors from the dataset
        :return: The records with a score below the error threshold, sorted by ascending score
        """
        df = self.dataset
        # sort_values returns a new dataframe, keep it
        self.errors = df[df['score'] < self.error_threshold].sort_values(by=['score'])
        return self.errors

    def extract_correct(self) -> pd.DataFrame:
        """
        Extract the correct samples from the dataset
        :return: The records with a score strictly above the error threshold
        """
        # NOTE(review): records with score == error_threshold are returned neither here
        # nor by extract_errors - confirm that this is intended
        df = self.dataset
        return df[df['score'] > self.error_threshold]

    def extract_boundary_predictions(self) -> pd.DataFrame:
        """
        Extract boundary samples on which the model is uncertain
        Not implemented yet, currently returns None
        :return: records that contains boundary samples
        """
        pass
AutoPrompt/optimization_pipeline.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+ from eval.evaluator import Eval
4
+ from dataset.base_dataset import DatasetBase
5
+ from utils.llm_chain import MetaChain
6
+ from estimator import give_estimator
7
+ from pathlib import Path
8
+ import pickle
9
+ import os
10
+ import json
11
+ import logging
12
+ import wandb
13
+
14
+
15
+ class OptimizationPipeline:
16
+ """
17
+ The main pipeline for optimization. The pipeline is composed of 4 main components:
18
+ 1. dataset - The dataset handle the data including the annotation and the prediction
19
+ 2. annotator - The annotator is responsible generate the GT
20
+ 3. predictor - The predictor is responsible to generate the prediction
21
+ 4. eval - The eval is responsible to calculate the score and the large errors
22
+ """
23
+
24
+ def __init__(self, config, task_description: str = None, initial_prompt: str = None, output_path: str = ''):
25
+ """
26
+ Initialize a new instance of the ClassName class.
27
+ :param config: The configuration file (EasyDict)
28
+ :param task_description: Describe the task that needed to be solved
29
+ :param initial_prompt: Provide an initial prompt to solve the task
30
+ :param output_path: The output dir to save dump, by default the dumps are not saved
31
+ """
32
+
33
+ if config.use_wandb: # In case of using W&B
34
+ wandb.login()
35
+ self.wandb_run = wandb.init(
36
+ project="AutoGPT",
37
+ config=config,
38
+ )
39
+ if output_path == '':
40
+ self.output_path = None
41
+ else:
42
+ if not os.path.isdir(output_path):
43
+ os.makedirs(output_path)
44
+ self.output_path = Path(output_path)
45
+ logging.basicConfig(filename=self.output_path / 'info.log', level=logging.DEBUG,
46
+ format='%(asctime)s - %(levelname)s - %(message)s', force=True)
47
+
48
+ self.dataset = None
49
+ self.config = config
50
+ self.meta_chain = MetaChain(config)
51
+ self.initialize_dataset()
52
+
53
+ self.task_description = task_description
54
+ self.cur_prompt = initial_prompt
55
+
56
+ self.predictor = give_estimator(config.predictor)
57
+ self.annotator = give_estimator(config.annotator)
58
+ self.eval = Eval(config.eval, self.meta_chain.error_analysis, self.dataset.label_schema)
59
+ self.batch_id = 0
60
+ self.patient = 0
61
+
62
+ @staticmethod
63
+ def log_and_print(message):
64
+ print(message)
65
+ logging.info(message)
66
+
67
+ def initialize_dataset(self):
68
+ """
69
+ Initialize the dataset: Either empty dataset or loading an existing dataset
70
+ """
71
+ logging.info('Initialize dataset')
72
+ self.dataset = DatasetBase(self.config.dataset)
73
+ if 'initial_dataset' in self.config.dataset.keys():
74
+ logging.info(f'Load initial dataset from {self.config.dataset.initial_dataset}')
75
+ self.dataset.load_dataset(self.config.dataset.initial_dataset)
76
+
77
+ def calc_usage(self):
78
+ """
79
+ Calculate the usage of the optimization process (either $ in case of openAI or #tokens the other cases)
80
+ """
81
+ total_usage = 0
82
+ total_usage += self.meta_chain.calc_usage()
83
+ total_usage += self.annotator.calc_usage()
84
+ total_usage += self.predictor.calc_usage()
85
+ return total_usage
86
+
87
+ def extract_best_prompt(self):
88
+ sorted_history = sorted(
89
+ self.eval.history[min(self.config.meta_prompts.warmup - 1, len(self.eval.history) - 1):],
90
+ key=lambda x: x['score'],
91
+ reverse=False)
92
+ return {'prompt': sorted_history[-1]['prompt'], 'score': sorted_history[-1]['score']}
93
+
94
+ def run_step_prompt(self):
95
+ """
96
+ Run the meta-prompts and get new prompt suggestion, estimated prompt score and a set of challenging samples
97
+ for the new prompts
98
+ """
99
+ step_num = len(self.eval.history)
100
+ if (step_num < self.config.meta_prompts.warmup) or (step_num % 3) > 0:
101
+ last_history = self.eval.history[-self.config.meta_prompts.history_length:]
102
+ else:
103
+ sorted_history = sorted(self.eval.history[self.config.meta_prompts.warmup - 1:], key=lambda x: x['score'],
104
+ reverse=False)
105
+ last_history = sorted_history[-self.config.meta_prompts.history_length:]
106
+ history_prompt = '\n'.join([self.eval.sample_to_text(sample,
107
+ num_errors_per_label=self.config.meta_prompts.num_err_prompt,
108
+ is_score=True) for sample in last_history])
109
+ prompt_input = {"history": history_prompt, "task_description": self.task_description,
110
+ 'error_analysis': last_history[-1]['analysis']}
111
+ if 'label_schema' in self.config.dataset.keys():
112
+ prompt_input["labels"] = json.dumps(self.config.dataset.label_schema)
113
+ prompt_suggestion = self.meta_chain.step_prompt_chain.invoke(prompt_input)
114
+ self.log_and_print(f'Previous prompt score:\n{self.eval.mean_score}\n#########\n')
115
+ self.log_and_print(f'Get new prompt:\n{prompt_suggestion["prompt"]}')
116
+ self.batch_id += 1
117
+ if len(self.dataset) < self.config.dataset.max_samples:
118
+ batch_input = {"num_samples": self.config.meta_prompts.samples_generation_batch,
119
+ "task_description": self.task_description,
120
+ "prompt": prompt_suggestion['prompt']}
121
+ batch_inputs = self.generate_samples_batch(batch_input, self.config.meta_prompts.num_generated_samples,
122
+ self.config.meta_prompts.samples_generation_batch)
123
+
124
+ if sum([len(t['errors']) for t in last_history]) > 0:
125
+ history_samples = '\n'.join([self.eval.sample_to_text(sample,
126
+ num_errors_per_label=self.config.meta_prompts.num_err_samples,
127
+ is_score=False) for sample in last_history])
128
+ for batch in batch_inputs:
129
+ extra_samples = self.dataset.sample_records()
130
+ extra_samples_text = DatasetBase.samples_to_text(extra_samples)
131
+ batch['history'] = history_samples
132
+ batch['extra_samples'] = extra_samples_text
133
+ else:
134
+ for batch in batch_inputs:
135
+ extra_samples = self.dataset.sample_records()
136
+ extra_samples_text = DatasetBase.samples_to_text(extra_samples)
137
+ batch['history'] = 'No previous errors information'
138
+ batch['extra_samples'] = extra_samples_text
139
+
140
+ samples_batches = self.meta_chain.step_samples.batch_invoke(batch_inputs,
141
+ self.config.meta_prompts.num_workers)
142
+ new_samples = [element for sublist in samples_batches for element in sublist['samples']]
143
+ new_samples = self.dataset.remove_duplicates(new_samples)
144
+ self.dataset.add(new_samples, self.batch_id)
145
+ logging.info('Get new samples')
146
+ self.cur_prompt = prompt_suggestion['prompt']
147
+
148
def stop_criteria(self):
    """
    Decide whether the optimization loop should terminate.

    Returns True when either:
    1. A positive usage budget is configured and the usage so far exceeds it
    2. The patience counter exceeds the configured patience

    While the history is not longer than the warmup period the patience counter
    is reset to zero and False is returned.
    """
    stop_cfg = self.config.stop_criteria
    warmup = self.config.meta_prompts.warmup

    # A non-positive max_usage disables the budget check (calc_usage is not called then).
    usage_budget = stop_cfg.max_usage
    if usage_budget > 0 and self.calc_usage() > usage_budget:
        return True

    history = self.eval.history
    if len(history) <= warmup:
        self.patient = 0
        return False

    _, best_score = self.eval.get_max_score(warmup - 1)
    gap = best_score - history[-1]['score']
    # NOTE(review): if get_max_score also considers the latest history entry, the gap is
    # never negative and the counter grows on every post-warmup step - confirm intent.
    self.patient = self.patient + 1 if gap > -stop_cfg.min_delta else 0
    return self.patient > stop_cfg.patience
167
+
168
@staticmethod
def generate_samples_batch(batch_input, num_samples, batch_size):
    """
    Split a sample-generation request into a list of per-batch input dicts.

    Every batch is a shallow copy of batch_input. When num_samples is not a
    multiple of batch_size, one extra batch is appended whose 'num_samples'
    entry is overwritten with the leftover count.
    """
    full_batches, leftover = divmod(num_samples, batch_size)
    batches = [batch_input.copy() for _ in range(full_batches)]
    if leftover > 0:
        tail = batch_input.copy()
        tail['num_samples'] = leftover
        batches.append(tail)
    return batches
180
+
181
def generate_initial_samples(self):
    """
    Populate the dataset with generated samples (used when the initial dataset is empty).

    Builds the batched requests for the initial meta-chain, invokes the chain,
    flattens the returned 'samples' lists, removes duplicates through the dataset
    and stores the result under batch id 0.
    """
    meta_cfg = self.config.meta_prompts
    request_template = {"num_samples": meta_cfg.samples_generation_batch,
                        "task_description": self.task_description,
                        "instruction": self.cur_prompt}
    requests = self.generate_samples_batch(request_template, meta_cfg.num_initialize_samples,
                                           meta_cfg.samples_generation_batch)

    responses = self.meta_chain.initial_chain.batch_invoke(requests, meta_cfg.num_workers)
    generated = []
    for response in responses:
        generated.extend(response['samples'])
    unique_samples = self.dataset.remove_duplicates(generated)
    self.dataset.add(unique_samples, 0)
195
+
196
def save_state(self):
    """
    Save the process state

    Writes the dataset to '<output_path>/dataset.csv' and pickles the optimization
    state (history, batch id, current prompt, task description, patience counter)
    to '<output_path>/history.pkl'. Does nothing when no output path is configured.
    """
    if self.output_path is None:
        return
    logging.info('Save state')
    self.dataset.save_dataset(self.output_path / 'dataset.csv')
    state = {'history': self.eval.history, 'batch_id': self.batch_id,
             'prompt': self.cur_prompt, 'task_description': self.task_description,
             'patient': self.patient}
    # Use a context manager so the file is flushed and closed deterministically,
    # instead of leaving the handle open until garbage collection.
    with open(self.output_path / 'history.pkl', 'wb') as state_file:
        pickle.dump(state, state_file)
208
+
209
def load_state(self, path: str):
    """
    Load a previously saved state

    :param path: Directory that may contain 'dataset.csv' and/or 'history.pkl'.
                 Each file is loaded only if it exists; missing files are skipped silently.
    """
    path = Path(path)
    dataset_file = path / 'dataset.csv'
    if dataset_file.is_file():
        self.dataset.load_dataset(dataset_file)
    history_file = path / 'history.pkl'
    if history_file.is_file():
        # NOTE(security): unpickling can execute arbitrary code - only load state
        # files that were produced by a trusted run of this pipeline.
        # The context manager guarantees the file handle is closed after reading.
        with open(history_file, 'rb') as state_file:
            state = pickle.load(state_file)
        self.eval.history = state['history']
        self.batch_id = state['batch_id']
        self.cur_prompt = state['prompt']
        self.task_description = state['task_description']
        self.patient = state['patient']
223
+
224
def step(self, current_iter, total_iter):
    """
    Run a single iteration of the main optimization process.

    Order of operations: generate initial samples if the dataset has no records,
    annotate the current batch, run the predictor with the current prompt, evaluate
    and record the result in the history, then either stop or produce the next prompt.

    :param current_iter: Index of this iteration within the current run
    :param total_iter: Number of iterations planned for the current run
    :return: True if the stop criteria was reached, False otherwise
    """
    self.log_and_print(f'Starting step {self.batch_id}')
    if len(self.dataset.records) == 0:
        self.log_and_print('Dataset is empty generating initial samples')
        self.generate_initial_samples()
    if self.config.use_wandb:
        # Log the current prompt together with a small random sample of the data seen so far
        batch_frame = self.dataset.get_leq(self.batch_id)
        preview = batch_frame.sample(n=min(10, len(batch_frame)))[['text']]
        self.wandb_run.log(
            {"Prompt": wandb.Html(f"<p>{self.cur_prompt}</p>"), "Samples": wandb.Table(dataframe=preview)},
            step=self.batch_id)

    logging.info('Running annotator')
    annotated = self.annotator.apply(self.dataset, self.batch_id)
    self.dataset.update(annotated)

    # The predictor must use the prompt that is currently being evaluated
    self.predictor.cur_instruct = self.cur_prompt
    logging.info('Running Predictor')
    predicted = self.predictor.apply(self.dataset, self.batch_id, leq=True)
    self.dataset.update(predicted)

    self.eval.dataset = self.dataset.get_leq(self.batch_id)
    self.eval.eval_score()
    logging.info('Calculating Score')
    error_rows = self.eval.extract_errors()
    self.eval.add_history(self.cur_prompt, self.task_description)
    if self.config.use_wandb:
        # Visualize up to 6 error rows and up to 6 correct rows
        error_rows = error_rows.sample(n=min(6, len(error_rows)))
        correct_rows = self.eval.extract_correct()
        correct_rows = correct_rows.sample(n=min(6, len(correct_rows)))
        vis_data = pd.concat([error_rows, correct_rows])
        self.wandb_run.log({"score": self.eval.history[-1]['score'],
                            "prediction_result": wandb.Table(dataframe=vis_data),
                            'Total usage': self.calc_usage()}, step=self.batch_id)

    if self.stop_criteria():
        self.log_and_print('Stop criteria reached')
        return True

    # No new prompt is generated on the last planned iteration
    is_last_iteration = current_iter == total_iter - 1
    if not is_last_iteration:
        self.run_step_prompt()
    self.save_state()
    return False
268
+
269
def run_pipeline(self, num_steps: int):
    """
    Run the optimization pipeline until num_steps batches are reached or a step signals a stop.

    The number of iterations is num_steps minus the batch id at call time, and is fixed
    before the loop starts.

    :param num_steps: Total number of optimization steps
    :return: The result of extract_best_prompt()
    """
    remaining = num_steps - self.batch_id
    iteration = 0
    # Keep stepping until the budget is used up or a step reports that the stop criteria holds
    while iteration < remaining and not self.step(iteration, remaining):
        iteration += 1
    return self.extract_best_prompt()
AutoPrompt/prompts/meta_prompts_classification/error_analysis.prompt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Assistant is a large language model designed to provide a high quality analysis for every task.
2
+ You are given the following task description
3
+ {task_description}
4
+
5
+ Here are the prompt instructions that were given to the model:
6
+ {prompt}
7
+
8
+ The accuracy for this prompt is: {accuracy}
9
+ The confusion matrix for this prompt is: {confusion_matrix}
10
+ ##
11
+ Here is a list of failure cases for the given prompt:
12
+ ##Failure Cases:
13
+ {failure_cases}
14
+
15
+ ###
16
+ Note that the ground-truth labels are __absolutely correct__, but the prompts (task descriptions) may be incorrect and need modification.
17
+ Your task is to provide a brief analysis of the given prompt performance.
18
+ Guidelines:
19
+ 1. The analysis should contain only the following information:
20
+ - If there exists abnormal behavior in the confusion matrix, describe it.
21
+ - A summary of the common failure cases, try to cluster the failure cases into groups and describe each group.
22
+ 2. The total length of your analysis should be less than 200 tokens!
23
+ ###
24
+ Analysis:
AutoPrompt/prompts/meta_prompts_classification/initial.prompt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Assistant is a large language model designed to generate challenging samples for every task.
2
+ Generate a list of {num_samples} challenging samples for the following task.
3
+ ### Task description:
4
+ {task_description}
5
+ ### Task Instruction:
6
+ {instruction}
7
+ ###
8
+ ### Requirements for Challenging Samples:
9
+ 1. The generated samples must be challenging and diverse such that using the task instruction as a prompt will result in the wrong result.
10
+ 2. The number of generated samples from each class in the task instruction should be balanced (i.e. the same number of samples for each class)
11
+ 3. The generated samples should be distinct, realistic, and vary significantly to ensure diversity.
AutoPrompt/prompts/meta_prompts_classification/initial_verbose.prompt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ As an advanced language model you should create {num_samples} challenging and unique samples for the task outlined below.
2
+ These samples should be intricately designed to test the limits of the task's instructions, challenging yet relevant to the task description.
3
+
4
+ ### Task Description:
5
+ {task_description}
6
+
7
+ ### Task Instructions:
8
+ {instruction}
9
+
10
+ ### Requirements for Challenging Samples:
11
+ 1. Each sample must present a unique and intricate challenge.
12
+ 2. The complexity of the samples should be such that simply applying the given task instruction would likely lead to incorrect or incomplete results.
13
+ 3. The samples should cover a diverse range of scenarios within the scope of the task, avoiding repetition and predictability.
14
+ 4. Ensure that the samples, while challenging, remain realistic and pertinent to the task's context.
15
+
16
+ Generate the samples keeping these requirements in mind.
17
+ ###
AutoPrompt/prompts/meta_prompts_classification/output_schemes.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # A file containing the json schema for the output of all the LLM chains
2
+
3
+ initial_schema = step_samples_schema = {
4
+ "description": "A List of all results",
5
+ "properties": {
6
+ "samples": {
7
+ "description": "Each sample is a string containing the sample content, without any additional information like the Prediction or GT",
8
+ "items": {
9
+ "type": "string"
10
+ },
11
+ "title": "Samples",
12
+ "type": "array"
13
+ }
14
+ },
15
+ "required": [
16
+ "samples"
17
+ ],
18
+ "title": "Sample_List",
19
+ "type": "object"
20
+ }
21
+
22
+
23
+ classification_prediction_schema = {
24
+ "$defs": {
25
+ "Result": {
26
+ "description": "A single result",
27
+ "properties": {
28
+ "id": {
29
+ "description": "The sample id",
30
+ "title": "Id",
31
+ "type": "integer"
32
+ },
33
+ "prediction": {
34
+ "description": "The prediction of the sample.",
35
+ "title": "Prediction",
36
+ "type": "string"
37
+ }
38
+ },
39
+ "required": [
40
+ "id",
41
+ "prediction"
42
+ ],
43
+ "title": "Result",
44
+ "type": "object"
45
+ }
46
+ },
47
+ "description": "A List of task classification results",
48
+ "properties": {
49
+ "results": {
50
+ "description": "Each item contain the id and the prediction of the sample",
51
+ "items": {
52
+ "$ref": "#/$defs/Result"
53
+ },
54
+ "title": "Results",
55
+ "type": "array"
56
+ }
57
+ },
58
+ "required": [
59
+ "results"
60
+ ],
61
+ "title": "Results_List",
62
+ "type": "object"
63
+ }
64
+
65
+
66
+ step_prompt_schema = {
67
+ "description": "A prompt suggestion which expect to get high score, and the associated score prediction",
68
+ "properties": {
69
+ "prompt": {
70
+ "description": "The prompt prediction",
71
+ "title": "Prompt",
72
+ "type": "string"
73
+ },
74
+ "score": {
75
+ "description": "The score prediction",
76
+ "title": "Score",
77
+ "type": "number"
78
+ }
79
+ },
80
+ "required": [
81
+ "prompt",
82
+ "score"
83
+ ],
84
+ "title": "Suggested_Prompt",
85
+ "type": "object"
86
+ }
87
+
88
def update_classification_prediction_schema(label_schema:list)->dict:
    """
    Updates the classification prediction schema with the label schema from the yaml file
    :param label_schema: The list of allowed prediction labels
    :return: The module-level classification_prediction_schema, updated in place
    """
    options_prefix = 'The answer must be one of the following options:'
    prediction = classification_prediction_schema['$defs']['Result']['properties']['prediction']
    prediction['enum'] = label_schema
    # Drop the options text appended by a previous call, so repeated calls replace the
    # label list instead of accumulating one copy of the sentence per call.
    base_description = prediction['description'].split(options_prefix, 1)[0]
    prediction['description'] = base_description + '{} {} !!'.format(options_prefix, label_schema)
    return classification_prediction_schema
AutoPrompt/prompts/meta_prompts_classification/step_prompt.prompt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Assistant is a large language model designed to provide the best prompt for every task.
2
+ Below are a few suggested prompts for the task and their score, for the following task:
3
+ {task_description}
4
+
5
+ ## Examples
6
+ {history}
7
+ ######
8
+ This is the error analysis for the last prompt:
9
+ {error_analysis}
10
+ ######
11
+ Your task is to generate:
12
+ 1. A new prompt that is
13
+ -Different from all the prompts above
14
+ -Follows exactly the error analysis modification suggestions, and fix the prompt to prevent the failure cases.
15
+ -Has a higher score than all the prompts above.
16
+ 2. The predicted score of this prompt
17
+
18
+ You must adhere to the error analysis instructions, even if there seems to be a contradiction between these instructions and the task. The error analysis is tested on a ground truth and thus represents the exact intent of the task.
19
+ The generated prompt should be phrased as a clear classification instruction! it should not include any instructions and descriptions on the modification that should be done to the prompt.
20
+ Note that the previous prompt contains implicit assumptions about the intent of the task that might be incorrect. You should replace these assumptions with more accurate ones, using the scores of the previous prompts and the error analysis.
21
+ The resulting prompt should indicate that the task is a classification task with the following labels {labels}!
AutoPrompt/prompts/meta_prompts_classification/step_prompt_verbose.prompt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Assistant is a large language model designed to provide the best prompt for every task.
2
+ Below are a few suggested prompts for the task and their score, for the following task:
3
+ {task_description}. The exact intent of this task description should be determined by an external annotated dataset of challenging cases. The error analysis of this dataset is provided.
4
+ ## Examples
5
+ {history}
6
+ ######
7
+ For the last prompt we got the following error analysis:
8
+ {error_analysis}
9
+ ######
10
+ Your task is to generate:
11
+ 1. A new prompt that is
12
+ -Different from all the prompts above
13
+ -Follows exactly after the error analysis modification suggestions, and fix the prompt to prevent the failure cases.
14
+ -Has a higher score than all the prompts above.
15
+ 2. The predicted score of this prompt
16
+
17
+ Guidelines for the new prompt:
18
+ 1. The model running the prompt is given a 'scratchpad', which it can use to extract relevant information from the sample text and to reason its way to the correct decision
19
+ 2. The prompt is intended for a shallow LLM, which does not have access to previous failure cases or the analysis! It only has access to the newly generated prompt, which should be independent of the previous prompts.
20
+ 4. Lists can organize the information and help the prompt (for example list of rules and a list of samples), the lists should be short and accurate
21
+ 5. Note that the prompts and task descriptions may be inaccurate and need modification.
22
+ 6. Note that higher score means better prompt.
23
+ 7. The result prompt should indicate that the task is a classification class with the following labels {labels}!
24
+
25
+ Randomly sample a number between 0 and 2. If the result is zero, __change completely__ the generated prompt, including the instruction, the structure and the phrasing!
AutoPrompt/prompts/meta_prompts_classification/step_samples.prompt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Assistant is a large language model designed to generate challenging samples for every task.
2
+ Below are a few prompts that were built to answer the given task description, and their failure cases.
3
+ Task description:
4
+ {task_description}
5
+
6
+ ## Examples of common failures, each sample is followed by the model prediction and the GT (ground truth)
7
+ {history}
8
+ ######
9
+ Here are few unique samples derived from realistic scenarios for the task outlined above.
10
+ ## Realistic Samples
11
+ {extra_samples}
12
+ #####
13
+ This was the new proposed prompt:
14
+ ## Prompt
15
+ {prompt}
16
+
17
+ Your task is to generate {num_samples} samples by following these guidelines:
18
+ 1. The generated samples should be diverse
19
+ 2. They should preserve the style and the length of the given examples
20
+ 3. The samples must be challenging and hard to classify by the model. This can be achieved by:
21
+ 1. targeting the same weakness that the model failed on in the given examples
22
+ 2. targeting weaknesses that are different from the existing examples in the failure cases
23
+ 4. The number of generated samples from each class should be almost balanced (i.e. the same number of samples for each class)
24
+ 5. The generated samples should include only the sample content without additional information! (like the model prediction and the ground truth)
AutoPrompt/prompts/meta_prompts_completion/error_analysis.prompt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Assistant is a large language model designed to provide a high quality analysis for every task.
2
+ You are given the following task description
3
+ {task_description}
4
+
5
+ Here are the prompt instructions that were given to the model:
6
+ {prompt}
7
+
8
+ The accuracy for this prompt is: {accuracy}
9
+ The confusion matrix for this prompt is: {confusion_matrix}
10
+ ##
11
+ Here is a list of failure cases for the given prompt:
12
+ ##Failure Cases:
13
+ {failure_cases}
14
+
15
+ ###
16
+ Note that the ground-truth labels are __absolutely correct__, but the prompts (task descriptions) may be incorrect and need modification.
17
+ Your task is to provide a brief analysis of the given prompt performance.
18
+ Guidelines:
19
+ 1. The analysis should contain only the following information:
20
+ - If there exists abnormal behavior in the confusion matrix, describe it.
21
+ - A summary of the common failure cases, try to cluster the failure cases into groups and describe each group.
22
+ 2. The total length of your analysis should be less than 200 tokens!
23
+ ###
24
+ Analysis:
AutoPrompt/prompts/meta_prompts_completion/initial.prompt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Assistant is a large language model designed to generate challenging samples for every task.
2
+ Generate a list of {num_samples} challenging samples for the following task.
3
+ ### Task description:
4
+ {task_description}
5
+ ### Task Instruction:
6
+ {instruction}
7
+ ###
8
+ The generated samples should be challenging and diverse such that using the task instruction as a prompt will result in the wrong result.
9
+
10
+ Answer in the following format:
11
+ #### Sample 1:
12
+ <text>
13
+ #### Sample 2:
14
+ <text>
15
+ ############
16
+ Results:
AutoPrompt/prompts/meta_prompts_completion/output_schemes.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # A file containing the parser for the output of all the LLM chains
2
+ import re
3
+
4
def initial_parser(response: dict) -> dict:
    """
    Extract the generated samples from a raw LLM chain response.

    Each sample is the text that follows a '#### Sample <n>:' header, up to the next
    such header or the end of the text, with surrounding whitespace stripped.
    :param response: The response from the LLM chain; its 'text' entry is parsed
    :return: A dict with a single 'samples' key holding the list of sample strings
    """
    sample_pattern = r'(#### Sample \d+:)([\s\S]*?)(?=(#### Sample \d+:|$))'
    # Group 2 is the sample body; the header (group 1) and the lookahead (group 3) are not needed
    samples = [found.group(2).strip() for found in re.finditer(sample_pattern, response['text'])]
    return {'samples': samples}


step_samples_parser = initial_parser
20
+
21
def step_prompt_parser(response: dict) -> dict:
    """
    Parse the prompt suggestion and its predicted score from the LLM chain response
    :param response: The response from the LLM chain; its 'text' entry is parsed
    :return: A dict with the 'prompt' text and the 'score' as a float.
             If the expected '#### prompt:' / '#### score:' format is not found,
             an empty prompt with score 0.0 is returned.
    """
    # The score group matches exactly one decimal number ('1', '1.', '1.5' or '.5').
    # A looser character class such as [\d.]+ would also capture a sentence-ending
    # period ('0.85.') or strings like '1.2.3', which float() rejects with ValueError.
    pattern = re.compile(
        r"#### prompt:\n(?P<prompt>.*?)\n#### score:\n(?P<score>\d+(?:\.\d*)?|\.\d+)",
        re.DOTALL)
    match = pattern.search(response['text'])
    if match is None:
        return {'prompt': '', 'score': 0.0}
    return {
        'prompt': match.group('prompt'),
        'score': float(match.group('score'))
    }
AutoPrompt/prompts/meta_prompts_completion/step_prompt.prompt ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Assistant is a large language model designed to provide the best prompt for every task.
2
+ Below are a few suggested prompts for the task and their score, for the following task:
3
+ {task_description}
4
+
5
+ ## Examples
6
+ {history}
7
+ ######
8
+ This is the error analysis for the last prompt:
9
+ {error_analysis}
10
+ ######
11
+ Your task is to generate:
12
+ 1. A new prompt that is
13
+ -Different from all the prompts above
14
+ -Follows exactly the error analysis modification suggestions, and fix the prompt to prevent the failure cases.
15
+ -Has a higher score than all the prompts above.
16
+ 2. The predicted score of this prompt
17
+
18
+ You must adhere to the error analysis instructions, even if there seems to be a contradiction between these instructions and the task. The error analysis is tested on a ground truth and thus represents the exact intent of the task.
19
+ The generated prompt should be phrased as a clear classification instruction! it should not include any instructions and descriptions on the modification that should be done to the prompt.
20
+ Note that the previous prompt contains implicit assumptions about the intent of the task that might be incorrect. You should replace these assumptions with more accurate ones, using the scores of the previous prompts and the error analysis.
21
+ The resulting prompt should indicate that the task is a classification task with the following labels {labels}!
22
+
23
+ Answer in the following format:
24
+ #### prompt:
25
+ <prompt suggestion>
26
+ #### score:
27
+ <score>
28
+ ############
29
+ Results:
AutoPrompt/prompts/meta_prompts_completion/step_samples.prompt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Assistant is a large language model designed to generate challenging samples for every task.
2
+ Below are a few prompts and their failure cases, for the following task:
3
+ {task_description}
4
+
5
+ ## Examples of common failure
6
+ {history}
7
+ ######
8
+ Your task is to generate {num_samples} challenging and diverse samples that will confuse the model with the following prompt:
9
+ ## Prompt
10
+ {prompt}
11
+
12
+ Answer in the following format:
13
+ #### Sample 1:
14
+ <text>
15
+ #### Sample 2:
16
+ <text>
17
+ ############
18
+ Results:
AutoPrompt/prompts/meta_prompts_generation/error_analysis.prompt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Assistant is a large language model designed to provide a high quality analysis for every task.
2
+ You are given the following task description
3
+ {task_description}
4
+
5
+ Here are the prompt instructions that were given to the model:
6
+ {prompt}
7
+
8
+ An expert ranker evaluated the model's performance on the given task description
9
+ and ranked it according to the following scale: {labels}
10
+
11
+ The mean score for this prompt is: {accuracy}
12
+ ##
13
+ Here is a list of challenging cases for the given prompt and their rank:
14
+ ##Challenging Cases:
15
+ {failure_cases}
16
+
17
+ ###
18
+ Note that the ranker labels are __absolutely correct__, but the prompts (task descriptions) may be incorrect and need modification.
19
+ Your task is to provide a brief analysis of the given prompt performance.
20
+ Guidelines:
21
+ 1. The analysis should contain only the following information:
22
+ - A summary of the common mistakes of the prompt and the ways it can improve its generation; try to cluster the failure cases into groups and describe each group.
23
+ 2. The total length of your analysis should be less than 200 tokens!
24
+ ###
25
+ Analysis:
AutoPrompt/prompts/meta_prompts_generation/initial.prompt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ As an advanced language model you should create {num_samples} challenging and unique prompts for the task outlined below.
2
+ These samples should be intricately designed to test the limits of the task's instructions, challenging yet relevant to the task description.
3
+
4
+ The task description and instruction are phrased as a generative task. The resulting prompt samples should be inputs to the model.
5
+ The model will then be able to generate an example given the instructions and the prompt input.
6
+
7
+ ### Task Description:
8
+ {task_description}
9
+
10
+ ### Task Instructions:
11
+ {instruction}
12
+
13
+ ### Requirements for Challenging Samples:
14
+ 1. Each prompt must present a unique and intricate challenge.
15
+ 2. The prompts should cover a diverse range of scenarios within the scope of the task, avoiding repetition and predictability.
16
+ 3. Each prompt should contain only the prompt part, without generating also the results
17
+ 4. Each prompt should contain only the prompt part, without any mention of the task description or instructions!!
18
+
19
+ Generate the prompt samples keeping these requirements in mind.
20
+ ###
AutoPrompt/prompts/meta_prompts_generation/output_schemes.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # A file containing the json schema for the output of all the LLM chains
2
+
3
+ initial_schema = step_samples_schema = {
4
+ "description": "A List of all results",
5
+ "properties": {
6
+ "samples": {
7
+ "description": "Each sample is a string containing only the prompt sample content, without any additional information",
8
+ "items": {
9
+ "type": "string"
10
+ },
11
+ "title": "Samples",
12
+ "type": "array"
13
+ }
14
+ },
15
+ "required": [
16
+ "samples"
17
+ ],
18
+ "title": "Sample_List",
19
+ "type": "object"
20
+ }
21
+
22
+
23
+ classification_prediction_schema = {
24
+ "$defs": {
25
+ "Result": {
26
+ "description": "A single result",
27
+ "properties": {
28
+ "id": {
29
+ "description": "The sample id",
30
+ "title": "Id",
31
+ "type": "integer"
32
+ },
33
+ "prediction": {
34
+ "description": "The prediction of the sample.",
35
+ "title": "Prediction",
36
+ "type": "string"
37
+ }
38
+ },
39
+ "required": [
40
+ "id",
41
+ "prediction"
42
+ ],
43
+ "title": "Result",
44
+ "type": "object"
45
+ }
46
+ },
47
+ "description": "A List of task classification results",
48
+ "properties": {
49
+ "results": {
50
+ "description": "Each item contain the id and the prediction of the sample",
51
+ "items": {
52
+ "$ref": "#/$defs/Result"
53
+ },
54
+ "title": "Results",
55
+ "type": "array"
56
+ }
57
+ },
58
+ "required": [
59
+ "results"
60
+ ],
61
+ "title": "Results_List",
62
+ "type": "object"
63
+ }
64
+
65
+
66
+ step_prompt_schema = {
67
+ "description": "A prompt suggestion which expect to get high score, and the associated score prediction",
68
+ "properties": {
69
+ "prompt": {
70
+ "description": "The prompt prediction",
71
+ "title": "Prompt",
72
+ "type": "string"
73
+ },
74
+ "score": {
75
+ "description": "The score prediction",
76
+ "title": "Score",
77
+ "type": "number"
78
+ }
79
+ },
80
+ "required": [
81
+ "prompt",
82
+ "score"
83
+ ],
84
+ "title": "Suggested_Prompt",
85
+ "type": "object"
86
+ }
87
+
88
def update_classification_prediction_schema(label_schema:list)->dict:
    """
    Updates the classification prediction schema with the label schema from the yaml file
    :param label_schema: The list of allowed prediction labels
    :return: The module-level classification_prediction_schema, updated in place
    """
    options_prefix = 'The answer must be one of the following options:'
    prediction = classification_prediction_schema['$defs']['Result']['properties']['prediction']
    prediction['enum'] = label_schema
    # Drop the options text appended by a previous call, so repeated calls replace the
    # label list instead of accumulating one copy of the sentence per call.
    base_description = prediction['description'].split(options_prefix, 1)[0]
    prediction['description'] = base_description + '{} {} !!'.format(options_prefix, label_schema)
    return classification_prediction_schema
AutoPrompt/prompts/meta_prompts_generation/step_prompt.prompt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Assistant is a large language model designed to provide the best instructions for every task.
2
+ Below are a few suggested instructions for the task and score (mean of the rank), for the following task description:
3
+ {task_description}
4
+
5
+ ## Examples
6
+ {history}
7
+ ######
8
+ This is the analysis for the last instruction:
9
+ {error_analysis}
10
+ ######
11
+ Your task is to generate:
12
+ 1. A new instruction that is
13
+ -Different from all the instructions above
14
+ -Follows exactly the error analysis modification suggestions, and fix the instruction to improve the quality of the instruction.
15
+ -Has a higher score than all the instructions above.
16
+ 2. The predicted score of this instruction
17
+
18
+ You must adhere to the error analysis instructions, even if there seems to be a contradiction between these instructions and the task. The error analysis was evaluated by an expert ranker and thus represents the exact intent of the task.
19
+ The generated instruction should be phrased as a clear generation instruction! it should not include any instructions and descriptions on the modification that should be done to the instruction.
20
+ Note that the previous instruction contains implicit assumptions about the intent of the task that might be incorrect. You should replace these assumptions with more accurate ones, using the scores of the previous instructions and the error analysis.
AutoPrompt/prompts/meta_prompts_generation/step_samples.prompt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Assistant is a large language model designed to generate challenging samples for every task.
2
+ Below are a few prompts that were built to answer the given task description, and their failure cases.
3
+ Task description:
4
+ {task_description}
5
+
6
+ ## Examples, each sample is followed by the model prediction and the GT (ground truth)
7
+ {history}
8
+ ######
9
+ Here are few unique samples derived from realistic scenarios for the task outlined above.
10
+ ## Realistic Samples
11
+ {extra_samples}
12
+ #####
13
+ This was the new proposed prompt:
14
+ ## Prompt
15
+ {prompt}
16
+
17
+ Your task is to generate {num_samples} samples by following these guidelines:
18
+ 1. The generated samples should be diverse
19
+ 2. They should preserve the style and the length of the given examples
20
+ 3. The samples must be challenging and hard to classify by the model. This can be achieved by:
21
+ 1. targeting the same weakness that the model failed on in the given examples
22
+ 2. targeting weaknesses that are different from the existing examples in the failure cases
23
+ 4. The number of generated samples from each class should be almost balanced (i.e. the same number of samples for each class)
24
+ 5. The generated samples should include only the sample content without additional information! (like the model prediction and the ground truth)
AutoPrompt/prompts/meta_prompts_ranking/error_analysis.prompt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Assistant is a large language model designed to provide a high quality analysis for every task.
2
+ You are given the following task description
3
+ {task_description}
4
+
5
+ Here is the prompt instructions that was given to the model:
6
+ {prompt}
7
+
8
+ The accuracy for this prompt is: {accuracy}
9
+ The confusion matrix for this prompt is: {confusion_matrix}
10
+ ##
11
+ Here is a list of failure cases for the given prompt:
12
+ ##Failure Cases:
13
+ {failure_cases}
14
+
15
+ ###
16
+ Note that the ground-truth labels are __absolutely correct__, but the prompts (task descriptions) may be incorrect and need modification.
17
+ Your task is to provide a brief analysis of the given prompt performance.
18
+ Guidelines:
19
+ 1. The analysis should contain only the following information:
20
+ - If there exists abnormal behavior in the confusion matrix, describe it.
21
+ - A summary of the common failure cases, try to cluster the failure cases into groups and describe each group.
22
+ 3. The total length of your analysis should be less than 200 token!
23
+ ###
24
+ Analysis:
AutoPrompt/prompts/meta_prompts_ranking/initial.prompt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Assistant is a large language model designed to generate challenging samples for every task.
2
+ Generate a list of {num_samples} challenging samples for the following task.
3
+ ### Task description:
4
+ {task_description}
5
+ ### Task Instruction:
6
+ {instruction}
7
+ ###
8
+ ### Requirements for Challenging Samples:
9
+ 1. The generated samples must be challenging and diverse such that using the task instruction as a prompt will result in the wrong result.
10
+ 2. The generated samples must be only from the top two scores! With equal distribution between the two.
11
+ 3. The generated samples should be distinct, realistic, and vary significantly to ensure diversity.
12
+
13
+ If the task depends on both a context (or a user input) and generated content, then the sample content must include all the relevant parts.
14
+ -In this case the sample content structure should be as follows:
15
+ 1. First write the required context or user input.
16
+ 2. Then write the generated content of the model on this context or user input.
17
+ The style of the separation and the indication of the different parts should be different in each sample.
AutoPrompt/prompts/meta_prompts_ranking/initial_verbose.prompt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ As an advanced language model you should create {num_samples} challenging and unique samples for the task outlined below.
2
+ These samples should be intricately designed to test the limits of the task's instructions, challenging yet relevant to the task description.
3
+
4
+ ### Task Description:
5
+ {task_description}
6
+
7
+ ### Task Instructions:
8
+ {instruction}
9
+
10
+ ### Requirements for Challenging Samples:
11
+ 1. Each sample must present a unique and intricate challenge.
12
+ 2. The complexity of the samples should be such that simply applying the given task instruction would likely lead to incorrect or incomplete results.
13
+ 3. The samples should cover a diverse range of scenarios within the scope of the task, avoiding repetition and predictability.
14
+ 4. Ensure that the samples, while challenging, remain realistic and pertinent to the task's context.
15
+
16
+ Generate the samples keeping these requirements in mind.
17
+ ###
AutoPrompt/prompts/meta_prompts_ranking/output_schemes.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # A file containing the json schema for the output of all the LLM chains
2
+
3
# JSON schema describing an object with a single required "samples" array of
# strings. The same dict object is bound to both names below, so a change made
# through one name is visible through the other.
initial_schema = step_samples_schema = {
    "description": "A List of all results",
    "properties": {
        "samples": {
            "description": "Each sample is a string containing the sample content, without any additional information like the Prediction or GT",
            "items": {"type": "string"},
            "title": "Samples",
            "type": "array",
        },
    },
    "required": ["samples"],
    "title": "Sample_List",
    "type": "object",
}
21
+
22
+
23
# JSON schema describing an object with a required "results" array. Every item
# of that array follows the "Result" definition under "$defs": an integer "id"
# and a string "prediction", both required.
classification_prediction_schema = {
    "$defs": {
        "Result": {
            "description": "A single result",
            "properties": {
                "id": {
                    "description": "The sample id",
                    "title": "Id",
                    "type": "integer",
                },
                "prediction": {
                    "description": "The prediction of the sample.",
                    "title": "Prediction",
                    "type": "string",
                },
            },
            "required": ["id", "prediction"],
            "title": "Result",
            "type": "object",
        },
    },
    "description": "A List of task classification results",
    "properties": {
        "results": {
            "description": "Each item contain the id and the prediction of the sample",
            "items": {"$ref": "#/$defs/Result"},
            "title": "Results",
            "type": "array",
        },
    },
    "required": ["results"],
    "title": "Results_List",
    "type": "object",
}
64
+
65
+
66
# JSON schema describing an object with two required fields: a string "prompt"
# and a numeric "score".
step_prompt_schema = {
    "description": "A prompt suggestion which expect to get high score, and the associated score prediction",
    "properties": {
        "prompt": {
            "description": "The prompt prediction",
            "title": "Prompt",
            "type": "string",
        },
        "score": {
            "description": "The score prediction",
            "title": "Score",
            "type": "number",
        },
    },
    "required": ["prompt", "score"],
    "title": "Suggested_Prompt",
    "type": "object",
}
87
+
88
def update_classification_prediction_schema(label_schema: list) -> dict:
    """
    Restricts the prediction field of the classification prediction schema to the given labels.

    The module-level ``classification_prediction_schema`` is modified in place: the
    ``enum`` of the ``prediction`` property is set to the labels, and a sentence listing
    the allowed labels is appended to its description. Calling this function again
    replaces the previously appended sentence instead of stacking another one.
    :param label_schema: The list of allowed labels
    :return: The updated classification prediction schema (the same module-level dict)
    """
    options_prefix = 'The answer must be one of the following options:'
    prediction = classification_prediction_schema['$defs']['Result']['properties']['prediction']
    # Cut off any sentence appended by an earlier call; on the first call the
    # prefix is absent and the description is kept unchanged.
    base_description = prediction['description'].split(options_prefix, 1)[0]
    prediction['enum'] = label_schema
    prediction['description'] = base_description + '{} {} !!'.format(options_prefix, label_schema)
    return classification_prediction_schema