xqt committed on
Commit
5446331
•
1 Parent(s): fbc2b1b

Upload 3 files

Browse files
Files changed (3) hide show
  1. LlamaManager.py +228 -0
  2. app.py +165 -0
  3. requirements.txt +2 -0
LlamaManager.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import huggingface_hub
2
+ import re
3
+
4
class LlamaManager():
    """Generate synthetic Python-programming data with a hosted Llama 3.1 model.

    Every request asks the model for a list wrapped in ``[L]...[/L]`` whose
    items are each wrapped in ``[A]...[/A]``. The public ``auto_generate_*``
    methods build those prompts, call the inference endpoint and parse the
    tagged reply back into a plain Python list of strings.
    """

    # System prompt shared by every request issued by this class.
    _SYSTEM_PROMPT = (
        "You are a synthetic data generator. You must only answer questions as a list. "
        "Each item of the list should be enclosed in [A] and [/A] tags. "
        "The list should be enclosed in [L] and [/L] tags."
    )

    def __init__(self, llama_token=None, verbose=False):
        """Create the inference client.

        Args:
            llama_token: Hugging Face token forwarded to ``InferenceClient``.
            verbose: When true, progress messages are printed to stdout.
        """
        self.verbose = verbose

        if self.verbose:
            print("LlamaManager::__init__::Initializing LlamaManager")
        self.client = huggingface_hub.InferenceClient(
            "meta-llama/Meta-Llama-3.1-70B-Instruct",
            token=llama_token,
        )
        if self.verbose:
            print("LlamaManager::__init__::Initialized LlamaManager")

    def __get_items_between_tags(self, input_string, tag1, tag2):
        """Return every (non-greedy) substring found between ``tag1`` and ``tag2``.

        Both tags are inserted into a regular expression verbatim, so callers
        must pass them already escaped. ``re.DOTALL`` lets items span lines.
        """
        pattern = tag1 + "(.*?)" + tag2
        return re.findall(pattern, input_string, re.DOTALL)

    def __wrap_items(self, items):
        """Serialise ``items`` as consecutive ``[A]item[/A]`` tags."""
        return "".join(f"[A]{item}[/A]" for item in items)

    def __extract_list_items(self, out, caller, item_name):
        """Parse a tagged model reply into a list of item strings.

        The first ``[L]...[/L]`` span is used when one exists. If the reply
        contains no such pair at all (for example the opening tag is missing
        or the reply was cut off), the whole text is searched for ``[A]``
        items instead of raising ``IndexError``.

        Args:
            out: Raw reply text.
            caller: Method name used in the verbose log messages.
            item_name: Noun used in the "No ... found" log message.

        Returns:
            The list of items, or ``[]`` when nothing could be extracted.
        """
        lists = self.__get_items_between_tags(out, r"\[L\]", r"\[/L\]")
        if lists:
            list_body = lists[0]
            if not list_body:
                if self.verbose:
                    print(f"LlamaManager::{caller}::No content found")
                return []
        else:
            list_body = out

        items = self.__get_items_between_tags(list_body, r"\[A\]", r"\[/A\]")
        if not items:
            if self.verbose:
                print(f"LlamaManager::{caller}::No {item_name} found")
            return []
        return items

    def __complete(self, messages, max_tokens, seed, temperature, top_p, frequency_penalty):
        """Run one non-streaming chat completion and return the reply text."""
        response = self.client.chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            stream=False,
            seed=seed,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
        )
        # NOTE(review): the message content is presumably optional in the
        # client's response type; fall back to "" so parsing never sees None.
        return response.choices[0].message.content or ""

    def __preprocss_for_auto_generate_questions_categories(self, available_categories):
        """Turn the seed categories into the tagged assistant prefill.

        The misspelt name is kept so existing references keep working.
        """
        if self.verbose:
            print("LlamaManager::__preprocss_for_auto_generate_questions_categories::Preprocessing")
        return self.__wrap_items(available_categories)

    def __postprocess_for_auto_generate_questions_categories(self, out):
        """Extract the category names from a tagged reply (``[]`` if none)."""
        if self.verbose:
            print("LlamaManager::__postprocess_for_auto_generate_questions_categories::Postprocessing")
        return self.__extract_list_items(
            out, "__postprocess_for_auto_generate_questions_categories", "categories"
        )

    def auto_generate_questions_categories(
        self,
        count=20,
        available_categories=None,
        seed=123,
        temperature=1.0,
        top_p=0.9,
        frequency_penalty=0.0
    ):
        """Ask the model for ``count`` basic Python topics.

        Args:
            count: Number of topics requested in the prompt.
            available_categories: Topics used to prefill the assistant answer;
                defaults to ``["Variables"]``.
            seed, temperature, top_p, frequency_penalty: Sampling settings
                forwarded to the chat completion call.

        Returns:
            List of category strings parsed from the reply, possibly empty.
        """
        # None sentinel instead of a mutable list default.
        if available_categories is None:
            available_categories = ["Variables"]

        available_content_for_assistant = self.__preprocss_for_auto_generate_questions_categories(available_categories)
        if self.verbose:
            print("LlamaManager::auto_generate_questions_categories::Generating questions categories")

        message_content = [
            {"role": "system", "content": self._SYSTEM_PROMPT},
            {"role": "user", "content": f"Write me {count} basic topics for python programming"},
            {"role": "assistant", "content": f"[L]{available_content_for_assistant}"},
        ]

        reply = self.__complete(message_content, 1000, seed, temperature, top_p, frequency_penalty)

        # Close the list ourselves, as the sibling methods do, so a reply that
        # was cut off before "[/L]" still parses.
        categories = self.__postprocess_for_auto_generate_questions_categories(reply + "[/L]")
        if self.verbose:
            print("LlamaManager::auto_generate_questions_categories::Generated questions Categories")

        return categories

    def __postprocess_for_auto_generate_shots_for_category(self, out):
        """Extract the example questions ("shots") from a tagged reply."""
        if self.verbose:
            print("LlamaManager::__postprocess_for_auto_generate_shots_for_category::Postprocessing")
        return self.__extract_list_items(
            out, "__postprocess_for_auto_generate_shots_for_category", "questions"
        )

    def auto_generate_shots_for_category(
        self,
        count,
        category,
        seed=123,
        temperature=1.0,
        top_p=0.9,
        frequency_penalty=0.0
    ):
        """Ask the model for ``count`` example questions about ``category``.

        A fixed one-turn example (two "For Loop" questions) is included in the
        conversation to show the expected answer format.

        Returns:
            List of question strings parsed from the reply, possibly empty.
        """
        if self.verbose:
            print("LlamaManager::auto_generate_shots_for_category::Generating shots for category")

        # NOTE(review): the exact whitespace inside this few-shot answer could
        # not be recovered from the diff view; confirm against the original.
        example_answer = (
            "[L]\n"
            "- [A]Write a program that takes a positive integer as input and computes the sum of its digits using a for loop.[/A]\n"
            "- [A]Write a program that generates a spiral matrix of size NxN, where N is always an odd number. Fill the spiral matrix with consecutive prime numbers in a clockwise spiral pattern, starting from the center of the matrix.[/A]\n"
        )
        message_content = [
            {"role": "system", "content": self._SYSTEM_PROMPT},
            {"role": "user", "content": "Write me 2 programming questions on the topic of For Loop in Python. The question should be of medium and hard difficulty. The question should involve use of just one function"},
            {"role": "assistant", "content": example_answer},
            {"role": "user", "content": f"Write me {count} programming questions on the topic of {category} in Python. The question should be of medium and hard difficulty. The question should involve use of just one function"},
            {"role": "assistant", "content": "[L]"},
        ]

        reply = self.__complete(message_content, 1000, seed, temperature, top_p, frequency_penalty)

        shots = self.__postprocess_for_auto_generate_shots_for_category(reply + "[/L]")
        if self.verbose:
            print(f"LlamaManager::auto_generate_shots_for_category::Generated {count} shots for {category}")

        return shots

    def __preprocess_for_auto_generate_questions_from_shots(self, shots):
        """Turn the shots into the tagged assistant prefill."""
        if self.verbose:
            print("LlamaManager::__preprocess_for_auto_generate_questions_from_shots::Preprocessing")
        return self.__wrap_items(shots)

    def __postprocess_for_auto_generate_questions_from_shots(self, out):
        """Extract the generated questions from a tagged reply."""
        if self.verbose:
            print("LlamaManager::__postprocess_for_auto_generate_questions_from_shots::Postprocessing")
        return self.__extract_list_items(
            out, "__postprocess_for_auto_generate_questions_from_shots", "questions"
        )

    def __questions_messages(self, count, category, prefill_items):
        """Build the conversation whose assistant turn is prefilled with ``prefill_items``."""
        prefill = self.__preprocess_for_auto_generate_questions_from_shots(prefill_items)
        return [
            {"role": "system", "content": self._SYSTEM_PROMPT},
            {"role": "user", "content": f"Write me {count} python programming questions which uses {category.lower()}"},
            {"role": "assistant", "content": f"[L]{prefill}"},
        ]

    def auto_generate_questions_from_shots(
        self,
        count,
        category,
        shots,
        seed=123,
        temperature=1.0,
        top_p=0.9,
        frequency_penalty=0.0
    ):
        """Generate at least ``count`` questions about ``category``, seeded by ``shots``.

        The model is called repeatedly. After each call the parsed questions
        replace the assistant prefill and the token budget grows by 500. The
        loop stops once ``count`` questions were parsed, or when four
        consecutive calls yielded the same number of questions.

        Returns:
            The questions parsed from the last reply (may be fewer or more
            than ``count``).
        """
        message_content = self.__questions_messages(count, category, shots)
        if self.verbose:
            print("LlamaManager::auto_generate_questions_from_shots::Generating questions from shots")

        questions_per_call = []
        questions = []
        token_count = 1000
        while len(questions) < count:
            reply = self.__complete(message_content, token_count, seed, temperature, top_p, frequency_penalty)

            questions = self.__postprocess_for_auto_generate_questions_from_shots(reply + "[/L]")
            questions_per_call.append(len(questions))
            message_content = self.__questions_messages(count, category, questions)
            token_count += 500

            # Four identical counts in a row means no progress is being made.
            if len(questions_per_call) > 3 and len(set(questions_per_call[-4:])) == 1:
                if self.verbose:
                    print("LlamaManager::auto_generate_questions_from_shots::Generation could not be completed, stopping API calls")
                break

        if self.verbose:
            print("LlamaManager::auto_generate_questions_from_shots::Generated questions from shots")

        return questions
222
+
223
+
224
if __name__ == "__main__":
    # Manual smoke test: categories -> shots -> questions for the fourth category.
    # NOTE(review): "nope" looks like a placeholder token; supply a real one
    # before running this against the hosted endpoint.
    llama_manager = LlamaManager("nope", True)
    categories = llama_manager.auto_generate_questions_categories(20)
    # The model may return fewer items than requested; fail with a clear
    # message instead of an IndexError on categories[3].
    if len(categories) < 4:
        raise SystemExit(f"Expected at least 4 categories, got {len(categories)}")
    category = categories[3]
    shots = llama_manager.auto_generate_shots_for_category(2, category)
    questions = llama_manager.auto_generate_questions_from_shots(10, category, shots, temperature=0.5)
app.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio
2
+ import LlamaManager
3
+ import os
4
+ import huggingface_hub
5
+
6
# Hub API client used to post generated data as discussion comments.
HF_API = huggingface_hub.HfApi()
# Module-wide generator, created from the HF_KEY_2 environment variable with
# verbose logging switched on.
LLAMAMANAGER = LlamaManager.LlamaManager(
    llama_token=os.environ.get("HF_KEY_2"),
    verbose=True,
)
8
+
9
def store_generated_data(data):
    """Post ``data`` (stringified) as a comment on discussion 1 of the dataset repo.

    The comment is written to ``xqt/SyntheticMBPP2`` using the token read
    from the ``HF_BOT`` environment variable.
    """
    HF_API.comment_discussion(
        "xqt/SyntheticMBPP2",
        1,
        str(data),
        repo_type="dataset",
        token=os.environ.get("HF_BOT"),
    )
13
+
14
+
15
def authenticate(secret_textbox):
    """Replace the module-wide ``LLAMAMANAGER`` using a passcode or a user API key.

    If ``secret_textbox`` matches one of the colon-separated passcodes in the
    ``PASSWORD_LIST`` environment variable, the server-side ``HF_KEY`` token
    is used; otherwise the submitted text itself is treated as the API key.

    Args:
        secret_textbox: Text entered in the "API Key/Passcode" field.
    """
    # Without this declaration the assignment below would only bind a local
    # name and the generator used by the other handlers would never change.
    global LLAMAMANAGER

    # Nothing was entered: keep the current manager rather than building one
    # with an empty token.
    if not secret_textbox:
        return

    # PASSWORD_LIST may be unset; empty entries are dropped so that an empty
    # or malformed list can never match and unlock the server key.
    passcodes = [code for code in os.environ.get("PASSWORD_LIST", "").split(":") if code]
    if secret_textbox in passcodes:
        api_key = os.environ.get("HF_KEY")
    else:
        api_key = secret_textbox

    # NOTE(review): this global is shared by every session of the app, so one
    # visitor's key presumably replaces the client for all users — confirm
    # whether per-session state is wanted instead.
    LLAMAMANAGER = LlamaManager.LlamaManager(api_key, True)
25
+
26
+
27
def generate_categories(categories_count, seed, temperature, top_p, frequency_penalty):
    """Generate topic categories, log them, and return the refreshed dropdown.

    Args:
        categories_count: Number of categories to request.
        seed, temperature, top_p, frequency_penalty: Sampling settings passed
            through to the generator.

    Returns:
        A ``gradio.Dropdown`` listing the generated categories, with the first
        one preselected (or nothing selected when none were generated).
    """
    categories = LLAMAMANAGER.auto_generate_questions_categories(
        count=categories_count,
        seed=seed,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
    )
    store_generated_data({
        "type": "generate_categories",
        "categories": categories,
        "count": categories_count,
        "seed": seed,
        "temperature": temperature,
        "top_p": top_p,
        "frequency_penalty": frequency_penalty,
    })
    # The generator returns [] when the reply could not be parsed; avoid an
    # IndexError on categories[0] in that case.
    selected = categories[0] if categories else None
    return gradio.Dropdown(choices=categories, value=selected, label="Select Category", interactive=True)
46
+
47
+
48
def generate_shots(category, shots_count, seed, temperature, top_p, frequency_penalty):
    """Generate example questions for ``category``, log them, and return the table.

    Args:
        category: Category selected in the dropdown.
        shots_count: Number of example questions to request.
        seed, temperature, top_p, frequency_penalty: Sampling settings passed
            through to the generator.

    Returns:
        A single-column ``gradio.DataFrame`` with one shot per row.
    """
    # Keyword arguments on purpose: the generator's signature is
    # (count, category, ...), the reverse of this handler's parameter order,
    # so positional forwarding swapped the two values.
    shots = LLAMAMANAGER.auto_generate_shots_for_category(
        count=shots_count,
        category=category,
        seed=seed,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
    )
    # One row per shot for the single-column table.
    shots = [[shot] for shot in shots]
    store_generated_data({
        "type": "generate_shots",
        "category": category,
        "shots": shots,
        "count": shots_count,
        "seed": seed,
        "temperature": temperature,
        "top_p": top_p,
        "frequency_penalty": frequency_penalty,
    })
    return gradio.DataFrame(value=shots, type="array", label="Generated Shots", interactive=False, headers=None)
63
+
64
+
65
def generate_questions(questions_count, category, shots, seed, temperature, top_p, frequency_penalty):
    """Generate questions from the shots table, log them, and return the table.

    Args:
        questions_count: Number of questions to request.
        category: Category selected in the dropdown.
        shots: Value of the shots table; with ``type="array"`` this is
            expected to be a list of single-cell rows (plain strings are
            accepted as well).
        seed, temperature, top_p, frequency_penalty: Sampling settings passed
            through to the generator.

    Returns:
        A single-column ``gradio.DataFrame`` with one question per row.
    """
    # Unwrap the table rows: forwarding [[shot], ...] unchanged would embed
    # the list repr ("['...']") in the prompt instead of the shot text.
    shot_texts = []
    for row in shots or []:
        cell = row[0] if isinstance(row, (list, tuple)) and row else row
        if isinstance(cell, str) and cell.strip():
            shot_texts.append(cell)

    questions = LLAMAMANAGER.auto_generate_questions_from_shots(
        questions_count, category, shot_texts, seed, temperature, top_p, frequency_penalty
    )
    questions = [[question] for question in questions]
    store_generated_data({
        "type": "generate_questions",
        "questions": questions,
        "count": questions_count,
        "category": category,
        "shots": shots,
        "seed": seed,
        "temperature": temperature,
        "top_p": top_p,
        "frequency_penalty": frequency_penalty,
    })
    # This table holds questions, so label it accordingly.
    return gradio.DataFrame(value=questions, type="array", label="Generated Questions", interactive=False, headers=None)
81
+
82
+
83
def _advanced_settings():
    """Render one collapsed "Advanced Settings" panel inside the current Blocks context.

    Returns:
        Tuple ``(temperature, top_p, frequency_penalty, seed)`` of sliders.
    """
    with gradio.Accordion("Advanced Settings", open=False):
        with gradio.Row():
            with gradio.Column():
                temperature = gradio.Slider(minimum=0.1, maximum=2.0, step=0.01, value=1.0, label="Temperature", interactive=True)
                top_p = gradio.Slider(minimum=0.1, maximum=0.99, step=0.01, value=0.9, label="Top P", interactive=True)
            with gradio.Column():
                frequency_penalty = gradio.Slider(minimum=-2.0, maximum=2.0, step=0.01, value=0.0, label="Frequency Penalty", interactive=True)
                seed = gradio.Slider(minimum=0, maximum=1000, step=1, value=123, label="Seed", interactive=True)
    return temperature, top_p, frequency_penalty, seed


# NOTE(review): the nesting below was reconstructed from a diff view that had
# lost its indentation; confirm the layout against the original file.
with gradio.Blocks(fill_height=True) as base_app:
    gradio.Markdown("# Synthetic Python Programming Data Generation ⚙️")
    gradio.Markdown("# ❗️ Note: The data generated here by Llama3 and the settings used to generate it will be stored in the repository for future use.")
    gradio.Markdown("# ❗️ Feel free to use your own API key if the key here is rate limited. API Key is never stored in the repository.")
    gradio.Markdown("# ❗️ If you want to use a passcode, please text me.")

    # Step 0: optional authentication with a passcode or a personal API key.
    gradio.Markdown("# Step 0: Use your own API Key/Passcode")
    with gradio.Row():
        with gradio.Column():
            __secret_textbox = gradio.Textbox(label="API Key/Passcode", placeholder="Enter your API Key/Passcode here", type="password", interactive=True)
        with gradio.Column():
            __passcode_authenticate = gradio.Button("Authenticate", scale=2)

    # Step 1: generate the topic categories.
    gradio.Markdown("# Step 1: How many categories do you want to generate?")
    with gradio.Row(equal_height=True):
        with gradio.Column(scale=2):
            __categories_count = gradio.Slider(minimum=1, maximum=20, step=1, value=10, label="Number of Categories", interactive=True)
        with gradio.Column():
            __categories_generate = gradio.Button("Generate Categories", scale=2)
    (
        __categories_temperature,
        __categories_top_p,
        __categories_frequency_penalty,
        __categories_seed,
    ) = _advanced_settings()

    # Step 2: generate example questions ("shots") for one category.
    gradio.Markdown("# Step 2: Select a category to generate shots for and select the number of shots to generate")
    with gradio.Row():
        with gradio.Column(scale=2):
            __shots_category = gradio.Dropdown(choices=[], label="Select Category", interactive=True)
            __shots_count = gradio.Slider(minimum=2, maximum=5, step=1, value=2, label="Number of Shots", interactive=True)
        with gradio.Column():
            __shots_generate = gradio.Button("Generate Shots", scale=2)
    (
        __shots_temperature,
        __shots_top_p,
        __shots_frequency_penalty,
        __shots_seed,
    ) = _advanced_settings()
    __generated_shots = gradio.DataFrame(value=[], col_count=1, type="array", label="Generated Shots", interactive=False, headers=None)

    # Step 3: expand the shots into the final set of questions.
    gradio.Markdown("# Step 3: Generate Python Programming Questions for the generated shots")
    with gradio.Row():
        with gradio.Column(scale=2):
            __questions_count = gradio.Slider(minimum=1, maximum=30, step=1, value=10, label="Number of Questions", interactive=True)
        with gradio.Column():
            __questions_generate = gradio.Button("Generate Questions", scale=2)
    (
        __questions_temperature,
        __questions_top_p,
        __questions_frequency_penalty,
        __questions_seed,
    ) = _advanced_settings()
    # This table holds questions, so it is labelled accordingly.
    __generated_questions = gradio.DataFrame(value=[], col_count=1, type="array", label="Generated Questions", interactive=False, headers=None)

    # Event wiring: each button feeds its inputs to the matching handler.
    __passcode_authenticate.click(
        authenticate,
        inputs=[__secret_textbox],
        outputs=[],
    )

    __categories_generate.click(
        generate_categories,
        inputs=[__categories_count, __categories_seed, __categories_temperature, __categories_top_p, __categories_frequency_penalty],
        outputs=[__shots_category],
    )
    __shots_generate.click(
        generate_shots,
        inputs=[__shots_category, __shots_count, __shots_seed, __shots_temperature, __shots_top_p, __shots_frequency_penalty],
        outputs=[__generated_shots],
    )

    __questions_generate.click(
        generate_questions,
        inputs=[__questions_count, __shots_category, __generated_shots, __questions_seed, __questions_temperature, __questions_top_p, __questions_frequency_penalty],
        outputs=[__generated_questions],
    )
162
+
163
+
164
if __name__ == "__main__":
    # Start the Gradio server only when this file is run as a script.
    base_app.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio
2
+ huggingface_hub