Spaces:

xqt
/

Synthetic-Python-Programing-Data-Generator

Sleeping

App Files Community

xqt committed on Sep 20, 2024

Commit

5446331

verified ·

1 Parent(s): fbc2b1b

Upload 3 files

Browse files

Files changed (3) hide show

LlamaManager.py +228 -0
app.py +165 -0
requirements.txt +2 -0

LlamaManager.py ADDED Viewed

	@@ -0,0 +1,228 @@

+import huggingface_hub
+import re
+class LlamaManager():
+    def __init__(self, llama_token = None, verbose = False):
+        self.verbose = verbose
+        if self.verbose:
+            print("LlamaManager::__init__::Initializing LlamaManager")
+        self.client = huggingface_hub.InferenceClient(
+            "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            token=llama_token,
+        )
+        if self.verbose:
+            print("LlamaManager::__init__::Initialized LlamaManager")
+    def __get_items_between_tags(self, input_string, tag1, tag2):
+        pattern = r'' + tag1 + '(.*?)' + tag2 + ''
+        return re.findall(pattern, input_string, re.DOTALL)
+    def __preprocss_for_auto_generate_questions_categories(self, available_categories):
+        if self.verbose:
+            print("LlamaManager::__preprocss_for_auto_generate_questions_categories::Preprocessing")
+        out = ""
+        for available_category in available_categories:
+            out += f"[A]{available_category}[/A]"
+        return out
+    def __postprocess_for_auto_generate_questions_categories(self, out):
+        if self.verbose:
+            print("LlamaManager::__postprocess_for_auto_generate_questions_categories::Postprocessing")
+        out = self.__get_items_between_tags(out, r"\[L\]", r"\[/L\]")[0]
+        if not out:
+            if self.verbose:
+                print("LlamaManager::__postprocess_for_auto_generate_questions_categories::No content found")
+            return []
+        out = self.__get_items_between_tags(out, r"\[A\]", r"\[/A\]")
+        if not out:
+            if self.verbose:
+                print("LlamaManager::__postprocess_for_auto_generate_questions_categories::No categories found")
+            return []
+        return out
+    def auto_generate_questions_categories(
+        self,
+        count = 20,
+        available_categories = ["Variables"],
+        seed = 123,
+        temperature = 1.0,
+        top_p = 0.9,
+        frequency_penalty = 0.0
+        ):
+        available_content_for_assistant = self.__preprocss_for_auto_generate_questions_categories(available_categories)
+        if self.verbose:
+            print("LlamaManager::auto_generate_questions_categories::Generating questions categories")
+        message_content = [
+            {"role": "system", "content": "You are a synthetic data generator. You must only answer questions as a list. Each item of the list should be enclosed in [A] and [/A] tags. The list should be enclosed in [L] and [/L] tags."},
+            {"role": "user", "content": f"Write me {count} basic topics for python programming"},
+            {"role": "assistant", "content": f"[L]{available_content_for_assistant}"}
+        ]
+        out = self.client.chat_completion(
+            messages = message_content,
+            max_tokens = 1000,
+            stream = False,
+            seed = seed,
+            temperature = temperature,
+            top_p = top_p,
+            frequency_penalty = frequency_penalty
+        )
+        categories = self.__postprocess_for_auto_generate_questions_categories(out.choices[0].message.content)
+        if self.verbose:
+            print("LlamaManager::auto_generate_questions_categories::Generated questions Categories")
+        return categories
+    def __postprocess_for_auto_generate_shots_for_category(self, out):
+        if self.verbose:
+            print("LlamaManager::__postprocess_for_auto_generate_shots_for_category::Postprocessing")
+        out = self.__get_items_between_tags(out, r"\[L\]", r"\[/L\]")[0]
+        if not out:
+            if self.verbose:
+                print("LlamaManager::__postprocess_for_auto_generate_shots_for_category::No content found")
+            return []
+        out = self.__get_items_between_tags(out, r"\[A\]", r"\[/A\]")
+        if not out:
+            if self.verbose:
+                print("LlamaManager::__postprocess_for_auto_generate_shots_for_category::No questions found")
+            return []
+        return out
+    def auto_generate_shots_for_category(
+        self,
+        count,
+        category,
+        seed = 123,
+        temperature = 1.0,
+        top_p = 0.9,
+        frequency_penalty = 0.0
+        ):
+        if self.verbose:
+            print("LlamaManager::auto_generate_shots_for_category::Generating shots for category")
+        message_content = [
+            {"role": "system", "content": "You are a synthetic data generator. You must only answer questions as a list. Each item of the list should be enclosed in [A] and [/A] tags. The list should be enclosed in [L] and [/L] tags."},
+            {"role": "user", "content": f"Write me 2 programming questions on the topic of For Loop in Python. The question should be of medium and hard difficulty. The question should involve use of just one function"},
+            {"role": "assistant", "content": f"""[L]
+             - [A]Write a program that takes a positive integer as input and computes the sum of its digits using a for loop.[/A]
+             - [A]Write a program that generates a spiral matrix of size NxN, where N is always an odd number. Fill the spiral matrix with consecutive prime numbers in a clockwise spiral pattern, starting from the center of the matrix.[/A]
+             """},
+            {"role": "user", "content": f"Write me {count} programming questions on the topic of {category} in Python. The question should be of medium and hard difficulty. The question should involve use of just one function"},
+            {"role": "assistant", "content": f"[L]"}
+        ]
+        out = self.client.chat_completion(
+            messages = message_content,
+            max_tokens = 1000,
+            stream = False,
+            seed = seed,
+            temperature = temperature,
+            top_p = top_p,
+            frequency_penalty = frequency_penalty
+        )
+        shots = self.__postprocess_for_auto_generate_shots_for_category(out.choices[0].message.content + "[/L]")
+        if self.verbose:
+            print(f"LlamaManager::auto_generate_shots_for_category::Generated {count} shots for {category}")
+        return shots
+    def __preprocess_for_auto_generate_questions_from_shots(self, shots):
+        if self.verbose:
+            print("LlamaManager::__preprocess_for_auto_generate_questions_from_shots::Preprocessing")
+        out = ""
+        for shot in shots:
+            out += f"[A]{shot}[/A]"
+        return out
+    def __postprocess_for_auto_generate_questions_from_shots(self, out):
+        if self.verbose:
+            print("LlamaManager::__postprocess_for_auto_generate_questions_from_shots::Postprocessing")
+        out = self.__get_items_between_tags(out, r"\[L\]", r"\[/L\]")[0]
+        if not out:
+            if self.verbose:
+                print("LlamaManager::__postprocess_for_auto_generate_questions_from_shots::No content found")
+            return []
+        out = self.__get_items_between_tags(out, r"\[A\]", r"\[/A\]")
+        if not out:
+            if self.verbose:
+                print("LlamaManager::__postprocess_for_auto_generate_questions_from_shots::No questions found")
+            return []
+        return out
+    def auto_generate_questions_from_shots(
+        self,
+        count,
+        category,
+        shots,
+        seed = 123,
+        temperature = 1.0,
+        top_p = 0.9,
+        frequency_penalty = 0.0
+        ):
+        available_content_for_assistant = self.__preprocess_for_auto_generate_questions_from_shots(shots)
+        if self.verbose:
+            print("LlamaManager::auto_generate_questions_from_shots::Generating questions from shots")
+        message_content = [
+            {"role": "system", "content": "You are a synthetic data generator. You must only answer questions as a list. Each item of the list should be enclosed in [A] and [/A] tags. The list should be enclosed in [L] and [/L] tags."},
+            {"role": "user", "content": f"Write me {count} python programming questions which uses {category.lower()}"},
+            {"role": "assistant", "content": f"[L]{available_content_for_assistant}"}
+        ]
+        previous_iteration_questions_count = []
+        questions = []
+        token_count = 1000
+        while len(questions) < count:
+            out = self.client.chat_completion(
+                messages = message_content,
+                max_tokens = token_count,
+                stream = False,
+                seed = seed,
+                temperature = temperature,
+                top_p = top_p,
+                frequency_penalty = frequency_penalty
+            )
+            questions = self.__postprocess_for_auto_generate_questions_from_shots(out.choices[0].message.content + "[/L]")
+            available_content_for_assistant = self.__preprocess_for_auto_generate_questions_from_shots(questions)
+            previous_iteration_questions_count.append(len(questions))
+            message_content = [
+                {"role": "system", "content": "You are a synthetic data generator. You must only answer questions as a list. Each item of the list should be enclosed in [A] and [/A] tags. The list should be enclosed in [L] and [/L] tags."},
+                {"role": "user", "content": f"Write me {count} python programming questions which uses {category.lower()}"},
+                {"role": "assistant", "content": f"[L]{available_content_for_assistant}"}
+            ]
+            token_count += 500
+            if len(previous_iteration_questions_count) > 3:
+                if previous_iteration_questions_count[-1] == previous_iteration_questions_count[-2] == previous_iteration_questions_count[-3] == previous_iteration_questions_count[-4]:
+                    if self.verbose:
+                        print("LlamaManager::auto_generate_questions_from_shots::Generation could not be completed, stopping API calls")
+                    break
+        if self.verbose:
+            print("LlamaManager::auto_generate_questions_from_shots::Generated questions from shots")
+        return questions
+if __name__ == "__main__":
+    llama_manager = LlamaManager("nope", True)
+    categories = llama_manager.auto_generate_questions_categories(20)
+    shots = llama_manager.auto_generate_shots_for_category(2, categories[3])
+    questions = llama_manager.auto_generate_questions_from_shots(10, categories[3], shots, temperature = 0.5)

app.py ADDED Viewed

	@@ -0,0 +1,165 @@

+import gradio
+import LlamaManager
+import os
+import huggingface_hub
+HF_API = huggingface_hub.HfApi()
+LLAMAMANAGER = LlamaManager.LlamaManager(os.environ.get("HF_KEY_2"), True)
+def store_generated_data(data):
+    token = os.environ.get("HF_BOT")
+    data = f"{data}"
+    HF_API.comment_discussion("xqt/SyntheticMBPP2", 1, data, repo_type = "dataset", token = token)
+def authenticate(secret_textbox):
+    password_list = os.environ.get("PASSWORD_LIST")
+    password_list = password_list.split(":")
+    api_key = ""
+    if secret_textbox in password_list:
+        api_key = os.environ.get("HF_KEY")
+    else:
+        api_key = secret_textbox
+    LLAMAMANAGER = LlamaManager.LlamaManager(api_key, True)
+def generate_categories(categories_count, seed, temperature, top_p, frequency_penalty):
+    categories = LLAMAMANAGER.auto_generate_questions_categories(
+        count = categories_count,
+        seed = seed,
+        temperature = temperature,
+        top_p = top_p,
+        frequency_penalty = frequency_penalty
+    )
+    data = {
+        "type": "generate_categories",
+        "categories": categories,
+        "count": categories_count,
+        "seed": seed,
+        "temperature": temperature,
+        "top_p": top_p,
+        "frequency_penalty": frequency_penalty
+    }
+    store_generated_data(data)
+    return gradio.Dropdown(choices = categories, value = categories[0], label = "Select Category", interactive = True)
+def generate_shots(category, shots_count, seed, temperature, top_p, frequency_penalty):
+    shots = LLAMAMANAGER.auto_generate_shots_for_category(category, shots_count, seed, temperature, top_p, frequency_penalty)
+    shots = [[shot] for shot in shots]
+    data = {
+        "type": "generate_shots",
+        "category": category,
+        "shots": shots,
+        "count": shots_count,
+        "seed": seed,
+        "temperature": temperature,
+        "top_p": top_p,
+        "frequency_penalty": frequency_penalty
+    }
+    store_generated_data(data)
+    return gradio.DataFrame(value = shots, type = "array", label = "Generated Shots", interactive = False, headers = None)
+def generate_questions(questions_count, category, shots, seed, temperature, top_p, frequency_penalty):
+    questions = LLAMAMANAGER.auto_generate_questions_from_shots(questions_count, category, shots, seed, temperature, top_p, frequency_penalty)
+    questions = [[question] for question in questions]
+    data = {
+        "type": "generate_questions",
+        "questions": questions,
+        "count": questions_count,
+        "category": category,
+        "shots": shots,
+        "seed": seed,
+        "temperature": temperature,
+        "top_p": top_p,
+        "frequency_penalty": frequency_penalty
+    }
+    store_generated_data(data)
+    return gradio.DataFrame(value = questions, type = "array", label = "Generated Shots", interactive = False, headers = None)
+with gradio.Blocks(fill_height=True) as base_app:
+    gradio.Markdown("# Synthetic Python Programming Data Generation ⚙️")
+    gradio.Markdown("# ❗️ Note: The data generated here by Llama3 and the settings used to generate it will be stored in the repository for future use.")
+    gradio.Markdown("# ❗️ Feel free to use your own API key if the key here is rate limited. API Key is never stored in the repository.")
+    gradio.Markdown("# ❗️ If you want to use a passcode, please text me.")
+    gradio.Markdown("# Step 0: Use your own API Key/Passcode")
+    with gradio.Row():
+        with gradio.Column():
+            __secret_textbox = gradio.Textbox(label = "API Key/Passcode", placeholder = "Enter your API Key/Passcode here", type = "password", interactive = True)
+        with gradio.Column():
+            __passcode_authenticate = gradio.Button("Authenticate", scale = 2)
+    gradio.Markdown("# Step 1: How many categories do you want to generate?")
+    with gradio.Row(equal_height = True):
+        with gradio.Column(scale = 2):
+            __categories_count = gradio.Slider(minimum = 1, maximum = 20, step = 1, value = 10, label = "Number of Categories", interactive = True)
+        with gradio.Column():
+            __categories_generate = gradio.Button("Generate Categories", scale = 2)
+    with gradio.Accordion("Advanced Settings", open = False):
+        with gradio.Row():
+            with gradio.Column():
+                __categories_temperature = gradio.Slider(minimum = 0.1, maximum = 2.0, step = 0.01, value = 1.0, label = "Temperature", interactive = True)
+                __categories_top_p = gradio.Slider(minimum = 0.1, maximum = 0.99, step = 0.01, value = 0.9, label = "Top P", interactive = True)
+            with gradio.Column():
+                __categories_frequency_penalty = gradio.Slider(minimum = -2.0, maximum = 2.0, step = 0.01, value = 0.0, label = "Frequency Penalty", interactive = True)
+                __categories_seed = gradio.Slider(minimum = 0, maximum = 1000, step = 1, value = 123, label = "Seed", interactive = True)
+    gradio.Markdown("# Step 2: Select a category to generate shots for and select the number of shots to generate")
+    with gradio.Row():
+        with gradio.Column(scale = 2):
+            __shots_category = gradio.Dropdown(choices = [], label = "Select Category", interactive = True)
+            __shots_count = gradio.Slider(minimum = 2, maximum = 5, step = 1, value = 2, label = "Number of Shots", interactive = True)
+        with gradio.Column():
+            __shots_generate = gradio.Button("Generate Shots", scale = 2)
+    with gradio.Accordion("Advanced Settings", open = False):
+        with gradio.Row():
+            with gradio.Column():
+                __shots_temperature = gradio.Slider(minimum = 0.1, maximum = 2.0, step = 0.01, value = 1.0, label = "Temperature", interactive = True)
+                __shots_top_p = gradio.Slider(minimum = 0.1, maximum = 0.99, step = 0.01, value = 0.9, label = "Top P", interactive = True)
+            with gradio.Column():
+                __shots_frequency_penalty = gradio.Slider(minimum = -2.0, maximum = 2.0, step = 0.01, value = 0.0, label = "Frequency Penalty", interactive = True)
+                __shots_seed = gradio.Slider(minimum = 0, maximum = 1000, step = 1, value = 123, label = "Seed", interactive = True)
+    __generated_shots = gradio.DataFrame(value = [], col_count = 1, type = "array", label = "Generated Shots", interactive = False, headers = None)
+    gradio.Markdown("# Step 3: Generate Python Programming Questions for the generated shots")
+    with gradio.Row():
+        with gradio.Column(scale = 2):
+            __questions_count = gradio.Slider(minimum = 1, maximum = 30, step = 1, value = 10, label = "Number of Questions", interactive = True)
+        with gradio.Column():
+            __questions_generate = gradio.Button("Generate Questions", scale = 2)
+    with gradio.Accordion("Advanced Settings", open = False):
+        with gradio.Row():
+            with gradio.Column():
+                __questions_temperature = gradio.Slider(minimum = 0.1, maximum = 2.0, step = 0.01, value = 1.0, label = "Temperature", interactive = True)
+                __questions_top_p = gradio.Slider(minimum = 0.1, maximum = 0.99, step = 0.01, value = 0.9, label = "Top P", interactive = True)
+            with gradio.Column():
+                __questions_frequency_penalty = gradio.Slider(minimum = -2.0, maximum = 2.0, step = 0.01, value = 0.0, label = "Frequency Penalty", interactive = True)
+                __questions_seed = gradio.Slider(minimum = 0, maximum = 1000, step = 1, value = 123, label = "Seed", interactive = True)
+    __generated_questions = gradio.DataFrame(value = [], col_count = 1, type = "array", label = "Generated Shots", interactive = False, headers = None)
+    __passcode_authenticate.click(authenticate,
+                                    inputs = [__secret_textbox],
+                                    outputs = []
+                                    )
+    __categories_generate.click(generate_categories,
+                                inputs = [__categories_count, __categories_seed, __categories_temperature, __categories_top_p, __categories_frequency_penalty],
+                                outputs = [__shots_category]
+                                )
+    __shots_generate.click(generate_shots,
+                            inputs = [__shots_category, __shots_count, __shots_seed, __shots_temperature, __shots_top_p, __shots_frequency_penalty],
+                            outputs = [__generated_shots]
+                            )
+    __questions_generate.click(generate_questions,
+                                inputs = [__questions_count, __shots_category, __generated_shots, __questions_seed, __questions_temperature, __questions_top_p, __questions_frequency_penalty],
+                                outputs = [__generated_questions]
+                                )
+if __name__ == "__main__":
+    base_app.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ gradio
2	+ huggingface_hub