burtenshaw (HF staff) committed
Commit 26fb24b • 1 Parent(s): 63a8770

Upload 9 files

defaults.py CHANGED
@@ -3,7 +3,7 @@ import json
 
 SEED_DATA_PATH = "seed_data.json"
 PIPELINE_PATH = "pipeline.yaml"
-REMOTE_CODE_PATHS = ["defaults.py", "domain.py", "pipeline.py", "requirements.txt"]
+REMOTE_CODE_PATHS = ["requirements.txt"]
 DIBT_PARENT_APP_URL = "https://argilla-domain-specific-datasets-welcome.hf.space/"
 N_PERSPECTIVES = 5
 N_TOPICS = 5
hub.py CHANGED
@@ -94,7 +94,7 @@ def push_pipeline_to_hub(
     # upload the pipeline to the hub
     hf_api.upload_file(
         path_or_fileobj=pipeline_path,
-        path_in_repo="pipeline.yaml",
+        path_in_repo="pipeline.py",
         token=hub_token,
         repo_id=repo_id,
         repo_type="dataset",
@@ -115,7 +115,7 @@ def push_pipeline_to_hub(
 def pull_seed_data_from_repo(repo_id, hub_token):
     # pull the dataset repo from the hub
     hf_api.hf_hub_download(
-        repo_id=repo_id, token=hub_token, repo_type="dataset", filename=SEED_DATA_PATH, force_download=True
+        repo_id=repo_id, token=hub_token, repo_type="dataset", filename=SEED_DATA_PATH
     )
     return json.load(open(SEED_DATA_PATH))
 
@@ -127,3 +127,25 @@ def push_argilla_dataset_to_hub(
     feedback_dataset = rg.FeedbackDataset.from_argilla(name=name, workspace=workspace)
     local_dataset = feedback_dataset.pull()
     local_dataset.push_to_huggingface(repo_id=repo_id)
+
+
+def push_pipeline_params(
+    pipeline_params,
+    hub_username,
+    hub_token: str,
+    project_name,
+):
+    repo_id = f"{hub_username}/{project_name}"
+    temp_path = mktemp()
+    with open(temp_path, "w") as f:
+        json.dump(pipeline_params, f)
+    # upload the pipeline params to the hub
+    hf_api.upload_file(
+        path_or_fileobj=temp_path,
+        path_in_repo="pipeline_params.json",
+        token=hub_token,
+        repo_id=repo_id,
+        repo_type="dataset",
+    )
+
+    print(f"Pipeline params uploaded to {repo_id}")
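For reference, a minimal sketch of how the new `push_pipeline_params` helper is called. All values below are hypothetical placeholders (the real call site is in `pages/3_🌱 Generate Dataset.py` further down), and `mktemp` is assumed to be imported from `tempfile` at the top of `hub.py`:

    from hub import push_pipeline_params

    # All values here are illustrative placeholders, not values from the commit.
    push_pipeline_params(
        pipeline_params={
            "argilla_api_key": "owner.apikey",
            "argilla_api_url": "https://my-argilla-space.hf.space",
            "argilla_dataset_name": "my_domain_data",
            "endpoint_base_url": "https://api-inference.huggingface.co/models/my-model",
        },
        hub_username="my-username",
        hub_token="hf_xxx",
        project_name="my-project",
    )
    # Serializes the params dict to a temp file and uploads it as
    # pipeline_params.json in the my-username/my-project dataset repo.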
pages/2_πŸ‘©πŸΌβ€πŸ”¬ Describe Domain.py CHANGED
@@ -2,14 +2,9 @@ import json
 
 import streamlit as st
 
-from hub import push_dataset_to_hub
+from hub import push_dataset_to_hub, pull_seed_data_from_repo
 from infer import query
 from defaults import (
-    DEFAULT_DOMAIN,
-    DEFAULT_PERSPECTIVES,
-    DEFAULT_TOPICS,
-    DEFAULT_EXAMPLES,
-    DEFAULT_SYSTEM_PROMPT,
     N_PERSPECTIVES,
     N_TOPICS,
     SEED_DATA_PATH,
@@ -18,12 +13,14 @@ from defaults import (
 )
 from utils import project_sidebar
 
+
 st.set_page_config(
     page_title="Domain Data Grower",
     page_icon="🧑‍🌾",
 )
 project_sidebar()
 
+
 ################################################################################
 # HEADER
 ################################################################################
@@ -37,6 +34,23 @@ st.write(
     "Define the project details, including the project name, domain, and API credentials"
 )
 
+
+################################################################################
+# LOAD EXISTING DOMAIN DATA
+################################################################################
+
+DATASET_REPO_ID = (
+    f"{st.session_state['hub_username']}/{st.session_state['project_name']}"
+)
+SEED_DATA = pull_seed_data_from_repo(
+    DATASET_REPO_ID, hub_token=st.session_state["hub_token"]
+)
+DEFAULT_DOMAIN = SEED_DATA.get("domain", "")
+DEFAULT_PERSPECTIVES = SEED_DATA.get("perspectives", [""])
+DEFAULT_TOPICS = SEED_DATA.get("topics", [""])
+DEFAULT_EXAMPLES = SEED_DATA.get("examples", [{"question": "", "answer": ""}])
+DEFAULT_SYSTEM_PROMPT = SEED_DATA.get("domain_expert_prompt", "")
+
 ################################################################################
 # Domain Expert Section
 ################################################################################
@@ -212,22 +226,6 @@ with tab_raw_seed:
 
     st.divider()
 
-    hub_username = DATASET_REPO_ID.split("/")[0]
-    project_name = DATASET_REPO_ID.split("/")[1]
-    st.write("Define the dataset repo details on the Hub")
-    st.session_state["project_name"] = st.text_input("Project Name", project_name)
-    st.session_state["hub_username"] = st.text_input("Hub Username", hub_username)
-    st.session_state["hub_token"] = st.text_input("Hub Token", type="password", value=None)
-
-    if all(
-        (
-            st.session_state.get("project_name"),
-            st.session_state.get("hub_username"),
-            st.session_state.get("hub_token"),
-        )
-    ):
-        st.success(f"Using the dataset repo {hub_username}/{project_name} on the Hub")
-
 
     if st.button("🤗 Push Dataset Seed") and all(
        (
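The page now seeds its form defaults from the dataset repo instead of hard-coded constants. For reference, the seed data it pulls is a JSON object shaped roughly like this (a sketch inferred from the `.get()` calls above; the values are illustrative placeholders):

    # Illustrative shape of seed_data.json; the keys match the .get() calls
    # in the diff above, the values are placeholders.
    seed_data = {
        "domain": "farming",
        "perspectives": ["family farming"],
        "topics": ["animal welfare"],
        "examples": [{"question": "...", "answer": "..."}],
        "domain_expert_prompt": "You will assume the role of an expert in ...",
    }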
pages/3_🌱 Generate Dataset.py CHANGED
@@ -1,18 +1,9 @@
 import streamlit as st
 
-from hub import pull_seed_data_from_repo, push_pipeline_to_hub
-from defaults import (
-    DEFAULT_SYSTEM_PROMPT,
-    PIPELINE_PATH,
-    PROJECT_NAME,
-    ARGILLA_URL,
-    HUB_USERNAME,
-    CODELESS_DISTILABEL,
-)
+from defaults import ARGILLA_URL
+from hub import push_pipeline_params, push_pipeline_to_hub
 from utils import project_sidebar
 
-from pipeline import serialize_pipeline, run_pipeline, create_pipelines_run_command
-
 st.set_page_config(
     page_title="Domain Data Grower",
     page_icon="🧑‍🌾",
@@ -27,20 +18,15 @@ project_sidebar()
 st.header("🧑‍🌾 Domain Data Grower")
 st.divider()
 st.subheader("Step 3. Run the pipeline to generate synthetic data")
-st.write("Define the project repos and models that the pipeline will use.")
+st.write("Define the distilabel pipeline for generating the dataset.")
 
-st.divider()
 ###############################################################
 # CONFIGURATION
 ###############################################################
 
-st.markdown("## Pipeline Configuration")
-
-st.markdown("#### 🤗 Hub details to pull the seed data")
-hub_username = st.text_input("Hub Username", HUB_USERNAME)
-project_name = st.text_input("Project Name", PROJECT_NAME)
-repo_id = f"{hub_username}/{project_name}"
-hub_token = st.text_input("Hub Token", type="password")
+hub_username = st.session_state.get("hub_username")
+project_name = st.session_state.get("project_name")
+hub_token = st.session_state.get("hub_token")
 
 st.divider()
 
@@ -89,169 +75,74 @@ st.divider()
 
 st.markdown("## Run the pipeline")
 
-st.write(
-    "Once you've defined the pipeline configuration, you can run the pipeline from your local machine."
+st.markdown(
+    "Once you've defined the pipeline configuration above, you can run the pipeline from your local machine."
 )
 
-if CODELESS_DISTILABEL:
-    st.write(
-        """We recommend running the pipeline locally if you're planning on generating a large dataset. \
-        But running the pipeline on this space is a handy way to get started quickly. Your synthetic
-        samples will be pushed to Argilla and available for review.
-        """
-    )
-    st.write(
-        """If you're planning on running the pipeline on the space, be aware that it \
-        will take some time to complete and you will need to maintain a \
-        connection to the space."""
-    )
 
-if st.button("💻 Run pipeline locally", key="run_pipeline_local"):
-    if all(
-        [
-            argilla_api_key,
-            argilla_url,
-            base_url,
-            hub_username,
-            project_name,
-            hub_token,
-            argilla_dataset_name,
-        ]
-    ):
-        with st.spinner("Pulling seed data from the Hub..."):
-            try:
-                seed_data = pull_seed_data_from_repo(
-                    repo_id=f"{hub_username}/{project_name}",
-                    hub_token=hub_token,
-                )
-            except Exception:
-                st.error(
-                    "Seed data not found. Please make sure you pushed the data seed in Step 2."
-                )
-
-        domain = seed_data["domain"]
-        perspectives = seed_data["perspectives"]
-        topics = seed_data["topics"]
-        examples = seed_data["examples"]
-        domain_expert_prompt = seed_data["domain_expert_prompt"]
-
-        with st.spinner("Serializing the pipeline configuration..."):
-            serialize_pipeline(
-                argilla_api_key=argilla_api_key,
-                argilla_dataset_name=argilla_dataset_name,
-                argilla_api_url=argilla_url,
-                topics=topics,
-                perspectives=perspectives,
-                pipeline_config_path=PIPELINE_PATH,
-                domain_expert_prompt=domain_expert_prompt or DEFAULT_SYSTEM_PROMPT,
-                hub_token=hub_token,
-                endpoint_base_url=base_url,
-                examples=examples,
-            )
-            push_pipeline_to_hub(
-                pipeline_path=PIPELINE_PATH,
-                hub_token=hub_token,
-                hub_username=hub_username,
-                project_name=project_name,
-            )
-
-        st.success(f"Pipeline configuration saved to {hub_username}/{project_name}")
-
-        st.info(
-            "To run the pipeline locally, you need to have the `distilabel` library installed. You can install it using the following command:"
-        )
-        st.text(
-            "Execute the following command to generate a synthetic dataset from the seed data:"
-        )
-        command_to_run = create_pipelines_run_command(
-            hub_token=hub_token,
-            pipeline_config_path=PIPELINE_PATH,
-            argilla_dataset_name=argilla_dataset_name,
-            argilla_api_key=argilla_api_key,
-            argilla_api_url=argilla_url,
-        )
-        st.code(
-            f"""
-            pip install git+https://github.com/argilla-io/distilabel.git
-            git clone https://huggingface.co/datasets/{hub_username}/{project_name}
-            cd {project_name}
-            pip install -r requirements.txt
-            {' '.join(["python"] + command_to_run[1:])}
-            """,
-            language="bash",
-        )
-        st.subheader(
-            "👩‍🚀 If you want to access the pipeline and manipulate the locally, you can do:"
-        )
-        st.code(
-            """
-            git clone https://github.com/huggingface/data-is-better-together
-            cd domain-specific-datasets
-            """
-        )
-    else:
-        st.error("Please fill all the required fields.")
-
-###############################################################
-# SPACE
-###############################################################
-if CODELESS_DISTILABEL:
-    if st.button("🔥 Run pipeline right here, right now!"):
-        if all(
-            [
-                argilla_api_key,
-                argilla_url,
-                base_url,
-                hub_username,
-                project_name,
-                hub_token,
-                argilla_dataset_name,
-            ]
-        ):
-            with st.spinner("Pulling seed data from the Hub..."):
-                try:
-                    seed_data = pull_seed_data_from_repo(
-                        repo_id=f"{hub_username}/{project_name}",
-                        hub_token=hub_token,
-                    )
-                except Exception as e:
-                    st.error(
-                        "Seed data not found. Please make sure you pushed the data seed in Step 2."
-                    )
-
-            domain = seed_data["domain"]
-            perspectives = seed_data["perspectives"]
-            topics = seed_data["topics"]
-            examples = seed_data["examples"]
-            domain_expert_prompt = seed_data["domain_expert_prompt"]
-
-            serialize_pipeline(
-                argilla_api_key=argilla_api_key,
-                argilla_dataset_name=argilla_dataset_name,
-                argilla_api_url=argilla_url,
-                topics=topics,
-                perspectives=perspectives,
-                pipeline_config_path=PIPELINE_PATH,
-                domain_expert_prompt=domain_expert_prompt or DEFAULT_SYSTEM_PROMPT,
-                hub_token=hub_token,
-                endpoint_base_url=base_url,
-                examples=examples,
-            )
-
-            with st.spinner("Starting the pipeline..."):
-                logs = run_pipeline(
-                    pipeline_config_path=PIPELINE_PATH,
-                    argilla_api_key=argilla_api_key,
-                    argilla_api_url=argilla_url,
-                    hub_token=hub_token,
-                    argilla_dataset_name=argilla_dataset_name,
-                )
-
-            st.success(f"Pipeline started successfully! 🚀")
-
-            with st.expander(label="View Logs", expanded=True):
-                for out in logs:
-                    st.text(out)
-        else:
-            st.error("Please fill all the required fields.")
+if all(
+    [
+        argilla_api_key,
+        argilla_url,
+        base_url,
+        hub_username,
+        project_name,
+        hub_token,
+        argilla_dataset_name,
+    ]
+):
+    push_pipeline_params(
+        pipeline_params={
+            "argilla_api_key": argilla_api_key,
+            "argilla_api_url": argilla_url,
+            "argilla_dataset_name": argilla_dataset_name,
+            "endpoint_base_url": base_url,
+        },
+        hub_username=hub_username,
+        hub_token=hub_token,
+        project_name=project_name,
+    )
+
+    push_pipeline_to_hub(
+        pipeline_path="pipeline.py",
+        hub_username=hub_username,
+        hub_token=hub_token,
+        project_name=project_name,
+    )
+
+    st.markdown(
+        "To run the pipeline locally, you need to have the `distilabel` library installed. You can install it using the following command:"
+    )
+
+    st.code(
+        """
+        # Install the distilabel library
+        pip install git+https://github.com/argilla-io/distilabel.git
+        """
+    )
+
+    st.markdown("Next, you'll need to clone your dataset repo and run the pipeline:")
+
+    st.code(
+        f"""
+        git clone https://huggingface.co/datasets/{hub_username}/{project_name}
+        cd {project_name}
+        pip install -r requirements.txt
+        """
+    )
+
+    st.markdown("Finally, you can run the pipeline using the following command:")
+
+    st.code(
+        """
+        huggingface-cli login
+        python pipeline.py""",
+        language="bash",
+    )
+    st.markdown(
+        "👩‍🚀 If you want to customise the pipeline, take a look at `pipeline.py` and the [distilabel docs](https://distilabel.argilla.io/)"
+    )
+
+else:
+    st.info("Please fill all the required fields.")
 
 
pipeline.py CHANGED
@@ -1,95 +1,142 @@
-import subprocess
-import sys
-import time
-from typing import List
+import json
+from textwrap import dedent
+from typing import Any, Dict, List
 
-from distilabel.steps.generators.data import LoadDataFromDicts
-from distilabel.steps.expand import ExpandColumns
-from distilabel.steps.keep import KeepColumns
-from distilabel.steps.tasks.self_instruct import SelfInstruct
-from distilabel.steps.tasks.evol_instruct.base import EvolInstruct
 from distilabel.llms.huggingface import InferenceEndpointsLLM
 from distilabel.pipeline import Pipeline
 from distilabel.steps import TextGenerationToArgilla
-from dotenv import load_dotenv
-
-from domain import (
-    DomainExpert,
-    CleanNumberedList,
-    create_topics,
-    create_examples_template,
-    APPLICATION_DESCRIPTION,
-)
-
-load_dotenv()
-
-
-def define_pipeline(
-    argilla_api_key: str,
-    argilla_api_url: str,
-    argilla_dataset_name: str,
-    topics: List[str],
-    perspectives: List[str],
-    domain_expert_prompt: str,
-    examples: List[dict],
-    hub_token: str,
-    endpoint_base_url: str,
-):
-    """Define the pipeline for the specific domain."""
-
-    terms = create_topics(topics, perspectives)
-    template = create_examples_template(examples)
-    with Pipeline("farming") as pipeline:
+from distilabel.steps.expand import ExpandColumns
+from distilabel.steps.generators.data import LoadDataFromDicts
+from distilabel.steps.tasks.self_instruct import SelfInstruct
+from distilabel.steps.tasks.text_generation import TextGeneration
+from distilabel.steps.tasks.typing import ChatType
+
+
+################################################################################
+# Functions to create task prompts
+################################################################################
+
+
+def create_application_instruction(domain: str, examples: List[Dict[str, str]]):
+    """Create the instruction for the Self-Instruct task."""
+    system_prompt = dedent(
+        f"""You are an AI assistant that generates queries around the domain of {domain}.
+        You should expect not basic but profound questions from your users.
+        The queries should reflect a diversity of vision and economic and political positions.
+        The queries may know about different methods of {domain}.
+        The queries can be positioned politically, economically, socially, or practically.
+        Also take into account the impact of diverse causes on diverse domains."""
+    )
+    for example in examples:
+        question = example["question"]
+        answer = example["answer"]
+        system_prompt += f"""\n- Question: {question}\n- Answer: {answer}\n"""
+    return system_prompt
+
+
+def create_seed_terms(topics: List[str], perspectives: List[str]) -> List[str]:
+    """Create seed terms for self instruct to start from."""
+
+    return [
+        f"{topic} from a {perspective} perspective"
+        for topic in topics
+        for perspective in perspectives
+    ]
+
+
+################################################################################
+# Define our custom step for the domain expert
+################################################################################
+
+
+class DomainExpert(TextGeneration):
+    """A customized task to generate text as a domain expert in the domain of farming and agriculture."""
+
+    system_prompt: str
+    template: str = """This is the instruction: {instruction}"""
+
+    def format_input(self, input: Dict[str, Any]) -> "ChatType":
+        return [
+            {
+                "role": "system",
+                "content": self.system_prompt,
+            },
+            {
+                "role": "user",
+                "content": self.template.format(**input),
+            },
+        ]
+
+
+################################################################################
+# Main script to run the pipeline
+################################################################################
+
+
+if __name__ == "__main__":
+
+    import os
+
+    # load pipeline parameters
+
+    with open("pipeline_params.json", "r") as f:
+        params = json.load(f)
+
+    argilla_api_key = params.get("argilla_api_key")
+    argilla_api_url = params.get("argilla_api_url")
+    argilla_dataset_name = params.get("argilla_dataset_name")
+    endpoint_base_url = params.get("endpoint_base_url")
+    hub_token = os.environ.get("hub_token")
+
+    # collect our seed data
+
+    with open("seed_data.json", "r") as f:
+        seed_data = json.load(f)
+
+    topics = seed_data.get("topics", [])
+    perspectives = seed_data.get("perspectives", [])
+    domain_expert_prompt = seed_data.get("domain_expert_prompt", "")
+    examples = seed_data.get("examples", [])
+    domain_name = seed_data.get("domain_name", "domain")
+
+    # Define the task prompts
+
+    terms = create_seed_terms(topics=topics, perspectives=perspectives)
+    application_instruction = create_application_instruction(
+        domain=domain_name, examples=examples
+    )
+
+    # Define the distilabel pipeline
+
+    with Pipeline(domain_name) as pipeline:
         load_data = LoadDataFromDicts(
             name="load_data",
             data=[{"input": term} for term in terms],
             batch_size=64,
         )
-        llm = InferenceEndpointsLLM(
-            base_url=endpoint_base_url,
-            api_key=hub_token,
-        )
+
         self_instruct = SelfInstruct(
-            name="self-instruct",
-            application_description=APPLICATION_DESCRIPTION,
+            name="self_instruct",
+            application_description=application_instruction,
             num_instructions=5,
             input_batch_size=8,
-            llm=llm,
-        )
-
-        evol_instruction_complexity = EvolInstruct(
-            name="evol_instruction_complexity",
-            llm=llm,
-            num_evolutions=2,
-            store_evolutions=True,
-            input_batch_size=8,
-            include_original_instruction=True,
-            input_mappings={"instruction": "question"},
+            llm=InferenceEndpointsLLM(
+                base_url=endpoint_base_url,
+                api_key=hub_token,
+            ),
         )
 
         expand_instructions = ExpandColumns(
-            name="expand_columns", columns={"instructions": "question"}
-        )
-        cleaner = CleanNumberedList(name="clean_numbered_list")
-        expand_evolutions = ExpandColumns(
-            name="expand_columns_evolved",
-            columns={"evolved_instructions": "evolved_questions"},
+            name="expand_columns", columns={"instructions": "instruction"}
         )
 
         domain_expert = DomainExpert(
             name="domain_expert",
-            llm=llm,
+            llm=InferenceEndpointsLLM(
+                base_url=endpoint_base_url,
+                api_key=hub_token,
+            ),
             input_batch_size=8,
-            input_mappings={"instruction": "evolved_questions"},
-            output_mappings={"generation": "domain_expert_answer"},
-        )
-
-        domain_expert._system_prompt = domain_expert_prompt
-        domain_expert._template = template
-
-        keep_columns = KeepColumns(
-            name="keep_columns",
-            columns=["model_name", "evolved_questions", "domain_expert_answer"],
+            system_prompt=domain_expert_prompt,
         )
 
         to_argilla = TextGenerationToArgilla(
@@ -98,111 +145,30 @@ def define_pipeline(
             dataset_workspace="admin",
             api_url=argilla_api_url,
             api_key=argilla_api_key,
-            input_mappings={
-                "instruction": "evolved_questions",
-                "generation": "domain_expert_answer",
-            },
         )
 
+        # Connect up the pipeline
+
         load_data.connect(self_instruct)
         self_instruct.connect(expand_instructions)
-        expand_instructions.connect(cleaner)
-        cleaner.connect(evol_instruction_complexity)
-        evol_instruction_complexity.connect(expand_evolutions)
-        expand_evolutions.connect(domain_expert)
-        domain_expert.connect(keep_columns)
-        keep_columns.connect(to_argilla)
-        return pipeline
-
-
-def serialize_pipeline(
-    argilla_api_key: str,
-    argilla_api_url: str,
-    argilla_dataset_name: str,
-    topics: List[str],
-    perspectives: List[str],
-    domain_expert_prompt: str,
-    hub_token: str,
-    endpoint_base_url: str,
-    pipeline_config_path: str = "pipeline.yaml",
-    examples: List[dict] = [],
-):
-    """Serialize the pipeline to a yaml file."""
-    pipeline = define_pipeline(
-        argilla_api_key=argilla_api_key,
-        argilla_api_url=argilla_api_url,
-        argilla_dataset_name=argilla_dataset_name,
-        topics=topics,
-        perspectives=perspectives,
-        domain_expert_prompt=domain_expert_prompt,
-        hub_token=hub_token,
-        endpoint_base_url=endpoint_base_url,
-        examples=examples,
-    )
-    pipeline.save(path=pipeline_config_path, overwrite=True, format="yaml")
-
-
-def create_pipelines_run_command(
-    hub_token: str,
-    argilla_api_key: str,
-    argilla_api_url: str,
-    pipeline_config_path: str = "pipeline.yaml",
-    argilla_dataset_name: str = "domain_specific_datasets",
-):
-    """Create the command to run the pipeline."""
-    command_to_run = [
-        sys.executable,
-        "-m",
-        "distilabel",
-        "pipeline",
-        "run",
-        "--config",
-        pipeline_config_path,
-        "--param",
-        f"text_generation_to_argilla.dataset_name={argilla_dataset_name}",
-        "--param",
-        f"text_generation_to_argilla.api_key={argilla_api_key}",
-        "--param",
-        f"text_generation_to_argilla.api_url={argilla_api_url}",
-        "--param",
-        f"self-instruct.llm.api_key={hub_token}",
-        "--param",
-        f"evol_instruction_complexity.llm.api_key={hub_token}",
-        "--param",
-        f"domain_expert.llm.api_key={hub_token}",
-        "--ignore-cache",
-    ]
-    return command_to_run
-
-
-def run_pipeline(
-    hub_token: str,
-    argilla_api_key: str,
-    argilla_api_url: str,
-    pipeline_config_path: str = "pipeline.yaml",
-    argilla_dataset_name: str = "domain_specific_datasets",
-):
-    """Run the pipeline and yield the output as a generator of logs."""
-
-    command_to_run = create_pipelines_run_command(
-        hub_token=hub_token,
-        pipeline_config_path=pipeline_config_path,
-        argilla_dataset_name=argilla_dataset_name,
-        argilla_api_key=argilla_api_key,
-        argilla_api_url=argilla_api_url,
-    )
-
-    # Run the script file
-    process = subprocess.Popen(
-        args=command_to_run,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-        env={"HF_TOKEN": hub_token},
-    )
-
-    while process.stdout and process.stdout.readable():
-        time.sleep(0.2)
-        line = process.stdout.readline()
-        if not line:
-            break
-        yield line.decode("utf-8")
+        expand_instructions.connect(domain_expert)
+        domain_expert.connect(to_argilla)
 
+        # Run the pipeline
 
+    pipeline.run(
+        parameters={
+            "self_instruct": {
+                "llm": {"api_key": hub_token, "base_url": endpoint_base_url}
+            },
+            "domain_expert": {
+                "llm": {"api_key": hub_token, "base_url": endpoint_base_url}
+            },
+            "text_generation_to_argilla": {
+                "dataset_name": argilla_dataset_name,
+                "api_key": argilla_api_key,
+                "api_url": argilla_api_url,
+            },
+        },
+        use_cache=False,
+    )
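Two details worth noting when running this script: it appears to read the Hub token from a lowercase `hub_token` environment variable (not from the `huggingface-cli login` cache), and the custom `DomainExpert` task builds a plain chat payload. A minimal sketch of what `format_input` returns for one row, where the prompt and instruction text are hypothetical placeholders:

    # Hypothetical values; shows the ChatType payload DomainExpert builds
    # from its system_prompt and template.
    domain_expert_prompt = "You will assume the role of a farming expert."
    instruction = "How does crop rotation affect soil health?"
    messages = [
        {"role": "system", "content": domain_expert_prompt},
        {"role": "user", "content": f"This is the instruction: {instruction}"},
    ]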
utils.py CHANGED
@@ -26,8 +26,30 @@ def project_sidebar():
     )
     st.sidebar.link_button(f"📚 Dataset Repo", DATASET_URL)
     st.sidebar.link_button(f"🤖 Argilla Space", ARGILLA_URL)
-    st.sidebar.divider()
-    st.sidebar.link_button("🧑‍🌾 New Project", DIBT_PARENT_APP_URL)
+    hub_username = DATASET_REPO_ID.split("/")[0]
+    project_name = DATASET_REPO_ID.split("/")[1]
+    st.session_state["project_name"] = project_name
+    st.session_state["hub_username"] = hub_username
+    st.session_state["hub_token"] = st.sidebar.text_input(
+        "Hub Token", type="password", value=None
+    )
     st.sidebar.link_button(
         "🤗 Get your Hub Token", "https://huggingface.co/settings/tokens"
     )
+    if all(
+        (
+            st.session_state.get("project_name"),
+            st.session_state.get("hub_username"),
+            st.session_state.get("hub_token"),
+        )
+    ):
+        st.success(f"Using the dataset repo {hub_username}/{project_name} on the Hub")
+
+    st.sidebar.divider()
+
+    st.sidebar.link_button("🧑‍🌾 New Project", DIBT_PARENT_APP_URL)
+
+    if st.session_state["hub_token"] is None:
+        st.error("Please provide a Hub token to generate answers")
+        st.stop()
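Since every page calls this sidebar, downstream pages can read the repo details straight from session state once the user has entered a token. A minimal sketch of the consuming side, mirroring how `pages/2_👩🏼‍🔬 Describe Domain.py` uses it:

    import streamlit as st
    from utils import project_sidebar

    # project_sidebar() populates hub_username, project_name and hub_token
    # in session state, and calls st.stop() if no token was provided.
    project_sidebar()

    repo_id = f"{st.session_state['hub_username']}/{st.session_state['project_name']}"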