File size: 4,260 Bytes
8773ff3 32014a1 8773ff3 32014a1 8773ff3 32014a1 8773ff3 798f8ba 8773ff3 798f8ba 8773ff3 798f8ba 8773ff3 798f8ba 8773ff3 32014a1 8773ff3 32014a1 8773ff3 32014a1 8773ff3 32014a1 8773ff3 32014a1 8773ff3 32014a1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
import streamlit as st
from defaults import ARGILLA_URL
from utils import project_sidebar
# Title and icon are declared once and reused by both the page config and the
# visible header below.
_PAGE_ICON = "🧑🌾"
_PAGE_TITLE = "Domain Data Grower"

# Configure the page, then draw the sidebar supplied by utils.project_sidebar.
st.set_page_config(page_title=_PAGE_TITLE, page_icon=_PAGE_ICON)
project_sidebar()

################################################################################
# HEADER
################################################################################

st.header(f"{_PAGE_ICON} {_PAGE_TITLE}")
st.divider()

# Introduce the step this page covers.
st.subheader("Step 3. Run the pipeline to generate synthetic data")
st.write("Define the distilabel pipeline for generating the dataset.")
###############################################################
# CONFIGURATION
###############################################################

# Read the project settings from the Streamlit session state in one pass.
# The lookups use ``get``, so an absent key is tolerated here rather than
# raising; the values are checked later before they are used.
hub_username, project_name, hub_token = (
    st.session_state.get(state_key)
    for state_key in ("hub_username", "project_name", "hub_token")
)

st.divider()
st.markdown("#### 🤖 Inference configuration")

st.write(
    "Add the url of the Huggingface inference API or endpoint that your pipeline should use. You can find compatible models here:"
)

# (description, endpoint URL) pairs listed in the expander, ordered from the
# most to the least resource-hungry suggestion.
recommended_models = [
    (
        "🔋Projects with sufficient resources could take advantage of LLama3 70b",
        "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B",
    ),
    (
        "🪫Projects with less resources could take advantage of LLama 3 8b",
        "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B",
    ),
    (
        "🍃Projects with even less resources could take advantage of Phi-2",
        "https://api-inference.huggingface.co/models/microsoft/phi-2",
    ),
]

with st.expander("🤗 Recommended Models"):
    st.write("All inference endpoint compatible models can be found via the link below")
    st.link_button(
        "🤗 Inference compatible models on the hub",
        "https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending",
    )

    # Each suggestion is a short description followed by a copyable endpoint URL.
    for model_description, model_endpoint in recommended_models:
        st.write(model_description)
        st.code(model_endpoint)

    st.write("Note Huggingface Pro gives access to more compute resources")
    st.link_button(
        "🤗 Huggingface Pro",
        "https://huggingface.co/pricing",
    )

# Endpoint the generated pipeline command will point at; pre-filled with a
# default model URL that the user can overwrite.
base_url = st.text_input(
    label="Base URL for the Inference API",
    value="https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta",
)
st.divider()
st.markdown("#### 🔬 Argilla API details to push the generated dataset")

# Argilla connection details. Each field is pre-filled with a default value
# (the imported ARGILLA_URL, a placeholder API key, and the project name read
# from the session state).
argilla_url = st.text_input(label="Argilla API URL", value=ARGILLA_URL)
argilla_api_key = st.text_input(label="Argilla API Key", value="owner.apikey")
argilla_dataset_name = st.text_input(
    label="Argilla Dataset Name",
    value=project_name,
)

st.divider()
###############################################################
# LOCAL
###############################################################

st.markdown("## Run the pipeline")
st.markdown(
    "Once you've defined the pipeline configuration above, you can run the pipeline from your local machine."
)

# Every value interpolated into the snippets below must be present, otherwise
# the rendered commands would contain "None" or an empty argument.
# ``hub_username`` is part of the clone URL, so it is validated as well.
required_values = [
    argilla_api_key,
    argilla_url,
    base_url,
    hub_username,
    project_name,
    hub_token,
    argilla_dataset_name,
]

if all(required_values):
    st.markdown(
        "To run the pipeline locally, you need to have the `distilabel` library installed. You can install it using the following command:"
    )
    install_snippet = "\n".join(
        [
            "",
            "# Install the distilabel library",
            "pip install git+https://github.com/argilla-io/distilabel.git",
            "",
        ]
    )
    st.code(install_snippet, language="bash")

    st.markdown("Next, you'll need to clone your dataset repo and run the pipeline:")

    # The invocation spans several lines for readability. Each line except the
    # last ends with a backslash so the shell treats it as ONE command when the
    # snippet is copy-pasted.
    run_command = " \\\n    ".join(
        [
            "python pipeline.py",
            f"--argilla-api-key {argilla_api_key}",
            f"--argilla-api-url {argilla_url}",
            f"--argilla-dataset-name {argilla_dataset_name}",
            f"--endpoint-base-url {base_url}",
            f"--hub-token {hub_token}",
        ]
    )
    clone_and_run_snippet = "\n".join(
        [
            "",
            "# Clone the project and install the requirements",
            f"git clone https://huggingface.co/datasets/{hub_username}/{project_name}",
            f"cd {project_name}",
            "pip install -r requirements.txt",
            "# Run the pipeline",
            run_command,
            "",
        ]
    )
    st.code(clone_and_run_snippet, language="bash")

    st.markdown(
        "👩🚀 If you want to customise the pipeline take a look in `pipeline.py` and the [distilabel docs](https://distilabel.argilla.io/)"
    )
else:
    st.info("Please fill all the required fields.")
|