burtenshaw HF staff committed on
Commit
4b83e74
·
verified ·
1 Parent(s): 8773ff3

Upload 16 files

Browse files
defaults.py CHANGED
@@ -1,12 +1,14 @@
 
1
  import json
2
 
3
  SEED_DATA_PATH = "seed_data.json"
4
  PIPELINE_PATH = "pipeline.yaml"
5
- REMOTE_CODE_PATHS = ["defaults.py", "domain.py", "pipeline.py"]
6
  DIBT_PARENT_APP_URL = "https://argilla-domain-specific-datasets-welcome.hf.space/"
7
  N_PERSPECTIVES = 5
8
  N_TOPICS = 5
9
  N_EXAMPLES = 5
 
10
 
11
  ################################################
12
  # DEFAULTS ON FARMING
@@ -25,14 +27,23 @@ DEFAULT_SYSTEM_PROMPT = DEFAULT_DATA["domain_expert_prompt"]
25
  # PROJECT CONFIG FROM PARENT APP
26
  ################################################
27
 
28
- with open("project_config.json") as f:
29
- PROJECT_CONFIG = json.load(f)
 
30
 
31
- PROJECT_NAME = PROJECT_CONFIG["project_name"]
32
- ARGILLA_SPACE_REPO_ID = PROJECT_CONFIG["argilla_space_repo_id"]
33
- DATASET_REPO_ID = PROJECT_CONFIG["dataset_repo_id"]
34
- ARGILLA_SPACE_NAME = ARGILLA_SPACE_REPO_ID.replace("/", "-").replace("_", "-")
35
- ARGILLA_URL = f"https://{ARGILLA_SPACE_NAME}.hf.space"
36
- PROJECT_SPACE_REPO_ID = PROJECT_CONFIG["project_space_repo_id"]
37
- DATASET_URL = f"https://huggingface.co/datasets/{DATASET_REPO_ID}"
38
- HUB_USERNAME = DATASET_REPO_ID.split("/")[0]
 
 
 
 
 
 
 
 
 
1
+ import os
2
  import json
3
 
4
  SEED_DATA_PATH = "seed_data.json"
5
  PIPELINE_PATH = "pipeline.yaml"
6
+ REMOTE_CODE_PATHS = ["defaults.py", "domain.py", "pipeline.py", "requirements.txt"]
7
  DIBT_PARENT_APP_URL = "https://argilla-domain-specific-datasets-welcome.hf.space/"
8
  N_PERSPECTIVES = 5
9
  N_TOPICS = 5
10
  N_EXAMPLES = 5
11
+ CODELESS_DISTILABEL = os.environ.get("CODELESS_DISTILABEL", True)
12
 
13
  ################################################
14
  # DEFAULTS ON FARMING
 
27
  # PROJECT CONFIG FROM PARENT APP
28
  ################################################
29
 
30
+ try:
31
+ with open("project_config.json") as f:
32
+ PROJECT_CONFIG = json.load(f)
33
 
34
+ PROJECT_NAME = PROJECT_CONFIG["project_name"]
35
+ ARGILLA_SPACE_REPO_ID = PROJECT_CONFIG["argilla_space_repo_id"]
36
+ DATASET_REPO_ID = PROJECT_CONFIG["dataset_repo_id"]
37
+ ARGILLA_SPACE_NAME = ARGILLA_SPACE_REPO_ID.replace("/", "-").replace("_", "-")
38
+ ARGILLA_URL = f"https://{ARGILLA_SPACE_NAME}.hf.space"
39
+ PROJECT_SPACE_REPO_ID = PROJECT_CONFIG["project_space_repo_id"]
40
+ DATASET_URL = f"https://huggingface.co/datasets/{DATASET_REPO_ID}"
41
+ HUB_USERNAME = DATASET_REPO_ID.split("/")[0]
42
+ except FileNotFoundError:
43
+ PROJECT_NAME = "DEFAULT_DOMAIN"
44
+ ARGILLA_SPACE_REPO_ID = ""
45
+ DATASET_REPO_ID = ""
46
+ ARGILLA_URL = ""
47
+ PROJECT_SPACE_REPO_ID = ""
48
+ DATASET_URL = ""
49
+ HUB_USERNAME = ""
pages/2_👩🏼‍🔬 Describe Domain.py CHANGED
@@ -14,7 +14,6 @@ from defaults import (
14
  N_TOPICS,
15
  SEED_DATA_PATH,
16
  PIPELINE_PATH,
17
- PROJECT_NAME,
18
  DATASET_REPO_ID,
19
  )
20
  from utils import project_sidebar
@@ -231,9 +230,18 @@ if st.button("🤗 Push Dataset Seed") and all(
231
  pipeline_path=PIPELINE_PATH,
232
  )
233
 
234
- st.sidebar.success(
235
  f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name})"
236
  )
 
 
 
 
 
 
 
 
 
237
  else:
238
  st.info(
239
  "Please fill in all the required domain fields to push the dataset seed to the Hub"
 
14
  N_TOPICS,
15
  SEED_DATA_PATH,
16
  PIPELINE_PATH,
 
17
  DATASET_REPO_ID,
18
  )
19
  from utils import project_sidebar
 
230
  pipeline_path=PIPELINE_PATH,
231
  )
232
 
233
+ st.success(
234
  f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name})"
235
  )
236
+
237
+ st.write("You can now move on to runnning your distilabel pipeline.")
238
+
239
+ st.page_link(
240
+ page="pages/3_🌱 Generate Dataset.py",
241
+ label="Generate Dataset",
242
+ icon="🌱",
243
+ )
244
+
245
  else:
246
  st.info(
247
  "Please fill in all the required domain fields to push the dataset seed to the Hub"
pages/3_🌱 Generate Dataset.py CHANGED
@@ -1,17 +1,13 @@
1
  import streamlit as st
2
- from streamlit.errors import EntryNotFoundError
3
 
4
  from hub import pull_seed_data_from_repo, push_pipeline_to_hub
5
  from defaults import (
6
  DEFAULT_SYSTEM_PROMPT,
7
  PIPELINE_PATH,
8
  PROJECT_NAME,
9
- ARGILLA_SPACE_REPO_ID,
10
- DATASET_REPO_ID,
11
- ARGILLA_SPACE_NAME,
12
  ARGILLA_URL,
13
- PROJECT_SPACE_REPO_ID,
14
  HUB_USERNAME,
 
15
  )
16
  from utils import project_sidebar
17
 
@@ -75,20 +71,21 @@ st.divider()
75
  st.markdown("### Run the pipeline")
76
 
77
  st.write(
78
- "Once you've defined the pipeline configuration, you can run the pipeline locally or on this space."
79
  )
80
 
81
- st.write(
82
- """We recommend running the pipeline locally if you're planning on generating a large dataset. \
83
- But running the pipeline on this space is a handy way to get started quickly. Your synthetic
84
- samples will be pushed to Argilla and available for review.
85
- """
86
- )
87
- st.write(
88
- """If you're planning on running the pipeline on the space, be aware that it \
89
- will take some time to complete and you will need to maintain a \
90
- connection to the space."""
91
- )
 
92
 
93
 
94
  if st.button("💻 Run pipeline locally", key="run_pipeline_local"):
@@ -147,13 +144,16 @@ if st.button("💻 Run pipeline locally", key="run_pipeline_local"):
147
  hub_token=hub_token,
148
  pipeline_config_path=PIPELINE_PATH,
149
  argilla_dataset_name=argilla_dataset_name,
 
 
150
  )
151
  st.code(
152
  f"""
153
  pip install git+https://github.com/argilla-io/distilabel.git
154
- git clone https://huggingface.co/{hub_username}/{project_name}
155
  cd {project_name}
156
- {' '.join(command_to_run[2:])}
 
157
  """,
158
  language="bash",
159
  )
@@ -163,57 +163,57 @@ if st.button("💻 Run pipeline locally", key="run_pipeline_local"):
163
  ###############################################################
164
  # SPACE
165
  ###############################################################
166
-
167
- if st.button("🔥 Run pipeline right here, right now!"):
168
- if all(
169
- [
170
- argilla_api_key,
171
- argilla_url,
172
- base_url,
173
- hub_username,
174
- project_name,
175
- hub_token,
176
- argilla_dataset_name,
177
- ]
178
- ):
179
- with st.spinner("Pulling seed data from the Hub..."):
180
- try:
181
  seed_data = pull_seed_data_from_repo(
182
  repo_id=f"{hub_username}/{project_name}",
183
  hub_token=hub_token,
184
  )
185
- except EntryNotFoundError:
186
- st.error(
187
- "Seed data not found. Please make sure you pushed the data seed in Step 2."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
  )
189
 
190
- domain = seed_data["domain"]
191
- perspectives = seed_data["perspectives"]
192
- topics = seed_data["topics"]
193
- examples = seed_data["examples"]
194
- domain_expert_prompt = seed_data["domain_expert_prompt"]
195
-
196
- with st.spinner("Serializing the pipeline configuration..."):
197
- serialize_pipeline(
198
- argilla_api_key=argilla_api_key,
199
- argilla_dataset_name=argilla_dataset_name,
200
- argilla_api_url=argilla_url,
201
- topics=topics,
202
- perspectives=perspectives,
203
- pipeline_config_path=PIPELINE_PATH,
204
- domain_expert_prompt=domain_expert_prompt or DEFAULT_SYSTEM_PROMPT,
205
- hub_token=hub_token,
206
- endpoint_base_url=base_url,
207
- examples=examples,
208
- )
209
-
210
- with st.spinner("Starting the pipeline..."):
211
- logs = run_pipeline(PIPELINE_PATH)
212
 
213
- st.success(f"Pipeline started successfully! 🚀")
214
 
215
- with st.expander(label="View Logs", expanded=True):
216
- for out in logs:
217
- st.text(out)
218
- else:
219
- st.error("Please fill all the required fields.")
 
1
  import streamlit as st
 
2
 
3
  from hub import pull_seed_data_from_repo, push_pipeline_to_hub
4
  from defaults import (
5
  DEFAULT_SYSTEM_PROMPT,
6
  PIPELINE_PATH,
7
  PROJECT_NAME,
 
 
 
8
  ARGILLA_URL,
 
9
  HUB_USERNAME,
10
+ CODELESS_DISTILABEL,
11
  )
12
  from utils import project_sidebar
13
 
 
71
  st.markdown("### Run the pipeline")
72
 
73
  st.write(
74
+ "Once you've defined the pipeline configuration, you can run the pipeline from your local machine."
75
  )
76
 
77
+ if CODELESS_DISTILABEL:
78
+ st.write(
79
+ """We recommend running the pipeline locally if you're planning on generating a large dataset. \
80
+ But running the pipeline on this space is a handy way to get started quickly. Your synthetic
81
+ samples will be pushed to Argilla and available for review.
82
+ """
83
+ )
84
+ st.write(
85
+ """If you're planning on running the pipeline on the space, be aware that it \
86
+ will take some time to complete and you will need to maintain a \
87
+ connection to the space."""
88
+ )
89
 
90
 
91
  if st.button("💻 Run pipeline locally", key="run_pipeline_local"):
 
144
  hub_token=hub_token,
145
  pipeline_config_path=PIPELINE_PATH,
146
  argilla_dataset_name=argilla_dataset_name,
147
+ argilla_api_key=argilla_api_key,
148
+ argilla_api_url=argilla_url,
149
  )
150
  st.code(
151
  f"""
152
  pip install git+https://github.com/argilla-io/distilabel.git
153
+ git clone https://huggingface.co/datasets/{hub_username}/{project_name}
154
  cd {project_name}
155
+ pip install -r requirements.txt
156
+ {' '.join(["python"] + command_to_run[1:])}
157
  """,
158
  language="bash",
159
  )
 
163
  ###############################################################
164
  # SPACE
165
  ###############################################################
166
+ if CODELESS_DISTILABEL:
167
+ if st.button("🔥 Run pipeline right here, right now!"):
168
+ if all(
169
+ [
170
+ argilla_api_key,
171
+ argilla_url,
172
+ base_url,
173
+ hub_username,
174
+ project_name,
175
+ hub_token,
176
+ argilla_dataset_name,
177
+ ]
178
+ ):
179
+ with st.spinner("Pulling seed data from the Hub..."):
 
180
  seed_data = pull_seed_data_from_repo(
181
  repo_id=f"{hub_username}/{project_name}",
182
  hub_token=hub_token,
183
  )
184
+ domain = seed_data["domain"]
185
+ perspectives = seed_data["perspectives"]
186
+ topics = seed_data["topics"]
187
+ examples = seed_data["examples"]
188
+ domain_expert_prompt = seed_data["domain_expert_prompt"]
189
+
190
+ with st.spinner("Serializing the pipeline configuration..."):
191
+ serialize_pipeline(
192
+ argilla_api_key=argilla_api_key,
193
+ argilla_dataset_name=argilla_dataset_name,
194
+ argilla_api_url=argilla_url,
195
+ topics=topics,
196
+ perspectives=perspectives,
197
+ pipeline_config_path=PIPELINE_PATH,
198
+ domain_expert_prompt=domain_expert_prompt or DEFAULT_SYSTEM_PROMPT,
199
+ hub_token=hub_token,
200
+ endpoint_base_url=base_url,
201
+ examples=examples,
202
  )
203
 
204
+ with st.spinner("Starting the pipeline..."):
205
+ logs = run_pipeline(
206
+ pipeline_config_path=PIPELINE_PATH,
207
+ argilla_api_key=argilla_api_key,
208
+ argilla_api_url=argilla_url,
209
+ hub_token=hub_token,
210
+ argilla_dataset_name=argilla_dataset_name,
211
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
 
213
+ st.success(f"Pipeline started successfully! 🚀")
214
 
215
+ with st.expander(label="View Logs", expanded=True):
216
+ for out in logs:
217
+ st.text(out)
218
+ else:
219
+ st.error("Please fill all the required fields.")
pipeline.py CHANGED
@@ -1,5 +1,5 @@
1
- import os
2
  import subprocess
 
3
  import time
4
  from typing import List
5
 
@@ -82,10 +82,11 @@ def define_pipeline(
82
  input_batch_size=8,
83
  input_mappings={"instruction": "evolved_questions"},
84
  output_mappings={"generation": "domain_expert_answer"},
85
- _system_prompt=domain_expert_prompt,
86
- _template=template,
87
  )
88
 
 
 
 
89
  keep_columns = KeepColumns(
90
  name="keep_columns",
91
  columns=["model_name", "evolved_questions", "domain_expert_answer"],
@@ -142,12 +143,15 @@ def serialize_pipeline(
142
 
143
 
144
  def create_pipelines_run_command(
 
 
 
145
  pipeline_config_path: str = "pipeline.yaml",
146
  argilla_dataset_name: str = "domain_specific_datasets",
147
  ):
148
  """Create the command to run the pipeline."""
149
  command_to_run = [
150
- "python",
151
  "-m",
152
  "distilabel",
153
  "pipeline",
@@ -156,24 +160,44 @@ def create_pipelines_run_command(
156
  pipeline_config_path,
157
  "--param",
158
  f"text_generation_to_argilla.dataset_name={argilla_dataset_name}",
 
 
 
 
 
 
 
 
 
 
 
159
  ]
160
  return command_to_run
161
 
162
 
163
  def run_pipeline(
 
 
 
164
  pipeline_config_path: str = "pipeline.yaml",
165
  argilla_dataset_name: str = "domain_specific_datasets",
166
  ):
167
  """Run the pipeline and yield the output as a generator of logs."""
168
 
169
  command_to_run = create_pipelines_run_command(
 
170
  pipeline_config_path=pipeline_config_path,
171
  argilla_dataset_name=argilla_dataset_name,
 
 
172
  )
173
 
174
  # Run the script file
175
  process = subprocess.Popen(
176
- command_to_run, stdout=subprocess.PIPE, stderr=subprocess.PIPE
 
 
 
177
  )
178
 
179
  while process.stdout and process.stdout.readable():
 
 
1
  import subprocess
2
+ import sys
3
  import time
4
  from typing import List
5
 
 
82
  input_batch_size=8,
83
  input_mappings={"instruction": "evolved_questions"},
84
  output_mappings={"generation": "domain_expert_answer"},
 
 
85
  )
86
 
87
+ domain_expert._system_prompt = domain_expert_prompt
88
+ domain_expert._template = template
89
+
90
  keep_columns = KeepColumns(
91
  name="keep_columns",
92
  columns=["model_name", "evolved_questions", "domain_expert_answer"],
 
143
 
144
 
145
  def create_pipelines_run_command(
146
+ hub_token: str,
147
+ argilla_api_key: str,
148
+ argilla_api_url: str,
149
  pipeline_config_path: str = "pipeline.yaml",
150
  argilla_dataset_name: str = "domain_specific_datasets",
151
  ):
152
  """Create the command to run the pipeline."""
153
  command_to_run = [
154
+ sys.executable,
155
  "-m",
156
  "distilabel",
157
  "pipeline",
 
160
  pipeline_config_path,
161
  "--param",
162
  f"text_generation_to_argilla.dataset_name={argilla_dataset_name}",
163
+ "--param",
164
+ f"text_generation_to_argilla.api_key={argilla_api_key}",
165
+ "--param",
166
+ f"text_generation_to_argilla.api_url={argilla_api_url}",
167
+ "--param",
168
+ f"self-instruct.llm.api_key={hub_token}",
169
+ "--param",
170
+ f"evol_instruction_complexity.llm.api_key={hub_token}",
171
+ "--param",
172
+ f"domain_expert.llm.api_key={hub_token}",
173
+ "--ignore-cache",
174
  ]
175
  return command_to_run
176
 
177
 
178
  def run_pipeline(
179
+ hub_token: str,
180
+ argilla_api_key: str,
181
+ argilla_api_url: str,
182
  pipeline_config_path: str = "pipeline.yaml",
183
  argilla_dataset_name: str = "domain_specific_datasets",
184
  ):
185
  """Run the pipeline and yield the output as a generator of logs."""
186
 
187
  command_to_run = create_pipelines_run_command(
188
+ hub_token=hub_token,
189
  pipeline_config_path=pipeline_config_path,
190
  argilla_dataset_name=argilla_dataset_name,
191
+ argilla_api_key=argilla_api_key,
192
+ argilla_api_url=argilla_api_url,
193
  )
194
 
195
  # Run the script file
196
  process = subprocess.Popen(
197
+ args=command_to_run,
198
+ stdout=subprocess.PIPE,
199
+ stderr=subprocess.PIPE,
200
+ env={"HF_TOKEN": hub_token},
201
  )
202
 
203
  while process.stdout and process.stdout.readable():
pipeline.yaml CHANGED
@@ -1,5 +1,5 @@
1
  distilabel:
2
- version: 1.0.0
3
  pipeline:
4
  name: farming
5
  description: null
@@ -54,7 +54,7 @@ pipeline:
54
  model_id: null
55
  endpoint_name: null
56
  endpoint_namespace: null
57
- base_url: https://hh1rkuymnetmkw9m.eu-west-1.aws.endpoints.huggingface.cloud
58
  tokenizer_id: null
59
  model_display_name: null
60
  use_openai_client: false
@@ -163,7 +163,7 @@ pipeline:
163
  model_id: null
164
  endpoint_name: null
165
  endpoint_namespace: null
166
- base_url: https://hh1rkuymnetmkw9m.eu-west-1.aws.endpoints.huggingface.cloud
167
  tokenizer_id: null
168
  model_display_name: null
169
  use_openai_client: false
@@ -390,7 +390,7 @@ pipeline:
390
  model_id: null
391
  endpoint_name: null
392
  endpoint_namespace: null
393
- base_url: https://hh1rkuymnetmkw9m.eu-west-1.aws.endpoints.huggingface.cloud
394
  tokenizer_id: null
395
  model_display_name: null
396
  use_openai_client: false
@@ -489,9 +489,9 @@ pipeline:
489
  generation: domain_expert_answer
490
  output_mappings: {}
491
  input_batch_size: 50
492
- dataset_name: farming
493
  dataset_workspace: admin
494
- api_url: https://argilla-farming.hf.space
495
  runtime_parameters_info:
496
  - name: input_batch_size
497
  optional: true
 
1
  distilabel:
2
+ version: 1.0.1
3
  pipeline:
4
  name: farming
5
  description: null
 
54
  model_id: null
55
  endpoint_name: null
56
  endpoint_namespace: null
57
+ base_url: https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2
58
  tokenizer_id: null
59
  model_display_name: null
60
  use_openai_client: false
 
163
  model_id: null
164
  endpoint_name: null
165
  endpoint_namespace: null
166
+ base_url: https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2
167
  tokenizer_id: null
168
  model_display_name: null
169
  use_openai_client: false
 
390
  model_id: null
391
  endpoint_name: null
392
  endpoint_namespace: null
393
+ base_url: https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2
394
  tokenizer_id: null
395
  model_display_name: null
396
  use_openai_client: false
 
489
  generation: domain_expert_answer
490
  output_mappings: {}
491
  input_batch_size: 50
492
+ dataset_name: test_3
493
  dataset_workspace: admin
494
+ api_url: https://burtenshaw-test-3-argilla-space.hf.space
495
  runtime_parameters_info:
496
  - name: input_batch_size
497
  optional: true
project_config.json CHANGED
@@ -1 +1 @@
1
- {"project_name": "DEFAULT_DOMAIN", "argilla_space_repo_id": "burtenshaw/domain_test_4_argilla_space", "project_space_repo_id": "burtenshaw/domain_test_4_config_space", "dataset_repo_id": "burtenshaw/domain_test_4"}
 
1
+ {"project_name": "test_3", "argilla_space_repo_id": "burtenshaw/test_3_argilla_space", "project_space_repo_id": "burtenshaw/test_3_config_space", "dataset_repo_id": "burtenshaw/test_3"}