burtenshaw HF staff committed on
Commit
798f8ba
•
1 Parent(s): 4b83e74

Upload 16 files

Browse files
pages/2_👩🏼‍🔬 Describe Domain.py CHANGED
@@ -84,13 +84,23 @@ with tab_domain_perspectives:
84
 
85
  perspectives = st.session_state.get(
86
  "perspectives",
87
- [st.text_input(f"Domain Perspective 0", value=DEFAULT_PERSPECTIVES[0])],
88
  )
 
89
 
90
- if st.button("Add New Perspective"):
 
 
 
 
 
 
 
91
  n = len(perspectives)
92
  value = DEFAULT_PERSPECTIVES[n] if n < N_PERSPECTIVES else ""
93
- perspectives.append(st.text_input(f"Domain Perspective {n}", value=""))
 
 
94
  st.session_state["perspectives"] = perspectives
95
 
96
 
@@ -104,14 +114,19 @@ with tab_domain_topics:
104
  """Topics are the main themes or subjects that are relevant to the domain. For example, the domain of farming can have topics like soil health, crop rotation, or livestock management."""
105
  )
106
  topics = st.session_state.get(
107
- "topics", [st.text_input(f"Domain Topic 0", value=DEFAULT_TOPICS[0])]
 
108
  )
109
- new_topic = st.button("Add New Topic")
 
 
 
 
110
 
111
- if new_topic:
112
  n = len(topics)
113
  value = DEFAULT_TOPICS[n] if n < N_TOPICS else ""
114
- topics.append(st.text_input(f"Domain Topic {n}", value=value))
115
  st.session_state["topics"] = topics
116
 
117
 
 
84
 
85
  perspectives = st.session_state.get(
86
  "perspectives",
87
+ [DEFAULT_PERSPECTIVES[0]],
88
  )
89
+ perspectives_container = st.container()
90
 
91
+ perspectives = [
92
+ perspectives_container.text_input(
93
+ f"Domain Perspective {i + 1}", value=perspective
94
+ )
95
+ for i, perspective in enumerate(perspectives)
96
+ ]
97
+
98
+ if st.button("Add Perspective", key="add_perspective"):
99
  n = len(perspectives)
100
  value = DEFAULT_PERSPECTIVES[n] if n < N_PERSPECTIVES else ""
101
+ perspectives.append(
102
+ perspectives_container.text_input(f"Domain Perspective {n + 1}", value="")
103
+ )
104
  st.session_state["perspectives"] = perspectives
105
 
106
 
 
114
  """Topics are the main themes or subjects that are relevant to the domain. For example, the domain of farming can have topics like soil health, crop rotation, or livestock management."""
115
  )
116
  topics = st.session_state.get(
117
+ "topics",
118
+ [DEFAULT_TOPICS[0]],
119
  )
120
+ topics_container = st.container()
121
+ topics = [
122
+ topics_container.text_input(f"Domain Topic {i + 1}", value=topic)
123
+ for i, topic in enumerate(topics)
124
+ ]
125
 
126
+ if st.button("Add Topic", key="add_topic"):
127
  n = len(topics)
128
  value = DEFAULT_TOPICS[n] if n < N_TOPICS else ""
129
+ topics.append(topics_container.text_input(f"Domain Topics {n + 1}", value=""))
130
  st.session_state["topics"] = topics
131
 
132
 
pages/3_🌱 Generate Dataset.py CHANGED
@@ -27,38 +27,57 @@ project_sidebar()
27
  st.header("🧑‍🌾 Domain Data Grower")
28
  st.divider()
29
  st.subheader("Step 3. Run the pipeline to generate synthetic data")
30
- st.write(
31
- "Define the project details, including the project name, domain, and API credentials"
32
- )
33
-
34
 
 
35
  ###############################################################
36
  # CONFIGURATION
37
  ###############################################################
38
 
39
- st.divider()
40
-
41
- st.markdown("### Pipeline Configuration")
42
 
43
- st.write("🤗 Hub details to pull the seed data")
44
  hub_username = st.text_input("Hub Username", HUB_USERNAME)
45
  project_name = st.text_input("Project Name", PROJECT_NAME)
46
  repo_id = f"{hub_username}/{project_name}"
47
  hub_token = st.text_input("Hub Token", type="password")
48
 
49
- st.write("🤖 Inference configuration")
 
 
50
 
51
  st.write(
52
  "Add the url of the Huggingface inference API or endpoint that your pipeline should use. You can find compatible models here:"
53
  )
54
- st.link_button(
55
- "🤗 Inference compaptible models on the hub",
56
- "https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending",
57
- )
58
 
59
- base_url = st.text_input("Base URL")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
- st.write("🔬 Argilla API details to push the generated dataset")
 
 
 
 
 
62
  argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
63
  argilla_api_key = st.text_input("Argilla API Key", "owner.apikey")
64
  argilla_dataset_name = st.text_input("Argilla Dataset Name", project_name)
@@ -68,7 +87,7 @@ st.divider()
68
  # LOCAL
69
  ###############################################################
70
 
71
- st.markdown("### Run the pipeline")
72
 
73
  st.write(
74
  "Once you've defined the pipeline configuration, you can run the pipeline from your local machine."
@@ -101,10 +120,15 @@ if st.button("💻 Run pipeline locally", key="run_pipeline_local"):
101
  ]
102
  ):
103
  with st.spinner("Pulling seed data from the Hub..."):
104
- seed_data = pull_seed_data_from_repo(
105
- repo_id=f"{hub_username}/{project_name}",
106
- hub_token=hub_token,
107
- )
 
 
 
 
 
108
 
109
  domain = seed_data["domain"]
110
  perspectives = seed_data["perspectives"]
@@ -177,17 +201,22 @@ if CODELESS_DISTILABEL:
177
  ]
178
  ):
179
  with st.spinner("Pulling seed data from the Hub..."):
180
- seed_data = pull_seed_data_from_repo(
181
- repo_id=f"{hub_username}/{project_name}",
182
- hub_token=hub_token,
183
- )
 
 
 
 
 
 
184
  domain = seed_data["domain"]
185
  perspectives = seed_data["perspectives"]
186
  topics = seed_data["topics"]
187
  examples = seed_data["examples"]
188
  domain_expert_prompt = seed_data["domain_expert_prompt"]
189
 
190
- with st.spinner("Serializing the pipeline configuration..."):
191
  serialize_pipeline(
192
  argilla_api_key=argilla_api_key,
193
  argilla_dataset_name=argilla_dataset_name,
 
27
  st.header("🧑‍🌾 Domain Data Grower")
28
  st.divider()
29
  st.subheader("Step 3. Run the pipeline to generate synthetic data")
30
+ st.write("Define the project repos and models that the pipeline will use.")
 
 
 
31
 
32
+ st.divider()
33
  ###############################################################
34
  # CONFIGURATION
35
  ###############################################################
36
 
37
+ st.markdown("## Pipeline Configuration")
 
 
38
 
39
+ st.markdown("#### 🤗 Hub details to pull the seed data")
40
  hub_username = st.text_input("Hub Username", HUB_USERNAME)
41
  project_name = st.text_input("Project Name", PROJECT_NAME)
42
  repo_id = f"{hub_username}/{project_name}"
43
  hub_token = st.text_input("Hub Token", type="password")
44
 
45
+ st.divider()
46
+
47
+ st.markdown("#### 🤖 Inference configuration")
48
 
49
  st.write(
50
  "Add the url of the Huggingface inference API or endpoint that your pipeline should use. You can find compatible models here:"
51
  )
 
 
 
 
52
 
53
+ with st.expander("🤗 Recommended Models"):
54
+ st.write("All inference endpoint compatible models can be found via the link below")
55
+ st.link_button(
56
+ "🤗 Inference compaptible models on the hub",
57
+ "https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending",
58
+ )
59
+ st.write("🔋Projects with sufficient resources could take advantage of LLama3 70b")
60
+ st.code("https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B")
61
+
62
+ st.write("🪫Projects with less resources could take advantage of LLama 3 8b")
63
+ st.code("https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B")
64
+
65
+ st.write("🍃Projects with even less resources could take advantage of Phi-2")
66
+ st.code("https://api-inference.huggingface.co/models/microsoft/phi-2")
67
+
68
+ st.write("Note Hugggingface Pro gives access to more compute resources")
69
+ st.link_button(
70
+ "🤗 Huggingface Pro",
71
+ "https://huggingface.co/pricing",
72
+ )
73
+
74
 
75
+ base_url = st.text_input(
76
+ label="Base URL for the Inference API",
77
+ value="https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta",
78
+ )
79
+ st.divider()
80
+ st.markdown("#### 🔬 Argilla API details to push the generated dataset")
81
  argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
82
  argilla_api_key = st.text_input("Argilla API Key", "owner.apikey")
83
  argilla_dataset_name = st.text_input("Argilla Dataset Name", project_name)
 
87
  # LOCAL
88
  ###############################################################
89
 
90
+ st.markdown("## Run the pipeline")
91
 
92
  st.write(
93
  "Once you've defined the pipeline configuration, you can run the pipeline from your local machine."
 
120
  ]
121
  ):
122
  with st.spinner("Pulling seed data from the Hub..."):
123
+ try:
124
+ seed_data = pull_seed_data_from_repo(
125
+ repo_id=f"{hub_username}/{project_name}",
126
+ hub_token=hub_token,
127
+ )
128
+ except Exception:
129
+ st.error(
130
+ "Seed data not found. Please make sure you pushed the data seed in Step 2."
131
+ )
132
 
133
  domain = seed_data["domain"]
134
  perspectives = seed_data["perspectives"]
 
201
  ]
202
  ):
203
  with st.spinner("Pulling seed data from the Hub..."):
204
+ try:
205
+ seed_data = pull_seed_data_from_repo(
206
+ repo_id=f"{hub_username}/{project_name}",
207
+ hub_token=hub_token,
208
+ )
209
+ except Exception as e:
210
+ st.error(
211
+ "Seed data not found. Please make sure you pushed the data seed in Step 2."
212
+ )
213
+
214
  domain = seed_data["domain"]
215
  perspectives = seed_data["perspectives"]
216
  topics = seed_data["topics"]
217
  examples = seed_data["examples"]
218
  domain_expert_prompt = seed_data["domain_expert_prompt"]
219
 
 
220
  serialize_pipeline(
221
  argilla_api_key=argilla_api_key,
222
  argilla_dataset_name=argilla_dataset_name,
pipeline.yaml CHANGED
@@ -54,7 +54,7 @@ pipeline:
54
  model_id: null
55
  endpoint_name: null
56
  endpoint_namespace: null
57
- base_url: https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2
58
  tokenizer_id: null
59
  model_display_name: null
60
  use_openai_client: false
@@ -163,7 +163,7 @@ pipeline:
163
  model_id: null
164
  endpoint_name: null
165
  endpoint_namespace: null
166
- base_url: https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2
167
  tokenizer_id: null
168
  model_display_name: null
169
  use_openai_client: false
@@ -390,7 +390,7 @@ pipeline:
390
  model_id: null
391
  endpoint_name: null
392
  endpoint_namespace: null
393
- base_url: https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2
394
  tokenizer_id: null
395
  model_display_name: null
396
  use_openai_client: false
 
54
  model_id: null
55
  endpoint_name: null
56
  endpoint_namespace: null
57
+ base_url: https://api-inference.huggingface.co/models/microsoft/phi-2
58
  tokenizer_id: null
59
  model_display_name: null
60
  use_openai_client: false
 
163
  model_id: null
164
  endpoint_name: null
165
  endpoint_namespace: null
166
+ base_url: https://api-inference.huggingface.co/models/microsoft/phi-2
167
  tokenizer_id: null
168
  model_display_name: null
169
  use_openai_client: false
 
390
  model_id: null
391
  endpoint_name: null
392
  endpoint_namespace: null
393
+ base_url: https://api-inference.huggingface.co/models/microsoft/phi-2
394
  tokenizer_id: null
395
  model_display_name: null
396
  use_openai_client: false
utils.py CHANGED
@@ -18,15 +18,16 @@ def project_sidebar():
18
  )
19
  st.stop()
20
 
 
21
  st.sidebar.markdown(
22
- """
23
- ## 🌱 Domain Data Grower
24
-
25
  This space helps you create a dataset seed for building diverse domain-specific datasets for aligning models.
26
  """
27
  )
28
- st.sidebar.subheader(f"Project Details: {PROJECT_NAME}")
29
  st.sidebar.link_button(f"📚 Dataset Repo", DATASET_URL)
30
  st.sidebar.link_button(f"🤖 Argilla Space", ARGILLA_URL)
31
  st.sidebar.divider()
32
  st.sidebar.link_button("🧑‍🌾 New Project", DIBT_PARENT_APP_URL)
 
 
 
 
18
  )
19
  st.stop()
20
 
21
+ st.sidebar.subheader(f"A Data Growing Project in the domain of {PROJECT_NAME}")
22
  st.sidebar.markdown(
23
+ """
 
 
24
  This space helps you create a dataset seed for building diverse domain-specific datasets for aligning models.
25
  """
26
  )
 
27
  st.sidebar.link_button(f"📚 Dataset Repo", DATASET_URL)
28
  st.sidebar.link_button(f"🤖 Argilla Space", ARGILLA_URL)
29
  st.sidebar.divider()
30
  st.sidebar.link_button("🧑‍🌾 New Project", DIBT_PARENT_APP_URL)
31
+ st.sidebar.link_button(
32
+ "🤗 Get your Hub Token", "https://huggingface.co/settings/tokens"
33
+ )