Upload 5 files
Browse files- app.py +90 -66
- hub.py +32 -98
- project_config.json +1 -1
- seed_data.json +2 -26
app.py
CHANGED
@@ -1,94 +1,118 @@
|
|
1 |
-
import
|
2 |
|
3 |
-
from
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
ARGILLA_URL,
|
8 |
-
PROJECT_SPACE_REPO_ID,
|
9 |
-
DIBT_PARENT_APP_URL,
|
10 |
)
|
11 |
-
from utils import project_sidebar
|
12 |
|
13 |
-
|
14 |
|
15 |
-
project_sidebar()
|
16 |
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
)
|
21 |
-
st.stop()
|
22 |
|
|
|
|
|
|
|
23 |
|
24 |
st.header("🧑‍🌾 Domain Data Grower")
|
25 |
st.divider()
|
26 |
|
27 |
-
st.
|
28 |
-
"""
|
29 |
-
## 🌱 Create a dataset seed for aligning models to a specific domain
|
30 |
-
|
31 |
-
This app helps you create a dataset seed for building diverse domain-specific datasets for aligning models.
|
32 |
-
Alignment datasets are used to fine-tune models to a specific domain or task, but as yet, there's a shortage of diverse datasets for this purpose.
|
33 |
-
"""
|
34 |
-
)
|
35 |
-
st.markdown(
|
36 |
-
"""
|
37 |
-
## π How it works
|
38 |
-
|
39 |
-
You can create a dataset seed by defining the domain expertise, perspectives, topics, and examples for your domain-specific dataset.
|
40 |
-
The dataset seed is then used to generate synthetic data for training a language model.
|
41 |
-
|
42 |
-
"""
|
43 |
)
|
44 |
-
st.markdown(
|
45 |
-
"""
|
46 |
-
## 🗺️ The process
|
47 |
|
48 |
-
|
|
|
|
|
49 |
|
50 |
-
|
51 |
-
"""
|
52 |
-
)
|
53 |
-
st.link_button("π ~~Setup Project via the parent app~~", DIBT_PARENT_APP_URL)
|
54 |
|
55 |
st.markdown(
|
56 |
-
"""
|
57 |
-
|
|
|
|
|
|
|
58 |
|
59 |
-
|
60 |
-
You can collaborate with domain experts to define the domain expertise and perspectives.
|
61 |
"""
|
62 |
)
|
63 |
|
64 |
st.page_link(
|
65 |
-
"pages
|
66 |
-
label="
|
67 |
-
icon="
|
68 |
)
|
69 |
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
|
74 |
-
|
75 |
-
You can run the pipeline locally or in this space to generate synthetic data.
|
76 |
-
"""
|
77 |
-
)
|
78 |
|
79 |
-
st.
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
)
|
84 |
|
85 |
-
st.
|
86 |
-
""
|
87 |
-
### Step 4: Review the Dataset
|
88 |
|
89 |
-
|
|
|
|
|
|
|
90 |
|
|
|
|
|
|
|
91 |
|
92 |
-
""
|
93 |
-
|
94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
|
3 |
+
from hub import (
|
4 |
+
setup_dataset_on_hub,
|
5 |
+
duplicate_space_on_hub,
|
6 |
+
add_project_config_to_space_repo,
|
|
|
|
|
|
|
7 |
)
|
|
|
8 |
|
9 |
+
import streamlit as st
|
10 |
|
|
|
11 |
|
12 |
+
# Constants
|
13 |
+
# Written here to avoid defaults.py
|
14 |
+
DEFAULT_DOMAIN = "farming"
|
|
|
|
|
15 |
|
16 |
+
st.set_page_config(
|
17 |
+
"Domain Data Grower", page_icon="🧑‍🌾", initial_sidebar_state="collapsed"
|
18 |
+
)
|
19 |
|
20 |
st.header("🧑‍🌾 Domain Data Grower")
|
21 |
st.divider()
|
22 |
|
23 |
+
st.sidebar.link_button(
|
24 |
+
"🤗 Get your Hub Token", "https://huggingface.co/settings/tokens"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
)
|
|
|
|
|
|
|
26 |
|
27 |
+
################################################################################
|
28 |
+
# APP MARKDOWN
|
29 |
+
################################################################################
|
30 |
|
31 |
+
st.header("🌱 Create a domain specific dataset")
|
|
|
|
|
|
|
32 |
|
33 |
st.markdown(
|
34 |
+
"""This space will set up your domain specific dataset project. It will
|
35 |
+
create the resources that you need to build a dataset. Those resources include:
|
36 |
+
|
37 |
+
- A dataset repository on the Hub
|
38 |
+
- Another space to define expert domain and run generation pipelines
|
39 |
|
40 |
+
For a complete overview of the project. Check out the README
|
|
|
41 |
"""
|
42 |
)
|
43 |
|
44 |
st.page_link(
|
45 |
+
"pages/🧑‍🌾 Domain Data Grower.py",
|
46 |
+
label="Domain Data Grower",
|
47 |
+
icon="🧑‍🌾",
|
48 |
)
|
49 |
|
50 |
+
################################################################################
|
51 |
+
# CONFIGURATION
|
52 |
+
################################################################################
|
53 |
|
54 |
+
st.subheader("πΎ Project Configuration")
|
|
|
|
|
|
|
55 |
|
56 |
+
project_name = st.text_input("Project Name", DEFAULT_DOMAIN)
|
57 |
+
hub_username = st.text_input("Hub Username", "argilla")
|
58 |
+
hub_token = st.text_input("Hub Token", type="password")
|
59 |
+
private_selector = st.checkbox("Private Space", value=False)
|
|
|
60 |
|
61 |
+
if st.button("🤗 Setup Project Resources"):
|
62 |
+
repo_id = f"{hub_username}/{project_name}"
|
|
|
63 |
|
64 |
+
setup_dataset_on_hub(
|
65 |
+
repo_id=repo_id,
|
66 |
+
hub_token=hub_token,
|
67 |
+
)
|
68 |
|
69 |
+
st.success(
|
70 |
+
f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name}). Hold on the repo_id: {repo_id}, we will need it in the next steps."
|
71 |
+
)
|
72 |
|
73 |
+
space_name = f"{project_name}_config_space"
|
74 |
+
|
75 |
+
duplicate_space_on_hub(
|
76 |
+
source_repo="argilla/domain-specific-datasets-template",
|
77 |
+
target_repo=space_name,
|
78 |
+
hub_token=hub_token,
|
79 |
+
private=private_selector,
|
80 |
+
)
|
81 |
+
|
82 |
+
st.success(
|
83 |
+
f"Configuration Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{space_name})."
|
84 |
+
)
|
85 |
+
|
86 |
+
argilla_name = f"{project_name}_argilla_space"
|
87 |
+
|
88 |
+
duplicate_space_on_hub(
|
89 |
+
source_repo="argilla/argilla-template-space",
|
90 |
+
target_repo=argilla_name,
|
91 |
+
hub_token=hub_token,
|
92 |
+
private=private_selector,
|
93 |
+
)
|
94 |
+
|
95 |
+
st.success(
|
96 |
+
f"Argilla Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{argilla_name})."
|
97 |
+
)
|
98 |
+
|
99 |
+
seconds = 5
|
100 |
+
|
101 |
+
with st.spinner(f"Adding project configuration to spaces in {seconds} seconds"):
|
102 |
+
time.sleep(seconds)
|
103 |
+
add_project_config_to_space_repo(
|
104 |
+
dataset_repo_id=repo_id,
|
105 |
+
hub_token=hub_token,
|
106 |
+
project_name=project_name,
|
107 |
+
argilla_space_repo_id=f"{hub_username}/{argilla_name}",
|
108 |
+
project_space_repo_id=f"{hub_username}/{space_name}",
|
109 |
+
)
|
110 |
+
|
111 |
+
st.subheader("π’ Next Steps")
|
112 |
+
|
113 |
+
st.write("Go to you project specific space!")
|
114 |
+
|
115 |
+
st.link_button(
|
116 |
+
"🧑‍🌾 Open Configuration Space",
|
117 |
+
f"https://huggingface.co/spaces/{hub_username}/{space_name}",
|
118 |
+
)
|
hub.py
CHANGED
@@ -1,43 +1,10 @@
|
|
1 |
import json
|
2 |
-
from tempfile import mktemp
|
3 |
|
4 |
-
import
|
5 |
-
from huggingface_hub import HfApi
|
6 |
-
|
7 |
-
from defaults import REMOTE_CODE_PATHS, SEED_DATA_PATH
|
8 |
|
9 |
|
10 |
hf_api = HfApi()
|
11 |
|
12 |
-
with open("DATASET_README_BASE.md") as f:
|
13 |
-
DATASET_README_BASE = f.read()
|
14 |
-
|
15 |
-
|
16 |
-
def create_readme(domain_seed_data, project_name, domain):
|
17 |
-
# create a readme for the project that shows the domain and project name
|
18 |
-
readme = DATASET_README_BASE
|
19 |
-
readme += f"# {project_name}\n\n## Domain: {domain}"
|
20 |
-
perspectives = domain_seed_data.get("perspectives")
|
21 |
-
topics = domain_seed_data.get("topics")
|
22 |
-
examples = domain_seed_data.get("examples")
|
23 |
-
if perspectives:
|
24 |
-
readme += "\n\n## Perspectives\n\n"
|
25 |
-
for p in perspectives:
|
26 |
-
readme += f"- {p}\n"
|
27 |
-
if topics:
|
28 |
-
readme += "\n\n## Topics\n\n"
|
29 |
-
for t in topics:
|
30 |
-
readme += f"- {t}\n"
|
31 |
-
if examples:
|
32 |
-
readme += "\n\n## Examples\n\n"
|
33 |
-
for example in examples:
|
34 |
-
readme += f"### {example['question']}\n\n{example['answer']}\n\n"
|
35 |
-
temp_file = mktemp()
|
36 |
-
|
37 |
-
with open(temp_file, "w") as f:
|
38 |
-
f.write(readme)
|
39 |
-
return temp_file
|
40 |
-
|
41 |
|
42 |
def setup_dataset_on_hub(repo_id, hub_token):
|
43 |
# create an empty dataset repo on the hub
|
@@ -45,85 +12,52 @@ def setup_dataset_on_hub(repo_id, hub_token):
|
|
45 |
repo_id=repo_id,
|
46 |
token=hub_token,
|
47 |
repo_type="dataset",
|
48 |
-
exist_ok=True,
|
49 |
)
|
50 |
|
51 |
-
|
52 |
-
def push_dataset_to_hub(
|
53 |
-
domain_seed_data_path,
|
54 |
-
project_name,
|
55 |
-
domain,
|
56 |
-
pipeline_path,
|
57 |
-
hub_username,
|
58 |
-
hub_token: str,
|
59 |
-
):
|
60 |
-
repo_id = f"{hub_username}/{project_name}"
|
61 |
-
|
62 |
-
setup_dataset_on_hub(repo_id=repo_id, hub_token=hub_token)
|
63 |
-
|
64 |
-
# upload the seed data and readme to the hub
|
65 |
hf_api.upload_file(
|
66 |
-
path_or_fileobj=
|
67 |
path_in_repo="seed_data.json",
|
68 |
-
token=hub_token,
|
69 |
repo_id=repo_id,
|
70 |
repo_type="dataset",
|
|
|
71 |
)
|
72 |
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
),
|
79 |
-
path_in_repo="README.md",
|
80 |
token=hub_token,
|
81 |
-
|
82 |
-
|
83 |
)
|
84 |
|
85 |
|
86 |
-
def
|
87 |
-
|
88 |
-
|
89 |
-
hub_token: str,
|
90 |
project_name,
|
|
|
|
|
91 |
):
|
92 |
-
|
93 |
-
|
94 |
-
# upload the pipeline to the hub
|
95 |
-
hf_api.upload_file(
|
96 |
-
path_or_fileobj=pipeline_path,
|
97 |
-
path_in_repo="pipeline.yaml",
|
98 |
-
token=hub_token,
|
99 |
-
repo_id=repo_id,
|
100 |
-
repo_type="dataset",
|
101 |
-
)
|
102 |
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
|
|
|
|
110 |
)
|
111 |
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
repo_id=repo_id, token=hub_token, repo_type="dataset", filename=SEED_DATA_PATH
|
119 |
)
|
120 |
-
return json.load(open(SEED_DATA_PATH))
|
121 |
-
|
122 |
-
|
123 |
-
def push_argilla_dataset_to_hub(
|
124 |
-
name: str, repo_id: str, url: str, api_key: str, workspace: str = "admin"
|
125 |
-
):
|
126 |
-
rg.init(api_url=url, api_key=api_key)
|
127 |
-
feedback_dataset = rg.FeedbackDataset.from_argilla(name=name, workspace=workspace)
|
128 |
-
local_dataset = feedback_dataset.pull()
|
129 |
-
local_dataset.push_to_huggingface(repo_id=repo_id)
|
|
|
1 |
import json
|
|
|
2 |
|
3 |
+
from huggingface_hub import duplicate_space, HfApi
|
|
|
|
|
|
|
4 |
|
5 |
|
6 |
hf_api = HfApi()
|
7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
def setup_dataset_on_hub(repo_id, hub_token):
|
10 |
# create an empty dataset repo on the hub
|
|
|
12 |
repo_id=repo_id,
|
13 |
token=hub_token,
|
14 |
repo_type="dataset",
|
|
|
15 |
)
|
16 |
|
17 |
+
# upload the seed data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
hf_api.upload_file(
|
19 |
+
path_or_fileobj="seed_data.json",
|
20 |
path_in_repo="seed_data.json",
|
|
|
21 |
repo_id=repo_id,
|
22 |
repo_type="dataset",
|
23 |
+
token=hub_token,
|
24 |
)
|
25 |
|
26 |
+
|
27 |
+
def duplicate_space_on_hub(source_repo, target_repo, hub_token, private=False):
|
28 |
+
duplicate_space(
|
29 |
+
from_id=source_repo,
|
30 |
+
to_id=target_repo,
|
|
|
|
|
31 |
token=hub_token,
|
32 |
+
private=private,
|
33 |
+
exist_ok=True,
|
34 |
)
|
35 |
|
36 |
|
37 |
+
def add_project_config_to_space_repo(
|
38 |
+
dataset_repo_id,
|
39 |
+
hub_token,
|
|
|
40 |
project_name,
|
41 |
+
argilla_space_repo_id,
|
42 |
+
project_space_repo_id,
|
43 |
):
|
44 |
+
# upload the seed data and readme to the hub
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
+
with open("project_config.json", "w") as f:
|
47 |
+
json.dump(
|
48 |
+
{
|
49 |
+
"project_name": project_name,
|
50 |
+
"argilla_space_repo_id": argilla_space_repo_id,
|
51 |
+
"project_space_repo_id": project_space_repo_id,
|
52 |
+
"dataset_repo_id": dataset_repo_id,
|
53 |
+
},
|
54 |
+
f,
|
55 |
)
|
56 |
|
57 |
+
hf_api.upload_file(
|
58 |
+
path_or_fileobj="project_config.json",
|
59 |
+
path_in_repo="project_config.json",
|
60 |
+
token=hub_token,
|
61 |
+
repo_id=project_space_repo_id,
|
62 |
+
repo_type="space",
|
|
|
63 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
project_config.json
CHANGED
@@ -1 +1 @@
|
|
1 |
-
{"project_name": "
|
|
|
1 |
+
{"project_name": "farming", "argilla_space_repo_id": "ignacioct/farming_argilla_space", "project_space_repo_id": "ignacioct/farming_config_space", "dataset_repo_id": "ignacioct/farming"}
|
seed_data.json
CHANGED
@@ -1,39 +1,15 @@
|
|
1 |
{
|
2 |
"domain": "farming",
|
3 |
"perspectives": [
|
4 |
-
"Family Farming"
|
5 |
-
"Agribusiness",
|
6 |
-
"Permaculture",
|
7 |
-
"Agroforestery",
|
8 |
-
"Conventional Farming"
|
9 |
],
|
10 |
"topics": [
|
11 |
-
"animal welfare"
|
12 |
-
"economic growth",
|
13 |
-
"land",
|
14 |
-
"resources",
|
15 |
-
"efficiency"
|
16 |
],
|
17 |
"examples": [
|
18 |
{
|
19 |
"question": "Compare and contrast the environmental footprint of industrial and small-scale farming.",
|
20 |
"answer": "Regenerative agriculture practices aim to restore soil health through methods that increase soil organic matter, enhance microbial activity, and improve soil structure. These practices include no-till farming, cover cropping, diverse crop rotations, and integrated livestock management. According to LaCanne and Lundgren (2018), soil health improves due to increased biodiversity and organic matter, enhancing its water retention and nutrient efficiency. Moreover, Jones (2012) in \"Soil carbon & organic farming\" reports that these practices significantly elevate biodiversity, both above and below the soil surface, promoting resilient ecosystems and agroecological balances."
|
21 |
-
},
|
22 |
-
{
|
23 |
-
"question": "Compare the environmental footprint of small-scale, local farming versus large-scale, industrial agriculture.",
|
24 |
-
"answer": "Industrial agriculture typically emphasizes high-output, monoculture farming reliant on synthetic fertilizers and pesticides, which, as Horrigan, Lawrence, and Walker (2002) argue, leads to greater greenhouse gas emissions, higher energy use, and more water consumption compared to small-scale farming. In contrast, small-scale farms often employ diverse cropping systems and lower chemical inputs, resulting in a smaller environmental footprint. Pimentel et al. (2005) note that small-scale farms tend to have higher yields per unit area when environmental and sustainability factors are integrated into farming practices."
|
25 |
-
},
|
26 |
-
{
|
27 |
-
"question": "Analyze the economic implications of transitioning from conventional to organic farming.",
|
28 |
-
"answer": "Transitioning from conventional to organic farming involves significant changes in farm management, input use, and market engagement. Crowder and Reganold (2015) present evidence that organic farms often yield smaller outputs initially but achieve higher profitability due to premium prices, lower input costs, and improved soil health over time. However, this transition requires upfront investments in knowledge and infrastructure, which can be economically challenging for some farmers, as noted by Seufert and Ramankutty (2017)."
|
29 |
-
},
|
30 |
-
{
|
31 |
-
"question": "Analyze the social, economic and environnmental impacts of land consolidation vs small-scale farmers.",
|
32 |
-
"answer": "Land consolidation has been associated with increased agricultural productivity but also with negative social and environmental impacts. Larger land holdings typically lead to monocultures, which reduce biodiversity and increase vulnerability to pests and diseases, as highlighted by Li et al. (2017). Economically, while consolidation can lead to economies of scale and potential gains in gross margins, it often displaces rural populations, exacerbating poverty and reducing local food diversity (Sutherland et al., 2015)."
|
33 |
-
},
|
34 |
-
{
|
35 |
-
"question": "Investigate the relationship between land ownership patterns, agricultural productivity and environment sustainability. ",
|
36 |
-
"answer": "Land ownership patterns critically influence agricultural productivity and sustainability. Secure land tenure supports investments in long-term improvements such as soil conservation and water management, which are pivotal for sustainable outcomes. Studies by Barrett et al. (2010) demonstrate that fragmented land ownership often results in inefficient resource use and higher transaction costs, which can detract from sustainability goals."
|
37 |
}
|
38 |
],
|
39 |
"domain_expert_prompt": "You will be asked about family farming and agribusiness related topics, from different perspectives.\n Your answer should be logical and supported by facts, don't fabricate arguments. \n Try to gather a diverse point of view taking into account current theories in agronomy, biology, economics, anthropology and ecology."
|
|
|
1 |
{
|
2 |
"domain": "farming",
|
3 |
"perspectives": [
|
4 |
+
"Family Farming"
|
|
|
|
|
|
|
|
|
5 |
],
|
6 |
"topics": [
|
7 |
+
"animal welfare"
|
|
|
|
|
|
|
|
|
8 |
],
|
9 |
"examples": [
|
10 |
{
|
11 |
"question": "Compare and contrast the environmental footprint of industrial and small-scale farming.",
|
12 |
"answer": "Regenerative agriculture practices aim to restore soil health through methods that increase soil organic matter, enhance microbial activity, and improve soil structure. These practices include no-till farming, cover cropping, diverse crop rotations, and integrated livestock management. According to LaCanne and Lundgren (2018), soil health improves due to increased biodiversity and organic matter, enhancing its water retention and nutrient efficiency. Moreover, Jones (2012) in \"Soil carbon & organic farming\" reports that these practices significantly elevate biodiversity, both above and below the soil surface, promoting resilient ecosystems and agroecological balances."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
}
|
14 |
],
|
15 |
"domain_expert_prompt": "You will be asked about family farming and agribusiness related topics, from different perspectives.\n Your answer should be logical and supported by facts, don't fabricate arguments. \n Try to gather a diverse point of view taking into account current theories in agronomy, biology, economics, anthropology and ecology."
|