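"""Helpers for pushing seed data, pipeline files, and Argilla datasets to the Hugging Face Hub."""
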
import json
from tempfile import mktemp
import argilla as rg
from huggingface_hub import HfApi
from defaults import REMOTE_CODE_PATHS, SEED_DATA_PATH
hf_api = HfApi()

with open("DATASET_README_BASE.md") as f:
    DATASET_README_BASE = f.read()


def create_readme(domain_seed_data, project_name, domain):
    # create a readme for the project that shows the domain and project name
    readme = DATASET_README_BASE
    readme += f"# {project_name}\n\n## Domain: {domain}"
    perspectives = domain_seed_data.get("perspectives")
    topics = domain_seed_data.get("topics")
    examples = domain_seed_data.get("examples")
    if perspectives:
        readme += "\n\n## Perspectives\n\n"
        for p in perspectives:
            readme += f"- {p}\n"
    if topics:
        readme += "\n\n## Topics\n\n"
        for t in topics:
            readme += f"- {t}\n"
    if examples:
        readme += "\n\n## Examples\n\n"
        for example in examples:
            readme += f"### {example['question']}\n\n{example['answer']}\n\n"
    temp_file = mktemp()
    with open(temp_file, "w") as f:
        f.write(readme)
    return temp_file


def setup_dataset_on_hub(repo_id, hub_token):
    # create an empty dataset repo on the hub
    hf_api.create_repo(
        repo_id=repo_id,
        token=hub_token,
        repo_type="dataset",
        exist_ok=True,
    )


def push_dataset_to_hub(
    domain_seed_data_path,
    project_name,
    domain,
    pipeline_path,
    hub_username,
    hub_token: str,
):
    repo_id = f"{hub_username}/{project_name}"
    setup_dataset_on_hub(repo_id=repo_id, hub_token=hub_token)
    # upload the seed data to the hub
    hf_api.upload_file(
        path_or_fileobj=domain_seed_data_path,
        path_in_repo="seed_data.json",
        token=hub_token,
        repo_id=repo_id,
        repo_type="dataset",
    )
    # upload the readme to the hub
    with open(domain_seed_data_path) as f:
        domain_seed_data = json.load(f)
    hf_api.upload_file(
        path_or_fileobj=create_readme(
            domain_seed_data=domain_seed_data, project_name=project_name, domain=domain
        ),
        path_in_repo="README.md",
        token=hub_token,
        repo_id=repo_id,
        repo_type="dataset",
    )


def push_pipeline_to_hub(
    pipeline_path,
    hub_username,
    hub_token: str,
    project_name,
):
    repo_id = f"{hub_username}/{project_name}"
    # upload the pipeline to the hub
    hf_api.upload_file(
        path_or_fileobj=pipeline_path,
        path_in_repo="pipeline.yaml",
        token=hub_token,
        repo_id=repo_id,
        repo_type="dataset",
    )
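    # upload the supporting code files listed in REMOTE_CODE_PATHS alongside the pipeline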
    for code_path in REMOTE_CODE_PATHS:
        hf_api.upload_file(
            path_or_fileobj=code_path,
            path_in_repo=code_path,
            token=hub_token,
            repo_id=repo_id,
            repo_type="dataset",
        )
    print(f"Pipeline uploaded to {repo_id}")


def pull_seed_data_from_repo(repo_id, hub_token):
    # download the seed data file from the hub and load it as JSON
    seed_data_path = hf_api.hf_hub_download(
        repo_id=repo_id, token=hub_token, repo_type="dataset", filename=SEED_DATA_PATH
    )
    with open(seed_data_path) as f:
        return json.load(f)


def push_argilla_dataset_to_hub(
    name: str, repo_id: str, url: str, api_key: str, workspace: str = "admin"
):
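    # pull the annotated FeedbackDataset from the Argilla server and push it to the hub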
    rg.init(api_url=url, api_key=api_key)
    feedback_dataset = rg.FeedbackDataset.from_argilla(name=name, workspace=workspace)
    local_dataset = feedback_dataset.pull()
    local_dataset.push_to_huggingface(repo_id=repo_id)
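

# Illustrative usage sketch. All values below (project name, domain, file paths,
# username, token) are placeholders and not part of this module; a real Hub token
# and existing files would be needed.
#
#     push_dataset_to_hub(
#         domain_seed_data_path="seed_data.json",
#         project_name="my_project",
#         domain="my_domain",
#         pipeline_path="pipeline.yaml",
#         hub_username="my-username",
#         hub_token="<hf-token>",
#     )
#     push_pipeline_to_hub(
#         pipeline_path="pipeline.yaml",
#         hub_username="my-username",
#         hub_token="<hf-token>",
#         project_name="my_project",
#     )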