feat: hf dataset connection
- .github/workflows/sync_dataset_hf.yml +27 -0
- app/display.py +0 -1
- app/main.py +7 -0
- app/sync_dataset_hf.py +35 -0
- app/validation_submission/create_json.py +13 -0
- app/validation_submission/get_json.py +6 -2
- app/validation_submission/submission.py +17 -1
- requirements.txt +4 -1
.github/workflows/sync_dataset_hf.yml
ADDED
@@ -0,0 +1,27 @@
+name: Sync Hugging Face Dataset
+
+on:
+  schedule:
+    - cron: '0 * * * *' # Runs every hour
+
+jobs:
+  sync_dataset:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.x'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Sync Datasets
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: python sync_dataset_hf.py
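A note on wiring: the workflow exports the secret as HF_TOKEN and runs `python sync_dataset_hf.py` from the repository root, while the script added in this commit lives at app/sync_dataset_hf.py and reads HUGGINGFACE_TOKEN. Unless the run step's path and the variable name are aligned, the job would invoke a missing file with an empty token. One low-friction option, sketched here only as a suggestion, is to point the run step at app/sync_dataset_hf.py and accept either variable name on the Python side:

import os

# Accept both spellings so the Actions secret (HF_TOKEN) and local
# .env files (HUGGINGFACE_TOKEN) both work without further changes.
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")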
app/display.py
CHANGED
@@ -12,7 +12,6 @@ HEADERS = ["Identifier", "Location", "Wounded", "Dead"]
 
 
 def save_display_individual(gallery, df, error_box, data):
-    #print(data)
     individual, error_box, data = validate_save_individual(data, error_box)
     if individual:
         all_animals = get_json_all_individuals()
app/main.py
CHANGED
@@ -20,6 +20,12 @@ from styling.theme import css
 
 from geolocalisation.js_geolocation import js_geocode, display_location
 
+from datasets import disable_caching
+disable_caching()
+
+dataset_id = "SDSC/digiwild-dataset"
+data_files = "data/train-00000-of-00001.parquet"
+
 # with gr.Blocks(theme=theme, css=css) as demo:
 with gr.Blocks(theme='shivi/calm_seafoam') as demo:
     individual = gr.State({})
@@ -396,6 +402,7 @@ with gr.Blocks(theme='shivi/calm_seafoam') as demo:
     show_modal.click(lambda: Modal(visible=True), None, modal)
     show_modal.click(create_json_one_individual)
     show_modal.click(create_tmp)
+
     #submit_button.click(save_and_rest_df, inputs=[df], outputs=[df])
 
 
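The two new module-level constants, dataset_id and data_files, are not referenced in the hunks shown here, so their use presumably happens elsewhere in app/main.py. Assuming they feed a read of the published split, the typical call would look roughly like this (illustrative only; the split name and keyword usage are assumptions):

from datasets import load_dataset

# Illustrative: read the parquet split that the new constants point at.
metadata = load_dataset(dataset_id, data_files=data_files, split="train")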
app/sync_dataset_hf.py
ADDED
@@ -0,0 +1,35 @@
+from datasets import load_dataset, DownloadMode
+import json
+import os
+from huggingface_hub import HfApi, hf_hub_download
+
+dataset_id = "SDSC/digiwild-dataset"
+token = os.getenv("HUGGINGFACE_TOKEN")
+
+# Initialize API client
+api = HfApi(token=token)
+
+# Load all metadata files
+files = api.list_repo_files(dataset_id, repo_type="dataset")
+json_files = [file for file in files if file.endswith(".json")]
+
+# Load the metadata compilation
+try:
+    data_files = "data/train-00000-of-00001.parquet"
+    metadata = load_dataset(
+        dataset_id,
+        data_files=data_files)
+    # Add new json entries to dataset
+    for file in json_files:
+        file = hf_hub_download(repo_id=dataset_id, filename=file, repo_type="dataset")
+        with open(file, "r") as f:
+            new = json.load(f)
+        if not(new["image_md5"] in metadata["train"]["image_md5"]):
+            metadata["train"] = metadata["train"].add_item(new)
+except:
+    metadata = load_dataset(
+        dataset_id,
+        data_files=json_files)
+
+
+metadata.push_to_hub(dataset_id, token=token)
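Two details in this script are worth flagging: DownloadMode is imported but never used, and the bare except: silently swallows authentication and network errors before rebuilding the dataset from the JSON files alone. A narrower fallback is sketched below, assuming the missing-parquet case is the one the fallback is meant to cover; the exact exception type depends on the installed datasets version, so FileNotFoundError is an assumption here. Building a set from metadata["train"]["image_md5"] once before the loop would also avoid re-materialising the column on every iteration.

try:
    metadata = load_dataset(dataset_id, data_files="data/train-00000-of-00001.parquet")
except FileNotFoundError:
    # No parquet split published yet: bootstrap the dataset from the
    # JSON files, mirroring the original fallback branch.
    metadata = load_dataset(dataset_id, data_files=json_files)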
app/validation_submission/create_json.py
CHANGED
@@ -1,6 +1,19 @@
 import json
 
+import random
+import string
+
+import hashlib
+
+def generate_random_md5():
+    # Generate a random string
+    random_string = ''.join(random.choices(string.ascii_letters + string.digits, k=16))
+    # Encode the string and compute its MD5 hash
+    md5_hash = hashlib.md5(random_string.encode()).hexdigest()
+    return md5_hash
+
 def create_json_one_individual(one_individual={}):
+    one_individual["image_md5"] = generate_random_md5()
     one_individual = json.dumps(one_individual)
     with open("data/one_individual.json", "w") as outfile:
         outfile.write(one_individual)
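As written, generate_random_md5 hashes a random 16-character string, so image_md5 is effectively a random identifier rather than a fingerprint of the image, and the duplicate check in sync_dataset_hf.py will never match a re-submitted photo. If the image bytes are available at this point (an assumption about the calling code), hashing them directly would make the key deterministic, as in this sketch. Separately, the mutable default one_individual={} is shared across calls, a classic Python pitfall when the function is invoked without an argument.

import hashlib

def md5_of_image(image_bytes: bytes) -> str:
    # Same image always yields the same key, so the sync script's
    # image_md5 duplicate check can catch re-submissions.
    return hashlib.md5(image_bytes).hexdigest()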
app/validation_submission/get_json.py
CHANGED
@@ -5,9 +5,13 @@ def get_json_one_individual():
         one_individual = json.load(openfile)
     return one_individual
 
+## TO DO : check this works
+import os
 def get_json_all_individuals():
-
-
+    all_animals = os.getfiles("data")
+    all_individuals = []
+    for animal in all_animals:
+        all_individuals.append(animal)
     return all_individuals
 
 def get_json_tmp(tmp_name):
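The TO DO comment is warranted: os.getfiles does not exist in the standard library (os.listdir is the usual call), and the loop appends filenames rather than parsed records. A sketch of what the function appears to intend, assuming each JSON file under data/ holds one individual:

import json
import os

def get_json_all_individuals():
    # List the JSON files under data/ and parse each one into a record.
    all_individuals = []
    for name in os.listdir("data"):
        if name.endswith(".json"):
            with open(os.path.join("data", name), "r") as f:
                all_individuals.append(json.load(f))
    return all_individuals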
app/validation_submission/submission.py
CHANGED
@@ -14,4 +14,20 @@ def save_to_all_individuals(one_individual):
     all_individuals_for_json = json.dumps(all_individuals)
     with open("data/all_individuals.json", "w") as outfile:
         outfile.write(all_individuals_for_json)
-    return all_individuals
+    return all_individuals
+
+from huggingface_hub import HfApi
+import os
+
+#save all individuals one by one in JSON wish md5 hash as json name
+def push_to_dataset_hf():
+    token = os.environ.get("HF_TOKEN", None)
+    api = HfApi(token=token)
+    with open("data/all_individuals.json", "r") as f:
+        all = json.load(f)
+    api.upload_file(
+        path_or_fileobj=f.name,
+        path_in_repo=path_in_repo,
+        repo_id="SDSC/digiwild-dataset",
+        repo_type="dataset",
+    )
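push_to_dataset_hf will fail at the upload_file call: path_in_repo is never defined (a NameError), the whole all_individuals.json is uploaded rather than one file per record, and `all` shadows a builtin without being used. The comment above the function suggests one JSON per individual named by its MD5 hash; a sketch of that intent, with the flat <md5>.json layout and the list-of-dicts file format as assumptions:

from huggingface_hub import HfApi
import json
import os

def push_to_dataset_hf():
    token = os.environ.get("HF_TOKEN", None)
    api = HfApi(token=token)
    with open("data/all_individuals.json", "r") as f:
        all_individuals = json.load(f)
    for individual in all_individuals:
        # One JSON per individual, keyed by its image_md5, so the hourly
        # sync job can pick up new records via list_repo_files.
        api.upload_file(
            path_or_fileobj=json.dumps(individual).encode(),
            path_in_repo=f"{individual['image_md5']}.json",
            repo_id="SDSC/digiwild-dataset",
            repo_type="dataset",
        )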
requirements.txt
CHANGED
@@ -3,4 +3,7 @@ gradio_modal
 geopy
 geopandas
 pillow
-python-dotenv
+python-dotenv
+datasets
+huggingface_hub
+hashlib
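One entry here is likely to break the workflow's install step: hashlib is part of the Python standard library, and the PyPI package of that name is an old Python 2 era distribution, so `pip install -r requirements.txt` can be expected to fail or install something unusable on Python 3. Dropping the hashlib line is sufficient; `import hashlib` needs no third-party package.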