tdurbor committed on
Commit
57f3c81
·
1 Parent(s): 34507a1

add upload to dataset

Browse files
Files changed (1) hide show
  1. utils/upload-to-dataset.py +76 -0
utils/upload-to-dataset.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import Dataset, Features, Value, Image
2
+ from huggingface_hub import HfApi
3
+ import os
4
+ from collections import defaultdict
5
+ import pandas as pd
6
+
7
# Root folder holding the original and processed images.
# The path is relative, so it is resolved against the current working directory.
IMAGE_DIR = "../../background-removal-arena-v0/train/data/resized"

# Sub-folder of IMAGE_DIR that contains the untouched original images.
ORIGINALS_SUBDIR = "web-original-images"

# Background-removal sources. Each one has its own sub-folder of IMAGE_DIR and
# a dedicated "<source>_image" column in the dataset.
SOURCES = ("clipdrop", "bria", "photoroom", "removebg")

# Extensions accepted for original images.
ORIGINAL_EXTENSIONS = ('.png', '.jpg', '.jpeg')

# Extensions tried (in this order) when looking for a processed image.
# '.jpeg' is included so the lookup agrees with ORIGINAL_EXTENSIONS.
PROCESSED_EXTENSIONS = ('.png', '.jpg', '.jpeg')


def find_processed_image(image_dir, source, original_filename):
    """Return the path of the image produced by *source* for an original.

    The processed file is expected at ``<image_dir>/<source>/<stem><ext>``,
    where ``stem`` is *original_filename* without its extension and ``ext`` is
    the first entry of ``PROCESSED_EXTENSIONS`` for which a file exists.

    Args:
        image_dir: Root folder containing one sub-folder per source.
        source: Name of the source sub-folder (e.g. ``"bria"``).
        original_filename: Base name of the original image.

    Returns:
        The path of the processed image, or ``None`` when no candidate exists.
    """
    stem = os.path.splitext(original_filename)[0]
    for ext in PROCESSED_EXTENSIONS:
        candidate = os.path.join(image_dir, source, stem + ext)
        if os.path.exists(candidate):
            return candidate  # Stop at the first extension that matches
    return None


def build_dataset_dict(image_dir=IMAGE_DIR):
    """Collect image paths into a dictionary of equally long column lists.

    Walks ``<image_dir>/web-original-images`` recursively and, for every
    original image, looks up the matching processed image of each source.

    Args:
        image_dir: Root folder containing ``web-original-images`` and one
            sub-folder per entry of ``SOURCES``.

    Returns:
        A dict with the keys ``original_image``, ``clipdrop_image``,
        ``bria_image``, ``photoroom_image``, ``removebg_image`` and
        ``original_filename``. Missing processed images are ``None``.

    Raises:
        FileNotFoundError: If the originals folder does not exist. Without
            this check ``os.walk`` yields nothing and an empty dataset would
            be built silently.
    """
    originals_dir = os.path.join(image_dir, ORIGINALS_SUBDIR)
    if not os.path.isdir(originals_dir):
        raise FileNotFoundError(
            f"Original images folder not found: {originals_dir}"
        )

    # Rows are keyed by base filename, so two originals with the same name in
    # different sub-folders collapse into a single row (the later one wins).
    rows = {}
    for root, dirs, files in os.walk(originals_dir):
        dirs.sort()  # In-place sort makes the traversal order reproducible
        for name in sorted(files):
            if not name.endswith(ORIGINAL_EXTENSIONS):
                continue
            row = {"original_image": os.path.join(root, name)}
            for source in SOURCES:
                row[f"{source}_image"] = find_processed_image(
                    image_dir, source, name
                )
            rows[name] = row

    # Convert the rows to a dictionary of lists (one list per column)
    dataset_dict = {
        "original_image": [],
        "clipdrop_image": [],
        "bria_image": [],
        "photoroom_image": [],
        "removebg_image": [],
        "original_filename": [],
    }
    for filename, row in rows.items():
        dataset_dict["original_image"].append(row["original_image"])
        for source in SOURCES:
            dataset_dict[f"{source}_image"].append(row[f"{source}_image"])
        dataset_dict["original_filename"].append(filename)
    return dataset_dict


def main():
    """Build the dataset from IMAGE_DIR, dump a CSV and push it to the Hub."""
    dataset_dict = build_dataset_dict(IMAGE_DIR)
    if not dataset_dict["original_filename"]:
        # Never push an empty dataset over an existing one by accident.
        raise SystemExit(
            f"No original images found under "
            f"{os.path.join(IMAGE_DIR, ORIGINALS_SUBDIR)}; nothing to upload."
        )

    # Save the data dictionary to a CSV file for inspection
    df = pd.DataFrame.from_dict(dataset_dict)
    df.to_csv("image_data.csv", index=False)

    # Define the dataset features with dedicated columns for each model
    features = Features({
        "original_image": Image(),  # Original image feature
        "clipdrop_image": Image(),  # Clipdrop segmented image
        "bria_image": Image(),  # Bria segmented image
        "photoroom_image": Image(),  # Photoroom segmented image
        "removebg_image": Image(),  # RemoveBG segmented image
        "original_filename": Value("string"),  # Original filename
    })

    # Create a Dataset
    dataset = Dataset.from_dict(dataset_dict, features=features)

    # Push the dataset to Hugging Face Hub
    # NOTE(review): no token is passed to HfApi(), so api.token is presumably
    # None and push_to_hub falls back to locally saved credentials - confirm.
    api = HfApi()
    dataset.push_to_hub("bgsys/background-removal-arena-test", token=api.token)


if __name__ == "__main__":
    main()