lucagafner committed on
Commit
5630ebc
1 Parent(s): c6ce749

Upload 2 files

Browse files
Files changed (2) hide show
  1. data_curation.py +61 -0
  2. training_config.yaml +11 -0
data_curation.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This script is used to curate the data for the project.
3
+
4
+ Implement your functions to clean the data and prepare it for model training.
5
+
6
+ Note: the competition requires that you use FiftyOne for data curation and you are only allowed to
7
+ use the approved dataset from the hub, Voxel51/Data-Centric-Visual-AI-Challenge-Train-Set, which can
8
+ be found here: https://huggingface.co/datasets/Voxel51/Data-Centric-Visual-AI-Challenge-Train-Set
9
+ """
10
+
11
+ import fiftyone as fo
12
+ import fiftyone.utils.huggingface as fouh
13
+
14
+ # Implement functions for data curation. The functions below are just dummy examples.
15
+
16
def shuffle_data(dataset):
    """Shuffle ``dataset`` using the fixed seed 51.

    Args:
        dataset: Object exposing a ``shuffle(seed=...)`` method
            (presumably a FiftyOne dataset or view -- confirm against caller).

    Returns:
        Whatever ``dataset.shuffle(seed=51)`` returns.
    """
    shuffled = dataset.shuffle(seed=51)
    return shuffled
19
+
20
def take_random_sample(dataset):
    """Take a 10-item sample from ``dataset`` using the fixed seed 51.

    Args:
        dataset: Object exposing a ``take(size=..., seed=...)`` method
            (presumably a FiftyOne dataset or view -- confirm against caller).

    Returns:
        Whatever ``dataset.take(size=10, seed=51)`` returns.
    """
    sample_kwargs = {"size": 10, "seed": 51}
    return dataset.take(**sample_kwargs)
23
+
24
def prepare_dataset(name):
    """
    Prepare the dataset for model training.

    Validates the dataset name, loads its "train" split from the hub,
    applies the curation steps (shuffle, then sample) and returns a clone
    of the result.

    Args:
        name (str): The name of the dataset to load. Must be "Voxel51/Data-Centric-Visual-AI-Challenge-Train-Set".

    Returns:
        fiftyone.core.dataset.Dataset: The curated dataset.

    Raises:
        ValueError: If the provided dataset name is not the approved one.

    Note:
        The following code block MUST NOT be removed from your submission:

        APPROVED_DATASET = "Voxel51/Data-Centric-Visual-AI-Challenge-Train-Set"

        if name != APPROVED_DATASET:
            raise ValueError(f"Only the approved dataset '{APPROVED_DATASET}' is allowed for this competition.")

        This ensures that only the approved dataset is used for the competition.
    """
    APPROVED_DATASET = "Voxel51/Data-Centric-Visual-AI-Challenge-Train-Set"

    # Reject any other dataset before touching the hub, so an invalid name
    # fails fast with a clear error instead of triggering a download.
    if name != APPROVED_DATASET:
        raise ValueError(f"Only the approved dataset '{APPROVED_DATASET}' is allowed for this competition.")

    # Load the approved dataset from the hub
    dataset = fouh.load_from_hub(name, split="train")

    # Implement your data curation functions here
    dataset = shuffle_data(dataset)
    dataset = take_random_sample(dataset)

    # Return the curated dataset. The curation steps' result is cloned before
    # being returned (the docstring promises a Dataset; presumably the steps
    # above yield views -- confirm against the FiftyOne API).
    curated_dataset = dataset.clone()
    return curated_dataset
training_config.yaml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dataset split
2
+ train_split: 0.9
3
+ val_split: 0.1
4
+
5
+ # Training parameters
6
+ train_params:
7
+ epochs: 50
8
+ batch: 16
9
+ imgsz: 640
10
+ lr0: 0.01
11
+ lrf: 0.01