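"""Collect per-submission ``question.json`` records from a local directory
tree, normalize them into a tabular schema, and push the result to the
Hugging Face Hub as a ``datasets.Dataset``."""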
import os
import json
import pandas as pd
from pathlib import Path
from datasets import Dataset, Features, Value, Sequence, Image as ImageFeature


def process_and_push_dataset(
    data_dir: str, hub_repo: str, token: str, private: bool = True
):
    """
    Process local dataset files and push to Hugging Face Hub.

    Args:
        data_dir (str): Path to the data directory containing submission folders
        hub_repo (str): Name of the Hugging Face repository to push to
        token (str): Hugging Face API token used to authenticate the push
        private (bool): Whether to make the pushed dataset private

    Returns:
        datasets.Dataset: The processed dataset
    """
    # List to store all records
    all_records = []

    # Walk through all subdirectories in data_dir
    for root, dirs, files in os.walk(data_dir):
        for file in files:
            if file == "question.json":
                file_path = Path(root) / file
                try:
                    # Read the JSON file
                    with open(file_path, "r", encoding="utf-8") as f:
                        record = json.load(f)

                        # Get the folder path for this record
                        folder_path = os.path.dirname(file_path)

                        # Fix image paths to include full path
                        if "question_images" in record:
                            record["question_images"] = [
                                str(Path(folder_path) / img_path)
                                for img_path in record["question_images"]
                                if img_path
                            ]

                        if "rationale_images" in record:
                            record["rationale_images"] = [
                                str(Path(folder_path) / img_path)
                                for img_path in record["rationale_images"]
                                if img_path
                            ]

                        # Flatten author_info dictionary
                        author_info = record.pop("author_info", {})
                        record.update(
                            {f"author_{k}": v for k, v in author_info.items()}
                        )

                        # Add the record
                        all_records.append(record)
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")

    # Convert to DataFrame
    df = pd.DataFrame(all_records)

    # Sort by custom_id for consistency
    if not df.empty and "custom_id" in df.columns:
        df = df.sort_values("custom_id")

    # Ensure all required columns exist with default values
    required_columns = {
        "custom_id": "",
        "author_name": "",
        "author_email_address": "",
        "author_institution": "",
        "question_categories": [],
        "question": "",
        "question_images": [],
        "final_answer": "",
        "rationale_text": "",
        "rationale_images": [],
        "image_attribution": "",
        "subquestions_1_text": "",
        "subquestions_1_answer": "",
        "subquestions_2_text": "",
        "subquestions_2_answer": "",
        "subquestions_3_text": "",
        "subquestions_3_answer": "",
        "subquestions_4_text": "",
        "subquestions_4_answer": "",
        "subquestions_5_text": "",
        "subquestions_5_answer": "",
    }

    for col, default_value in required_columns.items():
        if col not in df.columns:
            if isinstance(default_value, list):
                # Assigning a bare [] to a non-empty DataFrame raises a
                # length-mismatch ValueError, so broadcast one independent
                # list per row instead
                df[col] = [list(default_value) for _ in range(len(df))]
            else:
                df[col] = default_value

    # Define features
    features = Features(
        {
            "custom_id": Value("string"),
            "question": Value("string"),
            "question_images": Sequence(ImageFeature()),
            "question_categories": Sequence(Value("string")),
            "final_answer": Value("string"),
            "rationale_text": Value("string"),
            "rationale_images": Sequence(ImageFeature()),
            "image_attribution": Value("string"),
            "subquestions_1_text": Value("string"),
            "subquestions_1_answer": Value("string"),
            "subquestions_2_text": Value("string"),
            "subquestions_2_answer": Value("string"),
            "subquestions_3_text": Value("string"),
            "subquestions_3_answer": Value("string"),
            "subquestions_4_text": Value("string"),
            "subquestions_4_answer": Value("string"),
            "subquestions_5_text": Value("string"),
            "subquestions_5_answer": Value("string"),
            "author_name": Value("string"),
            "author_email_address": Value("string"),
            "author_institution": Value("string"),
        }
    )

    # Convert DataFrame to dict of lists (Hugging Face Dataset format)
    dataset_dict = {col: df[col].tolist() for col in features.keys()}

    # Create Dataset directly from dict
    dataset = Dataset.from_dict(dataset_dict, features=features)

    # Push to hub
    dataset.push_to_hub(hub_repo, private=private, max_shard_size="200MB", token=token)

    print(f"\nDataset Statistics:")
    print(f"Total number of submissions: {len(dataset)}")
    print(f"\nSuccessfully pushed dataset to {hub_repo}")

    return dataset
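

# Example usage, as a minimal sketch: the data directory, repository name, and
# the HF_TOKEN environment variable below are illustrative assumptions, not
# part of the original script.
if __name__ == "__main__":
    process_and_push_dataset(
        data_dir="data",  # directory holding one subfolder per submission
        hub_repo="your-username/your-dataset",  # hypothetical target repo
        token=os.environ.get("HF_TOKEN", ""),  # token read from the environment
        private=True,
    )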