import os
import json
import pandas as pd
from pathlib import Path
from datasets import Dataset, Features, Value, Sequence, Image as ImageFeature
def process_and_push_dataset(
    data_dir: str, hub_repo: str, token: str, private: bool = True
):
    """
    Process local dataset files and push them to the Hugging Face Hub.

    Args:
        data_dir (str): Path to the data directory containing submission folders
        hub_repo (str): Name of the Hugging Face repository to push to
        token (str): Hugging Face access token used to authenticate the push
        private (bool): Whether to make the pushed dataset private

    Returns:
        datasets.Dataset: The processed dataset
    """
    # List to store all records
    all_records = []

    # Walk through all subdirectories in data_dir
    for root, dirs, files in os.walk(data_dir):
        for file in files:
            if file == "question.json":
                file_path = Path(root) / file
                try:
                    # Read the JSON file
                    with open(file_path, "r", encoding="utf-8") as f:
                        record = json.load(f)

                    # Get the folder path for this record
                    folder_path = os.path.dirname(file_path)

                    # Fix image paths to include the full path
                    if "question_images" in record:
                        record["question_images"] = [
                            str(Path(folder_path) / img_path)
                            for img_path in record["question_images"]
                            if img_path
                        ]

                    if "rationale_images" in record:
                        record["rationale_images"] = [
                            str(Path(folder_path) / img_path)
                            for img_path in record["rationale_images"]
                            if img_path
                        ]

                    # Flatten the author_info dictionary into author_* columns
                    author_info = record.pop("author_info", {})
                    record.update(
                        {f"author_{k}": v for k, v in author_info.items()}
                    )

                    # Add the record
                    all_records.append(record)
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")
    # Convert to DataFrame
    df = pd.DataFrame(all_records)

    # Sort by custom_id for consistency
    if not df.empty and "custom_id" in df.columns:
        df = df.sort_values("custom_id")

    # Ensure all required columns exist with default values
    required_columns = {
        "custom_id": "",
        "author_name": "",
        "author_email_address": "",
        "author_institution": "",
        "question_categories": [],
        "question": "",
        "question_images": [],
        "final_answer": "",
        "rationale_text": "",
        "rationale_images": [],
        "image_attribution": "",
        "subquestions_1_text": "",
        "subquestions_1_answer": "",
        "subquestions_2_text": "",
        "subquestions_2_answer": "",
        "subquestions_3_text": "",
        "subquestions_3_answer": "",
        "subquestions_4_text": "",
        "subquestions_4_answer": "",
        "subquestions_5_text": "",
        "subquestions_5_answer": "",
    }
    for col, default_value in required_columns.items():
        if col not in df.columns:
            # Scalar defaults broadcast across rows; list defaults must be
            # repeated per row, since assigning an empty list to a column of a
            # non-empty DataFrame raises a length-mismatch error
            if isinstance(default_value, list):
                df[col] = [list(default_value) for _ in range(len(df))]
            else:
                df[col] = default_value
    # Define features
    features = Features(
        {
            "custom_id": Value("string"),
            "question": Value("string"),
            "question_images": Sequence(ImageFeature()),
            "question_categories": Sequence(Value("string")),
            "final_answer": Value("string"),
            "rationale_text": Value("string"),
            "rationale_images": Sequence(ImageFeature()),
            "image_attribution": Value("string"),
            "subquestions_1_text": Value("string"),
            "subquestions_1_answer": Value("string"),
            "subquestions_2_text": Value("string"),
            "subquestions_2_answer": Value("string"),
            "subquestions_3_text": Value("string"),
            "subquestions_3_answer": Value("string"),
            "subquestions_4_text": Value("string"),
            "subquestions_4_answer": Value("string"),
            "subquestions_5_text": Value("string"),
            "subquestions_5_answer": Value("string"),
            "author_name": Value("string"),
            "author_email_address": Value("string"),
            "author_institution": Value("string"),
        }
    )

    # Convert DataFrame to a dict of lists (the Hugging Face Dataset format)
    dataset_dict = {col: df[col].tolist() for col in features.keys()}

    # Create the Dataset directly from the dict
    dataset = Dataset.from_dict(dataset_dict, features=features)
    # Push to the Hub
    dataset.push_to_hub(
        hub_repo, private=private, max_shard_size="200MB", token=token
    )

    print("\nDataset Statistics:")
    print(f"Total number of submissions: {len(dataset)}")
    print(f"\nSuccessfully pushed dataset to {hub_repo}")

    return dataset
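
# Hypothetical usage sketch (not part of the original script): the data
# directory, repository id, and HF_TOKEN environment variable below are
# illustrative assumptions, not values taken from the source.
if __name__ == "__main__":
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        raise SystemExit("Set the HF_TOKEN environment variable before running.")

    process_and_push_dataset(
        data_dir="data",                  # assumed layout: data/<submission>/question.json
        hub_repo="username/my-dataset",   # placeholder repository id
        token=hf_token,
        private=True,
    )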