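"""Download the hub-stats Parquet dumps (models, datasets, spaces) into ./public
and generate a <name>.example.yaml next to each file, containing the table
schema plus 10 random rows sampled with DuckDB.

The generated YAML has roughly this shape (column names and types come from
the Parquet files themselves):

    models:
      table_structure:
        - column: <column name>
          type: <DuckDB type>
      random_items:
        - <column>: <value>
          ...
"""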
import os
import requests
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import duckdb
import argparse
# Create the "public" folder if it doesn't exist
os.makedirs("public", exist_ok=True)
# URLs of the files to download
urls = [
    "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/models.parquet?download=true",
    "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/datasets.parquet?download=true",
    "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/spaces.parquet?download=true"
]
def download_file(url, overwrite=True):
    """Stream one file into the "public" folder, showing a tqdm progress bar."""
    # Derive the local filename from the URL, dropping the "?download=true" query string
    filename = os.path.join("public", url.split("/")[-1].split("?")[0])
    if not overwrite and os.path.exists(filename):
        print(f"File already exists: {filename}. Skipping download.")
        return
    response = requests.get(url, stream=True)
    total_size = int(response.headers.get("Content-Length", 0))
    block_size = 1024  # 1 KB
    with open(filename, "wb") as file, tqdm(
        desc=filename,
        total=total_size,
        unit="iB",
        unit_scale=True,
        unit_divisor=1024,
        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]"
    ) as progress_bar:
        for data in response.iter_content(block_size):
            size = file.write(data)
            progress_bar.update(size)
    print(f"Downloaded: {filename}")
def main(overwrite):
    # Create a ThreadPoolExecutor with max_workers set to 3 (number of files to download)
    with ThreadPoolExecutor(max_workers=3) as executor:
        # Submit download tasks to the executor
        futures = [executor.submit(download_file, url, overwrite) for url in urls]
        # Wait for all tasks to complete
        for future in futures:
            future.result()

    print("All files downloaded successfully.")
    # Process each downloaded Parquet file
    for url in urls:
        filename = os.path.join("public", url.split("/")[-1].split("?")[0])
        table_name = os.path.splitext(os.path.basename(filename))[0]

        # Open an in-memory DuckDB connection and expose the Parquet file as a view
        con = duckdb.connect(database=':memory:')
        con.execute(f"CREATE VIEW {table_name} AS SELECT * FROM parquet_scan('{filename}')")

        # Retrieve the table structure
        table_structure = con.execute(f"DESCRIBE {table_name}").fetchall()
        # Generate the YAML content
        yaml_content = f"{table_name}:\n"
        yaml_content += "  table_structure:\n"
        for row in table_structure:
            column, dtype = row[:2]  # Unpack only the first two values
            yaml_content += f"    - column: {column}\n"
            yaml_content += f"      type: {dtype}\n"
        # Retrieve 10 random items from the table
        con.execute(f"CREATE VIEW {table_name}_random AS SELECT * FROM {table_name} ORDER BY RANDOM() LIMIT 10")
        random_items = con.execute(f"SELECT * FROM {table_name}_random").fetchall()

        yaml_content += "  random_items:\n"
        for item in random_items:
            yaml_content += "    - "
            for column, value in zip([row[0] for row in table_structure], item):
                yaml_content += f"{column}: {value}\n      "
            yaml_content = yaml_content.rstrip()  # Remove trailing spaces
            yaml_content += "\n"
        # Save the YAML content to a file in the "public" folder
        yaml_file = os.path.join("public", f"{table_name}.example.yaml")
        with open(yaml_file, "w") as file:
            file.write(yaml_content)
        print(f"Generated: {yaml_file}")

    print("Example files generated successfully.")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download and process Parquet files.")
    parser.add_argument("--no-overwrite", action="store_true", help="Skip downloading files that already exist.")
    args = parser.parse_args()
    main(overwrite=not args.no_overwrite)
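# Example invocation (the script filename below is illustrative, not taken from the source):
#   python download_hub_stats.py                 # download all three files, overwriting existing copies
#   python download_hub_stats.py --no-overwrite  # reuse Parquet files already present in ./public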