Upload files with huggingface_hub

Upload test.py
Upload panda70m_training_full.csv
Upload main.py
Upload panda70m_training_10m.csv
Upload panda70m_validation.csv
Upload panda70m_testing.csv
Upload panda70m_training_2m.csv

Files changed (8) hide show

.gitattributes +3 -0
main.py +87 -0
panda70m_testing.csv +0 -0
panda70m_training_10m.csv +3 -0
panda70m_training_2m.csv +3 -0
panda70m_training_full.csv +3 -0
panda70m_validation.csv +0 -0
test.py +28 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+panda70m_training_full.csv filter=lfs diff=lfs merge=lfs -text
+panda70m_training_10m.csv filter=lfs diff=lfs merge=lfs -text
+panda70m_training_2m.csv filter=lfs diff=lfs merge=lfs -text

main.py ADDED Viewed

	@@ -0,0 +1,87 @@

+import json
+import sys, os, os.path as osp
+import yt_dlp
+import asyncio
+import fire
+import pandas as pd
+from random import random
+from concurrent.futures import ProcessPoolExecutor
+def ytb_download(uid, url, json_info, output_dir="ytb_videos/"):
+    os.makedirs(output_dir, exist_ok=True)
+    # uid = url.split("?v=")[-1]
+    yt_opts = {
+        "format": "best",  # Download the best quality available
+        "outtmpl": osp.join(output_dir, f"{uid}.%(ext)s"),  # Set the output template
+        "postprocessors": [
+            {
+                "key": "FFmpegVideoConvertor",
+                "preferedformat": "mp4",  # Convert video to mp4 format
+            }
+        ],
+    }
+    video_path = osp.join(output_dir, f"{uid}.mp4")
+    meta_path = osp.join(output_dir, f"{uid}.json")
+    if osp.exists(video_path) and osp.exists(meta_path):
+        print(f"{uid} already labeled.")
+        return 0
+    try:
+        with yt_dlp.YoutubeDL(yt_opts) as ydl:
+            ydl.download([url])
+        with open(osp.join(output_dir, f"{uid}.json"), "w") as fp:
+            json.dump(json_info, fp, indent=2)
+        return 0
+    except:
+        return -1
+async def main(csv_path, max_workers=256, shards=0, total=-1, limit=False):
+    PPE = ProcessPoolExecutor(max_workers=max_workers)
+    loop = asyncio.get_event_loop()
+    df = pd.read_csv(csv_path)
+    output_dir = csv_path.split(".")[0]
+    tasks = []
+    data_list = list(df.iterrows())
+    if total > 0:
+        chunk = len(data_list) // total
+        begin_idx = shards * chunk
+        end_idx = (shards + 1) * chunk
+        if shards == total - 1:
+            end_idx = len(data_list)
+        data_list = data_list[begin_idx:end_idx]
+    print(f"download total {len(data_list)} videos")
+    for idx, (index, row) in enumerate(data_list):
+        uid = row["videoID"]
+        url = row["url"]
+        json_info = {
+            "timestamp": eval(row["timestamp"]),
+            "caption": eval(row["caption"]),
+            "matching_score": eval(row["matching_score"]),
+        }
+        tasks.append(
+            loop.run_in_executor(PPE, ytb_download, uid, url, json_info, output_dir)
+        )
+        if idx >= 20 and limit:
+            break
+    res = await asyncio.gather(*tasks)
+    print(f"[{sum(res)} / {len(res)}]")
+def entry(csv="panda70m_testing.csv", shards=0, total=-1, limit=False):
+    asyncio.run(main(csv, shards=shards, total=total, limit=limit))
+# When run as a script, hand `entry` to python-fire so its parameters
+# can be supplied from the command line.
+if __name__ == "__main__":
+    fire.Fire(entry)

panda70m_testing.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

panda70m_training_10m.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c7b0eae210bff532b9753a987a4911407d2bfb008cfca2dc3c02957082406e26
+size 1453874594

panda70m_training_2m.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b5ea91bfa797ad82a73d5a3ce354f3f1af80da7feb74837b87c470b2e739ae3d
+size 329152920

panda70m_training_full.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1d32b0fb20505952004299d8be3dfa1b56436f75ce94dbd2c85923065d9238df
+size 8409689605

panda70m_validation.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

test.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from random import random
+import asyncio
+# task coroutine
+async def task(semaphore, number):
+    # acquire the semaphore
+    async with semaphore:
+        # generate a random value between 0 and 1
+        value = random() + 2
+        # block for a moment
+        await asyncio.sleep(value)
+        # report a message
+        print(f"Task {number} got {value}")
+# main coroutine
+async def main():
+    # create the shared semaphore
+    semaphore = asyncio.Semaphore(2)
+    # create and schedule tasks
+    tasks = [asyncio.create_task(task(semaphore, i)) for i in range(10)]
+    # wait for all tasks to complete
+    _ = await asyncio.wait(tasks)
+# start the asyncio program
+asyncio.run(main())