Ligeng-Zhu committed on
Commit
82c9012
1 Parent(s): 00b63df

Upload files with huggingface_hub

Browse files

Upload test.py
Upload panda70m_training_full.csv
Upload main.py
Upload panda70m_training_10m.csv
Upload panda70m_validation.csv
Upload panda70m_testing.csv
Upload panda70m_training_2m.csv

.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ panda70m_training_full.csv filter=lfs diff=lfs merge=lfs -text
37
+ panda70m_training_10m.csv filter=lfs diff=lfs merge=lfs -text
38
+ panda70m_training_2m.csv filter=lfs diff=lfs merge=lfs -text
main.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import sys, os, os.path as osp
3
+ import yt_dlp
4
+ import asyncio
5
+
6
+ import fire
7
+ import pandas as pd
8
+ from random import random
9
+ from concurrent.futures import ProcessPoolExecutor
10
+
11
+
12
def ytb_download(uid, url, json_info, output_dir="ytb_videos/"):
    """Download one video with yt-dlp and write its metadata beside it.

    Args:
        uid: Identifier used as the file stem of both output files.
        url: Video URL handed to yt-dlp.
        json_info: JSON-serialisable metadata dumped to ``<uid>.json``.
        output_dir: Directory that receives ``<uid>.mp4`` and ``<uid>.json``;
            it is created when missing.

    Returns:
        0 when both files already exist or when the download and the metadata
        write succeed, -1 when either step raises.
    """
    os.makedirs(output_dir, exist_ok=True)

    video_path = osp.join(output_dir, f"{uid}.mp4")
    meta_path = osp.join(output_dir, f"{uid}.json")
    # Both artefacts present means a previous run finished this item.
    if osp.exists(video_path) and osp.exists(meta_path):
        print(f"{uid} already labeled.")
        return 0

    yt_opts = {
        "format": "best",  # Download the best quality available
        "outtmpl": osp.join(output_dir, f"{uid}.%(ext)s"),  # Output template
        "postprocessors": [
            {
                "key": "FFmpegVideoConvertor",
                "preferedformat": "mp4",  # Convert video to mp4 format
            }
        ],
    }

    try:
        with yt_dlp.YoutubeDL(yt_opts) as ydl:
            ydl.download([url])
        # The metadata is written last, so its presence marks a finished item.
        with open(meta_path, "w") as fp:
            json.dump(json_info, fp, indent=2)
    except Exception as exc:
        # Catch Exception rather than everything so that KeyboardInterrupt and
        # SystemExit still stop the worker; report why the item failed.
        print(f"{uid} failed: {exc!r}")
        return -1
    return 0
40
+
41
+
42
async def main(csv_path, max_workers=256, shards=0, total=-1, limit=False):
    """Download the videos listed in a CSV using a pool of worker processes.

    Args:
        csv_path: CSV file read with pandas; the ``videoID``, ``url``,
            ``timestamp``, ``caption`` and ``matching_score`` columns are used.
            Downloads go to a directory named after the path without its
            extension.
        max_workers: Number of processes in the pool.
        shards: Zero-based index of the shard to process when ``total > 0``.
        total: Number of equal shards the rows are split into; the last shard
            also takes the remainder. Non-positive values process every row.
        limit: When truthy, stop submitting after the row at position 20.
    """
    df = pd.read_csv(csv_path)
    # splitext drops only the final extension, so paths containing other dots
    # (for example "./data/file.csv") keep their directory part.
    output_dir = osp.splitext(csv_path)[0]

    if total > 0:
        chunk = len(df) // total
        begin_idx = shards * chunk
        # The last shard absorbs the rows left over by the integer division.
        end_idx = len(df) if shards == total - 1 else (shards + 1) * chunk
        df = df.iloc[begin_idx:end_idx]
    print(f"download total {len(df)} videos")

    loop = asyncio.get_running_loop()
    # The context manager shuts the pool down once all results are gathered.
    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        tasks = []
        for idx, (_, row) in enumerate(df.iterrows()):
            # NOTE(security): eval executes whatever expression the CSV cell
            # contains, so only run this on trusted files. If the cells are
            # plain Python literals, ast.literal_eval would be the safe
            # parser -- confirm against the data before switching.
            json_info = {
                "timestamp": eval(row["timestamp"]),
                "caption": eval(row["caption"]),
                "matching_score": eval(row["matching_score"]),
            }
            tasks.append(
                loop.run_in_executor(
                    pool, ytb_download, row["videoID"], row["url"], json_info, output_dir
                )
            )
            if limit and idx >= 20:
                break
        res = await asyncio.gather(*tasks)

    # Workers return 0 on success and -1 on failure, so count the zeros
    # instead of summing the codes (which would print a negative number).
    succeeded = sum(1 for code in res if code == 0)
    print(f"[{succeeded} / {len(res)}]")
80
+
81
+
82
def entry(csv="panda70m_testing.csv", shards=0, total=-1, limit=False, max_workers=256):
    """Command-line entry point: run the async downloader to completion.

    Args:
        csv: Path of the CSV file describing the videos to download.
        shards: Index of the shard to process when ``total`` is positive.
        total: Number of shards to split the CSV into; non-positive disables
            sharding.
        limit: When truthy, only a small prefix of the rows is submitted.
        max_workers: Number of worker processes; defaults to the value
            ``main`` itself defaults to.
    """
    asyncio.run(
        main(csv, max_workers=max_workers, shards=shards, total=total, limit=limit)
    )


if __name__ == "__main__":
    # python-fire turns the keyword arguments of ``entry`` into CLI flags.
    fire.Fire(entry)
panda70m_testing.csv ADDED
The diff for this file is too large to render. See raw diff
 
panda70m_training_10m.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7b0eae210bff532b9753a987a4911407d2bfb008cfca2dc3c02957082406e26
3
+ size 1453874594
panda70m_training_2m.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5ea91bfa797ad82a73d5a3ce354f3f1af80da7feb74837b87c470b2e739ae3d
3
+ size 329152920
panda70m_training_full.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d32b0fb20505952004299d8be3dfa1b56436f75ce94dbd2c85923065d9238df
3
+ size 8409689605
panda70m_validation.csv ADDED
The diff for this file is too large to render. See raw diff
 
test.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from random import random
2
+ import asyncio
3
+
4
+
5
+ # task coroutine
6
async def task(semaphore, number):
    """Hold the semaphore while sleeping for a random delay, then report it.

    Args:
        semaphore: Semaphore limiting how many tasks sleep at once.
        number: Label printed in the report line.
    """
    # wait for a free slot
    await semaphore.acquire()
    try:
        # random() is in [0, 1), so the delay lies in [2, 3) seconds
        delay = random() + 2
        # suspend this task for the chosen delay
        await asyncio.sleep(delay)
        # report a message
        print(f"Task {number} got {delay}")
    finally:
        # always hand the slot back, even if the task is cancelled
        semaphore.release()
15
+
16
+
17
+ # main coroutine
18
async def main():
    """Run ten demo tasks that share a semaphore with two slots."""
    # create the shared semaphore
    semaphore = asyncio.Semaphore(2)
    # create and schedule tasks
    tasks = [asyncio.create_task(task(semaphore, i)) for i in range(10)]
    # wait for all tasks to complete
    _ = await asyncio.wait(tasks)


# start the asyncio program only when executed as a script, so that importing
# this module does not run the demo
if __name__ == "__main__":
    asyncio.run(main())