Spaces:
Sleeping
Sleeping
Eason Lu
committed on
Commit
•
f1ae450
1
Parent(s):
1db8078
add youtube task creation
Browse files
Former-commit-id: da4a1419b1c085934eba891dbf1079c2da05888b
- .gitignore +4 -1
- configs/local_launch.yaml +5 -0
- entries/__init_lib_path.py +10 -0
- entries/run.py +80 -0
- entries/web_backend.py +0 -0
- src/task.py +33 -8
.gitignore
CHANGED
@@ -10,4 +10,7 @@ test.py
|
|
10 |
test.srt
|
11 |
test.txt
|
12 |
log_*.csv
|
13 |
-
log.csv
|
|
|
|
|
|
|
|
10 |
test.srt
|
11 |
test.txt
|
12 |
log_*.csv
|
13 |
+
log.csv
|
14 |
+
.chroma
|
15 |
+
*.ini
|
16 |
+
local_dump/
|
configs/local_launch.yaml
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# launch config for local environment
|
2 |
+
model: "gpt-4"
|
3 |
+
local_dump: ./local_dump
|
4 |
+
output_type: srt
|
5 |
+
environ: local
|
entries/__init_lib_path.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import os
|
3 |
+
|
4 |
+
def add_path(custom_path):
|
5 |
+
if custom_path not in sys.path: sys.path.insert(0, custom_path)
|
6 |
+
|
7 |
+
this_dir = os.path.dirname(__file__)
|
8 |
+
|
9 |
+
lib_path = os.path.join(this_dir, '..')
|
10 |
+
add_path(lib_path)
|
entries/run.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import __init_lib_path
|
2 |
+
import logging
|
3 |
+
from yaml import Loader, Dumper, load, dump
|
4 |
+
from src.task import Task
|
5 |
+
import openai
|
6 |
+
import argparse
|
7 |
+
import os
|
8 |
+
from pathlib import Path
|
9 |
+
from datetime import datetime
|
10 |
+
import shutil
|
11 |
+
|
12 |
+
def parse_args():
|
13 |
+
parser = argparse.ArgumentParser()
|
14 |
+
parser.add_argument("--link", help="youtube video link here", default=None, type=str, required=False)
|
15 |
+
parser.add_argument("--video_file", help="local video path here", default=None, type=str, required=False)
|
16 |
+
parser.add_argument("--audio_file", help="local audio path here", default=None, type=str, required=False)
|
17 |
+
parser.add_argument("--srt_file", help="srt file input path here", default=None, type=str, required=False)
|
18 |
+
parser.add_argument("--continue", help="task_id that need to continue", default=None, type=str, required=False) # need implement
|
19 |
+
parser.add_argument("--launch_cfg", help="launch config path", default='./configs/local_launch.yaml', type=str, required=False)
|
20 |
+
args = parser.parse_args()
|
21 |
+
|
22 |
+
return args
|
23 |
+
|
24 |
+
if __name__ == "__main__":
|
25 |
+
args = parse_args()
|
26 |
+
launch_cfg = load(open(args.launch_cfg), Loader=Loader)
|
27 |
+
|
28 |
+
# initialize dir
|
29 |
+
local_dir = Path(launch_cfg['local_dump'])
|
30 |
+
|
31 |
+
# initialize task queue
|
32 |
+
if not local_dir.exists():
|
33 |
+
local_dir.mkdir(parents=False, exist_ok=False)
|
34 |
+
f = open(local_dir.joinpath("task_queue.yaml"), "w")
|
35 |
+
f.write("Task Queue: []\n")
|
36 |
+
f.close()
|
37 |
+
|
38 |
+
# get task id
|
39 |
+
tasks_queue = load(open(local_dir.joinpath("task_queue.yaml")), Loader = Loader)
|
40 |
+
task_list = tasks_queue['Task Queue']
|
41 |
+
task_id = len(task_list)
|
42 |
+
|
43 |
+
# create local dir for the task
|
44 |
+
task_dir = local_dir.joinpath(f"task_{task_id}")
|
45 |
+
task_dir.mkdir(parents=False, exist_ok=False)
|
46 |
+
task_dir.joinpath("results").mkdir(parents=False, exist_ok=False)
|
47 |
+
task_dir.joinpath("logs").mkdir(parents=False, exist_ok=False)
|
48 |
+
f = open(task_dir.joinpath("task_info.yaml"), "w")
|
49 |
+
f.write(f"task_id: {task_id}")
|
50 |
+
f.close()
|
51 |
+
|
52 |
+
logging.basicConfig(level=logging.INFO, handlers=[
|
53 |
+
logging.FileHandler(
|
54 |
+
"{}/{}_{}.log".format(task_dir.joinpath("logs"), f"task_{task_id}", datetime.now().strftime("%m%d%Y_%H%M%S")),
|
55 |
+
'w', encoding='utf-8')])
|
56 |
+
|
57 |
+
# task create
|
58 |
+
if args.link is not None:
|
59 |
+
try:
|
60 |
+
task = Task.fromYoutubeLink(args.link, task_id, launch_cfg)
|
61 |
+
except:
|
62 |
+
shutil.rmtree(task_dir)
|
63 |
+
raise RuntimeError("failed to create task from youtube link")
|
64 |
+
|
65 |
+
# add task to the status queue
|
66 |
+
task_list.append({"id": task_id, "status": "created", "resource_status:": "local"})
|
67 |
+
stream = open(local_dir.joinpath("task_queue.yaml"), "w")
|
68 |
+
dump(tasks_queue, stream)
|
69 |
+
|
70 |
+
task.run_pipeline()
|
71 |
+
|
72 |
+
|
73 |
+
|
74 |
+
|
75 |
+
|
76 |
+
|
77 |
+
|
78 |
+
|
79 |
+
|
80 |
+
|
entries/web_backend.py
ADDED
File without changes
|
src/task.py
CHANGED
@@ -9,6 +9,9 @@ from os import getenv
|
|
9 |
from enum import Enum
|
10 |
from pathlib import Path
|
11 |
from enum import Enum, auto
|
|
|
|
|
|
|
12 |
|
13 |
|
14 |
"""
|
@@ -42,6 +45,7 @@ SRT_Script : SrtScript
|
|
42 |
"""
|
43 |
|
44 |
class TaskStatus(Enum):
|
|
|
45 |
INITIALIZING_ASR = (auto(), None)
|
46 |
PRE_PROCESSING = (auto(), None)
|
47 |
TRANSLATING = (auto(), 0.0)
|
@@ -50,22 +54,43 @@ class TaskStatus(Enum):
|
|
50 |
|
51 |
|
52 |
class Task:
|
53 |
-
def __init__(self, task_id,
|
54 |
-
|
55 |
-
self.
|
56 |
-
self.model = model
|
57 |
self.gpu_status = 0
|
58 |
-
self.output_type = output_type
|
59 |
self.task_id = task_id
|
60 |
self.progress = NotImplemented
|
61 |
self.SRT_Script = None
|
62 |
-
|
63 |
|
64 |
@staticmethod
|
65 |
-
def fromYoutubeLink(youtube_url):
|
66 |
# convert to audio
|
|
|
|
|
|
|
67 |
|
68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
|
70 |
@staticmethod
|
71 |
def fromAudioFile():
|
|
|
9 |
from enum import Enum
|
10 |
from pathlib import Path
|
11 |
from enum import Enum, auto
|
12 |
+
import logging
|
13 |
+
import subprocess
|
14 |
+
|
15 |
|
16 |
|
17 |
"""
|
|
|
45 |
"""
|
46 |
|
47 |
class TaskStatus(Enum):
|
48 |
+
CREATED = (auto(), None)
|
49 |
INITIALIZING_ASR = (auto(), None)
|
50 |
PRE_PROCESSING = (auto(), None)
|
51 |
TRANSLATING = (auto(), 0.0)
|
|
|
54 |
|
55 |
|
56 |
class Task:
|
57 |
+
def __init__(self, task_id, task_local_dir, launch_info):
|
58 |
+
openai.api_key = getenv("OPENAI_API_KEY")
|
59 |
+
self.task_local_dir = task_local_dir
|
60 |
+
self.model = launch_info["model"]
|
61 |
self.gpu_status = 0
|
62 |
+
self.output_type = launch_info["output_type"]
|
63 |
self.task_id = task_id
|
64 |
self.progress = NotImplemented
|
65 |
self.SRT_Script = None
|
66 |
+
|
67 |
|
68 |
@staticmethod
|
69 |
+
def fromYoutubeLink(youtube_url, task_id, launch_info):
|
70 |
# convert to audio
|
71 |
+
local_dump = Path(launch_info['local_dump'])# should get from launch config
|
72 |
+
yt = YouTube(youtube_url)
|
73 |
+
video = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
|
74 |
|
75 |
+
if video:
|
76 |
+
video.download(str(local_dump.joinpath(f"task_{task_id}")), filename=f"task_{task_id}.mp4")
|
77 |
+
logging.info(f'Video download completed to {local_dump.joinpath(f"task_{task_id}")}!')
|
78 |
+
else:
|
79 |
+
raise FileNotFoundError(f"Video stream not found for link {youtube_url}")
|
80 |
+
|
81 |
+
audio = yt.streams.filter(only_audio=True).first()
|
82 |
+
if audio:
|
83 |
+
audio.download(str(local_dump.joinpath(f"task_{task_id}")), filename=f"task_{task_id}.mp3")
|
84 |
+
logging.info(f'Audio download completed to {local_dump.joinpath(f"task_{task_id}")}!')
|
85 |
+
else:
|
86 |
+
logging.info("download audio failed, using ffmpeg to extract audio")
|
87 |
+
subprocess.run(['ffmpeg', '-i', local_dump.joinpath(f"task_{task_id}").joinpath(f"task_{task_id}.mp4"), '-f', 'mp3', '-ab', '192000', '-vn', local_dump.joinpath(f"task_{task_id}").joinpath(f"task_{task_id}.mp3")])
|
88 |
+
logging.info("audio extraction finished")
|
89 |
+
|
90 |
+
logging.info("Task Creation Complete.")
|
91 |
+
logging.info("Task Creation method: Youtube Link")
|
92 |
+
|
93 |
+
return Task(task_id, local_dump.joinpath(f"task_{task_id}"), launch_info)
|
94 |
|
95 |
@staticmethod
|
96 |
def fromAudioFile():
|