Eason Lu committed on
Commit
d658831
·
2 Parent(s): eb9b4ad d581753

task initialization done

Browse files

Former-commit-id: c84c9e1172e01e6a6cdb30d3d1f79f6f38e76a04

Files changed (3) hide show
  1. configs/task_config.yaml +1 -1
  2. entries/run.py +15 -6
  3. src/task.py +46 -24
configs/task_config.yaml CHANGED
@@ -6,5 +6,5 @@ output_type:
6
  video: False
7
  bilingal: False
8
  source_lang: EN
9
- target_lang: CN
10
  field: SC2
 
6
  video: False
7
  bilingal: False
8
  source_lang: EN
9
+ target_lang: ZH
10
  field: SC2
entries/run.py CHANGED
@@ -18,7 +18,7 @@ def parse_args():
18
  parser.add_argument("--srt_file", help="srt file input path here", default=None, type=str, required=False)
19
  parser.add_argument("--continue", help="task_id that need to continue", default=None, type=str, required=False) # need implement
20
  parser.add_argument("--launch_cfg", help="launch config path", default='./configs/local_launch.yaml', type=str, required=False)
21
- # task config
22
  args = parser.parse_args()
23
 
24
  return args
@@ -27,7 +27,7 @@ if __name__ == "__main__":
27
  # read args and configs
28
  args = parse_args()
29
  launch_cfg = load(open(args.launch_cfg), Loader=Loader)
30
- # TODO: task config
31
 
32
  # initialize dir
33
  local_dir = Path(launch_cfg['local_dump'])
@@ -41,7 +41,6 @@ if __name__ == "__main__":
41
  task_dir = local_dir.joinpath(f"task_{task_id}")
42
  task_dir.mkdir(parents=False, exist_ok=False)
43
  task_dir.joinpath("results").mkdir(parents=False, exist_ok=False)
44
- task_dir.joinpath("logs").mkdir(parents=False, exist_ok=False)
45
 
46
  # logging
47
  logging.basicConfig(level=logging.INFO, handlers=[
@@ -49,12 +48,22 @@ if __name__ == "__main__":
49
  "{}/{}_{}.log".format(task_dir, f"task_{task_id}", datetime.now().strftime("%m%d%Y_%H%M%S")),
50
  'w', encoding='utf-8')])
51
 
52
- # TODO: write task info into log
53
-
54
  # Task create
55
  if args.link is not None:
56
  try:
57
- task = Task.fromYoutubeLink(args.link, task_id, launch_cfg)
 
 
 
 
 
 
 
 
 
 
 
 
58
  except:
59
  shutil.rmtree(task_dir)
60
  raise RuntimeError("failed to create task from youtube link")
 
18
  parser.add_argument("--srt_file", help="srt file input path here", default=None, type=str, required=False)
19
  parser.add_argument("--continue", help="task_id that need to continue", default=None, type=str, required=False) # need implement
20
  parser.add_argument("--launch_cfg", help="launch config path", default='./configs/local_launch.yaml', type=str, required=False)
21
+ parser.add_argument("--task_cfg", help="task config path", default='./configs/task_config.yaml', type=str, required=False)
22
  args = parser.parse_args()
23
 
24
  return args
 
27
  # read args and configs
28
  args = parse_args()
29
  launch_cfg = load(open(args.launch_cfg), Loader=Loader)
30
+ task_cfg = load(open(args.task_cfg), Loader=Loader)
31
 
32
  # initialize dir
33
  local_dir = Path(launch_cfg['local_dump'])
 
41
  task_dir = local_dir.joinpath(f"task_{task_id}")
42
  task_dir.mkdir(parents=False, exist_ok=False)
43
  task_dir.joinpath("results").mkdir(parents=False, exist_ok=False)
 
44
 
45
  # logging
46
  logging.basicConfig(level=logging.INFO, handlers=[
 
48
  "{}/{}_{}.log".format(task_dir, f"task_{task_id}", datetime.now().strftime("%m%d%Y_%H%M%S")),
49
  'w', encoding='utf-8')])
50
 
 
 
51
  # Task create
52
  if args.link is not None:
53
  try:
54
+ task = Task.fromYoutubeLink(args.link, task_id, task_dir, task_cfg)
55
+ except:
56
+ shutil.rmtree(task_dir)
57
+ raise RuntimeError("failed to create task from youtube link")
58
+ elif args.video_file is not None:
59
+ try:
60
+ task = Task.fromVideoFile(args.video_file, task_id, task_dir, task_cfg)
61
+ except:
62
+ shutil.rmtree(task_dir)
63
+ raise RuntimeError("failed to create task from youtube link")
64
+ elif args.audio_file is not None:
65
+ try:
66
+ task = Task.fromVideoFile(args.audio_file, task_id, task_dir, task_cfg)
67
  except:
68
  shutil.rmtree(task_dir)
69
  raise RuntimeError("failed to create task from youtube link")
src/task.py CHANGED
@@ -64,36 +64,47 @@ class Task:
64
  with self.__status_lock:
65
  self.__status = new_status
66
 
67
- def __init__(self, task_id, task_local_dir, launch_info):
68
  self.__status_lock = threading.Lock()
69
  self.__status = TaskStatus.CREATED
70
  openai.api_key = getenv("OPENAI_API_KEY")
71
- self.launch_info = launch_info
72
  self.task_local_dir = task_local_dir
73
- self.model = launch_info["model"]
74
  self.gpu_status = 0
75
- self.output_type = launch_info["output_type"]
 
 
 
76
  self.task_id = task_id
77
  self.audio_path = None
78
  self.SRT_Script = None
79
  self.result = None
 
 
 
 
 
 
 
80
 
81
  @staticmethod
82
- def fromYoutubeLink(youtube_url, task_id, launch_info):
83
- # get audio path
84
  logging.info("Task Creation method: Youtube Link")
85
- local_dump = Path(launch_info['local_dump']) # should get from launch config
86
- return YoutubeTask(task_id, local_dump.joinpath(f"task_{task_id}"), launch_info, youtube_url)
87
 
88
  @staticmethod
89
- def fromAudioFile():
90
  # get audio path
91
- return Task(...)
 
92
 
93
  @staticmethod
94
- def fromVideoFile():
95
  # get audio path
96
- return Task(...)
 
97
 
98
  # Module 1 ASR: audio --> SRT_script
99
  def get_srt_class(self, whisper_model='tiny', method="stable"):
@@ -103,7 +114,8 @@ class Task:
103
  time.sleep(5)
104
  pass
105
 
106
- # Module 2: SRT preprocess: perform preprocess steps
 
107
  def preprocess(self):
108
  self.status = TaskStatus.PRE_PROCESSING
109
  logging.info("--------------------Start Preprocessing SRT class--------------------")
@@ -113,7 +125,7 @@ class Task:
113
  processed_srt_path_en = str(Path(self.task_local_dir).with_suffix('')) + '_processed.srt'
114
  self.SRT_Script.write_srt_file_src(processed_srt_path_en)
115
 
116
- if self.output_type == "ass":
117
  logging.info("write English .srt file to .ass")
118
  assSub_en = srt2ass(processed_srt_path_en)
119
  logging.info('ASS subtitle saved as: ' + assSub_en)
@@ -140,7 +152,6 @@ class Task:
140
  def output_render(self):
141
  self.status = TaskStatus.OUTPUT_MODULE
142
  return "TODO"
143
- pass
144
 
145
  def run_pipeline(self):
146
  self.get_srt_class()
@@ -150,8 +161,8 @@ class Task:
150
  self.result = self.output_render()
151
 
152
  class YoutubeTask(Task):
153
- def __init__(self, task_id, task_local_dir, launch_info, youtube_url):
154
- super().__init__(task_id, task_local_dir, launch_info)
155
  self.youtube_url = youtube_url
156
 
157
  def run(self):
@@ -176,26 +187,37 @@ class YoutubeTask(Task):
176
  '-ab', '192000', '-vn', self.task_local_dir.joinpath(f"task_{self.task_id}.mp3")])
177
  logging.info("audio extraction finished")
178
 
 
179
  self.audio_path = self.task_local_dir.joinpath(f"task_{self.task_id}.mp3")
180
 
181
- logging.info("Task Creation Complete.")
182
 
183
  super().run_pipeline()
184
 
185
  class AudioTask(Task):
186
- def __init__(self, task_id, task_local_dir, launch_info, audio_path):
187
- super().__init__(task_id, task_local_dir, launch_info)
 
188
  self.audio_path = audio_path
 
189
 
190
  def run(self):
 
191
  super().run_pipeline()
192
 
193
  class VideoTask(Task):
194
- def __init__(self, task_id, task_local_dir, launch_info, video_path):
195
- super().__init__(task_id, task_local_dir, launch_info)
196
- self.audio_path = video_path
 
197
 
198
  def run(self):
199
-
 
 
 
 
200
 
 
 
201
  super().run_pipeline()
 
64
  with self.__status_lock:
65
  self.__status = new_status
66
 
67
+ def __init__(self, task_id, task_local_dir, task_cfg):
68
  self.__status_lock = threading.Lock()
69
  self.__status = TaskStatus.CREATED
70
  openai.api_key = getenv("OPENAI_API_KEY")
71
+ self.launch_info = task_cfg # do not use, just for fallback
72
  self.task_local_dir = task_local_dir
73
+ self.model = task_cfg["model"]
74
  self.gpu_status = 0
75
+ self.output_type = task_cfg["output_type"]
76
+ self.target_lang = task_cfg["target_lang"]
77
+ self.source_lang = task_cfg["source_lang"]
78
+ self.field = task_cfg["field"]
79
  self.task_id = task_id
80
  self.audio_path = None
81
  self.SRT_Script = None
82
  self.result = None
83
+ print(f" Task ID: {self.task_id}")
84
+ logging.info(f" Task ID: {self.task_id}")
85
+ logging.info(f" {self.source_lang} -> {self.target_lang} task in {self.field}")
86
+ logging.info(f" Model: {self.model}")
87
+ logging.info(f" subtitle_type: {self.output_type['subtitle']}")
88
+ logging.info(f" video_ouput: \t{self.output_type['video']}")
89
+ logging.info(f" bilingal_ouput: \t{self.output_type['bilingal']}")
90
 
91
  @staticmethod
92
+ def fromYoutubeLink(youtube_url, task_id, task_dir, task_cfg):
93
+ # convert to audio
94
  logging.info("Task Creation method: Youtube Link")
95
+ return YoutubeTask(task_id, task_dir, task_cfg, youtube_url)
 
96
 
97
  @staticmethod
98
+ def fromAudioFile(audio_path, task_id, task_dir, task_cfg):
99
  # get audio path
100
+ logging.info("Task Creation method: Audio File")
101
+ return AudioTask(task_id, task_dir, task_cfg, audio_path)
102
 
103
  @staticmethod
104
+ def fromVideoFile(video_path, task_id, task_dir, task_cfg):
105
  # get audio path
106
+ logging.info("Task Creation method: Video File")
107
+ return VideoTask(task_id, task_dir, task_cfg, video_path)
108
 
109
  # Module 1 ASR: audio --> SRT_script
110
  def get_srt_class(self, whisper_model='tiny', method="stable"):
 
114
  time.sleep(5)
115
  pass
116
 
117
+ # Module 2: SRT preprocess: perform preprocess steps
118
+ # TODO: multi-lang and multi-field support according to task_cfg
119
  def preprocess(self):
120
  self.status = TaskStatus.PRE_PROCESSING
121
  logging.info("--------------------Start Preprocessing SRT class--------------------")
 
125
  processed_srt_path_en = str(Path(self.task_local_dir).with_suffix('')) + '_processed.srt'
126
  self.SRT_Script.write_srt_file_src(processed_srt_path_en)
127
 
128
+ if self.output_type["subtitle"] == "ass":
129
  logging.info("write English .srt file to .ass")
130
  assSub_en = srt2ass(processed_srt_path_en)
131
  logging.info('ASS subtitle saved as: ' + assSub_en)
 
152
  def output_render(self):
153
  self.status = TaskStatus.OUTPUT_MODULE
154
  return "TODO"
 
155
 
156
  def run_pipeline(self):
157
  self.get_srt_class()
 
161
  self.result = self.output_render()
162
 
163
  class YoutubeTask(Task):
164
+ def __init__(self, task_id, task_local_dir, task_cfg, youtube_url):
165
+ super().__init__(task_id, task_local_dir, task_cfg)
166
  self.youtube_url = youtube_url
167
 
168
  def run(self):
 
187
  '-ab', '192000', '-vn', self.task_local_dir.joinpath(f"task_{self.task_id}.mp3")])
188
  logging.info("audio extraction finished")
189
 
190
+ self.video_path = self.task_local_dir.joinpath(f"task_{self.task_id}.mp4")
191
  self.audio_path = self.task_local_dir.joinpath(f"task_{self.task_id}.mp3")
192
 
193
+ logging.info("Data Prep Complete. Start pipeline")
194
 
195
  super().run_pipeline()
196
 
197
  class AudioTask(Task):
198
+ def __init__(self, task_id, task_local_dir, task_cfg, audio_path):
199
+ super().__init__(task_id, task_local_dir, task_cfg)
200
+ # TODO: check audio format
201
  self.audio_path = audio_path
202
+ self.video_path = None
203
 
204
  def run(self):
205
+ logging.info("Data Prep Complete. Start pipeline")
206
  super().run_pipeline()
207
 
208
  class VideoTask(Task):
209
+ def __init__(self, task_id, task_local_dir, task_cfg, video_path):
210
+ super().__init__(task_id, task_local_dir, task_cfg)
211
+ # TODO: check video format {.mp4}
212
+ self.video_path = video_path
213
 
214
  def run(self):
215
+ logging.info("using ffmpeg to extract audio")
216
+ subprocess.run(
217
+ ['ffmpeg', '-i', self.video_path, '-f', 'mp3',
218
+ '-ab', '192000', '-vn', self.task_local_dir.joinpath(f"task_{self.task_id}.mp3")])
219
+ logging.info("audio extraction finished")
220
 
221
+ self.audio_path = self.task_local_dir.joinpath(f"task_{self.task_id}.mp3")
222
+ logging.info("Data Prep Complete. Start pipeline")
223
  super().run_pipeline()