Spaces:
Runtime error
Runtime error
Delete videoclipper.py
Browse files- videoclipper.py +0 -172
videoclipper.py
DELETED
@@ -1,172 +0,0 @@
|
|
1 |
-
import sys
|
2 |
-
import copy
|
3 |
-
import librosa
|
4 |
-
import logging
|
5 |
-
import argparse
|
6 |
-
import numpy as np
|
7 |
-
import soundfile as sf
|
8 |
-
import moviepy.editor as mpy
|
9 |
-
# from modelscope.pipelines import pipeline
|
10 |
-
# from modelscope.utils.constant import Tasks
|
11 |
-
from subtitle_utils import generate_srt, generate_srt_clip, distribute_spk
|
12 |
-
from trans_utils import pre_proc, proc, write_state, load_state, proc_spk, generate_vad_data
|
13 |
-
# from argparse_tools import ArgumentParser, get_commandline_args
|
14 |
-
|
15 |
-
from moviepy.editor import *
|
16 |
-
from moviepy.video.tools.subtitles import SubtitlesClip
|
17 |
-
|
18 |
-
|
19 |
-
class VideoClipper():
    """Clip audio/video segments that match recognized text or speakers.

    Wraps an ASR pipeline (and optionally a speaker-diarization pipeline):
    ``recog``/``video_recog`` produce a recognition ``state``; ``clip``/
    ``video_clip`` then cut the time spans matching a destination text (or
    speaker id) out of the original media and emit SRT subtitles for them.
    """

    def __init__(self, asr_pipeline, sd_pipeline=None):
        # asr_pipeline: callable returning a dict with 'text',
        #   'text_postprocessed', 'time_stamp' and 'sentences' keys
        #   (presumably a modelscope pipeline -- see commented imports).
        # sd_pipeline: optional speaker-diarization pipeline; required only
        #   when recog() is called with sd_switch='yes'.
        logging.warning("Initializing VideoClipper.")
        self.asr_pipeline = asr_pipeline
        self.sd_pipeline = sd_pipeline

    def recog(self, audio_input, sd_switch='no', state=None):
        """Run ASR (and optional speaker diarization) on 16 kHz audio.

        Args:
            audio_input: ``(sample_rate, samples)`` tuple; rate must be 16000.
            sd_switch: 'yes' to additionally run speaker diarization.
            state: optional dict carried across calls; recognition results
                are stored into it for later use by ``clip``/``video_clip``.

        Returns:
            ``(recognized_text, srt_string, state)``
        """
        if state is None:
            state = {}
        sr, data = audio_input
        assert sr == 16000, "16kHz sample rate required, {} given.".format(sr)
        if len(data.shape) == 2:  # multi-channel wav input
            # BUGFIX: .format() was previously called on logging.warning()'s
            # return value (None), raising AttributeError at runtime.
            logging.warning("Input wav shape: %s, only first channel reserved.", data.shape)
            data = data[:, 0]
        state['audio_input'] = (sr, data)
        data = data.astype(np.float64)
        rec_result = self.asr_pipeline(audio_in=data)
        if sd_switch == 'yes':
            vad_data = generate_vad_data(data.astype(np.float32), rec_result['sentences'], sr)
            sd_result = self.sd_pipeline(audio=vad_data, batch_size=1)
            rec_result['sd_sentences'] = distribute_spk(rec_result['sentences'], sd_result['text'])
            res_srt = generate_srt(rec_result['sd_sentences'])
            state['sd_sentences'] = rec_result['sd_sentences']
        else:
            res_srt = generate_srt(rec_result['sentences'])
        state['recog_res_raw'] = rec_result['text_postprocessed']
        state['timestamp'] = rec_result['time_stamp']
        state['sentences'] = rec_result['sentences']
        res_text = rec_result['text']
        return res_text, res_srt, state

    def clip(self, dest_text, start_ost, end_ost, state, dest_spk=None):
        """Cut the audio spans matching *dest_text* (or *dest_spk*).

        Args:
            dest_text: '#'-separated target phrases to locate in the
                recognition result.
            start_ost / end_ost: start/end offsets in milliseconds applied
                to every matched span (16 samples per ms at 16 kHz).
            state: dict produced by ``recog``.
            dest_spk: optional '#'-separated speaker ids; used instead of
                *dest_text* when diarization results are present in state.

        Returns:
            ``((sample_rate, clipped_audio), message, srt_string)``
        """
        # get from state
        audio_input = state['audio_input']
        recog_res_raw = state['recog_res_raw']
        timestamp = state['timestamp']
        sentences = state['sentences']
        sr, data = audio_input
        data = data.astype(np.float64)

        all_ts = []
        if dest_spk is None or dest_spk == '' or 'sd_sentences' not in state:
            for _dest_text in dest_text.split('#'):
                _dest_text = pre_proc(_dest_text)
                ts = proc(recog_res_raw, timestamp, _dest_text)
                for _ts in ts:
                    all_ts.append(_ts)
        else:
            for _dest_spk in dest_spk.split('#'):
                ts = proc_spk(_dest_spk, state['sd_sentences'])
                for _ts in ts:
                    all_ts.append(_ts)
        ts = all_ts
        ts.sort()
        srt_index = 0
        clip_srt = ""
        if len(ts):
            # Offsets are in ms; at 16 kHz that is 16 samples per ms.
            # int() guards against float offsets producing float slice bounds.
            start, end = ts[0]
            start = int(min(max(0, start + start_ost * 16), len(data)))
            end = int(min(max(0, end + end_ost * 16), len(data)))
            res_audio = data[start:end]
            start_end_info = "from {} to {}".format(start / 16000, end / 16000)
            srt_clip, _, srt_index = generate_srt_clip(sentences, start / 16000.0, end / 16000.0, begin_index=srt_index)
            clip_srt += srt_clip
            for _ts in ts[1:]:  # multiple sentence input or multiple output matched
                start, end = _ts
                start = int(min(max(0, start + start_ost * 16), len(data)))
                end = int(min(max(0, end + end_ost * 16), len(data)))
                start_end_info += ", from {} to {}".format(start, end)
                # BUGFIX: the offsets were previously applied a second time
                # when slicing here (data[start+start_ost*16:end+end_ost*16]),
                # shifting every clip after the first.
                res_audio = np.concatenate([res_audio, data[start:end]], -1)
                srt_clip, _, srt_index = generate_srt_clip(sentences, start / 16000.0, end / 16000.0, begin_index=srt_index - 1)
                clip_srt += srt_clip
        if len(ts):
            message = "{} periods found in the speech: ".format(len(ts)) + start_end_info
        else:
            message = "No period found in the speech, return raw speech. You may check the recognition result and try other destination text."
            res_audio = data
        return (sr, res_audio), message, clip_srt

    def video_recog(self, vedio_filename, sd_switch='no'):
        """Extract the audio track of a video file and run ``recog`` on it.

        Writes ``<name>.wav`` next to the input file, then returns recog()'s
        ``(text, srt, state)`` with video bookkeeping added to the state.
        """
        # NOTE(review): assumes the filename ends in a 3-letter extension
        # (e.g. '.mp4'); other extensions would corrupt the derived names.
        clip_video_file = vedio_filename[:-4] + '_clip.mp4'
        video = mpy.VideoFileClip(vedio_filename)
        audio_file = vedio_filename[:-3] + 'wav'
        video.audio.write_audiofile(audio_file)
        wav = librosa.load(audio_file, sr=16000)[0]
        state = {
            'vedio_filename': vedio_filename,
            'clip_video_file': clip_video_file,
            'video': video,
        }
        return self.recog((16000, wav), sd_switch, state)

    def video_clip(self, dest_text, start_ost, end_ost, state, font_size=32, font_color='white', add_sub=False, dest_spk=None):
        """Cut video spans matching *dest_text* (or *dest_spk*).

        Args:
            dest_text: '#'-separated target phrases to locate.
            start_ost / end_ost: start/end offsets in milliseconds applied
                to every matched span.
            state: dict produced by ``video_recog``.
            font_size / font_color: subtitle styling, used when *add_sub*.
            add_sub: burn the clip's subtitles into the output video.
            dest_spk: optional '#'-separated speaker ids; used instead of
                *dest_text* when diarization results are present in state.

        Returns:
            ``(clip_video_file, message, srt_string)``
        """
        # get from state
        recog_res_raw = state['recog_res_raw']
        timestamp = state['timestamp']
        sentences = state['sentences']
        video = state['video']
        clip_video_file = state['clip_video_file']
        vedio_filename = state['vedio_filename']

        all_ts = []
        srt_index = 0
        if dest_spk is None or dest_spk == '' or 'sd_sentences' not in state:
            for _dest_text in dest_text.split('#'):
                _dest_text = pre_proc(_dest_text)
                ts = proc(recog_res_raw, timestamp, _dest_text)
                for _ts in ts:
                    all_ts.append(_ts)
        else:
            for _dest_spk in dest_spk.split('#'):
                ts = proc_spk(_dest_spk, state['sd_sentences'])
                for _ts in ts:
                    all_ts.append(_ts)
        time_acc_ost = 0.0  # accumulated output duration; keeps SRT times continuous
        ts = all_ts
        ts.sort()
        clip_srt = ""
        if len(ts):
            # timestamps are sample indices at 16 kHz -> seconds
            start, end = ts[0][0] / 16000, ts[0][1] / 16000
            srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index, time_acc_ost=time_acc_ost)
            start, end = start + start_ost / 1000.0, end + end_ost / 1000.0
            video_clip = video.subclip(start, end)
            start_end_info = "from {} to {}".format(start, end)
            clip_srt += srt_clip
            if add_sub:
                generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
                subtitles = SubtitlesClip(subs, generator)
                video_clip = CompositeVideoClip([video_clip, subtitles.set_pos(('center', 'bottom'))])
            concate_clip = [video_clip]
            # BUGFIX: the offsets were previously added again on top of the
            # already-offset start/end, overestimating the elapsed duration
            # and skewing the SRT timing of every subsequent clip.
            time_acc_ost += end - start
            for _ts in ts[1:]:
                start, end = _ts[0] / 16000, _ts[1] / 16000
                srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index - 1, time_acc_ost=time_acc_ost)
                start, end = start + start_ost / 1000.0, end + end_ost / 1000.0
                _video_clip = video.subclip(start, end)
                start_end_info += ", from {} to {}".format(start, end)
                clip_srt += srt_clip
                if add_sub:
                    generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
                    subtitles = SubtitlesClip(subs, generator)
                    _video_clip = CompositeVideoClip([_video_clip, subtitles.set_pos(('center', 'bottom'))])
                # copy so later iterations do not alias the same clip object
                concate_clip.append(copy.copy(_video_clip))
                time_acc_ost += end - start  # BUGFIX: see first-clip note above
            message = "{} periods found in the audio: ".format(len(ts)) + start_end_info
            logging.warning("Concating...")
            if len(concate_clip) > 1:
                video_clip = concatenate_videoclips(concate_clip)
            video_clip.write_videofile(clip_video_file, audio_codec="aac")
        else:
            # No match: return the original file untouched.
            clip_video_file = vedio_filename
            message = "No period found in the audio, return raw speech. You may check the recognition result and try other destination text."
            srt_clip = ''
        return clip_video_file, message, clip_srt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|