glt3953 committed on
Commit
33a8656
1 Parent(s): 019e962

Upload 3 files

Files changed (3)
  1. app.py +154 -0
  2. subtitle_utils.py +130 -0
  3. videoclipper.py +172 -0
app.py ADDED
@@ -0,0 +1,154 @@
+ import gradio as gr
+ import os
+
+ # Update modelscope at startup
+ os.system("pip install -U modelscope -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -i https://mirror.sjtu.edu.cn/pypi/web/simple")
+
+ import datetime
+ from modelscope.pipelines import pipeline
+ from modelscope.utils.constant import Tasks
+ from subtitle_utils import generate_srt
+
+ # Get the current Beijing time (UTC+8); utcnow() is naive, so attach UTC before converting
+ utc_dt = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
+ beijing_dt = utc_dt.astimezone(datetime.timezone(datetime.timedelta(hours=8)))
+ formatted = beijing_dt.strftime("%Y-%m-%d_%H")
+ print("Beijing time: " + beijing_dt.strftime("%Y-%m-%d %H:%M:%S"))
+ # Create the output directory
+ works_path = '../works_audio_video_recognize/' + formatted
+ os.makedirs(works_path, exist_ok=True)
+ print('Output directory: ' + works_path)
+
+ inference_pipeline = pipeline(
+     task=Tasks.auto_speech_recognition,
+     model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
+
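+ # Expected shape of the pipeline result (inferred from how it is consumed below,
+ # not from the model card): rec_result['text'] is the full transcript, and
+ # rec_result['sentences'] is a list of dicts carrying 'text_seg' (space-separated
+ # tokens) and 'ts_list' ([start_ms, end_ms] per token) for subtitle generation.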
+ def transcript(audiofile, text_file, srt_file):
+     rec_result = inference_pipeline(audio_in=audiofile)
+
+     text_output = rec_result['text']
+     with open(text_file, "w") as f:
+         f.write(text_output)
+
+     srt_output = generate_srt(rec_result['sentences'])
+     with open(srt_file, "w") as f:
+         f.write(srt_output)
+
+     return text_output, srt_output
+
+ def audio_recog(audiofile):
+     beijing_dt = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).astimezone(datetime.timezone(datetime.timedelta(hours=8)))
+     print("Start time: " + beijing_dt.strftime("%Y-%m-%d %H:%M:%S"))
+
+     print("Audio file: " + audiofile)
+
+     filename = os.path.splitext(os.path.basename(audiofile))[0]
+     text_file = works_path + '/' + filename + '.txt'
+     srt_file = works_path + '/' + filename + '.srt'
+     text_output, srt_output = transcript(audiofile, text_file, srt_file)
+
+     beijing_dt = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).astimezone(datetime.timezone(datetime.timedelta(hours=8)))
+     print("End time: " + beijing_dt.strftime("%Y-%m-%d %H:%M:%S"))
+
+     return text_output, text_file, srt_output, srt_file
+
+ def video_recog(filepath):
+     filename = os.path.splitext(os.path.basename(filepath))[0]
+     worksfile = works_path + '/works_' + filename + '.mp4'
+     print("Video file: " + filepath)
+
+     beijing_dt = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).astimezone(datetime.timezone(datetime.timedelta(hours=8)))
+     formatted = beijing_dt.strftime("%Y-%m-%d_%H-%M-%S.%f")
+
+     # Extract the audio track as mp3 (paths quoted in case they contain spaces)
+     audiofile = works_path + '/' + formatted + '.mp3'
+     os.system(f'ffmpeg -i "{filepath}" -vn -c:a libmp3lame -q:a 4 "{audiofile}"')
+
+     # Recognize the extracted audio
+     text_output, text_file, srt_output, srt_file = audio_recog(audiofile)
+
+     # # Burn the subtitles into the video
+     # os.system(f'ffmpeg -i "{filepath}" -i "{srt_file}" -c:s mov_text -c:v copy -c:a copy "{worksfile}"')
+     # print("Output: " + worksfile)
+
+     return text_output, text_file, srt_output, srt_file
+
+ css_style = "#fixed_size_img {height: 240px;} " \
+             "#overview {margin: auto; max-width: 400px; max-height: 400px;}"
+
+ title = "音视频识别 by宁侠"
+ description = "您只需要上传一段音频或视频文件,我们的服务会快速对其进行语音识别,然后生成相应的文字和字幕。这样,您就可以轻松地记录下重要的语音内容,或者为视频添加精准的字幕。现在就来试试我们的音视频识别服务吧,让您的生活和工作更加便捷!"
+
+ examples_path = 'examples/'
+ examples = [[examples_path + 'demo_shejipuhui.mp4']]
+
+ # Gradio interface
+ with gr.Blocks(title=title, css=css_style) as demo:
+     gr.HTML('''
+     <div style="text-align: center; max-width: 720px; margin: 0 auto;">
+       <div
+         style="
+           display: inline-flex;
+           align-items: center;
+           gap: 0.8rem;
+           font-size: 1.75rem;
+         "
+       >
+         <h1 style="font-family: PingFangSC; font-weight: 500; font-size: 36px; margin-bottom: 7px;">
+           音视频识别
+         </h1>
+         <h1 style="font-family: PingFangSC; font-weight: 500; line-height: 1.5em; font-size: 16px; margin-bottom: 7px;">
+           by宁侠
+         </h1>
+       </div>
+     </div>
+     ''')
+     gr.Markdown(description)
+
+     with gr.Tab("🔊音频识别 Audio Transcribe"):
+         with gr.Row():
+             with gr.Column():
+                 audio_input = gr.Audio(label="🔊音频输入 Audio Input", type="filepath")
+                 gr.Examples(['examples/paddlespeech.asr-zh.wav', 'examples/demo_shejipuhui.mp3'], [audio_input])
+                 audio_recog_button = gr.Button("👂音频识别 Recognize")
+             with gr.Column():
+                 audio_text_output = gr.Textbox(label="✏️识别结果 Recognition Result", max_lines=5)
+                 audio_text_file = gr.File(label="✏️识别结果文件 Recognition Result File")
+                 audio_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles", max_lines=10)
+                 audio_srt_file = gr.File(label="📖SRT字幕文件 SRT File")
+                 audio_subtitles_button = gr.Button("添加字幕\nGenerate Subtitles", visible=False)
+                 audio_output = gr.Audio(label="🔊音频 Audio", visible=False)
+
+         audio_recog_button.click(audio_recog, inputs=[audio_input], outputs=[audio_text_output, audio_text_file, audio_srt_output, audio_srt_file])
+         # audio_subtitles_button.click(audio_subtitles, inputs=[audio_text_input], outputs=[audio_output])
+
+     with gr.Tab("🎥视频识别 Video Transcribe"):
+         with gr.Row():
+             with gr.Column():
+                 video_input = gr.Video(label="🎥视频输入 Video Input")
+                 gr.Examples(['examples/demo_shejipuhui.mp4'], [video_input], label='语音识别示例 ASR Demo')
+                 video_recog_button = gr.Button("👂视频识别 Recognize")
+                 video_output = gr.Video(label="🎥视频 Video", visible=False)
+             with gr.Column():
+                 video_text_output = gr.Textbox(label="✏️识别结果 Recognition Result", max_lines=5)
+                 video_text_file = gr.File(label="✏️识别结果文件 Recognition Result File")
+                 video_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles", max_lines=10)
+                 video_srt_file = gr.File(label="📖SRT字幕文件 SRT File")
+                 with gr.Row(visible=False):
+                     font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2, label="🔠字幕字体大小 Subtitle Font Size")
+                     font_color = gr.Radio(["black", "white", "green", "red"], label="🌈字幕颜色 Subtitle Color", value='white')
+                 video_subtitles_button = gr.Button("添加字幕\nGenerate Subtitles", visible=False)
+
+         video_recog_button.click(video_recog, inputs=[video_input], outputs=[video_text_output, video_text_file, video_srt_output, video_srt_file])
+         # video_subtitles_button.click(video_subtitles, inputs=[video_text_input], outputs=[video_output])
+
+ # Start the Gradio service locally
+ demo.queue(api_open=False).launch(debug=True)
subtitle_utils.py ADDED
@@ -0,0 +1,130 @@
+ def time_convert(ms):
+     # Convert a duration in milliseconds to an SRT timestamp: HH:MM:SS,mmm
+     ms = int(ms)
+     tail = ms % 1000
+     s = ms // 1000
+     mi = s // 60
+     s = s % 60
+     h = mi // 60
+     mi = mi % 60
+     # Zero-pad every field; SRT requires exactly three digits after the comma
+     return "{:02d}:{:02d}:{:02d},{:03d}".format(h, mi, s, tail)
+
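+ # Quick sanity check of the expected format:
+ #   time_convert(3723004) -> "01:02:03,004"
+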
+ class Text2SRT():
+     def __init__(self, text_seg, ts_list, offset=0):
+         self.token_list = [i for i in text_seg.split() if len(i)]
+         self.ts_list = ts_list
+         start, end = ts_list[0][0] - offset, ts_list[-1][1] - offset
+         self.start_sec, self.end_sec = start, end
+         self.start_time = time_convert(start)
+         self.end_time = time_convert(end)
+
+     def text(self):
+         # Join tokens, inserting spaces only around non-CJK words
+         res = ""
+         for word in self.token_list:
+             if '\u4e00' <= word <= '\u9fff':
+                 res += word
+             else:
+                 res += " " + word
+         return res
+
+     def len(self):
+         return len(self.token_list)
+
+     def srt(self, acc_ost=0.0):
+         return "{} --> {}\n{}\n".format(
+             time_convert(self.start_sec + acc_ost * 1000),
+             time_convert(self.end_sec + acc_ost * 1000),
+             self.text())
+
+     def time(self, acc_ost=0.0):
+         return (self.start_sec / 1000 + acc_ost, self.end_sec / 1000 + acc_ost)
+
+ def distribute_spk(sentence_list, sd_time_list):
+     # Assign each sentence the speaker whose diarization segment overlaps it most
+     sd_sentence_list = []
+     for d in sentence_list:
+         sentence_start = d['ts_list'][0][0]
+         sentence_end = d['ts_list'][-1][1]
+         sentence_spk = 0
+         max_overlap = 0
+         for sd_time in sd_time_list:
+             spk_st, spk_ed, spk = sd_time
+             spk_st = spk_st * 1000
+             spk_ed = spk_ed * 1000
+             overlap = max(
+                 min(sentence_end, spk_ed) - max(sentence_start, spk_st), 0)
+             if overlap > max_overlap:
+                 max_overlap = overlap
+                 sentence_spk = spk
+         d['spk'] = sentence_spk
+         sd_sentence_list.append(d)
+     return sd_sentence_list
+
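+ # Minimal usage sketch (hypothetical data; shapes inferred from the code above):
+ #   sents = [{'text_seg': '你 好', 'ts_list': [[0, 400], [400, 800]]}]
+ #   distribute_spk(sents, [(0.0, 1.0, 1)])  # each sentence dict gains 'spk': 1
+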
+ def generate_srt(sentence_list):
+     srt_total = ''
+     # SRT counters are 1-based and entries are separated by a blank line
+     for i, d in enumerate(sentence_list, start=1):
+         t2s = Text2SRT(d['text_seg'], d['ts_list'])
+         if 'spk' in d:
+             srt_total += "{} spk{}\n{}\n".format(i, d['spk'], t2s.srt())
+         else:
+             srt_total += "{}\n{}\n".format(i, t2s.srt())
+     return srt_total
+
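+ # Example of a single resulting entry (1-based counter, time range, text, blank separator):
+ #   1
+ #   00:00:00,000 --> 00:00:00,800
+ #   你好
+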
+ def generate_srt_clip(sentence_list, start, end, begin_index=0, time_acc_ost=0.0):
+     # Build SRT entries for sentences overlapping the [start, end] window (seconds)
+     start, end = int(start * 1000), int(end * 1000)
+     srt_total = ''
+     cc = 1 + begin_index
+     subs = []
+     for i, d in enumerate(sentence_list):
+         if d['ts_list'][-1][1] <= start:
+             continue
+         if d['ts_list'][0][0] >= end:
+             break
+         # sentences that fall entirely inside the window
+         if (d['ts_list'][-1][1] <= end and d['ts_list'][0][0] > start) or (d['ts_list'][-1][1] == end and d['ts_list'][0][0] == start):
+             t2s = Text2SRT(d['text_seg'], d['ts_list'], offset=start)
+             srt_total += "{}\n{}\n".format(cc, t2s.srt(time_acc_ost))
+             subs.append((t2s.time(time_acc_ost), t2s.text()))
+             cc += 1
+             continue
+         # sentence straddles the window start (and possibly the end as well)
+         if d['ts_list'][0][0] <= start:
+             if not d['ts_list'][-1][1] > end:
+                 # drop the tokens that end before the window starts
+                 for j, ts in enumerate(d['ts_list']):
+                     if ts[1] > start:
+                         break
+                 _text = " ".join(d['text_seg'].split()[j:])
+                 _ts = d['ts_list'][j:]
+             else:
+                 # keep only the tokens inside the window
+                 for j, ts in enumerate(d['ts_list']):
+                     if ts[1] > start:
+                         _start = j
+                         break
+                 for j, ts in enumerate(d['ts_list']):
+                     if ts[1] > end:
+                         _end = j
+                         break
+                 _text = " ".join(d['text_seg'].split()[_start:_end])
+                 _ts = d['ts_list'][_start:_end]
+             if len(_ts):
+                 t2s = Text2SRT(_text, _ts, offset=start)
+                 srt_total += "{}\n{}\n".format(cc, t2s.srt(time_acc_ost))
+                 subs.append((t2s.time(time_acc_ost), t2s.text()))
+                 cc += 1
+             continue
+         # sentence straddles the window end
+         if d['ts_list'][-1][1] > end:
+             for j, ts in enumerate(d['ts_list']):
+                 if ts[1] > end:
+                     break
+             _text = " ".join(d['text_seg'].split()[:j])
+             _ts = d['ts_list'][:j]
+             if len(_ts):
+                 t2s = Text2SRT(_text, _ts, offset=start)
+                 srt_total += "{}\n{}\n".format(cc, t2s.srt(time_acc_ost))
+                 subs.append(
+                     (t2s.time(time_acc_ost), t2s.text())
+                 )
+                 cc += 1
+             continue
+     return srt_total, subs, cc
videoclipper.py ADDED
@@ -0,0 +1,172 @@
+ import copy
+ import logging
+
+ import librosa
+ import numpy as np
+ import moviepy.editor as mpy
+ # from modelscope.pipelines import pipeline
+ # from modelscope.utils.constant import Tasks
+ from subtitle_utils import generate_srt, generate_srt_clip, distribute_spk
+ from trans_utils import pre_proc, proc, write_state, load_state, proc_spk, generate_vad_data
+ # from argparse_tools import ArgumentParser, get_commandline_args
+
+ from moviepy.editor import *
+ from moviepy.video.tools.subtitles import SubtitlesClip
+
+
+ class VideoClipper():
+     def __init__(self, asr_pipeline, sd_pipeline=None):
+         logging.warning("Initializing VideoClipper.")
+         self.asr_pipeline = asr_pipeline
+         self.sd_pipeline = sd_pipeline
+
+     def recog(self, audio_input, sd_switch='no', state=None):
+         if state is None:
+             state = {}
+         sr, data = audio_input
+         assert sr == 16000, "16kHz sample rate required, {} given.".format(sr)
+         if len(data.shape) == 2:  # multi-channel wav input
+             logging.warning("Input wav shape: {}, only first channel reserved.".format(data.shape))
+             data = data[:, 0]
+         state['audio_input'] = (sr, data)
+         data = data.astype(np.float64)
+         rec_result = self.asr_pipeline(audio_in=data)
+         if sd_switch == 'yes':
+             # Optional speaker diarization: tag each sentence with a speaker id
+             vad_data = generate_vad_data(data.astype(np.float32), rec_result['sentences'], sr)
+             sd_result = self.sd_pipeline(audio=vad_data, batch_size=1)
+             rec_result['sd_sentences'] = distribute_spk(rec_result['sentences'], sd_result['text'])
+             res_srt = generate_srt(rec_result['sd_sentences'])
+             state['sd_sentences'] = rec_result['sd_sentences']
+         else:
+             res_srt = generate_srt(rec_result['sentences'])
+         state['recog_res_raw'] = rec_result['text_postprocessed']
+         state['timestamp'] = rec_result['time_stamp']
+         state['sentences'] = rec_result['sentences']
+         res_text = rec_result['text']
+         return res_text, res_srt, state
+
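+     # The `state` returned by recog() is consumed by clip() and video_clip(); it
+     # carries 'audio_input', 'recog_res_raw', 'timestamp' and 'sentences'
+     # (plus 'sd_sentences' when speaker diarization ran).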
+     def clip(self, dest_text, start_ost, end_ost, state, dest_spk=None):
+         # get from state
+         audio_input = state['audio_input']
+         recog_res_raw = state['recog_res_raw']
+         timestamp = state['timestamp']
+         sentences = state['sentences']
+         sr, data = audio_input
+         data = data.astype(np.float64)
+
+         all_ts = []
+         if dest_spk is None or dest_spk == '' or 'sd_sentences' not in state:
+             # Locate every '#'-separated destination text in the recognition result
+             for _dest_text in dest_text.split('#'):
+                 _dest_text = pre_proc(_dest_text)
+                 ts = proc(recog_res_raw, timestamp, _dest_text)
+                 for _ts in ts: all_ts.append(_ts)
+         else:
+             for _dest_spk in dest_spk.split('#'):
+                 ts = proc_spk(_dest_spk, state['sd_sentences'])
+                 for _ts in ts: all_ts.append(_ts)
+         ts = all_ts
+         ts.sort()
+         srt_index = 0
+         clip_srt = ""
+         if len(ts):
+             # Offsets are given in ms; at 16 kHz one ms equals 16 samples
+             start, end = ts[0]
+             start = min(max(0, start + start_ost * 16), len(data))
+             end = min(max(0, end + end_ost * 16), len(data))
+             res_audio = data[start:end]
+             start_end_info = "from {} to {}".format(start / 16000, end / 16000)
+             srt_clip, _, srt_index = generate_srt_clip(sentences, start / 16000.0, end / 16000.0, begin_index=srt_index)
+             clip_srt += srt_clip
+             for _ts in ts[1:]:  # multiple sentence input or multiple output matched
+                 start, end = _ts
+                 start = min(max(0, start + start_ost * 16), len(data))
+                 end = min(max(0, end + end_ost * 16), len(data))
+                 start_end_info += ", from {} to {}".format(start / 16000, end / 16000)
+                 # start/end already include the offsets, so slice directly
+                 res_audio = np.concatenate([res_audio, data[start:end]], -1)
+                 srt_clip, _, srt_index = generate_srt_clip(sentences, start / 16000.0, end / 16000.0, begin_index=srt_index - 1)
+                 clip_srt += srt_clip
+         if len(ts):
+             message = "{} periods found in the speech: ".format(len(ts)) + start_end_info
+         else:
+             message = "No period found in the speech, return raw speech. You may check the recognition result and try other destination text."
+             res_audio = data
+         return (sr, res_audio), message, clip_srt
+
+     def video_recog(self, video_filename, sd_switch='no'):
+         clip_video_file = video_filename[:-4] + '_clip.mp4'
+         video = mpy.VideoFileClip(video_filename)
+         # Extract the audio track and resample to 16 kHz for ASR
+         audio_file = video_filename[:-3] + 'wav'
+         video.audio.write_audiofile(audio_file)
+         wav = librosa.load(audio_file, sr=16000)[0]
+         state = {
+             'video_filename': video_filename,
+             'clip_video_file': clip_video_file,
+             'video': video,
+         }
+         # res_text, res_srt = self.recog((16000, wav), state)
+         return self.recog((16000, wav), sd_switch, state)
+
+     def video_clip(self, dest_text, start_ost, end_ost, state, font_size=32, font_color='white', add_sub=False, dest_spk=None):
+         # get from state
+         recog_res_raw = state['recog_res_raw']
+         timestamp = state['timestamp']
+         sentences = state['sentences']
+         video = state['video']
+         clip_video_file = state['clip_video_file']
+         video_filename = state['video_filename']
+
+         all_ts = []
+         srt_index = 0
+         if dest_spk is None or dest_spk == '' or 'sd_sentences' not in state:
+             for _dest_text in dest_text.split('#'):
+                 _dest_text = pre_proc(_dest_text)
+                 ts = proc(recog_res_raw, timestamp, _dest_text)
+                 for _ts in ts: all_ts.append(_ts)
+         else:
+             for _dest_spk in dest_spk.split('#'):
+                 ts = proc_spk(_dest_spk, state['sd_sentences'])
+                 for _ts in ts: all_ts.append(_ts)
+         time_acc_ost = 0.0
+         ts = all_ts
+         ts.sort()
+         clip_srt = ""
+         if len(ts):
+             # Timestamps are sample indices at 16 kHz; convert to seconds
+             start, end = ts[0][0] / 16000, ts[0][1] / 16000
+             srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index, time_acc_ost=time_acc_ost)
+             start, end = start + start_ost / 1000.0, end + end_ost / 1000.0
+             video_clip = video.subclip(start, end)
+             start_end_info = "from {} to {}".format(start, end)
+             clip_srt += srt_clip
+             if add_sub:
+                 generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
+                 subtitles = SubtitlesClip(subs, generator)
+                 video_clip = CompositeVideoClip([video_clip, subtitles.set_pos(('center', 'bottom'))])
+             concate_clip = [video_clip]
+             # start/end already include the offsets, so accumulate the plain duration
+             time_acc_ost += end - start
+             for _ts in ts[1:]:
+                 start, end = _ts[0] / 16000, _ts[1] / 16000
+                 srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index - 1, time_acc_ost=time_acc_ost)
+                 start, end = start + start_ost / 1000.0, end + end_ost / 1000.0
+                 _video_clip = video.subclip(start, end)
+                 start_end_info += ", from {} to {}".format(start, end)
+                 clip_srt += srt_clip
+                 if add_sub:
+                     generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
+                     subtitles = SubtitlesClip(subs, generator)
+                     _video_clip = CompositeVideoClip([_video_clip, subtitles.set_pos(('center', 'bottom'))])
+                 concate_clip.append(copy.copy(_video_clip))
+                 time_acc_ost += end - start
+             message = "{} periods found in the audio: ".format(len(ts)) + start_end_info
+             logging.warning("Concatenating...")
+             if len(concate_clip) > 1:
+                 video_clip = concatenate_videoclips(concate_clip)
+             video_clip.write_videofile(clip_video_file, audio_codec="aac")
+         else:
+             clip_video_file = video_filename
+             message = "No period found in the audio, return raw speech. You may check the recognition result and try other destination text."
+             srt_clip = ''
+         return clip_video_file, message, clip_srt
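+
+ # Minimal usage sketch (hypothetical paths and destination text; assumes an ASR
+ # pipeline like the one built in app.py):
+ #   clipper = VideoClipper(asr_pipeline)
+ #   text, srt, state = clipper.video_recog('demo.mp4')
+ #   clip_file, message, clip_srt = clipper.video_clip('目标文本', 0, 0, state)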