glt3953 committed on
Commit cd29950
1 Parent(s): d278405

Delete videoclipper.py

Files changed (1)
  1. videoclipper.py +0 -172
videoclipper.py DELETED
@@ -1,172 +0,0 @@
- import sys
- import copy
- import librosa
- import logging
- import argparse
- import numpy as np
- import soundfile as sf
- import moviepy.editor as mpy
- # from modelscope.pipelines import pipeline
- # from modelscope.utils.constant import Tasks
- from subtitle_utils import generate_srt, generate_srt_clip, distribute_spk
- from trans_utils import pre_proc, proc, write_state, load_state, proc_spk, generate_vad_data
- # from argparse_tools import ArgumentParser, get_commandline_args
-
- from moviepy.editor import *
- from moviepy.video.tools.subtitles import SubtitlesClip
-
-
- class VideoClipper():
-     def __init__(self, asr_pipeline, sd_pipeline=None):
-         logging.warning("Initializing VideoClipper.")
-         self.asr_pipeline = asr_pipeline
-         self.sd_pipeline = sd_pipeline
-
-     def recog(self, audio_input, sd_switch='no', state=None):
-         if state is None:
-             state = {}
-         sr, data = audio_input
-         assert sr == 16000, "16kHz sample rate required, {} given.".format(sr)
-         if len(data.shape) == 2:  # multi-channel wav input, keep only the first channel
-             logging.warning("Input wav shape: {}, only first channel reserved.".format(data.shape))
-             data = data[:, 0]
-         state['audio_input'] = (sr, data)
-         data = data.astype(np.float64)
-         rec_result = self.asr_pipeline(audio_in=data)
-         if sd_switch == 'yes':
-             vad_data = generate_vad_data(data.astype(np.float32), rec_result['sentences'], sr)
-             sd_result = self.sd_pipeline(audio=vad_data, batch_size=1)
-             rec_result['sd_sentences'] = distribute_spk(rec_result['sentences'], sd_result['text'])
-             res_srt = generate_srt(rec_result['sd_sentences'])
-             state['sd_sentences'] = rec_result['sd_sentences']
-         else:
-             res_srt = generate_srt(rec_result['sentences'])
-         state['recog_res_raw'] = rec_result['text_postprocessed']
-         state['timestamp'] = rec_result['time_stamp']
-         state['sentences'] = rec_result['sentences']
-         res_text = rec_result['text']
-         return res_text, res_srt, state
-
-     def clip(self, dest_text, start_ost, end_ost, state, dest_spk=None):
-         # get from state
-         audio_input = state['audio_input']
-         recog_res_raw = state['recog_res_raw']
-         timestamp = state['timestamp']
-         sentences = state['sentences']
-         sr, data = audio_input
-         data = data.astype(np.float64)
-
-         all_ts = []
-         if dest_spk is None or dest_spk == '' or 'sd_sentences' not in state:
-             for _dest_text in dest_text.split('#'):
-                 _dest_text = pre_proc(_dest_text)
-                 ts = proc(recog_res_raw, timestamp, _dest_text)
-                 for _ts in ts: all_ts.append(_ts)
-         else:
-             for _dest_spk in dest_spk.split('#'):
-                 ts = proc_spk(_dest_spk, state['sd_sentences'])
-                 for _ts in ts: all_ts.append(_ts)
-         ts = all_ts
-         ts.sort()
-         srt_index = 0
-         clip_srt = ""
-         if len(ts):
-             start, end = ts[0]
-             start = min(max(0, start + start_ost*16), len(data))
-             end = min(max(0, end + end_ost*16), len(data))
-             res_audio = data[start:end]
-             start_end_info = "from {} to {}".format(start/16000, end/16000)
-             srt_clip, _, srt_index = generate_srt_clip(sentences, start/16000.0, end/16000.0, begin_index=srt_index)
-             clip_srt += srt_clip
-             for _ts in ts[1:]:  # multiple sentence input or multiple output matched
-                 start, end = _ts
-                 start = min(max(0, start + start_ost*16), len(data))
-                 end = min(max(0, end + end_ost*16), len(data))
-                 start_end_info += ", from {} to {}".format(start/16000, end/16000)
-                 # start/end already include the user offsets, so slice with them directly
-                 res_audio = np.concatenate([res_audio, data[start:end]], -1)
-                 srt_clip, _, srt_index = generate_srt_clip(sentences, start/16000.0, end/16000.0, begin_index=srt_index-1)
-                 clip_srt += srt_clip
-         if len(ts):
-             message = "{} periods found in the speech: ".format(len(ts)) + start_end_info
-         else:
-             message = "No period found in the speech; returning the raw speech. You may check the recognition result and try other destination text."
-             res_audio = data
-         return (sr, res_audio), message, clip_srt
-
-     def video_recog(self, vedio_filename, sd_switch='no'):
-         clip_video_file = vedio_filename[:-4] + '_clip.mp4'
-         video = mpy.VideoFileClip(vedio_filename)
-         # extract the audio track to a 16 kHz wav for recognition
-         audio_file = vedio_filename[:-3] + 'wav'
-         video.audio.write_audiofile(audio_file)
-         wav = librosa.load(audio_file, sr=16000)[0]
-         state = {
-             'vedio_filename': vedio_filename,
-             'clip_video_file': clip_video_file,
-             'video': video,
-         }
-         # res_text, res_srt = self.recog((16000, wav), state)
-         return self.recog((16000, wav), sd_switch, state)
-
-     def video_clip(self, dest_text, start_ost, end_ost, state, font_size=32, font_color='white', add_sub=False, dest_spk=None):
-         # get from state
-         recog_res_raw = state['recog_res_raw']
-         timestamp = state['timestamp']
-         sentences = state['sentences']
-         video = state['video']
-         clip_video_file = state['clip_video_file']
-         vedio_filename = state['vedio_filename']
-
-         all_ts = []
-         srt_index = 0
-         if dest_spk is None or dest_spk == '' or 'sd_sentences' not in state:
-             for _dest_text in dest_text.split('#'):
-                 _dest_text = pre_proc(_dest_text)
-                 ts = proc(recog_res_raw, timestamp, _dest_text)
-                 for _ts in ts: all_ts.append(_ts)
-         else:
-             for _dest_spk in dest_spk.split('#'):
-                 ts = proc_spk(_dest_spk, state['sd_sentences'])
-                 for _ts in ts: all_ts.append(_ts)
-         time_acc_ost = 0.0
-         ts = all_ts
-         ts.sort()
-         clip_srt = ""
-         if len(ts):
-             start, end = ts[0][0] / 16000, ts[0][1] / 16000
-             srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index, time_acc_ost=time_acc_ost)
-             start, end = start + start_ost/1000.0, end + end_ost/1000.0
-             video_clip = video.subclip(start, end)
-             start_end_info = "from {} to {}".format(start, end)
-             clip_srt += srt_clip
-             if add_sub:
-                 generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
-                 subtitles = SubtitlesClip(subs, generator)
-                 video_clip = CompositeVideoClip([video_clip, subtitles.set_pos(('center', 'bottom'))])
-             concate_clip = [video_clip]
-             time_acc_ost += end + end_ost/1000.0 - (start + start_ost/1000.0)
-             for _ts in ts[1:]:
-                 start, end = _ts[0] / 16000, _ts[1] / 16000
-                 srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index-1, time_acc_ost=time_acc_ost)
-                 start, end = start + start_ost/1000.0, end + end_ost/1000.0
-                 _video_clip = video.subclip(start, end)
-                 start_end_info += ", from {} to {}".format(start, end)
-                 clip_srt += srt_clip
-                 if add_sub:
-                     generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
-                     subtitles = SubtitlesClip(subs, generator)
-                     _video_clip = CompositeVideoClip([_video_clip, subtitles.set_pos(('center', 'bottom'))])
-                 concate_clip.append(copy.copy(_video_clip))
-                 time_acc_ost += end + end_ost/1000.0 - (start + start_ost/1000.0)
-             message = "{} periods found in the audio: ".format(len(ts)) + start_end_info
-             logging.warning("Concatenating...")
-             if len(concate_clip) > 1:
-                 video_clip = concatenate_videoclips(concate_clip)
-             video_clip.write_videofile(clip_video_file, audio_codec="aac")
-         else:
-             clip_video_file = vedio_filename
-             message = "No period found in the audio; returning the raw video. You may check the recognition result and try other destination text."
-             srt_clip = ''
-         return clip_video_file, message, clip_srt
-
-
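For reference, a minimal usage sketch of the deleted class (not part of this commit): it assumes the ModelScope pipelines hinted at by the imports commented out at the top of videoclipper.py, and the model ID is a hypothetical placeholder, not a name taken from this repository.

# Minimal usage sketch, assuming the commented-out ModelScope imports;
# '<asr-model-id>' is a hypothetical placeholder for a real ASR model.
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

from videoclipper import VideoClipper

# Build an ASR pipeline; the speaker-diarization pipeline is optional and omitted here.
asr_pipeline = pipeline(task=Tasks.auto_speech_recognition, model='<asr-model-id>')
clipper = VideoClipper(asr_pipeline)

# Transcribe the video; `state` carries the text, timestamps, and sentences
# that the clipping step needs.
res_text, res_srt, state = clipper.video_recog('input.mp4')

# Cut every segment whose transcript matches the target text ('#' separates
# multiple targets); start/end offsets are given in milliseconds.
clip_file, message, clip_srt = clipper.video_clip('target phrase', 0, 0, state)
print(message)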