File size: 7,860 Bytes
33a8656
 
 
b56ae4b
33a8656
d278405
 
33a8656
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import gradio as gr
import os

# os.system('pip install numpy --upgrade')
# update modelscope
# os.system("pip install -U modelscope -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -i https://mirror.sjtu.edu.cn/pypi/web/simple")
os.system('pip install "modelscope[cv]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html')

import datetime
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from subtitle_utils import generate_srt

# Current Beijing time (UTC+8).
# NOTE(review): the original code fed a *naive* utcnow() into astimezone(),
# which interprets it as server-local time, and compensated with a magic
# +16h offset — correct only on a UTC+8 host. An aware UTC timestamp
# converted to an explicit +8 zone is correct on any host.
utc_dt = datetime.datetime.now(datetime.timezone.utc)
beijing_dt = utc_dt.astimezone(datetime.timezone(datetime.timedelta(hours=8)))
formatted = beijing_dt.strftime("%Y-%m-%d_%H")
print(f"北京时间: {beijing_dt.year}年{beijing_dt.month}月{beijing_dt.day}日 "
      f"{beijing_dt.hour}时{beijing_dt.minute}分{beijing_dt.second}秒")

# Per-hour output directory where transcripts/subtitles/extracted audio go.
# exist_ok avoids the check-then-create race of the original os.path.exists test.
works_path = '../works_audio_video_recognize/' + formatted
os.makedirs(works_path, exist_ok=True)
print('作品目录:' + works_path)

# Paraformer-large Chinese ASR pipeline (16 kHz) with VAD and punctuation.
inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
    
def transcript(audiofile, text_file, srt_file):
    """Run ASR on an audio file and write the transcript and SRT subtitles.

    Args:
        audiofile: path of the audio file to recognize.
        text_file: destination path for the plain-text transcript.
        srt_file: destination path for the generated SRT subtitle file.

    Returns:
        Tuple (text_output, srt_output): the transcript string and the SRT
        content string that were written to disk.
    """
    rec_result = inference_pipeline(audio_in=audiofile)

    # Write explicitly as UTF-8: the transcript is Chinese text, and the
    # platform default encoding (e.g. on Windows) may mangle it or raise.
    text_output = rec_result['text']
    with open(text_file, "w", encoding="utf-8") as f:
        f.write(text_output)

    # Sentence-level timestamps from the pipeline drive the SRT generation.
    srt_output = generate_srt(rec_result['sentences'])
    with open(srt_file, "w", encoding="utf-8") as f:
        f.write(srt_output)

    return text_output, srt_output

def audio_recog(audiofile):
    """Recognize speech in an audio file and return transcript + subtitles.

    Args:
        audiofile: path of the audio file to recognize.

    Returns:
        Tuple (text_output, text_file, srt_output, srt_file): transcript
        string, transcript file path, SRT string, SRT file path.
    """
    # Beijing is UTC+8; use an aware timestamp so the timing logs are correct
    # on any host (the original naive utcnow()+astimezone(+16) was only right
    # on UTC+8 machines).
    beijing_tz = datetime.timezone(datetime.timedelta(hours=8))
    beijing_dt = datetime.datetime.now(beijing_tz)
    print(f"开始时间: {beijing_dt.year}年{beijing_dt.month}月{beijing_dt.day}日 "
      f"{beijing_dt.hour}时{beijing_dt.minute}分{beijing_dt.second}秒")

    print("音频文件:" + audiofile)

    # Output files share the input's base name, placed in the works directory.
    filename = os.path.splitext(os.path.basename(audiofile))[0]
    text_file = works_path + '/' + filename + '.txt'
    srt_file = works_path + '/' + filename + '.srt'
    text_output, srt_output = transcript(audiofile, text_file, srt_file)

    beijing_dt = datetime.datetime.now(beijing_tz)
    print(f"结束时间: {beijing_dt.year}年{beijing_dt.month}月{beijing_dt.day}日 "
      f"{beijing_dt.hour}时{beijing_dt.minute}分{beijing_dt.second}秒")

    return text_output, text_file, srt_output, srt_file

def video_recog(filepath):
    """Extract the audio track of a video and run speech recognition on it.

    Args:
        filepath: path of the input video file.

    Returns:
        Tuple (text_output, text_file, srt_output, srt_file) — same as
        audio_recog.
    """
    filename = os.path.splitext(os.path.basename(filepath))[0]
    worksfile = works_path + '/works_' + filename + '.mp4'
    print("视频文件:" + filepath)

    # Timestamped (to the microsecond) name for the extracted audio so
    # concurrent/repeated uploads never collide. Aware UTC+8 time replaces the
    # original naive utcnow()+astimezone(+16), which was host-dependent.
    beijing_dt = datetime.datetime.now(datetime.timezone(datetime.timedelta(hours=8)))
    formatted = beijing_dt.strftime("%Y-%m-%d_%H-%M-%S.%f")

    # Extract the audio track as MP3. Both paths are double-quoted so file
    # names containing spaces don't break the shell command (Gradio temp
    # paths can contain them).
    audiofile = works_path + '/' + formatted + '.mp3'
    os.system(f'ffmpeg -i "{filepath}" -vn -c:a libmp3lame -q:a 4 "{audiofile}"')

    # Recognize the extracted audio.
    text_output, text_file, srt_output, srt_file = audio_recog(audiofile)

#    # Burn subtitles into the video (currently disabled).
#    os.system(f"ffmpeg -i {filepath} -i {srt_file} -c:s mov_text -c:v copy -c:a copy {worksfile}")
#    print("作品:" + worksfile)

    return text_output, text_file, srt_output, srt_file

# Inline CSS: fixed-height example images and a centered, size-capped overview.
css_style = "#fixed_size_img {height: 240px;} " \
            "#overview {margin: auto;max-width: 400px; max-height: 400px;}"

# Page title and intro blurb (user-facing Chinese text shown in the UI).
title = "音视频识别 by宁侠"
description = "您只需要上传一段音频或视频文件,我们的服务会快速对其进行语音识别,然后生成相应的文字和字幕。这样,您就可以轻松地记录下重要的语音内容,或者为视频添加精准的字幕。现在就来试试我们的音视频识别服务吧,让您的生活和工作更加便捷!"

# Sample media bundled with the app for the Examples widgets.
examples_path = 'examples/'
examples = [[examples_path + 'demo_shejipuhui.mp4']]

# gradio interface: two tabs (audio / video), each wiring a "Recognize" button
# to the corresponding *_recog function defined above.
with gr.Blocks(title=title, css=css_style) as demo:
    # Static header banner (title + byline).
    gr.HTML('''
      <div style="text-align: center; max-width: 720px; margin: 0 auto;">
                  <div
                    style="
                      display: inline-flex;
                      align-items: center;
                      gap: 0.8rem;
                      font-size: 1.75rem;
                    "
                  >
                    <h1 style="font-family:  PingFangSC; font-weight: 500; font-size: 36px; margin-bottom: 7px;">
                      音视频识别
                    </h1>
                    <h1 style="font-family: PingFangSC; font-weight: 500; line-height: 1.5em; font-size: 16px; margin-bottom: 7px;">
                      by宁侠
                    </h1>
      ''')
    gr.Markdown(description)
    
    # --- Audio tab: upload/record audio, show transcript + SRT and files. ---
    with gr.Tab("🔊音频识别 Audio Transcribe"):
        with gr.Row():
            with gr.Column():
                # type="filepath" so audio_recog receives a path, not raw samples.
                audio_input = gr.Audio(label="🔊音频输入 Audio Input", type="filepath")
                gr.Examples(['examples/paddlespeech.asr-zh.wav', 'examples/demo_shejipuhui.mp3'], [audio_input])
                audio_recog_button = gr.Button("👂音频识别 Recognize")
            with gr.Column():
                audio_text_output = gr.Textbox(label="✏️识别结果 Recognition Result", max_lines=5)
                audio_text_file = gr.File(label="✏️识别结果文件 Recognition Result File")
                audio_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles", max_lines=10)
                audio_srt_file = gr.File(label="📖SRT字幕文件 SRT File")
                # Hidden controls for a subtitle feature that is not enabled yet.
                audio_subtitles_button = gr.Button("添加字幕\nGenerate Subtitles", visible=False)
                audio_output = gr.Audio(label="🔊音频 Audio", visible=False)
    
    audio_recog_button.click(audio_recog, inputs=[audio_input], outputs=[audio_text_output, audio_text_file, audio_srt_output, audio_srt_file])
#    audio_subtitles_button.click(audio_subtitles, inputs=[audio_text_input], outputs=[audio_output])

    # --- Video tab: upload video; audio is extracted and recognized. ---
    with gr.Tab("🎥视频识别 Video Transcribe"):
        with gr.Row():
            with gr.Column():
                video_input = gr.Video(label="🎥视频输入 Video Input")
                gr.Examples(['examples/demo_shejipuhui.mp4'], [video_input], label='语音识别示例 ASR Demo')
                video_recog_button = gr.Button("👂视频识别 Recognize")
                video_output = gr.Video(label="🎥视频 Video", visible=False)
            with gr.Column():
                video_text_output = gr.Textbox(label="✏️识别结果 Recognition Result", max_lines=5)
                video_text_file = gr.File(label="✏️识别结果文件 Recognition Result File")
                video_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles", max_lines=10)
                video_srt_file = gr.File(label="📖SRT字幕文件 SRT File")
                # Hidden subtitle-styling controls for the disabled burn-in feature.
                with gr.Row(visible=False):
                    font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2, label="🔠字幕字体大小 Subtitle Font Size")
                    font_color = gr.Radio(["black", "white", "green", "red"], label="🌈字幕颜色 Subtitle Color", value='white')
                video_subtitles_button = gr.Button("添加字幕\nGenerate Subtitles", visible=False)
                
    
    video_recog_button.click(video_recog, inputs=[video_input], outputs=[video_text_output, video_text_file, video_srt_output, video_srt_file])
#    video_subtitles_button.click(video_subtitles, inputs=[video_text_input], outputs=[video_output])

# start gradio service in local; queue serializes requests to the single
# ASR pipeline, api_open=False hides the programmatic API.
demo.queue(api_open=False).launch(debug=True)