import os

# Install ModelScope with its CV extras at import time; a demo-only shortcut,
# normally this would live in requirements.txt.
os.system(
    'pip install "modelscope[cv]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html'
)

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
import gradio as gr
import datetime
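
# Beijing is UTC+8; build the timezone once and reuse it for every timestamp below.
BEIJING_TZ = datetime.timezone(datetime.timedelta(hours=8))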

# Get the current Beijing time as an aware datetime
beijing_dt = datetime.datetime.now(BEIJING_TZ)
formatted = beijing_dt.strftime("%Y-%m-%d_%H")
print("Beijing time: " + beijing_dt.strftime("%Y-%m-%d %H:%M:%S"))
# Create the output directory for this run
works_path = '../works_audio_video_transcribe/' + formatted
os.makedirs(works_path, exist_ok=True)
print('Output directory: ' + works_path)

# Load the Paraformer-large Chinese ASR pipeline from ModelScope (16 kHz, with
# voice activity detection and punctuation); weights are downloaded on first use.
inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
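
# A minimal usage sketch of the pipeline on its own, using one of the bundled
# example files: it takes a file path via audio_in and returns a dict whose
# 'text' field holds the transcript.
#   result = inference_pipeline(audio_in='examples/paddlespeech.asr-zh.wav')
#   print(result['text'])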
    
def transcript(audiofile, text_file):
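    """Run ASR on audiofile and write the transcript to text_file."""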
    rec_result = inference_pipeline(audio_in=audiofile)
    print(rec_result['text'])

    # Write as UTF-8 so the Chinese transcript is portable across locales
    with open(text_file, "w", encoding="utf-8") as f:
        f.write(rec_result['text'])

    return rec_result['text']

def audio_recog(audiofile):
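    """Transcribe one audio file; return the text and the saved .txt path."""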
    beijing_dt = datetime.datetime.now(BEIJING_TZ)
    print("Start time: " + beijing_dt.strftime("%Y-%m-%d %H:%M:%S"))

    print("Audio file: " + audiofile)
    
    filename = os.path.splitext(os.path.basename(audiofile))[0]
    text_file = works_path + '/' + filename + '.txt'
    
    text_output = transcript(audiofile, text_file)
    
    beijing_dt = datetime.datetime.now(BEIJING_TZ)
    print("End time: " + beijing_dt.strftime("%Y-%m-%d %H:%M:%S"))
    
    return text_output, text_file

def video_recog(filepath):
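    """Extract the audio track from a video file and transcribe it."""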
    print("Video file: " + filepath)
    
    beijing_dt = datetime.datetime.now(BEIJING_TZ)
    formatted = beijing_dt.strftime("%Y-%m-%d_%H-%M-%S.%f")

    # Extract the audio track to MP3; quote the paths so spaces don't break ffmpeg
    audiofile = works_path + '/' + formatted + '.mp3'
    os.system(f'ffmpeg -i "{filepath}" -vn -c:a libmp3lame -q:a 4 "{audiofile}"')
    
    # Transcribe the extracted audio file
    text_output, text_file = audio_recog(audiofile)
    
    return text_output, text_file

css_style = "#fixed_size_img {height: 240px;} " \
            "#overview {margin: auto;max-width: 400px; max-height: 400px;}"

title = "Audio & Video Transcription by 宁侠"
description = "Just upload an audio or video file, and the service will quickly run speech recognition on it and produce the corresponding text, making it easy to keep a record of important spoken content. Try the audio/video transcription service now and make your life and work more convenient!"

# Build the Gradio UI: one tab for audio transcription, one for video
with gr.Blocks(title=title, css=css_style) as demo:
    gr.HTML('''
      <div style="text-align: center; max-width: 720px; margin: 0 auto;">
                  <div
                    style="
                      display: inline-flex;
                      align-items: center;
                      gap: 0.8rem;
                      font-size: 1.75rem;
                    "
                  >
                    <h1 style="font-family:  PingFangSC; font-weight: 500; font-size: 36px; margin-bottom: 7px;">
                      音视频识别
                    </h1>
                    <h1 style="font-family: PingFangSC; font-weight: 500; line-height: 1.5em; font-size: 16px; margin-bottom: 7px;">
                      by宁侠
                    </h1>
      ''')
    gr.Markdown(description)
    
    with gr.Tab("🔊音频识别 Audio Transcribe"):
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(label="🔊音频输入 Audio Input", type="filepath")
                gr.Examples(['examples/paddlespeech.asr-zh.wav', 'examples/demo_shejipuhui.mp3'], [audio_input])
                audio_recog_button = gr.Button("👂音频识别 Recognize")
            with gr.Column():
                audio_text_output = gr.Textbox(label="✏️识别结果 Recognition Result", max_lines=5)
                audio_text_file = gr.File(label="✏️识别结果文件 Recognition Result File")
                audio_subtitles_button = gr.Button("添加字幕\nGenerate Subtitles", visible=False)
                audio_output = gr.Audio(label="🔊音频 Audio", visible=False)
    
    audio_recog_button.click(audio_recog, inputs=[audio_input], outputs=[audio_text_output, audio_text_file])
#    audio_subtitles_button.click(audio_subtitles, inputs=[audio_text_input], outputs=[audio_output])

    with gr.Tab("🎥视频识别 Video Transcribe"):
        with gr.Row():
            with gr.Column():
                video_input = gr.Video(label="🎥视频输入 Video Input")
                gr.Examples(['examples/demo_shejipuhui.mp4'], [video_input], label='语音识别示例 ASR Demo')
                video_recog_button = gr.Button("👂视频识别 Recognize")
                video_output = gr.Video(label="🎥视频 Video", visible=False)
            with gr.Column():
                video_text_output = gr.Textbox(label="✏️识别结果 Recognition Result", max_lines=5)
                video_text_file = gr.File(label="✏️识别结果文件 Recognition Result File")
                with gr.Row(visible=False):
                    font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2, label="🔠字幕字体大小 Subtitle Font Size")
                    font_color = gr.Radio(["black", "white", "green", "red"], label="🌈字幕颜色 Subtitle Color", value='white')
                video_subtitles_button = gr.Button("添加字幕\nGenerate Subtitles", visible=False)
                
    
    video_recog_button.click(video_recog, inputs=[video_input], outputs=[video_text_output, video_text_file])
#    video_subtitles_button.click(video_subtitles, inputs=[video_text_input], outputs=[video_output])

# Start the Gradio service locally
demo.queue(api_open=False).launch(debug=True)
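
# launch() also accepts share=True (temporary public URL) and
# server_name="0.0.0.0" (listen on all interfaces) if the demo needs to be
# reachable beyond localhost.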