# Page header captured together with the file (not part of the program):
# glt3953's picture
# Update app.py
# 139a744
# raw
# history blame
# No virus
# 6.68 kB
import datetime
import os
import subprocess
import sys

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
import gradio as gr
# Install the ModelScope computer-vision extras at start-up.
# Invoking pip as "<this interpreter> -m pip" makes sure the packages are
# installed into the environment this script is running in, and passing an
# argument list avoids going through a shell.
_pip_result = subprocess.run(
    [
        sys.executable, "-m", "pip", "install", "modelscope[cv]",
        "-f", "https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html",
    ],
    check=False,  # stay best-effort: a failed install must not abort start-up
)
if _pip_result.returncode != 0:
    # The original silently ignored failures; at least leave a trace in the log.
    print(f"pip install of modelscope[cv] failed with exit code {_pip_result.returncode}")
# Get the current Beijing time (UTC+8).
# Start from a timezone-aware UTC "now": astimezone() interprets a naive
# utcnow() value as *local* time, and Beijing's offset is +8h, not +16h.
utc_dt = datetime.datetime.now(datetime.timezone.utc)
beijing_dt = utc_dt.astimezone(datetime.timezone(datetime.timedelta(hours=8)))
formatted = beijing_dt.strftime("%Y-%m-%d_%H")
# Unit markers between the fields keep the numbers from running together.
print(f"北京时间: {beijing_dt.year}年{beijing_dt.month}月{beijing_dt.day}日 "
      f"{beijing_dt.hour}时{beijing_dt.minute}分{beijing_dt.second}秒")

# Create the directory that stores the generated works; its name carries the
# date and hour at which the script started.
works_path = '../works_audio_video_transcribe/' + formatted
os.makedirs(works_path, exist_ok=True)  # no exists()/makedirs() race
print('作品目录:' + works_path)
# Build the speech-recognition pipeline once, at import time; transcript()
# calls this shared instance for every request.
_ASR_MODEL_ID = 'damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
inference_pipeline = pipeline(task=Tasks.auto_speech_recognition, model=_ASR_MODEL_ID)
def transcript(audiofile, text_file):
    """Run speech recognition on an audio file and save the transcript.

    Args:
        audiofile: Path of the audio file; passed to the pipeline as
            ``audio_in``.
        text_file: Path of the text file that receives the transcript.
            It is overwritten if it already exists.

    Returns:
        The recognized text, i.e. the ``'text'`` entry of the pipeline result.
    """
    rec_result = inference_pipeline(audio_in=audiofile)
    text = rec_result['text']
    print(text)
    # Pin the encoding: without it the file is written in the locale's default
    # encoding, which may be unable to represent the Chinese transcript.
    with open(text_file, "w", encoding="utf-8") as f:
        f.write(text)
    return text
def audio_recog(audiofile):
    """Transcribe an audio file and store the transcript in the works directory.

    The transcript is written to ``<works_path>/<audio base name>.txt``.
    Start and end times (Beijing time) are printed to the log.

    Args:
        audiofile: Path of the audio file to recognize.

    Returns:
        A ``(text_output, text_file)`` tuple: the recognized text and the path
        of the transcript file.

    Raises:
        gr.Error: If no audio file was provided.
    """
    if not audiofile:
        # Without this guard the string concatenation below fails with an
        # opaque TypeError when the button is clicked with no input.
        raise gr.Error("请先上传音频文件 Please provide an audio file first.")

    # Beijing is UTC+8; ask for an aware "now" directly in that zone.
    beijing_tz = datetime.timezone(datetime.timedelta(hours=8))

    start_dt = datetime.datetime.now(beijing_tz)
    print(f"开始时间: {start_dt.year}年{start_dt.month}月{start_dt.day}日 "
          f"{start_dt.hour}时{start_dt.minute}分{start_dt.second}秒")
    print("音频文件:" + audiofile)

    filename = os.path.splitext(os.path.basename(audiofile))[0]
    text_file = works_path + '/' + filename + '.txt'
    text_output = transcript(audiofile, text_file)

    end_dt = datetime.datetime.now(beijing_tz)
    print(f"结束时间: {end_dt.year}年{end_dt.month}月{end_dt.day}日 "
          f"{end_dt.hour}时{end_dt.minute}分{end_dt.second}秒")

    return text_output, text_file
def video_recog(filepath):
    """Extract the audio track of a video with ffmpeg and transcribe it.

    The audio is written as an MP3 file into ``works_path``, named after the
    current Beijing time (down to microseconds), and then handed to
    ``audio_recog``.

    Args:
        filepath: Path of the video file.

    Returns:
        The ``(text_output, text_file)`` tuple returned by ``audio_recog``.

    Raises:
        gr.Error: If no video file was provided.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    if not filepath:
        # Without this guard the string concatenation below fails with an
        # opaque TypeError when the button is clicked with no input.
        raise gr.Error("请先上传视频文件 Please provide a video file first.")

    print("视频文件:" + filepath)

    # Beijing is UTC+8; ask for an aware "now" directly in that zone.
    beijing_tz = datetime.timezone(datetime.timedelta(hours=8))
    formatted = datetime.datetime.now(beijing_tz).strftime("%Y-%m-%d_%H-%M-%S.%f")

    # Extract the audio track as MP3.
    audiofile = works_path + '/' + formatted + '.mp3'
    # Pass an argument list instead of a shell string: the uploaded file's
    # path is external input, and spaces or shell metacharacters in it would
    # otherwise break (or hijack) the command.
    subprocess.run(
        ["ffmpeg", "-i", filepath, "-vn", "-c:a", "libmp3lame", "-q:a", "4", audiofile],
        check=True,  # fail here rather than later on a missing audio file
    )

    # Recognize the extracted audio file.
    return audio_recog(audiofile)
# Custom CSS handed to gr.Blocks; both rules select elements by id.
css_style = (
    "#fixed_size_img {height: 240px;} "
    "#overview {margin: auto;max-width: 400px; max-height: 400px;}"
)

# Title passed to gr.Blocks and the description rendered as Markdown in the UI.
title = "音视频识别 by宁侠"
description = "您只需要上传一段音频或视频文件,我们的服务会快速对其进行语音识别,然后生成相应的文字。这样,您就可以轻松地记录下重要的语音内容。现在就来试试我们的音视频识别服务吧,让您的生活和工作更加便捷!"

# Location of the bundled example media and the example list built from it.
examples_path = 'examples/'
examples = [[f"{examples_path}demo_shejipuhui.mp4"]]
# Gradio interface: a header, the description, and two tabs (audio / video).
# Each tab has an input column with examples and a "Recognize" button, and an
# output column with the recognized text and the transcript file.
# NOTE(review): the nesting of the `with` blocks was reconstructed from a
# source whose indentation had been lost — confirm the placement of the hidden
# widgets (audio_output, video_output, subtitle controls) against the original.
with gr.Blocks(title=title, css=css_style) as demo:
    # Page header. The two <div> elements are now closed explicitly; the
    # original markup left both of them open.
    gr.HTML('''
<div style="text-align: center; max-width: 720px; margin: 0 auto;">
<div
style="
display: inline-flex;
align-items: center;
gap: 0.8rem;
font-size: 1.75rem;
"
>
<h1 style="font-family: PingFangSC; font-weight: 500; font-size: 36px; margin-bottom: 7px;">
音视频识别
</h1>
<h1 style="font-family: PingFangSC; font-weight: 500; line-height: 1.5em; font-size: 16px; margin-bottom: 7px;">
by宁侠
</h1>
</div>
</div>
''')
    gr.Markdown(description)

    # --- Audio tab -----------------------------------------------------------
    with gr.Tab("🔊音频识别 Audio Transcribe"):
        with gr.Row():
            with gr.Column():
                # type="filepath" makes the handler receive a path string.
                audio_input = gr.Audio(label="🔊音频输入 Audio Input", type="filepath")
                gr.Examples(['examples/paddlespeech.asr-zh.wav', 'examples/demo_shejipuhui.mp3'], [audio_input])
                audio_recog_button = gr.Button("👂音频识别 Recognize")
            with gr.Column():
                audio_text_output = gr.Textbox(label="✏️识别结果 Recognition Result", max_lines=5)
                audio_text_file = gr.File(label="✏️识别结果文件 Recognition Result File")
                # Subtitle widgets are created hidden; their handler is not wired up.
                audio_subtitles_button = gr.Button("添加字幕\nGenerate Subtitles", visible=False)
                audio_output = gr.Audio(label="🔊音频 Audio", visible=False)
        audio_recog_button.click(audio_recog, inputs=[audio_input], outputs=[audio_text_output, audio_text_file])
        # audio_subtitles_button.click(audio_subtitles, inputs=[audio_text_input], outputs=[audio_output])

    # --- Video tab -----------------------------------------------------------
    with gr.Tab("🎥视频识别 Video Transcribe"):
        with gr.Row():
            with gr.Column():
                video_input = gr.Video(label="🎥视频输入 Video Input")
                gr.Examples(['examples/demo_shejipuhui.mp4'], [video_input], label='语音识别示例 ASR Demo')
                video_recog_button = gr.Button("👂视频识别 Recognize")
                video_output = gr.Video(label="🎥视频 Video", visible=False)
            with gr.Column():
                video_text_output = gr.Textbox(label="✏️识别结果 Recognition Result", max_lines=5)
                video_text_file = gr.File(label="✏️识别结果文件 Recognition Result File")
                # Subtitle styling controls, created hidden.
                with gr.Row(visible=False):
                    font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2, label="🔠字幕字体大小 Subtitle Font Size")
                    font_color = gr.Radio(["black", "white", "green", "red"], label="🌈字幕颜色 Subtitle Color", value='white')
                video_subtitles_button = gr.Button("添加字幕\nGenerate Subtitles", visible=False)
        video_recog_button.click(video_recog, inputs=[video_input], outputs=[video_text_output, video_text_file])
        # video_subtitles_button.click(video_subtitles, inputs=[video_text_input], outputs=[video_output])
# Start the Gradio service locally: enable the request queue (with its API
# closed), then launch the app in debug mode.
demo.queue(api_open=False)
demo.launch(debug=True)