Spaces:
Runtime error
Runtime error
File size: 7,860 Bytes
33a8656 b56ae4b 33a8656 d278405 33a8656 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
import gradio as gr
import os
# os.system('pip install numpy --upgrade')
# update modelscope
# os.system("pip install -U modelscope -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -i https://mirror.sjtu.edu.cn/pypi/web/simple")
os.system('pip install "modelscope[cv]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html')
import datetime
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from subtitle_utils import generate_srt
# Current Beijing time (UTC+8).
# Start from a timezone-aware UTC timestamp: calling astimezone() on the naive
# value returned by utcnow() makes Python interpret it as host-local time, so
# the result would be shifted by whatever UTC offset the host happens to have.
utc_dt = datetime.datetime.now(datetime.timezone.utc)
beijing_dt = utc_dt.astimezone(datetime.timezone(datetime.timedelta(hours=8)))
formatted = beijing_dt.strftime("%Y-%m-%d_%H")
print(f"北京时间: {beijing_dt.year}年{beijing_dt.month}月{beijing_dt.day}日 "
      f"{beijing_dt.hour}时{beijing_dt.minute}分{beijing_dt.second}秒")

# Output directory for generated files, named after the Beijing-time date and
# hour at which this module was loaded.
works_path = '../works_audio_video_recognize/' + formatted
# exist_ok avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(works_path, exist_ok=True)
print('作品目录:' + works_path)
# Speech-recognition pipeline, built once when the module is loaded and then
# called by transcript() for each recognition request.
# NOTE(review): the model id suggests a 16 kHz Mandarin Paraformer model with
# VAD and punctuation restoration — confirm against the ModelScope model card.
inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
def transcript(audiofile, text_file, srt_file):
    """Recognize speech in *audiofile* and save the transcript and SRT subtitles.

    Args:
        audiofile: Path of the audio file handed to ``inference_pipeline``.
        text_file: Destination path for the plain-text transcript.
        srt_file: Destination path for the SRT subtitle text.

    Returns:
        A ``(text_output, srt_output)`` tuple: the recognized text and the
        subtitle text produced by ``generate_srt``.

    Raises:
        KeyError: If the recognition result lacks the ``'text'`` or
            ``'sentences'`` entry (assumes the result is a dict — TODO confirm
            against the pipeline's output format).
    """
    rec_result = inference_pipeline(audio_in=audiofile)

    text_output = rec_result['text']
    # Write UTF-8 explicitly: the transcript is expected to contain Chinese
    # text, and the platform default encoding is not guaranteed to encode it.
    with open(text_file, "w", encoding="utf-8") as f:
        f.write(text_output)

    srt_output = generate_srt(rec_result['sentences'])
    with open(srt_file, "w", encoding="utf-8") as f:
        f.write(srt_output)

    return text_output, srt_output
def audio_recog(audiofile):
    """Transcribe an audio file and log the start/end time in Beijing time.

    The transcript and subtitles are written under ``works_path`` using the
    audio file's base name with ``.txt`` and ``.srt`` extensions.

    Args:
        audiofile: Path of the audio file to recognize.

    Returns:
        A ``(text_output, text_file, srt_output, srt_file)`` tuple: the
        recognized text, the path of the saved transcript, the SRT subtitle
        text, and the path of the saved SRT file.
    """
    # Beijing is UTC+8. Asking for "now" directly in that zone gives the right
    # wall-clock time regardless of the host's local timezone (the previous
    # utcnow().astimezone(+16h) form was only correct on a UTC+8 host).
    beijing_tz = datetime.timezone(datetime.timedelta(hours=8))

    start_dt = datetime.datetime.now(beijing_tz)
    print(f"开始时间: {start_dt.year}年{start_dt.month}月{start_dt.day}日 "
          f"{start_dt.hour}时{start_dt.minute}分{start_dt.second}秒")
    print("音频文件:" + audiofile)

    # Output files share the input's base name (directory and extension dropped).
    filename = os.path.splitext(os.path.basename(audiofile))[0]
    text_file = works_path + '/' + filename + '.txt'
    srt_file = works_path + '/' + filename + '.srt'

    text_output, srt_output = transcript(audiofile, text_file, srt_file)

    end_dt = datetime.datetime.now(beijing_tz)
    print(f"结束时间: {end_dt.year}年{end_dt.month}月{end_dt.day}日 "
          f"{end_dt.hour}时{end_dt.minute}分{end_dt.second}秒")

    return text_output, text_file, srt_output, srt_file
def video_recog(filepath):
    """Extract the audio track of a video with ffmpeg and transcribe it.

    The audio is written as an MP3 under ``works_path``, named after the
    current Beijing time (microsecond resolution), and then passed to
    ``audio_recog``.

    Args:
        filepath: Path of the video file to process.

    Returns:
        The ``(text_output, text_file, srt_output, srt_file)`` tuple returned
        by ``audio_recog`` for the extracted audio.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    # Function-scope import keeps this block self-contained.
    import subprocess

    print("视频文件:" + filepath)

    # Beijing is UTC+8; request the time directly in that zone so the result
    # does not depend on the host's local timezone.
    beijing_tz = datetime.timezone(datetime.timedelta(hours=8))
    formatted = datetime.datetime.now(beijing_tz).strftime("%Y-%m-%d_%H-%M-%S.%f")

    # Extract the audio track as MP3.
    audiofile = works_path + '/' + formatted + '.mp3'
    # Pass an argument list instead of a shell string: the input path comes
    # from an upload and may contain spaces or shell metacharacters.
    subprocess.run(
        ["ffmpeg", "-i", filepath, "-vn", "-c:a", "libmp3lame", "-q:a", "4", audiofile],
        check=True,  # fail here rather than later on a missing audio file
    )

    # Recognize the extracted audio.
    # NOTE: muxing the generated subtitles back into the video is not
    # implemented; only the transcript and SRT outputs are returned.
    text_output, text_file, srt_output, srt_file = audio_recog(audiofile)
    return text_output, text_file, srt_output, srt_file
# Custom CSS passed to gr.Blocks(css=...).
css_style = (
    "#fixed_size_img {height: 240px;} "
    "#overview {margin: auto;max-width: 400px; max-height: 400px;}"
)

# Page title passed to gr.Blocks(title=...).
title = "音视频识别 by宁侠"

# Introductory text rendered through gr.Markdown.
description = (
    "您只需要上传一段音频或视频文件,我们的服务会快速对其进行语音识别,然后生成相应的文字和字幕。"
    "这样,您就可以轻松地记录下重要的语音内容,或者为视频添加精准的字幕。"
    "现在就来试试我们的音视频识别服务吧,让您的生活和工作更加便捷!"
)

# Location of the bundled example media and the example list built from it.
examples_path = 'examples/'
examples = [[f"{examples_path}demo_shejipuhui.mp4"]]
# Gradio interface: one tab for audio and one for video, each wiring a
# "Recognize" button to audio_recog / video_recog and showing four outputs
# (transcript text, transcript file, SRT text, SRT file).
# NOTE(review): the nesting of the layout contexts below is reconstructed —
# confirm it against the rendered page.
with gr.Blocks(title=title, css=css_style) as demo:
    # Page banner.
    # NOTE(review): both <div> elements opened in this fragment are never closed.
    gr.HTML('''
<div style="text-align: center; max-width: 720px; margin: 0 auto;">
<div
style="
display: inline-flex;
align-items: center;
gap: 0.8rem;
font-size: 1.75rem;
"
>
<h1 style="font-family: PingFangSC; font-weight: 500; font-size: 36px; margin-bottom: 7px;">
音视频识别
</h1>
<h1 style="font-family: PingFangSC; font-weight: 500; line-height: 1.5em; font-size: 16px; margin-bottom: 7px;">
by宁侠
</h1>
''')
    gr.Markdown(description)

    # --- Audio tab ---
    with gr.Tab("🔊音频识别 Audio Transcribe"):
        with gr.Row():
            # Input column: audio upload (delivered to the handler as a file
            # path), example files, and the recognize button.
            with gr.Column():
                audio_input = gr.Audio(label="🔊音频输入 Audio Input", type="filepath")
                gr.Examples(['examples/paddlespeech.asr-zh.wav', 'examples/demo_shejipuhui.mp3'], [audio_input])
                audio_recog_button = gr.Button("👂音频识别 Recognize")
            # Output column: recognition results.
            with gr.Column():
                audio_text_output = gr.Textbox(label="✏️识别结果 Recognition Result", max_lines=5)
                audio_text_file = gr.File(label="✏️识别结果文件 Recognition Result File")
                audio_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles", max_lines=10)
                audio_srt_file = gr.File(label="📖SRT字幕文件 SRT File")
                # Subtitle-generation widgets are created hidden; their click
                # handler below is commented out.
                audio_subtitles_button = gr.Button("添加字幕\nGenerate Subtitles", visible=False)
                audio_output = gr.Audio(label="🔊音频 Audio", visible=False)
        # Output order matches the 4-tuple returned by audio_recog.
        audio_recog_button.click(audio_recog, inputs=[audio_input], outputs=[audio_text_output, audio_text_file, audio_srt_output, audio_srt_file])
        # Disabled: audio_subtitles and audio_text_input are not defined in the code in view.
        # audio_subtitles_button.click(audio_subtitles, inputs=[audio_text_input], outputs=[audio_output])

    # --- Video tab ---
    with gr.Tab("🎥视频识别 Video Transcribe"):
        with gr.Row():
            # Input column: video upload, example file, and the recognize button.
            with gr.Column():
                video_input = gr.Video(label="🎥视频输入 Video Input")
                gr.Examples(['examples/demo_shejipuhui.mp4'], [video_input], label='语音识别示例 ASR Demo')
                video_recog_button = gr.Button("👂视频识别 Recognize")
                # Hidden placeholder for a subtitled video output.
                video_output = gr.Video(label="🎥视频 Video", visible=False)
            # Output column: recognition results.
            with gr.Column():
                video_text_output = gr.Textbox(label="✏️识别结果 Recognition Result", max_lines=5)
                video_text_file = gr.File(label="✏️识别结果文件 Recognition Result File")
                video_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles", max_lines=10)
                video_srt_file = gr.File(label="📖SRT字幕文件 SRT File")
                # Subtitle styling controls, created inside a hidden row.
                with gr.Row(visible=False):
                    font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2, label="🔠字幕字体大小 Subtitle Font Size")
                    font_color = gr.Radio(["black", "white", "green", "red"], label="🌈字幕颜色 Subtitle Color", value='white')
                video_subtitles_button = gr.Button("添加字幕\nGenerate Subtitles", visible=False)
        # Output order matches the 4-tuple returned by video_recog.
        video_recog_button.click(video_recog, inputs=[video_input], outputs=[video_text_output, video_text_file, video_srt_output, video_srt_file])
        # Disabled: video_subtitles and video_text_input are not defined in the code in view.
        # video_subtitles_button.click(video_subtitles, inputs=[video_text_input], outputs=[video_output])
# Start the Gradio service locally: turn on the request queue with
# api_open=False, then launch the server with debug=True.
demo.queue(api_open=False)
demo.launch(debug=True)
|