# Hugging Face Spaces status shown for this app when the file was captured: "Runtime error".
import os

import gradio as gr

from pipelines.pipeline import InferencePipeline

# HTML banner placed at the top of the page via gr.HTML(TITLE): a centered
# heading followed by a one-paragraph pitch for the demo.
TITLE = """
<div style="text-align: center; max-width: 650px; margin: 0 auto;">
  <div
    style="
      display: inline-flex;
      align-items: center;
      gap: 0.8rem;
      font-size: 1.75rem;
    "
  >
    <h1 style="font-weight: 900; margin-bottom: 7px;">
      Auto-AVSR: Audio-Visual Speech Recognition
    </h1>
  </div>
  <p style="margin-bottom: 10px; font-size: 94%">
    Want to recognize content in a noisy environment?<br>Our Auto-AVSR models are here to transcribe your answers from audio or visual information!
  </p>
</div>
"""
# HTML footer placed at the bottom of the page via gr.HTML(ARTICLE): links to
# the training code, the paper and a Colab notebook, plus a usage notice.
ARTICLE = """
<div style="text-align: center; max-width: 650px; margin: 0 auto;">
  <p>
    Want to look into models? You can find our [<a href="https://github.com/mpc001/auto_avsr">training code</a>] and [<a href="https://arxiv.org/abs/2303.14307">paper</a>].
  </p>
  <p>
    The inference is performed on the CPU. You can also run on <a href="https://colab.research.google.com/drive/1jfb6e4xxhXHbmQf-nncdLno1u0b4j614?usp=sharing">Colab GPU</a>
  </p>
  <p>
    We share this demo only for non-commercial purposes.
  </p>
</div>
"""
# Stylesheet handed to gr.Blocks(css=...). It underlines links, defines a
# "spin" animation, and styles #col-container / #share-btn* elements.
# NOTE(review): no element with those ids is created in the visible part of
# this file - confirm whether the #col-container / #share-btn rules are still needed.
CSS = """
#col-container {margin-left: auto; margin-right: auto;}
a {text-decoration-line: underline; font-weight: 600;}
.animate-spin {
    animation: spin 1s linear infinite;
}
@keyframes spin {
    from { transform: rotate(0deg); }
    to { transform: rotate(360deg); }
}
#share-btn-container {
    display: flex; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color: #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem;
}
#share-btn {
    all: initial; color: #ffffff;font-weight: 600; cursor:pointer; font-family: 'IBM Plex Sans', sans-serif; margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem !important;
}
#share-btn * {
    all: unset;
}
#share-btn-container div:nth-child(-n+2){
    width: auto !important;
    min-height: 0px !important;
}
#share-btn-container .wrap {
    display: none !important;
}
"""
# Config file used by each selectable model, keyed by the label that the UI
# dropdown sends to fn().
_CONFIG_BY_MODEL = {
    "VSR(mediapipe)": "./configs/LRS3_V_WER19.1.ini",
    "ASR": "./configs/LRS3_A_WER1.0.ini",
    "AVSR(mediapipe)": "./configs/LRS3_AV_WER0.9.ini",
}

# One InferencePipeline per model, all constructed when the module is loaded
# and all with the same CPU / mediapipe face-tracking settings.
pipelines = {
    label: InferencePipeline(
        config_path,
        device="cpu",
        face_track=True,
        detector="mediapipe",
    )
    for label, config_path in _CONFIG_BY_MODEL.items()
}
def fn(pipeline_type, filename):
    """Transcribe an uploaded video with the selected Auto-AVSR pipeline.

    Args:
        pipeline_type: Key into the module-level ``pipelines`` mapping; this
            is the value of the "model" dropdown.
        filename: Value of the "INPUT VIDEO" component (presumably a path to
            the uploaded file - confirm against the installed Gradio version).

    Returns:
        The result of ``model.infer`` for the clip, shown in the
        "PREDICTION" textbox.

    Raises:
        gr.Error: If no (or an unknown) model is selected, or if no video
            was provided, so the UI shows a readable message instead of a
            bare ``KeyError`` or a failure deep inside the pipeline.
    """
    # Submit can be clicked before anything has been chosen or uploaded.
    if not filename:
        raise gr.Error("Please upload a video before submitting.")
    try:
        pipeline = pipelines[pipeline_type]
    except KeyError:
        available = ", ".join(pipelines)
        raise gr.Error(f"Please select a model ({available}).") from None

    landmarks = pipeline.process_landmarks(filename, landmarks_filename=None)
    data = pipeline.dataloader.load_data(filename, landmarks)
    return pipeline.model.infer(data)
# Assemble the page top to bottom: banner, model picker, video input,
# prediction box, submit button, footer. Then start the server.
with gr.Blocks(css=CSS) as demo:
    gr.HTML(TITLE)
    # gr.Dropdown instead of gr.inputs.Dropdown: the gr.inputs namespace is
    # deprecated in Gradio 3 and no longer exists in Gradio 4.
    dropdown_list = gr.Dropdown(
        ["ASR", "VSR(mediapipe)", "AVSR(mediapipe)"],
        label="model",
    )
    video_file = gr.Video(label="INPUT VIDEO", include_audio=True)
    text = gr.Textbox(label="PREDICTION")
    # The former .style(full_width=True) call is dropped: component .style()
    # is deprecated in Gradio 3 and removed in Gradio 4.
    btn = gr.Button("Submit")
    btn.click(fn, inputs=[dropdown_list, video_file], outputs=text)
    gr.HTML(ARTICLE)

demo.launch()