import os

import streamlit as st

from utils.demo import load_video, ctc_decode
from utils.two_stream_infer import load_model
from scripts.extract_lip_coordinates import generate_lip_coordinates
import options as opt

st.set_page_config(layout="wide")

# Load the pretrained two-stream LipCoordNet model once at startup
model = load_model()

st.title("LipCoordNet Demo")

st.info(
    "Inference is very slow on Hugging Face Spaces because it runs entirely on CPU. "
    "For faster inference, clone the repository and set `device` to `cuda` in "
    "options.py to run locally on a GPU.",
    icon="ℹ️",
)

# List the available sample videos
options = os.listdir("app_input")
selected_video = st.selectbox("Choose video", options)

# Two-column layout: col1 shows the video and preprocessing, col2 the prediction
col1, col2 = st.columns(2)


with col1:
    file_path = os.path.join("app_input", selected_video)
    video_name = os.path.splitext(selected_video)[0]
    # Re-encode to H.264 so the browser can render the video
    os.system(f'ffmpeg -i "{file_path}" -vcodec libx264 "{video_name}.mp4" -y')

    # Render the converted video inside the app
    with open(f"{video_name}.mp4", "rb") as video:
        video_bytes = video.read()
    st.video(video_bytes)


with col1, st.spinner("Splitting video into frames"):
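    # Split the video into frames and load them as a tensor for the model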
    video, img_p, files = load_video(f"{video_name}.mp4", opt.device)
    prediction_video = video
    st.markdown(f"Frames Generated:\n{files}")
    frames_generated = True
with col1, st.spinner("Generating Lip Landmark Coordinates"):
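    # Generate lip landmark coordinates from the saved frame samples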
    coordinates = generate_lip_coordinates(f"{video_name}_samples")
    prediction_coordinates = coordinates
    st.markdown(f"Coordinates Generated:\n{coordinates}")
    coordinates_generated = True

with col2:
    st.info("Ready to make prediction!")
    generate = st.button("Generate")
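    # Run inference only after the user clicks Generate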
    if generate:
        with col2, st.spinner("Generating..."):
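            # Run the two-stream model on the video frames and lip coordinates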
            y = model(
                prediction_video[None, ...].to(opt.device),
                prediction_coordinates[None, ...].to(opt.device),
            )
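            # Decode the CTC output; the last element of txt is the displayed prediction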
            txt = ctc_decode(y[0])
            st.text(txt[-1])