wissemkarous committed on
Commit 8c79f36
• 1 Parent(s): b54d938
Files changed (7)
  1. README.md +7 -7
  2. app.py +59 -0
  3. cvtransforms.py +14 -0
  4. dataset.py +155 -0
  5. options.py +20 -0
  6. packages.txt +1 -0
  7. requirements.txt +0 -0
README.md CHANGED
@@ -1,13 +1,13 @@
  ---
- title: PFA Demo
- emoji: 🐨
- colorFrom: indigo
- colorTo: red
- sdk: gradio
- sdk_version: 4.20.0
+ title: SilentSpeak
+ emoji: 📉
+ colorFrom: blue
+ colorTo: indigo
+ sdk: streamlit
+ sdk_version: 1.29.0
  app_file: app.py
  pinned: false
- license: apache-2.0
+ license: mit
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,59 @@
+ import streamlit as st
+ import os
+ from utils.demo import load_video, ctc_decode
+ from utils.two_stream_infer import load_model
+ from scripts.extract_lip_coordinates import generate_lip_coordinates
+ import options as opt
+
+ st.set_page_config(layout="wide")
+
+ model = load_model()
+
+ st.title("LipCoordNet Demo")
+
+ st.info(
+     "Inference is very slow on Hugging Face Spaces because it runs entirely on the CPU. For faster inference, clone the repository and set “device” to “cuda” in options.py to run locally on a GPU.",
+     icon="ℹ️",
+ )
+
+ # Build the list of selectable demo videos
+ options = os.listdir(os.path.join("app_input"))
+ selected_video = st.selectbox("Choose video", options)
+
+ col1, col2 = st.columns(2)
+
+
+ with col1:
+     file_path = os.path.join("app_input", selected_video)
+     video_name = selected_video.split(".")[0]
+     os.system(f"ffmpeg -i {file_path} -vcodec libx264 {video_name}.mp4 -y")
+
+     # Render the converted video inside the app
+     video = open(f"{video_name}.mp4", "rb")
+     video_bytes = video.read()
+     st.video(video_bytes)
+
+
+ with col1, st.spinner("Splitting video into frames"):
+     video, img_p, files = load_video(f"{video_name}.mp4", opt.device)
+     prediction_video = video
+     st.markdown(f"Frames Generated:\n{files}")
+     frames_generated = True
+ with col1, st.spinner("Generating Lip Landmark Coordinates"):
+     coordinates = generate_lip_coordinates(f"{video_name}_samples")
+     prediction_coordinates = coordinates
+     st.markdown(f"Coordinates Generated:\n{coordinates}")
+     coordinates_generated = True
+
+ with col2:
+     st.info("Ready to make prediction!")
+     generate = st.button("Generate")
+     if generate:
+         with col2, st.spinner("Generating..."):
+             y = model(
+                 prediction_video[None, ...].to(opt.device),
+                 prediction_coordinates[None, ...].to(opt.device),
+             )
+             txt = ctc_decode(y[0])
+             st.text(txt[-1])
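The ffmpeg call above is issued through os.system with an f-string, so a selected file name containing spaces would break the command. A minimal sketch of a safer variant using subprocess (the helper name transcode_to_mp4 is hypothetical, not part of this commit):

import subprocess

def transcode_to_mp4(file_path: str, video_name: str) -> None:
    # Same conversion as the os.system call above, but passed as an argument
    # list, so paths with spaces are handled and failures raise CalledProcessError.
    subprocess.run(
        ["ffmpeg", "-y", "-i", file_path, "-vcodec", "libx264", f"{video_name}.mp4"],
        check=True,
    )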
cvtransforms.py ADDED
@@ -0,0 +1,14 @@
+ # coding: utf-8
+ import random
+
+
+ def HorizontalFlip(batch_img, p=0.5):
+     # batch_img has shape (T, H, W, C); randomly mirror the frames along the width axis
+     if random.random() > p:
+         batch_img = batch_img[:, :, ::-1, ...]
+     return batch_img
+
+
+ def ColorNormalize(batch_img):
+     # scale 8-bit pixel values into [0, 1]
+     batch_img = batch_img / 255.0
+     return batch_img
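A minimal usage sketch for these transforms; the (T, H, W, C) layout comes from the comment in HorizontalFlip, and the 64x128 frame size matches the resize in dataset.py (the batch below is random dummy data):

import numpy as np
from cvtransforms import ColorNormalize, HorizontalFlip

# 75 dummy frames of 64x128 RGB pixels
batch = np.random.randint(0, 256, size=(75, 64, 128, 3)).astype(np.float32)
batch = HorizontalFlip(batch)  # mirrors the frames along the width axis about half the time
batch = ColorNormalize(batch)  # scales pixel values into [0, 1]
print(batch.shape, batch.min(), batch.max())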
dataset.py ADDED
@@ -0,0 +1,155 @@
+ # encoding: utf-8
+ import numpy as np
+ import cv2
+ import os
+ from torch.utils.data import Dataset
+ from cvtransforms import *
+ import torch
+ import editdistance
+
+
+ class MyDataset(Dataset):
+     letters = [
+         " ", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
+         "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
+     ]
+
+     def __init__(self, video_path, anno_path, file_list, vid_pad, txt_pad, phase):
+         self.anno_path = anno_path
+         self.vid_pad = vid_pad
+         self.txt_pad = txt_pad
+         self.phase = phase
+
+         with open(file_list, "r") as f:
+             self.videos = [
+                 os.path.join(video_path, line.strip()) for line in f.readlines()
+             ]
+
+         self.data = []
+         for vid in self.videos:
+             # items = vid.split(os.path.sep)
+             items = vid.split("/")
+             self.data.append((vid, items[-4], items[-1]))
+
+     def __getitem__(self, idx):
+         (vid, spk, name) = self.data[idx]
+         vid = self._load_vid(vid)
+         anno = self._load_anno(
+             os.path.join(self.anno_path, spk, "align", name + ".align")
+         )
+
+         if self.phase == "train":
+             vid = HorizontalFlip(vid)
+
+         vid = ColorNormalize(vid)
+
+         vid_len = vid.shape[0]
+         anno_len = anno.shape[0]
+         vid = self._padding(vid, self.vid_pad)
+         anno = self._padding(anno, self.txt_pad)
+
+         return {
+             "vid": torch.FloatTensor(vid.transpose(3, 0, 1, 2)),  # (C, T, H, W)
+             "txt": torch.LongTensor(anno),
+             "txt_len": anno_len,
+             "vid_len": vid_len,
+         }
+
+     def __len__(self):
+         return len(self.data)
+
+     def _load_vid(self, p):
+         # Load the frame images of one video, sorted by frame index.
+         files = os.listdir(p)
+         files = [file for file in files if file.endswith(".jpg")]
+         files = sorted(files, key=lambda file: int(os.path.splitext(file)[0]))
+         array = [cv2.imread(os.path.join(p, file)) for file in files]
+         array = [im for im in array if im is not None]
+         array = [
+             cv2.resize(im, (128, 64), interpolation=cv2.INTER_LANCZOS4) for im in array
+         ]
+         array = np.stack(array, axis=0).astype(np.float32)
+         return array
+
+     def _load_anno(self, name):
+         # Read an .align transcript, dropping silence ("SIL") and short-pause ("SP") tokens.
+         with open(name, "r") as f:
+             lines = [line.strip().split(" ") for line in f.readlines()]
+             txt = [line[2] for line in lines]
+             txt = [s for s in txt if s.upper() not in ("SIL", "SP")]
+         return MyDataset.txt2arr(" ".join(txt).upper(), 1)
+
+     def _padding(self, array, length):
+         # Zero-pad along the first axis up to the requested length.
+         array = [array[i] for i in range(array.shape[0])]
+         size = array[0].shape
+         for _ in range(length - len(array)):
+             array.append(np.zeros(size))
+         return np.stack(array, axis=0)
+
+     @staticmethod
+     def txt2arr(txt, start):
+         arr = []
+         for c in list(txt):
+             arr.append(MyDataset.letters.index(c) + start)
+         return np.array(arr)
+
+     @staticmethod
+     def arr2txt(arr, start):
+         txt = []
+         for n in arr:
+             if n >= start:
+                 txt.append(MyDataset.letters[n - start])
+         return "".join(txt).strip()
+
+     @staticmethod
+     def ctc_arr2txt(arr, start):
+         # Collapse repeated indices and drop values below start (CTC blanks).
+         pre = -1
+         txt = []
+         for n in arr:
+             if pre != n and n >= start:
+                 # avoid emitting two spaces in a row
+                 if not (txt and txt[-1] == " " and MyDataset.letters[n - start] == " "):
+                     txt.append(MyDataset.letters[n - start])
+             pre = n
+         return "".join(txt).strip()
+
+     @staticmethod
+     def wer(predict, truth):
+         # Word error rate per (prediction, ground-truth) pair.
+         word_pairs = [(p[0].split(" "), p[1].split(" ")) for p in zip(predict, truth)]
+         wer = [1.0 * editdistance.eval(p[0], p[1]) / len(p[1]) for p in word_pairs]
+         return wer
+
+     @staticmethod
+     def cer(predict, truth):
+         # Character error rate per (prediction, ground-truth) pair.
+         cer = [
+             1.0 * editdistance.eval(p[0], p[1]) / len(p[1]) for p in zip(predict, truth)
+         ]
+         return cer
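The static helpers can be tried without any video data. A small round-trip sketch, assuming the module's dependencies (torch, cv2, editdistance) are installed; the sentences are just example strings over the letters list above:

from dataset import MyDataset

arr = MyDataset.txt2arr("SET GREEN BY A FOUR PLEASE", 1)
print(MyDataset.arr2txt(arr, 1))      # SET GREEN BY A FOUR PLEASE
print(MyDataset.ctc_arr2txt(arr, 1))  # SET GREN BY A FOUR PLEASE (adjacent repeats collapse)
print(MyDataset.wer(["SET GREEN BY A FOUR PLEASE"], ["SET GREEN BY A FOUR NOW"]))  # ~[0.167]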
options.py ADDED
@@ -0,0 +1,20 @@
+ gpu = "0"
+ random_seed = 0
+ data_type = "unseen"
+ video_path = "../lip/"
+ train_list = f"data/{data_type}_train.txt"
+ val_list = f"data/{data_type}_val.txt"
+ anno_path = "../GRID_align_txt"
+ vid_padding = 75
+ txt_padding = 200
+ batch_size = 8
+ base_lr = 2e-5
+ num_workers = 4
+ max_epoch = 90
+ display = 10
+ test_step = 1000
+ save_prefix = f"weights/LipNet_{data_type}"
+ is_optimize = True
+ device = "cpu"
+
+ two_stream_weights = "pretrain/LipNet_coords_loss_0.025581153109669685_wer_0.01746208431890914_cer_0.006488426950253695.pt"
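The info banner in app.py tells users to change device to "cuda" for local GPU inference. A hedged alternative sketch is to pick the device at runtime so the same file works on both setups, assuming PyTorch is importable here:

import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"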
packages.txt ADDED
@@ -0,0 +1 @@
+ ffmpeg
requirements.txt ADDED
Binary file (3.37 kB).