wissemkarous committed on
Commit 8c79f36
• 1 Parent(s): b54d938
Files changed (7)
  1. README.md +7 -7
  2. app.py +59 -0
  3. cvtransforms.py +14 -0
  4. dataset.py +155 -0
  5. options.py +20 -0
  6. packages.txt +1 -0
  7. requirements.txt +0 -0
README.md CHANGED
@@ -1,13 +1,13 @@
  ---
- title: PFA Demo
- emoji: 🐨
- colorFrom: indigo
- colorTo: red
- sdk: gradio
- sdk_version: 4.20.0
+ title: SilentSpeak
+ emoji: 📉
+ colorFrom: blue
+ colorTo: indigo
+ sdk: streamlit
+ sdk_version: 1.29.0
  app_file: app.py
  pinned: false
- license: apache-2.0
+ license: mit
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,59 @@
+ import streamlit as st
+ import os
+ from utils.demo import load_video, ctc_decode
+ from utils.two_stream_infer import load_model
+ from scripts.extract_lip_coordinates import generate_lip_coordinates
+ import options as opt
+
+ st.set_page_config(layout="wide")
+
+ model = load_model()
+
+ st.title("LipCoordNet Demo")
+
+ st.info(
+     "Inference is very slow on Hugging Face Spaces because it runs entirely on the CPU. For faster inference, clone the repository and set “device” to “cuda” in options.py to run locally on a GPU.",
+     icon="ℹ️",
+ )
+
+ # Build the list of selectable demo videos
+ options = os.listdir(os.path.join("app_input"))
+ selected_video = st.selectbox("Choose video", options)
+
+ col1, col2 = st.columns(2)
+
+
+ with col1:
+     file_path = os.path.join("app_input", selected_video)
+     video_name = selected_video.split(".")[0]
+     os.system(f"ffmpeg -i {file_path} -vcodec libx264 {video_name}.mp4 -y")
+
+     # Render the converted video inside the app
+     video = open(f"{video_name}.mp4", "rb")
+     video_bytes = video.read()
+     st.video(video_bytes)
+
+
+ with col1, st.spinner("Splitting video into frames"):
+     video, img_p, files = load_video(f"{video_name}.mp4", opt.device)
+     prediction_video = video
+     st.markdown(f"Frames Generated:\n{files}")
+     frames_generated = True
+ with col1, st.spinner("Generating Lip Landmark Coordinates"):
+     coordinates = generate_lip_coordinates(f"{video_name}_samples")
+     prediction_coordinates = coordinates
+     st.markdown(f"Coordinates Generated:\n{coordinates}")
+     coordinates_generated = True
+
+ with col2:
+     st.info("Ready to make prediction!")
+     generate = st.button("Generate")
+     if generate:
+         with col2, st.spinner("Generating..."):
+             y = model(
+                 prediction_video[None, ...].to(opt.device),
+                 prediction_coordinates[None, ...].to(opt.device),
+             )
+             txt = ctc_decode(y[0])
+             st.text(txt[-1])
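The ffmpeg call above is issued through os.system with an f-string, so a selected file name containing spaces would break the command. A minimal sketch of a safer variant using subprocess (the helper name transcode_to_mp4 is hypothetical, not part of this commit):

import subprocess

def transcode_to_mp4(file_path: str, video_name: str) -> None:
    # Same conversion as the os.system call above, but passed as an argument
    # list, so paths with spaces are handled and failures raise CalledProcessError.
    subprocess.run(
        ["ffmpeg", "-y", "-i", file_path, "-vcodec", "libx264", f"{video_name}.mp4"],
        check=True,
    )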
cvtransforms.py ADDED
@@ -0,0 +1,14 @@
+ # coding: utf-8
+ import random
+
+
+ def HorizontalFlip(batch_img, p=0.5):
+     # batch_img has shape (T, H, W, C); randomly mirror the frames along the width axis
+     if random.random() > p:
+         batch_img = batch_img[:, :, ::-1, ...]
+     return batch_img
+
+
+ def ColorNormalize(batch_img):
+     # scale 8-bit pixel values into [0, 1]
+     batch_img = batch_img / 255.0
+     return batch_img
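A minimal usage sketch for these transforms; the (T, H, W, C) layout comes from the comment in HorizontalFlip, and the 64x128 frame size matches the resize in dataset.py (the batch below is random dummy data):

import numpy as np
from cvtransforms import ColorNormalize, HorizontalFlip

# 75 dummy frames of 64x128 RGB pixels
batch = np.random.randint(0, 256, size=(75, 64, 128, 3)).astype(np.float32)
batch = HorizontalFlip(batch)  # mirrors the frames along the width axis about half the time
batch = ColorNormalize(batch)  # scales pixel values into [0, 1]
print(batch.shape, batch.min(), batch.max())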
dataset.py ADDED
@@ -0,0 +1,155 @@
+ # encoding: utf-8
+ import numpy as np
+ import cv2
+ import os
+ from torch.utils.data import Dataset
+ from cvtransforms import *
+ import torch
+ import editdistance
+
+
+ class MyDataset(Dataset):
+     letters = [
+         " ", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
+         "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
+     ]
+
+     def __init__(self, video_path, anno_path, file_list, vid_pad, txt_pad, phase):
+         self.anno_path = anno_path
+         self.vid_pad = vid_pad
+         self.txt_pad = txt_pad
+         self.phase = phase
+
+         with open(file_list, "r") as f:
+             self.videos = [
+                 os.path.join(video_path, line.strip()) for line in f.readlines()
+             ]
+
+         self.data = []
+         for vid in self.videos:
+             # items = vid.split(os.path.sep)
+             items = vid.split("/")
+             self.data.append((vid, items[-4], items[-1]))
+
+     def __getitem__(self, idx):
+         (vid, spk, name) = self.data[idx]
+         vid = self._load_vid(vid)
+         anno = self._load_anno(
+             os.path.join(self.anno_path, spk, "align", name + ".align")
+         )
+
+         if self.phase == "train":
+             vid = HorizontalFlip(vid)
+
+         vid = ColorNormalize(vid)
+
+         vid_len = vid.shape[0]
+         anno_len = anno.shape[0]
+         vid = self._padding(vid, self.vid_pad)
+         anno = self._padding(anno, self.txt_pad)
+
+         return {
+             "vid": torch.FloatTensor(vid.transpose(3, 0, 1, 2)),  # (C, T, H, W)
+             "txt": torch.LongTensor(anno),
+             "txt_len": anno_len,
+             "vid_len": vid_len,
+         }
+
+     def __len__(self):
+         return len(self.data)
+
+     def _load_vid(self, p):
+         # Load the frame images of one video, sorted by frame index.
+         files = os.listdir(p)
+         files = [file for file in files if file.endswith(".jpg")]
+         files = sorted(files, key=lambda file: int(os.path.splitext(file)[0]))
+         array = [cv2.imread(os.path.join(p, file)) for file in files]
+         array = [im for im in array if im is not None]
+         array = [
+             cv2.resize(im, (128, 64), interpolation=cv2.INTER_LANCZOS4) for im in array
+         ]
+         array = np.stack(array, axis=0).astype(np.float32)
+         return array
+
+     def _load_anno(self, name):
+         # Read an .align transcript, dropping silence ("SIL") and short-pause ("SP") tokens.
+         with open(name, "r") as f:
+             lines = [line.strip().split(" ") for line in f.readlines()]
+             txt = [line[2] for line in lines]
+             txt = [s for s in txt if s.upper() not in ("SIL", "SP")]
+         return MyDataset.txt2arr(" ".join(txt).upper(), 1)
+
+     def _padding(self, array, length):
+         # Zero-pad along the first axis up to the requested length.
+         array = [array[i] for i in range(array.shape[0])]
+         size = array[0].shape
+         for _ in range(length - len(array)):
+             array.append(np.zeros(size))
+         return np.stack(array, axis=0)
+
+     @staticmethod
+     def txt2arr(txt, start):
+         arr = []
+         for c in list(txt):
+             arr.append(MyDataset.letters.index(c) + start)
+         return np.array(arr)
+
+     @staticmethod
+     def arr2txt(arr, start):
+         txt = []
+         for n in arr:
+             if n >= start:
+                 txt.append(MyDataset.letters[n - start])
+         return "".join(txt).strip()
+
+     @staticmethod
+     def ctc_arr2txt(arr, start):
+         # Collapse repeated indices and drop values below start (CTC blanks).
+         pre = -1
+         txt = []
+         for n in arr:
+             if pre != n and n >= start:
+                 # avoid emitting two spaces in a row
+                 if not (txt and txt[-1] == " " and MyDataset.letters[n - start] == " "):
+                     txt.append(MyDataset.letters[n - start])
+             pre = n
+         return "".join(txt).strip()
+
+     @staticmethod
+     def wer(predict, truth):
+         # Word error rate per (prediction, ground-truth) pair.
+         word_pairs = [(p[0].split(" "), p[1].split(" ")) for p in zip(predict, truth)]
+         wer = [1.0 * editdistance.eval(p[0], p[1]) / len(p[1]) for p in word_pairs]
+         return wer
+
+     @staticmethod
+     def cer(predict, truth):
+         # Character error rate per (prediction, ground-truth) pair.
+         cer = [
+             1.0 * editdistance.eval(p[0], p[1]) / len(p[1]) for p in zip(predict, truth)
+         ]
+         return cer
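The static helpers can be tried without any video data. A small round-trip sketch, assuming the module's dependencies (torch, cv2, editdistance) are installed; the sentences are just example strings over the letters list above:

from dataset import MyDataset

arr = MyDataset.txt2arr("SET GREEN BY A FOUR PLEASE", 1)
print(MyDataset.arr2txt(arr, 1))      # SET GREEN BY A FOUR PLEASE
print(MyDataset.ctc_arr2txt(arr, 1))  # SET GREN BY A FOUR PLEASE (adjacent repeats collapse)
print(MyDataset.wer(["SET GREEN BY A FOUR PLEASE"], ["SET GREEN BY A FOUR NOW"]))  # ~[0.167]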
options.py ADDED
@@ -0,0 +1,20 @@
+ gpu = "0"
+ random_seed = 0
+ data_type = "unseen"
+ video_path = "../lip/"
+ train_list = f"data/{data_type}_train.txt"
+ val_list = f"data/{data_type}_val.txt"
+ anno_path = "../GRID_align_txt"
+ vid_padding = 75
+ txt_padding = 200
+ batch_size = 8
+ base_lr = 2e-5
+ num_workers = 4
+ max_epoch = 90
+ display = 10
+ test_step = 1000
+ save_prefix = f"weights/LipNet_{data_type}"
+ is_optimize = True
+ device = "cpu"
+
+ two_stream_weights = "pretrain/LipNet_coords_loss_0.025581153109669685_wer_0.01746208431890914_cer_0.006488426950253695.pt"
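The info banner in app.py tells users to change device to "cuda" for local GPU inference. A hedged alternative sketch is to pick the device at runtime so the same file works on both setups, assuming PyTorch is importable here:

import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"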
packages.txt ADDED
@@ -0,0 +1 @@
+ ffmpeg
requirements.txt ADDED
Binary file (3.37 kB).