nam_nguyenhoai_AI committed on
Commit
b0a48de
1 Parent(s): 987b643

Update algorithm

Files changed (4)
  1. .gitignore +2 -0
  2. algorithm.py +118 -0
  3. app.py +123 -4
  4. utils.py +77 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ *.mp4
+ assets/examples_Video
algorithm.py ADDED
@@ -0,0 +1,118 @@
+ import faiss
+ from sklearn.metrics import pairwise_distances_argmin_min
+ import random
+ import numpy as np
+ from utils import *
+
+ def kmeans(features, number_of_clusters):
+     # Cluster the clip-level features using K-Means
+
+     # K-Means from sklearn:
+     #kmeans = KMeans(n_clusters=number_of_clusters, random_state=0).fit(features)
+
+     # K-Means from faiss
+     ncentroids = number_of_clusters
+     niter = 10
+     verbose = True
+     # faiss expects a contiguous float32 array of shape (num_clips, dim)
+     x = np.ascontiguousarray(np.asarray(features, dtype='float32'))
+
+     # Feature dimension of a single clip vector
+     dimension = x[0].shape[0]
+
+     kmeans = faiss.Kmeans(dimension, ncentroids, niter=niter, verbose=verbose)
+     kmeans.train(x)
+
+     # Index of the clip closest to each centroid
+     #closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, features)
+     closest, _ = pairwise_distances_argmin_min(kmeans.centroids, x)
+
+     closest_clips_frames = []
+
+     # Expand each selected clip into its 8 consecutive frame indices
+     for i in sorted(closest):
+         for idx in range(i*8, (i+1)*8):
+             closest_clips_frames.append(idx)
+
+     return closest_clips_frames
+
+ def tt01(features, threshold):
+
+     i = 0
+     clips = []
+
+     # Compare the sum of squared differences between clips i and j
+     for j in range(1, len(features)):
+         if sum_of_squared_difference(features[i], features[j]) > threshold:
+             clip = []
+
+             # Add the frames of clips i through j-1 to the clip list
+             for b in range(i*8, j*8):
+                 clip.append(b)
+
+             # Randomly select 15% of the frames from the clip list
+             random_num = round(len(clip)*0.15)
+
+             # Sort the sampled frames to preserve temporal order
+             random_Frames = sorted(random.sample(clip, random_num))
+             i = j
+             clips.extend(random_Frames)
+
+     # Add the last clip to the clip list
+     clip = []
+     if i == j:
+         for c in range(j*8, j*8+8):
+             clip.append(c)
+         random_num = round(len(clip)*0.15)
+         random_Frames = sorted(random.sample(clip, random_num))
+     else:  # i < j
+         for c in range(i*8, (j+1)*8):
+             clip.append(c)
+         random_num = round(len(clip)*0.15)
+         random_Frames = sorted(random.sample(clip, random_num))
+
+     clips.extend(random_Frames)
+
+     return clips
+
+ def tt02(features, threshold):
+
+     i = 0
+     previous = i
+     clips = []
+
+     # Compare the sum of squared differences between clip j and the previous clip
+     for j in range(1, len(features)):
+         if sum_of_squared_difference(features[previous], features[j]) > threshold:
+             clip = []
+
+             # Add the frames of clips i through j-1 to the clip list
+             for b in range(i*8, j*8):
+                 clip.append(b)
+
+             # Randomly select 15% of the frames from the clip list
+             random_num = round(len(clip)*0.15)
+             # Sort the sampled frames to preserve temporal order
+             random_Frames = sorted(random.sample(clip, random_num))
+             i = j
+             clips.extend(random_Frames)
+
+         previous = j
+
+     # Add the last clip to the clip list
+     clip = []
+     if i == j:
+         for c in range(j*8, j*8+8):
+             clip.append(c)
+         random_num = round(len(clip)*0.15)
+         random_Frames = sorted(random.sample(clip, random_num))
+     else:  # i < j
+         for c in range(i*8, (j+1)*8):
+             clip.append(c)
+         random_num = round(len(clip)*0.15)
+         random_Frames = sorted(random.sample(clip, random_num))
+
+     clips.extend(random_Frames)
+
+     return clips
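
For a quick sanity check of the three selectors above, here is a minimal sketch using dummy clip features. The 768-dimensional vectors, the clip count, and the seed are illustrative assumptions (768 matches timesformer-base's hidden size); each feature vector stands in for one 8-frame clip:

```python
import numpy as np
from algorithm import kmeans, tt01, tt02

# Dummy stand-ins for Timesformer clip features: 20 clips, one vector each.
rng = np.random.default_rng(0)
features = [rng.standard_normal(768).astype('float32') for _ in range(20)]

# K-Means keeps the clip nearest each centroid, then expands it to 8 frame indices.
print(len(kmeans(features, 3)))            # 3 clips * 8 frames = 24 indices

# The threshold selectors cut a new segment whenever the sum of squared
# differences to the reference clip exceeds the threshold, then sample
# 15% of each segment's frames.
print(len(tt01(features, threshold=400)))
print(len(tt02(features, threshold=400)))
```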
app.py CHANGED
@@ -1,6 +1,126 @@
  import gradio as gr
  import cv2
  import os
+ import spaces
+ import tempfile
+ from utils import *
+ from algorithm import *
+
+ @spaces.GPU
+ def make_video(video_path, outdir='./summarized_video', encoder='Kmeans'):
+     if encoder not in ["Kmeans", "Sum of Squared Difference 01", "Sum of Squared Difference 02"]:
+         encoder = "Kmeans"
+     # TODO: handle the remaining algorithm choices here
+
+     model, processor, device = load_model()
+
+     # total_params = sum(param.numel() for param in model.parameters())
+     # print('Total parameters: {:.2f}M'.format(total_params / 1e6))
+
+     if os.path.isfile(video_path):
+         if video_path.endswith('txt'):
+             with open(video_path, 'r') as f:
+                 lines = f.read().splitlines()
+         else:
+             filenames = [video_path]
+     else:
+         filenames = os.listdir(video_path)
+         filenames = [os.path.join(video_path, filename) for filename in filenames if not filename.startswith('.')]
+         filenames.sort()
+
+     for k, filename in enumerate(filenames):
+         print('Progress {:}/{:},'.format(k+1, len(filenames)), 'Processing', filename)
+
+         raw_video = cv2.VideoCapture(filename)
+         frame_width, frame_height = int(raw_video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(raw_video.get(cv2.CAP_PROP_FRAME_HEIGHT))
+         frame_rate = int(raw_video.get(cv2.CAP_PROP_FPS))
+         #length = int(raw_video.get(cv2.CAP_PROP_FRAME_COUNT))
+
+         filename = os.path.basename(filename)
+
+         # Find the size to resize the frames to
+         if "shortest_edge" in processor.size:
+             height = width = processor.size["shortest_edge"]
+         else:
+             height = processor.size["height"]
+             width = processor.size["width"]
+         # cv2.resize takes (width, height); the two are equal here
+         resize_to = (width, height)
+
+         # Keep every clip_sample_rate-th frame (F/Fs)
+         clip_sample_rate = 1
+         # Frames per clip fed to the model (F)
+         num_frames = 8
+
+         frames = []
+         features = []
+
+         # output_path = os.path.join(outdir, filename[:filename.rfind('.')] + '_summarized.mp4')
+         with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmpfile:
+             output_path = tmpfile.name
+
+         while raw_video.isOpened():
+             ret, raw_frame = raw_video.read()
+             if not ret:
+                 break
+
+             raw_frame = cv2.resize(raw_frame, resize_to)
+             frames.append(raw_frame)
+
+         # Select key frames at the clip sample rate
+         key_frames = frames[::clip_sample_rate]
+         #print('total of frames after sampling:', len(key_frames))
+
+         # Drop trailing frames so the count is divisible by num_frames
+         num_kept_frames = len(key_frames) - (len(key_frames) % num_frames)
+
+         # Final key frames
+         final_key_frames = key_frames[:num_kept_frames]
+         #print('total of frames after dropping the remainder:', len(final_key_frames))
+
+         for i in range(0, len(final_key_frames), num_frames):
+             if i % (num_frames*50) == 0:
+                 print(f"Loading {i}/{len(final_key_frames)}")
+
+             # One clip as input to the model
+             input_frames = final_key_frames[i:i+num_frames]
+             # Extract features
+             batch_features = extract_features(input_frames, device, model, processor)
+             # Move to CPU numpy arrays to reduce GPU memory usage
+             batch_features = batch_features.cpu().detach().numpy()
+             features.extend(batch_features)
+
+         number_of_clusters = round(len(features)*0.15)
+
+         selected_frames = []
+         if encoder == "Kmeans":
+             selected_frames = kmeans(features, number_of_clusters)
+         elif encoder == "Sum of Squared Difference 01":
+             selected_frames = tt01(features, 400)
+         else:
+             selected_frames = tt02(features, 400)
+
+         video_writer = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), frame_rate, (frames[0].shape[1], frames[0].shape[0]))
+         for idx in selected_frames:
+             video_writer.write(frames[idx])
+
+         raw_video.release()
+         video_writer.release()
+         print("Completed summarizing the video (wait for a moment to load).")
+     return output_path
  
  css = """
  #img-display-container {
@@ -14,7 +134,6 @@ css = """
  }
  """
  
- 
  title = "# Video Summarization Demo"
  description = """Video Summarization using Timesformer.
  
@@ -28,18 +147,18 @@ with gr.Blocks(css=css) as demo:
  
      with gr.Row():
          input_video = gr.Video(label="Input Video")
-         model_type = gr.Dropdown(["K-means", "Sum of Squared Difference 01", "Sum of Squared Difference 02"], type="value", label='Model Type')
+         algorithm_type = gr.Dropdown(["Kmeans", "Sum of Squared Difference 01", "Sum of Squared Difference 02"], type="value", label='Algorithm')
          submit = gr.Button("Submit")
          processed_video = gr.Video(label="Summarized Video")
  
-     def on_submit(uploaded_video,model_type):
+     def on_submit(uploaded_video, algorithm_type):
  
          # Process the video and get the path of the output video
-         #output_video_path = make_video(uploaded_video,encoder=model_type)
+         #output_video_path = make_video(uploaded_video, encoder=algorithm_type)
          pass
          #return output_video_path
  
-     submit.click(on_submit, inputs=[input_video, model_type], outputs=processed_video)
+     submit.click(on_submit, inputs=[input_video, algorithm_type], outputs=processed_video)
  
      #example_files = os.listdir('assets/examples_video')
      #example_files.sort()
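
Note that `on_submit` is still a stub (`pass`), so the Space produces no output yet. Once wired up, a minimal handler could look like the sketch below, following the commented-out lines in the diff:

```python
def on_submit(uploaded_video, algorithm_type):
    # Summarize the uploaded video with the selected algorithm and hand
    # the resulting file path back to the output gr.Video component.
    output_video_path = make_video(uploaded_video, encoder=algorithm_type)
    return output_video_path
```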
utils.py ADDED
@@ -0,0 +1,77 @@
+ from transformers import TimesformerModel, VideoMAEImageProcessor
+ import torch
+ import cv2
+ import numpy as np
+ import os
+ from os.path import isfile, join, basename
+ from torchvision.transforms import Lambda
+ from pytorchvideo.transforms import Normalize
+
+ def extract_features(frames, device, model, image_processor):
+     # Stack the frames into one tensor of shape (num_frames, height, width, channel)
+     frames_tensor = torch.stack([torch.from_numpy(frame) for frame in frames])
+     # Reorder to (channel, num_frames, height, width), the layout Normalize expects
+     frames_tensor = frames_tensor.permute(3, 0, 1, 2).to(device)
+
+     # Get the mean and std of the image processor
+     mean = image_processor.image_mean
+     std = image_processor.image_std
+
+     # Scale to [0, 1] and normalize the frames
+     frames_tensor = Lambda(lambda x: x / 255.0)(frames_tensor)
+     frames_tensor = Normalize(mean, std)(frames_tensor)
+
+     # Reorder to (num_frames, channel, height, width) and add a batch dimension
+     frames_tensor = frames_tensor.permute(1, 0, 2, 3).unsqueeze(0)
+
+     # Run the model on the device
+     model.to(device)
+     model.eval()
+     with torch.no_grad():
+         outputs = model(frames_tensor)
+
+     # Take the [CLS] token of the last hidden state as the clip-level feature
+     final_output = outputs[0][:, 0]
+
+     return final_output
+
+ def to_video(selected_frames, frames, output_path, video_fps):
+
+     print("MP4 Format.")
+     # Write the selected frames to a video
+     video_writer = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), video_fps, (frames[0].shape[1], frames[0].shape[0]))
+
+     # selected_frames is a list of frame indices
+     for idx in selected_frames:
+         video_writer.write(frames[idx])
+
+     video_writer.release()
+     print("Completed summarizing the video (wait for a moment to load).")
+
+ def to_txt(selected_frames, output_path, clip_sample_rate):
+     # Write the selected frame indices to a txt file
+     with open(output_path, "w") as file:
+         for item in selected_frames:
+             file.write(str(item) + "\n")
+
+     print("Completed summarizing the txt (wait for a moment to load).")
+
+ def load_model():
+     try:
+         DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+         model = TimesformerModel.from_pretrained("facebook/timesformer-base-finetuned-k600").to(DEVICE).eval()
+         processor = VideoMAEImageProcessor.from_pretrained("MCG-NJU/videomae-base")
+         return model, processor, DEVICE
+
+     except Exception as e:
+         print(e)
+
+ def sum_of_squared_difference(vector1, vector2):
+     squared_diff = np.square(vector1 - vector2)
+     sum_squared_diff = np.sum(squared_diff)
+     return sum_squared_diff
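
To make the tensor layout changes in `extract_features` easier to follow, here is a stand-alone shape trace with a dummy clip; the 224x224 size is an assumption based on the processor's default, and the frame count matches the 8-frame clips used above:

```python
import torch

# One dummy clip: 8 frames in HWC uint8 layout, as cv2 returns them after resizing.
frames = [torch.zeros(224, 224, 3, dtype=torch.uint8) for _ in range(8)]

x = torch.stack(frames)                  # (8, 224, 224, 3)    (T, H, W, C)
x = x.permute(3, 0, 1, 2)                # (3, 8, 224, 224)    (C, T, H, W), what Normalize expects
x = x.permute(1, 0, 2, 3).unsqueeze(0)   # (1, 8, 3, 224, 224) (B, T, C, H, W), Timesformer's input layout
print(x.shape)  # torch.Size([1, 8, 3, 224, 224])
```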