hexuan21 committed on
Commit
df7a5b3
1 Parent(s): e7742fc

update for app_regression.py

Browse files
Files changed (1) hide show
  1. app_regression.py +27 -15
app_regression.py CHANGED
@@ -14,10 +14,11 @@ from models.conversation import conv_templates
14
  from typing import List
15
 
16
 
17
- processor = AutoProcessor.from_pretrained("/home/dongfu/WorkSpace/Mantis/checkpoints/idefics2-8b/mantis-8b-idefics2-video-eval-debug_4096_regression/checkpoint-final")
18
- model = Idefics2ForSequenceClassification.from_pretrained("/home/dongfu/WorkSpace/Mantis/checkpoints/idefics2-8b/mantis-8b-idefics2-video-eval-debug_4096_regression/checkpoint-final", torch_dtype=torch.bfloat16)
19
  model.eval()
20
- MAX_NUM_FRAMES = 24
 
21
  conv_template = conv_templates["idefics_2"]
22
 
23
  with open("./examples/all_subsets.json", 'r') as f:
@@ -46,22 +47,34 @@ then give scores from 7 different dimensions:
46
  (4) motion smoothness, the smoothness of motion or movements
47
  (5) text-to-video alignment, the alignment between the text prompt and the video content
48
  (6) factual consistency, the consistency of the video content with the common-sense and factual knowledge
49
- (7) overall score, the overall quality of the video
50
- for each dimension, output a number from [1,2,3,4],
51
- in which '1' is 'Bad', '2' is 'Average', '3' is 'Good', '4' is 'Perfect'
 
52
  Here is an output example:
53
- visual quality: 3
54
- object consistency: 4
55
- dynamic degree: 4
56
- motion smoothness: 1
57
- text-to-video alignment: 1
58
- factual consistency: 2
59
- overall score: 1
60
 
61
  For this video, the text prompt is "{text_prompt}",
62
  all the frames of video are as follows:
63
 
64
  """
 
 
 
 
 
 
 
 
 
 
 
 
65
  @spaces.GPU(duration=60)
66
  def score(prompt:str, images:List[Image.Image]):
67
  if not prompt:
@@ -85,7 +98,7 @@ def score(prompt:str, images:List[Image.Image]):
85
 
86
  logits = outputs.logits
87
  num_aspects = logits.shape[-1]
88
- aspects = [f"aspect_{i}" for i in range(num_aspects)]
89
 
90
  aspect_scores = {}
91
  for i, aspect in enumerate(aspects):
@@ -130,7 +143,6 @@ def eval_video(prompt, video:str):
130
 
131
  eval_prompt = VIDEO_EVAL_PROMPT.format(text_prompt=prompt)
132
 
133
-
134
  num_image_token = eval_prompt.count("<image>")
135
  if num_image_token < len(frames):
136
  eval_prompt += "<image> " * (len(frames) - num_image_token)
 
14
  from typing import List
15
 
16
 
17
+ processor = AutoProcessor.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-20k-mantis-2epoch_4096_regression")
18
+ model = Idefics2ForSequenceClassification.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-20k-mantis-2epoch_4096_regression", torch_dtype=torch.bfloat16)
19
  model.eval()
20
+
21
+ MAX_NUM_FRAMES = 8
22
  conv_template = conv_templates["idefics_2"]
23
 
24
  with open("./examples/all_subsets.json", 'r') as f:
 
47
  (4) motion smoothness, the smoothness of motion or movements
48
  (5) text-to-video alignment, the alignment between the text prompt and the video content
49
  (6) factual consistency, the consistency of the video content with the common-sense and factual knowledge
50
+
51
+ for each dimension, output a float number from 1.0 to 4.0,
52
+ the higher the number is, the better the video performs in that sub-score,
53
+ the lowest 1.0 means Bad, the highest 4.0 means Perfect/Real (the video is like a real video)
54
  Here is an output example:
55
+ visual quality: 3.2
56
+ object consistency: 2.7
57
+ dynamic degree: 4.0
58
+ motion smoothness: 1.6
59
+ text-to-video alignment: 2.3
60
+ factual consistency: 1.8
 
61
 
62
  For this video, the text prompt is "{text_prompt}",
63
  all the frames of video are as follows:
64
 
65
  """
66
+
67
+
68
+ aspect_mapping={
69
+ 1:"visual quality",
70
+ 2:"object consistency",
71
+ 3:"dynamic degree",
72
+ 4:"motion smoothness",
73
+ 5:'text-to-video alignment',
74
+ 6:'factual consistency',
75
+ }
76
+
77
+
78
  @spaces.GPU(duration=60)
79
  def score(prompt:str, images:List[Image.Image]):
80
  if not prompt:
 
98
 
99
  logits = outputs.logits
100
  num_aspects = logits.shape[-1]
101
+ aspects = [aspect_mapping[i+1] for i in range(num_aspects)]
102
 
103
  aspect_scores = {}
104
  for i, aspect in enumerate(aspects):
 
143
 
144
  eval_prompt = VIDEO_EVAL_PROMPT.format(text_prompt=prompt)
145
 
 
146
  num_image_token = eval_prompt.count("<image>")
147
  if num_image_token < len(frames):
148
  eval_prompt += "<image> " * (len(frames) - num_image_token)