zhangzicheng committed
Commit dd37d46 · verified · 1 parent: eba9bb9

Update README.md

Files changed (1): README.md (+114 −3)
README.md CHANGED

---
license: mit
---

## [Installation Free!] Quicker Start with Hugging Face AutoModel

No need to install this GitHub repo. Just make sure you are using Transformers 4.45.0 (`pip install transformers==4.45.0`).

Run an image quality interpretation chat with Q-SiT:

```python
import requests
from PIL import Image
import torch
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration

model_id = "zhangzicheng/q-sit-mini"
# To use the primary version, switch to q-sit:
# model_id = "zhangzicheng/q-sit"

model = LlavaOnevisionForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
).to(0)

processor = AutoProcessor.from_pretrained(model_id)

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "How is the clarity of the human in this image?"},
            {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

raw_image = Image.open(requests.get("https://github.com/Q-Future/Q-SiT/blob/main/44009500.jpg?raw=true", stream=True).raw)

inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(0, torch.float16)

output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
# Keep only the assistant's reply from the decoded sequence
print(processor.decode(output[0][2:], skip_special_tokens=True).split("assistant")[-1])
# very low
```
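
The README stops at a single exchange, but the processor exposes a standard chat template, so you can append the model's reply and ask a follow-up in the same conversation. Below is a minimal multi-turn sketch, assuming `model`, `processor`, `conversation`, `raw_image`, and `output` from the block above are still in scope; the follow-up question is a hypothetical example:

```python
# Recover the assistant's first reply, then extend the conversation
# with it and a hypothetical follow-up question.
answer = processor.decode(output[0][2:], skip_special_tokens=True).split("assistant")[-1]
conversation += [
    {"role": "assistant", "content": [{"type": "text", "text": answer}]},
    {"role": "user", "content": [{"type": "text", "text": "Why do you think so?"}]},
]

prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(0, torch.float16)
output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
print(processor.decode(output[0][2:], skip_special_tokens=True).split("assistant")[-1])
```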

Run image quality scoring with Q-SiT:

```python
import torch
import requests
from PIL import Image
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration, AutoTokenizer
import numpy as np

def wa5(logits):
    # Softmax over the five rating logits, then a weighted average with
    # weights 1, 0.75, 0.5, 0.25, 0 (Excellent -> Bad)
    logprobs = np.array([logits["Excellent"], logits["Good"], logits["Fair"], logits["Poor"], logits["Bad"]])
    probs = np.exp(logprobs) / np.sum(np.exp(logprobs))
    return np.inner(probs, np.array([1, 0.75, 0.5, 0.25, 0]))

model_id = "zhangzicheng/q-sit-mini"
model = LlavaOnevisionForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
).to(0)

processor = AutoProcessor.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Define the rating tokens and look up their token IDs
toks = ["Excellent", "Good", "Fair", "Poor", "Bad"]
ids_ = [id_[0] for id_ in tokenizer(toks)["input_ids"]]
print("Rating token IDs:", ids_)

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Assume you are an image quality evaluator. \nYour rating should be chosen from the following five categories: Excellent, Good, Fair, Poor, and Bad (from high to low). \nHow would you rate the quality of this image?"},
            {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

# Load the image
raw_image = Image.open(requests.get("https://github.com/Q-Future/Q-SiT/blob/main/44009500.jpg?raw=true", stream=True).raw)
inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(0, torch.float16)

# Manually append the assistant prefix "The quality of this image is "
prefix_text = "The quality of this image is "
prefix_ids = tokenizer(prefix_text, return_tensors="pt")["input_ids"].to(0)
inputs["input_ids"] = torch.cat([inputs["input_ids"], prefix_ids], dim=-1)
inputs["attention_mask"] = torch.ones_like(inputs["input_ids"])  # update the attention mask to match

# Generate exactly one token (the rating)
output = model.generate(
    **inputs,
    max_new_tokens=1,  # generate only the rating token
    output_logits=True,
    return_dict_in_generate=True,
)

# Extract the logits for the generated rating token
last_logits = output.logits[-1][0]  # shape: [vocab_size]
logits_dict = {tok: last_logits[id_].item() for tok, id_ in zip(toks, ids_)}
weighted_score = wa5(logits_dict)
print("Weighted average score:", weighted_score)
# Weighted average score: 0.045549712192942585 (range 0-1)
# multiply by 5 if you prefer a 0-5 scale
```
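
For reference, `wa5` converts the five rating logits into a scalar in [0, 1]: it softmaxes the logits and takes the expectation of the weights 1, 0.75, 0.5, 0.25, 0 (Excellent through Bad), so a distribution concentrated on "Bad" yields a score near 0. To score many images, the single-image steps can be wrapped in a helper. The following is a minimal sketch, not from the original README: the `score_image` helper and the `images/` directory are hypothetical, and it assumes `model`, `processor`, `tokenizer`, `prompt`, `toks`, `ids_`, and `wa5` from the block above are in scope.

```python
import glob

def score_image(path):
    # Hypothetical helper: repeats the single-image scoring steps above
    image = Image.open(path).convert("RGB")
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(0, torch.float16)
    prefix_ids = tokenizer("The quality of this image is ", return_tensors="pt")["input_ids"].to(0)
    inputs["input_ids"] = torch.cat([inputs["input_ids"], prefix_ids], dim=-1)
    inputs["attention_mask"] = torch.ones_like(inputs["input_ids"])
    out = model.generate(**inputs, max_new_tokens=1, output_logits=True, return_dict_in_generate=True)
    logits = out.logits[-1][0]
    return wa5({tok: logits[id_].item() for tok, id_ in zip(toks, ids_)})

for path in glob.glob("images/*.jpg"):  # "images/" is a placeholder directory
    print(path, score_image(path))
```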

To test Q-SiT on datasets, please refer to the evaluation scripts [here](https://github.com/Q-Future/Q-SiT/tree/main/eval_scripts).
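
Those scripts report the standard IQA correlation metrics (SRCC/PLCC) between predicted and ground-truth scores. For a quick sanity check on your own predictions, something like the following works; this is a minimal sketch with placeholder arrays, not the official evaluation code:

```python
# Correlation between predicted scores and ground-truth labels
# (placeholder data, not the official evaluation script)
from scipy.stats import spearmanr, pearsonr

predicted = [0.42, 0.13, 0.88, 0.57]     # e.g. wa5 outputs per image
ground_truth = [0.40, 0.20, 0.95, 0.50]  # e.g. normalized MOS labels

srcc, _ = spearmanr(predicted, ground_truth)
plcc, _ = pearsonr(predicted, ground_truth)
print(f"SRCC: {srcc:.4f}, PLCC: {plcc:.4f}")
```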