Update README.md
README.md
CHANGED
@@ -1,3 +1,114 @@
---
license: mit
---
<h2>[Installation Free!] Quicker Start with Hugging Face AutoModel</h2>

No need to install this GitHub repo. Just make sure you are using Transformers 4.45.0 (`pip install transformers==4.45.0`).
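
As an optional sanity check (not part of the original snippet), you can confirm the pinned version before running the examples:

```python
# Optional check: confirm the pinned Transformers version is installed
import transformers
print(transformers.__version__)  # expected: 4.45.0
```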

Run an image quality interpreting chat with Q-SiT:
```python
import requests
from PIL import Image
import torch
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration

model_id = "zhangzicheng/q-sit-mini"
# to use the primary version, switch to q-sit
# model_id = "zhangzicheng/q-sit"

model = LlavaOnevisionForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
).to(0)

processor = AutoProcessor.from_pretrained(model_id)

# Build a single-turn conversation pairing the question with one image placeholder
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "How is the clarity of the human in this image?"},
            {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

# Fetch the example image from the Q-SiT repository
raw_image = Image.open(requests.get("https://github.com/Q-Future/Q-SiT/blob/main/44009500.jpg?raw=true", stream=True).raw)

inputs = processor(images=raw_image, text=prompt, return_tensors='pt').to(0, torch.float16)

output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
print(processor.decode(output[0][2:], skip_special_tokens=True).split("assistant")[-1])
# expected output: very low
```
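
If the image is already on disk, the same chat works without the download step. A minimal sketch, reusing `model`, `processor`, and `conversation` from the block above; the path `my_photo.jpg` is a placeholder:

```python
# Minimal local-image variant (reuses model, processor, and conversation from above;
# "my_photo.jpg" is a placeholder path)
from PIL import Image
import torch

local_image = Image.open("my_photo.jpg").convert("RGB")
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(images=local_image, text=prompt, return_tensors="pt").to(0, torch.float16)
output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
print(processor.decode(output[0][2:], skip_special_tokens=True).split("assistant")[-1])
```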

Run image quality scoring with Q-SiT. The logits of the five rating words (Excellent, Good, Fair, Poor, Bad) at the first generated token are converted into a continuous score via a softmax-weighted average:
```python
import torch
import requests
from PIL import Image
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration, AutoTokenizer
import numpy as np

def wa5(logits):
    # Softmax over the five rating logits, weighted from 1 (Excellent) down to 0 (Bad)
    logprobs = np.array([logits["Excellent"], logits["Good"], logits["Fair"], logits["Poor"], logits["Bad"]])
    probs = np.exp(logprobs) / np.sum(np.exp(logprobs))
    return np.inner(probs, np.array([1, 0.75, 0.5, 0.25, 0]))

model_id = "zhangzicheng/q-sit-mini"
model = LlavaOnevisionForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
).to(0)

processor = AutoProcessor.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Define rating tokens and look up their token IDs
toks = ["Excellent", "Good", "Fair", "Poor", "Bad"]
ids_ = [id_[0] for id_ in tokenizer(toks)["input_ids"]]
print("Rating token IDs:", ids_)

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Assume you are an image quality evaluator. \nYour rating should be chosen from the following five categories: Excellent, Good, Fair, Poor, and Bad (from high to low). \nHow would you rate the quality of this image?"},
            {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

# Load the example image
raw_image = Image.open(requests.get("https://github.com/Q-Future/Q-SiT/blob/main/44009500.jpg?raw=true", stream=True).raw)
inputs = processor(images=raw_image, text=prompt, return_tensors='pt').to(0, torch.float16)

# Manually append the assistant prefix "The quality of this image is "
prefix_text = "The quality of this image is "
prefix_ids = tokenizer(prefix_text, return_tensors="pt")["input_ids"].to(0)
inputs["input_ids"] = torch.cat([inputs["input_ids"], prefix_ids], dim=-1)
inputs["attention_mask"] = torch.ones_like(inputs["input_ids"])  # update the attention mask to match

# Generate exactly one token (the rating)
output = model.generate(
    **inputs,
    max_new_tokens=1,  # generate only the rating token
    output_logits=True,
    return_dict_in_generate=True,
)

# Extract the logits of the generated rating token and convert them to a weighted score
last_logits = output.logits[-1][0]  # shape: [vocab_size]
logits_dict = {tok: last_logits[id_].item() for tok, id_ in zip(toks, ids_)}
weighted_score = wa5(logits_dict)
print("Weighted average score:", weighted_score)
# Weighted average score: 0.045549712192942585 (range 0-1)
# multiply by 5 if you want a 0-5 range
```
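
To score a batch of images, the same steps can be wrapped in a loop. A rough sketch, assuming `model`, `processor`, `tokenizer`, `conversation`, `wa5`, `toks`, and `ids_` are already defined as above, with `image_paths` as a placeholder list of local files:

```python
# Rough batch-scoring sketch (reuses model, processor, tokenizer, conversation, wa5, toks, ids_
# from the block above; image_paths is a placeholder list of local image files)
from PIL import Image
import torch

image_paths = ["img1.jpg", "img2.jpg"]  # placeholder paths
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
prefix_ids = tokenizer("The quality of this image is ", return_tensors="pt")["input_ids"].to(0)

scores = {}
for path in image_paths:
    image = Image.open(path).convert("RGB")
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(0, torch.float16)
    inputs["input_ids"] = torch.cat([inputs["input_ids"], prefix_ids], dim=-1)
    inputs["attention_mask"] = torch.ones_like(inputs["input_ids"])
    out = model.generate(**inputs, max_new_tokens=1, output_logits=True, return_dict_in_generate=True)
    rating_logits = out.logits[-1][0]
    scores[path] = wa5({tok: rating_logits[id_].item() for tok, id_ in zip(toks, ids_)})

print(scores)  # illustrative, e.g. {"img1.jpg": 0.73, "img2.jpg": 0.41}
```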

To test Q-SiT on datasets, please refer to the evaluation scripts [here](https://github.com/Q-Future/Q-SiT/tree/main/eval_scripts).