bramw committed on
Commit
3638e5c
1 Parent(s): ddb5bc5

Fixing accidental edit

Browse files

My bad, clicked into wrong space to edit the app

Files changed (1) hide show
  1. app.py +242 -97
app.py CHANGED
@@ -1,140 +1,285 @@
1
- import gradio as gr
2
- import numpy as np
3
- # from edict_functions import EDICT_editing
4
- from PIL import Image
5
- from utils import Endpoint, get_token
6
  from io import BytesIO
7
- import requests
8
 
 
 
 
 
9
 
10
- endpoint = Endpoint()
11
-
12
def local_edict(x, source_text, edit_text,
                edit_strength, guidance_scale,
                steps=50, mix_weight=0.93, ):
    """Run EDICT editing in-process instead of through the remote endpoint.

    Args:
        x: Input image as an array accepted by ``PIL.Image.fromarray``.
        source_text: Description of the original image.
        edit_text: Description of the desired edited image.
        edit_strength: Forwarded to ``EDICT_editing`` as ``init_image_strength``.
        guidance_scale: Forwarded to ``EDICT_editing`` as ``guidance_scale``.
        steps: Forwarded to ``EDICT_editing`` as ``steps``.
        mix_weight: Forwarded to ``EDICT_editing`` as ``mix_weight``.

    Returns:
        The first element of the ``EDICT_editing`` result as a NumPy array.
    """
    # The module-level import of EDICT_editing is commented out, so the name
    # would be undefined when this function runs. Import it lazily so that
    # only callers of the local path need the dependency.
    from edict_functions import EDICT_editing

    pil_image = Image.fromarray(x)
    edited = EDICT_editing(
        pil_image,
        source_text,
        edit_text,
        steps=steps,
        mix_weight=mix_weight,
        init_image_strength=edit_strength,
        guidance_scale=guidance_scale,
    )[0]
    return np.array(edited)
25
 
26
def encode_image(image):
    """Serialize a PIL image into an in-memory JPEG buffer.

    Args:
        image: A PIL image (anything exposing ``mode``, ``convert`` and
            ``save``).

    Returns:
        A ``BytesIO`` holding the JPEG bytes, rewound to position 0 so it
        can be read or uploaded directly.
    """
    # JPEG has no alpha channel or palette; saving e.g. an RGBA or P image
    # raises an error, so normalise such images to RGB first.
    if image.mode not in ("RGB", "L"):
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG", quality=95)
    buffered.seek(0)

    return buffered
32
 
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
def decode_image(img_obj):
    """Open ``img_obj`` with PIL and return it converted to RGB mode."""
    return Image.open(img_obj).convert("RGB")
38
 
39
def edict(x, source_text, edit_text,
          edit_strength, guidance_scale,
          steps=50, mix_weight=0.93, timeout=(10, 120)):
    """Send an image and its edit prompts to the remote ``/api/edit`` endpoint.

    Args:
        x: Input image as an array accepted by ``PIL.Image.fromarray``.
        source_text: Description of the original image.
        edit_text: Description of the desired edited image.
        edit_strength: Sent to the service as ``edit_strength``.
        guidance_scale: Sent to the service as ``guidance_scale``.
        steps: Accepted for signature compatibility; not sent to the service.
        mix_weight: Accepted for signature compatibility; not sent to the
            service.
        timeout: ``(connect, read)`` timeout in seconds passed to
            ``requests.post`` so a stalled endpoint cannot block forever.

    Returns:
        The edited image as a NumPy array when the service answers with
        HTTP 200, otherwise the string ``"Error: "`` followed by the
        response body.
    """
    url = endpoint.url + "/api/edit"
    headers = {
        "User-Agent": "EDICT HuggingFace Space",
        "Auth-Token": get_token(),
    }

    data = {
        "source_text": source_text,
        "edit_text": edit_text,
        "edit_strength": edit_strength,
        "guidance_scale": guidance_scale,
    }

    image = encode_image(Image.fromarray(x))
    files = {"image": image}

    response = requests.post(
        url, data=data, files=files, headers=headers, timeout=timeout
    )

    if response.status_code == 200:
        return np.array(decode_image(BytesIO(response.content)))
    return "Error: " + response.text
69
 
70
# Each row lists, in order: image path, original description, edit
# description, edit strength, guidance scale.
examples = [
    ['square_ims/american_gothic.jpg', 'A painting of two people frowning', 'A painting of two people smiling', 0.5, 3],
    ['square_ims/colloseum.jpg', 'An old ruined building', 'A new modern office building', 0.8, 3],
    ['square_ims/scream.jpg', 'A painting of someone screaming', 'A painting of an alien', 0.5, 3],
    ['square_ims/yosemite.jpg', 'Granite forest valley', 'Granite desert valley', 0.8, 3],
    ['square_ims/einstein.jpg', 'Mouth open', 'Mouth closed', 0.8, 3],
    ['square_ims/einstein.jpg', 'A man', 'A man in K.I.S.S. facepaint', 0.8, 3],
]

# Disabled cupcake examples, kept for reference:
#   ['square_ims/imagenet_cake_2.jpg', 'A cupcake', 'A Chinese New Year cupcake', 0.8, 3],
#   ['square_ims/imagenet_cake_2.jpg', 'A cupcake', 'A Union Jack cupcake', 0.8, 3],
#   ['square_ims/imagenet_cake_2.jpg', 'A cupcake', 'A Nigerian flag cupcake', 0.8, 3],
#   ['square_ims/imagenet_cake_2.jpg', 'A cupcake', 'A Santa Claus cupcake', 0.8, 3],
#   ['square_ims/imagenet_cake_2.jpg', 'A cupcake', 'An Easter cupcake', 0.8, 3],
#   ['square_ims/imagenet_cake_2.jpg', 'A cupcake', 'A hedgehog cupcake', 0.8, 3],
#   ['square_ims/imagenet_cake_2.jpg', 'A cupcake', 'A rose cupcake', 0.8, 3],

# One example per (dog photo, target breed) combination, photo-major order.
examples.extend(
    [f'square_ims/imagenet_dog_{dog_i}.jpg', 'A dog', f'A {breed}', 0.8, 3]
    for dog_i in (1, 2)
    for breed in ('Golden Retriever', 'Chihuahua', 'Dalmatian')
)
95
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
# Markdown shown at the top of the interface (passed as `description=`).
description = '**For safety and ethical considerations, we have disabled image uploading from March 21. 2023.\nPlease try examples provided below.**\nA gradio demo for [EDICT](https://arxiv.org/abs/2211.12446) (CVPR23)'
# description = gr.Markdown(description)

# Markdown usage notes (passed as `article=`): prompting style, the meaning
# of the two sliders, and miscellaneous caveats.
article = """

### Prompting Style

As with many text-to-image methods, the prompting style of EDICT can make a big difference. When in doubt, experiment! Some guidance:
* Parallel *Original Description* and *Edit Description* construction as much as possible. Inserting/editing single words often is enough to affect a change while maintaining a lot of the original structure
* Words that will affect the entire setting (e.g. "A photo of " vs. "A painting of") can make a big difference. Playing around with them can help a lot

### Parameters
Both `edit_strength` and `guidance_scale` have similar properties qualitatively: the higher the value the more the image will change. We suggest
* Increasing/decreasing `edit_strength` first, particularly to alter/preserve more of the original structure/content
* Then changing `guidance_scale` to make the change in the edited region more or less pronounced.

Usually we find changing `edit_strength` to be enough, but feel free to play around (and report any interesting results)!

### Misc.

Having difficulty coming up with a caption? Try [BLIP](https://huggingface.co/spaces/Salesforce/BLIP2) to automatically generate one!

As with most StableDiffusion approaches, faces/text are often problematic to render, especially if they're small. Having these in the foreground will help keep them cleaner.

A returned black image means that the [Safety Checker](https://huggingface.co/CompVis/stable-diffusion-safety-checker) triggered on the photo. This happens in odd cases sometimes (it often rejects
the huggingface logo or variations), but we need to keep it in for obvious reasons.
"""
# article = gr.Markdown(description)
125
-
126
# Input widgets, listed in the positional order in which `edict` receives
# them: image, original description, edit description, then two sliders.
edict_inputs = [
    gr.Image(interactive=False),
    gr.Textbox(label="Original Description"),
    gr.Textbox(label="Edit Description"),
    # Sliders for `steps` and `mix_weight` are disabled, so `edict` uses its
    # own defaults for those parameters:
    # 50, # gr.Slider(5, 50, value=20, step=1),
    # 0.93, # gr.Slider(0.5, 1, value=0.7, step=0.05),
    gr.Slider(0.0, 1, value=0.8, step=0.05),
    gr.Slider(0, 10, value=3, step=0.5),
]

# Build the single-function interface around `edict` and start serving it.
iface = gr.Interface(
    fn=edict,
    inputs=edict_inputs,
    # examples = examples,
    outputs="image",
    description=description,
    article=article,
    #cache_examples=True
)
iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from io import BytesIO
 
2
 
3
+ import string
4
+ import gradio as gr
5
+ import requests
6
+ from utils import Endpoint, get_token
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
def encode_image(image):
    """Serialize a PIL image into an in-memory JPEG buffer.

    Args:
        image: A PIL image (anything exposing ``mode``, ``convert`` and
            ``save``).

    Returns:
        A ``BytesIO`` holding the JPEG bytes, rewound to position 0 so it
        can be read or uploaded directly.
    """
    # JPEG has no alpha channel or palette; saving e.g. an RGBA or P image
    # raises an error, so normalise such images to RGB first.
    if image.mode not in ("RGB", "L"):
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    buffered.seek(0)

    return buffered
15
 
16
 
17
def query_chat_api(
    image,
    prompt,
    decoding_method,
    temperature,
    len_penalty,
    repetition_penalty,
    timeout=(10, 120),
):
    """Ask the remote ``/api/generate`` endpoint to continue ``prompt`` for ``image``.

    Args:
        image: PIL image to upload; it is JPEG-encoded via ``encode_image``.
        prompt: Text prompt sent to the service as ``prompt``.
        decoding_method: UI choice; nucleus sampling is enabled only when
            this equals ``"Nucleus sampling"``.
        temperature: Sent to the service as ``temperature``.
        len_penalty: Sent to the service as ``length_penalty``.
        repetition_penalty: Sent to the service as ``repetition_penalty``.
        timeout: ``(connect, read)`` timeout in seconds passed to
            ``requests.post`` so a stalled endpoint cannot block forever.

    Returns:
        The decoded JSON body when the service answers with HTTP 200,
        otherwise the string ``"Error: "`` followed by the response body.
    """
    url = endpoint.url + "/api/generate"

    headers = {
        "User-Agent": "BLIP-2 HuggingFace Space",
        "Auth-Token": get_token(),
    }

    data = {
        "prompt": prompt,
        "use_nucleus_sampling": decoding_method == "Nucleus sampling",
        "temperature": temperature,
        "length_penalty": len_penalty,
        "repetition_penalty": repetition_penalty,
    }

    files = {"image": encode_image(image)}

    response = requests.post(
        url, data=data, files=files, headers=headers, timeout=timeout
    )

    if response.status_code == 200:
        return response.json()
    return "Error: " + response.text
46
 
 
 
 
47
 
48
def query_caption_api(
    image,
    decoding_method,
    temperature,
    len_penalty,
    repetition_penalty,
    timeout=(10, 120),
):
    """Ask the remote ``/api/caption`` endpoint to caption ``image``.

    Args:
        image: PIL image to upload; it is JPEG-encoded via ``encode_image``.
        decoding_method: UI choice; nucleus sampling is enabled only when
            this equals ``"Nucleus sampling"``.
        temperature: Sent to the service as ``temperature``.
        len_penalty: Sent to the service as ``length_penalty``.
        repetition_penalty: Sent to the service as ``repetition_penalty``.
        timeout: ``(connect, read)`` timeout in seconds passed to
            ``requests.post`` so a stalled endpoint cannot block forever.

    Returns:
        The decoded JSON body when the service answers with HTTP 200,
        otherwise the string ``"Error: "`` followed by the response body.
    """
    url = endpoint.url + "/api/caption"

    headers = {
        "User-Agent": "BLIP-2 HuggingFace Space",
        "Auth-Token": get_token(),
    }

    data = {
        "use_nucleus_sampling": decoding_method == "Nucleus sampling",
        "temperature": temperature,
        "length_penalty": len_penalty,
        "repetition_penalty": repetition_penalty,
    }

    files = {"image": encode_image(image)}

    response = requests.post(
        url, data=data, files=files, headers=headers, timeout=timeout
    )

    if response.status_code == 200:
        return response.json()
    return "Error: " + response.text
 
 
76
 
 
 
 
 
77
 
78
def postprocess_output(output):
    """Make sure the first generated text ends with a punctuation mark.

    Args:
        output: A list of generated strings. A bare string (such as an
            ``"Error: ..."`` message) is also accepted and is wrapped into
            a one-element list.

    Returns:
        The list with a full stop appended to its first element when that
        element is non-empty and does not already end in punctuation. A
        list argument is modified in place and returned; empty lists and
        empty first elements are returned unchanged.
    """
    if isinstance(output, str):
        # Strings are immutable and not item-assignable; normalise to the
        # list shape the rest of the function (and callers) expect.
        output = [output]

    # Guard against [] and [""] before looking at the last character.
    if output and output[0] and output[0][-1] not in string.punctuation:
        output[0] += "."

    return output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
 
 
 
85
 
86
def inference_chat(
    image,
    text_input,
    decoding_method,
    temperature,
    length_penalty,
    repetition_penalty,
    history=None,
):
    """Run one chat turn and return the updated chatbot view and history.

    Args:
        image: Image the conversation is about.
        text_input: The user's new message.
        decoding_method: Forwarded to ``query_chat_api``.
        temperature: Forwarded to ``query_chat_api``.
        length_penalty: Forwarded to ``query_chat_api``.
        repetition_penalty: Forwarded to ``query_chat_api``.
        history: Flat list of earlier messages in the order they were
            exchanged. ``None`` starts a new conversation.

    Returns:
        A dict keyed by the ``chatbot`` and ``state`` components, holding
        the list of (message, reply) pairs and the flat history list.
    """
    # Copy instead of appending to the caller's list (or to a shared
    # default), so one call can never leak messages into another.
    history = list(history) if history else []
    history.append(text_input)

    # The whole conversation so far is sent as the prompt.
    prompt = " ".join(history)

    output = query_chat_api(
        image, prompt, decoding_method, temperature, length_penalty, repetition_penalty
    )

    if isinstance(output, str):
        # query_chat_api reports failures as an "Error: ..." string. Show it
        # as the reply, but keep the failed turn out of the stored history so
        # it does not become part of later prompts.
        shown = history + [output]
        chat = list(zip(shown[::2], shown[1::2]))
        return {chatbot: chat, state: history[:-1]}

    history += postprocess_output(output)

    # Pair consecutive entries: (history[0], history[1]), (history[2], ...).
    chat = list(zip(history[::2], history[1::2]))

    return {chatbot: chat, state: history}
 
 
111
 
 
 
 
 
112
 
113
def inference_caption(
    image,
    decoding_method,
    temperature,
    length_penalty,
    repetition_penalty,
):
    """Return a caption for ``image`` from the caption endpoint.

    Args:
        image: Image to caption.
        decoding_method: Forwarded to ``query_caption_api``.
        temperature: Forwarded to ``query_caption_api``.
        length_penalty: Forwarded to ``query_caption_api``.
        repetition_penalty: Forwarded to ``query_caption_api``.

    Returns:
        The first caption returned by the service, the full ``"Error: ..."``
        message when the request failed, or ``""`` when the service
        returned no captions.
    """
    output = query_caption_api(
        image, decoding_method, temperature, length_penalty, repetition_penalty
    )

    if isinstance(output, str):
        # Failures arrive as a plain string; indexing it would show only
        # its first character, so pass the whole message through.
        return output

    return output[0] if output else ""
125
 
 
126
 
127
# Static page text rendered with gr.Markdown at the top of the demo.
title = """<h1 align="center">BLIP-2</h1>"""
description = """Gradio demo for BLIP-2, image-to-text generation from Salesforce Research. To use it, simply upload your image, or click one of the examples to load them.
<br> <strong>Disclaimer</strong>: This is a research prototype and is not intended for production use. No data including but not restricted to text and images is collected."""
# Links to the paper, code and docs, followed by the upload notice. Both
# <h2> notices are explicitly closed so the markup is well-formed.
article = """<strong>Paper</strong>: <a href='https://arxiv.org/abs/2301.12597' target='_blank'>BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models</a>
<br> <strong>Code</strong>: BLIP2 is now integrated into GitHub repo: <a href='https://github.com/salesforce/LAVIS' target='_blank'>LAVIS: a One-stop Library for Language and Vision</a>
<br> <strong>🤗 `transformers` integration</strong>: You can now use `transformers` to use our BLIP-2 models! Check out the <a href='https://huggingface.co/docs/transformers/main/en/model_doc/blip-2' target='_blank'> official docs </a>
<p> <strong>Project Page</strong>: <a href='https://github.com/salesforce/LAVIS/tree/main/projects/blip2' target='_blank'> BLIP2 on LAVIS</a>
<br> <strong>Description</strong>: Captioning results from <strong>BLIP2_OPT_6.7B</strong>. Chat results from <strong>BLIP2_FlanT5xxl</strong>.

<h2><strong>Due to ethical concerns, we have disabled image uploading from March 21. 2023. </strong></h2>
<h2><strong>Please try examples provided below.</strong></h2>
"""
139
+
140
# Shared endpoint object; the query_* functions read `endpoint.url` from it.
endpoint = Endpoint()

# Example rows for the demo: [image file, chat prompt]. They are wired to
# the image input and the chat input box by gr.Examples further down.
examples = [
    ["house.png", "How could someone get out of the house?"],
    ["flower.jpg", "Question: What is this flower and where is it's origin? Answer:"],
    ["pizza.jpg", "What are steps to cook it?"],
    ["sunset.jpg", "Here is a romantic message going along the photo:"],
    ["forbidden_city.webp", "In what dynasties was this place built?"],
]
149
+
150
# Build the demo UI: decoding controls and the image on the left, caption
# and chat widgets on the right, example rows underneath.
# NOTE(review): the nesting of the `with` blocks below was re-indented from
# a flattened diff view — confirm the layout against the running Space.
# NOTE(review): `#component-21` in the CSS looks like an auto-generated
# element id; presumably it depends on the order in which components are
# created, so confirm before adding, removing or reordering components.
with gr.Blocks(
    css="""
    .message.svelte-w6rprc.svelte-w6rprc.svelte-w6rprc {font-size: 20px; margin-top: 20px}
    #component-21 > div.wrap.svelte-w6rprc {height: 600px;}
    """
) as iface:
    # Chat history, passed into and returned from inference_chat.
    state = gr.State([])

    gr.Markdown(title)
    gr.Markdown(description)
    gr.Markdown(article)

    with gr.Row():
        with gr.Column(scale=1):
            # interactive=False: the image can only be set via the examples.
            image_input = gr.Image(type="pil", interactive=False)

            # with gr.Row():
            sampling = gr.Radio(
                choices=["Beam search", "Nucleus sampling"],
                value="Beam search",
                label="Text Decoding Method",
                interactive=True,
            )

            temperature = gr.Slider(
                minimum=0.5,
                maximum=1.0,
                value=1.0,
                step=0.1,
                interactive=True,
                label="Temperature (used with nucleus sampling)",
            )

            len_penalty = gr.Slider(
                minimum=-1.0,
                maximum=2.0,
                value=1.0,
                step=0.2,
                interactive=True,
                label="Length Penalty (set to larger for longer sequence, used with beam search)",
            )

            rep_penalty = gr.Slider(
                minimum=1.0,
                maximum=5.0,
                value=1.5,
                step=0.5,
                interactive=True,
                label="Repeat Penalty (larger value prevents repetition)",
            )

        with gr.Column(scale=1.8):

            with gr.Column():
                caption_output = gr.Textbox(lines=1, label="Caption Output")
                caption_button = gr.Button(
                    value="Caption it!", interactive=True, variant="primary"
                )
                # Inputs are listed in inference_caption's parameter order.
                caption_button.click(
                    inference_caption,
                    [
                        image_input,
                        sampling,
                        temperature,
                        len_penalty,
                        rep_penalty,
                    ],
                    [caption_output],
                )

            gr.Markdown("""Trying prompting your input for chat; e.g. example prompt for QA, \"Question: {} Answer:\" Use proper punctuation (e.g., question mark).""")
            with gr.Row():
                with gr.Column(
                    scale=1.5,
                ):
                    chatbot = gr.Chatbot(
                        label="Chat Output (from FlanT5)",
                    )

                # with gr.Row():
                with gr.Column(scale=1):
                    chat_input = gr.Textbox(lines=1, label="Chat Input")
                    # Submitting the text box runs a chat turn; inputs are
                    # listed in inference_chat's parameter order.
                    chat_input.submit(
                        inference_chat,
                        [
                            image_input,
                            chat_input,
                            sampling,
                            temperature,
                            len_penalty,
                            rep_penalty,
                            state,
                        ],
                        [chatbot, state],
                    )

                    with gr.Row():
                        clear_button = gr.Button(value="Clear", interactive=True)
                        # Reset the chat input, the chatbot and the history.
                        clear_button.click(
                            lambda: ("", [], []),
                            [],
                            [chat_input, chatbot, state],
                            queue=False,
                        )

                        submit_button = gr.Button(
                            value="Submit", interactive=True, variant="primary"
                        )
                        # Same handler and inputs as chat_input.submit above.
                        submit_button.click(
                            inference_chat,
                            [
                                image_input,
                                chat_input,
                                sampling,
                                temperature,
                                len_penalty,
                                rep_penalty,
                                state,
                            ],
                            [chatbot, state],
                        )

    # A new image resets the chatbot, the caption box and the history.
    image_input.change(
        lambda: ("", "", []),
        [],
        [chatbot, caption_output, state],
        queue=False,
    )

    # Rebinds `examples` from the list of rows to the gr.Examples component.
    examples = gr.Examples(
        examples=examples,
        inputs=[image_input, chat_input],
    )

# Configure the request queue, then start serving the app.
iface.queue(concurrency_count=1, api_open=False, max_size=10)
iface.launch(enable_queue=True)