hysts (HF staff) committed
Commit 2a6a910 (1 parent: 3b85b9a)

Use transformers

Files changed (7)
  1. .pre-commit-config.yaml +37 -0
  2. .style.yapf +5 -0
  3. README.md +2 -2
  4. app.py +249 -262
  5. requirements.txt +8 -0
  6. style.css +3 -0
  7. utils.py +0 -27
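
For context before the per-file diffs: this commit replaces the old HTTP-endpoint client with direct `transformers` inference. Below is a minimal sketch of that path, condensed from the new app.py in this diff (the model ID, 8-bit loading, and decoding arguments mirror the diff; the image path is a placeholder):

import PIL.Image
import torch
from transformers import AutoProcessor, Blip2ForConditionalGeneration

# OPT-6.7B checkpoint used for captioning in the new app.py, loaded in 8-bit.
model_id = 'Salesforce/blip2-opt-6.7b'
processor = AutoProcessor.from_pretrained(model_id)
model = Blip2ForConditionalGeneration.from_pretrained(model_id,
                                                      device_map='auto',
                                                      load_in_8bit=True)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

image = PIL.Image.open('house.png')  # placeholder: any of the Space's example images

# Captioning path: image-only inputs; max_length=50 as in generate_caption() below.
inputs = processor(images=image, return_tensors='pt').to(device, torch.float16)
generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=50)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip())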
.pre-commit-config.yaml ADDED
@@ -0,0 +1,37 @@
+ exclude: patch
+ repos:
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+   rev: v4.2.0
+   hooks:
+   - id: check-executables-have-shebangs
+   - id: check-json
+   - id: check-merge-conflict
+   - id: check-shebang-scripts-are-executable
+   - id: check-toml
+   - id: check-yaml
+   - id: double-quote-string-fixer
+   - id: end-of-file-fixer
+   - id: mixed-line-ending
+     args: ['--fix=lf']
+   - id: requirements-txt-fixer
+   - id: trailing-whitespace
+ - repo: https://github.com/myint/docformatter
+   rev: v1.4
+   hooks:
+   - id: docformatter
+     args: ['--in-place']
+ - repo: https://github.com/pycqa/isort
+   rev: 5.12.0
+   hooks:
+   - id: isort
+ - repo: https://github.com/pre-commit/mirrors-mypy
+   rev: v0.991
+   hooks:
+   - id: mypy
+     args: ['--ignore-missing-imports']
+     additional_dependencies: ['types-python-slugify']
+ - repo: https://github.com/google/yapf
+   rev: v0.32.0
+   hooks:
+   - id: yapf
+     args: ['--parallel', '--in-place']
.style.yapf ADDED
@@ -0,0 +1,5 @@
+ [style]
+ based_on_style = pep8
+ blank_line_before_nested_class_or_def = false
+ spaces_before_comment = 2
+ split_before_logical_operator = true
README.md CHANGED
@@ -1,10 +1,10 @@
  ---
- title: BLIP2
+ title: BLIP2 with transformers
  emoji: 🌖
  colorFrom: blue
  colorTo: pink
  sdk: gradio
- sdk_version: 3.17.0
+ sdk_version: 3.18.0
  app_file: app.py
  pinned: false
  license: bsd-3-clause
app.py CHANGED
@@ -1,282 +1,269 @@
- from io import BytesIO

- import string
- import gradio as gr
- import requests
- from utils import Endpoint, get_token
-
-
- def encode_image(image):
-     buffered = BytesIO()
-     image.save(buffered, format="JPEG")
-     buffered.seek(0)
-
-     return buffered
-
-
- def query_chat_api(
-     image, prompt, decoding_method, temperature, len_penalty, repetition_penalty
- ):
-
-     url = endpoint.url
-     url = url + "/api/generate"
-
-     headers = {
-         "User-Agent": "BLIP-2 HuggingFace Space",
-         "Auth-Token": get_token(),
-     }
-
-     data = {
-         "prompt": prompt,
-         "use_nucleus_sampling": decoding_method == "Nucleus sampling",
-         "temperature": temperature,
-         "length_penalty": len_penalty,
-         "repetition_penalty": repetition_penalty,
-     }
-
-     image = encode_image(image)
-     files = {"image": image}
-
-     response = requests.post(url, data=data, files=files, headers=headers)

-     if response.status_code == 200:
-         return response.json()
-     else:
-         return "Error: " + response.text
-
-
- def query_caption_api(
-     image, decoding_method, temperature, len_penalty, repetition_penalty
- ):
-
-     url = endpoint.url
-     url = url + "/api/caption"
-
-     headers = {
-         "User-Agent": "BLIP-2 HuggingFace Space",
-         "Auth-Token": get_token(),
-     }

-     data = {
-         "use_nucleus_sampling": decoding_method == "Nucleus sampling",
-         "temperature": temperature,
-         "length_penalty": len_penalty,
-         "repetition_penalty": repetition_penalty,
      }
-
-     image = encode_image(image)
-     files = {"image": image}
-
-     response = requests.post(url, data=data, files=files, headers=headers)
-
-     if response.status_code == 200:
-         return response.json()
-     else:
-         return "Error: " + response.text
-
-
- def postprocess_output(output):
-     # if last character is not a punctuation, add a full stop
-     if not output[0][-1] in string.punctuation:
-         output[0] += "."
-
      return output


- def inference_chat(
-     image,
-     text_input,
-     decoding_method,
-     temperature,
-     length_penalty,
-     repetition_penalty,
-     history=[],
- ):
-     text_input = text_input
-     history.append(text_input)
-
-     prompt = " ".join(history)
-
-     output = query_chat_api(
-         image, prompt, decoding_method, temperature, length_penalty, repetition_penalty
      )
      output = postprocess_output(output)
-     history += output
-
-     chat = [
-         (history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)
-     ] # convert to tuples of list
-
-     return {chatbot: chat, state: history}
-
-
- def inference_caption(
-     image,
-     decoding_method,
-     temperature,
-     length_penalty,
-     repetition_penalty,
- ):
-     output = query_caption_api(
-         image, decoding_method, temperature, length_penalty, repetition_penalty
-     )
-
-     return output[0]


- title = """<h1 align="center">BLIP-2</h1>"""
- description = """Gradio demo for BLIP-2, image-to-text generation from Salesforce Research. To use it, simply upload your image, or click one of the examples to load them.
- <br> <strong>Disclaimer</strong>: This is a research prototype and is not intended for production use. No data including but not restricted to text and images is collected."""
- article = """<strong>Paper</strong>: <a href='https://arxiv.org/abs/2301.12597' target='_blank'>BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models</a>
- <br> <strong>Code</strong>: BLIP2 is now integrated into GitHub repo: <a href='https://github.com/salesforce/LAVIS' target='_blank'>LAVIS: a One-stop Library for Language and Vision</a>
- <br> <strong>🤗 `transformers` integration</strong>: You can now use `transformers` to use our BLIP-2 models! Check out the <a href='https://huggingface.co/docs/transformers/main/en/model_doc/blip-2' target='_blank'> official docs </a>
- <p> <strong>Project Page</strong>: <a href='https://github.com/salesforce/LAVIS/tree/main/projects/blip2' target='_blank'> BLIP2 on LAVIS</a>
- <br> <strong>Description</strong>: Captioning results from <strong>BLIP2_OPT_6.7B</strong>. Chat results from <strong>BLIP2_FlanT5xxl</strong>.
- """
-
- endpoint = Endpoint()

  examples = [
-     ["house.png", "How could someone get out of the house?"],
-     ["flower.jpg", "Question: What is this flower and where is it's origin? Answer:"],
-     ["pizza.jpg", "What are steps to cook it?"],
-     ["sunset.jpg", "Here is a romantic message going along the photo:"],
-     ["forbidden_city.webp", "In what dynasties was this place built?"],
  ]

- with gr.Blocks(
-     css="""
-     .message.svelte-w6rprc.svelte-w6rprc.svelte-w6rprc {font-size: 20px; margin-top: 20px}
-     #component-21 > div.wrap.svelte-w6rprc {height: 600px;}
-     """
- ) as iface:
-     state = gr.State([])
-
-     gr.Markdown(title)
-     gr.Markdown(description)
-     gr.Markdown(article)
-
      with gr.Row():
-         with gr.Column(scale=1):
-             image_input = gr.Image(type="pil")
-
-             # with gr.Row():
-             sampling = gr.Radio(
-                 choices=["Beam search", "Nucleus sampling"],
-                 value="Beam search",
-                 label="Text Decoding Method",
-                 interactive=True,
-             )
-
-             temperature = gr.Slider(
-                 minimum=0.5,
-                 maximum=1.0,
-                 value=1.0,
-                 step=0.1,
-                 interactive=True,
-                 label="Temperature (used with nucleus sampling)",
-             )
-
-             len_penalty = gr.Slider(
-                 minimum=-1.0,
-                 maximum=2.0,
-                 value=1.0,
-                 step=0.2,
-                 interactive=True,
-                 label="Length Penalty (set to larger for longer sequence, used with beam search)",
-             )
-
-             rep_penalty = gr.Slider(
-                 minimum=1.0,
-                 maximum=5.0,
-                 value=1.5,
-                 step=0.5,
-                 interactive=True,
-                 label="Repeat Penalty (larger value prevents repetition)",
-             )
-
-         with gr.Column(scale=1.8):
-
-             with gr.Column():
-                 caption_output = gr.Textbox(lines=1, label="Caption Output")
-                 caption_button = gr.Button(
-                     value="Caption it!", interactive=True, variant="primary"
-                 )
-                 caption_button.click(
-                     inference_caption,
-                     [
-                         image_input,
-                         sampling,
-                         temperature,
-                         len_penalty,
-                         rep_penalty,
-                     ],
-                     [caption_output],
-                 )
-
-             gr.Markdown("""Trying prompting your input for chat; e.g. example prompt for QA, \"Question: {} Answer:\" Use proper punctuation (e.g., question mark).""")
-             with gr.Row():
-                 with gr.Column(
-                     scale=1.5,
-                 ):
-                     chatbot = gr.Chatbot(
-                         label="Chat Output (from FlanT5)",
-                     )
-
-                 # with gr.Row():
-                 with gr.Column(scale=1):
-                     chat_input = gr.Textbox(lines=1, label="Chat Input")
-                     chat_input.submit(
-                         inference_chat,
-                         [
-                             image_input,
-                             chat_input,
-                             sampling,
-                             temperature,
-                             len_penalty,
-                             rep_penalty,
-                             state,
-                         ],
-                         [chatbot, state],
-                     )
-
-                     with gr.Row():
-                         clear_button = gr.Button(value="Clear", interactive=True)
-                         clear_button.click(
-                             lambda: ("", [], []),
-                             [],
-                             [chat_input, chatbot, state],
-                             queue=False,
-                         )
-
-                         submit_button = gr.Button(
-                             value="Submit", interactive=True, variant="primary"
-                         )
-                         submit_button.click(
-                             inference_chat,
-                             [
-                                 image_input,
-                                 chat_input,
-                                 sampling,
-                                 temperature,
-                                 len_penalty,
-                                 rep_penalty,
-                                 state,
-                             ],
-                             [chatbot, state],
-                         )

-     image_input.change(
-         lambda: ("", "", []),
-         [],
-         [chatbot, caption_output, state],
-         queue=False,
-     )

-     examples = gr.Examples(
-         examples=examples,
-         inputs=[image_input, chat_input],
      )

- iface.queue(concurrency_count=1, api_open=False, max_size=10)
- iface.launch(enable_queue=True)
 
+ #!/usr/bin/env python

+ from __future__ import annotations

+ import string

+ import gradio as gr
+ import PIL.Image
+ import torch
+ from transformers import AutoProcessor, Blip2ForConditionalGeneration
+
+ DESCRIPTION = '# BLIP-2'
+
+ device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+
+ MODEL_ID_OPT_6_7B = 'Salesforce/blip2-opt-6.7b'
+ MODEL_ID_FLAN_T5_XXL = 'Salesforce/blip2-flan-t5-xxl'
+ model_dict = {
+     MODEL_ID_OPT_6_7B: {
+         'processor':
+         AutoProcessor.from_pretrained(MODEL_ID_OPT_6_7B),
+         'model':
+         Blip2ForConditionalGeneration.from_pretrained(MODEL_ID_OPT_6_7B,
+                                                       device_map='auto',
+                                                       load_in_8bit=True),
+     },
+     MODEL_ID_FLAN_T5_XXL: {
+         'processor':
+         AutoProcessor.from_pretrained(MODEL_ID_FLAN_T5_XXL),
+         'model':
+         Blip2ForConditionalGeneration.from_pretrained(MODEL_ID_FLAN_T5_XXL,
+                                                       device_map='auto',
+                                                       load_in_8bit=True),
      }
+ }
+
+
+ def generate_caption(model_id: str, image: PIL.Image.Image,
+                      decoding_method: str, temperature: float,
+                      length_penalty: float, repetition_penalty: float) -> str:
+     model_info = model_dict[model_id]
+     processor = model_info['processor']
+     model = model_info['model']
+
+     inputs = processor(images=image,
+                        return_tensors='pt').to(device, torch.float16)
+     generated_ids = model.generate(
+         pixel_values=inputs.pixel_values,
+         do_sample=decoding_method == 'Nucleus sampling',
+         temperature=temperature,
+         length_penalty=length_penalty,
+         repetition_penalty=repetition_penalty,
+         max_length=50)
+     result = processor.batch_decode(generated_ids,
+                                     skip_special_tokens=True)[0].strip()
+     return result
+
+
+ def answer_question(model_id: str, image: PIL.Image.Image, text: str,
+                     decoding_method: str, temperature: float,
+                     length_penalty: float, repetition_penalty: float) -> str:
+     model_info = model_dict[model_id]
+     processor = model_info['processor']
+     model = model_info['model']
+
+     inputs = processor(images=image, text=text,
+                        return_tensors='pt').to(device, torch.float16)
+     generated_ids = model.generate(**inputs,
+                                    do_sample=decoding_method ==
+                                    'Nucleus sampling',
+                                    temperature=temperature,
+                                    length_penalty=length_penalty,
+                                    repetition_penalty=repetition_penalty)
+     result = processor.batch_decode(generated_ids,
+                                     skip_special_tokens=True)[0].strip()
+     return result
+
+
+ def postprocess_output(output: str) -> str:
+     if output and not output[-1] in string.punctuation:
+         output += '.'
      return output


+ def chat(
+     model_id: str,
+     image: PIL.Image.Image,
+     text: str,
+     decoding_method: str,
+     temperature: float,
+     length_penalty: float,
+     repetition_penalty: float,
+     history_orig: list[str] = [],
+     history_qa: list[str] = [],
+ ) -> tuple[dict[str, list[str]], dict[str, list[str]], dict[str, list[str]]]:
+     history_orig.append(text)
+     text_qa = f'Question: {text} Answer:'
+     history_qa.append(text_qa)
+     prompt = ' '.join(history_qa)
+
+     output = answer_question(
+         model_id,
+         image,
+         prompt,
+         decoding_method,
+         temperature,
+         length_penalty,
+         repetition_penalty,
      )
      output = postprocess_output(output)
+     history_orig.append(output)
+     history_qa.append(output)

+     chat_val = list(zip(history_orig[0::2], history_orig[1::2]))
+     return gr.update(value=chat_val), gr.update(value=history_orig), gr.update(
+         value=history_qa)


  examples = [
+     [
+         'house.png',
+         'How could someone get out of the house?',
+     ],
+     [
+         'flower.jpg',
+         'What is this flower and where is it\'s origin?',
+     ],
+     [
+         'pizza.jpg',
+         'What are steps to cook it?',
+     ],
+     [
+         'sunset.jpg',
+         'Here is a romantic message going along the photo:',
+     ],
+     [
+         'forbidden_city.webp',
+         'In what dynasties was this place built?',
+     ],
  ]

+ with gr.Blocks(css='style.css') as demo:
+     gr.Markdown(DESCRIPTION)
+
+     image = gr.Image(type='pil')
+     with gr.Accordion(label='Advanced settings', open=False):
+         with gr.Row():
+             model_id_caption = gr.Dropdown(
+                 label='Model ID for image captioning',
+                 choices=[MODEL_ID_OPT_6_7B, MODEL_ID_FLAN_T5_XXL],
+                 value=MODEL_ID_OPT_6_7B)
+             model_id_chat = gr.Dropdown(
+                 label='Model ID for VQA',
+                 choices=[MODEL_ID_OPT_6_7B, MODEL_ID_FLAN_T5_XXL],
+                 value=MODEL_ID_FLAN_T5_XXL)
+         sampling_method = gr.Radio(
+             label='Text Decoding Method',
+             choices=['Beam search', 'Nucleus sampling'],
+             value='Beam search',
+         )
+         temperature = gr.Slider(
+             label='Temperature (used with nucleus sampling)',
+             minimum=0.5,
+             maximum=1.0,
+             value=1.0,
+             step=0.1,
+         )
+         length_penalty = gr.Slider(
+             label=
+             'Length Penalty (set to larger for longer sequence, used with beam search)',
+             minimum=-1.0,
+             maximum=2.0,
+             value=1.0,
+             step=0.2,
+         )
+         rep_penalty = gr.Slider(
+             label='Repeat Penalty (larger value prevents repetition)',
+             minimum=1.0,
+             maximum=5.0,
+             value=1.5,
+             step=0.5,
+         )
      with gr.Row():
+         with gr.Column():
+             with gr.Box():
+                 gr.Markdown('Image Captioning')
+                 caption_button = gr.Button(value='Caption it!')
+                 caption_output = gr.Textbox(label='Caption Output')
+         with gr.Column():
+             with gr.Box():
+                 gr.Markdown('VQA Chat')
+                 vqa_input = gr.Text(label='Chat Input', max_lines=1)
+                 with gr.Row():
+                     clear_chat_button = gr.Button(value='Clear')
+                     chat_button = gr.Button(value='Submit')
+                 chatbot = gr.Chatbot(label='Chat Output')
+     history_orig = gr.State(value=[])
+     history_qa = gr.State(value=[])
+
+     gr.Examples(
+         examples=examples,
+         inputs=[
+             image,
+             vqa_input,
+         ],
+     )

+     caption_button.click(
+         fn=generate_caption,
+         inputs=[
+             model_id_caption,
+             image,
+             sampling_method,
+             temperature,
+             length_penalty,
+             rep_penalty,
+         ],
+         outputs=caption_output,
+     )

+     chat_inputs = [
+         model_id_chat,
+         image,
+         vqa_input,
+         sampling_method,
+         temperature,
+         length_penalty,
+         rep_penalty,
+         history_orig,
+     ]
+     chat_outputs = [
+         chatbot,
+         history_orig,
+         history_qa,
+     ]
+     vqa_input.submit(
+         fn=chat,
+         inputs=chat_inputs,
+         outputs=chat_outputs,
+     )
+     chat_button.click(
+         fn=chat,
+         inputs=chat_inputs,
+         outputs=chat_outputs,
+     )
+     clear_chat_button.click(
+         fn=lambda: ('', [], [], []),
+         inputs=None,
+         outputs=[
+             vqa_input,
+             chatbot,
+             history_orig,
+             history_qa,
+         ],
+         queue=False,
+     )
+     image.change(
+         fn=lambda: ('', '', [], []),
+         inputs=None,
+         outputs=[
+             chatbot,
+             caption_output,
+             history_orig,
+             history_qa,
+         ],
+         queue=False,
      )

+ demo.queue(max_size=10).launch()
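
For illustration of the new chat flow: chat() keeps the raw turns in history_orig (for display) and the QA-formatted turns in history_qa, and the prompt sent to the model is the space-joined QA history. The flower/rose turns below are hypothetical, not from the commit:

# Hypothetical two-turn exchange, following chat() in the new app.py above.
history_qa = [
    'Question: What is this flower? Answer:',  # user turn, wrapped by chat()
    'It is a rose.',                           # model reply appended by chat()
    'Question: Where does it grow? Answer:',   # next user turn
]
prompt = ' '.join(history_qa)
# prompt == 'Question: What is this flower? Answer: It is a rose. Question: Where does it grow? Answer:'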
 
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ accelerate==0.16.0
+ bitsandbytes==0.37.0
+ git+https://github.com/huggingface/transformers@c836f77
+ gradio==3.18.0
+ huggingface-hub==0.12.0
+ Pillow==9.4.0
+ torch==1.13.1
+ torchvision==0.14.1
style.css ADDED
@@ -0,0 +1,3 @@
+ h1 {
+   text-align: center;
+ }
utils.py DELETED
@@ -1,27 +0,0 @@
- import os
-
-
- class Endpoint:
-     def __init__(self):
-         self._url = None
-
-     @property
-     def url(self):
-         if self._url is None:
-             self._url = self.get_url()
-
-         return self._url
-
-     def get_url(self):
-         endpoint = os.environ.get("endpoint")
-
-         return endpoint
-
-
- def get_token():
-     token = os.environ.get("auth_token")
-
-     if token is None:
-         raise ValueError("auth-token not found in environment variables")
-
-     return token