Upload 5 files

Files changed:
- README.md (+6 −6)
- app.py (+20 −77)
- builder.py (+167 −0)
README.md CHANGED
@@ -1,13 +1,13 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: LLaVA 1.6
+emoji: π
+colorFrom: green
+colorTo: yellow
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.36.1
 app_file: app.py
 pinned: false
-license:
+license: apache-2.0
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -11,11 +11,11 @@ import gradio as gr
 import spaces
 import torch

+from builder import load_pretrained_model
 from llava.constants import IMAGE_TOKEN_INDEX
 from llava.constants import LOGDIR
 from llava.conversation import (default_conversation, conv_templates)
 from llava.mm_utils import KeywordsStoppingCriteria, tokenizer_image_token
-from llava.model.builder import load_pretrained_model
 from llava.utils import (build_logger, violates_moderation, moderation_msg)
 from taxonomy import wrap_taxonomy, default_taxonomy

@@ -67,24 +67,28 @@ def run_llava(prompt, pil_image):
     return outputs[0].strip()


+def load_selected_model(model_path):
+    model_name = model_path.split("/")[-1]
+    global tokenizer, model, image_processor, context_len
+    with warnings.catch_warnings(record=True) as w:
+        warnings.simplefilter("always")
+        tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name)
+    for warning in w:
+        if "vision" not in str(warning.message).lower():
+            print(warning.message)
+    model.config.tokenizer_model_max_length = 2048 * 2
+
+
 def get_conv_log_filename():
     t = datetime.datetime.now()
     name = os.path.join(LOGDIR, f"{t.year}-{t.month:02d}-{t.day:02d}-conv.json")
     return name

-
 def get_model_list():
-    # ret = requests.post(args.controller_url + "/refresh_all_workers")
-    # assert ret.status_code == 200
-    # ret = requests.post(args.controller_url + "/list_models")
-    # logger.info(f"get_model_list: {ret.json()}")
-    # models = ret.json()["models"]
-    # models.sort(key=lambda x: priority.get(x, x))
-    # logger.info(f"Models: {models}")
     models = [
-
-
-
+        'LukasHug/LlavaGuard-7B-hf',
+        'LukasHug/LlavaGuard-13B-hf',
+        'LukasHug/LlavaGuard-34B-hf', ]
     return models


@@ -245,18 +249,6 @@ def llava_bot(state, model_selector, temperature, top_p, max_new_tokens, request
     new_state.append_message(new_state.roles[1], None)
     state = new_state

-    # Query worker address
-    # controller_url = args.controller_url
-    # ret = requests.post(controller_url + "/get_worker_address",
-    #                     json={"model": model_name})
-    # worker_addr = ret.json()["address"]
-    # logger.info(f"model_name: {model_name}, worker_addr: {worker_addr}")
-
-    # No available worker
-    # if worker_addr == "":
-    #     state.messages[-1][-1] = server_error_msg
-    #     yield (state, state.to_gradio_chatbot(), disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
-    #     return

     # Construct prompt
     prompt = state.get_prompt()

@@ -274,51 +266,8 @@ def llava_bot(state, model_selector, temperature, top_p, max_new_tokens, request

     state.messages[-1][-1] = output

-    # Make requests
-    # pload = {
-    #     "model": model_name,
-    #     "prompt": prompt,
-    #     "temperature": float(temperature),
-    #     "top_p": float(top_p),
-    #     # "num_beams": 2,
-    #     # "top_k": 50,
-    #     "max_new_tokens": min(int(max_new_tokens), 1536),
-    #     "stop": state.sep if state.sep_style in [SeparatorStyle.SINGLE, SeparatorStyle.MPT] else state.sep2,
-    #     "images": f'List of {len(state.get_images())} images: {all_image_hash}',
-    # }
-    # logger.info(f"==== request ====\n{pload}")
-    #
-    # pload['images'] = state.get_images()
-
-    # state.messages[-1][-1] = "▌"
-
     yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5

-    # try:
-    #     # Stream output
-    #     response = requests.post(worker_addr + "/worker_generate_stream",
-    #                              headers=headers, json=pload, stream=True, timeout=10)
-    #     for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"):
-    #         if chunk:
-    #             data = json.loads(chunk.decode())
-    #             if data["error_code"] == 0:
-    #                 output = data["text"][len(prompt):].strip()
-    #                 state.messages[-1][-1] = output + "▌"
-    #                 yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
-    #             else:
-    #                 output = data["text"] + f" (error_code: {data['error_code']})"
-    #                 state.messages[-1][-1] = output
-    #                 yield (state, state.to_gradio_chatbot()) + (
-    #                     disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
-    #                 return
-    #         time.sleep(0.03)
-    # except requests.exceptions.RequestException as e:
-    #     state.messages[-1][-1] = server_error_msg
-    #     yield (state, state.to_gradio_chatbot()) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
-    #     return
-    #
-    # state.messages[-1][-1] = state.messages[-1][-1][:-1]
-    # yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5

     finish_tstamp = time.time()
     logger.info(f"{output}")

@@ -457,6 +406,8 @@ def build_demo(embed_mode, cur_dir=None, concurrency_count=10):
         [textbox, upvote_btn, downvote_btn, flag_btn]
     )

+    model_selector.change(load_selected_model)
+
     regenerate_btn.click(
         regenerate,
         [state, image_process_mode],

@@ -540,10 +491,7 @@ Set the environment variable `model` to change the model:
 ['AIML-TUDA/LlavaGuard-13B'](https://huggingface.co/AIML-TUDA/LlavaGuard-13B),
 ['AIML-TUDA/LlavaGuard-34B'](https://huggingface.co/AIML-TUDA/LlavaGuard-34B),
 """
-    # set_up_env_and_token(read=True)
     print(f"args: {args}")
-    # set the huggingface login token
-    # controller_proc = start_controller()
     concurrency_count = int(os.getenv("concurrency_count", 5))
     api_key = os.getenv("token")
     if api_key:

@@ -561,19 +509,14 @@ Set the environment variable `model` to change the model:
         'LukasHug/LlavaGuard-13B-hf',
         'LukasHug/LlavaGuard-34B-hf', ]
     bits = int(os.getenv("bits", 16))
-    model = os.getenv("model", models[
+    model = os.getenv("model", models[1])
     available_devices = os.getenv("CUDA_VISIBLE_DEVICES", "0")
-    model_path, model_name = model, model.split("/")[
-    # model_path = '/common-repos/LlavaGuard/models/LlavaGuard-v1.1-7b-full/smid_and_crawled_v2_with_augmented_policies/json-v12/llava'
-
+    model_path, model_name = model, model.split("/")[0]

     print(f"Loading model {model_path}")
     tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name, token=api_key)
-
     model.config.tokenizer_model_max_length = 2048 * 2

-    # Wait for worker and controller to start
-    # time.sleep(10)

     exit_status = 0
     try:
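The app.py change above replaces the old worker/controller plumbing with an in-process reload: load_selected_model is registered on the model dropdown and filters the warnings raised while the checkpoint loads. Below is a minimal, self-contained sketch of that pattern, not the Space's code; the fake_load helper and the explicit inputs/outputs wiring are illustrative assumptions (the diff itself registers the callback without them).

```python
# Sketch: reload a global model on dropdown change and surface only relevant warnings.
import warnings

import gradio as gr

model = None  # module-level handle, mirroring app.py's global


def fake_load(model_path):
    # Stand-in for builder.load_pretrained_model (hypothetical helper).
    warnings.warn("vision tower weights not found, initializing")  # filtered out below
    warnings.warn("generation config mismatch")                    # surfaced below
    return f"<model {model_path}>"


def load_selected_model(model_path):
    global model
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")  # record every warning instead of printing
        model = fake_load(model_path)
    for w in caught:
        if "vision" not in str(w.message).lower():
            print(w.message)  # only non vision-tower warnings reach the log
    return f"loaded {model_path}"


with gr.Blocks() as demo:
    selector = gr.Dropdown(
        ["LukasHug/LlavaGuard-7B-hf", "LukasHug/LlavaGuard-13B-hf"],
        label="model",
    )
    status = gr.Textbox(label="status")
    # Passing inputs/outputs explicitly is the more common Gradio 4 form.
    selector.change(load_selected_model, inputs=selector, outputs=status)

if __name__ == "__main__":
    demo.launch()
```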
builder.py ADDED
@@ -0,0 +1,167 @@
+# Copyright 2023 Haotian Liu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+import warnings
+import shutil
+
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
+import torch
+from llava.model import *
+from llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+
+
+def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
+    kwargs = {"device_map": device_map, **kwargs}
+
+    if device != "cuda":
+        kwargs['device_map'] = {"": device}
+
+    if load_8bit:
+        kwargs['load_in_8bit'] = True
+    elif load_4bit:
+        kwargs['load_in_4bit'] = True
+        kwargs['quantization_config'] = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type='nf4'
+        )
+    else:
+        kwargs['torch_dtype'] = torch.float16
+
+    if use_flash_attn:
+        kwargs['attn_implementation'] = 'flash_attention_2'
+    token = kwargs.get('token', None)
+    if 'llava' in model_name.lower():
+        # Load LLaVA model
+        if 'lora' in model_name.lower() and model_base is None:
+            warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged.')
+        if 'lora' in model_name.lower() and model_base is not None:
+            from llava.model.language_model.llava_llama import LlavaConfig
+            lora_cfg_pretrained = LlavaConfig.from_pretrained(model_path)
+            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
+            print('Loading LLaVA from base model...')
+            model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
+            token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features
+            if model.lm_head.weight.shape[0] != token_num:
+                model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))
+                model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))
+
+            print('Loading additional LLaVA weights...')
+            if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')):
+                non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu')
+            else:
+                # this is probably from HF Hub
+                from huggingface_hub import hf_hub_download
+                def load_from_hf(repo_id, filename, subfolder=None):
+                    cache_file = hf_hub_download(
+                        repo_id=repo_id,
+                        filename=filename,
+                        subfolder=subfolder)
+                    return torch.load(cache_file, map_location='cpu')
+                non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin')
+            non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
+            if any(k.startswith('model.model.') for k in non_lora_trainables):
+                non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
+            model.load_state_dict(non_lora_trainables, strict=False)
+
+            from peft import PeftModel
+            print('Loading LoRA weights...')
+            model = PeftModel.from_pretrained(model, model_path)
+            print('Merging LoRA weights...')
+            model = model.merge_and_unload()
+            print('Model is loaded...')
+        elif model_base is not None:
+            # this may be mm projector only
+            print('Loading LLaVA from base model...')
+            if 'mpt' in model_name.lower():
+                if not os.path.isfile(os.path.join(model_path, 'configuration_mpt.py')):
+                    shutil.copyfile(os.path.join(model_base, 'configuration_mpt.py'), os.path.join(model_path, 'configuration_mpt.py'))
+                tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=True)
+                cfg_pretrained = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+                model = LlavaMptForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
+            else:
+                tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
+                cfg_pretrained = AutoConfig.from_pretrained(model_path)
+                model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
+
+            mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu')
+            mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
+            model.load_state_dict(mm_projector_weights, strict=False)
+        else:
+            if 'mpt' in model_name.lower():
+                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
+                model = LlavaMptForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
+            elif 'mistral' in model_name.lower():
+                tokenizer = AutoTokenizer.from_pretrained(model_path)
+                model = LlavaMistralForCausalLM.from_pretrained(
+                    model_path,
+                    low_cpu_mem_usage=True,
+                    **kwargs
+                )
+            else:
+                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)
+                model = LlavaLlamaForCausalLM.from_pretrained(
+                    model_path,
+                    low_cpu_mem_usage=True,
+                    **kwargs
+                )
+    else:
+        # Load language model
+        if model_base is not None:
+            # PEFT model
+            from peft import PeftModel
+            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
+            model = AutoModelForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
+            print(f"Loading LoRA weights from {model_path}")
+            model = PeftModel.from_pretrained(model, model_path)
+            print(f"Merging weights")
+            model = model.merge_and_unload()
+            print('Convert to FP16...')
+            model.to(torch.float16)
+        else:
+            use_fast = False
+            if 'mpt' in model_name.lower():
+                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
+                model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs)
+            else:
+                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+                model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
+
+    image_processor = None
+
+    if 'llava' in model_name.lower():
+        mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
+        mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
+        if mm_use_im_patch_token:
+            tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
+        if mm_use_im_start_end:
+            tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
+        model.resize_token_embeddings(len(tokenizer))
+
+        vision_tower = model.get_vision_tower()
+        if not vision_tower.is_loaded:
+            vision_tower.load_model(device_map=device_map)
+        if device_map != 'auto':
+            vision_tower.to(device=device_map, dtype=torch.float16)
+        image_processor = vision_tower.image_processor
+
+    if hasattr(model.config, "max_sequence_length"):
+        context_len = model.config.max_sequence_length
+    else:
+        context_len = 2048
+
+    return tokenizer, model, image_processor, context_len
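builder.py is a local copy of LLaVA's model loader that the Space imports directly, so it can be exercised on its own. A hedged usage sketch follows; it assumes the llava package and a CUDA device are available, and the checkpoint id and the 4-bit choice are illustrative, taken from the list hard-coded in app.py rather than confirmed by this commit.

```python
# Sketch: load a LlavaGuard checkpoint through builder.load_pretrained_model in 4-bit.
from builder import load_pretrained_model

MODEL_PATH = "LukasHug/LlavaGuard-7B-hf"
MODEL_NAME = MODEL_PATH.split("/")[-1]  # must contain "llava" to take the LLaVA branch

# load_4bit=True routes through the BitsAndBytesConfig in builder.py (nf4, fp16 compute).
tokenizer, model, image_processor, context_len = load_pretrained_model(
    MODEL_PATH,
    None,        # model_base: only needed for LoRA / projector-only checkpoints
    MODEL_NAME,
    load_4bit=True,
    device_map="auto",
)

print(type(model).__name__, "context length:", context_len)
print("image processor:", image_processor.__class__.__name__)
```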