ningshanwutuobang committed on
Commit
13f6fc8
•
1 Parent(s): 499cad2
Files changed (3)
  1. app.py +70 -0
  2. panda_gpt.py +116 -0
  3. requirements.txt +22 -0
app.py ADDED
@@ -0,0 +1,70 @@
+ from panda_gpt import PandaGPT
+ import gradio as gr
+
+ from huggingface_hub import hf_hub_download
+
+ # download the quantized Vicuna weights (GGML) and the PandaGPT projection checkpoint
+ vicuna_path = hf_hub_download(repo_id="ningshanwutuobang/ggml-pandagpt-vicuna-merge", filename="ggml-pandagpt-vicuna-q4_1.bin")
+ panda_path = hf_hub_download(repo_id="openllmplayground/pandagpt_13b_max_len_400", filename="pytorch_model.pt")
+
+ a = PandaGPT((vicuna_path,))
+ a.load_projection(panda_path)
+
+
+ def add_text(history, text):
+     history = history + [(text, None)]
+     return history, gr.update(value="", interactive=False)
+
+
+ def add_file(history, file):
+     history = history + [((file.name,), None)]
+     return history
+
+
+ def bot(history):
+     text = history[-1][0]
+     image_paths = []
+     audio_paths = []
+     video_paths = []
+     # collect the files uploaded since the last answered turn, grouped by extension
+     for i in history[:-1]:
+         if i[1] is None:
+             path = i[0][0]
+             if path[-4:] in [".png", "jpeg"]:
+                 image_paths.append(path)
+             if path[-3:] in ["mp3", "wav"]:
+                 audio_paths.append(path)
+             if path[-3:] in ["mp4", "avi", "mkv"]:
+                 video_paths.append(path)
+         else:
+             # an already answered turn resets the pending uploads
+             image_paths = []
+             audio_paths = []
+             video_paths = []
+     if len(image_paths) == 0 and len(audio_paths) == 0 and len(video_paths) == 0:
+         response = a.chat(text)
+     else:
+         response = a.chat_with_image({"image_paths": image_paths, "audio_paths": audio_paths, "video_paths": video_paths}, text)
+     history[-1][1] = response[:-3]  # drop the trailing "###" stop marker
+     return history
+
+
+ with gr.Blocks() as demo:
+     chatbot = gr.Chatbot([], elem_id="chatbot").style(height=750)
+
+     with gr.Row():
+         with gr.Column(scale=0.85):
+             txt = gr.Textbox(
+                 show_label=False,
+                 placeholder="Enter text and press enter, or upload an image",
+             ).style(container=False)
+         with gr.Column(scale=0.15, min_width=0):
+             btn = gr.UploadButton("📁", file_types=["image", "video", "audio"])
+
+     txt_msg = txt.submit(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
+         bot, chatbot, chatbot
+     )
+     txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
+     file_msg = btn.upload(add_file, [chatbot, btn], [chatbot], queue=False)
+
+ demo.launch()
+
+
+ # a.chat_with
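The chat flow above relies on a specific shape for the Gradio history: text turns are stored as (text, None) and uploads as a one-element tuple holding the file path, also paired with None until bot() answers. A minimal sketch of that structure, with hypothetical file paths (not part of the commit):

history = [
    (("/tmp/gradio/cat.png",), None),    # pending upload: one-element tuple of the file path
    ("What is in this picture?", None),  # current prompt; bot() fills the None with the reply
]

# bot() buckets the pending uploads by extension and hands them to
# PandaGPT.chat_with_image as per-modality path lists:
inputs = {"image_paths": ["/tmp/gradio/cat.png"], "audio_paths": [], "video_paths": []}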
panda_gpt.py ADDED
@@ -0,0 +1,116 @@
+ import llama_cpp
+ import numpy as np
+ import os, sys
+ from ctypes import POINTER, c_float
+ import torch
+ from torch import nn
+
+ # clone PandaGPT next to this file so its ImageBind code can be imported
+ panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT")
+ imagebind_ckpt_path = os.path.join(os.path.dirname(__file__), "imagebind_huge.pth")
+
+ if not os.path.exists(panda_gpt_path):
+     os.system("git clone https://github.com/yxuansu/PandaGPT " + panda_gpt_path)
+
+ sys.path.insert(0, os.path.join(panda_gpt_path, "code", "model"))
+ from ImageBind.models import imagebind_model
+ from ImageBind.models.imagebind_model import ModalityType
+ from ImageBind import data
+
+
+ def numpy_to_floatptr(x):
+     # llama_eval_embd expects a raw float* buffer
+     return x.astype(np.float32).ctypes.data_as(POINTER(c_float))
+
+
+ class PandaGPT:
+     def __init__(self, args=(), kwargs={}):
+         self.visual_encoder, _ = imagebind_model.imagebind_huge(pretrained=True, store_path=os.path.dirname(imagebind_ckpt_path))
+         self.visual_encoder.eval()
+         self.llama_proj = nn.Linear(1024, 5120)  # ImageBind hidden size -> Vicuna-13B embedding size
+         self.max_tgt_len = 400
+         self.model = llama_cpp.Llama(*args, **kwargs)
+         self.generated_text = ""
+         self.device = "cpu"
+
+     def eval_embd(self, x):
+         # feed projected multimodal embeddings straight into the llama.cpp context
+         y = numpy_to_floatptr(x.T)
+         ctx = self.model.ctx
+         n_past = self.model.n_tokens
+         n_threads = self.model.n_threads
+         llama_cpp.llama_eval_embd(ctx, y, x.shape[0], n_past, n_threads)
+         self.model.n_tokens += x.shape[0]
+
+     def eval_string(self, s):
+         s = self.model.tokenize(s.encode())
+         self.model.eval(s)
+
+     def generate_with_print(self, end="###"):
+         end = end.encode()
+         ret = b""
+         for i in range(self.max_tgt_len):
+             token = self.model.sample()
+             self.model.eval([token])
+             txt = self.model.detokenize([token])
+             ret += txt
+             print(txt.decode(errors="replace"), flush=True, end="")
+             if ret.endswith(end):
+                 break
+         return ret.decode(errors="replace")
+
+     def load_projection(self, path):
+         # load only the linear projection weights from the PandaGPT checkpoint
+         state = torch.load(path, map_location="cpu")
+         self.llama_proj.load_state_dict({
+             "weight": state["llama_proj.weight"],
+             "bias": state["llama_proj.bias"]})
+
+     def eval_inputs(self, inputs):
+         self.eval_string("<Img>")
+         embds = self.extract_multimodal_feature(inputs)
+         for i in embds:
+             self.eval_embd(i)
+         self.eval_string("</Img> ")
+
+     def chat(self, question):
+         return self.chat_with_image(None, question)
+
+     def chat_with_image(self, inputs, question):
+         if self.generated_text == "":
+             self.eval_string("###")
+         self.eval_string(" Human: ")
+         if inputs:
+             self.eval_inputs(inputs)
+         self.eval_string(question)
+         self.eval_string("\n### Assistant:")
+         ret = self.generate_with_print(end="###")
+         self.generated_text += ret
+         return ret
+
+     def extract_multimodal_feature(self, inputs):
+         features = []
+         for key in ["image", "audio", "video", "thermal"]:
+             if key + "_paths" in inputs:
+                 embeds = self.encode_data(key, inputs[key + "_paths"])
+                 features.append(embeds)
+         return features
+
+     def encode_data(self, data_type, data_paths):
+         type_map = {
+             "image": ModalityType.VISION,
+             "audio": ModalityType.AUDIO,
+             "video": ModalityType.VISION,
+             "thermal": ModalityType.THERMAL,
+         }
+         load_map = {
+             "image": data.load_and_transform_vision_data,
+             "audio": data.load_and_transform_audio_data,
+             "video": data.load_and_transform_video_data,
+             "thermal": data.load_and_transform_thermal_data
+         }
+
+         load_function = load_map[data_type]
+         key = type_map[data_type]
+
+         inputs = {key: load_function(data_paths, self.device)}
+         with torch.no_grad():
+             embeddings = self.visual_encoder(inputs)
+             embeds = embeddings[key]
+             embeds = self.llama_proj(embeds).cpu().numpy()
+         return embeds
+
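For reference, a minimal usage sketch of the class outside the Gradio app (not part of the commit); the local model and checkpoint paths are hypothetical stand-ins for the files app.py fetches with hf_hub_download:

from panda_gpt import PandaGPT

bot = PandaGPT(("./ggml-pandagpt-vicuna-q4_1.bin",))  # positional args are forwarded to llama_cpp.Llama
bot.load_projection("./pytorch_model.pt")             # PandaGPT's llama_proj weights

print(bot.chat("Hello, who are you?"))                # text-only turn

# multimodal turn: paths are grouped per modality, exactly as bot() does in app.py
print(bot.chat_with_image({"image_paths": ["./example.png"]}, "Describe the image."))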
requirements.txt ADDED
@@ -0,0 +1,22 @@
+ timm==0.6.7
+ deepspeed==0.9.2
+ data
+ einops==0.6.1
+ ftfy==6.1.1
+ iopath==0.1.10
+ ipdb==0.13.13
+ numpy==1.24.3
+ peft==0.3.0
+ Pillow==9.5.0
+ PyYAML==6.0
+ regex==2022.10.31
+ torchvision==0.14.1
+ torchaudio==0.13.1
+ pytorchvideo
+ fvcore
+ decord==0.6.0
+ tqdm==4.64.1
+ transformers==4.29.1
+ llama-cpp-python>=0.1.67
+ gradio
+ huggingface_hub