ningshanwutuobang committed on
Commit
13f6fc8
•
1 Parent(s): 499cad2
Files changed (3)
  1. app.py +70 -0
  2. panda_gpt.py +116 -0
  3. requirements.txt +22 -0
app.py ADDED
@@ -0,0 +1,70 @@
+ from panda_gpt import PandaGPT
+ import gradio as gr
+
+ from huggingface_hub import hf_hub_download
+
+ # download the quantized Vicuna weights (GGML) and the PandaGPT projection checkpoint
+ vicuna_path = hf_hub_download(repo_id="ningshanwutuobang/ggml-pandagpt-vicuna-merge", filename="ggml-pandagpt-vicuna-q4_1.bin")
+ panda_path = hf_hub_download(repo_id="openllmplayground/pandagpt_13b_max_len_400", filename="pytorch_model.pt")
+
+ a = PandaGPT((vicuna_path,))
+ a.load_projection(panda_path)
+
+
+ def add_text(history, text):
+     history = history + [(text, None)]
+     return history, gr.update(value="", interactive=False)
+
+
+ def add_file(history, file):
+     history = history + [((file.name,), None)]
+     return history
+
+
+ def bot(history):
+     text = history[-1][0]
+     image_paths = []
+     audio_paths = []
+     video_paths = []
+     # collect the files uploaded since the last answered turn, grouped by extension
+     for i in history[:-1]:
+         if i[1] is None:
+             path = i[0][0]
+             if path[-4:] in [".png", "jpeg"]:
+                 image_paths.append(path)
+             if path[-3:] in ["mp3", "wav"]:
+                 audio_paths.append(path)
+             if path[-3:] in ["mp4", "avi", "mkv"]:
+                 video_paths.append(path)
+         else:
+             # an already answered turn resets the pending uploads
+             image_paths = []
+             audio_paths = []
+             video_paths = []
+     if len(image_paths) == 0 and len(audio_paths) == 0 and len(video_paths) == 0:
+         response = a.chat(text)
+     else:
+         response = a.chat_with_image({"image_paths": image_paths, "audio_paths": audio_paths, "video_paths": video_paths}, text)
+     history[-1][1] = response[:-3]  # drop the trailing "###" stop marker
+     return history
+
+
+ with gr.Blocks() as demo:
+     chatbot = gr.Chatbot([], elem_id="chatbot").style(height=750)
+
+     with gr.Row():
+         with gr.Column(scale=0.85):
+             txt = gr.Textbox(
+                 show_label=False,
+                 placeholder="Enter text and press enter, or upload an image",
+             ).style(container=False)
+         with gr.Column(scale=0.15, min_width=0):
+             btn = gr.UploadButton("📁", file_types=["image", "video", "audio"])
+
+     txt_msg = txt.submit(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
+         bot, chatbot, chatbot
+     )
+     txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
+     file_msg = btn.upload(add_file, [chatbot, btn], [chatbot], queue=False)
+
+ demo.launch()
+
+
+ # a.chat_with
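The chat flow above relies on a specific shape for the Gradio history: text turns are stored as (text, None) and uploads as a one-element tuple holding the file path, also paired with None until bot() answers. A minimal sketch of that structure, with hypothetical file paths (not part of the commit):

history = [
    (("/tmp/gradio/cat.png",), None),    # pending upload: one-element tuple of the file path
    ("What is in this picture?", None),  # current prompt; bot() fills the None with the reply
]

# bot() buckets the pending uploads by extension and hands them to
# PandaGPT.chat_with_image as per-modality path lists:
inputs = {"image_paths": ["/tmp/gradio/cat.png"], "audio_paths": [], "video_paths": []}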
panda_gpt.py ADDED
@@ -0,0 +1,116 @@
+ import llama_cpp
+ import numpy as np
+ import os, sys
+ from ctypes import POINTER, c_float
+ import torch
+ from torch import nn
+
+ # clone PandaGPT next to this file so its ImageBind code can be imported
+ panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT")
+ imagebind_ckpt_path = os.path.join(os.path.dirname(__file__), "imagebind_huge.pth")
+
+ if not os.path.exists(panda_gpt_path):
+     os.system("git clone https://github.com/yxuansu/PandaGPT " + panda_gpt_path)
+
+ sys.path.insert(0, os.path.join(panda_gpt_path, "code", "model"))
+ from ImageBind.models import imagebind_model
+ from ImageBind.models.imagebind_model import ModalityType
+ from ImageBind import data
+
+
+ def numpy_to_floatptr(x):
+     # llama_eval_embd expects a raw float* buffer
+     return x.astype(np.float32).ctypes.data_as(POINTER(c_float))
+
+
+ class PandaGPT:
+     def __init__(self, args=(), kwargs={}):
+         self.visual_encoder, _ = imagebind_model.imagebind_huge(pretrained=True, store_path=os.path.dirname(imagebind_ckpt_path))
+         self.visual_encoder.eval()
+         self.llama_proj = nn.Linear(1024, 5120)  # ImageBind hidden size -> Vicuna-13B embedding size
+         self.max_tgt_len = 400
+         self.model = llama_cpp.Llama(*args, **kwargs)
+         self.generated_text = ""
+         self.device = "cpu"
+
+     def eval_embd(self, x):
+         # feed projected multimodal embeddings straight into the llama.cpp context
+         y = numpy_to_floatptr(x.T)
+         ctx = self.model.ctx
+         n_past = self.model.n_tokens
+         n_threads = self.model.n_threads
+         llama_cpp.llama_eval_embd(ctx, y, x.shape[0], n_past, n_threads)
+         self.model.n_tokens += x.shape[0]
+
+     def eval_string(self, s):
+         s = self.model.tokenize(s.encode())
+         self.model.eval(s)
+
+     def generate_with_print(self, end="###"):
+         end = end.encode()
+         ret = b""
+         for i in range(self.max_tgt_len):
+             token = self.model.sample()
+             self.model.eval([token])
+             txt = self.model.detokenize([token])
+             ret += txt
+             print(txt.decode(errors="replace"), flush=True, end="")
+             if ret.endswith(end):
+                 break
+         return ret.decode(errors="replace")
+
+     def load_projection(self, path):
+         # load only the linear projection weights from the PandaGPT checkpoint
+         state = torch.load(path, map_location="cpu")
+         self.llama_proj.load_state_dict({
+             "weight": state["llama_proj.weight"],
+             "bias": state["llama_proj.bias"]})
+
+     def eval_inputs(self, inputs):
+         self.eval_string("<Img>")
+         embds = self.extract_multimodal_feature(inputs)
+         for i in embds:
+             self.eval_embd(i)
+         self.eval_string("</Img> ")
+
+     def chat(self, question):
+         return self.chat_with_image(None, question)
+
+     def chat_with_image(self, inputs, question):
+         if self.generated_text == "":
+             self.eval_string("###")
+         self.eval_string(" Human: ")
+         if inputs:
+             self.eval_inputs(inputs)
+         self.eval_string(question)
+         self.eval_string("\n### Assistant:")
+         ret = self.generate_with_print(end="###")
+         self.generated_text += ret
+         return ret
+
+     def extract_multimodal_feature(self, inputs):
+         features = []
+         for key in ["image", "audio", "video", "thermal"]:
+             if key + "_paths" in inputs:
+                 embeds = self.encode_data(key, inputs[key + "_paths"])
+                 features.append(embeds)
+         return features
+
+     def encode_data(self, data_type, data_paths):
+         type_map = {
+             "image": ModalityType.VISION,
+             "audio": ModalityType.AUDIO,
+             "video": ModalityType.VISION,
+             "thermal": ModalityType.THERMAL,
+         }
+         load_map = {
+             "image": data.load_and_transform_vision_data,
+             "audio": data.load_and_transform_audio_data,
+             "video": data.load_and_transform_video_data,
+             "thermal": data.load_and_transform_thermal_data
+         }
+
+         load_function = load_map[data_type]
+         key = type_map[data_type]
+
+         inputs = {key: load_function(data_paths, self.device)}
+         with torch.no_grad():
+             embeddings = self.visual_encoder(inputs)
+             embeds = embeddings[key]
+             embeds = self.llama_proj(embeds).cpu().numpy()
+         return embeds
+
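For reference, a minimal usage sketch of the class outside the Gradio app (not part of the commit); the local model and checkpoint paths are hypothetical stand-ins for the files app.py fetches with hf_hub_download:

from panda_gpt import PandaGPT

bot = PandaGPT(("./ggml-pandagpt-vicuna-q4_1.bin",))  # positional args are forwarded to llama_cpp.Llama
bot.load_projection("./pytorch_model.pt")             # PandaGPT's llama_proj weights

print(bot.chat("Hello, who are you?"))                # text-only turn

# multimodal turn: paths are grouped per modality, exactly as bot() does in app.py
print(bot.chat_with_image({"image_paths": ["./example.png"]}, "Describe the image."))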
requirements.txt ADDED
@@ -0,0 +1,22 @@
+ timm==0.6.7
+ deepspeed==0.9.2
+ data
+ einops==0.6.1
+ ftfy==6.1.1
+ iopath==0.1.10
+ ipdb==0.13.13
+ numpy==1.24.3
+ peft==0.3.0
+ Pillow==9.5.0
+ PyYAML==6.0
+ regex==2022.10.31
+ torchvision==0.14.1
+ torchaudio==0.13.1
+ pytorchvideo
+ fvcore
+ decord==0.6.0
+ tqdm==4.64.1
+ transformers==4.29.1
+ llama-cpp-python>=0.1.67
+ gradio
+ huggingface_hub